summaryrefslogtreecommitdiffstats
path: root/src/spdk/test/nvme/perf/common.sh
blob: ddd01ec529e2df1787923bf87e1df70cadbbf4dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#!/usr/bin/env bash

function discover_bdevs() {
	# Query an SPDK configuration for the list of available bdevs by
	# starting the bdev_svc app and asking it over RPC.
	# $1 - SPDK repository root directory
	# $2 - configuration file to load
	# $3 - config type flag passed to bdev_svc (defaults to "-c")
	# $4 - seconds to wait for the bdev service to come up (default 30)
	# Outputs the JSON bdev list on stdout; returns 1 on error.
	local rootdir=$1
	local config_file=$2
	local cfg_type=$3
	local wait_for_spdk_bdev=${4:-30}
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock

	if [ ! -e "$config_file" ]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	if [ -z "$cfg_type" ]; then
		cfg_type="-c"
	fi

	# Start the bdev service to query for the list of available
	# bdevs.
	$rootdir/test/app/bdev_svc/bdev_svc -r "$rpc_server" -i 0 \
		"$cfg_type" "$config_file" &> /dev/null &
	stubpid=$!
	while ! [ -e /var/run/spdk_bdev0 ]; do
		# Fail explicitly on timeout instead of relying on errexit to
		# trap the counter hitting zero (without "set -e" the original
		# loop would spin forever and leak the bdev_svc process).
		if ((wait_for_spdk_bdev-- == 0)); then
			echo "Timed out waiting for the bdev service to start" >&2
			kill "$stubpid" &> /dev/null || true
			return 1
		fi
		sleep 1
	done

	# Get all of the bdevs
	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs

	# Shut down the bdev service
	kill $stubpid
	wait $stubpid
	rm -f /var/run/spdk_bdev0
}

function create_spdk_bdev_conf() {
	# Generate $testdir/bdev.conf: a JSON configuration attaching one NVMe
	# bdev controller per PCI address listed in $DISKCFG.
	# $1 - optional bdev_io_cache_size for bdev_set_options
	# $2 - optional bdev_io_pool_size for bdev_set_options
	# Globals read: DISKCFG, testdir. Requires jq.
	local output
	local disk_cfg
	local bdev_io_cache_size=$1
	local bdev_io_pool_size=$2
	local bdev_json_cfg=()
	local bdev_opts=()
	local i

	# Strip comment lines from the disk configuration file.
	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))

	if [[ -n "$bdev_io_cache_size" ]]; then
		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
	fi

	if [[ -n "$bdev_io_pool_size" ]]; then
		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
	fi

	# IFS="," makes ${array[*]} join the JSON fragments with commas in the
	# here-documents below. It stays in effect for the rest of the function
	# (it was redundantly re-declared before the final here-document).
	local IFS=","
	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_set_options",
					"params": {
						${bdev_opts[*]}
					}
				}
			JSON
		)")
	fi

	for i in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_nvme_attach_controller",
					"params": {
						"trtype": "PCIe",
						"name":"Nvme${i}",
						"traddr":"${disk_cfg[i]}"
					}
				}
			JSON
		)")
	done

	jq -r '.' <<- JSON > "$testdir/bdev.conf"
		{
			"subsystems": [
				{
					"subsystem": "bdev",
					"config": [
						${bdev_json_cfg[*]}
					]
				}
			]
		}
	JSON
}

function is_bdf_not_mounted() {
	# Succeed (return 0) when the block device backed by PCI address $1 has
	# no mountpoints; otherwise return the number of mountpoints found.
	local pci_addr=$1
	local blk_dev
	local mount_count

	# Resolve the PCI address to its kernel block device name via sysfs.
	blk_dev=$(ls -l /sys/block/ | grep $pci_addr | awk '{print $9}')
	mount_count=$(lsblk /dev/$blk_dev --output MOUNTPOINT -n | wc -w)
	return $mount_count
}

function get_cores() {
	# Emit each CPU id from a comma-separated list ($1) on its own line.
	local core_csv=$1
	local core
	for core in ${core_csv//,/ }; do
		printf '%s\n' "$core"
	done
}

function get_cores_numa_node() {
	# Print the NUMA node of each CPU core in the space-separated list $1,
	# one node number per line, in the same order as the input list.
	local core_list=$1
	local core_id cpu_map

	# "lscpu -p=cpu,node" emits "<cpu>,<node>" lines; pick the node column.
	cpu_map=$(lscpu -p=cpu,node)
	for core_id in $core_list; do
		grep "^$core_id\b" <<< "$cpu_map" | awk -F ',' '{print $2}'
	done
}

function get_numa_node() {
	# Print the NUMA node number for each disk in $2, one per line.
	# $1 - plugin name; decides how the disks in $2 are identified:
	#      *nvme*    - PCI BDF addresses, looked up directly in sysfs
	#      *bdev*    - SPDK bdev names, mapped back to PCI addresses
	#      otherwise - kernel block device names (e.g. nvme0n1)
	# $2 - space-separated list of disks
	# Globals read: rootdir, testdir (bdev branch only).
	local plugin=$1
	local disks=$2
	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			local driver
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			# Use this check to omit blacklisted devices (not bound to a driver by the setup.sh script)
			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
				cat /sys/bus/pci/devices/$bdf/numa_node
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		local bdevs
		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json)
		for name in $disks; do
			local bdev_bdf
			# Map the bdev name back to its PCI address to look up the NUMA node.
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
		done
	else
		for name in $disks; do
			local bdf
			# Not reading directly from /sys/block/nvme* because of a kernel bug
			# which results in NUMA 0 always getting reported.
			bdf=$(cat /sys/block/$name/device/address)
			cat /sys/bus/pci/devices/$bdf/numa_node
		done
	fi
}

function get_disks() {
	# Print the disks to use for the given plugin type:
	#   *nvme* plugins - PCI BDF addresses straight from $DISKCFG
	#   *bdev* plugins - generated NvmeXn1 bdev names
	#   kernel plugins - unmounted nvme block device names
	# $1 - plugin name. Globals read: DISKCFG.
	local plugin=$1
	local disk_cfg

	# Drop comment lines from the disk configuration file.
	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
	case "$plugin" in
		*nvme*)
			# PCI BDF address is enough for nvme-perf and nvme-fio-plugin,
			# so just print them from configuration file
			echo "${disk_cfg[*]}"
			;;
		*bdev*)
			# Generate NvmeXn1 bdev name configuration file for bdev-perf
			# and bdev-fio-plugin
			local bdevs
			local disk_no
			disk_no=${#disk_cfg[@]}
			eval echo "Nvme{0..$((disk_no - 1))}n1"
			;;
		*)
			# Find nvme block devices and only use the ones which
			# are not mounted
			for bdf in "${disk_cfg[@]}"; do
				if is_bdf_not_mounted $bdf; then
					local blkname
					blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
					echo $blkname
				fi
			done
			;;
	esac
}

function get_disks_on_numa() {
	# Count how many of the given disks live on a particular NUMA node.
	# $1 - space-separated disk list
	# $2 - space-separated list of each disk's NUMA node (parallel to $1)
	# $3 - NUMA node to count disks for
	# Outputs the count on stdout (always a number; the original printed an
	# empty string instead of 0 when no disks matched because the counter
	# was initialized to "").
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [ "${numas[$i]}" = "$numa_no" ]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo "$disks_on_numa"
}

function create_fio_config() {
	# Append job sections to $testdir/config.fio, distributing $1 disks
	# across the given CPU cores while trying to keep each disk on the same
	# NUMA node as the core driving it. Falls back to cross-NUMA assignment
	# (with a warning) when a core's own node runs out of disks.
	# $1 - number of disks to use in the test
	# $2 - plugin name (spdk-plugin-nvme | spdk-plugin-bdev | *kernel*)
	# $3 - space-separated disk list
	# $4 - space-separated list of each disk's NUMA node (parallel to $3)
	# $5 - space-separated CPU core list
	# Globals read: PLUGIN, BLK_SIZE, RW, MIX, IODEPTH, RUNTIME, RAMP_TIME,
	# NUMJOBS, SAMPLING_INT, GTOD_REDUCE, NOIOSCALING, testdir.
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local fio_job_section=()
	local num_cores=${#cores[@]}
	local disks_per_core=$((disk_no / num_cores))
	local disks_per_core_mod=$((disk_no % num_cores))
	local cores_numa
	cores_numa=($(get_cores_numa_node "${cores[*]}"))

	# Following part of this function still leverages global variables a lot.
	# It's a mix of local variables passed as arguments to function with global variables. This is messy.
	# TODO: Modify this to be consistent with how variables are used here. Aim for using only
	# local variables to get rid of globals as much as possible.
	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
	cat <<- EOF >> $testdir/config.fio
		description=$desc

		rw=$RW
		rwmixread=$MIX
		bs=$BLK_SIZE
		runtime=$RUNTIME
		ramp_time=$RAMP_TIME
		numjobs=$NUMJOBS
		log_avg_msec=$SAMPLING_INT
	EOF

	if $GTOD_REDUCE; then
		echo "gtod_reduce=1" >> $testdir/config.fio
	fi

	for i in "${!cores[@]}"; do
		local m=0 #Counter of disks per NUMA node
		local n=0 #Counter of all disks in test
		core_numa=${cores_numa[$i]}

		total_disks_per_core=$disks_per_core
		# Check how many "stray" disks are unassigned to CPU cores
		# Assign one disk to current CPU core and subtract it from the total of
		# unassigned disks
		if [[ "$disks_per_core_mod" -gt "0" ]]; then
			total_disks_per_core=$((disks_per_core + 1))
			disks_per_core_mod=$((disks_per_core_mod - 1))
		fi
		# SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
		# Therefore, the per thread queue depth is set to the desired IODEPTH/device X the number of devices per thread.
		QD=$IODEPTH
		if [[ "$NOIOSCALING" = false ]]; then
			QD=$((IODEPTH * total_disks_per_core))
		fi

		fio_job_section+=("")
		fio_job_section+=("[filename${i}]")
		fio_job_section+=("iodepth=$QD")
		fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")

		while [[ "$m" -lt "$total_disks_per_core" ]]; do
			# Try to add disks to job section if its NUMA node matches NUMA
			# for currently selected CPU
			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" =~ "kernel" ]]; then
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				fi
				m=$((m + 1))

				#Mark numa of n'th disk as "x" to mark it as claimed for next loop iterations
				disks_numa[$n]="x"
			fi
			n=$((n + 1))

			# If there are no more disks with numa node same as cpu numa node, switch to
			# other numa node, go back to start of loop and try again.
			if [[ $n -ge $total_disks ]]; then
				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
				echo "NVMe assignment for this CPU will be cross-NUMA."
				if [[ "$core_numa" == "1" ]]; then
					core_numa=0
				else
					core_numa=1
				fi
				n=0
			fi
		done
	done

	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
	echo "INFO: Generated fio configuration file:"
	cat $testdir/config.fio
}

function preconditioning() {
	# Sequentially write every target NVMe twice (fio --loops=2, 1M blocks)
	# so later performance measurements are taken from steady-state media.
	# Globals read: rootdir, testdir. Calls get_disks and run_spdk_nvme_fio.
	local dev_name=""
	local filename=""
	local nvme_list

	HUGEMEM=8192 $rootdir/scripts/setup.sh
	cp $testdir/config.fio.tmp $testdir/config.fio
	echo "[Preconditioning]" >> $testdir/config.fio

	# Generate filename argument for FIO.
	# We only want to target NVMes not bound to nvme driver.
	# If they're still bound to nvme that means they were skipped by
	# setup.sh on purpose.
	nvme_list=$(get_disks nvme)
	for nvme in $nvme_list; do
		# fio's SPDK filename syntax uses "." instead of ":" in PCI addresses.
		dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
		filename+=$(printf %s":" "$dev_name")
	done
	echo "** Preconditioning disks, this can take a while, depending on the size of disks."
	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
		--rw=write --iodepth=32 --output-format=normal
	rm -f $testdir/config.fio
}

function get_results() {
	# Print a single metric extracted from fio's JSON output in
	# $TMP_RESULT_FILE. Read and write values are combined as an average
	# weighted by the R/W mix percentage.
	# $1 - metric name: iops | mean_lat_usec | p99_lat_usec |
	#      p99_99_lat_usec | stdev_usec | mean_slat_usec | mean_clat_usec |
	#      bw_Kibs
	# $2 - percentage of reads in the workload (0-100)
	# Requires jq and bc. Globals read: TMP_RESULT_FILE.
	local reads_pct
	local writes_pct

	reads_pct=$(bc -l <<< "scale=3; $2/100")
	writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")
	case "$1" in
		iops)
			iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
			iops=${iops%.*}
			echo $iops
			;;
		mean_lat_usec)
			mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_lat=${mean_lat%.*}
			echo $((mean_lat / 1000))
			;;
		p99_lat_usec)
			# "(x // 0) * w" must be parenthesized: jq's "*" binds tighter
			# than "//", so the original "x // 0 * w" parsed as
			# "x // (0 * w)" and silently dropped the R/W weighting.
			p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
			p99_lat=${p99_lat%.*}
			echo $((p99_lat / 1000))
			;;
		p99_99_lat_usec)
			p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
			p99_99_lat=${p99_99_lat%.*}
			echo $((p99_99_lat / 1000))
			;;
		stdev_usec)
			stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE)
			stdev=${stdev%.*}
			echo $((stdev / 1000))
			;;
		mean_slat_usec)
			mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_slat=${mean_slat%.*}
			echo $((mean_slat / 1000))
			;;
		mean_clat_usec)
			mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_clat=${mean_clat%.*}
			echo $((mean_clat / 1000))
			;;
		bw_Kibs)
			bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
			bw=${bw%.*}
			echo $((bw))
			;;
	esac
}

function get_bdevperf_results() {
	# Parse the bdevperf summary line ("Total : <IOPS> ... <MiB/s> ...")
	# from $TMP_RESULT_FILE and print the requested metric.
	# $1 - "iops" for I/Os per second, "bw_Kibs" for bandwidth in KiB/s.
	local summary
	summary=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}')
	case "$1" in
		iops)
			iops=$(awk '{print $2}' <<< "$summary")
			iops=${iops%.*}
			echo $iops
			;;
		bw_Kibs)
			bw_MBs=$(awk '{print $4}' <<< "$summary")
			bw_MBs=${bw_MBs%.*}
			echo $((bw_MBs * 1024))
			;;
	esac
}

function get_nvmeperf_results() {
	local iops
	local bw_MBs
	local mean_lat_usec
	local max_lat_usec
	local min_lat_usec

	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")

	# We need to get rid of the decimal spaces due
	# to use of arithmetic expressions instead of "bc" for calculations
	iops=${iops%.*}
	bw_MBs=${bw_MBs%.*}
	mean_lat_usec=${mean_lat_usec%.*}
	min_lat_usec=${min_lat_usec%.*}
	max_lat_usec=${max_lat_usec%.*}

	echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}

function run_spdk_nvme_fio() {
	# Run fio with one of the SPDK ioengines using $testdir/config.fio.
	# $1 - plugin: "spdk-plugin-nvme" (NVMe ioengine via LD_PRELOAD) or
	#      "spdk-plugin-bdev" (bdev ioengine; also loads $testdir/bdev.conf).
	# Remaining arguments are passed straight through to fio.
	# Globals read: plugin_dir, FIO_BIN, testdir.
	local plugin=$1
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
	elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
	fi

	# Give things a moment to settle before the next test step.
	sleep 1
}

function run_nvme_fio() {
	# Run plain (kernel-driver) fio using $testdir/config.fio; all arguments
	# are passed straight through to fio.
	# Globals read: FIO_BIN, testdir.
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	$FIO_BIN $testdir/config.fio --output-format=json "$@"
	sleep 1
}

function run_bdevperf() {
	# Run the SPDK bdevperf tool against the bdevs from $testdir/bdev.conf.
	# Globals read: bdevperf_dir, testdir, IODEPTH, BLK_SIZE, RW, MIX,
	# RUNTIME, CPUS_ALLOWED.
	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock
	sleep 1
}

function run_nvmeperf() {
	# Run the SPDK nvme perf tool against up to $1 NVMe drives.
	# $1 - maximum number of disks to use
	# Globals read: nvmeperf_dir, SHELL, IODEPTH, BLK_SIZE, RW, MIX,
	# RUNTIME, CPUS_ALLOWED. Calls get_disks.
	# Prepare -r argument string for nvme perf command
	local r_opt
	local disks

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")
	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	# Run command in separate shell as this solves quoting issues related to r_opt var
	$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
	sleep 1
}

function wait_for_nvme_reload() {
	# Block until the kernel nvme driver has finished (re)creating the sysfs
	# queue attributes for every block device listed in $1.
	# $1 - space-separated list of block device names (e.g. "nvme0n1")
	local nvmes=$1
	local disk cmd
	local had_extglob=0

	# Remember whether extglob was already on so it can be restored; the
	# @(...) pattern below needs it enabled.
	shopt -q extglob && had_extglob=1
	shopt -s extglob
	for disk in $nvmes; do
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done
	# Bug fix: the original ended with "shopt -q extglob", which only
	# queries the option quietly - "-u" is needed to actually disable it.
	if ((had_extglob == 0)); then
		shopt -u extglob
	fi
	return 0
}

function verify_disk_number() {
	# Check if we have appropriate number of disks to carry out the test.
	# "ALL"/"all" means use every disk found. Returns non-zero when the
	# requested count is invalid or exceeds the disks available.
	# Globals read: PLUGIN, DISKNO. Globals written: disks, DISKNO.
	disks=($(get_disks $PLUGIN))
	case "$DISKNO" in
		ALL | all)
			DISKNO=${#disks[@]}
			;;
		*)
			if [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
				echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})"
				false
			fi
			;;
	esac
}