 src/spdk/test/nvme/perf/common.sh | 471 +++++++++++++++++++++++++++++++++++++
 1 file changed, 471 insertions(+), 0 deletions(-)
diff --git a/src/spdk/test/nvme/perf/common.sh b/src/spdk/test/nvme/perf/common.sh
new file mode 100755
index 000000000..ddd01ec52
--- /dev/null
+++ b/src/spdk/test/nvme/perf/common.sh
@@ -0,0 +1,471 @@
+#!/usr/bin/env bash
+
+function discover_bdevs() {
+ local rootdir=$1
+ local config_file=$2
+ local cfg_type=$3
+ local wait_for_spdk_bdev=${4:-30}
+ local rpc_server=/var/tmp/spdk-discover-bdevs.sock
+
+ if [ ! -e "$config_file" ]; then
+ echo "Invalid Configuration File: $config_file"
+ return 1
+ fi
+
+ if [ -z "$cfg_type" ]; then
+ cfg_type="-c"
+ fi
+
+ # Start the bdev service to query for the list of available
+ # bdevs.
+ $rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
+ $cfg_type $config_file &> /dev/null &
+ stubpid=$!
+ while ! [ -e /var/run/spdk_bdev0 ]; do
+ # Once this counter reaches zero, the arithmetic expression returns a non-zero exit status and errexit aborts the test
+ ((wait_for_spdk_bdev--))
+ sleep 1
+ done
+
+ # Get all of the bdevs
+ $rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs
+
+ # Shut down the bdev service
+ kill $stubpid
+ wait $stubpid
+ rm -f /var/run/spdk_bdev0
+}
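+
+# Hypothetical usage sketch (mirrors the call in get_numa_node() below):
+#   bdevs=$(discover_bdevs "$rootdir" "$testdir/bdev.conf" --json)
+# The bdev list is printed as the JSON output of the bdev_get_bdevs RPC.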
+
+function create_spdk_bdev_conf() {
+ local output
+ local disk_cfg
+ local bdev_io_cache_size=$1
+ local bdev_io_pool_size=$2
+ local bdev_json_cfg=()
+ local bdev_opts=()
+
+ disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
+
+ if [[ -n "$bdev_io_cache_size" ]]; then
+ bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
+ fi
+
+ if [[ -n "$bdev_io_pool_size" ]]; then
+ bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
+ fi
+
+ local IFS=","
+ if [[ ${#bdev_opts[@]} -gt 0 ]]; then
+ bdev_json_cfg+=("$(
+ cat <<- JSON
+ {
+ "method": "bdev_set_options",
+ "params": {
+ ${bdev_opts[*]}
+ }
+ }
+ JSON
+ )")
+ fi
+
+ for i in "${!disk_cfg[@]}"; do
+ bdev_json_cfg+=("$(
+ cat <<- JSON
+ {
+ "method": "bdev_nvme_attach_controller",
+ "params": {
+ "trtype": "PCIe",
+ "name":"Nvme${i}",
+ "traddr":"${disk_cfg[i]}"
+ }
+ }
+ JSON
+ )")
+ done
+
+ local IFS=","
+ jq -r '.' <<- JSON > $testdir/bdev.conf
+ {
+ "subsystems": [
+ {
+ "subsystem": "bdev",
+ "config": [
+ ${bdev_json_cfg[*]}
+ ]
+ }
+ ]
+ }
+ JSON
+}
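+
+# Sketch of the generated bdev.conf, assuming a single disk in DISKCFG at the
+# hypothetical address 0000:81:00.0 and no bdev_set_options entry:
+# {
+#   "subsystems": [
+#     {
+#       "subsystem": "bdev",
+#       "config": [
+#         {
+#           "method": "bdev_nvme_attach_controller",
+#           "params": { "trtype": "PCIe", "name": "Nvme0", "traddr": "0000:81:00.0" }
+#         }
+#       ]
+#     }
+#   ]
+# }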
+
+function is_bdf_not_mounted() {
+ local bdf=$1
+ local blkname
+ local mountpoints
+ blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
+ mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
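+ # The mountpoint count doubles as the return status: 0 mountpoints is a
+ # zero ("true") exit status, so "if is_bdf_not_mounted $bdf" reads naturally.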
+ return $mountpoints
+}
+
+function get_cores() {
+ local cpu_list="$1"
+ for cpu in ${cpu_list//,/ }; do
+ echo $cpu
+ done
+}
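+
+# Hypothetical example: get_cores "0,4,8" prints 0, 4 and 8 on separate lines,
+# i.e. it simply expands a comma-separated CPU list.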
+
+function get_cores_numa_node() {
+ local cores=$1
+ for core in $cores; do
+ lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
+ done
+}
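+
+# This relies on the parseable output of lscpu: "lscpu -p=cpu,node" prints one
+# "<cpu>,<node>" pair per line, e.g. "4,0" for CPU 4 on NUMA node 0.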
+
+function get_numa_node() {
+ local plugin=$1
+ local disks=$2
+ if [[ "$plugin" =~ "nvme" ]]; then
+ for bdf in $disks; do
+ local driver
+ driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
+ # Use this check to omit blacklisted devices (not bound to a driver by the setup.sh script)
+ if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
+ cat /sys/bus/pci/devices/$bdf/numa_node
+ fi
+ done
+ elif [[ "$plugin" =~ "bdev" ]]; then
+ local bdevs
+ bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json)
+ for name in $disks; do
+ local bdev_bdf
+ bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
+ cat /sys/bus/pci/devices/$bdev_bdf/numa_node
+ done
+ else
+ for name in $disks; do
+ local bdf
+ # Not reading directly from /sys/block/nvme* because of a kernel bug
+ # which results in NUMA 0 always getting reported.
+ bdf=$(cat /sys/block/$name/device/address)
+ cat /sys/bus/pci/devices/$bdf/numa_node
+ done
+ fi
+}
+
+function get_disks() {
+ local plugin=$1
+ local disk_cfg
+
+ disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
+ if [[ "$plugin" =~ "nvme" ]]; then
+ # PCI BDF addresses are enough for nvme-perf and nvme-fio-plugin,
+ # so just print them from the configuration file
+ echo "${disk_cfg[*]}"
+ elif [[ "$plugin" =~ "bdev" ]]; then
+ # Generate NvmeXn1 bdev name configuration file for bdev-perf
+ # and bdev-fio-plugin
+ local bdevs
+ local disk_no
+ disk_no=${#disk_cfg[@]}
+ eval echo "Nvme{0..$((disk_no - 1))}n1"
+ else
+ # Find nvme block devices and only use the ones which
+ # are not mounted
+ for bdf in "${disk_cfg[@]}"; do
+ if is_bdf_not_mounted $bdf; then
+ local blkname
+ blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
+ echo $blkname
+ fi
+ done
+ fi
+}
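+
+# Hypothetical outputs, assuming DISKCFG lists 0000:81:00.0 and 0000:82:00.0:
+#   get_disks nvme          -> "0000:81:00.0 0000:82:00.0" (raw PCI BDF addresses)
+#   get_disks bdev          -> "Nvme0n1 Nvme1n1"           (generated bdev names)
+#   get_disks kernel-libaio -> "nvme0n1 nvme1n1"           (unmounted block devices)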
+
+function get_disks_on_numa() {
+ local devs=($1)
+ local numas=($2)
+ local numa_no=$3
+ local disks_on_numa=0
+ local i
+
+ for ((i = 0; i < ${#devs[@]}; i++)); do
+ if [ "${numas[$i]}" = "$numa_no" ]; then
+ disks_on_numa=$((disks_on_numa + 1))
+ fi
+ done
+ echo $disks_on_numa
+}
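+
+# Hypothetical example: with devs="nvme0n1 nvme1n1 nvme2n1" and matching
+# numas="0 0 1", this counts 2 disks for numa_no=0 and 1 disk for numa_no=1.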
+
+function create_fio_config() {
+ local disk_no=$1
+ local plugin=$2
+ local disks=($3)
+ local disks_numa=($4)
+ local cores=($5)
+ local total_disks=${#disks[@]}
+ local fio_job_section=()
+ local num_cores=${#cores[@]}
+ local disks_per_core=$((disk_no / num_cores))
+ local disks_per_core_mod=$((disk_no % num_cores))
+ local cores_numa
+ cores_numa=($(get_cores_numa_node "${cores[*]}"))
+
+ # The following part of this function still leverages global variables a lot.
+ # It mixes local variables passed as function arguments with globals, which is messy.
+ # TODO: Modify this to be consistent with how variables are used here. Aim for using only
+ # local variables to get rid of globals as much as possible.
+ desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
+ cp "$testdir/config.fio.tmp" "$testdir/config.fio"
+ cat <<- EOF >> $testdir/config.fio
+ description=$desc
+
+ rw=$RW
+ rwmixread=$MIX
+ bs=$BLK_SIZE
+ runtime=$RUNTIME
+ ramp_time=$RAMP_TIME
+ numjobs=$NUMJOBS
+ log_avg_msec=$SAMPLING_INT
+ EOF
+
+ if $GTOD_REDUCE; then
+ echo "gtod_reduce=1" >> $testdir/config.fio
+ fi
+
+ for i in "${!cores[@]}"; do
+ local m=0 # Counter of disks per NUMA node
+ local n=0 # Counter of all disks in test
+ core_numa=${cores_numa[$i]}
+
+ total_disks_per_core=$disks_per_core
+ # Check how many "stray" disks are still unassigned to CPU cores.
+ # Assign one disk to the current CPU core and subtract it from the total
+ # of unassigned disks.
+ if [[ "$disks_per_core_mod" -gt "0" ]]; then
+ total_disks_per_core=$((disks_per_core + 1))
+ disks_per_core_mod=$((disks_per_core_mod - 1))
+ fi
+ # The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
+ # Therefore, the per-thread queue depth is set to the desired per-device IODEPTH multiplied by the number of devices per thread.
+ QD=$IODEPTH
+ if [[ "$NOIOSCALING" = false ]]; then
+ QD=$((IODEPTH * total_disks_per_core))
+ fi
+
+ fio_job_section+=("")
+ fio_job_section+=("[filename${i}]")
+ fio_job_section+=("iodepth=$QD")
+ fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
+
+ while [[ "$m" -lt "$total_disks_per_core" ]]; do
+ # Try to add a disk to the job section if its NUMA node matches the
+ # NUMA node of the currently selected CPU.
+ if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
+ if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
+ fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
+ elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
+ fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
+ elif [[ "$plugin" =~ "kernel" ]]; then
+ fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
+ fi
+ m=$((m + 1))
+
+ # Mark the NUMA node of the n-th disk as "x" so it is treated as claimed in later iterations
+ disks_numa[$n]="x"
+ fi
+ n=$((n + 1))
+
+ # If there are no more disks whose NUMA node matches the CPU's NUMA node,
+ # switch to the other node (a two-node topology is assumed here), go back
+ # to the start of the disk list and try again.
+ if [[ $n -ge $total_disks ]]; then
+ echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
+ echo "NVMe assignment for this CPU will be cross-NUMA."
+ if [[ "$core_numa" == "1" ]]; then
+ core_numa=0
+ else
+ core_numa=1
+ fi
+ n=0
+ fi
+ done
+ done
+
+ printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
+ echo "INFO: Generated fio configuration file:"
+ cat $testdir/config.fio
+}
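+
+# Sketch of one generated job section, assuming plugin=spdk-plugin-nvme,
+# IODEPTH=32, NOIOSCALING=false and two (hypothetical) disks assigned to
+# CPU 0 on NUMA node 0:
+# [filename0]
+# iodepth=64
+# cpus_allowed=0 #CPU NUMA Node 0
+# filename=trtype=PCIe traddr=0000.81.00.0 ns=1 #NVMe NUMA Node 0
+# filename=trtype=PCIe traddr=0000.82.00.0 ns=1 #NVMe NUMA Node 0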
+
+function preconditioning() {
+ local dev_name=""
+ local filename=""
+ local nvme_list
+
+ HUGEMEM=8192 $rootdir/scripts/setup.sh
+ cp $testdir/config.fio.tmp $testdir/config.fio
+ echo "[Preconditioning]" >> $testdir/config.fio
+
+ # Generate filename argument for FIO.
+ # We only want to target NVMes not bound to nvme driver.
+ # If they're still bound to nvme that means they were skipped by
+ # setup.sh on purpose.
+ nvme_list=$(get_disks nvme)
+ for nvme in $nvme_list; do
+ dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
+ filename+=$(printf %s":" "$dev_name")
+ done
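+ # The result is a colon-separated list of SPDK fio filename tokens, e.g.
+ # (hypothetical) "trtype=PCIe traddr=0000.81.00.0 ns=1:trtype=PCIe traddr=0000.82.00.0 ns=1:"
+ # Colons inside the PCI addresses are replaced with dots above so that fio
+ # does not treat them as filename separators.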
+ echo "** Preconditioning disks, this can take a while, depending on the size of disks."
+ run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
+ --rw=write --iodepth=32 --output-format=normal
+ rm -f $testdir/config.fio
+}
+
+function get_results() {
+ local reads_pct
+ local writes_pct
+
+ reads_pct=$(bc -l <<< "scale=3; $2/100")
+ writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")
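+ # Hypothetical example: for a 70/30 read/write mix ($2=70) this yields
+ # reads_pct=.700 and writes_pct=.300, which weight the per-direction
+ # statistics below.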
+ case "$1" in
+ iops)
+ iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
+ iops=${iops%.*}
+ echo $iops
+ ;;
+ mean_lat_usec)
+ mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+ mean_lat=${mean_lat%.*}
+ echo $((mean_lat / 1000))
+ ;;
+ p99_lat_usec)
+ p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
+ p99_lat=${p99_lat%.*}
+ echo $((p99_lat / 1000))
+ ;;
+ p99_99_lat_usec)
+ p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
+ p99_99_lat=${p99_99_lat%.*}
+ echo $((p99_99_lat / 1000))
+ ;;
+ stdev_usec)
+ stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE)
+ stdev=${stdev%.*}
+ echo $((stdev / 1000))
+ ;;
+ mean_slat_usec)
+ mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+ mean_slat=${mean_slat%.*}
+ echo $((mean_slat / 1000))
+ ;;
+ mean_clat_usec)
+ mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+ mean_clat=${mean_clat%.*}
+ echo $((mean_clat / 1000))
+ ;;
+ bw_Kibs)
+ bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
+ bw=${bw%.*}
+ echo $((bw))
+ ;;
+ esac
+}
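+
+# Hypothetical usage sketch, assuming $MIX holds the rwmixread percentage used
+# for the fio run that produced $TMP_RESULT_FILE:
+#   mean_lat=$(get_results mean_lat_usec "$MIX")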
+
+function get_bdevperf_results() {
+ case "$1" in
+ iops)
+ iops=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $2}')
+ iops=${iops%.*}
+ echo $iops
+ ;;
+ bw_Kibs)
+ bw_MBs=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $4}')
+ bw_MBs=${bw_MBs%.*}
+ echo $((bw_MBs * 1024))
+ ;;
+ esac
+}
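+
+# The awk parsing above assumes a bdevperf summary line of roughly this shape
+# (hypothetical values): "Total : 715872.00 IOPS 2796.38 MiB/s", i.e. IOPS in
+# the second and MiB/s in the fourth whitespace-separated field after "Total".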
+
+function get_nvmeperf_results() {
+ local iops
+ local bw_MBs
+ local mean_lat_usec
+ local max_lat_usec
+ local min_lat_usec
+
+ read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
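+ # The line parsed above looks roughly like this (hypothetical values):
+ # "Total : 500000.00 1953.12 102.40 12.50 350.75"
+ # i.e. IOPS, MB/s, then average/minimum/maximum latency in usec.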
+
+ # We need to get rid of the decimal places due to the use of integer-only
+ # arithmetic expansion instead of "bc" for the calculations
+ iops=${iops%.*}
+ bw_MBs=${bw_MBs%.*}
+ mean_lat_usec=${mean_lat_usec%.*}
+ min_lat_usec=${min_lat_usec%.*}
+ max_lat_usec=${max_lat_usec%.*}
+
+ echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
+}
+
+function run_spdk_nvme_fio() {
+ local plugin=$1
+ echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
+ if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
+ LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
+ elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
+ LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
+ fi
+
+ sleep 1
+}
+
+function run_nvme_fio() {
+ echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
+ $FIO_BIN $testdir/config.fio --output-format=json "$@"
+ sleep 1
+}
+
+function run_bdevperf() {
+ echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
+ $bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock
+ sleep 1
+}
+
+function run_nvmeperf() {
+ # Prepare -r argument string for nvme perf command
+ local r_opt
+ local disks
+
+ # Limit the number of disks to $1 if needed
+ disks=($(get_disks nvme))
+ disks=("${disks[@]:0:$1}")
+ r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")
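+ # For two (hypothetical) disks r_opt expands to:
+ # -r "trtype:PCIe traddr:0000:81:00.0" -r "trtype:PCIe traddr:0000:82:00.0"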
+
+ echo "** Running nvme perf test, this can take a while, depending on the run-time setting."
+
+ # Run the command in a separate shell, as this solves quoting issues related to the r_opt variable
+ $SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
+ sleep 1
+}
+
+function wait_for_nvme_reload() {
+ local nvmes=$1
+
+ shopt -s extglob
+ for disk in $nvmes; do
+ cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
+ until $cmd 2> /dev/null; do
+ echo "Waiting for full nvme driver reload..."
+ sleep 0.5
+ done
+ done
+ shopt -u extglob
+}
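+
+# The @(...) pattern above is an extended glob, hence the surrounding
+# "shopt -s extglob". Hypothetical usage: wait_for_nvme_reload "nvme0n1 nvme1n1"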
+
+function verify_disk_number() {
+ # Check if we have appropriate number of disks to carry out the test
+ disks=($(get_disks $PLUGIN))
+ if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
+ DISKNO=${#disks[@]}
+ elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
+ echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})"
+ false
+ fi
+}