diff options
Diffstat (limited to 'src/spdk/test/nvme/perf')
-rw-r--r-- | src/spdk/test/nvme/perf/README.md | 103 | ||||
-rwxr-xr-x | src/spdk/test/nvme/perf/common.sh | 471 | ||||
-rw-r--r-- | src/spdk/test/nvme/perf/config.fio.tmp | 6 | ||||
-rwxr-xr-x | src/spdk/test/nvme/perf/run_perf.sh | 374 |
4 files changed, 954 insertions, 0 deletions
diff --git a/src/spdk/test/nvme/perf/README.md b/src/spdk/test/nvme/perf/README.md new file mode 100644 index 000000000..3e0b4aa30 --- /dev/null +++ b/src/spdk/test/nvme/perf/README.md @@ -0,0 +1,103 @@ +# Automated script for NVMe performance test + +## Compile SPDK with LTO + +The link time optimization (lto) gcc flag allows the linker to run a post-link optimization pass on the code. During that pass the linker inlines thin wrappers such as those around DPDK calls which results in a shallow call stack and significantly improves performance. Therefore, we recommend compiling SPDK with the lto flag prior to running this benchmark script to achieve optimal performance. +Link time optimization can be enabled in SPDK by doing the following: + +~{.sh} +./configure --enable-lto +~ + +## Configuration + +Test is configured by using command-line options. + +### Available options + +#### -h, --help + +Prints available commands and help. + +#### --run-time + +Tell fio to terminate processing after the specified period of time. Value in seconds. + +#### --ramp-time + +Fio will run the specified workload for this amount of time before logging any performance numbers. +Value in seconds. + +#### --fio-bin + +Path to fio binary. + +#### --driver + +Select between SPDK driver and kernel driver. The Linux Kernel driver has three configurations: +Default mode, Hybrid Polling and Classic Polling. The SPDK driver supports 2 fio_plugin modes: bdev and NVMe PMD. Before running test with spdk, you will need to bind NVMe devices to the Linux uio_pci_generic or vfio-pci driver. When running test with the Kernel driver, NVMe devices use the Kernel driver. The 5 valid values for this option are: +'bdev', 'nvme', 'kernel-libaio', 'kernel-classic-polling' and 'kernel-hybrid-polling'. + +#### --max-disk + +This option will run multiple fio jobs with varying number of NVMe devices. 
First it will start with +max-disk number of devices then decrease the number of disks by two until there are no more devices. +If set to 'all' then max-disk number will be set to all available devices. +Only one of the max-disk or disk-no option can be used. + +#### --disk-no + +This option will run fio job on specified number of NVMe devices. If set to 'all' then max-disk number +will be set to all available devices. Only one of the max-disk or disk-no option can be used. + +#### --cpu-allowed + +Specifies the CPU cores that will be used by fio to execute the performance test cases. When spdk driver is chosen, the script attempts to assign NVMe devices to CPU cores on the same NUMA node. The script will try to align each core with devices matching +core's NUMA first but if there are no devices left within the CPU core NUMA then it will use devices from the other +NUMA node. It is important to choose cores that will ensure best NUMA node alignment. For example: +On System with 8 devices on NUMA node 0 and 8 devices on NUMA node 1, cores 0-27 on numa node 0 and 28-55 +on numa node 1, if test is set to use 16 disks and four cores then "--cpu-allowed=1,2,28,29" can be used +resulting with 4 devices with node0 per core 1 and 2 and 4 devices with node1 per core 28 and 29. If 10 cores +are required then best option would be "--cpu-allowed=1,2,3,4,28,29,30,31,32,33" because cores 1-4 will be +aligned with 2 devices on numa0 per core and cores 28-33 will be aligned with 1 device on numa1 per core. +If kernel driver is chosen then for each job with NVME device, all cpu cores with corresponding NUMA node are picked. + +#### --rw + +Type of I/O pattern. Accepted values are: randrw, rw + +#### --rwmixread + +Percentage of a mixed workload that should be reads. + +#### --iodepth + +Number of I/O units to keep in flight against each file. + +#### --block-size + +The block size in bytes used for I/O units. + +#### --numjobs + +Create the specified number of clones of a job. 
+ +#### --repeat-no + +Specifies how many times run each workload. End results are averages of these workloads + +#### --no-preconditioning + +By default disks are preconditioned before test using fio with parameters: size=100%, loops=2, bs=1M, w=write, +iodepth=32, ioengine=spdk. It can be skiped when this option is set. + +#### "--no-io-scaling" + +For SPDK fio plugin iodepth is multiplied by number of devices. When this option is set this multiplication will be disabled. + +## Results + +Results are stored in "results" folder. After each workload, to this folder are copied files with: +fio configuration file, json files with fio results and logs with latiencies with sampling interval 250 ms. +Number of copied files depends from number of repeats of each workload. Additionall csv file is created with averaged +results of all workloads. diff --git a/src/spdk/test/nvme/perf/common.sh b/src/spdk/test/nvme/perf/common.sh new file mode 100755 index 000000000..ddd01ec52 --- /dev/null +++ b/src/spdk/test/nvme/perf/common.sh @@ -0,0 +1,471 @@ +#!/usr/bin/env bash + +function discover_bdevs() { + local rootdir=$1 + local config_file=$2 + local cfg_type=$3 + local wait_for_spdk_bdev=${4:-30} + local rpc_server=/var/tmp/spdk-discover-bdevs.sock + + if [ ! -e $config_file ]; then + echo "Invalid Configuration File: $config_file" + return 1 + fi + + if [ -z $cfg_type ]; then + cfg_type="-c" + fi + + # Start the bdev service to query for the list of available + # bdevs. + $rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \ + $cfg_type $config_file &> /dev/null & + stubpid=$! + while ! 
[ -e /var/run/spdk_bdev0 ]; do + # If this counter drops to zero, errexit will be caught to abort the test + ((wait_for_spdk_bdev--)) + sleep 1 + done + + # Get all of the bdevs + $rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs + + # Shut down the bdev service + kill $stubpid + wait $stubpid + rm -f /var/run/spdk_bdev0 +} + +function create_spdk_bdev_conf() { + local output + local disk_cfg + local bdev_io_cache_size=$1 + local bdev_io_pool_size=$2 + local bdev_json_cfg=() + local bdev_opts=() + + disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) + + if [[ -n "$bdev_io_cache_size" ]]; then + bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size") + fi + + if [[ -n "$bdev_io_pool_size" ]]; then + bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size") + fi + + local IFS="," + if [[ ${#bdev_opts[@]} -gt 0 ]]; then + bdev_json_cfg+=("$( + cat <<- JSON + { + "method": "bdev_set_options", + "params": { + ${bdev_opts[*]} + } + } + JSON + )") + fi + + for i in "${!disk_cfg[@]}"; do + bdev_json_cfg+=("$( + cat <<- JSON + { + "method": "bdev_nvme_attach_controller", + "params": { + "trtype": "PCIe", + "name":"Nvme${i}", + "traddr":"${disk_cfg[i]}" + } + } + JSON + )") + done + + local IFS="," + jq -r '.' 
<<- JSON > $testdir/bdev.conf + { + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + ${bdev_json_cfg[*]} + ] + } + ] + } + JSON +} + +function is_bdf_not_mounted() { + local bdf=$1 + local blkname + local mountpoints + blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}') + mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w) + return $mountpoints +} + +function get_cores() { + local cpu_list="$1" + for cpu in ${cpu_list//,/ }; do + echo $cpu + done +} + +function get_cores_numa_node() { + local cores=$1 + for core in $cores; do + lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}' + done +} + +function get_numa_node() { + local plugin=$1 + local disks=$2 + if [[ "$plugin" =~ "nvme" ]]; then + for bdf in $disks; do + local driver + driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') + # Use this check to ommit blacklisted devices ( not binded to driver with setup.sh script ) + if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then + cat /sys/bus/pci/devices/$bdf/numa_node + fi + done + elif [[ "$plugin" =~ "bdev" ]]; then + local bdevs + bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json) + for name in $disks; do + local bdev_bdf + bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs) + cat /sys/bus/pci/devices/$bdev_bdf/numa_node + done + else + for name in $disks; do + local bdf + # Not reading directly from /sys/block/nvme* because of a kernel bug + # which results in NUMA 0 always getting reported. 
+ bdf=$(cat /sys/block/$name/device/address) + cat /sys/bus/pci/devices/$bdf/numa_node + done + fi +} + +function get_disks() { + local plugin=$1 + local disk_cfg + + disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) + if [[ "$plugin" =~ "nvme" ]]; then + # PCI BDF address is enough for nvme-perf and nvme-fio-plugin, + # so just print them from configuration file + echo "${disk_cfg[*]}" + elif [[ "$plugin" =~ "bdev" ]]; then + # Generate NvmeXn1 bdev name configuration file for bdev-perf + # and bdev-fio-plugin + local bdevs + local disk_no + disk_no=${#disk_cfg[@]} + eval echo "Nvme{0..$((disk_no - 1))}n1" + else + # Find nvme block devices and only use the ones which + # are not mounted + for bdf in "${disk_cfg[@]}"; do + if is_bdf_not_mounted $bdf; then + local blkname + blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}') + echo $blkname + fi + done + fi +} + +function get_disks_on_numa() { + local devs=($1) + local numas=($2) + local numa_no=$3 + local disks_on_numa="" + local i + + for ((i = 0; i < ${#devs[@]}; i++)); do + if [ ${numas[$i]} = $numa_no ]; then + disks_on_numa=$((disks_on_numa + 1)) + fi + done + echo $disks_on_numa +} + +function create_fio_config() { + local disk_no=$1 + local plugin=$2 + local disks=($3) + local disks_numa=($4) + local cores=($5) + local total_disks=${#disks[@]} + local fio_job_section=() + local num_cores=${#cores[@]} + local disks_per_core=$((disk_no / num_cores)) + local disks_per_core_mod=$((disk_no % num_cores)) + local cores_numa + cores_numa=($(get_cores_numa_node "${cores[*]}")) + + # Following part of this function still leverages global variables a lot. + # It's a mix of local variables passed as aruments to function with global variables. This is messy. + # TODO: Modify this to be consistent with how variables are used here. Aim for using only + # local variables to get rid of globals as much as possible. 
+ desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\"" + cp "$testdir/config.fio.tmp" "$testdir/config.fio" + cat <<- EOF >> $testdir/config.fio + description=$desc + + rw=$RW + rwmixread=$MIX + bs=$BLK_SIZE + runtime=$RUNTIME + ramp_time=$RAMP_TIME + numjobs=$NUMJOBS + log_avg_msec=$SAMPLING_INT + EOF + + if $GTOD_REDUCE; then + echo "gtod_reduce=1" >> $testdir/config.fio + fi + + for i in "${!cores[@]}"; do + local m=0 #Counter of disks per NUMA node + local n=0 #Counter of all disks in test + core_numa=${cores_numa[$i]} + + total_disks_per_core=$disks_per_core + # Check how many "stray" disks are unassigned to CPU cores + # Assign one disk to current CPU core and substract it from the total of + # unassigned disks + if [[ "$disks_per_core_mod" -gt "0" ]]; then + total_disks_per_core=$((disks_per_core + 1)) + disks_per_core_mod=$((disks_per_core_mod - 1)) + fi + # SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread. + # Therefore, the per thread queue depth is set to the desired IODEPTH/device X the number of devices per thread. 
+ QD=$IODEPTH + if [[ "$NOIOSCALING" = false ]]; then + QD=$((IODEPTH * total_disks_per_core)) + fi + + fio_job_section+=("") + fio_job_section+=("[filename${i}]") + fio_job_section+=("iodepth=$QD") + fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}") + + while [[ "$m" -lt "$total_disks_per_core" ]]; do + # Try to add disks to job section if it's NUMA node matches NUMA + # for currently selected CPU + if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then + if [[ "$plugin" == "spdk-plugin-nvme" ]]; then + fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}") + elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then + fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}") + elif [[ "$plugin" =~ "kernel" ]]; then + fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}") + fi + m=$((m + 1)) + + #Mark numa of n'th disk as "x" to mark it as claimed for next loop iterations + disks_numa[$n]="x" + fi + n=$((n + 1)) + + # If there is no more disks with numa node same as cpu numa node, switch to + # other numa node, go back to start of loop and try again. + if [[ $n -ge $total_disks ]]; then + echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}" + echo "NVMe assignment for this CPU will be cross-NUMA." + if [[ "$core_numa" == "1" ]]; then + core_numa=0 + else + core_numa=1 + fi + n=0 + fi + done + done + + printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio + echo "INFO: Generated fio configuration file:" + cat $testdir/config.fio +} + +function preconditioning() { + local dev_name="" + local filename="" + local nvme_list + + HUGEMEM=8192 $rootdir/scripts/setup.sh + cp $testdir/config.fio.tmp $testdir/config.fio + echo "[Preconditioning]" >> $testdir/config.fio + + # Generate filename argument for FIO. + # We only want to target NVMes not bound to nvme driver. 
+ # If they're still bound to nvme that means they were skipped by + # setup.sh on purpose. + nvme_list=$(get_disks nvme) + for nvme in $nvme_list; do + dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1' + filename+=$(printf %s":" "$dev_name") + done + echo "** Preconditioning disks, this can take a while, depending on the size of disks." + run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \ + --rw=write --iodepth=32 --output-format=normal + rm -f $testdir/config.fio +} + +function get_results() { + local reads_pct + local writes_pct + + reads_pct=$(bc -l <<< "scale=3; $2/100") + writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct") + case "$1" in + iops) + iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE) + iops=${iops%.*} + echo $iops + ;; + mean_lat_usec) + mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE) + mean_lat=${mean_lat%.*} + echo $((mean_lat / 1000)) + ;; + p99_lat_usec) + p99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.000000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.000000\" // 0 * $writes_pct)" $TMP_RESULT_FILE) + p99_lat=${p99_lat%.*} + echo $((p99_lat / 1000)) + ;; + p99_99_lat_usec) + p99_99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.990000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.990000\" // 0 * $writes_pct)" $TMP_RESULT_FILE) + p99_99_lat=${p99_99_lat%.*} + echo $((p99_99_lat / 1000)) + ;; + stdev_usec) + stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE) + stdev=${stdev%.*} + echo $((stdev / 1000)) + ;; + mean_slat_usec) + mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE) + mean_slat=${mean_slat%.*} + echo $((mean_slat / 1000)) + ;; + mean_clat_usec) + mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" 
$TMP_RESULT_FILE) + mean_clat=${mean_clat%.*} + echo $((mean_clat / 1000)) + ;; + bw_Kibs) + bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE) + bw=${bw%.*} + echo $((bw)) + ;; + esac +} + +function get_bdevperf_results() { + case "$1" in + iops) + iops=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $2}') + iops=${iops%.*} + echo $iops + ;; + bw_Kibs) + bw_MBs=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $4}') + bw_MBs=${bw_MBs%.*} + echo $((bw_MBs * 1024)) + ;; + esac +} + +function get_nvmeperf_results() { + local iops + local bw_MBs + local mean_lat_usec + local max_lat_usec + local min_lat_usec + + read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)") + + # We need to get rid of the decimal spaces due + # to use of arithmetic expressions instead of "bc" for calculations + iops=${iops%.*} + bw_MBs=${bw_MBs%.*} + mean_lat_usec=${mean_lat_usec%.*} + min_lat_usec=${min_lat_usec%.*} + max_lat_usec=${max_lat_usec%.*} + + echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec" +} + +function run_spdk_nvme_fio() { + local plugin=$1 + echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." + if [[ "$plugin" = "spdk-plugin-nvme" ]]; then + LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk + elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then + LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096 + fi + + sleep 1 +} + +function run_nvme_fio() { + echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." 
+ $FIO_BIN $testdir/config.fio --output-format=json "$@" + sleep 1 +} + +function run_bdevperf() { + echo "** Running bdevperf test, this can take a while, depending on the run-time setting." + $bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock + sleep 1 +} + +function run_nvmeperf() { + # Prepare -r argument string for nvme perf command + local r_opt + local disks + + # Limit the number of disks to $1 if needed + disks=($(get_disks nvme)) + disks=("${disks[@]:0:$1}") + r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}") + + echo "** Running nvme perf test, this can take a while, depending on the run-time setting." + + # Run command in separate shell as this solves quoting issues related to r_opt var + $SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]" + sleep 1 +} + +function wait_for_nvme_reload() { + local nvmes=$1 + + shopt -s extglob + for disk in $nvmes; do + cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*" + until $cmd 2> /dev/null; do + echo "Waiting for full nvme driver reload..." + sleep 0.5 + done + done + shopt -q extglob +} + +function verify_disk_number() { + # Check if we have appropriate number of disks to carry out the test + disks=($(get_disks $PLUGIN)) + if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then + DISKNO=${#disks[@]} + elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! 
$DISKNO =~ ^[0-9]+$ ]]; then + echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})" + false + fi +} diff --git a/src/spdk/test/nvme/perf/config.fio.tmp b/src/spdk/test/nvme/perf/config.fio.tmp new file mode 100644 index 000000000..dfaea5df5 --- /dev/null +++ b/src/spdk/test/nvme/perf/config.fio.tmp @@ -0,0 +1,6 @@ +[global] +direct=1 +thread=1 +norandommap=1 +group_reporting=1 +time_based=1 diff --git a/src/spdk/test/nvme/perf/run_perf.sh b/src/spdk/test/nvme/perf/run_perf.sh new file mode 100755 index 000000000..133aaa75c --- /dev/null +++ b/src/spdk/test/nvme/perf/run_perf.sh @@ -0,0 +1,374 @@ +#!/usr/bin/env bash +set -e + +# Dir variables and sourcing common files +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) +plugin_dir=$rootdir/build/fio +bdevperf_dir=$rootdir/test/bdev/bdevperf +nvmeperf_dir=$rootdir/build/examples +source $testdir/common.sh +source $rootdir/scripts/common.sh || exit 1 +source $rootdir/test/common/autotest_common.sh + +# Global & default variables +declare -A KERNEL_ENGINES +KERNEL_ENGINES=( + ["kernel-libaio"]="--ioengine=libaio" + ["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100" + ["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100" + ["kernel-io-uring"]="--ioengine=io_uring") + +RW=randrw +MIX=100 +IODEPTH=256 +BLK_SIZE=4096 +RUNTIME=600 +RAMP_TIME=30 +NUMJOBS=1 +REPEAT_NO=3 +GTOD_REDUCE=false +SAMPLING_INT=0 +FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio +TMP_RESULT_FILE=$testdir/result.json +PLUGIN="nvme" +DISKCFG="" +BDEV_CACHE="" +BDEV_POOL="" +DISKNO="ALL" +CPUS_ALLOWED=1 +NOIOSCALING=false +PRECONDITIONING=true +CPUFREQ="" +PERFTOP=false +DPDKMEM=false +DATE="$(date +'%m_%d_%Y_%H%M%S')" + +function usage() { + set +x + [[ -n $2 ]] && ( + echo "$2" + echo "" + ) + echo "Run NVMe PMD/BDEV performance test. 
Change options for easier debug and setup configuration" + echo "Usage: $(basename $1) [options]" + echo "-h, --help Print help and exit" + echo + echo "Workload parameters:" + echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]" + echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]" + echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]" + echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]" + echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]" + echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before" + echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests." + echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]" + echo " Applicable only for fio-based tests." + echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]" + echo " Test result will be an average of repeated test runs." + echo " --gtod-reduce Enable fio gtod_reduce option. [default=$GTOD_REDUCE]" + echo " --sampling-int=INT Value for fio log_avg_msec parameters [default=$SAMPLING_INT]" + echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]" + echo " Applicable only for fio-based tests." + echo + echo "Test setup parameters:" + echo " --driver=STR Selects tool used for testing. Choices available:" + echo " - spdk-perf-nvme (SPDK nvme perf)" + echo " - spdk-perf-bdev (SPDK bdev perf)" + echo " - spdk-plugin-nvme (SPDK nvme fio plugin)" + echo " - spdk-plugin-bdev (SPDK bdev fio plugin)" + echo " - kernel-classic-polling" + echo " - kernel-hybrid-polling" + echo " - kernel-libaio" + echo " - kernel-io-uring" + echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test." 
+ echo " It consists a single column of PCI addresses. SPDK Bdev names will be assigned" + echo " and Kernel block device names detected." + echo " Lines starting with # are ignored as comments." + echo " --bdev-io-cache-size Set IO cache size for for SPDK bdev subsystem." + echo " --bdev-io-pool-size Set IO pool size for for SPDK bdev subsystem." + echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run." + echo " If =ALL then test on all found disk. [default=$DISKNO]" + echo " --cpu-allowed=INT/PATH Comma-separated list of CPU cores used to run the workload. Ranges allowed." + echo " Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]" + echo " --no-preconditioning Skip preconditioning" + echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]" + echo " --cpu-frequency=INT Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in" + echo " GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to" + echo " check list of available frequencies. Example: --cpu-frequency=1100000." + echo + echo "Other options:" + echo " --perftop Run perftop measurements on the same CPU cores as specified in --cpu-allowed option." + echo " --dpdk-mem-stats Dump DPDK memory stats during the test." 
+ set -x +} + +while getopts 'h-:' optchar; do + case "$optchar" in + -) + case "$OPTARG" in + help) + usage $0 + exit 0 + ;; + rw=*) RW="${OPTARG#*=}" ;; + rwmixread=*) MIX="${OPTARG#*=}" ;; + iodepth=*) IODEPTH="${OPTARG#*=}" ;; + block-size=*) BLK_SIZE="${OPTARG#*=}" ;; + run-time=*) RUNTIME="${OPTARG#*=}" ;; + ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;; + numjobs=*) NUMJOBS="${OPTARG#*=}" ;; + repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;; + gtod-reduce) GTOD_REDUCE=true ;; + sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;; + fio-bin=*) FIO_BIN="${OPTARG#*=}" ;; + driver=*) PLUGIN="${OPTARG#*=}" ;; + disk-config=*) + DISKCFG="${OPTARG#*=}" + if [[ ! -f "$DISKCFG" ]]; then + echo "Disk confiuration file $DISKCFG does not exist!" + exit 1 + fi + ;; + bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;; + bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;; + max-disk=*) DISKNO="${OPTARG#*=}" ;; + cpu-allowed=*) + CPUS_ALLOWED="${OPTARG#*=}" + if [[ -f "$CPUS_ALLOWED" ]]; then + CPUS_ALLOWED=$(cat "$CPUS_ALLOWED") + fi + ;; + no-preconditioning) PRECONDITIONING=false ;; + no-io-scaling) NOIOSCALING=true ;; + cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;; + perftop) PERFTOP=true ;; + dpdk-mem-stats) DPDKMEM=true ;; + *) + usage $0 echo "Invalid argument '$OPTARG'" + exit 1 + ;; + esac + ;; + h) + usage $0 + exit 0 + ;; + *) + usage $0 "Invalid argument '$optchar'" + exit 1 + ;; + esac +done + +result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE} +result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv +mkdir -p $result_dir +unset iops_disks bw mean_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec +echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > $result_file +printf "%s,%s,%s,%s,%s,%s,%s,%s\n" $RUNTIME $RAMP_TIME $PLUGIN $IODEPTH $BLK_SIZE $NO_CORES $RW $MIX >> $result_file +echo 
"num_of_disks,iops,avg_lat[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> $result_file + +trap 'rm -f *.state $testdir/bdev.conf; kill $perf_pid; wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT + +if [[ "$PLUGIN" =~ "bdev" ]]; then + create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL" +fi +verify_disk_number +DISK_NAMES=$(get_disks $PLUGIN) +DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES") +CORES=$(get_cores "$CPUS_ALLOWED") +NO_CORES_ARRAY=($CORES) +NO_CORES=${#NO_CORES_ARRAY[@]} + +if $PRECONDITIONING; then + preconditioning +fi + +if [[ "$PLUGIN" =~ "kernel" ]]; then + $rootdir/scripts/setup.sh reset + fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}" + + if [[ $PLUGIN = "kernel-classic-polling" ]]; then + for disk in $DISK_NAMES; do + echo -1 > /sys/block/$disk/queue/io_poll_delay + done + elif [[ $PLUGIN = "kernel-hybrid-polling" ]]; then + for disk in $DISK_NAMES; do + echo 0 > /sys/block/$disk/queue/io_poll_delay + done + elif [[ $PLUGIN = "kernel-io-uring" ]]; then + modprobe -rv nvme + modprobe nvme poll_queues=8 + wait_for_nvme_reload $DISK_NAMES + + backup_dir="/tmp/nvme_param_bak" + mkdir -p $backup_dir + + for disk in $DISK_NAMES; do + echo "INFO: Backing up device parameters for $disk" + sysfs=/sys/block/$disk/queue + mkdir -p $backup_dir/$disk + cat $sysfs/iostats > $backup_dir/$disk/iostats + cat $sysfs/rq_affinity > $backup_dir/$disk/rq_affinity + cat $sysfs/nomerges > $backup_dir/$disk/nomerges + cat $sysfs/io_poll_delay > $backup_dir/$disk/io_poll_delay + done + + for disk in $DISK_NAMES; do + echo "INFO: Setting device parameters for $disk" + sysfs=/sys/block/$disk/queue + echo 0 > $sysfs/iostats + echo 0 > $sysfs/rq_affinity + echo 2 > $sysfs/nomerges + echo 0 > $sysfs/io_poll_delay + done + fi +fi + +if [[ -n "$CPUFREQ" ]]; then + if [[ ! "$(cat /proc/cmdline)" =~ "intel_pstate=disable" ]]; then + echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options." 
+ false + else + cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)" + cpupower frequency-set -g userspace + cpupower frequency-set -f $CPUFREQ + fi +fi + +if $PERFTOP; then + echo "INFO: starting perf record on cores $CPUS_ALLOWED" + perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" & + perf_pid=$! +fi + +if $DPDKMEM; then + echo "INFO: waiting to generate DPDK memory usage" + wait_time=$((RUNTIME / 2)) + if [[ ! "$PLUGIN" =~ "perf" ]]; then + wait_time=$((wait_time + RAMP_TIME)) + fi + ( + sleep $wait_time + echo "INFO: generating DPDK memory usage" + $rootdir/scripts/rpc.py env_dpdk_get_mem_stats + ) & + dpdk_mem_pid=$! +fi + +#Run each workolad $REPEAT_NO times +for ((j = 0; j < REPEAT_NO; j++)); do + if [ $PLUGIN = "spdk-perf-bdev" ]; then + run_bdevperf > $TMP_RESULT_FILE + iops_disks=$((iops_disks + $(get_bdevperf_results iops))) + bw=$((bw + $(get_bdevperf_results bw_Kibs))) + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output + elif [ $PLUGIN = "spdk-perf-nvme" ]; then + run_nvmeperf $DISKNO > $TMP_RESULT_FILE + read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results) + + iops_disks=$((iops_disks + iops)) + bw=$((bw + bandwidth)) + mean_lat_disks_usec=$((mean_lat_disks_usec + mean_lat)) + min_lat_disks_usec=$((min_lat_disks_usec + min_lat)) + max_lat_disks_usec=$((max_lat_disks_usec + max_lat)) + + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output + else + create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES" + + if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then + run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" \ + "--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" + else + run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" \ + 
"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" + fi + + #Store values for every number of used disks + #Use recalculated value for mixread param in case rw mode is not rw. + rwmixread=$MIX + if [[ $RW = *"read"* ]]; then + rwmixread=100 + elif [[ $RW = *"write"* ]]; then + rwmixread=0 + fi + iops_disks=$((iops_disks + $(get_results iops $rwmixread))) + mean_lat_disks_usec=$((mean_lat_disks_usec + $(get_results mean_lat_usec $rwmixread))) + p99_lat_disks_usec=$((p99_lat_disks_usec + $(get_results p99_lat_usec $rwmixread))) + p99_99_lat_disks_usec=$((p99_99_lat_disks_usec + $(get_results p99_99_lat_usec $rwmixread))) + stdev_disks_usec=$((stdev_disks_usec + $(get_results stdev_usec $rwmixread))) + + mean_slat_disks_usec=$((mean_slat_disks_usec + $(get_results mean_slat_usec $rwmixread))) + mean_clat_disks_usec=$((mean_clat_disks_usec + $(get_results mean_clat_usec $rwmixread))) + bw=$((bw + $(get_results bw_Kibs $rwmixread))) + + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.json + cp $testdir/config.fio $result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.fio + rm -f $testdir/config.fio + fi +done + +if $PERFTOP; then + echo "INFO: Stopping perftop measurements." 
+ kill $perf_pid + wait $perf_pid || true + perf report -i "$testdir/perf.data" > $result_dir/perftop_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt + rm -f "$testdir/perf.data" +fi + +if $DPDKMEM; then + mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt + echo "INFO: DPDK memory usage saved in $result_dir" +fi + +#Write results to csv file +iops_disks=$((iops_disks / REPEAT_NO)) +bw=$((bw / REPEAT_NO)) +if [[ "$PLUGIN" =~ "plugin" ]]; then + mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO)) + p99_lat_disks_usec=$((p99_lat_disks_usec / REPEAT_NO)) + p99_99_lat_disks_usec=$((p99_99_lat_disks_usec / REPEAT_NO)) + stdev_disks_usec=$((stdev_disks_usec / REPEAT_NO)) + mean_slat_disks_usec=$((mean_slat_disks_usec / REPEAT_NO)) + mean_clat_disks_usec=$((mean_clat_disks_usec / REPEAT_NO)) +elif [[ "$PLUGIN" == "spdk-perf-bdev" ]]; then + mean_lat_disks_usec=0 + p99_lat_disks_usec=0 + p99_99_lat_disks_usec=0 + stdev_disks_usec=0 + mean_slat_disks_usec=0 + mean_clat_disks_usec=0 +elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then + mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO)) + p99_lat_disks_usec=0 + p99_99_lat_disks_usec=0 + stdev_disks_usec=0 + mean_slat_disks_usec=0 + mean_clat_disks_usec=0 +fi + +printf "%s,%s,%s,%s,%s,%s,%s,%s,%s\n" ${DISKNO} ${iops_disks} ${mean_lat_disks_usec} ${p99_lat_disks_usec} \ + ${p99_99_lat_disks_usec} ${stdev_disks_usec} ${mean_slat_disks_usec} ${mean_clat_disks_usec} ${bw} >> $result_file + +if [[ -n "$CPUFREQ" ]]; then + cpupower frequency-set -g $cpu_governor +fi + +if [ $PLUGIN = "kernel-io-uring" ]; then + # Reload the nvme driver so that other test runs are not affected + modprobe -rv nvme + modprobe nvme + wait_for_nvme_reload $DISK_NAMES + + for disk in $DISK_NAMES; do + echo "INFO: Restoring device parameters for $disk" + sysfs=/sys/block/$disk/queue + cat $backup_dir/$disk/iostats > $sysfs/iostats + cat $backup_dir/$disk/rq_affinity 
> $sysfs/rq_affinity + cat $backup_dir/$disk/nomerges > $sysfs/nomerges + cat $backup_dir/$disk/io_poll_delay > $sysfs/io_poll_delay + done +fi +rm -f $testdir/bdev.conf $testdir/config.fio |