Diffstat:
 -rwxr-xr-x  src/spdk/test/nvme/perf/common.sh | 471
 1 file changed, 471 insertions, 0 deletions
diff --git a/src/spdk/test/nvme/perf/common.sh b/src/spdk/test/nvme/perf/common.sh
new file mode 100755
index 000000000..ddd01ec52
--- /dev/null
+++ b/src/spdk/test/nvme/perf/common.sh
@@ -0,0 +1,471 @@
+#!/usr/bin/env bash
+
+function discover_bdevs() {
+	local rootdir=$1
+	local config_file=$2
+	local cfg_type=$3
+	local wait_for_spdk_bdev=${4:-30}
+	local rpc_server=/var/tmp/spdk-discover-bdevs.sock
+
+	if [ ! -e $config_file ]; then
+		echo "Invalid Configuration File: $config_file"
+		return 1
+	fi
+
+	if [ -z $cfg_type ]; then
+		cfg_type="-c"
+	fi
+
+	# Start the bdev service to query for the list of available
+	# bdevs.
+	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
+		$cfg_type $config_file &> /dev/null &
+	stubpid=$!
+	while ! [ -e /var/run/spdk_bdev0 ]; do
+		# If this counter drops to zero, errexit will kick in and abort the test
+		((wait_for_spdk_bdev--))
+		sleep 1
+	done
+
+	# Get all of the bdevs
+	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs
+
+	# Shut down the bdev service
+	kill $stubpid
+	wait $stubpid
+	rm -f /var/run/spdk_bdev0
+}
+
+function create_spdk_bdev_conf() {
+	local output
+	local disk_cfg
+	local bdev_io_cache_size=$1
+	local bdev_io_pool_size=$2
+	local bdev_json_cfg=()
+	local bdev_opts=()
+
+	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
+
+	if [[ -n "$bdev_io_cache_size" ]]; then
+		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
+	fi
+
+	if [[ -n "$bdev_io_pool_size" ]]; then
+		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
+	fi
+
+	local IFS=","
+	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
+		bdev_json_cfg+=("$(
+			cat <<- JSON
+				{
+					"method": "bdev_set_options",
+					"params": {
+						${bdev_opts[*]}
+					}
+				}
+			JSON
+		)")
+	fi
+
+	for i in "${!disk_cfg[@]}"; do
+		bdev_json_cfg+=("$(
+			cat <<- JSON
+				{
+					"method": "bdev_nvme_attach_controller",
+					"params": {
+						"trtype": "PCIe",
+						"name": "Nvme${i}",
+						"traddr": "${disk_cfg[i]}"
+					}
+				}
+			JSON
+		)")
+	done
+
+	local IFS=","
+	jq -r '.' <<- JSON > $testdir/bdev.conf
+		{
+			"subsystems": [
+				{
+					"subsystem": "bdev",
+					"config": [
+						${bdev_json_cfg[*]}
+					]
+				}
+			]
+		}
+	JSON
+}
+
+function is_bdf_not_mounted() {
+	local bdf=$1
+	local blkname
+	local mountpoints
+	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
+	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
+	return $mountpoints
+}
+
+function get_cores() {
+	local cpu_list="$1"
+	for cpu in ${cpu_list//,/ }; do
+		echo $cpu
+	done
+}
+
+function get_cores_numa_node() {
+	local cores=$1
+	for core in $cores; do
+		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
+	done
+}
+
+function get_numa_node() {
+	local plugin=$1
+	local disks=$2
+	if [[ "$plugin" =~ "nvme" ]]; then
+		for bdf in $disks; do
+			local driver
+			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
+			# Use this check to omit blacklisted devices (not bound to a driver by the setup.sh script)
+			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
+				cat /sys/bus/pci/devices/$bdf/numa_node
+			fi
+		done
+	elif [[ "$plugin" =~ "bdev" ]]; then
+		local bdevs
+		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json)
+		for name in $disks; do
+			local bdev_bdf
+			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
+			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
+		done
+	else
+		for name in $disks; do
+			local bdf
+			# Not reading directly from /sys/block/nvme* because of a kernel bug
+			# which results in NUMA 0 always getting reported.
+			bdf=$(cat /sys/block/$name/device/address)
+			cat /sys/bus/pci/devices/$bdf/numa_node
+		done
+	fi
+}
+
+function get_disks() {
+	local plugin=$1
+	local disk_cfg
+
+	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
+	if [[ "$plugin" =~ "nvme" ]]; then
+		# PCI BDF addresses are enough for nvme-perf and nvme-fio-plugin,
+		# so just print them from the configuration file
+		echo "${disk_cfg[*]}"
+	elif [[ "$plugin" =~ "bdev" ]]; then
+		# Generate NvmeXn1 bdev names for bdev-perf
+		# and bdev-fio-plugin
+		local bdevs
+		local disk_no
+		disk_no=${#disk_cfg[@]}
+		eval echo "Nvme{0..$((disk_no - 1))}n1"
+	else
+		# Find nvme block devices and only use the ones which
+		# are not mounted
+		for bdf in "${disk_cfg[@]}"; do
+			if is_bdf_not_mounted $bdf; then
+				local blkname
+				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
+				echo $blkname
+			fi
+		done
+	fi
+}
+
+function get_disks_on_numa() {
+	local devs=($1)
+	local numas=($2)
+	local numa_no=$3
+	local disks_on_numa=0
+	local i
+
+	for ((i = 0; i < ${#devs[@]}; i++)); do
+		if [ ${numas[$i]} = $numa_no ]; then
+			disks_on_numa=$((disks_on_numa + 1))
+		fi
+	done
+	echo $disks_on_numa
+}
+
+function create_fio_config() {
+	local disk_no=$1
+	local plugin=$2
+	local disks=($3)
+	local disks_numa=($4)
+	local cores=($5)
+	local total_disks=${#disks[@]}
+	local fio_job_section=()
+	local num_cores=${#cores[@]}
+	local disks_per_core=$((disk_no / num_cores))
+	local disks_per_core_mod=$((disk_no % num_cores))
+	local cores_numa
+	cores_numa=($(get_cores_numa_node "${cores[*]}"))
+
+	# The following part of this function still leverages global variables a lot.
+	# It's a messy mix of local variables passed as arguments and global variables.
+	# TODO: Modify this to be consistent with how variables are used here. Aim for using only
+	# local variables to get rid of globals as much as possible.
+	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
+	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
+	cat <<- EOF >> $testdir/config.fio
+		description=$desc
+
+		rw=$RW
+		rwmixread=$MIX
+		bs=$BLK_SIZE
+		runtime=$RUNTIME
+		ramp_time=$RAMP_TIME
+		numjobs=$NUMJOBS
+		log_avg_msec=$SAMPLING_INT
+	EOF
+
+	if $GTOD_REDUCE; then
+		echo "gtod_reduce=1" >> $testdir/config.fio
+	fi
+
+	for i in "${!cores[@]}"; do
+		local m=0 # Counter of disks per NUMA node
+		local n=0 # Counter of all disks in test
+		core_numa=${cores_numa[$i]}
+
+		total_disks_per_core=$disks_per_core
+		# Check how many "stray" disks are still unassigned to CPU cores.
+		# Assign one disk to the current CPU core and subtract it from the total of
+		# unassigned disks.
+		if [[ "$disks_per_core_mod" -gt "0" ]]; then
+			total_disks_per_core=$((disks_per_core + 1))
+			disks_per_core_mod=$((disks_per_core_mod - 1))
+		fi
+		# The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
+		# Therefore, the per-thread queue depth is set to the desired IODEPTH per device multiplied by the number of devices per thread.
+		QD=$IODEPTH
+		if [[ "$NOIOSCALING" = false ]]; then
+			QD=$((IODEPTH * total_disks_per_core))
+		fi
+
+		fio_job_section+=("")
+		fio_job_section+=("[filename${i}]")
+		fio_job_section+=("iodepth=$QD")
+		fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
+
+		while [[ "$m" -lt "$total_disks_per_core" ]]; do
+			# Try to add disks to the job section if their NUMA node matches the NUMA
+			# node of the currently selected CPU
+			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
+				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
+					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
+				elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
+					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
+				elif [[ "$plugin" =~ "kernel" ]]; then
+					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
+				fi
+				m=$((m + 1))
+
+				# Mark the NUMA node of the n'th disk as "x" to flag it as claimed for the next loop iterations
+				disks_numa[$n]="x"
+			fi
+			n=$((n + 1))
+
+			# If there are no more disks with a NUMA node matching the CPU's NUMA node, switch to
+			# the other NUMA node, go back to the start of the loop and try again.
+			if [[ $n -ge $total_disks ]]; then
+				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
+				echo "NVMe assignment for this CPU will be cross-NUMA."
+				if [[ "$core_numa" == "1" ]]; then
+					core_numa=0
+				else
+					core_numa=1
+				fi
+				n=0
+			fi
+		done
+	done
+
+	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
+	echo "INFO: Generated fio configuration file:"
+	cat $testdir/config.fio
+}
+
+function preconditioning() {
+	local dev_name=""
+	local filename=""
+	local nvme_list
+
+	HUGEMEM=8192 $rootdir/scripts/setup.sh
+	cp $testdir/config.fio.tmp $testdir/config.fio
+	echo "[Preconditioning]" >> $testdir/config.fio
+
+	# Generate the filename argument for fio.
+	# We only want to target NVMes not bound to the kernel nvme driver.
+	# If they're still bound to nvme, that means they were skipped by
+	# setup.sh on purpose.
+	nvme_list=$(get_disks nvme)
+	for nvme in $nvme_list; do
+		dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
+		filename+=$(printf %s":" "$dev_name")
+	done
+	echo "** Preconditioning disks, this can take a while, depending on the size of the disks."
+	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
+		--rw=write --iodepth=32 --output-format=normal
+	rm -f $testdir/config.fio
+}
+
+function get_results() {
+	local reads_pct
+	local writes_pct
+
+	reads_pct=$(bc -l <<< "scale=3; $2/100")
+	writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")
+	case "$1" in
+		iops)
+			iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
+			iops=${iops%.*}
+			echo $iops
+			;;
+		mean_lat_usec)
+			mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+			mean_lat=${mean_lat%.*}
+			echo $((mean_lat / 1000))
+			;;
+		p99_lat_usec)
+			p99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.000000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.000000\" // 0 * $writes_pct)" $TMP_RESULT_FILE)
+			p99_lat=${p99_lat%.*}
+			echo $((p99_lat / 1000))
+			;;
+		p99_99_lat_usec)
+			p99_99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.990000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.990000\" // 0 * $writes_pct)" $TMP_RESULT_FILE)
+			p99_99_lat=${p99_99_lat%.*}
+			echo $((p99_99_lat / 1000))
+			;;
+		stdev_usec)
+			stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE)
+			stdev=${stdev%.*}
+			echo $((stdev / 1000))
+			;;
+		mean_slat_usec)
+			mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+			mean_slat=${mean_slat%.*}
+			echo $((mean_slat / 1000))
+			;;
+		mean_clat_usec)
+			mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
+			mean_clat=${mean_clat%.*}
+			echo $((mean_clat / 1000))
+			;;
+		bw_Kibs)
+			bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
+			bw=${bw%.*}
+			echo $((bw))
+			;;
+	esac
+}
+
+function get_bdevperf_results() {
+	case "$1" in
+		iops)
+			iops=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $2}')
+			iops=${iops%.*}
+			echo $iops
+			;;
+		bw_Kibs)
+			bw_MBs=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $4}')
+			bw_MBs=${bw_MBs%.*}
+			echo $((bw_MBs * 1024))
+			;;
+	esac
+}
+
+function get_nvmeperf_results() {
+	local iops
+	local bw_MBs
+	local mean_lat_usec
+	local max_lat_usec
+	local min_lat_usec
+
+	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
+
+	# We need to get rid of the decimal places because arithmetic
+	# expressions are used for the calculations instead of "bc"
+	iops=${iops%.*}
+	bw_MBs=${bw_MBs%.*}
+	mean_lat_usec=${mean_lat_usec%.*}
+	min_lat_usec=${min_lat_usec%.*}
+	max_lat_usec=${max_lat_usec%.*}
+
+	echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
+}
+
+function run_spdk_nvme_fio() {
+	local plugin=$1
+	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time settings."
+	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
+		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
+	elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
+		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
+	fi
+
+	sleep 1
+}
+
+function run_nvme_fio() {
+	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time settings."
+	$FIO_BIN $testdir/config.fio --output-format=json "$@"
+	sleep 1
+}
+
+function run_bdevperf() {
+	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
+	$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock
+	sleep 1
+}
+
+function run_nvmeperf() {
+	# Prepare the -r argument string for the nvme perf command
+	local r_opt
+	local disks
+
+	# Limit the number of disks to $1 if needed
+	disks=($(get_disks nvme))
+	disks=("${disks[@]:0:$1}")
+	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")
+
+	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."
+
+	# Run the command in a separate shell as this solves quoting issues related to the r_opt var
+	$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
+	sleep 1
+}
+
+function wait_for_nvme_reload() {
+	local nvmes=$1
+
+	shopt -s extglob
+	for disk in $nvmes; do
+		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
+		until $cmd 2> /dev/null; do
+			echo "Waiting for full nvme driver reload..."
+			sleep 0.5
+		done
+	done
+	shopt -q extglob
+}
+
+function verify_disk_number() {
+	# Check if we have an appropriate number of disks to carry out the test
+	disks=($(get_disks $PLUGIN))
+	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
+		DISKNO=${#disks[@]}
+	elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
+		echo "error: Requested number of devices ($DISKNO) is not a valid number or is larger than the number of devices found (${#disks[@]})"
+		false
+	fi
+}