From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:54:28 +0200
Subject: Adding upstream version 18.2.2.

Signed-off-by: Daniel Baumann
---
 .../block_cache_analyzer/block_cache_pysim.sh | 156 +++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh

diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
new file mode 100644
index 000000000..295f734aa
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to run a batch of pysims and combine the individual pysim output files.
+#
+# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs
+# trace_file_path: The file path that stores the traces.
+# result_dir: The directory to store pysim results. The output files from a pysim are stored in result_dir/ml.
+# downsample_size: The downsample size used to collect the trace.
+# warmup_seconds: The number of seconds used for warmup.
+# max_jobs: The max number of concurrent pysims to run.
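+#
+# Example invocation (the paths and values below are illustrative, not
+# prescribed by this script): replay a trace collected with downsample
+# size 100, warm up for one hour (3600 seconds), and run at most 8 pysims
+# concurrently:
+#   bash block_cache_pysim.sh /path/to/trace /path/to/results 100 3600 8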
+
+# Install required packages to run simulations.
+# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel
+ulimit -c 0
+
+if [ $# -ne 5 ]; then
+  echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs"
+  exit 1
+fi
+
+trace_file="$1"
+result_dir="$2"
+downsample_size="$3"
+warmup_seconds="$4"
+max_jobs="$5"
+max_num_accesses=100000000
+current_jobs=1
+
+ml_tmp_result_dir="$result_dir/ml"
+rm -rf "$ml_tmp_result_dir"
+mkdir -p "$result_dir"
+mkdir -p "$ml_tmp_result_dir"
+
+# Report miss ratio in the trace.
+current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+for cf_name in "all"
+do
+for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T"
+do
+for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid" #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace"
+do
+  if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then
+    # We only need to collect miss ratios observed in the trace once.
+    continue
+  fi
+  while [ "$current_jobs" -ge "$max_jobs" ]
+  do
+    sleep 10
+    echo "Waiting for jobs to complete. Number of running jobs: $current_jobs"
+    current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+    echo "Waiting for jobs to complete. Number of running jobs: $current_jobs"
+  done
+  output="log-ml-$cache_type-$cache_size-$cf_name"
+  echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs."
+  nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" &
+  current_jobs=$((current_jobs+1))
+done
+done
+done
+
+# Wait for all jobs to complete.
+while [ $current_jobs -gt 0 ]
+do
+  sleep 10
+  echo "Waiting for jobs to complete. Number of running jobs: $current_jobs"
+  current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+  echo "Waiting for jobs to complete. Number of running jobs: $current_jobs"
+done
+
+echo "Combine individual pysim output files"
+
+rm -rf "$result_dir"/ml_*
+for header in "header-" "data-"
+do
+for fn in "$ml_tmp_result_dir"/*
+do
+  sum_file=""
+  time_unit=""
+  capacity=""
+  target_cf_name=""
+  if [[ $fn == *"timeline"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit_index=0
+    capacity_index=0
+    for i in "${elements[@]}"
+    do
+      if [[ $i == "timeline" ]]; then
+        break
+      fi
+      time_unit_index=$((time_unit_index+1))
+    done
+    time_unit_index=$((time_unit_index+1))
+    capacity_index=$((time_unit_index+2))
+    target_cf_name_index=$((time_unit_index+3))
+    time_unit="${elements[$time_unit_index]}_"
+    capacity="${elements[$capacity_index]}_"
+    target_cf_name="${elements[$target_cf_name_index]}_"
+  fi
+
+  if [[ $fn == *"${header}ml-policy-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline"
+  fi
+  if [[ $fn == *"${header}ml-policy-ratio-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline"
+  fi
+  if [[ $fn == *"${header}ml-miss-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline"
+  fi
+  if [[ $fn == *"${header}ml-miss-ratio-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline"
+  fi
+  if [[ $fn == *"${header}ml-mrc"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${target_cf_name}_mrc"
+  fi
+  if [[ $fn == *"${header}ml-avgmb"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit=${elements[3]}
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb"
+  fi
+  if [[ $fn == *"${header}ml-p95mb"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit=${elements[3]}
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb"
+  fi
+  if [[ $sum_file == "" ]]; then
+    continue
+  fi
+  if [[ $header == "header-" ]]; then
+    if [ -e "$sum_file" ]; then
+      continue
+    fi
+  fi
+  cat "$fn" >> "$sum_file"
+done
+done
+
+echo "Done"
+for fn in "$result_dir"/*
+do
+  if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then
+    # Sort MRC file by cache_type and cache_size.
+    tmp_file="$result_dir/tmp_mrc"
+    sort -t ',' -k1,1 -k4,4n "$fn" > "$tmp_file"
+    cat "$tmp_file" > "$fn"
+    rm -f "$tmp_file"
+  fi
+done
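+
+# For reference: the combined files land directly under result_dir, named by
+# the patterns assembled above, e.g. ml_<cf>_mrc, ml_<time_unit>_<cf>_avgmb,
+# ml_<time_unit>_<cf>_p95mb, and ml_<cf>_<capacity>_<time_unit>_miss_ratio_timeline.
+# The placeholders are illustrative; actual names are parsed out of the pysim
+# output file names by the loop above.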