diff options
Diffstat (limited to 'tools/power/rapl.cpp')
-rw-r--r-- | tools/power/rapl.cpp | 874 |
1 files changed, 874 insertions, 0 deletions
diff --git a/tools/power/rapl.cpp b/tools/power/rapl.cpp new file mode 100644 index 0000000000..1aa5fcf6ee --- /dev/null +++ b/tools/power/rapl.cpp @@ -0,0 +1,874 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// This program provides processor power estimates. It does this by reading +// model-specific registers (MSRs) that are part of Intel's Running Average Power +// Limit (RAPL) interface. These MSRs provide good quality estimates of the +// energy consumption of up to four system components: +// - PKG: the entire processor package; +// - PP0: the cores (a subset of the package); +// - PP1: the GPU (a subset of the package); +// - DRAM: main memory. +// +// For more details about RAPL, see section 14.9 of Volume 3 of the "Intel 64 +// and IA-32 Architectures Software Developer's Manual", Order Number 325384. +// +// This program exists because there are no existing tools on Mac that can +// obtain all four RAPL estimates. (|powermetrics| can obtain the package +// estimate, but not the others. Intel Power Gadget can obtain the package and +// cores estimates.) +// +// On Linux |perf| can obtain all four estimates (as Joules, which are easily +// converted to Watts), but this program is implemented for Linux because it's +// not too hard to do, and that gives us multi-platform consistency. +// +// This program does not support Windows, unfortunately. It's not obvious how +// to access the RAPL MSRs on Windows. +// +// This program deliberately uses only standard libraries and avoids +// Mozilla-specific code, to make it easy to compile and test on different +// machines.
+ +#include <assert.h> +#include <getopt.h> +#include <math.h> +#include <signal.h> +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <unistd.h> + +#include <algorithm> +#include <numeric> +#include <vector> + +//--------------------------------------------------------------------------- +// Utilities +//--------------------------------------------------------------------------- + +// The value of argv[0] passed to main(). Used in error messages. +static const char* gArgv0; + +static void Abort(const char* aFormat, ...) { + va_list vargs; + va_start(vargs, aFormat); + fprintf(stderr, "%s: ", gArgv0); + vfprintf(stderr, aFormat, vargs); + fprintf(stderr, "\n"); + va_end(vargs); + + exit(1); +} + +static void CmdLineAbort(const char* aMsg) { + if (aMsg) { + fprintf(stderr, "%s: %s\n", gArgv0, aMsg); + } + fprintf(stderr, "Use --help for more information.\n"); + exit(1); +} + +// A special value that represents an estimate from an unsupported RAPL domain. +static const double kUnsupported_j = -1.0; + +// Print to stdout and flush it, so that the output appears immediately even if +// being redirected through |tee| or anything like that. +static void PrintAndFlush(const char* aFormat, ...) { + va_list vargs; + va_start(vargs, aFormat); + vfprintf(stdout, aFormat, vargs); + va_end(vargs); + + fflush(stdout); +} + +//--------------------------------------------------------------------------- +// Mac-specific code +//--------------------------------------------------------------------------- + +#if defined(__APPLE__) + +// Because of the pkg_energy_statistics_t::pkes_version check below, the +// earliest OS X version this code will work with is 10.9.0 (xnu-2422.1.72). + +# include <sys/types.h> +# include <sys/sysctl.h> + +// OS X has four kinds of system calls: +// +// 1. Mach traps; +// 2. UNIX system calls; +// 3. machine-dependent calls; +// 4. diagnostic calls. 
//
// (See "Mac OS X and iOS Internals" by Jonathan Levin for more details.)
//
// The last category has a single call named diagCall() or diagCall64(). Its
// mode is controlled by its first argument, and one of the modes allows access
// to the Intel RAPL MSRs.
//
// The interface to diagCall64() is not exported, so we have to import some
// definitions from the XNU kernel. All imported definitions are annotated with
// the XNU source file they come from, and information about what XNU versions
// they were introduced in and (if relevant) modified.

// The diagCall64() mode.
// From osfmk/i386/Diagnostics.h
// - In 10.8.4 (xnu-2050.24.15) this value was introduced. (In 10.8.3 the value
//   17 was used for dgGzallocTest.)
//
// This is the mode value that diagCall64_dgPowerStat() passes as the first
// argument of diagCall64().
#  define dgPowerStat 17

// From osfmk/i386/cpu_data.h
// - In 10.8.5 these values were introduced, along with core_energy_stat_t.
//
// These are the lengths of the crtimes/citimes arrays in core_energy_stat_t
// and of the pkg_rtimes/pkg_itimes arrays in pkg_energy_statistics_t, so they
// determine the layout of both structs.
#  define CPU_RTIME_BINS (12)
#  define CPU_ITIME_BINS (CPU_RTIME_BINS)

// core_energy_stat_t and pkg_energy_statistics_t are both from
// osfmk/i386/Diagnostics.c.
// - In 10.8.4 (xnu-2050.24.15) both structs were introduced, but with many
//   fewer fields.
// - In 10.8.5 (xnu-2050.48.11) both structs were substantially expanded, with
//   numerous new fields.
// - In 10.9.0 (xnu-2422.1.72) pkg_energy_statistics_t::pkes_version was added.
//   diagCall64(dgPowerStat) fills it with '1' in all versions since (up to
//   10.10.2 at time of writing).
// - in 10.10.2 (xnu-2782.10.72) core_energy_stat_t::gpmcs was conditionally
//   added, if DIAG_ALL_PMCS is true. (DIAG_ALL_PMCS is not even defined in the
//   source code, but it could be defined at compile-time via compiler flags.)
//   pkg_energy_statistics_t::pkes_version did not change, though.
+ +typedef struct { + uint64_t caperf; + uint64_t cmperf; + uint64_t ccres[6]; + uint64_t crtimes[CPU_RTIME_BINS]; + uint64_t citimes[CPU_ITIME_BINS]; + uint64_t crtime_total; + uint64_t citime_total; + uint64_t cpu_idle_exits; + uint64_t cpu_insns; + uint64_t cpu_ucc; + uint64_t cpu_urc; +# if DIAG_ALL_PMCS // Added in 10.10.2 (xnu-2782.10.72). + uint64_t gpmcs[4]; // Added in 10.10.2 (xnu-2782.10.72). +# endif /* DIAG_ALL_PMCS */ // Added in 10.10.2 (xnu-2782.10.72). +} core_energy_stat_t; + +typedef struct { + uint64_t pkes_version; // Added in 10.9.0 (xnu-2422.1.72). + uint64_t pkg_cres[2][7]; + + // This is read from MSR 0x606, which Intel calls MSR_RAPL_POWER_UNIT + // and XNU calls MSR_IA32_PKG_POWER_SKU_UNIT. + uint64_t pkg_power_unit; + + // These are the four fields for the four RAPL domains. For each field + // we list: + // + // - the corresponding MSR number; + // - Intel's name for that MSR; + // - XNU's name for that MSR; + // - which Intel processors the MSR is supported on. + // + // The last of these is determined from chapter 35 of Volume 3 of the + // "Intel 64 and IA-32 Architecture's Software Developer's Manual", + // Order Number 325384. (Note that chapter 35 contradicts section 14.9 + // to some degree.) + + // 0x611 == MSR_PKG_ENERGY_STATUS == MSR_IA32_PKG_ENERGY_STATUS + // Atom (various), Sandy Bridge, Next Gen Xeon Phi (model 0x57). + uint64_t pkg_energy; + + // 0x639 == MSR_PP0_ENERGY_STATUS == MSR_IA32_PP0_ENERGY_STATUS + // Atom (various), Sandy Bridge, Next Gen Xeon Phi (model 0x57). + uint64_t pp0_energy; + + // 0x641 == MSR_PP1_ENERGY_STATUS == MSR_PP1_ENERGY_STATUS + // Sandy Bridge, Haswell. 
+ uint64_t pp1_energy; + + // 0x619 == MSR_DRAM_ENERGY_STATUS == MSR_IA32_DDR_ENERGY_STATUS + // Xeon E5, Xeon E5 v2, Haswell/Haswell-E, Next Gen Xeon Phi (model + // 0x57) + uint64_t ddr_energy; + + uint64_t llc_flushed_cycles; + uint64_t ring_ratio_instantaneous; + uint64_t IA_frequency_clipping_cause; + uint64_t GT_frequency_clipping_cause; + uint64_t pkg_idle_exits; + uint64_t pkg_rtimes[CPU_RTIME_BINS]; + uint64_t pkg_itimes[CPU_ITIME_BINS]; + uint64_t mbus_delay_time; + uint64_t mint_delay_time; + uint32_t ncpus; + core_energy_stat_t cest[]; +} pkg_energy_statistics_t; + +static int diagCall64(uint64_t aMode, void* aBuf) { + // We cannot use syscall() here because it doesn't work with diagnostic + // system calls -- it raises SIGSYS if you try. So we have to use asm. + +# ifdef __x86_64__ + // The 0x40000 prefix indicates it's a diagnostic system call. The 0x01 + // suffix indicates the syscall number is 1, which also happens to be the + // only diagnostic system call. See osfmk/mach/i386/syscall_sw.h for more + // details. + static const uint64_t diagCallNum = 0x4000001; + uint64_t rv; + + __asm__ __volatile__( + "syscall" + + // Return value goes in "a" (%rax). + : /* outputs */ "=a"(rv) + + // The syscall number goes in "0", a synonym (from outputs) for "a" + // (%rax). The syscall arguments go in "D" (%rdi) and "S" (%rsi). + : /* inputs */ "0"(diagCallNum), "D"(aMode), "S"(aBuf) + + // The |syscall| instruction clobbers %rcx, %r11, and %rflags ("cc"). And + // this particular syscall also writes memory (aBuf). + : /* clobbers */ "rcx", "r11", "cc", "memory"); + return rv; +# else +# error Sorry, only x86-64 is supported +# endif +} + +static void diagCall64_dgPowerStat(pkg_energy_statistics_t* aPkes) { + static const uint64_t supported_version = 1; + + // Write an unsupported version number into pkes_version so that the check + // below cannot succeed by dumb luck. 
+ aPkes->pkes_version = supported_version - 1; + + // diagCall64() returns 1 on success, and 0 on failure (which can only happen + // if the mode is unrecognized, e.g. in 10.7.x or earlier versions). + if (diagCall64(dgPowerStat, aPkes) != 1) { + Abort("diagCall64() failed"); + } + + if (aPkes->pkes_version != 1) { + Abort("unexpected pkes_version: %llu", aPkes->pkes_version); + } +} + +class RAPL { + bool mIsGpuSupported; // Is the GPU domain supported by the processor? + bool mIsRamSupported; // Is the RAM domain supported by the processor? + + // The DRAM domain on Haswell servers has a fixed energy unit (1/65536 J == + // 15.3 microJoules) which is different to the power unit MSR. (See the + // "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, Volume 2 of + // 2, Registers" datasheet, September 2014, Reference Number: 330784-001.) + // This field records whether the quirk is present. + bool mHasRamUnitsQuirk; + + // The abovementioned 15.3 microJoules value. + static const double kQuirkyRamJoulesPerTick; + + // The previous sample's MSR values. + uint64_t mPrevPkgTicks; + uint64_t mPrevPp0Ticks; + uint64_t mPrevPp1Ticks; + uint64_t mPrevDdrTicks; + + // The struct passed to diagCall64(). + pkg_energy_statistics_t* mPkes; + + public: + RAPL() : mHasRamUnitsQuirk(false) { + // Work out which RAPL MSRs this CPU model supports. + int cpuModel; + size_t size = sizeof(cpuModel); + if (sysctlbyname("machdep.cpu.model", &cpuModel, &size, NULL, 0) != 0) { + Abort("sysctlbyname(\"machdep.cpu.model\") failed"); + } + + // This is similar to arch/x86/kernel/cpu/perf_event_intel_rapl.c in + // linux-4.1.5/. + // + // By linux-5.6.14/, this stuff had moved into + // arch/x86/events/intel/rapl.c, which references processor families in + // arch/x86/include/asm/intel-family.h. + switch (cpuModel) { + case 0x2a: // Sandy Bridge + case 0x3a: // Ivy Bridge + // Supports package, cores, GPU. 
+ mIsGpuSupported = true; + mIsRamSupported = false; + break; + + case 0x3f: // Haswell X + case 0x4f: // Broadwell X + case 0x55: // Skylake X + case 0x56: // Broadwell D + // Supports package, cores, RAM. Has the units quirk. + mIsGpuSupported = false; + mIsRamSupported = true; + mHasRamUnitsQuirk = true; + break; + + case 0x2d: // Sandy Bridge X + case 0x3e: // Ivy Bridge X + // Supports package, cores, RAM. + mIsGpuSupported = false; + mIsRamSupported = true; + break; + + case 0x3c: // Haswell + case 0x3d: // Broadwell + case 0x45: // Haswell L + case 0x46: // Haswell G + case 0x47: // Broadwell G + // Supports package, cores, GPU, RAM. + mIsGpuSupported = true; + mIsRamSupported = true; + break; + + case 0x4e: // Skylake L + case 0x5e: // Skylake + case 0x8e: // Kaby Lake L + case 0x9e: // Kaby Lake + case 0x66: // Cannon Lake L + case 0x7d: // Ice Lake + case 0x7e: // Ice Lake L + case 0xa5: // Comet Lake + case 0xa6: // Comet Lake L + // Supports package, cores, GPU, RAM, PSYS. + // XXX: this tool currently doesn't measure PSYS. + mIsGpuSupported = true; + mIsRamSupported = true; + break; + + default: + Abort("unknown CPU model: %d", cpuModel); + break; + } + + // Get the maximum number of logical CPUs so that we know how big to make + // |mPkes|. + int logicalcpu_max; + size = sizeof(logicalcpu_max); + if (sysctlbyname("hw.logicalcpu_max", &logicalcpu_max, &size, NULL, 0) != + 0) { + Abort("sysctlbyname(\"hw.logicalcpu_max\") failed"); + } + + // Over-allocate by 1024 bytes per CPU to allow for the uncertainty around + // core_energy_stat_t::gpmcs and for any other future extensions to that + // struct. (The fields we read all come before the core_energy_stat_t + // array, so it won't matter to us whether gpmcs is present or not.) 
+ size_t pkesSize = sizeof(pkg_energy_statistics_t) + + logicalcpu_max * sizeof(core_energy_stat_t) + + logicalcpu_max * 1024; + mPkes = (pkg_energy_statistics_t*)malloc(pkesSize); + if (!mPkes) { + Abort("malloc() failed"); + } + + // Do an initial measurement so that the first sample's diffs are sensible. + double dummy1, dummy2, dummy3, dummy4; + EnergyEstimates(dummy1, dummy2, dummy3, dummy4); + } + + ~RAPL() { free(mPkes); } + + static double Joules(uint64_t aTicks, double aJoulesPerTick) { + return double(aTicks) * aJoulesPerTick; + } + + void EnergyEstimates(double& aPkg_J, double& aCores_J, double& aGpu_J, + double& aRam_J) { + diagCall64_dgPowerStat(mPkes); + + // Bits 12:8 are the ESU. + // Energy measurements come in multiples of 1/(2^ESU). + uint32_t energyStatusUnits = (mPkes->pkg_power_unit >> 8) & 0x1f; + double joulesPerTick = ((double)1 / (1 << energyStatusUnits)); + + aPkg_J = Joules(mPkes->pkg_energy - mPrevPkgTicks, joulesPerTick); + aCores_J = Joules(mPkes->pp0_energy - mPrevPp0Ticks, joulesPerTick); + aGpu_J = mIsGpuSupported + ? Joules(mPkes->pp1_energy - mPrevPp1Ticks, joulesPerTick) + : kUnsupported_j; + aRam_J = mIsRamSupported + ? Joules(mPkes->ddr_energy - mPrevDdrTicks, + mHasRamUnitsQuirk ? kQuirkyRamJoulesPerTick + : joulesPerTick) + : kUnsupported_j; + + mPrevPkgTicks = mPkes->pkg_energy; + mPrevPp0Ticks = mPkes->pp0_energy; + if (mIsGpuSupported) { + mPrevPp1Ticks = mPkes->pp1_energy; + } + if (mIsRamSupported) { + mPrevDdrTicks = mPkes->ddr_energy; + } + } +}; + +/* static */ const double RAPL::kQuirkyRamJoulesPerTick = (double)1 / 65536; + +//--------------------------------------------------------------------------- +// Linux-specific code +//--------------------------------------------------------------------------- + +#elif defined(__linux__) + +# include <linux/perf_event.h> +# include <sys/syscall.h> + +// There is no glibc wrapper for this system call so we provide our own. 
+static int perf_event_open(struct perf_event_attr* aAttr, pid_t aPid, int aCpu, + int aGroupFd, unsigned long aFlags) { + return syscall(__NR_perf_event_open, aAttr, aPid, aCpu, aGroupFd, aFlags); +} + +// Returns false if the file cannot be opened. +template <typename T> +static bool ReadValueFromPowerFile(const char* aStr1, const char* aStr2, + const char* aStr3, const char* aScanfString, + T* aOut) { + // The filenames going into this buffer are under our control and the longest + // one is "/sys/bus/event_source/devices/power/events/energy-cores.scale". + // So 256 chars is plenty. + char filename[256]; + + sprintf(filename, "/sys/bus/event_source/devices/power/%s%s%s", aStr1, aStr2, + aStr3); + FILE* fp = fopen(filename, "r"); + if (!fp) { + return false; + } + if (fscanf(fp, aScanfString, aOut) != 1) { + Abort("fscanf() failed"); + } + fclose(fp); + + return true; +} + +// This class encapsulates the reading of a single RAPL domain. +class Domain { + bool mIsSupported; // Is the domain supported by the processor? + + // These three are only set if |mIsSupported| is true. + double mJoulesPerTick; // How many Joules each tick of the MSR represents. + int mFd; // The fd through which the MSR is read. + double mPrevTicks; // The previous sample's MSR value. + + public: + enum IsOptional { Optional, NonOptional }; + + Domain(const char* aName, uint32_t aType, + IsOptional aOptional = NonOptional) { + uint64_t config; + if (!ReadValueFromPowerFile("events/energy-", aName, "", "event=%llx", + &config)) { + // Failure is allowed for optional domains. + if (aOptional == NonOptional) { + Abort( + "failed to open file for non-optional domain '%s'\n" + "- Is your kernel version 3.14 or later, as required? 
" + "Run |uname -r| to see.", + aName); + } + mIsSupported = false; + return; + } + + mIsSupported = true; + + if (!ReadValueFromPowerFile("events/energy-", aName, ".scale", "%lf", + &mJoulesPerTick)) { + Abort("failed to read from .scale file"); + } + + // The unit should be "Joules", so 128 chars should be plenty. + char unit[128]; + if (!ReadValueFromPowerFile("events/energy-", aName, ".unit", "%127s", + unit)) { + Abort("failed to read from .unit file"); + } + if (strcmp(unit, "Joules") != 0) { + Abort("unexpected unit '%s' in .unit file", unit); + } + + struct perf_event_attr attr; + memset(&attr, 0, sizeof(attr)); + attr.type = aType; + attr.size = uint32_t(sizeof(attr)); + attr.config = config; + + // Measure all processes/threads. The specified CPU doesn't matter. + mFd = perf_event_open(&attr, /* aPid = */ -1, /* aCpu = */ 0, + /* aGroupFd = */ -1, /* aFlags = */ 0); + if (mFd < 0) { + Abort( + "perf_event_open() failed\n" + "- Did you run as root (e.g. with |sudo|) or set\n" + " /proc/sys/kernel/perf_event_paranoid to 0, as required?"); + } + + mPrevTicks = 0; + } + + ~Domain() { + if (mIsSupported) { + close(mFd); + } + } + + double EnergyEstimate() { + if (!mIsSupported) { + return kUnsupported_j; + } + + uint64_t thisTicks; + if (read(mFd, &thisTicks, sizeof(uint64_t)) != sizeof(uint64_t)) { + Abort("read() failed"); + } + + uint64_t ticks = thisTicks - mPrevTicks; + mPrevTicks = thisTicks; + double joules = ticks * mJoulesPerTick; + return joules; + } +}; + +class RAPL { + Domain* mPkg; + Domain* mCores; + Domain* mGpu; + Domain* mRam; + + public: + RAPL() { + uint32_t type; + if (!ReadValueFromPowerFile("type", "", "", "%u", &type)) { + Abort("failed to read from type file"); + } + + mPkg = new Domain("pkg", type); + mCores = new Domain("cores", type); + mGpu = new Domain("gpu", type, Domain::Optional); + mRam = new Domain("ram", type, Domain::Optional); + if (!mPkg || !mCores || !mGpu || !mRam) { + Abort("new Domain() failed"); + } + } + + ~RAPL() { 
+ delete mPkg; + delete mCores; + delete mGpu; + delete mRam; + } + + void EnergyEstimates(double& aPkg_J, double& aCores_J, double& aGpu_J, + double& aRam_J) { + aPkg_J = mPkg->EnergyEstimate(); + aCores_J = mCores->EnergyEstimate(); + aGpu_J = mGpu->EnergyEstimate(); + aRam_J = mRam->EnergyEstimate(); + } +}; + +#else + +//--------------------------------------------------------------------------- +// Unsupported platforms +//--------------------------------------------------------------------------- + +# error Sorry, this platform is not supported + +#endif // platform + +//--------------------------------------------------------------------------- +// The main loop +//--------------------------------------------------------------------------- + +// The sample interval, measured in seconds. +static double gSampleInterval_sec; + +// The platform-specific RAPL-reading machinery. +static RAPL* gRapl; + +// All the sampled "total" values, in Watts. +static std::vector<double> gTotals_W; + +// Power = Energy / Time, where power is measured in Watts, Energy is measured +// in Joules, and Time is measured in seconds. +static double JoulesToWatts(double aJoules) { + return aJoules / gSampleInterval_sec; +} + +// "Normalize" here means convert kUnsupported_j to zero so it can be used in +// additive expressions. All printed values are 5 or maybe 6 chars (though 6 +// chars would require a value > 100 W, which is unlikely). Values above 1000 W +// are normalized to " n/a ", so 6 chars is the longest that may be printed. 
+static void NormalizeAndPrintAsWatts(char* aBuf, double& aValue_J) { + if (aValue_J == kUnsupported_j || aValue_J >= 1000) { + aValue_J = 0; + sprintf(aBuf, "%s", " n/a "); + } else { + sprintf(aBuf, "%5.2f", JoulesToWatts(aValue_J)); + } +} + +static void SigAlrmHandler(int aSigNum, siginfo_t* aInfo, void* aContext) { + static int sampleNumber = 1; + + double pkg_J, cores_J, gpu_J, ram_J; + gRapl->EnergyEstimates(pkg_J, cores_J, gpu_J, ram_J); + + // We should have pkg and cores estimates, but might not have gpu and ram + // estimates. + assert(pkg_J != kUnsupported_j); + assert(cores_J != kUnsupported_j); + + // This needs to be big enough to print watt values to two decimal places. 16 + // should be plenty. + static const size_t kNumStrLen = 16; + + static char pkgStr[kNumStrLen], coresStr[kNumStrLen], gpuStr[kNumStrLen], + ramStr[kNumStrLen]; + NormalizeAndPrintAsWatts(pkgStr, pkg_J); + NormalizeAndPrintAsWatts(coresStr, cores_J); + NormalizeAndPrintAsWatts(gpuStr, gpu_J); + NormalizeAndPrintAsWatts(ramStr, ram_J); + + // Core and GPU power are a subset of the package power. + assert(pkg_J >= cores_J + gpu_J); + + // Compute "other" (i.e. rest of the package) and "total" only after the + // other values have been normalized. + + char otherStr[kNumStrLen]; + double other_J = pkg_J - cores_J - gpu_J; + NormalizeAndPrintAsWatts(otherStr, other_J); + + char totalStr[kNumStrLen]; + double total_J = pkg_J + ram_J; + NormalizeAndPrintAsWatts(totalStr, total_J); + + gTotals_W.push_back(JoulesToWatts(total_J)); + + // Print and flush so that the output appears immediately even if being + // redirected through |tee| or anything like that. 
+ PrintAndFlush("#%02d %s W = %s (%s + %s + %s) + %s W\n", sampleNumber++, + totalStr, pkgStr, coresStr, gpuStr, otherStr, ramStr); +} + +static void Finish() { + size_t n = gTotals_W.size(); + + // This time calculation assumes that the timers are perfectly accurate which + // is not true but the inaccuracy should be small in practice. + double time = n * gSampleInterval_sec; + + printf("\n"); + printf("%d sample%s taken over a period of %.3f second%s\n", int(n), + n == 1 ? "" : "s", n * gSampleInterval_sec, time == 1.0 ? "" : "s"); + + if (n == 0 || n == 1) { + exit(0); + } + + // Compute the mean. + double sum = std::accumulate(gTotals_W.begin(), gTotals_W.end(), 0.0); + double mean = sum / n; + + // Compute the *population* standard deviation: + // + // popStdDev = sqrt(Sigma(x - m)^2 / n) + // + // where |x| is the sum variable, |m| is the mean, and |n| is the + // population size. + // + // This is different from the *sample* standard deviation, which divides by + // |n - 1|, and would be appropriate if we were using a random sample of a + // larger population. + double sumOfSquaredDeviations = 0; + for (double& iter : gTotals_W) { + double deviation = (iter - mean); + sumOfSquaredDeviations += deviation * deviation; + } + double popStdDev = sqrt(sumOfSquaredDeviations / n); + + // Sort so that percentiles can be determined. We use the "Nearest Rank" + // method of determining percentiles, which is simplest to compute and which + // chooses values from those that appear in the input set. 
+ std::sort(gTotals_W.begin(), gTotals_W.end()); + + printf("\n"); + printf("Distribution of 'total' values:\n"); + printf(" mean = %5.2f W\n", mean); + printf(" std dev = %5.2f W\n", popStdDev); + printf(" 0th percentile = %5.2f W (min)\n", gTotals_W[0]); + printf(" 5th percentile = %5.2f W\n", gTotals_W[ceil(0.05 * n) - 1]); + printf(" 25th percentile = %5.2f W\n", gTotals_W[ceil(0.25 * n) - 1]); + printf(" 50th percentile = %5.2f W\n", gTotals_W[ceil(0.50 * n) - 1]); + printf(" 75th percentile = %5.2f W\n", gTotals_W[ceil(0.75 * n) - 1]); + printf(" 95th percentile = %5.2f W\n", gTotals_W[ceil(0.95 * n) - 1]); + printf("100th percentile = %5.2f W (max)\n", gTotals_W[n - 1]); + + exit(0); +} + +static void SigIntHandler(int aSigNum, siginfo_t* aInfo, void* aContext) { + Finish(); +} + +static void PrintUsage() { + printf( + "usage: rapl [options]\n" + "\n" + "Options:\n" + "\n" + " -h --help show this message\n" + " -i --sample-interval <N> sample every N ms [default=1000]\n" + " -n --sample-count <N> get N samples (0 means unlimited) " + "[default=0]\n" + "\n" +#if defined(__APPLE__) + "On Mac this program can be run by any user.\n" +#elif defined(__linux__) + "On Linux this program can only be run by the super-user unless the " + "contents\n" + "of /proc/sys/kernel/perf_event_paranoid is set to 0 or lower.\n" +#else +# error Sorry, this platform is not supported +#endif + "\n"); +} + +int main(int argc, char** argv) { + // Process command line options. + + gArgv0 = argv[0]; + + // Default values. 
+ int sampleInterval_msec = 1000; + int sampleCount = 0; + + struct option longOptions[] = { + {"help", no_argument, NULL, 'h'}, + {"sample-interval", required_argument, NULL, 'i'}, + {"sample-count", required_argument, NULL, 'n'}, + {NULL, 0, NULL, 0}}; + const char* shortOptions = "hi:n:"; + + int c; + char* endPtr; + while ((c = getopt_long(argc, argv, shortOptions, longOptions, NULL)) != -1) { + switch (c) { + case 'h': + PrintUsage(); + exit(0); + + case 'i': + sampleInterval_msec = strtol(optarg, &endPtr, /* base = */ 10); + if (*endPtr) { + CmdLineAbort("sample interval is not an integer"); + } + if (sampleInterval_msec < 1 || sampleInterval_msec > 3600000) { + CmdLineAbort("sample interval must be in the range 1..3600000 ms"); + } + break; + + case 'n': + sampleCount = strtol(optarg, &endPtr, /* base = */ 10); + if (*endPtr) { + CmdLineAbort("sample count is not an integer"); + } + if (sampleCount < 0 || sampleCount > 1000000) { + CmdLineAbort("sample count must be in the range 0..1000000"); + } + break; + + default: + CmdLineAbort(NULL); + } + } + + // The RAPL MSRs update every ~1 ms, but the measurement period isn't exactly + // 1 ms, which means the sample periods are not exact. "Power Measurement + // Techniques on Standard Compute Nodes: A Quantitative Comparison" by + // Hackenberg et al. suggests the following. + // + // "RAPL provides energy (and not power) consumption data without + // timestamps associated to each counter update. This makes sampling rates + // above 20 Samples/s unfeasible if the systematic error should be below + // 5%... Constantly polling the RAPL registers will both occupy a processor + // core and distort the measurement itself." + // + // So warn about this case. 
+ if (sampleInterval_msec < 50) { + fprintf(stderr, + "\nWARNING: sample intervals < 50 ms are likely to produce " + "inaccurate estimates\n\n"); + } + gSampleInterval_sec = double(sampleInterval_msec) / 1000; + + // Initialize the platform-specific RAPL reading machinery. + gRapl = new RAPL(); + if (!gRapl) { + Abort("new RAPL() failed"); + } + + // Install the signal handlers. + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_flags = SA_RESTART | SA_SIGINFO; + // The extra parens around (0) suppress a -Wunreachable-code warning on OS X + // where sigemptyset() is a macro that can never fail and always returns 0. + if (sigemptyset(&sa.sa_mask) < (0)) { + Abort("sigemptyset() failed"); + } + sa.sa_sigaction = SigAlrmHandler; + if (sigaction(SIGALRM, &sa, NULL) < 0) { + Abort("sigaction(SIGALRM) failed"); + } + sa.sa_sigaction = SigIntHandler; + if (sigaction(SIGINT, &sa, NULL) < 0) { + Abort("sigaction(SIGINT) failed"); + } + + // Set up the timer. + struct itimerval timer; + timer.it_interval.tv_sec = sampleInterval_msec / 1000; + timer.it_interval.tv_usec = (sampleInterval_msec % 1000) * 1000; + timer.it_value = timer.it_interval; + if (setitimer(ITIMER_REAL, &timer, NULL) < 0) { + Abort("setitimer() failed"); + } + + // Print header. + PrintAndFlush(" total W = _pkg_ (cores + _gpu_ + other) + _ram_ W\n"); + + // Take samples. + if (sampleCount == 0) { + while (true) { + pause(); + } + } else { + for (int i = 0; i < sampleCount; i++) { + pause(); + } + } + + Finish(); + + return 0; +} |