author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /memory/replace/logalloc
parent | Initial commit. (diff)
Adding upstream version 115.7.0esr.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'memory/replace/logalloc')
-rw-r--r-- | memory/replace/logalloc/FdPrintf.cpp | 200
-rw-r--r-- | memory/replace/logalloc/FdPrintf.h | 27
-rw-r--r-- | memory/replace/logalloc/LogAlloc.cpp | 238
-rw-r--r-- | memory/replace/logalloc/README | 95
-rw-r--r-- | memory/replace/logalloc/moz.build | 30
-rw-r--r-- | memory/replace/logalloc/replay/Makefile.in | 48
-rw-r--r-- | memory/replace/logalloc/replay/Replay.cpp | 1159
-rw-r--r-- | memory/replace/logalloc/replay/expected_output_minimal.log | 17
-rw-r--r-- | memory/replace/logalloc/replay/logalloc_munge.py | 147
-rw-r--r-- | memory/replace/logalloc/replay/moz.build | 92
-rw-r--r-- | memory/replace/logalloc/replay/replay.log | 18
11 files changed, 2071 insertions, 0 deletions
diff --git a/memory/replace/logalloc/FdPrintf.cpp b/memory/replace/logalloc/FdPrintf.cpp new file mode 100644 index 0000000000..4a8e48af78 --- /dev/null +++ b/memory/replace/logalloc/FdPrintf.cpp @@ -0,0 +1,200 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <cstdarg> + +#ifdef _WIN32 +# include <windows.h> +#else +# include <unistd.h> +#endif +#include <cmath> +#include <cstring> +#include "mozilla/Assertions.h" +#include "mozilla/Unused.h" +#include "FdPrintf.h" + +/* Template class allowing a limited number of increments on a value */ +template <typename T> +class CheckedIncrement { + public: + CheckedIncrement(T aValue, size_t aMaxIncrement) + : mValue(aValue), mMaxIncrement(aMaxIncrement) {} + + T operator++(int) { + if (!mMaxIncrement) { + MOZ_CRASH("overflow detected"); + } + mMaxIncrement--; + return mValue++; + } + + T& operator++() { + (*this)++; + return mValue; + } + + void advance(T end) { + // Only makes sense if T is a pointer type. + size_t diff = end - mValue; + if (diff > mMaxIncrement) { + MOZ_CRASH("overflow detected"); + } + mMaxIncrement -= diff; + mValue = end; + }; + + void rewind(T pos) { + size_t diff = mValue - pos; + mMaxIncrement += diff; + mValue = pos; + } + + operator T() { return mValue; } + T value() { return mValue; } + + private: + T mValue; + size_t mMaxIncrement; +}; + +template <typename T> +static unsigned NumDigits(T n) { + if (n < 1) { + // We want one digit, it will be 0. + return 1; + } + + double l = log10(static_cast<double>(n)); + double cl = ceil(l); + return l == cl ? unsigned(cl) + 1 : unsigned(cl); +} + +static void LeftPad(CheckedIncrement<char*>& b, size_t pad) { + while (pad-- > 0) { + *(b++) = ' '; + } +} + +// Write the digits into the buffer. +static void WriteDigits(CheckedIncrement<char*>& b, size_t i, + size_t num_digits) { + size_t x = pow(10, double(num_digits - 1)); + do { + *(b++) = "0123456789"[(i / x) % 10]; + x /= 10; + } while (x > 0); +} + +void FdPrintf(intptr_t aFd, const char* aFormat, ...) { + if (aFd == 0) { + return; + } + char buf[256]; + CheckedIncrement<char*> b(buf, sizeof(buf)); + CheckedIncrement<const char*> f(aFormat, strlen(aFormat) + 1); + va_list ap; + va_start(ap, aFormat); + while (true) { + switch (*f) { + case '\0': + goto out; + + case '%': { + // The start of the format specifier is used if this specifier is + // invalid. + const char* start = f; + + // Read the field width + f++; + char* end = nullptr; + size_t width = strtoul(f, &end, 10); + // If strtol can't find a number that's okay, that means 0 in our + // case, but we must advance f). + f.advance(end); + + switch (*f) { + case 'z': { + if (*(++f) == 'u') { + size_t i = va_arg(ap, size_t); + + size_t num_digits = NumDigits(i); + LeftPad(b, width > num_digits ? width - num_digits : 0); + WriteDigits(b, i, num_digits); + } else { + // If the format specifier is unknown then write out '%' and + // rewind to the beginning of the specifier causing it to be + // printed normally. 
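
The CheckedIncrement wrapper above is what protects FdPrintf's fixed 256-byte output buffer: every post-increment spends one unit of a remaining budget, and exhausting the budget triggers a deliberate crash instead of a silent overrun. A minimal standalone sketch of the same guard (the BoundedPtr name and the plain abort() are illustrative stand-ins for CheckedIncrement and MOZ_CRASH, not code from this file):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // Pointer wrapper that refuses to step past a fixed budget of increments.
    template <typename T>
    class BoundedPtr {
     public:
      BoundedPtr(T* aValue, std::size_t aBudget) : mValue(aValue), mBudget(aBudget) {}

      T* operator++(int) {
        if (!mBudget) {
          std::fputs("overflow detected\n", stderr);
          std::abort();  // CheckedIncrement calls MOZ_CRASH() here instead.
        }
        mBudget--;
        return mValue++;
      }

     private:
      T* mValue;
      std::size_t mBudget;
    };

    int main() {
      char buf[8];
      BoundedPtr<char> b(buf, sizeof(buf));
      for (int i = 0; i < 8; i++) {
        *(b++) = char('0' + i);  // A ninth write would abort() instead of overrunning buf.
      }
      std::fwrite(buf, 1, sizeof(buf), stdout);
      std::fputc('\n', stdout);
      return 0;
    }
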
+ *(b++) = '%'; + f.rewind(start); + } + break; + } + + case 'p': { + intptr_t ptr = va_arg(ap, intptr_t); + *(b++) = '0'; + *(b++) = 'x'; + int x = sizeof(intptr_t) * 8; + bool wrote_msb = false; + do { + x -= 4; + size_t hex_digit = ptr >> x & 0xf; + if (hex_digit || wrote_msb) { + *(b++) = "0123456789abcdef"[hex_digit]; + wrote_msb = true; + } + } while (x > 0); + if (!wrote_msb) { + *(b++) = '0'; + } + break; + } + + case 's': { + const char* str = va_arg(ap, const char*); + size_t len = strlen(str); + + LeftPad(b, width > len ? width - len : 0); + + while (*str) { + *(b++) = *(str++); + } + + break; + } + + case '%': + // Print a single raw '%'. + *(b++) = '%'; + break; + + default: + // If the format specifier is unknown then write out '%' and + // rewind to the beginning of the specifier causing it to be + // printed normally. + *(b++) = '%'; + f.rewind(start); + break; + } + break; + } + default: + *(b++) = *f; + break; + } + f++; + } +out: +#ifdef _WIN32 + // See comment in FdPrintf.h as to why WriteFile is used. + DWORD written; + WriteFile(reinterpret_cast<HANDLE>(aFd), buf, b - buf, &written, nullptr); +#else + MOZ_UNUSED(write(aFd, buf, b - buf)); +#endif + va_end(ap); +} diff --git a/memory/replace/logalloc/FdPrintf.h b/memory/replace/logalloc/FdPrintf.h new file mode 100644 index 0000000000..f390d57ed5 --- /dev/null +++ b/memory/replace/logalloc/FdPrintf.h @@ -0,0 +1,27 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef __FdPrintf_h__ +#define __FdPrintf_h__ + +/* We can't use libc's (f)printf because it would reenter in replace_malloc, + * So use a custom and simplified version. Only %p, %zu, %s and %% are + * supported, %zu, %s, support width specifiers. + * + * /!\ This function used a fixed-size internal buffer. The caller is + * expected to not use a format string that may overflow. + * The aFd argument is a file descriptor on UNIX and a native win32 file + * handle on Windows (from CreateFile). We can't use the windows POSIX + * APIs is that they don't support O_APPEND in a multi-process-safe way, + * while CreateFile does. + */ +extern void FdPrintf(intptr_t aFd, const char* aFormat, ...) +#ifdef __GNUC__ + __attribute__((format(printf, 2, 3))) +#endif + ; + +#endif /* __FdPrintf_h__ */ diff --git a/memory/replace/logalloc/LogAlloc.cpp b/memory/replace/logalloc/LogAlloc.cpp new file mode 100644 index 0000000000..a976b0c674 --- /dev/null +++ b/memory/replace/logalloc/LogAlloc.cpp @@ -0,0 +1,238 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <cstdlib> +#include <fcntl.h> + +#ifdef _WIN32 +# include <windows.h> +# include <io.h> +# include <process.h> +#else +# include <unistd.h> +# include <pthread.h> +#endif + +#include "replace_malloc.h" +#include "FdPrintf.h" +#include "Mutex.h" + +static malloc_table_t sFuncs; +static intptr_t sFd = 0; +static bool sStdoutOrStderr = false; + +static Mutex sMutex MOZ_UNANNOTATED; + +#ifndef _WIN32 +static void prefork() MOZ_NO_THREAD_SAFETY_ANALYSIS { sMutex.Lock(); } +static void postfork_parent() MOZ_NO_THREAD_SAFETY_ANALYSIS { sMutex.Unlock(); } +static void postfork_child() { sMutex.Init(); } +#endif + +static size_t GetPid() { return size_t(getpid()); } + +static size_t GetTid() { +#if defined(_WIN32) + return size_t(GetCurrentThreadId()); +#else + return size_t(pthread_self()); +#endif +} + +#ifdef ANDROID +/* Android doesn't have pthread_atfork defined in pthread.h */ +extern "C" MOZ_EXPORT int pthread_atfork(void (*)(void), void (*)(void), + void (*)(void)); +#endif + +class LogAllocBridge : public ReplaceMallocBridge { + virtual void InitDebugFd(mozilla::DebugFdRegistry& aRegistry) override { + if (!sStdoutOrStderr) { + aRegistry.RegisterHandle(sFd); + } + } +}; + +/* Do a simple, text-form, log of all calls to replace-malloc functions. + * Use locking to guarantee that an allocation that did happen is logged + * before any other allocation/free happens. + */ + +static void* replace_malloc(size_t aSize) { + MutexAutoLock lock(sMutex); + void* ptr = sFuncs.malloc(aSize); + FdPrintf(sFd, "%zu %zu malloc(%zu)=%p\n", GetPid(), GetTid(), aSize, ptr); + return ptr; +} + +static int replace_posix_memalign(void** aPtr, size_t aAlignment, + size_t aSize) { + MutexAutoLock lock(sMutex); + int ret = sFuncs.posix_memalign(aPtr, aAlignment, aSize); + FdPrintf(sFd, "%zu %zu posix_memalign(%zu,%zu)=%p\n", GetPid(), GetTid(), + aAlignment, aSize, (ret == 0) ? 
*aPtr : nullptr); + return ret; +} + +static void* replace_aligned_alloc(size_t aAlignment, size_t aSize) { + MutexAutoLock lock(sMutex); + void* ptr = sFuncs.aligned_alloc(aAlignment, aSize); + FdPrintf(sFd, "%zu %zu aligned_alloc(%zu,%zu)=%p\n", GetPid(), GetTid(), + aAlignment, aSize, ptr); + return ptr; +} + +static void* replace_calloc(size_t aNum, size_t aSize) { + MutexAutoLock lock(sMutex); + void* ptr = sFuncs.calloc(aNum, aSize); + FdPrintf(sFd, "%zu %zu calloc(%zu,%zu)=%p\n", GetPid(), GetTid(), aNum, aSize, + ptr); + return ptr; +} + +static void* replace_realloc(void* aPtr, size_t aSize) { + MutexAutoLock lock(sMutex); + void* new_ptr = sFuncs.realloc(aPtr, aSize); + FdPrintf(sFd, "%zu %zu realloc(%p,%zu)=%p\n", GetPid(), GetTid(), aPtr, aSize, + new_ptr); + return new_ptr; +} + +static void replace_free(void* aPtr) { + MutexAutoLock lock(sMutex); + FdPrintf(sFd, "%zu %zu free(%p)\n", GetPid(), GetTid(), aPtr); + sFuncs.free(aPtr); +} + +static void* replace_memalign(size_t aAlignment, size_t aSize) { + MutexAutoLock lock(sMutex); + void* ptr = sFuncs.memalign(aAlignment, aSize); + FdPrintf(sFd, "%zu %zu memalign(%zu,%zu)=%p\n", GetPid(), GetTid(), + aAlignment, aSize, ptr); + return ptr; +} + +static void* replace_valloc(size_t aSize) { + MutexAutoLock lock(sMutex); + void* ptr = sFuncs.valloc(aSize); + FdPrintf(sFd, "%zu %zu valloc(%zu)=%p\n", GetPid(), GetTid(), aSize, ptr); + return ptr; +} + +static void replace_jemalloc_stats(jemalloc_stats_t* aStats, + jemalloc_bin_stats_t* aBinStats) { + MutexAutoLock lock(sMutex); + sFuncs.jemalloc_stats_internal(aStats, aBinStats); + FdPrintf(sFd, "%zu %zu jemalloc_stats()\n", GetPid(), GetTid()); +} + +void replace_init(malloc_table_t* aTable, ReplaceMallocBridge** aBridge) { + /* Initialize output file descriptor from the MALLOC_LOG environment + * variable. Numbers up to 9999 are considered as a preopened file + * descriptor number. Other values are considered as a file name. */ +#ifdef _WIN32 + wchar_t* log = _wgetenv(L"MALLOC_LOG"); +#else + char* log = getenv("MALLOC_LOG"); +#endif + if (log && *log) { + int fd = 0; + const auto* fd_num = log; + while (*fd_num) { + /* Reject non digits. */ + if (*fd_num < '0' || *fd_num > '9') { + fd = -1; + break; + } + fd = fd * 10 + (*fd_num - '0'); + /* Reject values >= 10000. */ + if (fd >= 10000) { + fd = -1; + break; + } + fd_num++; + } + if (fd == 1 || fd == 2) { + sStdoutOrStderr = true; + } +#ifdef _WIN32 + // See comment in FdPrintf.h as to why CreateFile is used. + HANDLE handle; + if (fd > 0) { + handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); + } else { + handle = + CreateFileW(log, FILE_APPEND_DATA, FILE_SHARE_READ | FILE_SHARE_WRITE, + nullptr, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr); + } + if (handle != INVALID_HANDLE_VALUE) { + sFd = reinterpret_cast<intptr_t>(handle); + } +#else + if (fd == -1) { + fd = open(log, O_WRONLY | O_CREAT | O_APPEND, 0644); + } + if (fd > 0) { + sFd = fd; + } +#endif + } + + // Don't initialize if we weren't passed a valid MALLOC_LOG. + if (sFd == 0) { + return; + } + + sMutex.Init(); + static LogAllocBridge bridge; + sFuncs = *aTable; +#define MALLOC_FUNCS MALLOC_FUNCS_MALLOC_BASE +#define MALLOC_DECL(name, ...) 
aTable->name = replace_##name; +#include "malloc_decls.h" + aTable->jemalloc_stats_internal = replace_jemalloc_stats; + if (!getenv("MALLOC_LOG_MINIMAL")) { + aTable->posix_memalign = replace_posix_memalign; + aTable->aligned_alloc = replace_aligned_alloc; + aTable->valloc = replace_valloc; + } + *aBridge = &bridge; + +#ifndef _WIN32 + /* When another thread has acquired a lock before forking, the child + * process will inherit the lock state but the thread, being nonexistent + * in the child process, will never release it, leading to a dead-lock + * whenever the child process gets the lock. We thus need to ensure no + * other thread is holding the lock before forking, by acquiring it + * ourselves, and releasing it after forking in the parent process and + * resetting it to its initial state in the child process. The latter is + * important because some implementations (notably macOS) prevent a lock from + * being unlocked by a different thread than the one which locked it in the + * first place. + * Windows doesn't have this problem since there is no fork(). + * The real allocator, however, might be doing the same thing (jemalloc + * does). But pthread_atfork `prepare` handlers (first argument) are + * processed in reverse order they were established. But replace_init + * runs before the real allocator has had any chance to initialize and + * call pthread_atfork itself. This leads to its prefork running before + * ours. This leads to a race condition that can lead to a deadlock like + * the following: + * - thread A forks. + * - libc calls real allocator's prefork, so thread A holds the real + * allocator lock. + * - thread B calls malloc, which calls our replace_malloc. + * - consequently, thread B holds our lock. + * - thread B then proceeds to call the real allocator's malloc, and + * waits for the real allocator's lock, which thread A holds. + * - libc calls our prefork, so thread A waits for our lock, which + * thread B holds. + * To avoid this race condition, the real allocator's prefork must be + * called after ours, which means it needs to be registered before ours. + * So trick the real allocator into initializing itself without more side + * effects by calling malloc with a size it can't possibly allocate. */ + sFuncs.malloc(-1); + pthread_atfork(prefork, postfork_parent, postfork_child); +#endif +} diff --git a/memory/replace/logalloc/README b/memory/replace/logalloc/README new file mode 100644 index 0000000000..c2e8cf66ce --- /dev/null +++ b/memory/replace/logalloc/README @@ -0,0 +1,95 @@ +Logalloc is a replace-malloc library for Firefox (see +memory/build/replace_malloc.h) that dumps a log of memory allocations to a +given file descriptor or file name. That log can then be replayed against +Firefox's default memory allocator independently or through another +replace-malloc library, allowing the testing of other allocators under the +exact same workload. + +To get an allocation log the following environment variable when starting +Firefox: + MALLOC_LOG=/path/to/log-file + or + MALLOC_LOG=number + +When MALLOC_LOG is a number below 10000, it is considered as a file +descriptor number that is fed to Firefox when it is started. Otherwise, +it is considered as a file name. + +As those allocation logs can grow large quite quickly, it can be useful +to pipe the output to a compression tool. + +MALLOC_LOG=1 would send to Firefox's stdout, MALLOC_LOG=2 would send to +its stderr. 
Since in both cases that could be mixed with other output +from Firefox, it is usually better to use another file descriptor +by shell redirections, such as: + + MALLOC_LOG=3 firefox 3>&1 1>&2 | gzip -c > log.gz + +(3>&1 copies the `| gzip` pipe file descriptor to file descriptor #3, 1>&2 +then copies stderr to stdout. This leads to: fd1 and fd2 sending to stderr +of the parent process (the shell), and fd3 sending to gzip.) + +Each line of the allocations log is formatted as follows: + <pid> <tid> <function>([<args>])[=<result>] +where <args> is a comma separated list of values. The number of <args> and +the presence of <result> depend on the <function>. + +Example log: + 18545 18545 malloc(32)=0x7f90495120e0 + 18545 18545 calloc(1,148)=0x7f9049537480 + 18545 18545 realloc(0x7f90495120e0,64)=0x7f9049536680 + 18545 18545 posix_memalign(256,240)=0x7f9049583300 + 18545 18545 jemalloc_stats() + 18545 18545 free(0x7f9049536680) + +This log can be replayed with the logalloc-replay tool in +memory/replace/logalloc/replay. However, as the goal of that tool is to +reproduce the recorded memory allocations, it needs to avoid as much as +possible doing its own allocations for bookkeeping. Reading the logs as +they are would require data structures and memory allocations. As a +consequence, the logs need to be preprocessed beforehand. + +The logalloc_munge.py script is responsible for that preprocessing. It simply +takes a raw log on its stdin, and outputs the preprocessed log on its stdout. +It replaces pointer addresses with indexes the logalloc-replay tool can use +in a large (almost) linear array of allocation tracking slots (prefixed with +'#'). It also replaces the pids with numbers starting from 1 (such as the +first seen pid number is 1, the second is 2, etc.). + +The above example log would become the following, once preprocessed: + 1 1 malloc(32)=#1 + 1 1 calloc(1,148)=#2 + 1 1 realloc(#1,64)=#1 + 1 1 posix_memalign(256,240)=#3 + 1 1 jemalloc_stats() + 1 1 free(#1) + +The logalloc-replay tool then takes the preprocessed log on its stdin and +replays the allocations printed there, but will only replay those with the +same process id as the first line (which normally is 1). + +As the log files are simple text files, though, it is easy to separate out +the different processes log with e.g. grep, and feed the separate processes +logs to logalloc-replay. + +The logalloc-replay program won't output anything unless jemalloc_stats +records appears in the log. You can expect those to be recorded when going +to about:memory in Firefox, but they can also be added after preprocessing. + +Here is an example of what one can do: + + gunzip -c log.gz | python logalloc_munge.py | \ + awk '$1 == "2" { print $0 } !(NR % 10000) { print "2 1 jemalloc_stats()" }' | \ + ./logalloc-replay + +The above command replays the allocations of process #2, with some stats +output every 10000 records. + +The logalloc-replay tool itself being hooked with replace-malloc, it is possible +to set LD_PRELOAD/DYLD_INSERT_LIBRARIES/MOZ_REPLACE_MALLOC_LIB and replay a log +through a different allocator. For example: + + LD_PRELOAD=libreplace_jemalloc.so logalloc-replay < log + +Will replay the log against jemalloc4 (which is, as of writing, what +libreplace_jemalloc.so contains). 
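
As a side note on the MALLOC_LOG handling described above, the "number below 10000" rule is implemented in LogAlloc.cpp's replace_init by scanning the value one character at a time and falling back to file-name handling on any non-digit or on 10000 and above. A standalone sketch of just that classification step (ParseMallocLogFd is an illustrative name, not a function in the tree):

    #include <cstdio>

    // Returns the file descriptor number encoded in |log|, or -1 when the value
    // must be treated as a file name (any non-digit, or a value >= 10000).
    static int ParseMallocLogFd(const char* log) {
      int fd = 0;
      for (const char* c = log; *c; c++) {
        if (*c < '0' || *c > '9') {
          return -1;
        }
        fd = fd * 10 + (*c - '0');
        if (fd >= 10000) {
          return -1;
        }
      }
      return fd;
    }

    int main() {
      std::printf("%d\n", ParseMallocLogFd("3"));                // 3: preopened descriptor
      std::printf("%d\n", ParseMallocLogFd("/tmp/malloc.log"));  // -1: opened as a file
      std::printf("%d\n", ParseMallocLogFd("12345"));            // -1: too large for a descriptor
      return 0;
    }
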
diff --git a/memory/replace/logalloc/moz.build b/memory/replace/logalloc/moz.build new file mode 100644 index 0000000000..c52d9e69e0 --- /dev/null +++ b/memory/replace/logalloc/moz.build @@ -0,0 +1,30 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +ReplaceMalloc("logalloc") + +SOURCES += [ + "FdPrintf.cpp", + "LogAlloc.cpp", +] + +DisableStlWrapping() +NO_PGO = True +DEFINES["MOZ_NO_MOZALLOC"] = True + +LOCAL_INCLUDES += [ + "/memory/build", +] + +# Android doesn't have pthread_atfork, but we have our own in mozglue. +if CONFIG["OS_TARGET"] == "Android" and FORCE_SHARED_LIB: + USE_LIBS += [ + "mozglue", + ] + +DIRS += [ + "replay", +] diff --git a/memory/replace/logalloc/replay/Makefile.in b/memory/replace/logalloc/replay/Makefile.in new file mode 100644 index 0000000000..73659add98 --- /dev/null +++ b/memory/replace/logalloc/replay/Makefile.in @@ -0,0 +1,48 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +ifdef MOZ_CODE_COVERAGE +SKIP = 1 +endif + +ifdef CROSS_COMPILE +SKIP = 1 +endif + +ifneq ($(SKIP),1) + +ifeq ($(OS_TARGET),WINNT) +LOGALLOC_VAR = MOZ_REPLACE_MALLOC_LIB +else +ifeq ($(OS_TARGET),Darwin) +LOGALLOC_VAR = DYLD_INSERT_LIBRARIES +else +LOGALLOC_VAR = LD_PRELOAD +endif +endif + +ifndef MOZ_REPLACE_MALLOC_STATIC +LOGALLOC = $(LOGALLOC_VAR)=$(CURDIR)/../$(DLL_PREFIX)logalloc$(DLL_SUFFIX) +endif + +expected_output.log: $(srcdir)/replay.log +# The logalloc-replay program will only replay entries from the first pid, +# so the expected output only contains entries beginning with "1 " + grep "^1 " $< > $@ + +check:: $(srcdir)/replay.log expected_output.log $(srcdir)/expected_output_minimal.log +# Test with MALLOC_LOG as a file descriptor number +# We filter out anything happening before the first jemalloc_stats (first +# command in replay.log) because starting with libstdc++ 5, a static +# initializer in the STL allocates memory, which we obviously don't have +# in expected_output.log. + MALLOC_LOG=1 $(LOGALLOC) ./$(PROGRAM) < $< | sed -n '/jemalloc_stats/,$$p' | $(PYTHON3) $(srcdir)/logalloc_munge.py | diff -w - expected_output.log +# Test with MALLOC_LOG as a file name + $(RM) test_output.log + MALLOC_LOG=test_output.log $(LOGALLOC) ./$(PROGRAM) < $< + sed -n '/jemalloc_stats/,$$p' test_output.log | $(PYTHON3) $(srcdir)/logalloc_munge.py | diff -w - expected_output.log + + MALLOC_LOG=1 MALLOC_LOG_MINIMAL=1 $(LOGALLOC) ./$(PROGRAM) < $< | sed -n '/jemalloc_stats/,$$p' | $(PYTHON3) $(srcdir)/logalloc_munge.py | diff -w - $(srcdir)/expected_output_minimal.log + +endif diff --git a/memory/replace/logalloc/replay/Replay.cpp b/memory/replace/logalloc/replay/Replay.cpp new file mode 100644 index 0000000000..b5ad0c540e --- /dev/null +++ b/memory/replace/logalloc/replay/Replay.cpp @@ -0,0 +1,1159 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#define MOZ_MEMORY_IMPL +#include "mozmemory_wrap.h" + +#ifdef _WIN32 +# include <windows.h> +# include <io.h> +typedef intptr_t ssize_t; +#else +# include <sys/mman.h> +# include <unistd.h> +#endif +#ifdef XP_LINUX +# include <fcntl.h> +# include <stdlib.h> +#endif +#include <algorithm> +#include <cmath> +#include <cstdio> +#include <cstring> + +#include "mozilla/Assertions.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Maybe.h" +#include "FdPrintf.h" + +using namespace mozilla; + +static void die(const char* message) { + /* Here, it doesn't matter that fprintf may allocate memory. */ + fprintf(stderr, "%s\n", message); + exit(1); +} + +#ifdef XP_LINUX +static size_t sPageSize = []() { return sysconf(_SC_PAGESIZE); }(); +#endif + +/* We don't want to be using malloc() to allocate our internal tracking + * data, because that would change the parameters of what is being measured, + * so we want to use data types that directly use mmap/VirtualAlloc. */ +template <typename T, size_t Len> +class MappedArray { + public: + MappedArray() : mPtr(nullptr) { +#ifdef XP_LINUX + MOZ_RELEASE_ASSERT(!((sizeof(T) * Len) & (sPageSize - 1)), + "MappedArray size must be a multiple of the page size"); +#endif + } + + ~MappedArray() { + if (mPtr) { +#ifdef _WIN32 + VirtualFree(mPtr, sizeof(T) * Len, MEM_RELEASE); +#elif defined(XP_LINUX) + munmap(reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(mPtr) - + sPageSize), + sizeof(T) * Len + sPageSize * 2); +#else + munmap(mPtr, sizeof(T) * Len); +#endif + } + } + + T& operator[](size_t aIndex) const { + if (mPtr) { + return mPtr[aIndex]; + } + +#ifdef _WIN32 + mPtr = reinterpret_cast<T*>(VirtualAlloc( + nullptr, sizeof(T) * Len, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE)); + if (mPtr == nullptr) { + die("VirtualAlloc error"); + } +#else + size_t data_size = sizeof(T) * Len; + size_t size = data_size; +# ifdef XP_LINUX + // See below + size += sPageSize * 2; +# endif + mPtr = reinterpret_cast<T*>(mmap(nullptr, size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0)); + if (mPtr == MAP_FAILED) { + die("Mmap error"); + } +# ifdef XP_LINUX + // On Linux we request a page on either side of the allocation and + // mprotect them. This prevents mappings in /proc/self/smaps from being + // merged and allows us to parse this file to calculate the allocator's RSS. + MOZ_ASSERT(0 == mprotect(mPtr, sPageSize, 0)); + MOZ_ASSERT(0 == mprotect(reinterpret_cast<void*>( + reinterpret_cast<uintptr_t>(mPtr) + data_size + + sPageSize), + sPageSize, 0)); + mPtr = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(mPtr) + sPageSize); +# endif +#endif + return mPtr[aIndex]; + } + + bool ownsMapping(uintptr_t addr) const { return addr == (uintptr_t)mPtr; } + + bool allocated() const { return !!mPtr; } + + private: + mutable T* mPtr; +}; + +/* Type for records of allocations. */ +struct MemSlot { + void* mPtr; + + // mRequest is only valid if mPtr is non-null. It doesn't need to be cleared + // when memory is freed or realloc()ed. + size_t mRequest; +}; + +/* An almost infinite list of slots. + * In essence, this is a linked list of arrays of groups of slots. + * Each group is 1MB. On 64-bits, one group allows to store 64k allocations. + * Each MemSlotList instance can store 1023 such groups, which means more + * than 67M allocations. In case more would be needed, we chain to another + * MemSlotList, and so on. + * Using 1023 groups makes the MemSlotList itself page sized on 32-bits + * and 2 pages-sized on 64-bits. 
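
The slot addressing that comment describes is plain integer arithmetic: an index selects a group, a position within the group, and, past the 1023 in-line groups, a hop to the chained MemSlotList. A toy rendering of that arithmetic with deliberately tiny constants so the mapping is easy to check by hand (the real values are 1023 groups of 1MiB/sizeof(MemSlot) slots):

    #include <cstddef>
    #include <cstdio>

    static const std::size_t GROUPS = 4;      // real code: 1023
    static const std::size_t GROUP_SIZE = 8;  // real code: (1024 * 1024) / sizeof(MemSlot)

    // Translate a flat slot index into (chain hops, group, slot within group),
    // mirroring what MemSlotList::operator[] does one level at a time.
    static void Locate(std::size_t index) {
      std::size_t hops = 0;
      while (index >= GROUPS * GROUP_SIZE) {
        index -= GROUPS * GROUP_SIZE;  // Delegate to the next MemSlotList in the chain.
        hops++;
      }
      std::printf("chain hop(s)=%zu group=%zu slot=%zu\n", hops, index / GROUP_SIZE,
                  index % GROUP_SIZE);
    }

    int main() {
      Locate(5);   // 0 hops, group 0, slot 5
      Locate(30);  // 0 hops, group 3, slot 6
      Locate(70);  // 2 hops, group 0, slot 6
      return 0;
    }
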
+ */ +class MemSlotList { + static constexpr size_t kGroups = 1024 - 1; + static constexpr size_t kGroupSize = (1024 * 1024) / sizeof(MemSlot); + + MappedArray<MemSlot, kGroupSize> mSlots[kGroups]; + MappedArray<MemSlotList, 1> mNext; + + public: + MemSlot& operator[](size_t aIndex) const { + if (aIndex < kGroupSize * kGroups) { + return mSlots[aIndex / kGroupSize][aIndex % kGroupSize]; + } + aIndex -= kGroupSize * kGroups; + return mNext[0][aIndex]; + } + + // Ask if any of the memory-mapped buffers use this range. + bool ownsMapping(uintptr_t aStart) const { + for (const auto& slot : mSlots) { + if (slot.allocated() && slot.ownsMapping(aStart)) { + return true; + } + } + return mNext.ownsMapping(aStart) || + (mNext.allocated() && mNext[0].ownsMapping(aStart)); + } +}; + +/* Helper class for memory buffers */ +class Buffer { + public: + Buffer() : mBuf(nullptr), mLength(0) {} + + Buffer(const void* aBuf, size_t aLength) + : mBuf(reinterpret_cast<const char*>(aBuf)), mLength(aLength) {} + + /* Constructor for string literals. */ + template <size_t Size> + explicit Buffer(const char (&aStr)[Size]) : mBuf(aStr), mLength(Size - 1) {} + + /* Returns a sub-buffer up-to but not including the given aNeedle character. + * The "parent" buffer itself is altered to begin after the aNeedle + * character. + * If the aNeedle character is not found, return the entire buffer, and empty + * the "parent" buffer. */ + Buffer SplitChar(char aNeedle) { + char* buf = const_cast<char*>(mBuf); + char* c = reinterpret_cast<char*>(memchr(buf, aNeedle, mLength)); + if (!c) { + return Split(mLength); + } + + Buffer result = Split(c - buf); + // Remove the aNeedle character itself. + Split(1); + return result; + } + + // Advance to the position after aNeedle. This is like SplitChar but does not + // return the skipped portion. + void Skip(char aNeedle, unsigned nTimes = 1) { + for (unsigned i = 0; i < nTimes; i++) { + SplitChar(aNeedle); + } + } + + void SkipWhitespace() { + while (mLength > 0) { + if (!IsSpace(mBuf[0])) { + break; + } + mBuf++; + mLength--; + } + } + + static bool IsSpace(char c) { + switch (c) { + case ' ': + case '\t': + case '\n': + case '\v': + case '\f': + case '\r': + return true; + } + return false; + } + + /* Returns a sub-buffer of at most aLength characters. The "parent" buffer is + * amputated of those aLength characters. If the "parent" buffer is smaller + * than aLength, then its length is used instead. */ + Buffer Split(size_t aLength) { + Buffer result(mBuf, std::min(aLength, mLength)); + mLength -= result.mLength; + mBuf += result.mLength; + return result; + } + + /* Move the buffer (including its content) to the memory address of the aOther + * buffer. */ + void Slide(Buffer aOther) { + memmove(const_cast<char*>(aOther.mBuf), mBuf, mLength); + mBuf = aOther.mBuf; + } + + /* Returns whether the two involved buffers have the same content. */ + bool operator==(Buffer aOther) { + return mLength == aOther.mLength && + (mBuf == aOther.mBuf || !strncmp(mBuf, aOther.mBuf, mLength)); + } + + bool operator!=(Buffer aOther) { return !(*this == aOther); } + + /* Returns true if the buffer is not empty. */ + explicit operator bool() { return mLength; } + + char operator[](size_t n) const { return mBuf[n]; } + + /* Returns the memory location of the buffer. */ + const char* get() { return mBuf; } + + /* Returns the memory location of the end of the buffer (technically, the + * first byte after the buffer). 
*/ + const char* GetEnd() { return mBuf + mLength; } + + /* Extend the buffer over the content of the other buffer, assuming it is + * adjacent. */ + void Extend(Buffer aOther) { + MOZ_ASSERT(aOther.mBuf == GetEnd()); + mLength += aOther.mLength; + } + + size_t Length() const { return mLength; } + + private: + const char* mBuf; + size_t mLength; +}; + +/* Helper class to read from a file descriptor line by line. */ +class FdReader { + public: + explicit FdReader(int aFd, bool aNeedClose = false) + : mFd(aFd), + mNeedClose(aNeedClose), + mData(&mRawBuf, 0), + mBuf(&mRawBuf, sizeof(mRawBuf)) {} + + FdReader(FdReader&& aOther) noexcept + : mFd(aOther.mFd), + mNeedClose(aOther.mNeedClose), + mData(&mRawBuf, 0), + mBuf(&mRawBuf, sizeof(mRawBuf)) { + memcpy(mRawBuf, aOther.mRawBuf, sizeof(mRawBuf)); + aOther.mFd = -1; + aOther.mNeedClose = false; + aOther.mData = Buffer(); + aOther.mBuf = Buffer(); + } + + FdReader& operator=(const FdReader&) = delete; + FdReader(const FdReader&) = delete; + + ~FdReader() { + if (mNeedClose) { + close(mFd); + } + } + + /* Read a line from the file descriptor and returns it as a Buffer instance */ + Buffer ReadLine() { + while (true) { + Buffer result = mData.SplitChar('\n'); + + /* There are essentially three different cases here: + * - '\n' was found "early". In this case, the end of the result buffer + * is before the beginning of the mData buffer (since SplitChar + * amputated it). + * - '\n' was found as the last character of mData. In this case, mData + * is empty, but still points at the end of mBuf. result points to what + * used to be in mData, without the last character. + * - '\n' was not found. In this case too, mData is empty and points at + * the end of mBuf. But result points to the entire buffer that used to + * be pointed by mData. + * Only in the latter case do both result and mData's end match, and it's + * the only case where we need to refill the buffer. + */ + if (result.GetEnd() != mData.GetEnd()) { + return result; + } + + /* Since SplitChar emptied mData, make it point to what it had before. */ + mData = result; + + /* And move it to the beginning of the read buffer. */ + mData.Slide(mBuf); + + FillBuffer(); + + if (!mData) { + return Buffer(); + } + } + } + + private: + /* Fill the read buffer. */ + void FillBuffer() { + size_t size = mBuf.GetEnd() - mData.GetEnd(); + Buffer remainder(mData.GetEnd(), size); + + ssize_t len = 1; + while (remainder && len > 0) { + len = ::read(mFd, const_cast<char*>(remainder.get()), size); + if (len < 0) { + die("Read error"); + } + size -= len; + mData.Extend(remainder.Split(len)); + } + } + + /* File descriptor to read from. */ + int mFd; + bool mNeedClose; + + /* Part of data that was read from the file descriptor but not returned with + * ReadLine yet. */ + Buffer mData; + /* Buffer representation of mRawBuf */ + Buffer mBuf; + /* read() buffer */ + char mRawBuf[4096]; +}; + +MOZ_BEGIN_EXTERN_C + +/* Function declarations for all the replace_malloc _impl functions. + * See memory/build/replace_malloc.c */ +#define MALLOC_DECL(name, return_type, ...) \ + return_type name##_impl(__VA_ARGS__); +#define MALLOC_FUNCS MALLOC_FUNCS_MALLOC +#include "malloc_decls.h" + +#define MALLOC_DECL(name, return_type, ...) return_type name(__VA_ARGS__); +#define MALLOC_FUNCS MALLOC_FUNCS_JEMALLOC +#include "malloc_decls.h" + +#ifdef ANDROID + +/* mozjemalloc and jemalloc use pthread_atfork, which Android doesn't have. + * While gecko has one in libmozglue, the replay program can't use that. 
+ * Since we're not going to fork anyways, make it a dummy function. */ +int pthread_atfork(void (*aPrepare)(void), void (*aParent)(void), + void (*aChild)(void)) { + return 0; +} +#endif + +MOZ_END_EXTERN_C + +template <unsigned Base = 10> +size_t parseNumber(Buffer aBuf) { + if (!aBuf) { + die("Malformed input"); + } + + size_t result = 0; + for (const char *c = aBuf.get(), *end = aBuf.GetEnd(); c < end; c++) { + result *= Base; + if ((*c >= '0' && *c <= '9')) { + result += *c - '0'; + } else if (Base == 16 && *c >= 'a' && *c <= 'f') { + result += *c - 'a' + 10; + } else if (Base == 16 && *c >= 'A' && *c <= 'F') { + result += *c - 'A' + 10; + } else { + die("Malformed input"); + } + } + return result; +} + +static size_t percent(size_t a, size_t b) { + if (!b) { + return 0; + } + return size_t(round(double(a) / double(b) * 100.0)); +} + +class Distribution { + public: + // Default constructor used for array initialisation. + Distribution() + : mMaxSize(0), + mNextSmallest(0), + mShift(0), + mArrayOffset(0), + mArraySlots(0), + mTotalRequests(0), + mRequests{0} {} + + Distribution(size_t max_size, size_t next_smallest, size_t bucket_size) + : mMaxSize(max_size), + mNextSmallest(next_smallest), + mShift(CeilingLog2(bucket_size)), + mArrayOffset(1 + next_smallest), + mArraySlots((max_size - next_smallest) >> mShift), + mTotalRequests(0), + mRequests{ + 0, + } { + MOZ_ASSERT(mMaxSize); + MOZ_RELEASE_ASSERT(mArraySlots <= MAX_NUM_BUCKETS); + } + + Distribution& operator=(const Distribution& aOther) = default; + + void addRequest(size_t request) { + MOZ_ASSERT(mMaxSize); + + mRequests[(request - mArrayOffset) >> mShift]++; + mTotalRequests++; + } + + void printDist(intptr_t std_err) { + MOZ_ASSERT(mMaxSize); + + // The translation to turn a slot index into a memory request size. + const size_t array_offset_add = (1 << mShift) + mNextSmallest; + + FdPrintf(std_err, "\n%zu-bin Distribution:\n", mMaxSize); + FdPrintf(std_err, " request : count percent\n"); + size_t range_start = mNextSmallest + 1; + for (size_t j = 0; j < mArraySlots; j++) { + size_t range_end = (j << mShift) + array_offset_add; + FdPrintf(std_err, "%5zu - %5zu: %6zu %6zu%%\n", range_start, range_end, + mRequests[j], percent(mRequests[j], mTotalRequests)); + range_start = range_end + 1; + } + } + + size_t maxSize() const { return mMaxSize; } + + private: + static constexpr size_t MAX_NUM_BUCKETS = 16; + + // If size is zero this distribution is uninitialised. + size_t mMaxSize; + size_t mNextSmallest; + + // Parameters to convert a size into a slot number. + unsigned mShift; + unsigned mArrayOffset; + + // The number of slots. + unsigned mArraySlots; + + size_t mTotalRequests; + size_t mRequests[MAX_NUM_BUCKETS]; +}; + +#ifdef XP_LINUX +struct MemoryMap { + uintptr_t mStart; + uintptr_t mEnd; + bool mReadable; + bool mPrivate; + bool mAnon; + bool mIsStack; + bool mIsSpecial; + size_t mRSS; + + bool IsCandidate() const { + // Candidates mappings are: + // * anonymous + // * they are private (not shared), + // * anonymous or "[heap]" (not another area such as stack), + // + // The only mappings we're falsely including are the .bss segments for + // shared libraries. 
+ return mReadable && mPrivate && mAnon && !mIsStack && !mIsSpecial; + } +}; + +class SMapsReader : private FdReader { + private: + explicit SMapsReader(FdReader&& reader) : FdReader(std::move(reader)) {} + + public: + static Maybe<SMapsReader> open() { + int fd = ::open(FILENAME, O_RDONLY); + if (fd < 0) { + perror(FILENAME); + return mozilla::Nothing(); + } + + return Some(SMapsReader(FdReader(fd, true))); + } + + Maybe<MemoryMap> readMap(intptr_t aStdErr) { + // This is not very tolerant of format changes because things like + // parseNumber will crash if they get a bad value. TODO: make this + // soft-fail. + + Buffer line = ReadLine(); + if (!line) { + return Nothing(); + } + + // We're going to be at the start of an entry, start tokenising the first + // line. + + // Range + Buffer range = line.SplitChar(' '); + uintptr_t range_start = parseNumber<16>(range.SplitChar('-')); + uintptr_t range_end = parseNumber<16>(range); + + // Mode. + Buffer mode = line.SplitChar(' '); + if (mode.Length() != 4) { + FdPrintf(aStdErr, "Couldn't parse SMAPS file\n"); + return Nothing(); + } + bool readable = mode[0] == 'r'; + bool private_ = mode[3] == 'p'; + + // Offset, device and inode. + line.SkipWhitespace(); + bool zero_offset = !parseNumber<16>(line.SplitChar(' ')); + line.SkipWhitespace(); + bool no_device = line.SplitChar(' ') == Buffer("00:00"); + line.SkipWhitespace(); + bool zero_inode = !parseNumber(line.SplitChar(' ')); + bool is_anon = zero_offset && no_device && zero_inode; + + // Filename, or empty for anon mappings. + line.SkipWhitespace(); + Buffer filename = line.SplitChar(' '); + + bool is_stack; + bool is_special; + if (filename && filename[0] == '[') { + is_stack = filename == Buffer("[stack]"); + is_special = filename == Buffer("[vdso]") || + filename == Buffer("[vvar]") || + filename == Buffer("[vsyscall]"); + } else { + is_stack = false; + is_special = false; + } + + size_t rss = 0; + while ((line = ReadLine())) { + Buffer field = line.SplitChar(':'); + if (field == Buffer("VmFlags")) { + // This is the last field, at least in the current format. Break this + // loop to read the next mapping. + break; + } + + if (field == Buffer("Rss")) { + line.SkipWhitespace(); + Buffer value = line.SplitChar(' '); + rss = parseNumber(value) * 1024; + } + } + + return Some(MemoryMap({range_start, range_end, readable, private_, is_anon, + is_stack, is_special, rss})); + } + + static constexpr char FILENAME[] = "/proc/self/smaps"; +}; +#endif // XP_LINUX + +/* Class to handle dispatching the replay function calls to replace-malloc. */ +class Replay { + public: + Replay() { +#ifdef _WIN32 + // See comment in FdPrintf.h as to why native win32 handles are used. 
+ mStdErr = reinterpret_cast<intptr_t>(GetStdHandle(STD_ERROR_HANDLE)); +#else + mStdErr = fileno(stderr); +#endif +#ifdef XP_LINUX + BuildInitialMapInfo(); +#endif + } + + void enableSlopCalculation() { mCalculateSlop = true; } + void enableMemset() { mDoMemset = true; } + + MemSlot& operator[](size_t index) const { return mSlots[index]; } + + void malloc(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t size = parseNumber(aArgs); + aSlot.mPtr = ::malloc_impl(size); + if (aSlot.mPtr) { + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void posix_memalign(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t alignment = parseNumber(aArgs.SplitChar(',')); + size_t size = parseNumber(aArgs); + void* ptr; + if (::posix_memalign_impl(&ptr, alignment, size) == 0) { + aSlot.mPtr = ptr; + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } else { + aSlot.mPtr = nullptr; + } + } + + void aligned_alloc(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t alignment = parseNumber(aArgs.SplitChar(',')); + size_t size = parseNumber(aArgs); + aSlot.mPtr = ::aligned_alloc_impl(alignment, size); + if (aSlot.mPtr) { + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void calloc(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t num = parseNumber(aArgs.SplitChar(',')); + size_t size = parseNumber(aArgs); + aSlot.mPtr = ::calloc_impl(num, size); + if (aSlot.mPtr) { + aSlot.mRequest = num * size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += num * size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void realloc(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + Buffer dummy = aArgs.SplitChar('#'); + if (dummy) { + die("Malformed input"); + } + size_t slot_id = parseNumber(aArgs.SplitChar(',')); + size_t size = parseNumber(aArgs); + MemSlot& old_slot = (*this)[slot_id]; + void* old_ptr = old_slot.mPtr; + old_slot.mPtr = nullptr; + aSlot.mPtr = ::realloc_impl(old_ptr, size); + if (aSlot.mPtr) { + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void free(Buffer& aArgs, Buffer& aResult) { + if (aResult) { + die("Malformed input"); + } + mOps++; + Buffer dummy = aArgs.SplitChar('#'); + if (dummy) { + die("Malformed input"); + } + size_t slot_id = parseNumber(aArgs); + MemSlot& slot = (*this)[slot_id]; + ::free_impl(slot.mPtr); + slot.mPtr = nullptr; + } + + void memalign(Buffer& aArgs, Buffer& aResult) { + MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t alignment = parseNumber(aArgs.SplitChar(',')); + size_t size = parseNumber(aArgs); + aSlot.mPtr = ::memalign_impl(alignment, size); + if (aSlot.mPtr) { + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void valloc(Buffer& aArgs, Buffer& aResult) { + 
MemSlot& aSlot = SlotForResult(aResult); + mOps++; + size_t size = parseNumber(aArgs); + aSlot.mPtr = ::valloc_impl(size); + if (aSlot.mPtr) { + aSlot.mRequest = size; + MaybeCommit(aSlot); + if (mCalculateSlop) { + mTotalRequestedSize += size; + mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr); + } + } + } + + void jemalloc_stats(Buffer& aArgs, Buffer& aResult) { + if (aArgs || aResult) { + die("Malformed input"); + } + mOps++; + jemalloc_stats_t stats; + // Using a variable length array here is a GCC & Clang extension. But it + // allows us to place this on the stack and not alter jemalloc's profiling. + const size_t num_bins = ::jemalloc_stats_num_bins(); + const size_t MAX_NUM_BINS = 100; + if (num_bins > MAX_NUM_BINS) { + die("Exceeded maximum number of jemalloc stats bins"); + } + jemalloc_bin_stats_t bin_stats[MAX_NUM_BINS] = {{0}}; + ::jemalloc_stats_internal(&stats, bin_stats); + +#ifdef XP_LINUX + size_t rss = get_rss(); +#endif + + size_t num_objects = 0; + size_t num_sloppy_objects = 0; + size_t total_allocated = 0; + size_t total_slop = 0; + size_t large_slop = 0; + size_t large_used = 0; + size_t huge_slop = 0; + size_t huge_used = 0; + size_t bin_slop[MAX_NUM_BINS] = {0}; + + for (size_t slot_id = 0; slot_id < mNumUsedSlots; slot_id++) { + MemSlot& slot = mSlots[slot_id]; + if (slot.mPtr) { + size_t used = ::malloc_usable_size_impl(slot.mPtr); + size_t slop = used - slot.mRequest; + total_allocated += used; + total_slop += slop; + num_objects++; + if (slop) { + num_sloppy_objects++; + } + + if (used <= + (stats.subpage_max ? stats.subpage_max : stats.quantum_wide_max)) { + // We know that this is an inefficient linear search, but there's a + // small number of bins and this is simple. + for (unsigned i = 0; i < num_bins; i++) { + auto& bin = bin_stats[i]; + if (used == bin.size) { + bin_slop[i] += slop; + break; + } + } + } else if (used <= stats.large_max) { + large_slop += slop; + large_used += used; + } else { + huge_slop += slop; + huge_used += used; + } + } + } + + // This formula corresponds to the calculation of wasted (from committed and + // the other parameters) within jemalloc_stats() + size_t committed = stats.allocated + stats.waste + stats.page_cache + + stats.bookkeeping + stats.bin_unused; + + FdPrintf(mStdErr, "\n"); + FdPrintf(mStdErr, "Objects: %9zu\n", num_objects); + FdPrintf(mStdErr, "Slots: %9zu\n", mNumUsedSlots); + FdPrintf(mStdErr, "Ops: %9zu\n", mOps); + FdPrintf(mStdErr, "mapped: %9zu\n", stats.mapped); + FdPrintf(mStdErr, "committed: %9zu\n", committed); +#ifdef XP_LINUX + if (rss) { + FdPrintf(mStdErr, "rss: %9zu\n", rss); + } +#endif + FdPrintf(mStdErr, "allocated: %9zu\n", stats.allocated); + FdPrintf(mStdErr, "waste: %9zu\n", stats.waste); + FdPrintf(mStdErr, "dirty: %9zu\n", stats.page_cache); + FdPrintf(mStdErr, "bookkeep: %9zu\n", stats.bookkeeping); + FdPrintf(mStdErr, "bin-unused: %9zu\n", stats.bin_unused); + FdPrintf(mStdErr, "quantum-max: %9zu\n", stats.quantum_max); + FdPrintf(mStdErr, "quantum-wide-max: %9zu\n", stats.quantum_wide_max); + FdPrintf(mStdErr, "subpage-max: %9zu\n", stats.subpage_max); + FdPrintf(mStdErr, "large-max: %9zu\n", stats.large_max); + if (mCalculateSlop) { + size_t slop = mTotalAllocatedSize - mTotalRequestedSize; + FdPrintf(mStdErr, + "Total slop for all allocations: %zuKiB/%zuKiB (%zu%%)\n", + slop / 1024, mTotalAllocatedSize / 1024, + percent(slop, mTotalAllocatedSize)); + } + FdPrintf(mStdErr, "Live sloppy objects: %zu/%zu (%zu%%)\n", + num_sloppy_objects, num_objects, + 
percent(num_sloppy_objects, num_objects)); + FdPrintf(mStdErr, "Live sloppy bytes: %zuKiB/%zuKiB (%zu%%)\n", + total_slop / 1024, total_allocated / 1024, + percent(total_slop, total_allocated)); + + FdPrintf(mStdErr, "\n%8s %11s %10s %8s %9s %9s %8s\n", "bin-size", + "unused (c)", "total (c)", "used (c)", "non-full (r)", "total (r)", + "used (r)"); + for (unsigned i = 0; i < num_bins; i++) { + auto& bin = bin_stats[i]; + MOZ_ASSERT(bin.size); + FdPrintf(mStdErr, "%8zu %8zuKiB %7zuKiB %7zu%% %12zu %9zu %7zu%%\n", + bin.size, bin.bytes_unused / 1024, bin.bytes_total / 1024, + percent(bin.bytes_total - bin.bytes_unused, bin.bytes_total), + bin.num_non_full_runs, bin.num_runs, + percent(bin.num_runs - bin.num_non_full_runs, bin.num_runs)); + } + + FdPrintf(mStdErr, "\n%5s %8s %9s %7s\n", "bin", "slop", "used", "percent"); + for (unsigned i = 0; i < num_bins; i++) { + auto& bin = bin_stats[i]; + size_t used = bin.bytes_total - bin.bytes_unused; + FdPrintf(mStdErr, "%5zu %8zu %9zu %6zu%%\n", bin.size, bin_slop[i], used, + percent(bin_slop[i], used)); + } + FdPrintf(mStdErr, "%5s %8zu %9zu %6zu%%\n", "large", large_slop, large_used, + percent(large_slop, large_used)); + FdPrintf(mStdErr, "%5s %8zu %9zu %6zu%%\n", "huge", huge_slop, huge_used, + percent(huge_slop, huge_used)); + + print_distributions(stats, bin_stats); + } + + private: + /* + * Create and print frequency distributions of memory requests. + */ + void print_distributions(jemalloc_stats_t& stats, + jemalloc_bin_stats_t* bin_stats) { + const size_t num_bins = ::jemalloc_stats_num_bins(); + + // We compute distributions for all of the bins for small allocations + // (num_bins) plus two more distributions for larger allocations. + Distribution dists[num_bins + 2]; + + unsigned last_size = 0; + unsigned num_dists = 0; + for (unsigned i = 0; i < num_bins; i++) { + auto& bin = bin_stats[i]; + auto& dist = dists[num_dists++]; + + MOZ_ASSERT(bin.size); + if (bin.size <= 16) { + // 1 byte buckets. + dist = Distribution(bin.size, last_size, 1); + } else if (bin.size <= stats.quantum_max) { + // 4 buckets, (4 bytes per bucket with a 16 byte quantum). + dist = Distribution(bin.size, last_size, stats.quantum / 4); + } else if (bin.size <= stats.quantum_wide_max) { + // 8 buckets, (32 bytes per bucket with a 256 byte quantum-wide). + dist = Distribution(bin.size, last_size, stats.quantum_wide / 8); + } else { + // 16 buckets. + dist = Distribution(bin.size, last_size, (bin.size - last_size) / 16); + } + last_size = bin.size; + } + + // 16 buckets. + dists[num_dists] = Distribution(stats.page_size, last_size, + (stats.page_size - last_size) / 16); + num_dists++; + + // Buckets are 1/4 of the page size (12 buckets). + dists[num_dists] = + Distribution(stats.page_size * 4, stats.page_size, stats.page_size / 4); + num_dists++; + + MOZ_RELEASE_ASSERT(num_dists <= num_bins + 2); + + for (size_t slot_id = 0; slot_id < mNumUsedSlots; slot_id++) { + MemSlot& slot = mSlots[slot_id]; + if (slot.mPtr) { + for (size_t i = 0; i < num_dists; i++) { + if (slot.mRequest <= dists[i].maxSize()) { + dists[i].addRequest(slot.mRequest); + break; + } + } + } + } + + for (unsigned i = 0; i < num_dists; i++) { + dists[i].printDist(mStdErr); + } + } + +#ifdef XP_LINUX + size_t get_rss() { + if (mGetRSSFailed) { + return 0; + } + + // On Linux we can determine the RSS of the heap area by examining the + // smaps file. 
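
The SMapsReader/MemoryMap machinery above implements exactly what this comment says: walk /proc/self/smaps, keep only candidate anonymous mappings, and sum their Rss fields. Stripped of that filtering, the core of the technique looks roughly like the following Linux-only sketch, which sums every Rss line (the real get_rss additionally skips the replay tool's own mappings and those recorded at startup):

    #include <cstdio>
    #include <fstream>
    #include <string>

    int main() {
      std::ifstream smaps("/proc/self/smaps");
      std::string token;
      unsigned long long rss_kib = 0;
      while (smaps >> token) {
        if (token == "Rss:") {
          unsigned long long value = 0;
          smaps >> value;  // The next token is the size, reported in kB.
          rss_kib += value;
        }
      }
      std::printf("total RSS: %llu KiB\n", rss_kib);
      return 0;
    }
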
+ mozilla::Maybe<SMapsReader> reader = SMapsReader::open(); + if (!reader) { + mGetRSSFailed = true; + return 0; + } + + size_t rss = 0; + while (Maybe<MemoryMap> map = reader->readMap(mStdErr)) { + if (map->IsCandidate() && !mSlots.ownsMapping(map->mStart) && + !InitialMapsContains(map->mStart)) { + rss += map->mRSS; + } + } + + return rss; + } + + bool InitialMapsContains(uintptr_t aRangeStart) { + for (unsigned i = 0; i < mNumInitialMaps; i++) { + MOZ_ASSERT(i < MAX_INITIAL_MAPS); + + if (mInitialMaps[i] == aRangeStart) { + return true; + } + } + return false; + } + + public: + void BuildInitialMapInfo() { + if (mGetRSSFailed) { + return; + } + + Maybe<SMapsReader> reader = SMapsReader::open(); + if (!reader) { + mGetRSSFailed = true; + return; + } + + while (Maybe<MemoryMap> map = reader->readMap(mStdErr)) { + if (map->IsCandidate()) { + if (mNumInitialMaps >= MAX_INITIAL_MAPS) { + FdPrintf(mStdErr, "Too many initial mappings, can't compute RSS\n"); + mGetRSSFailed = false; + return; + } + + mInitialMaps[mNumInitialMaps++] = map->mStart; + } + } + } +#endif + + private: + MemSlot& SlotForResult(Buffer& aResult) { + /* Parse result value and get the corresponding slot. */ + Buffer dummy = aResult.SplitChar('='); + Buffer dummy2 = aResult.SplitChar('#'); + if (dummy || dummy2) { + die("Malformed input"); + } + + size_t slot_id = parseNumber(aResult); + mNumUsedSlots = std::max(mNumUsedSlots, slot_id + 1); + + return mSlots[slot_id]; + } + + void MaybeCommit(MemSlot& aSlot) { + if (mDoMemset) { + // Write any byte, 0x55 isn't significant. + memset(aSlot.mPtr, 0x55, aSlot.mRequest); + } + } + + intptr_t mStdErr; + size_t mOps = 0; + + // The number of slots that have been used. It is used to iterate over slots + // without accessing those we haven't initialised. + size_t mNumUsedSlots = 0; + + MemSlotList mSlots; + size_t mTotalRequestedSize = 0; + size_t mTotalAllocatedSize = 0; + // Whether to calculate slop for all allocations over the runtime of a + // process. + bool mCalculateSlop = false; + bool mDoMemset = false; + +#ifdef XP_LINUX + // If we have a failure reading smaps info then this is used to disable that + // feature. + bool mGetRSSFailed = false; + + // The initial memory mappings are recorded here at start up. We exclude + // memory in these mappings when computing RSS. We assume they do not grow + // and that no regions are allocated near them, this is true because they'll + // only record the .bss and .data segments from our binary and shared objects + // or regions that logalloc-replay has created for MappedArrays. + // + // 64 should be enough for anybody. + static constexpr unsigned MAX_INITIAL_MAPS = 64; + uintptr_t mInitialMaps[MAX_INITIAL_MAPS]; + unsigned mNumInitialMaps = 0; +#endif // XP_LINUX +}; + +static Replay replay; + +int main(int argc, const char* argv[]) { + size_t first_pid = 0; + FdReader reader(0); + + for (int i = 1; i < argc; i++) { + const char* option = argv[i]; + if (strcmp(option, "-s") == 0) { + // Do accounting to calculate allocation slop. + replay.enableSlopCalculation(); + } else if (strcmp(option, "-c") == 0) { + // Touch memory as we allocate it. + replay.enableMemset(); + } else { + fprintf(stderr, "Unknown command line option: %s\n", option); + return EXIT_FAILURE; + } + } + + /* Read log from stdin and dispatch function calls to the Replay instance. + * The log format is essentially: + * <pid> <tid> <function>([<args>])[=<result>] + * <args> is a comma separated list of arguments. 
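
Each such line is decomposed below with Buffer::SplitChar on ' ', '(' and ')'. For illustration only, here is the same decomposition written against std::string (the real code avoids std::string precisely so that its own bookkeeping never allocates; these names are not from Replay.cpp):

    #include <cstddef>
    #include <cstdio>
    #include <string>

    // Split "1 1 malloc(42)=#1" into pid/tid/function/args/result.
    // Assumes a well-formed, already-munged log line.
    static void ParseLogLine(const std::string& line) {
      std::size_t sp1 = line.find(' ');
      std::size_t sp2 = line.find(' ', sp1 + 1);
      std::size_t open = line.find('(', sp2 + 1);
      std::size_t close = line.find(')', open + 1);

      std::string pid = line.substr(0, sp1);
      std::string tid = line.substr(sp1 + 1, sp2 - sp1 - 1);
      std::string func = line.substr(sp2 + 1, open - sp2 - 1);
      std::string args = line.substr(open + 1, close - open - 1);
      // '=' and the slot number are optional: free() and jemalloc_stats() have none.
      std::string result = (close + 1 < line.size() && line[close + 1] == '=')
                               ? line.substr(close + 2)
                               : std::string();

      std::printf("pid=%s tid=%s func=%s args=[%s] result=%s\n", pid.c_str(),
                  tid.c_str(), func.c_str(), args.c_str(), result.c_str());
    }

    int main() {
      ParseLogLine("1 1 malloc(42)=#1");
      ParseLogLine("1 1 free(#1)");
      ParseLogLine("1 1 jemalloc_stats()");
      return 0;
    }
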
+ * + * The logs are expected to be preprocessed so that allocations are + * attributed a tracking slot. The input is trusted not to have crazy + * values for these slot numbers. + * + * <result>, as well as some of the args to some of the function calls are + * such slot numbers. + */ + while (true) { + Buffer line = reader.ReadLine(); + + if (!line) { + break; + } + + size_t pid = parseNumber(line.SplitChar(' ')); + if (!first_pid) { + first_pid = pid; + } + + /* The log may contain data for several processes, only entries for the + * very first that appears are treated. */ + if (first_pid != pid) { + continue; + } + + /* The log contains thread ids for manual analysis, but we just ignore them + * for now. */ + parseNumber(line.SplitChar(' ')); + + Buffer func = line.SplitChar('('); + Buffer args = line.SplitChar(')'); + + if (func == Buffer("jemalloc_stats")) { + replay.jemalloc_stats(args, line); + } else if (func == Buffer("free")) { + replay.free(args, line); + } else if (func == Buffer("malloc")) { + replay.malloc(args, line); + } else if (func == Buffer("posix_memalign")) { + replay.posix_memalign(args, line); + } else if (func == Buffer("aligned_alloc")) { + replay.aligned_alloc(args, line); + } else if (func == Buffer("calloc")) { + replay.calloc(args, line); + } else if (func == Buffer("realloc")) { + replay.realloc(args, line); + } else if (func == Buffer("memalign")) { + replay.memalign(args, line); + } else if (func == Buffer("valloc")) { + replay.valloc(args, line); + } else { + die("Malformed input"); + } + } + + return 0; +} diff --git a/memory/replace/logalloc/replay/expected_output_minimal.log b/memory/replace/logalloc/replay/expected_output_minimal.log new file mode 100644 index 0000000000..332fe20957 --- /dev/null +++ b/memory/replace/logalloc/replay/expected_output_minimal.log @@ -0,0 +1,17 @@ +1 1 jemalloc_stats() +1 1 malloc(42)=#1 +1 1 malloc(24)=#2 +1 1 free(#1) +1 1 memalign(4096,1024)=#1 +1 1 calloc(4,42)=#3 +1 1 free(#2) +1 1 realloc(#3,84)=#2 +1 1 memalign(256,1024)=#3 +1 1 memalign(512,1024)=#4 +1 1 memalign(4096,1024)=#5 +1 1 jemalloc_stats() +1 1 free(#5) +1 1 free(#4) +1 1 free(#3) +1 1 free(#2) +1 1 free(#1) diff --git a/memory/replace/logalloc/replay/logalloc_munge.py b/memory/replace/logalloc/replay/logalloc_munge.py new file mode 100644 index 0000000000..52d0032463 --- /dev/null +++ b/memory/replace/logalloc/replay/logalloc_munge.py @@ -0,0 +1,147 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +This script takes a log from the replace-malloc logalloc library on stdin +and munges it so that it can be used with the logalloc-replay tool. + +Given the following output: + 13663 malloc(42)=0x7f0c33502040 + 13663 malloc(24)=0x7f0c33503040 + 13663 free(0x7f0c33502040) +The resulting output is: + 1 malloc(42)=#1 + 1 malloc(24)=#2 + 1 free(#1) + +See README for more details. +""" + +import sys +from collections import defaultdict, deque + + +class IdMapping(object): + """Class to map values to ids. + + Each value is associated to an increasing id, starting from 1. + When a value is removed, its id is recycled and will be reused for + subsequent values. 
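
The id recycling this docstring describes is what keeps slot numbers small and densely packed in the munged log. The same bookkeeping, expressed as a small self-contained C++ sketch (PointerIds is an illustrative name; the script itself stays in Python):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <deque>
    #include <map>

    // Pointers get increasing ids starting at 1; an id released by Remove() is
    // handed out again to the next previously-unseen pointer.
    class PointerIds {
     public:
      std::size_t Get(std::uintptr_t ptr) {
        auto it = mValues.find(ptr);
        if (it != mValues.end()) {
          return it->second;
        }
        std::size_t id;
        if (!mRecycle.empty()) {
          id = mRecycle.front();
          mRecycle.pop_front();
        } else {
          id = mNextId++;
        }
        mValues[ptr] = id;
        return id;
      }

      void Remove(std::uintptr_t ptr) {
        auto it = mValues.find(ptr);
        if (it != mValues.end()) {
          mRecycle.push_back(it->second);
          mValues.erase(it);
        }
      }

     private:
      std::size_t mNextId = 1;
      std::map<std::uintptr_t, std::size_t> mValues;
      std::deque<std::size_t> mRecycle;
    };

    int main() {
      PointerIds ids;
      std::printf("%zu\n", ids.Get(0x1000));  // 1
      std::printf("%zu\n", ids.Get(0x2000));  // 2
      ids.Remove(0x1000);
      std::printf("%zu\n", ids.Get(0x3000));  // 1 again: the freed id is recycled
      return 0;
    }
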
+ """ + + def __init__(self): + self.id = 1 + self._values = {} + self._recycle = deque() + + def __getitem__(self, value): + if value not in self._values: + if self._recycle: + self._values[value] = self._recycle.popleft() + else: + self._values[value] = self.id + self.id += 1 + return self._values[value] + + def __delitem__(self, value): + if value == 0: + return + self._recycle.append(self._values[value]) + del self._values[value] + + def __contains__(self, value): + return value == 0 or value in self._values + + +class Ignored(Exception): + pass + + +def split_log_line(line): + try: + # The format for each line is: + # <pid> [<tid>] <function>([<args>])[=<result>] + # + # The original format didn't include the tid, so we try to parse + # lines whether they have one or not. + pid, func_call = line.split(" ", 1) + call, result = func_call.split(")") + func, args = call.split("(") + args = args.split(",") if args else [] + if result: + if result[0] != "=": + raise Ignored("Malformed input") + result = result[1:] + if " " in func: + tid, func = func.split(" ", 1) + else: + tid = pid + return pid, tid, func, args, result + except Exception: + raise Ignored("Malformed input") + + +NUM_ARGUMENTS = { + "jemalloc_stats": 0, + "free": 1, + "malloc": 1, + "posix_memalign": 2, + "aligned_alloc": 2, + "calloc": 2, + "realloc": 2, + "memalign": 2, + "valloc": 1, +} + + +def main(): + pids = IdMapping() + processes = defaultdict(lambda: {"pointers": IdMapping(), "tids": IdMapping()}) + for line in sys.stdin: + line = line.strip() + + try: + pid, tid, func, args, result = split_log_line(line) + + # Replace pid with an id. + pid = pids[int(pid)] + + process = processes[pid] + tid = process["tids"][int(tid)] + + pointers = process["pointers"] + + if func not in NUM_ARGUMENTS: + raise Ignored("Unknown function") + + if len(args) != NUM_ARGUMENTS[func]: + raise Ignored("Malformed input") + + if func in ("jemalloc_stats", "free") and result: + raise Ignored("Malformed input") + + if func in ("free", "realloc"): + ptr = int(args[0], 16) + if ptr and ptr not in pointers: + raise Ignored("Did not see an alloc for pointer") + args[0] = "#%d" % pointers[ptr] + del pointers[ptr] + + if result: + result = int(result, 16) + if not result: + raise Ignored("Result is NULL") + result = "#%d" % pointers[result] + + print( + "%d %d %s(%s)%s" + % (pid, tid, func, ",".join(args), "=%s" % result if result else "") + ) + + except Exception as e: + print('Ignored "%s": %s' % (line, e), file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/memory/replace/logalloc/replay/moz.build b/memory/replace/logalloc/replay/moz.build new file mode 100644 index 0000000000..1d39864699 --- /dev/null +++ b/memory/replace/logalloc/replay/moz.build @@ -0,0 +1,92 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +Program("logalloc-replay") + +SOURCES += [ + "/mfbt/Assertions.cpp", + "/mfbt/Poison.cpp", + "/mfbt/RandomNum.cpp", + "/mfbt/TaggedAnonymousMemory.cpp", + "/mfbt/Unused.cpp", + "/mozglue/misc/StackWalk.cpp", + "Replay.cpp", +] + +if CONFIG["OS_TARGET"] == "WINNT": + SOURCES += [ + "/mozglue/misc/ProcessType.cpp", + ] + +if CONFIG["OS_TARGET"] == "Linux": + LDFLAGS += ["-static-libstdc++"] + +if CONFIG["OS_TARGET"] == "Darwin": + # Work around "warning: 'aligned_alloc' is only available on macOS 10.15 or newer" + # when building with MACOSX_DEPLOYMENT_TARGET < 10.15 with >= 10.15 SDK. + # We have our own definition of the function, so it doesn't matter what the SDK says. + SOURCES["Replay.cpp"].flags += ["-Wno-unguarded-availability-new"] + +if CONFIG["MOZ_REPLACE_MALLOC_STATIC"] and (CONFIG["MOZ_DMD"] or CONFIG["MOZ_PHC"]): + UNIFIED_SOURCES += [ + "/mfbt/HashFunctions.cpp", + "/mfbt/JSONWriter.cpp", + ] + +if CONFIG["OS_ARCH"] == "WINNT": + OS_LIBS += [ + "advapi32", + "dbghelp", + ] + +if CONFIG["MOZ_LINKER"] and CONFIG["MOZ_WIDGET_TOOLKIT"] == "android": + LOCAL_INCLUDES += [ + "/mozglue/linker", + ] + DEFINES["__wrap_dladdr"] = "dladdr" + + +if CONFIG["MOZ_BUILD_APP"] == "memory": + EXPORTS.mozilla += [ + "/mozglue/misc/StackWalk.h", + ] + +if CONFIG["MOZ_BUILD_APP"] == "memory" or CONFIG["MOZ_REPLACE_MALLOC_STATIC"]: + UNIFIED_SOURCES += [ + "/mfbt/double-conversion/double-conversion/bignum-dtoa.cc", + "/mfbt/double-conversion/double-conversion/bignum.cc", + "/mfbt/double-conversion/double-conversion/cached-powers.cc", + "/mfbt/double-conversion/double-conversion/double-to-string.cc", + "/mfbt/double-conversion/double-conversion/fast-dtoa.cc", + "/mfbt/double-conversion/double-conversion/fixed-dtoa.cc", + "/mfbt/double-conversion/double-conversion/string-to-double.cc", + "/mfbt/double-conversion/double-conversion/strtod.cc", + "/mozglue/misc/Printf.cpp", + ] + +if not CONFIG["MOZ_REPLACE_MALLOC_STATIC"]: + SOURCES += [ + "../FdPrintf.cpp", + ] + +LOCAL_INCLUDES += [ + "..", +] + +# Link replace-malloc and the default allocator. +USE_LIBS += [ + "memory", +] + +# The memory library defines this, so it's needed here too. +DEFINES["IMPL_MFBT"] = True + +if CONFIG["MOZ_NEEDS_LIBATOMIC"]: + OS_LIBS += ["atomic"] + +DisableStlWrapping() + +include("/mozglue/build/replace_malloc.mozbuild") diff --git a/memory/replace/logalloc/replay/replay.log b/memory/replace/logalloc/replay/replay.log new file mode 100644 index 0000000000..f1e6de788b --- /dev/null +++ b/memory/replace/logalloc/replay/replay.log @@ -0,0 +1,18 @@ +1 1 jemalloc_stats() +1 1 malloc(42)=#1 +1 1 malloc(24)=#2 +2 2 malloc(42)=#1 +1 1 free(#1) +1 1 posix_memalign(4096,1024)=#1 +1 1 calloc(4,42)=#3 +1 1 free(#2) +1 1 realloc(#3,84)=#2 +1 1 aligned_alloc(256,1024)=#3 +1 1 memalign(512,1024)=#4 +1 1 valloc(1024)=#5 +1 1 jemalloc_stats() +1 1 free(#5) +1 1 free(#4) +1 1 free(#3) +1 1 free(#2) +1 1 free(#1) |