From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- tools/jprof/README.html | 330 ++++++++++++++++ tools/jprof/bfd.cpp | 221 +++++++++++ tools/jprof/coff.cpp | 93 +++++ tools/jprof/elf.cpp | 128 +++++++ tools/jprof/intcnt.cpp | 69 ++++ tools/jprof/intcnt.h | 39 ++ tools/jprof/jprofsig | 46 +++ tools/jprof/leaky.cpp | 827 +++++++++++++++++++++++++++++++++++++++++ tools/jprof/leaky.h | 124 ++++++ tools/jprof/moz.build | 28 ++ tools/jprof/split-profile.py | 156 ++++++++ tools/jprof/strset.cpp | 37 ++ tools/jprof/strset.h | 19 + tools/jprof/stub/Makefile.in | 8 + tools/jprof/stub/config.h | 18 + tools/jprof/stub/jprof.h | 17 + tools/jprof/stub/libmalloc.cpp | 740 ++++++++++++++++++++++++++++++++++++ tools/jprof/stub/libmalloc.h | 45 +++ tools/jprof/stub/moz.build | 17 + 19 files changed, 2962 insertions(+) create mode 100644 tools/jprof/README.html create mode 100644 tools/jprof/bfd.cpp create mode 100644 tools/jprof/coff.cpp create mode 100644 tools/jprof/elf.cpp create mode 100644 tools/jprof/intcnt.cpp create mode 100644 tools/jprof/intcnt.h create mode 100755 tools/jprof/jprofsig create mode 100644 tools/jprof/leaky.cpp create mode 100644 tools/jprof/leaky.h create mode 100644 tools/jprof/moz.build create mode 100755 tools/jprof/split-profile.py create mode 100644 tools/jprof/strset.cpp create mode 100644 tools/jprof/strset.h create mode 100644 tools/jprof/stub/Makefile.in create mode 100644 tools/jprof/stub/config.h create mode 100644 tools/jprof/stub/jprof.h create mode 100644 tools/jprof/stub/libmalloc.cpp create mode 100644 tools/jprof/stub/libmalloc.h create mode 100644 tools/jprof/stub/moz.build (limited to 'tools/jprof') diff --git a/tools/jprof/README.html b/tools/jprof/README.html new file mode 100644 index 0000000000..ac25acc5ef --- /dev/null +++ b/tools/jprof/README.html @@ -0,0 +1,330 @@ + + + +The 
Jprof Profiler + + +
+

The Jprof Profiler

+ +jim_nance@yahoo.com

+Recent (4/2011) updates Randell Jesup (see bugzilla for contact info) + +


+ +Introduction | Operation | +Setup | Usage | +Interpretation + +
+
+ +

Introduction

+ +Jprof is a profiling tool. I am writing it because I need to find out +where mozilla is spending its time, and there do not seem to be any +profilers for Linux that can handle threads and/or shared libraries. +This code is based heavily on Kipp Hickman's leaky. + +

Operation

+ +Jprof operates by installing a timer which periodically interrupts mozilla. +When this timer goes off, the jprof code inside mozilla walks the function call +stack to determine which code was executing and saves the results into the +jprof-log and jprof-map files. By collecting a large +number of these call stacks, it is possible to deduce where mozilla is spending +its time. + +

Setup

+ +

Configure your mozilla with jprof support by adding +--enable-jprof to your configure options (e.g. by adding +ac_add_options --enable-jprof to your .mozconfig) and +making sure that you do not have the +--enable-strip configure option set -- jprof needs symbols to +operate. On many architectures with GCC, you'll need to add +--enable-optimize="-O3 -fno-omit-frame-pointer" or the +equivalent to ensure frame pointer generation in the compiler you're using.

+ +

Finally, build mozilla with your new configuration. Now you can run jprof.

+ +

Usage

+
 jprof [-v] [-t] [-e exclude] [-i include] [-s stackdepth] [--last] [--all] [--start n [--end m]] [--output-dir dir] prog log [log2 ...]
+Options: + +The behavior of jprof is determined by the value of the JPROF_FLAGS environment +variable. This environment variable can be composed of several substrings +which have the following meanings: + + +

Starting and stopping jprof from JavaScript

+

+A build with jprof enabled adds four functions to the Window object:

+JProfStartProfiling() and JProfStopProfiling(): When used with JP_DEFER, these +allow one to start and stop the timer just around whatever critical section is +being profiled.

+JProfClearCircular() and JProfSaveCircular(): +These clear the circular buffer and save the buffer (without stopping), respectively.

+ +

Examples of JPROF_FLAGS usage

+ + +

Pausing profiles

+ +

jprof can be paused at any time by sending a SIGUSR1 to mozilla (kill +-USR1). This will cause the timer signals to stop and jprof-map to be +written, but it will not close jprof-log. Combining SIGUSR1 with the JP_DEFER +option allows profiling of one sequence of actions by starting the timer right +before starting the actions and stopping the timer right afterward. + +

After a SIGUSR1, sending another timer signal (SIGPROF, SIGALRM, or SIGPOLL (aka SIGIO), +depending on the mode) can be used to continue writing data to the same +output. + +

SIGUSR2 will cause the circular buffer to be cleared, if it's in use. +This is useful right before running a test when you're using a large, +continuous circular buffer, or programmatically at the start of an action +which might take too long (JProfClearCircular()). + +

Looking at the results

+ +Now that we have jprof-log and jprof-map files, we +can use the jprof executable to turn them into readable output. To do +this jprof needs the name of the mozilla binary and the log file. It deduces +the name of the map file: + +
+  ./jprof /home/user/mozilla/objdir/dist/bin/firefox ./jprof-log > tmp.html
+
+ +This will generate the file tmp.html which you should view in a +web browser. + +
+  ./jprof --output-dir=/tmp /home/user/mozilla/objdir/dist/bin/firefox ./jprof-log*
+
+ +This will generate a set of files in /tmp for each process. + + +

Interpretation

+ + +The Jprof output is split into a flat portion and a hierarchical portion. +There are links to each section at the top of the page. It is typically +easier to analyze the profile by starting with the flat output and following +the links contained in the flat output up to the hierarchical output. + +

Flat output

+ +The flat portion of the profile indicates which functions were executing +when the timer was going off. It is displayed as a list of function names +on the right and the number of times each function was interrupted on the +left. The list is sorted by decreasing interrupt count. For example: + +
+Total hit count: 151603
+Count %Total  Function Name
+
+8806   5.8     __libc_poll
+2254   1.5     __i686.get_pc_thunk.bx
+2053   1.4     _int_malloc
+1777   1.2     ComputedStyle::GetStyleData(nsStyleStructID)
+1600   1.1     __libc_malloc
+1552   1.0     nsCOMPtr_base::~nsCOMPtr_base()
+
+ +This shows that of the 151603 times the timer fired, 1777 (1.2% of the total) were inside ComputedStyle::GetStyleData() and 1552 (1.0% of the total) were in the nsCOMPtr_base destructor. + +

+In general, the functions with the highest count are the functions which +are taking the most time. + +

+The function names are linked to the entry for that function in the +hierarchical profile, which is described in the next section. + +

Hierarchical output

+ +The hierarchical output is divided up into sections, with each section +corresponding to one function. A typical section looks something like +this: + +
+ index  Count         Hits      Function Name
+                           545 (46.4%) nsBlockFrame::ReflowInlineFrames(nsBlockReflowState&, nsLineList_iterator, int*)
+                           100 (8.5%)  nsBlockFrame::ReflowDirtyLines(nsBlockReflowState&)
+ 72870      4 (0.3%)       645 (54.9%) nsBlockFrame::DoReflowInlineFrames(nsBlockReflowState&, nsLineLayout&, nsLineList_iterator, nsFlowAreaRect&, int&, nsFloatManager::SavedState*, int*, LineReflowStatus*, int)
+                           545 (46.4%) nsBlockFrame::ReflowInlineFrame(nsBlockReflowState&, nsLineLayout&, nsLineList_iterator, nsIFrame*, LineReflowStatus*)
+                            83 (7.1%)  nsBlockFrame::PlaceLine(nsBlockReflowState&, nsLineLayout&, nsLineList_iterator, nsFloatManager::SavedState*, nsRect&, int&, int*)
+                             9 (0.8%)  nsLineLayout::BeginLineReflow(int, int, int, int, int, int)
+                             1 (0.1%)  nsTextFrame::GetType() const
+                             1 (0.1%)  nsLineLayout::RelativePositionFrames(nsOverflowAreas&)
+                             1 (0.1%)  __i686.get_pc_thunk.bx
+                             1 (0.1%)  PL_ArenaAllocate
+
+ +The information this block tells us is: + + + + +The rest of this section explains how to read this information off from the jprof output. + +

This block corresponds to the function nsBlockFrame::DoReflowInlineFrames, which is +therefore bolded and not a link. The name of this function is preceded by +five numbers which have the following meaning. The number on the left (72870) +is the index number, and is not important. The next number (4) and the +percentage following (0.3%) are the number +of times this function was interrupted by the timer and the percentage of +the total hits that this represents. The last number pair ("645 (54.9%)") +gives the number of times this function was in the call stack when the timer went +off, and the corresponding percentage. That is, the timer went off while we were in code that was ultimately +called from nsBlockFrame::DoReflowInlineFrames. +

For our example we can see that our function was in the call stack for +645 interrupt ticks, but it was the function actually running when +the interrupt arrived only 4 times. +

+The functions listed above the line for nsBlockFrame::DoReflowInlineFrames are its +callers. The numbers to the left of these function names are the numbers of +times these functions were in the call stack as callers of +nsBlockFrame::DoReflowInlineFrames. In our example, we were called 545 times by +nsBlockFrame::ReflowInlineFrames and 100 times by +nsBlockFrame::ReflowDirtyLines. +

+The functions listed below the line for nsBlockFrame::DoReflowInlineFrames are its +callees. The numbers to the left of the function names are the numbers of +times these functions were in the call stack as callees of +nsBlockFrame::DoReflowInlineFrames and the corresponding percentages. In our example, of the 645 profiler hits under nsBlockFrame::DoReflowInlineFrames, 545 were under nsBlockFrame::ReflowInlineFrame, 83 were under nsBlockFrame::PlaceLine, and so forth.

+ +NOTE: If there are loops of execution or recursion, the numbers will +not add up and percentages can exceed 100%. If a function directly calls +itself "(self)" will be appended to the line, but indirect recursion will +not be marked. + +

Bugs

+The current build of Jprof has only been tested under Ubuntu 8.04 LTS, but +should work under any fairly modern Linux distribution using GCC/GLIBC. +Please update this document with any known compatibilities/incompatibilities. +

+If you get an error:

Inconsistency detected by ld.so: dl-open.c: 260: dl_open_worker: Assertion `_dl_debug_initialize (0, args->nsid)->r_state == RT_CONSISTENT' failed! +

that means you've hit a timing hole in the version of glibc you're +running. See Redhat bug 4578. + + + + diff --git a/tools/jprof/bfd.cpp b/tools/jprof/bfd.cpp new file mode 100644 index 0000000000..6be8fde760 --- /dev/null +++ b/tools/jprof/bfd.cpp @@ -0,0 +1,221 @@ +// vim:ts=8:sw=2:et: +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "leaky.h" + +#ifdef USE_BFD +# include +# include +# include +# include +# include +# include +# include + +static bfd* try_debug_file(const char* filename, unsigned long crc32) { + int fd = open(filename, O_RDONLY); + if (fd < 0) return nullptr; + + unsigned char buf[4 * 1024]; + unsigned long crc = 0; + + while (1) { + ssize_t count = read(fd, buf, sizeof(buf)); + if (count <= 0) break; + + crc = bfd_calc_gnu_debuglink_crc32(crc, buf, count); + } + + close(fd); + + if (crc != crc32) return nullptr; + + bfd* object = bfd_openr(filename, nullptr); + if (!bfd_check_format(object, bfd_object)) { + bfd_close(object); + return nullptr; + } + + return object; +} + +static bfd* find_debug_file(bfd* lib, const char* aFileName) { + // check for a separate debug file with symbols + asection* sect = bfd_get_section_by_name(lib, ".gnu_debuglink"); + + if (!sect) return nullptr; + + bfd_size_type debuglinkSize = bfd_section_size(objfile->obfd, sect); + + char* debuglink = new char[debuglinkSize]; + bfd_get_section_contents(lib, sect, debuglink, 0, debuglinkSize); + + // crc checksum is aligned to 4 bytes, and after the NUL. 
+ int crc_offset = (int(strlen(debuglink)) & ~3) + 4; + unsigned long crc32 = bfd_get_32(lib, debuglink + crc_offset); + + // directory component + char* dirbuf = strdup(aFileName); + const char* dir = dirname(dirbuf); + + static const char debug_subdir[] = ".debug"; + // This is gdb's default global debugging info directory, but gdb can + // be instructed to use a different directory. + static const char global_debug_dir[] = "/usr/lib/debug"; + + char* filename = + new char[strlen(global_debug_dir) + strlen(dir) + crc_offset + 3]; + + // /path/debuglink + sprintf(filename, "%s/%s", dir, debuglink); + bfd* debugFile = try_debug_file(filename, crc32); + if (!debugFile) { + // /path/.debug/debuglink + sprintf(filename, "%s/%s/%s", dir, debug_subdir, debuglink); + debugFile = try_debug_file(filename, crc32); + if (!debugFile) { + // /usr/lib/debug/path/debuglink + sprintf(filename, "%s/%s/%s", global_debug_dir, dir, debuglink); + debugFile = try_debug_file(filename, crc32); + } + } + + delete[] filename; + free(dirbuf); + delete[] debuglink; + + return debugFile; +} + +// Use an indirect array to avoid copying tons of objects +Symbol** leaky::ExtendSymbols(int num) { + long n = numExternalSymbols + num; + + externalSymbols = (Symbol**)realloc(externalSymbols, + (size_t)(sizeof(externalSymbols[0]) * n)); + Symbol* new_array = new Symbol[n]; + for (int i = 0; i < num; i++) { + externalSymbols[i + numExternalSymbols] = &new_array[i]; + } + lastSymbol = externalSymbols + n; + Symbol** sp = externalSymbols + numExternalSymbols; + numExternalSymbols = n; + return sp; +} + +# define NEXT_SYMBOL \ + do { \ + sp++; \ + if (sp >= lastSymbol) { \ + sp = ExtendSymbols(16384); \ + } \ + } while (0) + +void leaky::ReadSymbols(const char* aFileName, u_long aBaseAddress) { + int initialSymbols = usefulSymbols; + if (nullptr == externalSymbols) { + externalSymbols = (Symbol**)calloc(sizeof(Symbol*), 10000); + Symbol* new_array = new Symbol[10000]; + for (int i = 0; i < 10000; i++) { + 
externalSymbols[i] = &new_array[i]; + } + numExternalSymbols = 10000; + } + Symbol** sp = externalSymbols + usefulSymbols; + lastSymbol = externalSymbols + numExternalSymbols; + + // Create a dummy symbol for the library so, if it doesn't have any + // symbols, we show it by library. + (*sp)->Init(aFileName, aBaseAddress); + NEXT_SYMBOL; + + bfd_boolean kDynamic = (bfd_boolean) false; + + static int firstTime = 1; + if (firstTime) { + firstTime = 0; + bfd_init(); + } + + bfd* lib = bfd_openr(aFileName, nullptr); + if (nullptr == lib) { + return; + } + if (!bfd_check_format(lib, bfd_object)) { + bfd_close(lib); + return; + } + + bfd* symbolFile = find_debug_file(lib, aFileName); + + // read mini symbols + PTR minisyms; + unsigned int size; + long symcount = 0; + + if (symbolFile) { + symcount = bfd_read_minisymbols(symbolFile, kDynamic, &minisyms, &size); + if (symcount == 0) { + bfd_close(symbolFile); + } else { + bfd_close(lib); + } + } + if (symcount == 0) { + symcount = bfd_read_minisymbols(lib, kDynamic, &minisyms, &size); + if (symcount == 0) { + // symtab is empty; try dynamic symbols + kDynamic = (bfd_boolean) true; + symcount = bfd_read_minisymbols(lib, kDynamic, &minisyms, &size); + } + symbolFile = lib; + } + + asymbol* store; + store = bfd_make_empty_symbol(symbolFile); + + // Scan symbols + size_t demangle_buffer_size = 128; + char* demangle_buffer = (char*)malloc(demangle_buffer_size); + bfd_byte* from = (bfd_byte*)minisyms; + bfd_byte* fromend = from + symcount * size; + for (; from < fromend; from += size) { + asymbol* sym; + sym = + bfd_minisymbol_to_symbol(symbolFile, kDynamic, (const PTR)from, store); + + symbol_info syminfo; + bfd_get_symbol_info(symbolFile, sym, &syminfo); + + // if ((syminfo.type == 'T') || (syminfo.type == 't')) { + const char* nm = bfd_asymbol_name(sym); + if (nm && nm[0]) { + char* dnm = nullptr; + if (strncmp("__thunk", nm, 7)) { + dnm = + abi::__cxa_demangle(nm, demangle_buffer, &demangle_buffer_size, 0); + if (dnm) { + 
demangle_buffer = dnm; + } + } + (*sp)->Init(dnm ? dnm : nm, syminfo.value + aBaseAddress); + NEXT_SYMBOL; + } + // } + } + + free(demangle_buffer); + demangle_buffer = nullptr; + + bfd_close(symbolFile); + + int interesting = sp - externalSymbols; + if (!quiet) { + printf("%s provided %d symbols\n", aFileName, interesting - initialSymbols); + } + usefulSymbols = interesting; +} + +#endif /* USE_BFD */ diff --git a/tools/jprof/coff.cpp b/tools/jprof/coff.cpp new file mode 100644 index 0000000000..0efa83960c --- /dev/null +++ b/tools/jprof/coff.cpp @@ -0,0 +1,93 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "leaky.h" + +#ifdef USE_COFF + +# define LANGUAGE_C +# include +# include +# include +# include +# include +# include +# include + +# ifdef IRIX4 +extern "C" { +extern char* demangle(char const* in); +}; +# else +# include +# endif + +static char* Demangle(char* rawName) { +# ifdef IRIX4 + return strdup(demangle(rawName)); +# else + char namebuf[4000]; + demangle(rawName, namebuf); + return strdup(namebuf); +# endif +} + +void leaky::readSymbols(const char* fileName) { + LDFILE* ldptr; + + ldptr = ldopen(fileName, nullptr); + if (!ldptr) { + fprintf(stderr, "%s: unable to open \"%s\"\n", applicationName, fileName); + exit(-1); + } + if (PSYMTAB(ldptr) == 0) { + fprintf(stderr, "%s: \"%s\": has no symbol table\n", applicationName, + fileName); + exit(-1); + } + + long isymMax = SYMHEADER(ldptr).isymMax; + long iextMax = SYMHEADER(ldptr).iextMax; + long iMax = isymMax + iextMax; + + long alloced = 10000; + Symbol* syms = (Symbol*)malloc(sizeof(Symbol) * 10000); + Symbol* sp = syms; + Symbol* last = syms + alloced; + SYMR symr; + + for (long isym = 0; isym < iMax; isym++) { + if (ldtbread(ldptr, isym, &symr) != SUCCESS) { + fprintf(stderr, "%s: can't read symbol #%d\n", applicationName, 
isym); + exit(-1); + } + if (isym < isymMax) { + if ((symr.st == stStaticProc) || + ((symr.st == stProc) && + ((symr.sc == scText) || (symr.sc == scAbs))) || + ((symr.st == stBlock) && (symr.sc == scText))) { + // Text symbol. Set name field to point to the symbol name + sp->name = Demangle(ldgetname(ldptr, &symr)); + sp->address = symr.value; + sp++; + if (sp >= last) { + long n = alloced + 10000; + syms = (Symbol*)realloc(syms, (size_t)(sizeof(Symbol) * n)); + last = syms + n; + sp = syms + alloced; + alloced = n; + } + } + } + } + + int interesting = sp - syms; + if (!quiet) { + printf("Total of %d symbols\n", interesting); + } + usefulSymbols = interesting; + externalSymbols = syms; +} + +#endif /* USE_COFF */ diff --git a/tools/jprof/elf.cpp b/tools/jprof/elf.cpp new file mode 100644 index 0000000000..c2e00f60da --- /dev/null +++ b/tools/jprof/elf.cpp @@ -0,0 +1,128 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "leaky.h" + +#ifdef USE_ELF + +# include "leaky.h" +# include +# include +# include +# include +# include +# include + +void leaky::readSymbols(const char* fileName) { + int fd = ::open(fileName, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s: unable to open \"%s\"\n", applicationName, fileName); + exit(-1); + } + + elf_version(EV_CURRENT); + Elf* elf = elf_begin(fd, ELF_C_READ, 0); + if (!elf) { + fprintf(stderr, "%s: \"%s\": has no symbol table\n", applicationName, + fileName); + exit(-1); + } + + long alloced = 10000; + Symbol* syms = (Symbol*)malloc(sizeof(Symbol) * 10000); + Symbol* sp = syms; + Symbol* last = syms + alloced; + + // Get each of the relevant sections and add them to the list of + // symbols. 
+ Elf32_Ehdr* ehdr = elf32_getehdr(elf); + if (!ehdr) { + fprintf(stderr, "%s: elf library lossage\n", applicationName); + exit(-1); + } +# if 0 + Elf32_Half ndx = ehdr->e_shstrndx; +# endif + + Elf_Scn* scn = 0; + int strtabndx = -1; + for (int i = 1; (scn = elf_nextscn(elf, scn)) != 0; i++) { + Elf32_Shdr* shdr = elf32_getshdr(scn); +# if 0 + char *name = elf_strptr(elf, ndx, (size_t) shdr->sh_name); + printf("Section %s (%d 0x%x)\n", name ? name : "(null)", + shdr->sh_type, shdr->sh_type); +# endif + if (shdr->sh_type == SHT_STRTAB) { + /* We assume here that string tables preceed symbol tables... */ + strtabndx = i; + continue; + } +# if 0 + if (shdr->sh_type == SHT_DYNAMIC) { + /* Dynamic */ + Elf_Data *data = elf_getdata(scn, 0); + if (!data || !data->d_size) { + printf("No data..."); + continue; + } + + Elf32_Dyn *dyn = (Elf32_Dyn*) data->d_buf; + Elf32_Dyn *lastdyn = + (Elf32_Dyn*) ((char*) data->d_buf + data->d_size); + for (; dyn < lastdyn; dyn++) { + printf("tag=%d value=0x%x\n", dyn->d_tag, dyn->d_un.d_val); + } + } else +# endif + if ((shdr->sh_type == SHT_SYMTAB) || (shdr->sh_type == SHT_DYNSYM)) { + /* Symbol table */ + Elf_Data* data = elf_getdata(scn, 0); + if (!data || !data->d_size) { + printf("No data..."); + continue; + } + + /* In theory we now have the symbols... */ + Elf32_Sym* esym = (Elf32_Sym*)data->d_buf; + Elf32_Sym* lastsym = (Elf32_Sym*)((char*)data->d_buf + data->d_size); + for (; esym < lastsym; esym++) { +# if 0 + char *nm = elf_strptr(elf, strtabndx, (size_t)esym->st_name); + printf("%20s 0x%08x %02x %02x\n", + nm, esym->st_value, ELF32_ST_BIND(esym->st_info), + ELF32_ST_TYPE(esym->st_info)); +# endif + if ((esym->st_value == 0) || + (ELF32_ST_BIND(esym->st_info) == STB_WEAK) || + (ELF32_ST_BIND(esym->st_info) == STB_NUM) || + (ELF32_ST_TYPE(esym->st_info) != STT_FUNC)) { + continue; + } +# if 1 + char* nm = elf_strptr(elf, strtabndx, (size_t)esym->st_name); +# endif + sp->name = nm ? 
strdup(nm) : "(no name)"; + sp->address = esym->st_value; + sp++; + if (sp >= last) { + long n = alloced + 10000; + syms = (Symbol*)realloc(syms, (size_t)(sizeof(Symbol) * n)); + last = syms + n; + sp = syms + alloced; + alloced = n; + } + } + } + } + + int interesting = sp - syms; + if (!quiet) { + printf("Total of %d symbols\n", interesting); + } + usefulSymbols = interesting; + externalSymbols = syms; +} + +#endif /* USE_ELF */ diff --git a/tools/jprof/intcnt.cpp b/tools/jprof/intcnt.cpp new file mode 100644 index 0000000000..a87b6ccf74 --- /dev/null +++ b/tools/jprof/intcnt.cpp @@ -0,0 +1,69 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "intcnt.h" + +IntCount::IntCount() : numInts(0), iPair(nullptr) {} +IntCount::~IntCount() { delete[] iPair; } +int IntCount::getSize() { return numInts; } +int IntCount::getCount(int pos) { return iPair[pos].cnt; } +int IntCount::getIndex(int pos) { return iPair[pos].idx; } + +void IntCount::clear() { + delete[] iPair; + iPair = new IntPair[0]; + numInts = 0; +} + +int IntCount::countAdd(int index, int increment) { + if (numInts) { + // Do a binary search to find the element + int divPoint = 0; + + if (index > iPair[numInts - 1].idx) { + divPoint = numInts; + } else if (index < iPair[0].idx) { + divPoint = 0; + } else { + int low = 0, high = numInts - 1; + int mid = (low + high) / 2; + while (1) { + mid = (low + high) / 2; + + if (index < iPair[mid].idx) { + high = mid; + } else if (index > iPair[mid].idx) { + if (mid < numInts - 1 && index < iPair[mid + 1].idx) { + divPoint = mid + 1; + break; + } else { + low = mid + 1; + } + } else if (index == iPair[mid].idx) { + return iPair[mid].cnt += increment; + } + } + } + + int i; + IntPair* tpair = new IntPair[numInts + 1]; + for (i = 0; i < divPoint; i++) { + tpair[i] = iPair[i]; + } + for (i = divPoint; i < 
numInts; i++) { + tpair[i + 1] = iPair[i]; + } + ++numInts; + delete[] iPair; + iPair = tpair; + iPair[divPoint].idx = index; + iPair[divPoint].cnt = increment; + return increment; + } else { + iPair = new IntPair[1]; + numInts = 1; + iPair[0].idx = index; + return iPair[0].cnt = increment; + } +} diff --git a/tools/jprof/intcnt.h b/tools/jprof/intcnt.h new file mode 100644 index 0000000000..2cf9ec1ff1 --- /dev/null +++ b/tools/jprof/intcnt.h @@ -0,0 +1,39 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef INTCNT_H +#define INTCNT_H + +class IntCount { + public: + IntCount(); + ~IntCount(); + void clear(); + int countAdd(int index, int increment = 1); + int countGet(int index); + int getSize(); + int getCount(int pos); + int getIndex(int pos); + + IntCount(const IntCount& old) { + numInts = old.numInts; + if (numInts > 0) { + iPair = new IntPair[numInts]; + for (int i = 0; i < numInts; i++) { + iPair[i] = old.iPair[i]; + } + } else { + iPair = nullptr; + } + } + + private: + int numInts; + struct IntPair { + int idx; + int cnt; + }* iPair; +}; + +#endif diff --git a/tools/jprof/jprofsig b/tools/jprof/jprofsig new file mode 100755 index 0000000000..02226fc4b6 --- /dev/null +++ b/tools/jprof/jprofsig @@ -0,0 +1,46 @@ +#!/bin/sh +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# +# Find Mozilla PID and send it a signal, to be used +# with the jprof tool. +# + +jpsignal_usage() { + echo "Usage: jprofsig [start|stop]" + exit 1 +} + +if [ $# != 1 ]; then + echo "Wrong number of arguments." 
+ jpsignal_usage +fi + +jpsignal_arg="$1" + +# Find & print mozilla PID +tmpmoz=`ps aux | grep mozilla-bin | head -1 | awk '{ print $2 }'` +echo "Mozilla PID = $tmpmoz" + +# See how we were called. +case "$jpsignal_arg" in + start) + if [ "$JP_REALTIME" = 1 ]; then + kill -ALRM $tmpmoz + else + # Normal, non-realtime mode. + kill -PROF $tmpmoz + fi + ;; + stop) + kill -USR1 $tmpmoz + ;; + *) + jpsignal_usage + exit 1 +esac + +exit 0 diff --git a/tools/jprof/leaky.cpp b/tools/jprof/leaky.cpp new file mode 100644 index 0000000000..91e9d8aa82 --- /dev/null +++ b/tools/jprof/leaky.cpp @@ -0,0 +1,827 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "leaky.h" +#include "intcnt.h" + +#include +#include +#include +#include +#include +#include +#ifndef NTO +# include +#endif +#include +#include +#include + +#ifdef NTO +# include +#endif + +#ifndef FALSE +# define FALSE 0 +#endif +#ifndef TRUE +# define TRUE 1 +#endif + +static const u_int DefaultBuckets = 10007; // arbitrary, but prime +static const u_int MaxBuckets = 1000003; // arbitrary, but prime + +//---------------------------------------------------------------------- + +int main(int argc, char** argv) { + leaky* l = new leaky; + + l->initialize(argc, argv); + l->outputfd = stdout; + + for (int i = 0; i < l->numLogFiles; i++) { + if (l->output_dir || l->numLogFiles > 1) { + char name[2048]; // XXX fix + if (l->output_dir) + snprintf(name, sizeof(name), "%s/%s.html", l->output_dir, + argv[l->logFileIndex + i]); + else + snprintf(name, sizeof(name), "%s.html", argv[l->logFileIndex + i]); + + fprintf(stderr, "opening %s\n", name); + l->outputfd = fopen(name, "w"); + // if an error we won't process the file + } + if (l->outputfd) { // paranoia + 
l->open(argv[l->logFileIndex + i]); + + if (l->outputfd != stderr) { + fclose(l->outputfd); + l->outputfd = nullptr; + } + } + } + + return 0; +} + +char* htmlify(const char* in) { + const char* p = in; + char *out, *q; + int n = 0; + size_t newlen; + + // Count the number of '<' and '>' in the input. + while ((p = strpbrk(p, "<>"))) { + ++n; + ++p; + } + + // Knowing the number of '<' and '>', we can calculate the space + // needed for the output string. + newlen = strlen(in) + n * 3 + 1; + out = new char[newlen]; + + // Copy the input to the output, with substitutions. + p = in; + q = out; + do { + if (*p == '<') { + strcpy(q, "<"); + q += 4; + } else if (*p == '>') { + strcpy(q, ">"); + q += 4; + } else { + *q++ = *p; + } + p++; + } while (*p); + *q = '\0'; + + return out; +} + +leaky::leaky() { + applicationName = nullptr; + progFile = nullptr; + + quiet = true; + showAddress = false; + showThreads = false; + stackDepth = 100000; + onlyThread = 0; + cleo = false; + + mappedLogFile = -1; + firstLogEntry = lastLogEntry = 0; + + sfd = -1; + externalSymbols = 0; + usefulSymbols = 0; + numExternalSymbols = 0; + lowestSymbolAddr = 0; + highestSymbolAddr = 0; + + loadMap = nullptr; + + collect_last = false; + collect_start = -1; + collect_end = -1; +} + +leaky::~leaky() {} + +void leaky::usageError() { + fprintf(stderr, + "Usage: %s [-v] [-t] [-e exclude] [-i include] [-s stackdepth] " + "[--last] [--all] [--start n [--end m]] [--cleo] [--output-dir dir] " + "prog log [log2 ...]\n", + (char*)applicationName); + fprintf( + stderr, + "\t-v: verbose\n" + "\t-t | --threads: split threads\n" + "\t--only-thread n: only profile thread N\n" + "\t-i include-id: stack must include specified id\n" + "\t-e exclude-id: stack must NOT include specified id\n" + "\t-s stackdepth: Limit depth looked at from captured stack frames\n" + "\t--last: only profile the last capture section\n" + "\t--start n [--end m]: profile n to m (or end) capture sections\n" + "\t--cleo: format output for 
'cleopatra' display\n" + "\t--output-dir dir: write output files to dir\n" + "\tIf there's one log, output goes to stdout unless --output-dir is set\n" + "\tIf there are more than one log, output files will be named with .html " + "added\n"); + exit(-1); +} + +static struct option longopts[] = { + {"threads", 0, nullptr, 't'}, {"only-thread", 1, nullptr, 'T'}, + {"last", 0, nullptr, 'l'}, {"start", 1, nullptr, 'x'}, + {"end", 1, nullptr, 'n'}, {"cleo", 0, nullptr, 'c'}, + {"output-dir", 1, nullptr, 'd'}, {nullptr, 0, nullptr, 0}, +}; + +void leaky::initialize(int argc, char** argv) { + applicationName = argv[0]; + applicationName = strrchr(applicationName, '/'); + if (!applicationName) { + applicationName = argv[0]; + } else { + applicationName++; + } + + int arg; + int errflg = 0; + int longindex = 0; + + onlyThread = 0; + output_dir = nullptr; + cleo = false; + + // XXX tons of cruft here left over from tracemalloc + // XXX The -- options shouldn't need short versions, or they should be + // documented + while (((arg = getopt_long(argc, argv, "adEe:gh:i:r:Rs:tT:qvx:ln:", longopts, + &longindex)) != -1)) { + switch (arg) { + case '?': + default: + fprintf(stderr, "error: unknown option %c\n", optopt); + errflg++; + break; + case 'a': + break; + case 'A': // not implemented + showAddress = true; + break; + case 'c': + cleo = true; + break; + case 'd': + output_dir = optarg; // reference to an argv pointer + break; + case 'R': + break; + case 'e': + exclusions.add(optarg); + break; + case 'g': + break; + case 'r': // not implemented + roots.add(optarg); + if (!includes.IsEmpty()) { + errflg++; + } + break; + case 'i': + includes.add(optarg); + if (!roots.IsEmpty()) { + errflg++; + } + break; + case 'h': + break; + case 's': + stackDepth = atoi(optarg); + if (stackDepth < 2) { + stackDepth = 2; + } + break; + case 'x': + // --start + collect_start = atoi(optarg); + break; + case 'n': + // --end + collect_end = atoi(optarg); + break; + case 'l': + // --last + 
collect_last = true; + break; + case 'q': + break; + case 'v': + quiet = !quiet; + break; + case 't': + showThreads = true; + break; + case 'T': + showThreads = true; + onlyThread = atoi(optarg); + break; + } + } + if (errflg || ((argc - optind) < 2)) { + usageError(); + } + progFile = argv[optind++]; + logFileIndex = optind; + numLogFiles = argc - optind; + if (!quiet) fprintf(stderr, "numlogfiles = %d\n", numLogFiles); +} + +static void* mapFile(int fd, u_int flags, off_t* sz) { + struct stat sb; + if (fstat(fd, &sb) < 0) { + perror("fstat"); + exit(-1); + } + void* base = mmap(0, (int)sb.st_size, flags, MAP_PRIVATE, fd, 0); + if (!base) { + perror("mmap"); + exit(-1); + } + *sz = sb.st_size; + return base; +} + +void leaky::LoadMap() { + malloc_map_entry mme; + char name[1000]; + + if (!loadMap) { + // all files use the same map + int fd = ::open(M_MAPFILE, O_RDONLY); + if (fd < 0) { + perror("open: " M_MAPFILE); + exit(-1); + } + for (;;) { + int nb = read(fd, &mme, sizeof(mme)); + if (nb != sizeof(mme)) break; + nb = read(fd, name, mme.nameLen); + if (nb != (int)mme.nameLen) break; + name[mme.nameLen] = 0; + if (!quiet) { + fprintf(stderr, "%s @ %lx\n", name, mme.address); + } + + LoadMapEntry* lme = new LoadMapEntry; + lme->address = mme.address; + lme->name = strdup(name); + lme->next = loadMap; + loadMap = lme; + } + close(fd); + } +} + +void leaky::open(char* logFile) { + int threadArray[100]; // should auto-expand + int last_thread = -1; + int numThreads = 0; + int section = -1; + bool collecting = false; + + LoadMap(); + + setupSymbols(progFile); + + // open up the log file + if (mappedLogFile) ::close(mappedLogFile); + + mappedLogFile = ::open(logFile, O_RDONLY); + if (mappedLogFile < 0) { + perror("open"); + exit(-1); + } + off_t size; + firstLogEntry = (malloc_log_entry*)mapFile(mappedLogFile, PROT_READ, &size); + lastLogEntry = (malloc_log_entry*)((char*)firstLogEntry + size); + + if (!collect_last || collect_start < 0) { + collecting = true; + } + + 
// First, restrict it to the capture sections specified (all, last, start/end) + // This loop walks through all the call stacks we recorded + for (malloc_log_entry* lep = firstLogEntry; lep < lastLogEntry; + lep = reinterpret_cast(&lep->pcs[lep->numpcs])) { + if (lep->flags & JP_FIRST_AFTER_PAUSE) { + section++; + if (collect_last) { + firstLogEntry = lep; + numThreads = 0; + collecting = true; + } + if (collect_start == section) { + collecting = true; + firstLogEntry = lep; + } + if (collect_end == section) { + collecting = false; + lastLogEntry = lep; + } + if (!quiet) + fprintf(stderr, "New section %d: first=%p, last=%p, collecting=%d\n", + section, (void*)firstLogEntry, (void*)lastLogEntry, collecting); + } + + // Capture thread info at the same time + + // Find all the threads captured + + // pthread/linux docs say the signal can be delivered to any thread in + // the process. In practice, it appears in Linux that it's always + // delivered to the thread that called setitimer(), and each thread can + // have a separate itimer. There's a support library for gprof that + // overlays pthread_create() to set timers in any threads you spawn. + if (showThreads && collecting) { + if (lep->thread != last_thread) { + int i; + for (i = 0; i < numThreads; i++) { + if (lep->thread == threadArray[i]) break; + } + if (i == numThreads && + i < (int)(sizeof(threadArray) / sizeof(threadArray[0]))) { + threadArray[i] = lep->thread; + numThreads++; + if (!quiet) fprintf(stderr, "new thread %d\n", lep->thread); + } + } + } + } + if (!quiet) + fprintf(stderr, + "Done collecting: sections %d: first=%p, last=%p, numThreads=%d\n", + section, (void*)firstLogEntry, (void*)lastLogEntry, numThreads); + + if (!cleo) { + fprintf(outputfd, + "Jprof Profile Report\n"); + fprintf(outputfd, "

Jprof Profile Report

\n"); + } + + if (showThreads) { + fprintf(stderr, "Num threads %d\n", numThreads); + + if (!cleo) { + fprintf(outputfd, "
Threads:

\n");
+      for (int i = 0; i < numThreads; i++) {
+        fprintf(outputfd, "   %d  ", threadArray[i],
+                threadArray[i]);
+        if ((i + 1) % 10 == 0) fprintf(outputfd, "
\n"); + } + fprintf(outputfd, "
"); + } + + for (int i = 0; i < numThreads; i++) { + if (!onlyThread || onlyThread == threadArray[i]) analyze(threadArray[i]); + } + } else { + analyze(0); + } + + if (!cleo) fprintf(outputfd, "\n"); +} + +//---------------------------------------------------------------------- + +static int symbolOrder(void const* a, void const* b) { + Symbol const** ap = (Symbol const**)a; + Symbol const** bp = (Symbol const**)b; + return (*ap)->address == (*bp)->address + ? 0 + : ((*ap)->address > (*bp)->address ? 1 : -1); +} + +void leaky::ReadSharedLibrarySymbols() { + LoadMapEntry* lme = loadMap; + while (nullptr != lme) { + ReadSymbols(lme->name, lme->address); + lme = lme->next; + } +} + +void leaky::setupSymbols(const char* fileName) { + if (usefulSymbols == 0) { + // only read once! + + // Read in symbols from the program + ReadSymbols(fileName, 0); + + // Read in symbols from the .so's + ReadSharedLibrarySymbols(); + + if (!quiet) { + fprintf(stderr, "A total of %d symbols were loaded\n", usefulSymbols); + } + + // Now sort them + qsort(externalSymbols, usefulSymbols, sizeof(Symbol*), symbolOrder); + lowestSymbolAddr = externalSymbols[0]->address; + highestSymbolAddr = externalSymbols[usefulSymbols - 1]->address; + } +} + +// Binary search the table, looking for a symbol that covers this +// address. 
+int leaky::findSymbolIndex(u_long addr) { + u_int base = 0; + u_int limit = usefulSymbols - 1; + Symbol** end = &externalSymbols[limit]; + while (base <= limit) { + u_int midPoint = (base + limit) >> 1; + Symbol** sp = &externalSymbols[midPoint]; + if (addr < (*sp)->address) { + if (midPoint == 0) { + return -1; + } + limit = midPoint - 1; + } else { + if (sp + 1 < end) { + if (addr < (*(sp + 1))->address) { + return midPoint; + } + } else { + return midPoint; + } + base = midPoint + 1; + } + } + return -1; +} + +Symbol* leaky::findSymbol(u_long addr) { + int idx = findSymbolIndex(addr); + + if (idx < 0) { + return nullptr; + } else { + return externalSymbols[idx]; + } +} + +//---------------------------------------------------------------------- + +bool leaky::excluded(malloc_log_entry* lep) { + if (exclusions.IsEmpty()) { + return false; + } + + char** pcp = &lep->pcs[0]; + u_int n = lep->numpcs; + for (u_int i = 0; i < n; i++, pcp++) { + Symbol* sp = findSymbol((u_long)*pcp); + if (sp && exclusions.contains(sp->name)) { + return true; + } + } + return false; +} + +bool leaky::included(malloc_log_entry* lep) { + if (includes.IsEmpty()) { + return true; + } + + char** pcp = &lep->pcs[0]; + u_int n = lep->numpcs; + for (u_int i = 0; i < n; i++, pcp++) { + Symbol* sp = findSymbol((u_long)*pcp); + if (sp && includes.contains(sp->name)) { + return true; + } + } + return false; +} + +//---------------------------------------------------------------------- + +void leaky::displayStackTrace(FILE* out, malloc_log_entry* lep) { + char** pcp = &lep->pcs[0]; + u_int n = (lep->numpcs < stackDepth) ? 
lep->numpcs : stackDepth; + for (u_int i = 0; i < n; i++, pcp++) { + u_long addr = (u_long)*pcp; + Symbol* sp = findSymbol(addr); + if (sp) { + fputs(sp->name, out); + if (showAddress) { + fprintf(out, "[%p]", (char*)addr); + } + } else { + fprintf(out, "<%p>", (char*)addr); + } + fputc(' ', out); + } + fputc('\n', out); +} + +void leaky::dumpEntryToLog(malloc_log_entry* lep) { + printf("%ld\t", lep->delTime); + printf(" --> "); + displayStackTrace(outputfd, lep); +} + +void leaky::generateReportHTML(FILE* fp, int* countArray, int count, + int thread) { + fprintf(fp, "
"); + if (showThreads) { + fprintf(fp, "
Thread: %d

", thread, + thread); + } + fprintf( + fp, + "flat | hierarchical", + thread, thread); + fprintf(fp, "

\n"); + + int totalTimerHits = count; + int* rankingTable = new int[usefulSymbols]; + + for (int cnt = usefulSymbols; --cnt >= 0; rankingTable[cnt] = cnt) + ; + + // Drat. I would use ::qsort() but I would need a global variable and my + // intro-pascal professor threatened to flunk anyone who used globals. + // She damaged me for life :-) (That was 1986. See how much influence + // she had. I don't remember her name but I always feel guilty about globals) + + // Shell Sort. 581130733 is the max 31 bit value of h = 3h+1 + int mx, i, h; + for (mx = usefulSymbols / 9, h = 581130733; h > 0; h /= 3) { + if (h < mx) { + for (i = h - 1; i < usefulSymbols; i++) { + int j, tmp = rankingTable[i], val = countArray[tmp]; + for (j = i; (j >= h) && (countArray[rankingTable[j - h]] < val); + j -= h) { + rankingTable[j] = rankingTable[j - h]; + } + rankingTable[j] = tmp; + } + } + } + + // Ok, We are sorted now. Let's go through the table until we get to + // functions that were never called. Right now we don't do much inside + // this loop. Later we can get callers and callees into it like gprof + // does + fprintf(fp, + "

Hierarchical Profile


\n", + thread); + fprintf(fp, "
\n");
+  fprintf(fp, "%6s %6s         %4s      %s\n", "index", "Count", "Hits",
+          "Function Name");
+
+  for (i = 0; i < usefulSymbols && countArray[rankingTable[i]] > 0; i++) {
+    Symbol** sp = &externalSymbols[rankingTable[i]];
+
+    (*sp)->cntP.printReport(fp, this, rankingTable[i], totalTimerHits);
+
+    char* symname = htmlify((*sp)->name);
+    fprintf(fp,
+            "%6d %6d (%3.1f%%)%s %8d (%3.1f%%)%s %s\n",
+            rankingTable[i], (*sp)->timerHit,
+            ((*sp)->timerHit * 1000 / totalTimerHits) / 10.0,
+            ((*sp)->timerHit * 1000 / totalTimerHits) / 10.0 >= 10.0 ? "" : " ",
+            rankingTable[i], countArray[rankingTable[i]],
+            (countArray[rankingTable[i]] * 1000 / totalTimerHits) / 10.0,
+            (countArray[rankingTable[i]] * 1000 / totalTimerHits) / 10.0 >= 10.0
+                ? ""
+                : " ",
+            symname);
+    delete[] symname;
+
+    (*sp)->cntC.printReport(fp, this, rankingTable[i], totalTimerHits);
+
+    fprintf(fp, "
\n"); + } + fprintf(fp, "
\n"); + + // OK, Now we want to print the flat profile. To do this we resort on + // the hit count. + + // Cut-N-Paste Shell sort from above. The Ranking Table has already been + // populated, so we do not have to reinitialize it. + for (mx = usefulSymbols / 9, h = 581130733; h > 0; h /= 3) { + if (h < mx) { + for (i = h - 1; i < usefulSymbols; i++) { + int j, tmp = rankingTable[i], val = externalSymbols[tmp]->timerHit; + for (j = i; + (j >= h) && (externalSymbols[rankingTable[j - h]]->timerHit < val); + j -= h) { + rankingTable[j] = rankingTable[j - h]; + } + rankingTable[j] = tmp; + } + } + } + + // Pre-count up total counter hits, to get a percentage. + // I wanted the total before walking the list, if this + // double-pass over externalSymbols gets slow we can + // do single-pass and print this out after the loop finishes. + totalTimerHits = 0; + for (i = 0; + i < usefulSymbols && externalSymbols[rankingTable[i]]->timerHit > 0; + i++) { + Symbol** sp = &externalSymbols[rankingTable[i]]; + totalTimerHits += (*sp)->timerHit; + } + if (totalTimerHits == 0) totalTimerHits = 1; + + if (totalTimerHits != count) + fprintf(stderr, "Hit count mismatch: count=%d; totalTimerHits=%d", count, + totalTimerHits); + + fprintf(fp, + "

Flat Profile


\n", + thread); + fprintf(fp, "
\n");
+
+  fprintf(fp, "Total hit count: %d\n", totalTimerHits);
+  fprintf(fp, "Count %%Total  Function Name\n");
+  // Now loop for as long as we have timer hits
+  for (i = 0;
+       i < usefulSymbols && externalSymbols[rankingTable[i]]->timerHit > 0;
+       i++) {
+    Symbol** sp = &externalSymbols[rankingTable[i]];
+
+    char* symname = htmlify((*sp)->name);
+    fprintf(fp, "%3d   %-2.1f     %s\n", rankingTable[i],
+            (*sp)->timerHit,
+            ((float)(*sp)->timerHit / (float)totalTimerHits) * 100.0, symname);
+    delete[] symname;
+  }
+}
+
+// Accumulate the profile for one thread and emit it.
+//
+// Walks every log entry in [firstLogEntry, lastLogEntry), skipping entries
+// recorded on another thread (thread == 0 accepts every thread) and entries
+// rejected by the exclude/include filters.  In cleo mode each accepted stack
+// is written to outputfd as text lines; otherwise per-symbol hit counts and
+// caller/callee links are accumulated and passed to generateReportHTML().
+void leaky::analyze(int thread) {
+  int* countArray = new int[usefulSymbols];
+  int* flagArray = new int[usefulSymbols];
+
+  // Zero our function call counter
+  memset(countArray, 0, sizeof(countArray[0]) * usefulSymbols);
+
+  // reset hit counts
+  for (int i = 0; i < usefulSymbols; i++) {
+    externalSymbols[i]->timerHit = 0;
+    externalSymbols[i]->regClear();
+  }
+
+  // The flag array is used to prevent counting symbols multiple times
+  // if functions are called recursively.  In order to keep from having
+  // to zero it on each pass through the loop, we mark it with the value
+  // of stacks on each trip through the loop.  This means we can determine
+  // if we have seen this symbol for this stack trace w/o having to reset
+  // from the prior stacktrace.
+  memset(flagArray, -1, sizeof(flagArray[0]) * usefulSymbols);
+
+  if (cleo) fprintf(outputfd, "m-Start\n");
+
+  // This loop walks through all the call stacks we recorded
+  // --last, --start and --end can restrict it, as can excludes/includes
+  stacks = 0;
+  for (malloc_log_entry* lep = firstLogEntry; lep < lastLogEntry;
+       lep = reinterpret_cast<malloc_log_entry*>(&lep->pcs[lep->numpcs])) {
+    if ((thread != 0 && lep->thread != thread) || excluded(lep) ||
+        !included(lep)) {
+      continue;
+    }
+
+    ++stacks;  // How many stack frames did we collect
+
+    // Frames are addressed by index so that no pointer before pcs[0] is ever
+    // formed when an entry carries zero frames.
+    u_int n = (lep->numpcs < stackDepth) ? lep->numpcs : stackDepth;
+    int idx = -1, parentIdx = -1;  // Init idx in case n==0
+    if (cleo) {
+      // This loop walks through every symbol in the call stack.  By walking it
+      // backwards we know who called the function when we get there.
+      char type = 's';
+      for (int i = static_cast<int>(n) - 1; i >= 0; --i) {
+        idx = findSymbolIndex(reinterpret_cast<u_long>(lep->pcs[i]));
+
+        if (idx >= 0) {
+          // Skip over bogus __restore_rt frames that realtime profiling
+          // can introduce.
+          if (i > 0 && !strcmp(externalSymbols[idx]->name, "__restore_rt")) {
+            --i;
+            idx = findSymbolIndex(reinterpret_cast<u_long>(lep->pcs[i]));
+            if (idx < 0) {
+              continue;
+            }
+          }
+          char* symname = htmlify(externalSymbols[idx]->name);
+          fprintf(outputfd, "%c-%s\n", type, symname);
+          delete[] symname;
+        }
+        // else can't find symbol - ignore
+        type = 'c';
+      }
+    } else {
+      // This loop walks through every symbol in the call stack.  By walking it
+      // backwards we know who called the function when we get there.
+      for (int i = static_cast<int>(n) - 1; i >= 0; --i) {
+        idx = findSymbolIndex(reinterpret_cast<u_long>(lep->pcs[i]));
+
+        if (idx >= 0) {
+          // Skip over bogus __restore_rt frames that realtime profiling
+          // can introduce.
+          if (i > 0 && !strcmp(externalSymbols[idx]->name, "__restore_rt")) {
+            --i;
+            idx = findSymbolIndex(reinterpret_cast<u_long>(lep->pcs[i]));
+            if (idx < 0) {
+              continue;
+            }
+          }
+
+          // If we have not seen this symbol in this stack yet, count it and
+          // mark it as seen.
+          if (flagArray[idx] != stacks) {
+            flagArray[idx] = stacks;
+            ++countArray[idx];
+          }
+
+          // We know who we are and we know who our parent is.  Count this
+          if (parentIdx >= 0) {
+            externalSymbols[parentIdx]->regChild(idx);
+            externalSymbols[idx]->regParrent(parentIdx);
+          }
+          // inside if() so an unknown in the middle of a stack won't break
+          // the link!
+          parentIdx = idx;
+        }
+      }
+
+      // idx should be the function that we were in when we received the signal.
+      if (idx >= 0) {
+        ++externalSymbols[idx]->timerHit;
+      }
+    }
+  }
+  if (!cleo) generateReportHTML(outputfd, countArray, stacks, thread);
+
+  // Both scratch arrays are private to this call; release them so that the
+  // per-thread calls made by open() do not leak.
+  delete[] flagArray;
+  delete[] countArray;
+}
+
+// Print one report line per counted function, highest count first.
+//
+// fp     - stream that receives the lines
+// lk     - profiler instance used to turn a symbol index into a name
+// parent - index of the symbol this list belongs to; an entry with the same
+//          index is tagged " (self)"
+// total  - denominator for the percentage column
+//
+// The entries are not sorted in place.  Each pass prints every entry whose
+// count equals tmax and remembers the largest count below tmax for the next
+// pass, so the output is in descending count order.
+void FunctionCount::printReport(FILE* fp, leaky* lk, int parent, int total) {
+  // Six conversions for the six arguments passed below: the leading %d
+  // consumes the symbol index as the link target.
+  const char* fmt =
+      "                      <A href=\"#%d\">%8d (%3.1f%%)%s %s</A>%s\n";
+
+  // Start from the largest positive int so the first pass only discovers the
+  // real maximum.
+  int nmax, tmax = ((~0U) >> 1);
+
+  do {
+    nmax = 0;
+    for (int j = getSize(); --j >= 0;) {
+      int cnt = getCount(j);
+      if (cnt == tmax) {
+        int idx = getIndex(j);
+        double percent = cnt * 100.0 / total;
+        char* symname = htmlify(lk->indexToName(idx));
+        // The extra space keeps single-digit percentages aligned.
+        fprintf(fp, fmt, idx, cnt, percent, percent >= 10.0 ? "" : " ",
+                symname, parent == idx ? " (self)" : "");
+        delete[] symname;
+      } else if (cnt < tmax && cnt > nmax) {
+        nmax = cnt;
+      }
+    }
+  } while ((tmax = nmax) > 0);
+}
diff --git a/tools/jprof/leaky.h b/tools/jprof/leaky.h
new file mode 100644
index 0000000000..6c9beb7b42
--- /dev/null
+++ b/tools/jprof/leaky.h
@@ -0,0 +1,124 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef __leaky_h_
+#define __leaky_h_
+
+#include "config.h"
+#include 
+#include 
+#include 
+#include "libmalloc.h"
+#include "strset.h"
+#include "intcnt.h"
+
+typedef unsigned int u_int;
+
+struct Symbol;
+struct leaky;
+
+// An IntCount that can print itself as a list of report lines.
+class FunctionCount : public IntCount {
+ public:
+  // Writes one line per counted index to fp; lk resolves indices to names,
+  // parent marks the "(self)" entry and total is the percentage denominator.
+  void printReport(FILE* fp, leaky* lk, int parent, int total);
+};
+
+// One symbol known to the profiler together with the counters that
+// leaky::analyze() accumulates for it.
+struct Symbol {
+  char* name;       // symbol name, assigned by Init()
+  u_long address;   // symbol address, assigned by Init()
+  int timerHit;     // incremented by analyze() for the innermost resolved frame
+  FunctionCount cntP, cntC;  // counts keyed by parent index / child index
+
+  // Record one call from this symbol to the symbol with index id.
+  int regChild(int id) { return cntC.countAdd(id, 1); }
+  // Record one call into this symbol from the symbol with index id.
+  int regParrent(int id) { return cntP.countAdd(id, 1); }
+  // Drop all recorded parent and child counts.
+  void regClear() {
+    cntC.clear();
+    cntP.clear();
+  }
+
+  // Give every field a defined value; previously name and address held
+  // indeterminate values until Init() was called.
+  Symbol() : name(nullptr), address(0), timerHit(0) {}
+
+  // Set the name and address.  A non-null aName is copied with strdup(); a
+  // null aName is stored as an empty string literal.
+  void Init(const char* aName, u_long aAddress) {
+    name = aName ? strdup(aName) : (char*)"";
+    address = aAddress;
+  }
+};
+
+// One node of the singly linked list of shared libraries; leaky::LoadMap()
+// pushes a node for every record it reads from the map file.
+struct LoadMapEntry {
+  char* name;      // name of .so
+  u_long address;  // base address where it was mapped in
+  LoadMapEntry* next;  // next node in the list
+};
+
+// State and operations of the jprof report generator: command-line options,
+// the mapped log file, the loaded symbol table and the report routines.
+struct leaky {
+  leaky();
+  ~leaky();
+
+  void initialize(int argc, char** argv);
+  // Loads the library map and symbols, maps the log file named by arg and
+  // runs analyze() on it.
+  void open(char* arg);
+
+  char* applicationName;
+  int logFileIndex;
+  int numLogFiles;
+  char* progFile;
+  FILE* outputfd;
+
+  bool quiet;
+  bool showAddress;
+  bool showThreads;
+  bool cleo;
+  u_int stackDepth;  // upper bound on the frames examined per stack
+  int onlyThread;
+  char* output_dir;
+
+  int mappedLogFile;
+  // Half-open range [firstLogEntry, lastLogEntry) of entries to analyze.
+  malloc_log_entry* firstLogEntry;
+  malloc_log_entry* lastLogEntry;
+
+  int stacks;  // number of stacks accepted by the most recent analyze()
+
+  int sfd;
+  Symbol** externalSymbols;  // symbol table; sorted by address in setupSymbols()
+  Symbol** lastSymbol;
+  int usefulSymbols;  // number of entries in externalSymbols that are in use
+  int numExternalSymbols;
+  StrSet exclusions;  // a stack containing any of these names is skipped
+  u_long lowestSymbolAddr;
+  u_long highestSymbolAddr;
+
+  LoadMapEntry* loadMap;  // list built by LoadMap()
+
+  bool collect_last;
+  int collect_start;
+  int collect_end;
+
+  StrSet roots;
+  StrSet includes;  // when non-empty, a stack must contain one of these names
+
+  void usageError();
+
+  void LoadMap();
+
+  // Analyze the stacks of one thread; 0 selects every thread.
+  void analyze(int thread);
+
+  void dumpEntryToLog(malloc_log_entry* lep);
+
+  void insertAddress(u_long address, malloc_log_entry* lep);
+  void removeAddress(u_long address, malloc_log_entry* lep);
+
+  void displayStackTrace(FILE* out, malloc_log_entry* lep);
+
+  Symbol** ExtendSymbols(int num);
+  void ReadSymbols(const char* fileName, u_long aBaseAddress);
+  void ReadSharedLibrarySymbols();
+  void setupSymbols(const char* fileName);
+  // Returns the symbol covering address, or nullptr when none does.
+  Symbol* findSymbol(u_long address);
+  bool excluded(malloc_log_entry* lep);
+  bool included(malloc_log_entry* lep);
+  // Name of the symbol at position idx in externalSymbols; idx is not checked.
+  const char* indexToName(int idx) { return externalSymbols[idx]->name; }
+
+ private:
+  void generateReportHTML(FILE* fp, int* countArray, int count, int thread);
+  // Returns the index in externalSymbols of the symbol covering address, or
+  // -1 when none does.
+  int findSymbolIndex(u_long address);
+};
+
+#endif /* __leaky_h_ */
diff --git a/tools/jprof/moz.build b/tools/jprof/moz.build
new file mode 100644
index 0000000000..2aa4a8c294
--- /dev/null
+++ b/tools/jprof/moz.build
@@ -0,0 +1,28 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+DIRS += ["stub"]
+
+Program("jprof")
+
+SOURCES += [
+    "bfd.cpp",
+    "coff.cpp",
+    "elf.cpp",
+    "intcnt.cpp",
+    "leaky.cpp",
+    "strset.cpp",
+]
+
+LOCAL_INCLUDES += [
+    "stub",
+]
+
+OS_LIBS += [
+    "dl",
+    "bfd",
+    "iberty",
+]
diff --git a/tools/jprof/split-profile.py b/tools/jprof/split-profile.py
new file mode 100755
index 0000000000..c280c130c2
--- /dev/null
+++ b/tools/jprof/split-profile.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This program splits up a jprof profile into multiple files based on a
+# list of functions in a text file.  First, a complete profile is
+# generated.  Then, for each line in the text file, a profile is
+# generated containing only stacks that go through that line, and also
+# excluding all stacks in earlier lines in the text file.  This means
+# that the text file, from start to end, is splitting out pieces of the
+# profile in their own file.  Finally, a final profile containing the
+# remainder is produced.
+
+# The program takes four arguments:
+#   (1) The path to jprof.
+#   (2) The path to the text file describing the splits.  The output
+#       will be placed in the same directory as this file.
+#   (3) The program that was profiled.
+#   (4) The jprof-log file generated by the profile, to be split up.
+# (Really, all arguments from (3) and later are passed through to
+# jprof, so additional arguments could be provided if you want to pass
+# additional arguments to jprof.)
+
+# In slightly more detail:
+#
+# This script uses jprof's includes (-i) and excludes (-e) options to
+# split profiles into segments.  It takes as input a single text file,
+# and from that text file creates a series of jprof profiles in the
+# directory the text file is in.
+#
+# The input file format looks like the following:
+#
+#   poll g_main_poll
+#   GetRuleCascade CSSRuleProcessor::GetRuleCascade(nsPresContext *, nsAtom *)
+#   RuleProcessorData RuleProcessorData::RuleProcessorData
+#        (nsPresContext *, nsIContent *, nsRuleWalker *, nsCompatibility *)
+#
+#
+# From this input file, the script will construct a profile called
+# jprof-0.html that contains the whole profile, a profile called
+# jprof-1-poll.html that includes only stacks with g_main_poll, a
+# profile called jprof-2-GetRuleCascade.html that includes only stacks
+# that have GetRuleCascade and do not have g_main_poll, a profile called
+# jprof-3-RuleProcessorData.html that includes only stacks that have the
+# RuleProcessorData constructor and do not have GetRuleCascade or
+# g_main_poll, and a profile called jprof-4.html that includes only
+# stacks that do not have any of the three functions in them.
+#
+# This means that all of the segments of the profile, except
+# jprof-0.html, are mutually exclusive.  Thus clever ordering of the
+# functions in the input file can lead to a logical splitting of the
+# profile into segments.
+
+import os.path
+import subprocess
+import sys
+
+# At least four arguments are required after the script name.
+if len(sys.argv) < 5:
+    sys.stderr.write("Expected arguments:    \n")
+    sys.exit(1)
+
+jprof = sys.argv[1]
+splitfile = sys.argv[2]
+# Everything from the third argument onwards is handed to jprof unchanged.
+passthrough = sys.argv[3:]
+
+# Fail early when the jprof binary or the split file does not exist.
+for f in [jprof, splitfile]:
+    if not os.path.isfile(f):
+        sys.stderr.write("could not find file: {0}\n".format(f))
+        sys.exit(1)
+
+
+def read_splits(splitfile):
+    """
+    Read splitfile (each line of which contains a name, a space, and
+    then a function name to split on), and return a list of pairs
+    representing exactly that.  (Note that the name cannot contain
+    spaces, but the function name can, and often does.)
+    """
+
+    def line_to_split(line):
+        line = line.strip("\r\n")
+        idx = line.index(" ")
+        return (line[0:idx], line[idx + 1 :])
+
+    io = open(splitfile, "r")
+    result = [line_to_split(line) for line in io]
+    io.close()
+    return result
+
+
+splits = read_splits(splitfile)
+
+
+def generate_profile(options, destfile):
+    """
+    Run jprof to generate one split of the profile.
+    """
+    args = [jprof] + options + passthrough
+    print("Generating {}".format(destfile))
+    destio = open(destfile, "w")
+    # jprof expects the "jprof-map" file to be in its current working directory
+    cwd = None
+    for option in passthrough:
+        if option.find("jprof-log"):
+            cwd = os.path.dirname(option)
+    if cwd is None:
+        raise Exception("no jprof-log option given")
+    process = subprocess.Popen(args, stdout=destio, cwd=cwd)
+    process.wait()
+    destio.close()
+    if process.returncode != 0:
+        os.remove(destfile)
+        sys.stderr.write(
+            "Error {0} from command:\n  {1}\n".format(
+                process.returncode, " ".join(args)
+            )
+        )
+        sys.exit(process.returncode)
+
+
+def output_filename(number, splitname):
+    """
+    Return the filename (absolute path) we should use to output the
+    profile segment with the given number and splitname.  Splitname
+    should be None for the complete profile and the remainder.
+    """
+
+    def pad_count(i):
+        result = str(i)
+        # 0-pad to the same length
+        result = "0" * (len(str(len(splits) + 1)) - len(result)) + result
+        return result
+
+    name = pad_count(number)
+    if splitname is not None:
+        name += "-" + splitname
+
+    return os.path.join(os.path.dirname(splitfile), "jprof-{0}.html".format(name))
+
+
+# generate the complete profile
+generate_profile([], output_filename(0, None))
+
+# generate the listed splits
+count = 1
+excludes = []
+for (splitname, splitfunction) in splits:
+    generate_profile(
+        excludes + ["-i" + splitfunction], output_filename(count, splitname)
+    )
+    excludes += ["-e" + splitfunction]
+    count = count + 1
+
+# generate the remainder after the splits
+generate_profile(excludes, output_filename(count, None))
diff --git a/tools/jprof/strset.cpp b/tools/jprof/strset.cpp
new file mode 100644
index 0000000000..514b8c03e0
--- /dev/null
+++ b/tools/jprof/strset.cpp
@@ -0,0 +1,37 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "strset.h"
+#include 
+#include 
+
+// Construct an empty set: no storage allocated and no strings recorded.
+StrSet::StrSet() : strings(0), numstrings(0) {}
+
+// Append a private copy of s to the set.  Duplicates are not filtered.
+// NOTE(review): the results of realloc() and strdup() are not checked.
+void StrSet::add(const char* s) {
+  // realloc() on a null pointer behaves like malloc(), so a single call
+  // covers both the first insertion and every later growth step.
+  const int grown = numstrings + 1;
+  strings = (char**)realloc(strings, grown * sizeof(char*));
+  strings[grown - 1] = strdup(s);
+  numstrings = grown;
+}
+
+// Return 1 when a string equal to s has been added to the set, 0 otherwise.
+int StrSet::contains(const char* s) {
+  for (int pos = 0; pos < numstrings; pos++) {
+    const char* candidate = strings[pos];
+    // Compare the first character before paying for a full strcmp().
+    if (candidate[0] == s[0] && strcmp(candidate, s) == 0) {
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/tools/jprof/strset.h b/tools/jprof/strset.h
new file mode 100644
index 0000000000..681ed22a25
--- /dev/null
+++ b/tools/jprof/strset.h
@@ -0,0 +1,19 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef __strset_h_
+#define __strset_h_
+
+// A growable collection of strings with linear-time membership lookup.
+struct StrSet {
+  StrSet();
+
+  // Appends a copy of string.
+  void add(const char* string);
+  // Returns 1 when an equal string has been added, 0 otherwise.
+  int contains(const char* string);
+  // True while nothing has been added.
+  bool IsEmpty() const { return 0 == numstrings; }
+
+  char** strings;  // array of numstrings string pointers
+  int numstrings;  // number of entries in strings
+};
+
+#endif /* __strset_h_ */
diff --git a/tools/jprof/stub/Makefile.in b/tools/jprof/stub/Makefile.in
new file mode 100644
index 0000000000..8e6b6b8f8d
--- /dev/null
+++ b/tools/jprof/stub/Makefile.in
@@ -0,0 +1,8 @@
+#! gmake
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# override optimization
+MOZ_OPTIMIZE_FLAGS = -fno-omit-frame-pointer
diff --git a/tools/jprof/stub/config.h b/tools/jprof/stub/config.h
new file mode 100644
index 0000000000..6e5789452a
--- /dev/null
+++ b/tools/jprof/stub/config.h
@@ -0,0 +1,18 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef config_h___
+#define config_h___
+
+#define MAX_STACK_CRAWL 500
+#define M_LOGFILE "jprof-log"
+#define M_MAPFILE "jprof-map"
+
+#if defined(linux) || defined(NTO)
+#  define USE_BFD
+#  undef NEED_WRAPPERS
+
+#endif /* linux */
+
+#endif /* config_h___ */
diff --git a/tools/jprof/stub/jprof.h b/tools/jprof/stub/jprof.h
new file mode 100644
index 0000000000..c118760750
--- /dev/null
+++ b/tools/jprof/stub/jprof.h
@@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef jprof_h___
+#define jprof_h___
+#include "nscore.h"
+
+#ifdef _IMPL_JPPROF_API
+#  define JPROF_API(type) NS_EXPORT_(type)
+#else
+#  define JPROF_API(type) NS_IMPORT_(type)
+#endif
+
+JPROF_API(void) setupProfilingStuff(void);
+
+#endif /* jprof_h___ */
diff --git a/tools/jprof/stub/libmalloc.cpp b/tools/jprof/stub/libmalloc.cpp
new file mode 100644
index 0000000000..3003543a93
--- /dev/null
+++ b/tools/jprof/stub/libmalloc.cpp
@@ -0,0 +1,740 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+// vim:cindent:sw=4:et:ts=8:
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// The linux glibc hides part of sigaction if _POSIX_SOURCE is defined
+#if defined(linux)
+#  undef _POSIX_SOURCE
+#  undef _SVID_SOURCE
+#  ifndef _GNU_SOURCE
+#    define _GNU_SOURCE
+#  endif
+#endif
+
+#include 
+#if defined(linux)
+#  include 
+#  include 
+#endif
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libmalloc.h"
+#include "jprof.h"
+#include 
+#include 
+#include 
+
+#ifdef NTO
+#  include 
+extern r_debug _r_debug;
+#else
+#  include 
+#endif
+
+#define USE_GLIBC_BACKTRACE 1
+// To debug, use #define JPROF_STATIC
+#define JPROF_STATIC static
+
+static int gLogFD = -1;
+static pthread_t main_thread;
+
+static bool gIsChild = false;
+static int gFilenamePID;
+
+static void startSignalCounter(unsigned long millisec);
+static int enableRTCSignals(bool enable);
+
+//----------------------------------------------------------------------
+// replace use of atexit()
+
+static void DumpAddressMap();
+
+// Helper whose only job is to call DumpAddressMap() from its destructor.
+struct JprofShutdown {
+  JprofShutdown() {}
+  ~JprofShutdown() { DumpAddressMap(); }
+};
+
+// Arrange for DumpAddressMap() to run at shutdown.
+static void RegisterJprofShutdown() {
+  // This instantiates the dummy class above, and will trigger the class
+  // destructor when libxul is unloaded. This is equivalent to atexit(),
+  // but gracefully handles dlclose().
+  static JprofShutdown t;
+}
+
+#if defined(i386) || defined(_i386) || defined(__x86_64__)
+// Record the current call stack into me->pcs and store the number of
+// recorded frames in me->numpcs.  top_instr_ptr becomes the first recorded
+// frame.  stack_top is not used by either implementation below.
+JPROF_STATIC void CrawlStack(malloc_log_entry* me, void* stack_top,
+                             void* top_instr_ptr) {
+#  if USE_GLIBC_BACKTRACE
+  // This probably works on more than x86!  But we need a way to get the
+  // top instruction pointer, which is kind of arch-specific
+  void* array[500];
+  int cnt, i;
+  u_long numpcs = 0;
+
+  // This is from glibc.  A more generic version might use
+  // libunwind and/or CaptureStackBackTrace() on Windows
+  cnt = backtrace(&array[0], sizeof(array) / sizeof(array[0]));
+
+  // StackHook->JprofLog->CrawlStack
+  // Then we have sigaction, which replaced top_instr_ptr
+  // Slots 0-2 are therefore dropped and slot 3 is overwritten with the
+  // interrupted instruction pointer before copying.
+  array[3] = top_instr_ptr;
+  for (i = 3; i < cnt; i++) {
+    me->pcs[numpcs++] = (char*)array[i];
+  }
+  me->numpcs = numpcs;
+
+#  else
+  // original code - this breaks on many platforms
+  void** bp;
+#    if defined(__i386)
+  __asm__("movl %%ebp, %0" : "=g"(bp));
+#    elif defined(__x86_64__)
+  __asm__("movq %%rbp, %0" : "=g"(bp));
+#    else
+  // It would be nice if this worked uniformly, but at least on i386 and
+  // x86_64, it stopped working with gcc 4.1, because it points to the
+  // end of the saved registers instead of the start.
+  bp = __builtin_frame_address(0);
+#    endif
+  u_long numpcs = 0;
+  bool tracing = false;
+
+  me->pcs[numpcs++] = (char*)top_instr_ptr;
+
+  // Follow the chain of saved frame pointers: *bp is the caller's frame
+  // pointer and the next word is the return address.
+  while (numpcs < MAX_STACK_CRAWL) {
+    void** nextbp = (void**)*bp++;
+    void* pc = *bp;
+    // Stop when the next frame pointer is below the current position.
+    if (nextbp < bp) {
+      break;
+    }
+    if (tracing) {
+      // Skip the signal handling.
+      me->pcs[numpcs++] = (char*)pc;
+    } else if (pc == top_instr_ptr) {
+      // Frames are recorded only after the interrupted pc has been seen.
+      tracing = true;
+    }
+    bp = nextbp;
+  }
+  me->numpcs = numpcs;
+#  endif
+}
+#endif
+
+//----------------------------------------------------------------------
+
+static int rtcHz;
+static int rtcFD = -1;
+static bool circular = false;
+
+#if defined(linux) || defined(NTO)
+// Stop the profiling timer and write the list of loaded objects to the map
+// file.  For every link_map entry with a non-empty name, a malloc_map_entry
+// record (name length and l_addr) is written, followed by the name bytes
+// without a terminator.  The file is M_MAPFILE, with "-<gFilenamePID>"
+// appended when gIsChild is set.  Nothing is written if the file cannot be
+// opened, and the results of write() are not checked.
+static void DumpAddressMap() {
+  // Turn off the timer so we don't get interrupts during shutdown
+#  if defined(linux)
+  if (rtcHz) {
+    enableRTCSignals(false);
+  } else
+#  endif
+  {
+    startSignalCounter(0);
+  }
+
+  char filename[2048];
+  if (gIsChild)
+    snprintf(filename, sizeof(filename), "%s-%d", M_MAPFILE, gFilenamePID);
+  else
+    snprintf(filename, sizeof(filename), "%s", M_MAPFILE);
+
+  int mfd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666);
+  if (mfd >= 0) {
+    malloc_map_entry mme;
+    link_map* map = _r_debug.r_map;
+    while (nullptr != map) {
+      // Entries without a name are skipped.
+      if (map->l_name && *map->l_name) {
+        mme.nameLen = strlen(map->l_name);
+        mme.address = map->l_addr;
+        write(mfd, &mme, sizeof(mme));
+        write(mfd, map->l_name, mme.nameLen);
+#  if 0
+	write(1, map->l_name, mme.nameLen);
+	write(1, "\n", 1);
+#  endif
+      }
+      map = map->l_next;
+    }
+    close(mfd);
+  }
+}
+#endif
+
+// True initially and after EndProfilingHook() runs.  JprofLog() tags the next
+// sample with JP_FIRST_AFTER_PAUSE and then clears the flag.
+static bool was_paused = true;
+
+JPROF_STATIC void JprofBufferDump();
+JPROF_STATIC void JprofBufferClear();
+
+// Handler installed for SIGUSR2: empties the circular buffer when circular
+// mode is active and does nothing otherwise.  |signum| is unused.
+static void ClearProfilingHook(int signum) {
+  if (!circular) {
+    return;
+  }
+  JprofBufferClear();
+  puts("Jprof: cleared circular buffer.");
+}
+
+// Handler installed for SIGUSR1: flushes the circular buffer to the log (when
+// circular mode is active), writes the address map and marks profiling as
+// paused.  |signum| is unused.
+static void EndProfilingHook(int signum) {
+  if (circular) {
+    JprofBufferDump();
+  }
+
+  DumpAddressMap();
+
+  was_paused = true;
+  puts("Jprof: profiling paused.");
+}
+
+//----------------------------------------------------------------------
+// proper usage would be a template, including the function to find the
+// size of an entry, or include a size header explicitly to each entry.
+// Locking helpers for DumbCircularBuffer.  They expand inside its member
+// functions and refer to the member named `mutex`.  Each linux expansion
+// already ends in ';', so `DUMB_LOCK();` leaves a harmless empty statement.
+#if defined(linux)
+#  define DUMB_LOCK() pthread_mutex_lock(&mutex);
+#  define DUMB_UNLOCK() pthread_mutex_unlock(&mutex);
+#else
+// NOTE(review): FIXME() is not defined in the visible part of this file, so
+// this presumably makes non-linux builds fail on purpose -- confirm.
+#  define DUMB_LOCK() FIXME()
+#  define DUMB_UNLOCK() FIXME()
+#endif
+
+// Fixed-capacity byte ring buffer holding variable-length records.
+//
+// Valid data occupies [head, tail), wrapping at the end of |buffer|.  One
+// byte is always kept free so that head == tail means "empty", never "full".
+// |used| counts records, not bytes.  Record sizes are not stored by the
+// buffer itself: whoever calls drop() must know the size of the oldest record.
+//
+// On linux the mutating operations are serialized with a recursive mutex
+// (insert() calls space_available() while already holding it).
+class DumbCircularBuffer {
+ public:
+  // Allocates |init_buffer_size| bytes of storage.  If the allocation fails
+  // the buffer degrades to zero capacity and rejects every insert().
+  DumbCircularBuffer(size_t init_buffer_size) {
+    used = 0;
+    buffer_size = init_buffer_size;
+    buffer = (unsigned char*)malloc(buffer_size);
+    if (!buffer) {
+      buffer_size = 0;
+    }
+    head = tail = buffer;
+
+#if defined(linux)
+    pthread_mutexattr_t mAttr;
+    // An attribute object must be initialized before it may be modified.
+    pthread_mutexattr_init(&mAttr);
+    pthread_mutexattr_settype(&mAttr, PTHREAD_MUTEX_RECURSIVE_NP);
+    pthread_mutex_init(&mutex, &mAttr);
+    pthread_mutexattr_destroy(&mAttr);
+#endif
+  }
+  ~DumbCircularBuffer() {
+    free(buffer);
+#if defined(linux)
+    pthread_mutex_destroy(&mutex);
+#endif
+  }
+
+  // Discards all records.
+  void clear() {
+    DUMB_LOCK();
+    head = tail;
+    used = 0;
+    DUMB_UNLOCK();
+  }
+
+  // Unlocked snapshot of whether the buffer currently holds no data.
+  bool empty() { return head == tail; }
+
+  // Returns how many more bytes can be inserted right now.
+  size_t space_available() {
+    size_t result;
+    DUMB_LOCK();
+    if (buffer_size == 0) {
+      result = 0;
+    } else if (tail >= head) {
+      // Includes the empty case (head == tail): everything except the one
+      // reserved byte is free.  Treating it as the wrapped case would
+      // underflow to a huge value and let oversized records through.
+      result = buffer_size - (tail - head) - 1;
+    } else {
+      result = head - tail - 1;
+    }
+    DUMB_UNLOCK();
+    return result;
+  }
+
+  // Removes the oldest record, which the caller asserts is |size| bytes long.
+  void drop(size_t size) {
+    // assumes correctness!
+    DUMB_LOCK();
+    head += size;
+    if (head >= &buffer[buffer_size]) head -= buffer_size;
+    used--;
+    DUMB_UNLOCK();
+  }
+
+  // Appends one record of |size| bytes copied from |data|.  Returns false,
+  // leaving the buffer untouched, when there is not enough free space.
+  bool insert(void* data, size_t size) {
+    // can fail if not enough space in the entire buffer
+    DUMB_LOCK();
+    if (buffer_size == 0 || space_available() < size) {
+      // Release the lock on the failure path too; otherwise the recursion
+      // count never drops back to zero and other threads block forever.
+      DUMB_UNLOCK();
+      return false;
+    }
+
+    size_t max_without_wrap = &buffer[buffer_size] - tail;
+    size_t initial = size > max_without_wrap ? max_without_wrap : size;
+#if DEBUG_CIRCULAR
+    fprintf(stderr, "insert(%d): max_without_wrap %d, size %d, initial %d\n",
+            used, max_without_wrap, size, initial);
+#endif
+    memcpy(tail, data, initial);
+    tail += initial;
+    data = ((char*)data) + initial;
+    size -= initial;
+    if (size != 0) {
+#if DEBUG_CIRCULAR
+      fprintf(stderr, "wrapping by %d bytes\n", size);
+#endif
+      memcpy(buffer, data, size);
+      tail = &(((unsigned char*)buffer)[size]);
+    } else if (tail == &buffer[buffer_size]) {
+      // The record ended exactly at the end of the storage.  Wrap now so
+      // tail always stays inside the storage; a tail left one past the end
+      // makes space_available() underflow once head wraps to the start.
+      tail = buffer;
+    }
+
+    used++;
+    DUMB_UNLOCK();
+
+    return true;
+  }
+
+  // for external access to the buffer (saving)
+  void lock() { DUMB_LOCK(); }
+
+  void unlock() { DUMB_UNLOCK(); }
+
+  // XXX These really shouldn't be public...
+  unsigned char* head;    // oldest byte of valid data
+  unsigned char* tail;    // where the next inserted byte goes
+  unsigned int used;      // number of records currently stored
+  unsigned char* buffer;  // start of the storage
+  size_t buffer_size;     // size of the storage in bytes
+
+ private:
+  pthread_mutex_t mutex;
+};
+
+class DumbCircularBuffer* JprofBuffer;
+
+// Creates the global circular sample buffer with |size| bytes of storage and
+// publishes it through JprofBuffer.
+JPROF_STATIC void JprofBufferInit(size_t size) {
+  DumbCircularBuffer* ring = new DumbCircularBuffer(size);
+  JprofBuffer = ring;
+}
+
+// Announces the request on stderr, then discards everything held in the
+// global circular buffer.
+JPROF_STATIC void JprofBufferClear() {
+  fputs("Told to clear JPROF circular buffer\n", stderr);
+  JprofBuffer->clear();
+}
+
+// Returns the number of meaningful bytes in |me|: the fields that precede
+// pcs, plus the first me->numpcs slots of the pcs array.
+JPROF_STATIC size_t JprofEntrySizeof(malloc_log_entry* me) {
+  const size_t headerBytes = offsetof(malloc_log_entry, pcs);
+  const size_t stackBytes = me->numpcs * sizeof(char*);
+  return headerBytes + stackBytes;
+}
+
+// Appends |me| to the circular buffer, dropping the oldest records until it
+// fits.  If it still does not fit once nothing is left to drop, the sample is
+// silently discarded.
+//
+// NOTE(review): the size of the oldest record is read through
+// JprofBuffer->head as if the record header were contiguous in memory; this
+// looks like it assumes the header never straddles the wrap point -- confirm.
+JPROF_STATIC void JprofBufferAppend(malloc_log_entry* me) {
+  const size_t needed = JprofEntrySizeof(me);
+
+  for (;;) {
+    while (JprofBuffer->space_available() < needed && JprofBuffer->used > 0) {
+      malloc_log_entry* oldest = (malloc_log_entry*)JprofBuffer->head;
+#if DEBUG_CIRCULAR
+      fprintf(
+          stderr,
+          "dropping entry: %d in use, %d free, need %d, size_to_free = %d\n",
+          JprofBuffer->used, JprofBuffer->space_available(), needed,
+          JprofEntrySizeof(oldest));
+#endif
+      JprofBuffer->drop(JprofEntrySizeof(oldest));
+    }
+
+    if (JprofBuffer->space_available() < needed) {
+      return;  // cannot fit even with nothing left to drop
+    }
+    if (JprofBuffer->insert(me, needed)) {
+      return;
+    }
+    // insert() reported failure although space was available a moment ago;
+    // go around again.
+  }
+}
+
+// Writes the contents of the circular buffer to gLogFD in oldest-to-newest
+// order (two writes when the data wraps around the end of the storage), then
+// empties the buffer.  The buffer lock is held for the whole operation.
+JPROF_STATIC void JprofBufferDump() {
+  DumbCircularBuffer* ring = JprofBuffer;
+
+  ring->lock();
+#if DEBUG_CIRCULAR
+  fprintf(stderr, "dumping JP_CIRCULAR buffer, %d of %d bytes\n",
+          ring->tail > ring->head
+              ? ring->tail - ring->head
+              : ring->buffer_size + ring->tail - ring->head,
+          ring->buffer_size);
+#endif
+  unsigned char* const storageEnd = &(ring->buffer[ring->buffer_size]);
+  if (ring->tail < ring->head) {
+    // Wrapped: the older part runs to the end of the storage, the newer part
+    // starts again at the beginning.
+    write(gLogFD, ring->head, storageEnd - ring->head);
+    write(gLogFD, ring->buffer, ring->tail - ring->buffer);
+  } else {
+    write(gLogFD, ring->head, ring->tail - ring->head);
+  }
+  ring->clear();
+  ring->unlock();
+}
+
+//----------------------------------------------------------------------
+
+// Records one sample: fills a log entry with |aTime|, the calling thread's id
+// and the pause marker, lets CrawlStack() capture the call stack, and then
+// either appends the entry to the circular buffer or writes it to gLogFD.
+JPROF_STATIC void JprofLog(u_long aTime, void* stack_top, void* top_instr_ptr) {
+  // Static is simply to make debugging tolerable
+  static malloc_log_entry me;
+
+  me.delTime = aTime;
+  me.thread = syscall(SYS_gettid);  // gettid();
+
+  // Tag the first sample taken after a pause, and consume the marker.
+  const bool resumed = was_paused;
+  if (resumed) {
+    was_paused = false;
+  }
+  me.flags = resumed ? JP_FIRST_AFTER_PAUSE : 0;
+
+  CrawlStack(&me, stack_top, top_instr_ptr);
+
+#ifndef NTO
+  if (!circular) {
+    write(gLogFD, &me, JprofEntrySizeof(&me));
+  } else {
+    JprofBufferAppend(&me);
+  }
+#else
+  printf("Neutrino is missing the pcs member of malloc_log_entry!! \n");
+#endif
+}
+
+static int realTime;
+
+/* Arms a one-shot timer that expires after |millisec| milliseconds; passing
+ * 0 disarms it.  The reload interval is deliberately left at zero, even
+ * though the timer is capable of repeating: StackHook re-arms it after each
+ * sample, which keeps a new tick from arriving inside the handler.
+ * ITIMER_REAL is used when realTime is set, ITIMER_PROF otherwise.
+ */
+static void startSignalCounter(unsigned long millisec) {
+  struct itimerval oneShot;
+
+  oneShot.it_value.tv_sec = millisec / 1000;
+  oneShot.it_value.tv_usec = (millisec % 1000) * 1000;
+  oneShot.it_interval.tv_sec = 0;
+  oneShot.it_interval.tv_usec = 0;
+
+  const int whichTimer = realTime ? ITIMER_REAL : ITIMER_PROF;
+  setitimer(whichTimer, &oneShot, nullptr);
+}
+
+static long timerMilliSec = 50;
+
+#if defined(linux)
+// Opens /dev/rtc (storing the descriptor in the global rtcFD), installs |sap|
+// as the SIGIO action, sets the periodic interrupt rate to |hz|, turns
+// periodic interrupts on, and makes this process the owner that receives the
+// descriptor's signals.  Asynchronous delivery itself is switched on later by
+// enableRTCSignals().
+//
+// Returns 1 on success.  On failure a diagnostic is printed with perror(),
+// the descriptor is closed again (rtcFD goes back to -1) and 0 is returned.
+static int setupRTCSignals(int hz, struct sigaction* sap) {
+  /* global */ rtcFD = open("/dev/rtc", O_RDONLY);
+  if (rtcFD < 0) {
+    perror("JPROF_RTC setup: open(\"/dev/rtc\", O_RDONLY)");
+    return 0;
+  }
+
+  // Run the configuration steps in order; remember the first one that fails.
+  const char* failure = nullptr;
+  if (sigaction(SIGIO, sap, nullptr) == -1) {
+    failure = "JPROF_RTC setup: sigaction(SIGIO)";
+  } else if (ioctl(rtcFD, RTC_IRQP_SET, hz) == -1) {
+    failure = "JPROF_RTC setup: ioctl(/dev/rtc, RTC_IRQP_SET, $JPROF_RTC_HZ)";
+  } else if (ioctl(rtcFD, RTC_PIE_ON, 0) == -1) {
+    failure = "JPROF_RTC setup: ioctl(/dev/rtc, RTC_PIE_ON)";
+  } else if (fcntl(rtcFD, F_SETSIG, 0) == -1) {
+    failure = "JPROF_RTC setup: fcntl(/dev/rtc, F_SETSIG, 0)";
+  } else if (fcntl(rtcFD, F_SETOWN, getpid()) == -1) {
+    failure = "JPROF_RTC setup: fcntl(/dev/rtc, F_SETOWN, getpid())";
+  }
+
+  if (failure) {
+    // Report first (close() may change errno), then release the descriptor
+    // so a failed setup does not leak it or leave rtcFD looking valid.
+    perror(failure);
+    close(rtcFD);
+    rtcFD = -1;
+    return 0;
+  }
+
+  return 1;
+}
+
+// Turns asynchronous signal delivery for rtcFD on or off by setting or
+// clearing FASYNC in the descriptor's status flags.
+//
+// Returns 1 when the flags were changed.  Returns 0 when the requested state
+// is already recorded, or when an fcntl() call fails (after printing a
+// diagnostic).  Note that the recorded state is updated before the fcntl()
+// calls, so it changes even if they fail.
+static int enableRTCSignals(bool enable) {
+  static bool enabled = false;
+  if (enable == enabled) {
+    return 0;
+  }
+  enabled = enable;
+
+  const int current = fcntl(rtcFD, F_GETFL);
+  if (current < 0) {
+    perror("JPROF_RTC setup: fcntl(/dev/rtc, F_GETFL)");
+    return 0;
+  }
+
+  const int wanted = enable ? (current | FASYNC) : (current & ~FASYNC);
+  if (fcntl(rtcFD, F_SETFL, wanted) != -1) {
+    return 1;
+  }
+
+  perror(enable ? "JPROF_RTC setup: fcntl(/dev/rtc, F_SETFL, flags | FASYNC)"
+                : "JPROF_RTC setup: fcntl(/dev/rtc, F_SETFL, flags & ~FASYNC)");
+  return 0;
+}
+#endif
+
+// Signal handler that records one profile sample.
+//
+// On linux in RTC mode (rtcHz != 0), signals that arrive on any thread other
+// than main_thread are ignored.  In timer mode the sample is stamped with the
+// milliseconds elapsed since the first signal; in RTC mode the stamp stays 0
+// and RTC signal delivery is (re-)enabled instead.  The interrupted stack
+// pointer and instruction pointer are read from |ucontext| and handed to
+// JprofLog().  Finally, in timer mode, the one-shot timer is armed again.
+//
+// |signum| and |info| are unused.
+JPROF_STATIC void StackHook(int signum, siginfo_t* info, void* ucontext) {
+  static struct timeval tFirst;
+  static int first = 1;
+  size_t millisec = 0;
+
+#if defined(linux)
+  if (rtcHz && pthread_self() != main_thread) {
+    // Only collect stack data on the main thread, for now.
+    return;
+  }
+#endif
+
+  // True exactly once: the assignment inside the condition clears |first|.
+  if (first && !(first = 0)) {
+    puts("Jprof: received first signal");
+#if defined(linux)
+    if (rtcHz) {
+      enableRTCSignals(true);
+    } else
+#endif
+    {
+      gettimeofday(&tFirst, 0);
+      millisec = 0;
+    }
+  } else {
+#if defined(linux)
+    if (rtcHz) {
+      enableRTCSignals(true);
+    } else
+#endif
+    {
+      struct timeval tNow;
+      gettimeofday(&tNow, 0);
+      double usec = 1e6 * (tNow.tv_sec - tFirst.tv_sec);
+      usec += (tNow.tv_usec - tFirst.tv_usec);
+      // The cast needs an explicit target type; |millisec| is a size_t.
+      millisec = static_cast<size_t>(usec * 1e-3);
+    }
+  }
+
+  gregset_t& gregs = ((ucontext_t*)ucontext)->uc_mcontext.gregs;
+#ifdef __x86_64__
+  JprofLog(millisec, (void*)gregs[REG_RSP], (void*)gregs[REG_RIP]);
+#else
+  JprofLog(millisec, (void*)gregs[REG_ESP], (void*)gregs[REG_EIP]);
+#endif
+
+  // The timer is one-shot, so it has to be re-armed after every sample.
+  if (!rtcHz) startSignalCounter(timerMilliSec);
+}
+
+// One-time initialization of the profiler, driven by the JPROF_FLAGS
+// environment variable.  Parses the options, opens the log file, installs
+// StackHook for the sampling signals plus the SIGUSR1 (pause/dump) and
+// SIGUSR2 (clear) handlers, and, unless JP_DEFER is given, starts the sample
+// source.  Nothing is opened or installed unless JP_START or JP_DEFER is
+// present.  Any call after the first only prints a diagnostic.
+NS_EXPORT_(void) setupProfilingStuff(void) {
+  static int gFirstTime = 1;
+  char filename[2048];  // XXX fix
+
+  // True exactly once: the assignment inside the condition clears the flag.
+  if (gFirstTime && !(gFirstTime = 0)) {
+    int startTimer = 1;
+    int doNotStart = 1;
+    int firstDelay = 0;
+    int append = O_TRUNC;
+    char* tst = getenv("JPROF_FLAGS");
+
+    /* Options from JPROF_FLAGS environment variable:
+     *   JP_DEFER  -> Wait for a SIGPROF (or SIGALRM, if JP_REALTIME
+     *               is set) from userland before starting
+     *               to generate them internally
+     *   JP_START  -> Install the signal handler
+     *   JP_PERIOD -> Time between profiler ticks, in seconds
+     *               (values below 0.001 are replaced by 1ms)
+     *   JP_FIRST  -> Extra delay before starting, in seconds
+     *   JP_REALTIME -> Take stack traces in intervals of real time
+     *               rather than time used by the process (and the
+     *               system for the process).  This is useful for
+     *               finding time spent by the X server.
+     *   JP_APPEND -> Append to jprof-log rather than overwriting it.
+     *               This is somewhat risky since it depends on the
+     *               address map staying constant across multiple runs.
+     *   JP_FILENAME -> base filename to use when saving logs.  Note that
+     *               this does not affect the mapfile.
+     *   JP_CIRCULAR -> use a circular buffer of size N, write/clear on SIGUSR1
+     *               (SIGUSR2 clears it without writing)
+     *   JP_RTC_HZ -> (Linux only) sample from /dev/rtc at this rate; must
+     *               be a power of two >= 2, otherwise 2048 is used.
+     *               Implies JP_REALTIME.
+     *
+     * JPROF_ISCHILD is set if this is not the first process.
+     */
+
+    circular = false;
+
+    if (tst) {
+      if (strstr(tst, "JP_DEFER")) {
+        doNotStart = 0;
+        startTimer = 0;
+      }
+      if (strstr(tst, "JP_START")) doNotStart = 0;
+      if (strstr(tst, "JP_REALTIME")) realTime = 1;
+      if (strstr(tst, "JP_APPEND")) append = O_APPEND;
+
+      char* delay = strstr(tst, "JP_PERIOD=");
+      if (delay) {
+        double tmp = strtod(delay + strlen("JP_PERIOD="), nullptr);
+        if (tmp >= 1e-3) {
+          // The cast needs an explicit target type; timerMilliSec is a long.
+          timerMilliSec = static_cast<long>(1000 * tmp);
+        } else {
+          fprintf(stderr, "JP_PERIOD of %g less than 0.001 (1ms), using 1ms\n",
+                  tmp);
+          timerMilliSec = 1;
+        }
+      }
+
+      char* circular_op = strstr(tst, "JP_CIRCULAR=");
+      if (circular_op) {
+        size_t size = atol(circular_op + strlen("JP_CIRCULAR="));
+        if (size < 1000) {
+          fprintf(stderr, "JP_CIRCULAR of %lu less than 1000, using 10000\n",
+                  (unsigned long)size);
+          size = 10000;
+        }
+        JprofBufferInit(size);
+        fprintf(stderr, "JP_CIRCULAR buffer of %lu bytes\n",
+                (unsigned long)size);
+        circular = true;
+      }
+
+      char* first = strstr(tst, "JP_FIRST=");
+      if (first) {
+        firstDelay = atol(first + strlen("JP_FIRST="));
+      }
+
+      char* rtc = strstr(tst, "JP_RTC_HZ=");
+      if (rtc) {
+#if defined(linux)
+        rtcHz = atol(rtc + strlen("JP_RTC_HZ="));
+        timerMilliSec = 0; /* This makes JP_FIRST work right. */
+        realTime = 1;      /* It's the _R_TC and all.  ;) */
+
+#  define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+
+        if (!IS_POWER_OF_TWO(rtcHz) || rtcHz < 2) {
+          fprintf(stderr,
+                  "JP_RTC_HZ must be power of two and >= 2, "
+                  "but %d was provided; using default of 2048\n",
+                  rtcHz);
+          rtcHz = 2048;
+        }
+#else
+        fputs(
+            "JP_RTC_HZ found, but RTC profiling only supported on "
+            "Linux!\n",
+            stderr);
+
+#endif
+      }
+      // Everything after "JP_FILENAME=" up to the end of JPROF_FLAGS is
+      // taken as the base name.
+      const char* f = strstr(tst, "JP_FILENAME=");
+      if (f)
+        f = f + strlen("JP_FILENAME=");
+      else
+        f = M_LOGFILE;
+
+      // The first process marks the environment so that its children can
+      // tell they are not the first.
+      char* is_child = getenv("JPROF_ISCHILD");
+      if (!is_child) setenv("JPROF_ISCHILD", "", 0);
+      gIsChild = !!is_child;
+
+      gFilenamePID = syscall(SYS_gettid);  // gettid();
+      if (is_child)
+        snprintf(filename, sizeof(filename), "%s-%d", f, gFilenamePID);
+      else
+        snprintf(filename, sizeof(filename), "%s", f);
+
+      // XXX FIX! inherit current capture state!
+    }
+
+    // |filename| is only initialized when JPROF_FLAGS was set, which is also
+    // the only way doNotStart can have been cleared.
+    if (!doNotStart) {
+      if (gLogFD < 0) {
+        gLogFD = open(filename, O_CREAT | O_WRONLY | append, 0666);
+        if (gLogFD < 0) {
+          fprintf(stderr, "Unable to create " M_LOGFILE);
+          perror(":");
+        } else {
+          struct sigaction action;
+          sigset_t mset;
+
+          // Dump out the address map when we terminate
+          RegisterJprofShutdown();
+
+          main_thread = pthread_self();
+          // fprintf(stderr,"jprof: main_thread = %u\n",
+          //        (unsigned int)main_thread);
+
+          // FIX!  probably should block these against each other
+          // Very unlikely.
+          sigemptyset(&mset);
+          action.sa_handler = nullptr;
+          action.sa_sigaction = StackHook;
+          action.sa_mask = mset;
+          action.sa_flags = SA_RESTART | SA_SIGINFO;
+#if defined(linux)
+          if (rtcHz) {
+            if (!setupRTCSignals(rtcHz, &action)) {
+              fputs(
+                  "jprof: Error initializing RTC, NOT "
+                  "profiling\n",
+                  stderr);
+              return;
+            }
+          }
+
+          if (!rtcHz || firstDelay != 0)
+#endif
+          {
+            if (realTime) {
+              sigaction(SIGALRM, &action, nullptr);
+            }
+          }
+          // enable PROF in all cases to simplify JP_DEFER/pause/restart
+          sigaction(SIGPROF, &action, nullptr);
+
+          // make it so a SIGUSR1 will stop the profiling
+          // Note:  It currently does not close the logfile.
+          // This could be configurable (so that it could
+          // later be reopened).
+
+          struct sigaction stop_action;
+          stop_action.sa_handler = EndProfilingHook;
+          stop_action.sa_mask = mset;
+          stop_action.sa_flags = SA_RESTART;
+          sigaction(SIGUSR1, &stop_action, nullptr);
+
+          // make it so a SIGUSR2 will clear the circular buffer
+
+          stop_action.sa_handler = ClearProfilingHook;
+          stop_action.sa_mask = mset;
+          stop_action.sa_flags = SA_RESTART;
+          sigaction(SIGUSR2, &stop_action, nullptr);
+
+          printf(
+              "Jprof: Initialized signal handler and set "
+              "timer for %lu %s, %d s "
+              "initial delay\n",
+              rtcHz ? rtcHz : timerMilliSec, rtcHz ? "Hz" : "ms", firstDelay);
+
+          if (startTimer) {
+#if defined(linux)
+            /* If we have an initial delay we can just use
+               startSignalCounter to set up a timer to fire the
+               first stackHook after that delay.  When that happens
+               we'll go and switch to RTC profiling. */
+            if (rtcHz && firstDelay == 0) {
+              puts("Jprof: enabled RTC signals");
+              enableRTCSignals(true);
+            } else
+#endif
+            {
+              puts("Jprof: started timer");
+              startSignalCounter(firstDelay * 1000 + timerMilliSec);
+            }
+          }
+        }
+      }
+    }
+  } else {
+    printf("setupProfilingStuff() called multiple times\n");
+  }
+}
diff --git a/tools/jprof/stub/libmalloc.h b/tools/jprof/stub/libmalloc.h
new file mode 100644
index 0000000000..a78b35ade8
--- /dev/null
+++ b/tools/jprof/stub/libmalloc.h
@@ -0,0 +1,45 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef libmalloc_h___
+#define libmalloc_h___
+
+#include <sys/types.h>
+#include <malloc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config.h"
+
+typedef unsigned long u_long;
+
+// For me->flags
+#define JP_FIRST_AFTER_PAUSE 1
+
+// Format of a jprof log entry. This is what's written out to the
+// "jprof-log" file.
+// It's called malloc_log_entry because the history of jprof is that
+// it's a modified version of tracemalloc.
+struct malloc_log_entry {
+  u_long delTime;      // time stamp handed to JprofLog() for this sample
+  u_long numpcs;       // number of valid entries in pcs[]
+  unsigned int flags;  // 0 or JP_FIRST_AFTER_PAUSE
+  int thread;          // id of the thread the sample was taken on
+  char* pcs[MAX_STACK_CRAWL];  // captured program counters; only the first
+                               // numpcs slots are written to the log
+};
+
+// Format of a malloc map entry; after this struct come nameLen bytes of
+// name data (DumpAddressMap writes no terminating NUL after the name).
+struct malloc_map_entry {
+  u_long nameLen;  // length in bytes of the name that follows
+  u_long address;  // base address
+};
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* libmalloc_h___ */
diff --git a/tools/jprof/stub/moz.build b/tools/jprof/stub/moz.build
new file mode 100644
index 0000000000..692c6ea37f
--- /dev/null
+++ b/tools/jprof/stub/moz.build
@@ -0,0 +1,17 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS += [  # headers this directory exports
+    "jprof.h",
+]
+
+SOURCES += [  # sources compiled in this directory
+    "libmalloc.cpp",
+]
+
+SharedLibrary("jprof")  # build the sources above into a shared library "jprof"
+
+DEFINES["_IMPL_JPROF_API"] = True  # preprocessor define set for this directory
-- 
cgit v1.2.3