From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 19:32:43 +0200 Subject: Adding upstream version 1:115.7.0. Signed-off-by: Daniel Baumann --- tools/profiler/core/EHABIStackWalk.cpp | 597 ++ tools/profiler/core/EHABIStackWalk.h | 28 + tools/profiler/core/MicroGeckoProfiler.cpp | 203 + tools/profiler/core/PageInformation.cpp | 44 + tools/profiler/core/PageInformation.h | 68 + tools/profiler/core/PlatformMacros.h | 130 + tools/profiler/core/PowerCounters-linux.cpp | 287 + tools/profiler/core/PowerCounters-mac-amd64.cpp | 419 ++ tools/profiler/core/PowerCounters-mac-arm64.cpp | 47 + tools/profiler/core/PowerCounters-win.cpp | 342 + tools/profiler/core/PowerCounters.h | 52 + .../profiler/core/ProfileAdditionalInformation.cpp | 102 + tools/profiler/core/ProfileBuffer.cpp | 243 + tools/profiler/core/ProfileBuffer.h | 260 + tools/profiler/core/ProfileBufferEntry.cpp | 2321 +++++++ tools/profiler/core/ProfileBufferEntry.h | 532 ++ tools/profiler/core/ProfiledThreadData.cpp | 455 ++ tools/profiler/core/ProfiledThreadData.h | 250 + tools/profiler/core/ProfilerBacktrace.cpp | 101 + tools/profiler/core/ProfilerBacktrace.h | 184 + tools/profiler/core/ProfilerBindings.cpp | 386 ++ tools/profiler/core/ProfilerCodeAddressService.cpp | 75 + tools/profiler/core/ProfilerMarkers.cpp | 32 + tools/profiler/core/ProfilerThreadRegistration.cpp | 198 + .../core/ProfilerThreadRegistrationData.cpp | 303 + tools/profiler/core/ProfilerThreadRegistry.cpp | 40 + tools/profiler/core/ProfilerUtils.cpp | 118 + tools/profiler/core/VTuneProfiler.cpp | 80 + tools/profiler/core/VTuneProfiler.h | 78 + tools/profiler/core/memory_hooks.cpp | 632 ++ tools/profiler/core/memory_hooks.h | 25 + tools/profiler/core/platform-linux-android.cpp | 636 ++ tools/profiler/core/platform-macos.cpp | 297 + tools/profiler/core/platform-win32.cpp | 496 ++ tools/profiler/core/platform.cpp | 7067 ++++++++++++++++++++ tools/profiler/core/platform.h | 381 ++ tools/profiler/core/shared-libraries-linux.cc | 280 + tools/profiler/core/shared-libraries-macos.cc | 211 + tools/profiler/core/shared-libraries-win32.cc | 167 + tools/profiler/core/vtune/ittnotify.h | 4123 ++++++++++++ 40 files changed, 22290 insertions(+) create mode 100644 tools/profiler/core/EHABIStackWalk.cpp create mode 100644 tools/profiler/core/EHABIStackWalk.h create mode 100644 tools/profiler/core/MicroGeckoProfiler.cpp create mode 100644 tools/profiler/core/PageInformation.cpp create mode 100644 tools/profiler/core/PageInformation.h create mode 100644 tools/profiler/core/PlatformMacros.h create mode 100644 tools/profiler/core/PowerCounters-linux.cpp create mode 100644 tools/profiler/core/PowerCounters-mac-amd64.cpp create mode 100644 tools/profiler/core/PowerCounters-mac-arm64.cpp create mode 100644 tools/profiler/core/PowerCounters-win.cpp create mode 100644 tools/profiler/core/PowerCounters.h create mode 100644 tools/profiler/core/ProfileAdditionalInformation.cpp create mode 100644 tools/profiler/core/ProfileBuffer.cpp create mode 100644 tools/profiler/core/ProfileBuffer.h create mode 100644 tools/profiler/core/ProfileBufferEntry.cpp create mode 100644 tools/profiler/core/ProfileBufferEntry.h create mode 100644 tools/profiler/core/ProfiledThreadData.cpp create mode 100644 tools/profiler/core/ProfiledThreadData.h create mode 100644 tools/profiler/core/ProfilerBacktrace.cpp create mode 100644 tools/profiler/core/ProfilerBacktrace.h create mode 100644 tools/profiler/core/ProfilerBindings.cpp create mode 100644 
tools/profiler/core/ProfilerCodeAddressService.cpp create mode 100644 tools/profiler/core/ProfilerMarkers.cpp create mode 100644 tools/profiler/core/ProfilerThreadRegistration.cpp create mode 100644 tools/profiler/core/ProfilerThreadRegistrationData.cpp create mode 100644 tools/profiler/core/ProfilerThreadRegistry.cpp create mode 100644 tools/profiler/core/ProfilerUtils.cpp create mode 100644 tools/profiler/core/VTuneProfiler.cpp create mode 100644 tools/profiler/core/VTuneProfiler.h create mode 100644 tools/profiler/core/memory_hooks.cpp create mode 100644 tools/profiler/core/memory_hooks.h create mode 100644 tools/profiler/core/platform-linux-android.cpp create mode 100644 tools/profiler/core/platform-macos.cpp create mode 100644 tools/profiler/core/platform-win32.cpp create mode 100644 tools/profiler/core/platform.cpp create mode 100644 tools/profiler/core/platform.h create mode 100644 tools/profiler/core/shared-libraries-linux.cc create mode 100644 tools/profiler/core/shared-libraries-macos.cc create mode 100644 tools/profiler/core/shared-libraries-win32.cc create mode 100644 tools/profiler/core/vtune/ittnotify.h (limited to 'tools/profiler/core') diff --git a/tools/profiler/core/EHABIStackWalk.cpp b/tools/profiler/core/EHABIStackWalk.cpp new file mode 100644 index 0000000000..e3099b89ec --- /dev/null +++ b/tools/profiler/core/EHABIStackWalk.cpp @@ -0,0 +1,597 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * This is an implementation of stack unwinding according to a subset + * of the ARM Exception Handling ABI, as described in: + * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0038a/IHI0038A_ehabi.pdf + * + * This handles only the ARM-defined "personality routines" (chapter + * 9), and don't track the value of FP registers, because profiling + * needs only chain of PC/SP values. + * + * Because the exception handling info may not be accurate for all + * possible places where an async signal could occur (e.g., in a + * prologue or epilogue), this bounds-checks all stack accesses. + * + * This file uses "struct" for structures in the exception tables and + * "class" otherwise. We should avoid violating the C++11 + * standard-layout rules in the former. + */ + +#include "EHABIStackWalk.h" + +#include "shared-libraries.h" +#include "platform.h" + +#include "mozilla/Atomics.h" +#include "mozilla/Attributes.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/EndianUtils.h" + +#include +#include +#include +#include +#include + +#ifndef PT_ARM_EXIDX +# define PT_ARM_EXIDX 0x70000001 +#endif + +namespace mozilla { + +struct PRel31 { + uint32_t mBits; + bool topBit() const { return mBits & 0x80000000; } + uint32_t value() const { return mBits & 0x7fffffff; } + int32_t offset() const { return (static_cast(mBits) << 1) >> 1; } + const void* compute() const { + return reinterpret_cast(this) + offset(); + } + + private: + PRel31(const PRel31& copied) = delete; + PRel31() = delete; +}; + +struct EHEntry { + PRel31 startPC; + PRel31 exidx; + + private: + EHEntry(const EHEntry& copied) = delete; + EHEntry() = delete; +}; + +class EHState { + // Note that any core register can be used as a "frame pointer" to + // influence the unwinding process, so this must track all of them. 
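+ // Indexed by ARM core register number (r0..r15); see the R_SP/R_LR/R_PC constants below.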
+ uint32_t mRegs[16]; + + public: + bool unwind(const EHEntry* aEntry, const void* stackBase); + uint32_t& operator[](int i) { return mRegs[i]; } + const uint32_t& operator[](int i) const { return mRegs[i]; } + explicit EHState(const mcontext_t&); +}; + +enum { R_SP = 13, R_LR = 14, R_PC = 15 }; + +class EHTable { + uint32_t mStartPC; + uint32_t mEndPC; + uint32_t mBaseAddress; + const EHEntry* mEntriesBegin; + const EHEntry* mEntriesEnd; + std::string mName; + + public: + EHTable(const void* aELF, size_t aSize, const std::string& aName); + const EHEntry* lookup(uint32_t aPC) const; + bool isValid() const { return mEntriesEnd != mEntriesBegin; } + const std::string& name() const { return mName; } + uint32_t startPC() const { return mStartPC; } + uint32_t endPC() const { return mEndPC; } + uint32_t baseAddress() const { return mBaseAddress; } +}; + +class EHAddrSpace { + std::vector mStarts; + std::vector mTables; + static mozilla::Atomic sCurrent; + + public: + explicit EHAddrSpace(const std::vector& aTables); + const EHTable* lookup(uint32_t aPC) const; + static void Update(); + static const EHAddrSpace* Get(); +}; + +void EHABIStackWalkInit() { EHAddrSpace::Update(); } + +size_t EHABIStackWalk(const mcontext_t& aContext, void* stackBase, void** aSPs, + void** aPCs, const size_t aNumFrames) { + const EHAddrSpace* space = EHAddrSpace::Get(); + EHState state(aContext); + size_t count = 0; + + while (count < aNumFrames) { + uint32_t pc = state[R_PC], sp = state[R_SP]; + + // ARM instructions are always aligned to 2 or 4 bytes. + // The last bit of the pc / lr indicates ARM or Thumb mode. + // We're only interested in the instruction address, so we mask off that + // bit. + constexpr uint32_t instrAddrMask = ~1; + uint32_t instrAddress = pc & instrAddrMask; + + aPCs[count] = reinterpret_cast(instrAddress); + aSPs[count] = reinterpret_cast(sp); + count++; + + if (!space) break; + // TODO: cache these lookups. Binary-searching libxul is + // expensive (possibly more expensive than doing the actual + // unwind), and even a small cache should help. + const EHTable* table = space->lookup(pc); + if (!table) break; + const EHEntry* entry = table->lookup(pc); + if (!entry) break; + if (!state.unwind(entry, stackBase)) break; + } + + return count; +} + +class EHInterp { + public: + // Note that stackLimit is exclusive and stackBase is inclusive + // (i.e, stackLimit < SP <= stackBase), following the convention + // set by the AAPCS spec. 
+ EHInterp(EHState& aState, const EHEntry* aEntry, uint32_t aStackLimit, + uint32_t aStackBase) + : mState(aState), + mStackLimit(aStackLimit), + mStackBase(aStackBase), + mNextWord(0), + mWordsLeft(0), + mFailed(false) { + const PRel31& exidx = aEntry->exidx; + uint32_t firstWord; + + if (exidx.mBits == 1) { // EXIDX_CANTUNWIND + mFailed = true; + return; + } + if (exidx.topBit()) { + firstWord = exidx.mBits; + } else { + mNextWord = reinterpret_cast(exidx.compute()); + firstWord = *mNextWord++; + } + + switch (firstWord >> 24) { + case 0x80: // short + mWord = firstWord << 8; + mBytesLeft = 3; + break; + case 0x81: + case 0x82: // long; catch descriptor size ignored + mWord = firstWord << 16; + mBytesLeft = 2; + mWordsLeft = (firstWord >> 16) & 0xff; + break; + default: + // unknown personality + mFailed = true; + } + } + + bool unwind(); + + private: + // TODO: GCC has been observed not CSEing repeated reads of + // mState[R_SP] with writes to mFailed between them, suggesting that + // it hasn't determined that they can't alias and is thus missing + // optimization opportunities. So, we may want to flatten EHState + // into this class; this may also make the code simpler. + EHState& mState; + uint32_t mStackLimit; + uint32_t mStackBase; + const uint32_t* mNextWord; + uint32_t mWord; + uint8_t mWordsLeft; + uint8_t mBytesLeft; + bool mFailed; + + enum { + I_ADDSP = 0x00, // 0sxxxxxx (subtract if s) + M_ADDSP = 0x80, + I_POPMASK = 0x80, // 1000iiii iiiiiiii (if any i set) + M_POPMASK = 0xf0, + I_MOVSP = 0x90, // 1001nnnn + M_MOVSP = 0xf0, + I_POPN = 0xa0, // 1010lnnn + M_POPN = 0xf0, + I_FINISH = 0xb0, // 10110000 + I_POPLO = 0xb1, // 10110001 0000iiii (if any i set) + I_ADDSPBIG = 0xb2, // 10110010 uleb128 + I_POPFDX = 0xb3, // 10110011 sssscccc + I_POPFDX8 = 0xb8, // 10111nnn + M_POPFDX8 = 0xf8, + // "Intel Wireless MMX" extensions omitted. + I_POPFDD = 0xc8, // 1100100h sssscccc + M_POPFDD = 0xfe, + I_POPFDD8 = 0xd0, // 11010nnn + M_POPFDD8 = 0xf8 + }; + + uint8_t next() { + if (mBytesLeft == 0) { + if (mWordsLeft == 0) { + return I_FINISH; + } + mWordsLeft--; + mWord = *mNextWord++; + mBytesLeft = 4; + } + mBytesLeft--; + mWord = (mWord << 8) | (mWord >> 24); // rotate + return mWord; + } + + uint32_t& vSP() { return mState[R_SP]; } + uint32_t* ptrSP() { return reinterpret_cast(vSP()); } + + void checkStackBase() { + if (vSP() > mStackBase) mFailed = true; + } + void checkStackLimit() { + if (vSP() <= mStackLimit) mFailed = true; + } + void checkStackAlign() { + if ((vSP() & 3) != 0) mFailed = true; + } + void checkStack() { + checkStackBase(); + checkStackLimit(); + checkStackAlign(); + } + + void popRange(uint8_t first, uint8_t last, uint16_t mask) { + bool hasSP = false; + uint32_t tmpSP; + if (mask == 0) mFailed = true; + for (uint8_t r = first; r <= last; ++r) { + if (mask & 1) { + if (r == R_SP) { + hasSP = true; + tmpSP = *ptrSP(); + } else + mState[r] = *ptrSP(); + vSP() += 4; + checkStackBase(); + if (mFailed) return; + } + mask >>= 1; + } + if (hasSP) { + vSP() = tmpSP; + checkStack(); + } + } +}; + +bool EHState::unwind(const EHEntry* aEntry, const void* stackBasePtr) { + // The unwinding program cannot set SP to less than the initial value. 
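+ // checkStackLimit() treats the limit as exclusive (it fails when vSP() <= mStackLimit),
+ // so subtracting 4 still lets SP stay exactly at its initial value while forbidding any move below it.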
+ uint32_t stackLimit = mRegs[R_SP] - 4; + uint32_t stackBase = reinterpret_cast(stackBasePtr); + EHInterp interp(*this, aEntry, stackLimit, stackBase); + return interp.unwind(); +} + +bool EHInterp::unwind() { + mState[R_PC] = 0; + checkStack(); + while (!mFailed) { + uint8_t insn = next(); +#if DEBUG_EHABI_UNWIND + LOG("unwind insn = %02x", (unsigned)insn); +#endif + // Try to put the common cases first. + + // 00xxxxxx: vsp = vsp + (xxxxxx << 2) + 4 + // 01xxxxxx: vsp = vsp - (xxxxxx << 2) - 4 + if ((insn & M_ADDSP) == I_ADDSP) { + uint32_t offset = ((insn & 0x3f) << 2) + 4; + if (insn & 0x40) { + vSP() -= offset; + checkStackLimit(); + } else { + vSP() += offset; + checkStackBase(); + } + continue; + } + + // 10100nnn: Pop r4-r[4+nnn] + // 10101nnn: Pop r4-r[4+nnn], r14 + if ((insn & M_POPN) == I_POPN) { + uint8_t n = (insn & 0x07) + 1; + bool lr = insn & 0x08; + uint32_t* ptr = ptrSP(); + vSP() += (n + (lr ? 1 : 0)) * 4; + checkStackBase(); + for (uint8_t r = 4; r < 4 + n; ++r) mState[r] = *ptr++; + if (lr) mState[R_LR] = *ptr++; + continue; + } + + // 1011000: Finish + if (insn == I_FINISH) { + if (mState[R_PC] == 0) { + mState[R_PC] = mState[R_LR]; + // Non-standard change (bug 916106): Prevent the caller from + // re-using LR. Since the caller is by definition not a leaf + // routine, it will have to restore LR from somewhere to + // return to its own caller, so we can safely zero it here. + // This makes a difference only if an error in unwinding + // (e.g., caused by starting from within a prologue/epilogue) + // causes us to load a pointer to a leaf routine as LR; if we + // don't do something, we'll go into an infinite loop of + // "returning" to that same function. + mState[R_LR] = 0; + } + return true; + } + + // 1001nnnn: Set vsp = r[nnnn] + if ((insn & M_MOVSP) == I_MOVSP) { + vSP() = mState[insn & 0x0f]; + checkStack(); + continue; + } + + // 11001000 sssscccc: Pop VFP regs D[16+ssss]-D[16+ssss+cccc] (as FLDMFDD) + // 11001001 sssscccc: Pop VFP regs D[ssss]-D[ssss+cccc] (as FLDMFDD) + if ((insn & M_POPFDD) == I_POPFDD) { + uint8_t n = (next() & 0x0f) + 1; + // Note: if the 16+ssss+cccc > 31, the encoding is reserved. + // As the space is currently unused, we don't try to check. + vSP() += 8 * n; + checkStackBase(); + continue; + } + + // 11010nnn: Pop VFP regs D[8]-D[8+nnn] (as FLDMFDD) + if ((insn & M_POPFDD8) == I_POPFDD8) { + uint8_t n = (insn & 0x07) + 1; + vSP() += 8 * n; + checkStackBase(); + continue; + } + + // 10110010 uleb128: vsp = vsp + 0x204 + (uleb128 << 2) + if (insn == I_ADDSPBIG) { + uint32_t acc = 0; + uint8_t shift = 0; + uint8_t byte; + do { + if (shift >= 32) return false; + byte = next(); + acc |= (byte & 0x7f) << shift; + shift += 7; + } while (byte & 0x80); + uint32_t offset = 0x204 + (acc << 2); + // The calculations above could have overflowed. 
+ // But the one we care about is this: + if (vSP() + offset < vSP()) mFailed = true; + vSP() += offset; + // ...so that this is the only other check needed: + checkStackBase(); + continue; + } + + // 1000iiii iiiiiiii (i not all 0): Pop under masks {r15-r12}, {r11-r4} + if ((insn & M_POPMASK) == I_POPMASK) { + popRange(4, 15, ((insn & 0x0f) << 8) | next()); + continue; + } + + // 1011001 0000iiii (i not all 0): Pop under mask {r3-r0} + if (insn == I_POPLO) { + popRange(0, 3, next() & 0x0f); + continue; + } + + // 10110011 sssscccc: Pop VFP regs D[ssss]-D[ssss+cccc] (as FLDMFDX) + if (insn == I_POPFDX) { + uint8_t n = (next() & 0x0f) + 1; + vSP() += 8 * n + 4; + checkStackBase(); + continue; + } + + // 10111nnn: Pop VFP regs D[8]-D[8+nnn] (as FLDMFDX) + if ((insn & M_POPFDX8) == I_POPFDX8) { + uint8_t n = (insn & 0x07) + 1; + vSP() += 8 * n + 4; + checkStackBase(); + continue; + } + + // unhandled instruction +#ifdef DEBUG_EHABI_UNWIND + LOG("Unhandled EHABI instruction 0x%02x", insn); +#endif + mFailed = true; + } + return false; +} + +bool operator<(const EHTable& lhs, const EHTable& rhs) { + return lhs.startPC() < rhs.startPC(); +} + +// Async signal unsafe. +EHAddrSpace::EHAddrSpace(const std::vector& aTables) + : mTables(aTables) { + std::sort(mTables.begin(), mTables.end()); + DebugOnly lastEnd = 0; + for (std::vector::iterator i = mTables.begin(); i != mTables.end(); + ++i) { + MOZ_ASSERT(i->startPC() >= lastEnd); + mStarts.push_back(i->startPC()); + lastEnd = i->endPC(); + } +} + +const EHTable* EHAddrSpace::lookup(uint32_t aPC) const { + ptrdiff_t i = (std::upper_bound(mStarts.begin(), mStarts.end(), aPC) - + mStarts.begin()) - + 1; + + if (i < 0 || aPC >= mTables[i].endPC()) return 0; + return &mTables[i]; +} + +const EHEntry* EHTable::lookup(uint32_t aPC) const { + MOZ_ASSERT(aPC >= mStartPC); + if (aPC >= mEndPC) return nullptr; + + const EHEntry* begin = mEntriesBegin; + const EHEntry* end = mEntriesEnd; + MOZ_ASSERT(begin < end); + if (aPC < reinterpret_cast(begin->startPC.compute())) + return nullptr; + + while (end - begin > 1) { +#ifdef EHABI_UNWIND_MORE_ASSERTS + if ((end - 1)->startPC.compute() < begin->startPC.compute()) { + MOZ_CRASH("unsorted exidx"); + } +#endif + const EHEntry* mid = begin + (end - begin) / 2; + if (aPC < reinterpret_cast(mid->startPC.compute())) + end = mid; + else + begin = mid; + } + return begin; +} + +#if MOZ_LITTLE_ENDIAN() +static const unsigned char hostEndian = ELFDATA2LSB; +#elif MOZ_BIG_ENDIAN() +static const unsigned char hostEndian = ELFDATA2MSB; +#else +# error "No endian?" +#endif + +// Async signal unsafe: std::vector::reserve, std::string copy ctor. +EHTable::EHTable(const void* aELF, size_t aSize, const std::string& aName) + : mStartPC(~0), // largest uint32_t + mEndPC(0), + mEntriesBegin(nullptr), + mEntriesEnd(nullptr), + mName(aName) { + const uint32_t fileHeaderAddr = reinterpret_cast(aELF); + + if (aSize < sizeof(Elf32_Ehdr)) return; + + const Elf32_Ehdr& file = *(reinterpret_cast(fileHeaderAddr)); + if (memcmp(&file.e_ident[EI_MAG0], ELFMAG, SELFMAG) != 0 || + file.e_ident[EI_CLASS] != ELFCLASS32 || + file.e_ident[EI_DATA] != hostEndian || + file.e_ident[EI_VERSION] != EV_CURRENT || file.e_machine != EM_ARM || + file.e_version != EV_CURRENT) + // e_flags? 
+ return; + + MOZ_ASSERT(file.e_phoff + file.e_phnum * file.e_phentsize <= aSize); + const Elf32_Phdr *exidxHdr = 0, *zeroHdr = 0; + for (unsigned i = 0; i < file.e_phnum; ++i) { + const Elf32_Phdr& phdr = *(reinterpret_cast( + fileHeaderAddr + file.e_phoff + i * file.e_phentsize)); + if (phdr.p_type == PT_ARM_EXIDX) { + exidxHdr = &phdr; + } else if (phdr.p_type == PT_LOAD) { + if (phdr.p_offset == 0) { + zeroHdr = &phdr; + } + if (phdr.p_flags & PF_X) { + mStartPC = std::min(mStartPC, phdr.p_vaddr); + mEndPC = std::max(mEndPC, phdr.p_vaddr + phdr.p_memsz); + } + } + } + if (!exidxHdr) return; + if (!zeroHdr) return; + mBaseAddress = fileHeaderAddr - zeroHdr->p_vaddr; + mStartPC += mBaseAddress; + mEndPC += mBaseAddress; + mEntriesBegin = + reinterpret_cast(mBaseAddress + exidxHdr->p_vaddr); + mEntriesEnd = reinterpret_cast( + mBaseAddress + exidxHdr->p_vaddr + exidxHdr->p_memsz); +} + +mozilla::Atomic EHAddrSpace::sCurrent(nullptr); + +// Async signal safe; can fail if Update() hasn't returned yet. +const EHAddrSpace* EHAddrSpace::Get() { return sCurrent; } + +// Collect unwinding information from loaded objects. Calls after the +// first have no effect. Async signal unsafe. +void EHAddrSpace::Update() { + const EHAddrSpace* space = sCurrent; + if (space) return; + + SharedLibraryInfo info = SharedLibraryInfo::GetInfoForSelf(); + std::vector tables; + + for (size_t i = 0; i < info.GetSize(); ++i) { + const SharedLibrary& lib = info.GetEntry(i); + // FIXME: This isn't correct if the start address isn't p_offset 0, because + // the start address will not point at the file header. But this is worked + // around by magic number checks in the EHTable constructor. + EHTable tab(reinterpret_cast(lib.GetStart()), + lib.GetEnd() - lib.GetStart(), lib.GetNativeDebugPath()); + if (tab.isValid()) tables.push_back(tab); + } + space = new EHAddrSpace(tables); + + if (!sCurrent.compareExchange(nullptr, space)) { + delete space; + space = sCurrent; + } +} + +EHState::EHState(const mcontext_t& context) { +#ifdef linux + mRegs[0] = context.arm_r0; + mRegs[1] = context.arm_r1; + mRegs[2] = context.arm_r2; + mRegs[3] = context.arm_r3; + mRegs[4] = context.arm_r4; + mRegs[5] = context.arm_r5; + mRegs[6] = context.arm_r6; + mRegs[7] = context.arm_r7; + mRegs[8] = context.arm_r8; + mRegs[9] = context.arm_r9; + mRegs[10] = context.arm_r10; + mRegs[11] = context.arm_fp; + mRegs[12] = context.arm_ip; + mRegs[13] = context.arm_sp; + mRegs[14] = context.arm_lr; + mRegs[15] = context.arm_pc; +#else +# error "Unhandled OS for ARM EHABI unwinding" +#endif +} + +} // namespace mozilla diff --git a/tools/profiler/core/EHABIStackWalk.h b/tools/profiler/core/EHABIStackWalk.h new file mode 100644 index 0000000000..61286290b8 --- /dev/null +++ b/tools/profiler/core/EHABIStackWalk.h @@ -0,0 +1,28 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * This is an implementation of stack unwinding according to a subset + * of the ARM Exception Handling ABI; see the comment at the top of + * the .cpp file for details. 
+ */ + +#ifndef mozilla_EHABIStackWalk_h__ +#define mozilla_EHABIStackWalk_h__ + +#include +#include + +namespace mozilla { + +void EHABIStackWalkInit(); + +size_t EHABIStackWalk(const mcontext_t& aContext, void* stackBase, void** aSPs, + void** aPCs, size_t aNumFrames); + +} // namespace mozilla + +#endif diff --git a/tools/profiler/core/MicroGeckoProfiler.cpp b/tools/profiler/core/MicroGeckoProfiler.cpp new file mode 100644 index 0000000000..bedb755742 --- /dev/null +++ b/tools/profiler/core/MicroGeckoProfiler.cpp @@ -0,0 +1,203 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "GeckoProfiler.h" + +#include "mozilla/Maybe.h" +#include "nsPrintfCString.h" +#include "public/GeckoTraceEvent.h" + +using namespace mozilla; +using webrtc::trace_event_internal::TraceValueUnion; + +void uprofiler_register_thread(const char* name, void* stacktop) { +#ifdef MOZ_GECKO_PROFILER + profiler_register_thread(name, stacktop); +#endif // MOZ_GECKO_PROFILER +} + +void uprofiler_unregister_thread() { +#ifdef MOZ_GECKO_PROFILER + profiler_unregister_thread(); +#endif // MOZ_GECKO_PROFILER +} + +#ifdef MOZ_GECKO_PROFILER +namespace { +Maybe ToTiming(char phase) { + switch (phase) { + case 'B': + return Some(MarkerTiming::IntervalStart()); + case 'E': + return Some(MarkerTiming::IntervalEnd()); + case 'I': + return Some(MarkerTiming::InstantNow()); + default: + return Nothing(); + } +} + +struct TraceOption { + bool mPassed = false; + ProfilerString8View mName; + Variant mValue = AsVariant(false); +}; + +struct TraceMarker { + static constexpr int MAX_NUM_ARGS = 2; + using OptionsType = std::tuple; + static constexpr mozilla::Span MarkerTypeName() { + return MakeStringSpan("TraceEvent"); + } + static void StreamJSONMarkerData( + mozilla::baseprofiler::SpliceableJSONWriter& aWriter, + const OptionsType& aArgs) { + auto writeValue = [&](const auto& aName, const auto& aVariant) { + aVariant.match( + [&](const int64_t& aValue) { aWriter.IntProperty(aName, aValue); }, + [&](const bool& aValue) { aWriter.BoolProperty(aName, aValue); }, + [&](const double& aValue) { aWriter.DoubleProperty(aName, aValue); }, + [&](const ProfilerString8View& aValue) { + aWriter.StringProperty(aName, aValue); + }); + }; + if (const auto& arg = std::get<0>(aArgs); arg.mPassed) { + aWriter.StringProperty("name1", arg.mName); + writeValue("val1", arg.mValue); + } + if (const auto& arg = std::get<1>(aArgs); arg.mPassed) { + aWriter.StringProperty("name2", arg.mName); + writeValue("val2", arg.mValue); + } + } + static mozilla::MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.SetChartLabel("{marker.name}"); + schema.SetTableLabel( + "{marker.name} {marker.data.name1} {marker.data.val1} " + "{marker.data.name2} {marker.data.val2}"); + schema.AddKeyLabelFormatSearchable("name1", "Key 1", MS::Format::String, + MS::Searchable::Searchable); + schema.AddKeyLabelFormatSearchable("val1", "Value 1", MS::Format::String, + MS::Searchable::Searchable); + schema.AddKeyLabelFormatSearchable("name2", "Key 2", MS::Format::String, + MS::Searchable::Searchable); + schema.AddKeyLabelFormatSearchable("val2", "Value 2", MS::Format::String, + MS::Searchable::Searchable); + return schema; + } +}; +} // namespace + +namespace mozilla { +template <> +struct ProfileBufferEntryWriter::Serializer { + static 
Length Bytes(const TraceOption& aOption) { + // 1 byte to store passed flag, then object size if passed. + return aOption.mPassed ? (1 + SumBytes(aOption.mName, aOption.mValue)) : 1; + } + + static void Write(ProfileBufferEntryWriter& aEW, const TraceOption& aOption) { + // 'T'/'t' is just an arbitrary 1-byte value to distinguish states. + if (aOption.mPassed) { + aEW.WriteObject('T'); + // Use the Serializer for the name/value pair. + aEW.WriteObject(aOption.mName); + aEW.WriteObject(aOption.mValue); + } else { + aEW.WriteObject('t'); + } + } +}; + +template <> +struct ProfileBufferEntryReader::Deserializer { + static void ReadInto(ProfileBufferEntryReader& aER, TraceOption& aOption) { + char c = aER.ReadObject(); + if ((aOption.mPassed = (c == 'T'))) { + aER.ReadIntoObject(aOption.mName); + aER.ReadIntoObject(aOption.mValue); + } else { + MOZ_ASSERT(c == 't'); + } + } + + static TraceOption Read(ProfileBufferEntryReader& aER) { + TraceOption option; + ReadInto(aER, option); + return option; + } +}; +} // namespace mozilla +#endif // MOZ_GECKO_PROFILER + +void uprofiler_simple_event_marker(const char* name, char phase, int num_args, + const char** arg_names, + const unsigned char* arg_types, + const unsigned long long* arg_values) { +#ifdef MOZ_GECKO_PROFILER + if (!profiler_thread_is_being_profiled_for_markers()) { + return; + } + Maybe timing = ToTiming(phase); + if (!timing) { + if (getenv("MOZ_LOG_UNKNOWN_TRACE_EVENT_PHASES")) { + fprintf(stderr, "XXX UProfiler: phase not handled: '%c'\n", phase); + } + return; + } + MOZ_ASSERT(num_args <= TraceMarker::MAX_NUM_ARGS); + TraceMarker::OptionsType tuple; + TraceOption* args[2] = {&std::get<0>(tuple), &std::get<1>(tuple)}; + for (int i = 0; i < std::min(num_args, TraceMarker::MAX_NUM_ARGS); ++i) { + auto& arg = *args[i]; + arg.mPassed = true; + arg.mName = ProfilerString8View::WrapNullTerminatedString(arg_names[i]); + switch (arg_types[i]) { + case TRACE_VALUE_TYPE_UINT: + MOZ_ASSERT(arg_values[i] <= std::numeric_limits::max()); + arg.mValue = AsVariant(static_cast( + reinterpret_cast(&arg_values[i])->as_uint)); + break; + case TRACE_VALUE_TYPE_INT: + arg.mValue = AsVariant(static_cast( + reinterpret_cast(&arg_values[i])->as_int)); + break; + case TRACE_VALUE_TYPE_BOOL: + arg.mValue = AsVariant( + reinterpret_cast(&arg_values[i])->as_bool); + break; + case TRACE_VALUE_TYPE_DOUBLE: + arg.mValue = + AsVariant(reinterpret_cast(&arg_values[i]) + ->as_double); + break; + case TRACE_VALUE_TYPE_POINTER: + arg.mValue = AsVariant(ProfilerString8View(nsPrintfCString( + "%p", reinterpret_cast(&arg_values[i]) + ->as_pointer))); + break; + case TRACE_VALUE_TYPE_STRING: + arg.mValue = AsVariant(ProfilerString8View::WrapNullTerminatedString( + reinterpret_cast(&arg_values[i]) + ->as_string)); + break; + case TRACE_VALUE_TYPE_COPY_STRING: + arg.mValue = AsVariant(ProfilerString8View( + nsCString(reinterpret_cast(&arg_values[i]) + ->as_string))); + break; + default: + MOZ_ASSERT_UNREACHABLE("Unexpected trace value type"); + arg.mValue = AsVariant(ProfilerString8View( + nsPrintfCString("Unexpected type: %u", arg_types[i]))); + break; + } + } + profiler_add_marker(ProfilerString8View::WrapNullTerminatedString(name), + geckoprofiler::category::MEDIA_RT, {timing.extract()}, + TraceMarker{}, tuple); +#endif // MOZ_GECKO_PROFILER +} diff --git a/tools/profiler/core/PageInformation.cpp b/tools/profiler/core/PageInformation.cpp new file mode 100644 index 0000000000..83d2d508a1 --- /dev/null +++ b/tools/profiler/core/PageInformation.cpp @@ -0,0 +1,44 @@ +/* -*- 
Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "PageInformation.h" + +#include "mozilla/ProfileJSONWriter.h" + +PageInformation::PageInformation(uint64_t aTabID, uint64_t aInnerWindowID, + const nsCString& aUrl, + uint64_t aEmbedderInnerWindowID, + bool aIsPrivateBrowsing) + : mTabID(aTabID), + mInnerWindowID(aInnerWindowID), + mUrl(aUrl), + mEmbedderInnerWindowID(aEmbedderInnerWindowID), + mIsPrivateBrowsing(aIsPrivateBrowsing) {} + +bool PageInformation::Equals(PageInformation* aOtherPageInfo) const { + // It's enough to check inner window IDs because they are unique for each + // page. Therefore, we don't have to check the tab ID or url. + return InnerWindowID() == aOtherPageInfo->InnerWindowID(); +} + +void PageInformation::StreamJSON(SpliceableJSONWriter& aWriter) const { + // Here, we are converting uint64_t to double. Both tab and Inner + // Window IDs are created using `nsContentUtils::GenerateProcessSpecificId`, + // which is specifically designed to only use 53 of the 64 bits to be lossless + // when passed into and out of JS as a double. + aWriter.StartObjectElement(); + aWriter.DoubleProperty("tabID", TabID()); + aWriter.DoubleProperty("innerWindowID", InnerWindowID()); + aWriter.StringProperty("url", Url()); + aWriter.DoubleProperty("embedderInnerWindowID", EmbedderInnerWindowID()); + aWriter.BoolProperty("isPrivateBrowsing", IsPrivateBrowsing()); + aWriter.EndObject(); +} + +size_t PageInformation::SizeOfIncludingThis( + mozilla::MallocSizeOf aMallocSizeOf) const { + return aMallocSizeOf(this); +} diff --git a/tools/profiler/core/PageInformation.h b/tools/profiler/core/PageInformation.h new file mode 100644 index 0000000000..6c9039b9a4 --- /dev/null +++ b/tools/profiler/core/PageInformation.h @@ -0,0 +1,68 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef PageInformation_h +#define PageInformation_h + +#include "mozilla/Maybe.h" +#include "mozilla/MemoryReporting.h" +#include "nsISupportsImpl.h" +#include "nsString.h" + +namespace mozilla { +namespace baseprofiler { +class SpliceableJSONWriter; +} // namespace baseprofiler +} // namespace mozilla + +// This class contains information that's relevant to a single page only +// while the page information is important and registered with the profiler, +// but regardless of whether the profiler is running. All accesses to it are +// protected by the profiler state lock. +// When the page gets unregistered, we keep the profiler buffer position +// to determine if we are still using this page. If not, we unregister +// it in the next page registration. 
+class PageInformation final { + public: + NS_INLINE_DECL_THREADSAFE_REFCOUNTING(PageInformation) + PageInformation(uint64_t aTabID, uint64_t aInnerWindowID, + const nsCString& aUrl, uint64_t aEmbedderInnerWindowID, + bool aIsPrivateBrowsing); + + size_t SizeOfIncludingThis(mozilla::MallocSizeOf aMallocSizeOf) const; + bool Equals(PageInformation* aOtherPageInfo) const; + void StreamJSON(mozilla::baseprofiler::SpliceableJSONWriter& aWriter) const; + + uint64_t InnerWindowID() const { return mInnerWindowID; } + uint64_t TabID() const { return mTabID; } + const nsCString& Url() const { return mUrl; } + uint64_t EmbedderInnerWindowID() const { return mEmbedderInnerWindowID; } + bool IsPrivateBrowsing() const { return mIsPrivateBrowsing; } + + mozilla::Maybe BufferPositionWhenUnregistered() const { + return mBufferPositionWhenUnregistered; + } + + void NotifyUnregistered(uint64_t aBufferPosition) { + mBufferPositionWhenUnregistered = mozilla::Some(aBufferPosition); + } + + private: + const uint64_t mTabID; + const uint64_t mInnerWindowID; + const nsCString mUrl; + const uint64_t mEmbedderInnerWindowID; + const bool mIsPrivateBrowsing; + + // Holds the buffer position when page is unregistered. + // It's used to determine if we still use this page in the profiler or + // not. + mozilla::Maybe mBufferPositionWhenUnregistered; + + virtual ~PageInformation() = default; +}; + +#endif // PageInformation_h diff --git a/tools/profiler/core/PlatformMacros.h b/tools/profiler/core/PlatformMacros.h new file mode 100644 index 0000000000..c72e94c128 --- /dev/null +++ b/tools/profiler/core/PlatformMacros.h @@ -0,0 +1,130 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef PLATFORM_MACROS_H +#define PLATFORM_MACROS_H + +// Define platform selection macros in a consistent way. Don't add anything +// else to this file, so it can remain freestanding. The primary factorisation +// is on (ARCH,OS) pairs ("PLATforms") but ARCH_ and OS_ macros are defined +// too, since they are sometimes convenient. +// +// Note: "GP" is short for "Gecko Profiler". + +#undef GP_PLAT_x86_android +#undef GP_PLAT_amd64_android +#undef GP_PLAT_arm_android +#undef GP_PLAT_arm64_android +#undef GP_PLAT_x86_linux +#undef GP_PLAT_amd64_linux +#undef GP_PLAT_arm_linux +#undef GP_PLAT_mips64_linux +#undef GP_PLAT_amd64_darwin +#undef GP_PLAT_arm64_darwin +#undef GP_PLAT_x86_windows +#undef GP_PLAT_amd64_windows +#undef GP_PLAT_arm64_windows + +#undef GP_ARCH_x86 +#undef GP_ARCH_amd64 +#undef GP_ARCH_arm +#undef GP_ARCH_arm64 +#undef GP_ARCH_mips64 + +#undef GP_OS_android +#undef GP_OS_linux +#undef GP_OS_darwin +#undef GP_OS_windows + +// We test __ANDROID__ before __linux__ because __linux__ is defined on both +// Android and Linux, whereas GP_OS_android is not defined on vanilla Linux. 
+ +#if defined(__ANDROID__) && defined(__i386__) +# define GP_PLAT_x86_android 1 +# define GP_ARCH_x86 1 +# define GP_OS_android 1 + +#elif defined(__ANDROID__) && defined(__x86_64__) +# define GP_PLAT_amd64_android 1 +# define GP_ARCH_amd64 1 +# define GP_OS_android 1 + +#elif defined(__ANDROID__) && defined(__arm__) +# define GP_PLAT_arm_android 1 +# define GP_ARCH_arm 1 +# define GP_OS_android 1 + +#elif defined(__ANDROID__) && defined(__aarch64__) +# define GP_PLAT_arm64_android 1 +# define GP_ARCH_arm64 1 +# define GP_OS_android 1 + +#elif defined(__linux__) && defined(__i386__) +# define GP_PLAT_x86_linux 1 +# define GP_ARCH_x86 1 +# define GP_OS_linux 1 + +#elif defined(__linux__) && defined(__x86_64__) +# define GP_PLAT_amd64_linux 1 +# define GP_ARCH_amd64 1 +# define GP_OS_linux 1 + +#elif defined(__linux__) && defined(__arm__) +# define GP_PLAT_arm_linux 1 +# define GP_ARCH_arm 1 +# define GP_OS_linux 1 + +#elif defined(__linux__) && defined(__aarch64__) +# define GP_PLAT_arm64_linux 1 +# define GP_ARCH_arm64 1 +# define GP_OS_linux 1 + +#elif defined(__linux__) && defined(__mips64) +# define GP_PLAT_mips64_linux 1 +# define GP_ARCH_mips64 1 +# define GP_OS_linux 1 + +#elif defined(__APPLE__) && defined(__aarch64__) +# define GP_PLAT_arm64_darwin 1 +# define GP_ARCH_arm64 1 +# define GP_OS_darwin 1 + +#elif defined(__APPLE__) && defined(__x86_64__) +# define GP_PLAT_amd64_darwin 1 +# define GP_ARCH_amd64 1 +# define GP_OS_darwin 1 + +#elif defined(__FreeBSD__) && defined(__x86_64__) +# define GP_PLAT_amd64_freebsd 1 +# define GP_ARCH_amd64 1 +# define GP_OS_freebsd 1 + +#elif defined(__FreeBSD__) && defined(__aarch64__) +# define GP_PLAT_arm64_freebsd 1 +# define GP_ARCH_arm64 1 +# define GP_OS_freebsd 1 + +#elif (defined(_MSC_VER) || defined(__MINGW32__)) && \ + (defined(_M_IX86) || defined(__i386__)) +# define GP_PLAT_x86_windows 1 +# define GP_ARCH_x86 1 +# define GP_OS_windows 1 + +#elif (defined(_MSC_VER) || defined(__MINGW32__)) && \ + (defined(_M_X64) || defined(__x86_64__)) +# define GP_PLAT_amd64_windows 1 +# define GP_ARCH_amd64 1 +# define GP_OS_windows 1 + +#elif defined(_MSC_VER) && defined(_M_ARM64) +# define GP_PLAT_arm64_windows 1 +# define GP_ARCH_arm64 1 +# define GP_OS_windows 1 + +#else +# error "Unsupported platform" +#endif + +#endif /* ndef PLATFORM_MACROS_H */ diff --git a/tools/profiler/core/PowerCounters-linux.cpp b/tools/profiler/core/PowerCounters-linux.cpp new file mode 100644 index 0000000000..006cea4867 --- /dev/null +++ b/tools/profiler/core/PowerCounters-linux.cpp @@ -0,0 +1,287 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "PowerCounters.h" +#include "nsXULAppAPI.h" +#include "mozilla/Maybe.h" +#include "mozilla/Logging.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +// From the kernel rapl_scale() function: +// +// > users must then scale back: count * 1/(1e9*2^32) to get Joules +#define PERF_EVENT_SCALE_NANOJOULES 2.3283064365386962890625e-1 +#define SCALE_NANOJOULES_TO_PICOWATTHOUR 3.6 +#define SYSFS_PERF_POWER_TYPE_PATH "/sys/bus/event_source/devices/power/type" + +static mozilla::LazyLogModule sRaplEventLog("profiler.rapl"); +#define RAPL_LOG(...) 
\ + MOZ_LOG(sRaplEventLog, mozilla::LogLevel::Debug, (__VA_ARGS__)); + +enum class RaplEventType : uint64_t { + RAPL_ENERGY_CORES = 0x01, + RAPL_ENERGY_PKG = 0x02, + RAPL_ENERGY_DRAM = 0x03, + RAPL_ENERGY_GPU = 0x04, + RAPL_ENERGY_PSYS = 0x05, +}; + +struct RaplDomain { + RaplEventType mRaplEventType; + const char* mLabel; + const char* mDescription; +}; + +constexpr RaplDomain kSupportedRaplDomains[] = { + {RaplEventType::RAPL_ENERGY_CORES, "Power: CPU cores", + "Consumption of all physical cores"}, + { + RaplEventType::RAPL_ENERGY_PKG, + "Power: CPU package", + "Consumption of the whole processor package", + }, + { + RaplEventType::RAPL_ENERGY_DRAM, + "Power: DRAM", + "Consumption of the dram domain", + }, + { + RaplEventType::RAPL_ENERGY_GPU, + "Power: iGPU", + "Consumption of the builtin-gpu domain", + }, + { + RaplEventType::RAPL_ENERGY_PSYS, + "Power: System", + "Consumption of the builtin-psys domain", + }}; + +static std::string GetSysfsFileID(RaplEventType aEventType) { + switch (aEventType) { + case RaplEventType::RAPL_ENERGY_CORES: + return "cores"; + case RaplEventType::RAPL_ENERGY_PKG: + return "pkg"; + case RaplEventType::RAPL_ENERGY_DRAM: + return "ram"; + case RaplEventType::RAPL_ENERGY_GPU: + return "gpu"; + case RaplEventType::RAPL_ENERGY_PSYS: + return "psys"; + } + + return ""; +} + +static double GetRaplPerfEventScale(RaplEventType aEventType) { + const std::string sysfsFileName = + "/sys/bus/event_source/devices/power/events/energy-" + + GetSysfsFileID(aEventType) + ".scale"; + std::ifstream sysfsFile(sysfsFileName); + + if (!sysfsFile) { + return PERF_EVENT_SCALE_NANOJOULES; + } + + double scale; + + if (sysfsFile >> scale) { + RAPL_LOG("Read scale from %s: %.22e", sysfsFileName.c_str(), scale); + return scale * 1e9; + } + + return PERF_EVENT_SCALE_NANOJOULES; +} + +static uint64_t GetRaplPerfEventConfig(RaplEventType aEventType) { + const std::string sysfsFileName = + "/sys/bus/event_source/devices/power/events/energy-" + + GetSysfsFileID(aEventType); + std::ifstream sysfsFile(sysfsFileName); + + if (!sysfsFile) { + return static_cast(aEventType); + } + + char buffer[7] = {}; + const std::string key = "event="; + + if (!sysfsFile.get(buffer, static_cast(key.length()) + 1) || + key != buffer) { + return static_cast(aEventType); + } + + uint64_t config; + + if (sysfsFile >> std::hex >> config) { + RAPL_LOG("Read config from %s: 0x%" PRIx64, sysfsFileName.c_str(), config); + return config; + } + + return static_cast(aEventType); +} + +class RaplProfilerCount final : public BaseProfilerCount { + public: + explicit RaplProfilerCount(int aPerfEventType, + const RaplEventType& aPerfEventConfig, + const char* aLabel, const char* aDescription) + : BaseProfilerCount(aLabel, nullptr, nullptr, "power", aDescription), + mLastResult(0), + mPerfEventFd(-1) { + RAPL_LOG("Creating RAPL Event for type: %s", mLabel); + + // Optimize for ease of use and do not set an excludes value. This + // ensures we do not require PERF_PMU_CAP_NO_EXCLUDE. 
+ struct perf_event_attr attr = {0}; + memset(&attr, 0, sizeof(attr)); + attr.type = aPerfEventType; + attr.size = sizeof(struct perf_event_attr); + attr.config = GetRaplPerfEventConfig(aPerfEventConfig); + attr.sample_period = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.inherit = 1; + + RAPL_LOG("Config for event %s: 0x%llx", mLabel, attr.config); + + mEventScale = GetRaplPerfEventScale(aPerfEventConfig); + RAPL_LOG("Scale for event %s: %.22e", mLabel, mEventScale); + + long fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0); + if (fd < 0) { + RAPL_LOG("Event descriptor creation failed for event: %s", mLabel); + mPerfEventFd = -1; + return; + } + + RAPL_LOG("Created descriptor for event: %s", mLabel) + mPerfEventFd = static_cast(fd); + } + + ~RaplProfilerCount() { + if (ValidPerfEventFd()) { + ioctl(mPerfEventFd, PERF_EVENT_IOC_DISABLE, 0); + close(mPerfEventFd); + } + } + + RaplProfilerCount(const RaplProfilerCount&) = delete; + RaplProfilerCount& operator=(const RaplProfilerCount&) = delete; + + CountSample Sample() override { + CountSample result = { + .count = 0, + .number = 0, + .isSampleNew = false, + }; + mozilla::Maybe raplEventResult = ReadEventFd(); + + if (raplEventResult.isNothing()) { + return result; + } + + // We need to return picowatthour to be consistent with the Windows + // EMI API. As a result, the scale calculation should: + // + // - Convert the returned value to nanojoules + // - Convert nanojoules to picowatthour + double nanojoules = + static_cast(raplEventResult.value()) * mEventScale; + double picowatthours = nanojoules / SCALE_NANOJOULES_TO_PICOWATTHOUR; + RAPL_LOG("Sample %s { count: %lu, last-result: %lu } = %lfJ", mLabel, + raplEventResult.value(), mLastResult, nanojoules * 1e-9); + + result.count = static_cast(picowatthours); + + // If the tick count is the same as the returned value or if this is the + // first sample, treat this sample as a duplicate. + result.isSampleNew = + (mLastResult != 0 && mLastResult != raplEventResult.value() && + result.count >= 0); + mLastResult = raplEventResult.value(); + + return result; + } + + bool ValidPerfEventFd() { return mPerfEventFd >= 0; } + + private: + mozilla::Maybe ReadEventFd() { + MOZ_ASSERT(ValidPerfEventFd()); + + uint64_t eventResult; + ssize_t readBytes = read(mPerfEventFd, &eventResult, sizeof(uint64_t)); + if (readBytes != sizeof(uint64_t)) { + RAPL_LOG("Invalid RAPL event read size: %ld", readBytes); + return mozilla::Nothing(); + } + + return mozilla::Some(eventResult); + } + + uint64_t mLastResult; + int mPerfEventFd; + double mEventScale; +}; + +static int GetRaplPerfEventType() { + FILE* fp = fopen(SYSFS_PERF_POWER_TYPE_PATH, "r"); + if (!fp) { + RAPL_LOG("Open of " SYSFS_PERF_POWER_TYPE_PATH " failed"); + return -1; + } + + int readTypeValue = -1; + if (fscanf(fp, "%d", &readTypeValue) != 1) { + RAPL_LOG("Read of " SYSFS_PERF_POWER_TYPE_PATH " failed"); + } + fclose(fp); + + return readTypeValue; +} + +PowerCounters::PowerCounters() { + if (!XRE_IsParentProcess()) { + // Energy meters are global, so only sample them on the parent. + return; + } + + // Get the value perf_event_attr.type should be set to for RAPL + // perf events. 
+ int perfEventType = GetRaplPerfEventType(); + if (perfEventType < 0) { + RAPL_LOG("Failed to find the event type for RAPL perf events."); + return; + } + + for (const auto& raplEventDomain : kSupportedRaplDomains) { + RaplProfilerCount* raplEvent = new RaplProfilerCount( + perfEventType, raplEventDomain.mRaplEventType, raplEventDomain.mLabel, + raplEventDomain.mDescription); + if (!raplEvent->ValidPerfEventFd() || !mCounters.emplaceBack(raplEvent)) { + delete raplEvent; + } + } +} + +PowerCounters::~PowerCounters() { + for (auto* raplEvent : mCounters) { + delete raplEvent; + } + mCounters.clear(); +} + +void PowerCounters::Sample() {} diff --git a/tools/profiler/core/PowerCounters-mac-amd64.cpp b/tools/profiler/core/PowerCounters-mac-amd64.cpp new file mode 100644 index 0000000000..540cee155d --- /dev/null +++ b/tools/profiler/core/PowerCounters-mac-amd64.cpp @@ -0,0 +1,419 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "PowerCounters.h" +#include "nsDebug.h" +#include "nsPrintfCString.h" +#include "nsXULAppAPI.h" // for XRE_IsParentProcess + +// Because of the pkg_energy_statistics_t::pkes_version check below, the +// earliest OS X version this code will work with is 10.9.0 (xnu-2422.1.72). + +#include +#include + +// OS X has four kinds of system calls: +// +// 1. Mach traps; +// 2. UNIX system calls; +// 3. machine-dependent calls; +// 4. diagnostic calls. +// +// (See "Mac OS X and iOS Internals" by Jonathan Levin for more details.) +// +// The last category has a single call named diagCall() or diagCall64(). Its +// mode is controlled by its first argument, and one of the modes allows access +// to the Intel RAPL MSRs. +// +// The interface to diagCall64() is not exported, so we have to import some +// definitions from the XNU kernel. All imported definitions are annotated with +// the XNU source file they come from, and information about what XNU versions +// they were introduced in and (if relevant) modified. + +// The diagCall64() mode. +// From osfmk/i386/Diagnostics.h +// - In 10.8.4 (xnu-2050.24.15) this value was introduced. (In 10.8.3 the value +// 17 was used for dgGzallocTest.) +#define dgPowerStat 17 + +// From osfmk/i386/cpu_data.h +// - In 10.8.5 these values were introduced, along with core_energy_stat_t. +#define CPU_RTIME_BINS (12) +#define CPU_ITIME_BINS (CPU_RTIME_BINS) + +// core_energy_stat_t and pkg_energy_statistics_t are both from +// osfmk/i386/Diagnostics.c. +// - In 10.8.4 (xnu-2050.24.15) both structs were introduced, but with many +// fewer fields. +// - In 10.8.5 (xnu-2050.48.11) both structs were substantially expanded, with +// numerous new fields. +// - In 10.9.0 (xnu-2422.1.72) pkg_energy_statistics_t::pkes_version was added. +// diagCall64(dgPowerStat) fills it with '1' in all versions since (up to +// 10.10.2 at time of writing). +// - in 10.10.2 (xnu-2782.10.72) core_energy_stat_t::gpmcs was conditionally +// added, if DIAG_ALL_PMCS is true. (DIAG_ALL_PMCS is not even defined in the +// source code, but it could be defined at compile-time via compiler flags.) +// pkg_energy_statistics_t::pkes_version did not change, though. 
+ +typedef struct { + uint64_t caperf; + uint64_t cmperf; + uint64_t ccres[6]; + uint64_t crtimes[CPU_RTIME_BINS]; + uint64_t citimes[CPU_ITIME_BINS]; + uint64_t crtime_total; + uint64_t citime_total; + uint64_t cpu_idle_exits; + uint64_t cpu_insns; + uint64_t cpu_ucc; + uint64_t cpu_urc; +#if DIAG_ALL_PMCS // Added in 10.10.2 (xnu-2782.10.72). + uint64_t gpmcs[4]; // Added in 10.10.2 (xnu-2782.10.72). +#endif /* DIAG_ALL_PMCS */ // Added in 10.10.2 (xnu-2782.10.72). +} core_energy_stat_t; + +typedef struct { + uint64_t pkes_version; // Added in 10.9.0 (xnu-2422.1.72). + uint64_t pkg_cres[2][7]; + + // This is read from MSR 0x606, which Intel calls MSR_RAPL_POWER_UNIT + // and XNU calls MSR_IA32_PKG_POWER_SKU_UNIT. + uint64_t pkg_power_unit; + + // These are the four fields for the four RAPL domains. For each field + // we list: + // + // - the corresponding MSR number; + // - Intel's name for that MSR; + // - XNU's name for that MSR; + // - which Intel processors the MSR is supported on. + // + // The last of these is determined from chapter 35 of Volume 3 of the + // "Intel 64 and IA-32 Architecture's Software Developer's Manual", + // Order Number 325384. (Note that chapter 35 contradicts section 14.9 + // to some degree.) + + // 0x611 == MSR_PKG_ENERGY_STATUS == MSR_IA32_PKG_ENERGY_STATUS + // Atom (various), Sandy Bridge, Next Gen Xeon Phi (model 0x57). + uint64_t pkg_energy; + + // 0x639 == MSR_PP0_ENERGY_STATUS == MSR_IA32_PP0_ENERGY_STATUS + // Atom (various), Sandy Bridge, Next Gen Xeon Phi (model 0x57). + uint64_t pp0_energy; + + // 0x641 == MSR_PP1_ENERGY_STATUS == MSR_PP1_ENERGY_STATUS + // Sandy Bridge, Haswell. + uint64_t pp1_energy; + + // 0x619 == MSR_DRAM_ENERGY_STATUS == MSR_IA32_DDR_ENERGY_STATUS + // Xeon E5, Xeon E5 v2, Haswell/Haswell-E, Next Gen Xeon Phi (model + // 0x57) + uint64_t ddr_energy; + + uint64_t llc_flushed_cycles; + uint64_t ring_ratio_instantaneous; + uint64_t IA_frequency_clipping_cause; + uint64_t GT_frequency_clipping_cause; + uint64_t pkg_idle_exits; + uint64_t pkg_rtimes[CPU_RTIME_BINS]; + uint64_t pkg_itimes[CPU_ITIME_BINS]; + uint64_t mbus_delay_time; + uint64_t mint_delay_time; + uint32_t ncpus; + core_energy_stat_t cest[]; +} pkg_energy_statistics_t; + +static int diagCall64(uint64_t aMode, void* aBuf) { + // We cannot use syscall() here because it doesn't work with diagnostic + // system calls -- it raises SIGSYS if you try. So we have to use asm. + +#ifdef __x86_64__ + // The 0x40000 prefix indicates it's a diagnostic system call. The 0x01 + // suffix indicates the syscall number is 1, which also happens to be the + // only diagnostic system call. See osfmk/mach/i386/syscall_sw.h for more + // details. + static const uint64_t diagCallNum = 0x4000001; + uint64_t rv; + + __asm__ __volatile__( + "syscall" + + // Return value goes in "a" (%rax). + : /* outputs */ "=a"(rv) + + // The syscall number goes in "0", a synonym (from outputs) for "a" + // (%rax). The syscall arguments go in "D" (%rdi) and "S" (%rsi). + : /* inputs */ "0"(diagCallNum), "D"(aMode), "S"(aBuf) + + // The |syscall| instruction clobbers %rcx, %r11, and %rflags ("cc"). And + // this particular syscall also writes memory (aBuf). + : /* clobbers */ "rcx", "r11", "cc", "memory"); + return rv; +#else +# error Sorry, only x86-64 is supported +#endif +} + +// This is a counter to collect power utilization during profiling. +// It cannot be a raw `ProfilerCounter` because we need to manually add/remove +// it while the profiler lock is already held. 
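+// Unit note: 1 Wh = 3600 J, so 1 pWh = 3.6e-9 J = 3.6 nJ; converting nanojoules to
+// picowatt-hours therefore divides by 3.6, which is where the 3.6 constants in these counters come from.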
+class RaplDomain final : public BaseProfilerCount { + public: + explicit RaplDomain(const char* aLabel, const char* aDescription) + : BaseProfilerCount(aLabel, nullptr, nullptr, "power", aDescription), + mSample(0), + mEnergyStatusUnits(0), + mWrapAroundCount(0), + mIsSampleNew(false) {} + + CountSample Sample() override { + CountSample result; + + // To be consistent with the Windows EMI API, + // return values in picowatt-hour. + constexpr double NANOJOULES_PER_JOULE = 1'000'000'000; + constexpr double NANOJOULES_TO_PICOWATTHOUR = 3.6; + + uint64_t ticks = (uint64_t(mWrapAroundCount) << 32) + mSample; + double joulesPerTick = (double)1 / (1 << mEnergyStatusUnits); + result.count = static_cast(ticks) * joulesPerTick * + NANOJOULES_PER_JOULE / NANOJOULES_TO_PICOWATTHOUR; + + result.number = 0; + result.isSampleNew = mIsSampleNew; + mIsSampleNew = false; + return result; + } + + void AddSample(uint32_t aSample, uint32_t aEnergyStatusUnits) { + if (aSample == mSample) { + return; + } + + mEnergyStatusUnits = aEnergyStatusUnits; + + if (aSample > mSample) { + mIsSampleNew = true; + mSample = aSample; + return; + } + + // Despite being returned in uint64_t fields, the power counter values + // only use the lowest 32 bits of their fields, and we need to handle + // wraparounds to avoid our power tracks stopping after a few hours. + constexpr uint32_t highestBit = 1 << 31; + if ((mSample & highestBit) && !(aSample & highestBit)) { + mIsSampleNew = true; + ++mWrapAroundCount; + mSample = aSample; + } else { + NS_WARNING("unexpected sample with smaller value"); + } + } + + private: + uint32_t mSample; + uint32_t mEnergyStatusUnits; + uint32_t mWrapAroundCount; + bool mIsSampleNew; +}; + +class RAPL { + bool mIsGpuSupported; // Is the GPU domain supported by the processor? + bool mIsRamSupported; // Is the RAM domain supported by the processor? + + // The DRAM domain on Haswell servers has a fixed energy unit (1/65536 J == + // 15.3 microJoules) which is different to the power unit MSR. (See the + // "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, Volume 2 of + // 2, Registers" datasheet, September 2014, Reference Number: 330784-001.) + // This field records whether the quirk is present. + bool mHasRamUnitsQuirk; + + // The abovementioned 15.3 microJoules value. (2^16 = 65536) + static constexpr double kQuirkyRamEnergyStatusUnits = 16; + + // The struct passed to diagCall64(). + pkg_energy_statistics_t* mPkes; + + RaplDomain* mPkg = nullptr; + RaplDomain* mCores = nullptr; + RaplDomain* mGpu = nullptr; + RaplDomain* mRam = nullptr; + + public: + explicit RAPL(PowerCounters::CountVector& aCounters) + : mHasRamUnitsQuirk(false) { + // Work out which RAPL MSRs this CPU model supports. + int cpuModel; + size_t size = sizeof(cpuModel); + if (sysctlbyname("machdep.cpu.model", &cpuModel, &size, NULL, 0) != 0) { + NS_WARNING("sysctlbyname(\"machdep.cpu.model\") failed"); + return; + } + + // This is similar to arch/x86/kernel/cpu/perf_event_intel_rapl.c in + // linux-4.1.5/. + // + // By linux-5.6.14/, this stuff had moved into + // arch/x86/events/intel/rapl.c, which references processor families in + // arch/x86/include/asm/intel-family.h. + switch (cpuModel) { + case 0x2a: // Sandy Bridge + case 0x3a: // Ivy Bridge + // Supports package, cores, GPU. + mIsGpuSupported = true; + mIsRamSupported = false; + break; + + case 0x3f: // Haswell X + case 0x4f: // Broadwell X + case 0x55: // Skylake X + case 0x56: // Broadwell D + // Supports package, cores, RAM. Has the units quirk. 
+ mIsGpuSupported = false; + mIsRamSupported = true; + mHasRamUnitsQuirk = true; + break; + + case 0x2d: // Sandy Bridge X + case 0x3e: // Ivy Bridge X + // Supports package, cores, RAM. + mIsGpuSupported = false; + mIsRamSupported = true; + break; + + case 0x3c: // Haswell + case 0x3d: // Broadwell + case 0x45: // Haswell L + case 0x46: // Haswell G + case 0x47: // Broadwell G + // Supports package, cores, GPU, RAM. + mIsGpuSupported = true; + mIsRamSupported = true; + break; + + case 0x4e: // Skylake L + case 0x5e: // Skylake + case 0x8e: // Kaby Lake L + case 0x9e: // Kaby Lake + case 0x66: // Cannon Lake L + case 0x7d: // Ice Lake + case 0x7e: // Ice Lake L + case 0xa5: // Comet Lake + case 0xa6: // Comet Lake L + // Supports package, cores, GPU, RAM, PSYS. + // XXX: this tool currently doesn't measure PSYS. + mIsGpuSupported = true; + mIsRamSupported = true; + break; + + default: + NS_WARNING(nsPrintfCString("unknown CPU model: %d", cpuModel).get()); + return; + } + + // Get the maximum number of logical CPUs so that we know how big to make + // |mPkes|. + int logicalcpu_max; + size = sizeof(logicalcpu_max); + if (sysctlbyname("hw.logicalcpu_max", &logicalcpu_max, &size, NULL, 0) != + 0) { + NS_WARNING("sysctlbyname(\"hw.logicalcpu_max\") failed"); + return; + } + + // Over-allocate by 1024 bytes per CPU to allow for the uncertainty around + // core_energy_stat_t::gpmcs and for any other future extensions to that + // struct. (The fields we read all come before the core_energy_stat_t + // array, so it won't matter to us whether gpmcs is present or not.) + size_t pkesSize = sizeof(pkg_energy_statistics_t) + + logicalcpu_max * sizeof(core_energy_stat_t) + + logicalcpu_max * 1024; + mPkes = (pkg_energy_statistics_t*)malloc(pkesSize); + if (mPkes && aCounters.reserve(4)) { + mPkg = new RaplDomain("Power: CPU package", "RAPL PKG"); + aCounters.infallibleAppend(mPkg); + + mCores = new RaplDomain("Power: CPU cores", "RAPL PP0"); + aCounters.infallibleAppend(mCores); + + if (mIsGpuSupported) { + mGpu = new RaplDomain("Power: iGPU", "RAPL PP1"); + aCounters.infallibleAppend(mGpu); + } + + if (mIsRamSupported) { + mRam = new RaplDomain("Power: DRAM", "RAPL DRAM"); + aCounters.infallibleAppend(mRam); + } + } + } + + ~RAPL() { + free(mPkes); + delete mPkg; + delete mCores; + delete mGpu; + delete mRam; + } + + void Sample() { + constexpr uint64_t kSupportedVersion = 1; + + // If we failed to allocate the memory for package energy statistics, we + // have nothing to sample. + if (MOZ_UNLIKELY(!mPkes)) { + return; + } + + // Write an unsupported version number into pkes_version so that the check + // below cannot succeed by dumb luck. + mPkes->pkes_version = kSupportedVersion - 1; + + // diagCall64() returns 1 on success, and 0 on failure (which can only + // happen if the mode is unrecognized, e.g. in 10.7.x or earlier versions). + if (diagCall64(dgPowerStat, mPkes) != 1) { + NS_WARNING("diagCall64() failed"); + return; + } + + if (mPkes->pkes_version != kSupportedVersion) { + NS_WARNING( + nsPrintfCString("unexpected pkes_version: %llu", mPkes->pkes_version) + .get()); + return; + } + + // Bits 12:8 are the ESU. + // Energy measurements come in multiples of 1/(2^ESU). 
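+ // For example, an ESU of 16 means each count is 1/65536 J, i.e. about 15.3 microjoules
+ // (the value hard-wired above for the Haswell-server DRAM quirk).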
+ uint32_t energyStatusUnits = (mPkes->pkg_power_unit >> 8) & 0x1f; + mPkg->AddSample(mPkes->pkg_energy, energyStatusUnits); + mCores->AddSample(mPkes->pp0_energy, energyStatusUnits); + if (mIsGpuSupported) { + mGpu->AddSample(mPkes->pp1_energy, energyStatusUnits); + } + if (mIsRamSupported) { + mRam->AddSample(mPkes->ddr_energy, mHasRamUnitsQuirk + ? kQuirkyRamEnergyStatusUnits + : energyStatusUnits); + } + } +}; + +PowerCounters::PowerCounters() { + // RAPL values are global, so only sample them on the parent. + mRapl = XRE_IsParentProcess() ? new RAPL(mCounters) : nullptr; +} + +PowerCounters::~PowerCounters() { + mCounters.clear(); + delete mRapl; + mRapl = nullptr; +} + +void PowerCounters::Sample() { + if (mRapl) { + mRapl->Sample(); + } +} diff --git a/tools/profiler/core/PowerCounters-mac-arm64.cpp b/tools/profiler/core/PowerCounters-mac-arm64.cpp new file mode 100644 index 0000000000..3a84a479ef --- /dev/null +++ b/tools/profiler/core/PowerCounters-mac-arm64.cpp @@ -0,0 +1,47 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "PowerCounters.h" + +#include + +class ProcessPower final : public BaseProfilerCount { + public: + ProcessPower() + : BaseProfilerCount("Process Power", nullptr, nullptr, "power", + "Power utilization") {} + + CountSample Sample() override { + CountSample result; + result.count = GetTaskEnergy(); + result.number = 0; + result.isSampleNew = true; + return result; + } + + private: + int64_t GetTaskEnergy() { + task_power_info_v2_data_t task_power_info; + mach_msg_type_number_t count = TASK_POWER_INFO_V2_COUNT; + kern_return_t kr = task_info(mach_task_self(), TASK_POWER_INFO_V2, + (task_info_t)&task_power_info, &count); + if (kr != KERN_SUCCESS) { + return 0; + } + + // task_energy is in nanojoules. To be consistent with the Windows EMI + // API, return values in picowatt-hour. + return task_power_info.task_energy / 3.6; + } +}; + +PowerCounters::PowerCounters() : mProcessPower(new ProcessPower()) { + if (mProcessPower) { + (void)mCounters.append(mProcessPower.get()); + } +} + +PowerCounters::~PowerCounters() { mCounters.clear(); } + +void PowerCounters::Sample() {} diff --git a/tools/profiler/core/PowerCounters-win.cpp b/tools/profiler/core/PowerCounters-win.cpp new file mode 100644 index 0000000000..f1d05389b6 --- /dev/null +++ b/tools/profiler/core/PowerCounters-win.cpp @@ -0,0 +1,342 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "PowerCounters.h" +#include "nsXULAppAPI.h" // for XRE_IsParentProcess +#include "nsString.h" + +#include +#include +#include // for SetupDi* +// LogSeverity, defined by setupapi.h to DWORD, messes with other code. +#undef LogSeverity + +#undef NTDDI_VERSION +#define NTDDI_VERSION NTDDI_WINBLUE +#include + +#ifndef NTDDI_WIN10_RS5 +// EMI v2 API exists in SDK 10.0.17763 (Windows 10 1809 / Redstone 5) and later. +// Our build machines are still on SDK 10.0.17134. +// Remove this block when updating the SDK (bug 1774628). 
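+// The definitions below mirror the EMI v2 layout: the metadata header is
+// followed by a packed, variable-length array of EMI_CHANNEL_V2 records
+// (each ending in a variable-length ChannelName), which is why consumers
+// walk channels with EMI_CHANNEL_V2_NEXT_CHANNEL instead of indexing the
+// array directly.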
+typedef EMI_METADATA EMI_METADATA_V1; +typedef EMI_MEASUREMENT_DATA EMI_CHANNEL_MEASUREMENT_DATA; +# define EMI_VERSION_V2 2 + +typedef struct { + EMI_MEASUREMENT_UNIT MeasurementUnit; + USHORT ChannelNameSize; + WCHAR ChannelName[ANYSIZE_ARRAY]; +} EMI_CHANNEL_V2; + +typedef struct { + WCHAR HardwareOEM[EMI_NAME_MAX]; + WCHAR HardwareModel[EMI_NAME_MAX]; + USHORT HardwareRevision; + USHORT ChannelCount; + EMI_CHANNEL_V2 Channels[ANYSIZE_ARRAY]; +} EMI_METADATA_V2; + +# define EMI_CHANNEL_V2_LENGTH(_ChannelNameSize) \ + (FIELD_OFFSET(EMI_CHANNEL_V2, ChannelName) + (_ChannelNameSize)) + +# define EMI_CHANNEL_V2_NEXT_CHANNEL(_Channel) \ + ((EMI_CHANNEL_V2*)((PUCHAR)(_Channel) + \ + EMI_CHANNEL_V2_LENGTH((_Channel)->ChannelNameSize))) +#endif + +using namespace mozilla; + +// This is a counter to collect power utilization during profiling. +// It cannot be a raw `ProfilerCounter` because we need to manually add/remove +// it while the profiler lock is already held. +class PowerMeterChannel final : public BaseProfilerCount { + public: + explicit PowerMeterChannel(const WCHAR* aChannelName, ULONGLONG aInitialValue, + ULONGLONG aInitialTime) + : BaseProfilerCount(nullptr, nullptr, nullptr, "power", + "Power utilization"), + mChannelName(NS_ConvertUTF16toUTF8(aChannelName)), + mPreviousValue(aInitialValue), + mPreviousTime(aInitialTime), + mIsSampleNew(true) { + if (mChannelName.Equals("RAPL_Package0_PKG")) { + mLabel = "Power: CPU package"; + mDescription = mChannelName.get(); + } else if (mChannelName.Equals("RAPL_Package0_PP0")) { + mLabel = "Power: CPU cores"; + mDescription = mChannelName.get(); + } else if (mChannelName.Equals("RAPL_Package0_PP1")) { + mLabel = "Power: iGPU"; + mDescription = mChannelName.get(); + } else if (mChannelName.Equals("RAPL_Package0_DRAM")) { + mLabel = "Power: DRAM"; + mDescription = mChannelName.get(); + } else { + unsigned int coreId; + if (sscanf(mChannelName.get(), "RAPL_Package0_Core%u_CORE", &coreId) == + 1) { + mLabelString = "Power: CPU core "; + mLabelString.AppendInt(coreId); + mLabel = mLabelString.get(); + mDescription = mChannelName.get(); + } else { + mLabel = mChannelName.get(); + } + } + } + + CountSample Sample() override { + CountSample result; + result.count = mCounter; + result.number = 0; + result.isSampleNew = mIsSampleNew; + mIsSampleNew = false; + return result; + } + + void AddSample(ULONGLONG aAbsoluteEnergy, ULONGLONG aAbsoluteTime) { + // aAbsoluteTime is the time since the system start in 100ns increments. + if (aAbsoluteTime == mPreviousTime) { + return; + } + + if (aAbsoluteEnergy > mPreviousValue) { + int64_t increment = aAbsoluteEnergy - mPreviousValue; + mCounter += increment; + mPreviousValue += increment; + mPreviousTime = aAbsoluteTime; + } + + mIsSampleNew = true; + } + + private: + int64_t mCounter; + nsCString mChannelName; + + // Used as a storage when the label can not be a literal string. 
+ nsCString mLabelString; + + ULONGLONG mPreviousValue; + ULONGLONG mPreviousTime; + bool mIsSampleNew; +}; + +class PowerMeterDevice { + public: + explicit PowerMeterDevice(LPCTSTR aDevicePath) { + mHandle = ::CreateFile(aDevicePath, GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, nullptr, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (mHandle == INVALID_HANDLE_VALUE) { + return; + } + + EMI_VERSION version = {0}; + DWORD dwOut; + + if (!::DeviceIoControl(mHandle, IOCTL_EMI_GET_VERSION, nullptr, 0, &version, + sizeof(version), &dwOut, nullptr) || + (version.EmiVersion != EMI_VERSION_V1 && + version.EmiVersion != EMI_VERSION_V2)) { + return; + } + + EMI_METADATA_SIZE size = {0}; + if (!::DeviceIoControl(mHandle, IOCTL_EMI_GET_METADATA_SIZE, nullptr, 0, + &size, sizeof(size), &dwOut, nullptr) || + !size.MetadataSize) { + return; + } + + UniquePtr metadata(new (std::nothrow) + uint8_t[size.MetadataSize]); + if (!metadata) { + return; + } + + if (version.EmiVersion == EMI_VERSION_V2) { + EMI_METADATA_V2* metadata2 = + reinterpret_cast(metadata.get()); + if (!::DeviceIoControl(mHandle, IOCTL_EMI_GET_METADATA, nullptr, 0, + metadata2, size.MetadataSize, &dwOut, nullptr)) { + return; + } + + if (!mChannels.reserve(metadata2->ChannelCount)) { + return; + } + + mDataBuffer = + MakeUnique(metadata2->ChannelCount); + if (!mDataBuffer) { + return; + } + + if (!::DeviceIoControl( + mHandle, IOCTL_EMI_GET_MEASUREMENT, nullptr, 0, mDataBuffer.get(), + sizeof(EMI_CHANNEL_MEASUREMENT_DATA[metadata2->ChannelCount]), + &dwOut, nullptr)) { + return; + } + + EMI_CHANNEL_V2* channel = &metadata2->Channels[0]; + for (int i = 0; i < metadata2->ChannelCount; ++i) { + EMI_CHANNEL_MEASUREMENT_DATA* channel_data = &mDataBuffer[i]; + mChannels.infallibleAppend(new PowerMeterChannel( + channel->ChannelName, channel_data->AbsoluteEnergy, + channel_data->AbsoluteTime)); + channel = EMI_CHANNEL_V2_NEXT_CHANNEL(channel); + } + } else if (version.EmiVersion == EMI_VERSION_V1) { + EMI_METADATA_V1* metadata1 = + reinterpret_cast(metadata.get()); + if (!::DeviceIoControl(mHandle, IOCTL_EMI_GET_METADATA, nullptr, 0, + metadata1, size.MetadataSize, &dwOut, nullptr)) { + return; + } + + mDataBuffer = MakeUnique(1); + if (!mDataBuffer) { + return; + } + + if (!::DeviceIoControl( + mHandle, IOCTL_EMI_GET_MEASUREMENT, nullptr, 0, mDataBuffer.get(), + sizeof(EMI_CHANNEL_MEASUREMENT_DATA), &dwOut, nullptr)) { + return; + } + + (void)mChannels.append(new PowerMeterChannel( + metadata1->MeteredHardwareName, mDataBuffer[0].AbsoluteEnergy, + mDataBuffer[0].AbsoluteTime)); + } + } + + ~PowerMeterDevice() { + if (mHandle != INVALID_HANDLE_VALUE) { + ::CloseHandle(mHandle); + } + } + + void Sample() { + MOZ_ASSERT(HasChannels()); + MOZ_ASSERT(mDataBuffer); + + DWORD dwOut; + if (!::DeviceIoControl( + mHandle, IOCTL_EMI_GET_MEASUREMENT, nullptr, 0, mDataBuffer.get(), + sizeof(EMI_CHANNEL_MEASUREMENT_DATA[mChannels.length()]), &dwOut, + nullptr)) { + return; + } + + for (size_t i = 0; i < mChannels.length(); ++i) { + EMI_CHANNEL_MEASUREMENT_DATA* channel_data = &mDataBuffer[i]; + mChannels[i]->AddSample(channel_data->AbsoluteEnergy, + channel_data->AbsoluteTime); + } + } + + bool HasChannels() { return mChannels.length() != 0; } + void AppendCountersTo(PowerCounters::CountVector& aCounters) { + if (aCounters.reserve(aCounters.length() + mChannels.length())) { + for (auto& channel : mChannels) { + aCounters.infallibleAppend(channel.get()); + } + } + } + + private: + Vector, 4> mChannels; + HANDLE mHandle = INVALID_HANDLE_VALUE; + 
UniquePtr mDataBuffer; +}; + +PowerCounters::PowerCounters() { + class MOZ_STACK_CLASS HDevInfoHolder final { + public: + explicit HDevInfoHolder(HDEVINFO aHandle) : mHandle(aHandle) {} + + ~HDevInfoHolder() { ::SetupDiDestroyDeviceInfoList(mHandle); } + + private: + HDEVINFO mHandle; + }; + + if (!XRE_IsParentProcess()) { + // Energy meters are global, so only sample them on the parent. + return; + } + + // Energy Metering Device Interface + // {45BD8344-7ED6-49cf-A440-C276C933B053} + // + // Using GUID_DEVICE_ENERGY_METER does not compile as the symbol does not + // exist before Windows 10. + GUID my_GUID_DEVICE_ENERGY_METER = { + 0x45bd8344, + 0x7ed6, + 0x49cf, + {0xa4, 0x40, 0xc2, 0x76, 0xc9, 0x33, 0xb0, 0x53}}; + + HDEVINFO hdev = + ::SetupDiGetClassDevs(&my_GUID_DEVICE_ENERGY_METER, nullptr, nullptr, + DIGCF_PRESENT | DIGCF_DEVICEINTERFACE); + if (hdev == INVALID_HANDLE_VALUE) { + return; + } + + HDevInfoHolder hdevHolder(hdev); + + DWORD i = 0; + SP_DEVICE_INTERFACE_DATA did = {0}; + did.cbSize = sizeof(did); + + while (::SetupDiEnumDeviceInterfaces( + hdev, nullptr, &my_GUID_DEVICE_ENERGY_METER, i++, &did)) { + DWORD bufferSize = 0; + ::SetupDiGetDeviceInterfaceDetail(hdev, &did, nullptr, 0, &bufferSize, + nullptr); + if (::GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + continue; + } + + UniquePtr buffer(new (std::nothrow) uint8_t[bufferSize]); + if (!buffer) { + continue; + } + + PSP_DEVICE_INTERFACE_DETAIL_DATA pdidd = + reinterpret_cast(buffer.get()); + MOZ_ASSERT(uintptr_t(buffer.get()) % + alignof(PSP_DEVICE_INTERFACE_DETAIL_DATA) == + 0); + pdidd->cbSize = sizeof(*pdidd); + if (!::SetupDiGetDeviceInterfaceDetail(hdev, &did, pdidd, bufferSize, + &bufferSize, nullptr)) { + continue; + } + + UniquePtr pmd = + MakeUnique(pdidd->DevicePath); + if (!pmd->HasChannels() || + !mPowerMeterDevices.emplaceBack(std::move(pmd))) { + NS_WARNING("PowerMeterDevice without measurement channel (or OOM)"); + } + } + + for (auto& device : mPowerMeterDevices) { + device->AppendCountersTo(mCounters); + } +} + +PowerCounters::~PowerCounters() { mCounters.clear(); } + +void PowerCounters::Sample() { + for (auto& device : mPowerMeterDevices) { + device->Sample(); + } +} diff --git a/tools/profiler/core/PowerCounters.h b/tools/profiler/core/PowerCounters.h new file mode 100644 index 0000000000..2fd8d5892c --- /dev/null +++ b/tools/profiler/core/PowerCounters.h @@ -0,0 +1,52 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef TOOLS_POWERCOUNTERS_H_ +#define TOOLS_POWERCOUNTERS_H_ + +#include "PlatformMacros.h" +#include "mozilla/ProfilerCounts.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Vector.h" + +#if defined(_MSC_VER) +class PowerMeterDevice; +#endif +#if defined(GP_PLAT_arm64_darwin) +class ProcessPower; +#endif +#if defined(GP_PLAT_amd64_darwin) +class RAPL; +#endif + +class PowerCounters { + public: +#if defined(_MSC_VER) || defined(GP_OS_darwin) || defined(GP_PLAT_amd64_linux) + explicit PowerCounters(); + ~PowerCounters(); + void Sample(); +#else + explicit PowerCounters(){}; + ~PowerCounters(){}; + void Sample(){}; +#endif + + using CountVector = mozilla::Vector; + const CountVector& GetCounters() { return mCounters; } + + private: + CountVector mCounters; + +#if defined(_MSC_VER) + mozilla::Vector> mPowerMeterDevices; +#endif +#if defined(GP_PLAT_arm64_darwin) + mozilla::UniquePtr mProcessPower; +#endif +#if defined(GP_PLAT_amd64_darwin) + RAPL* mRapl; +#endif +}; + +#endif /* ndef TOOLS_POWERCOUNTERS_H_ */ diff --git a/tools/profiler/core/ProfileAdditionalInformation.cpp b/tools/profiler/core/ProfileAdditionalInformation.cpp new file mode 100644 index 0000000000..ba3cd80e7c --- /dev/null +++ b/tools/profiler/core/ProfileAdditionalInformation.cpp @@ -0,0 +1,102 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "ProfileAdditionalInformation.h" + +#include "jsapi.h" +#include "js/JSON.h" +#include "js/PropertyAndElement.h" +#include "js/Value.h" +#include "mozilla/JSONStringWriteFuncs.h" +#include "mozilla/ipc/IPDLParamTraits.h" + +#ifdef MOZ_GECKO_PROFILER +# include "platform.h" + +void mozilla::ProfileGenerationAdditionalInformation::ToJSValue( + JSContext* aCx, JS::MutableHandle aRetVal) const { + // Get the shared libraries array. 
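+  // The library list is first rendered to a JSON string (via
+  // AppendSharedLibraries below) and then parsed back with JS_ParseJSON to
+  // obtain a JS value, rather than building the array property by property.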
+ JS::Rooted sharedLibrariesVal(aCx); + { + JSONStringWriteFunc buffer; + JSONWriter w(buffer, JSONWriter::SingleLineStyle); + w.StartArrayElement(); + AppendSharedLibraries(w, mSharedLibraries); + w.EndArray(); + NS_ConvertUTF8toUTF16 buffer16(buffer.StringCRef()); + MOZ_ALWAYS_TRUE(JS_ParseJSON(aCx, + static_cast(buffer16.get()), + buffer16.Length(), &sharedLibrariesVal)); + } + + JS::Rooted additionalInfoObj(aCx, JS_NewPlainObject(aCx)); + JS_SetProperty(aCx, additionalInfoObj, "sharedLibraries", sharedLibrariesVal); + aRetVal.setObject(*additionalInfoObj); +} +#endif // MOZ_GECKO_PROFILER + +namespace IPC { + +#ifdef MOZ_GECKO_PROFILER +void IPC::ParamTraits::Write(MessageWriter* aWriter, + const paramType& aParam) { + WriteParam(aWriter, aParam.mStart); + WriteParam(aWriter, aParam.mEnd); + WriteParam(aWriter, aParam.mOffset); + WriteParam(aWriter, aParam.mBreakpadId); + WriteParam(aWriter, aParam.mCodeId); + WriteParam(aWriter, aParam.mModuleName); + WriteParam(aWriter, aParam.mModulePath); + WriteParam(aWriter, aParam.mDebugName); + WriteParam(aWriter, aParam.mDebugPath); + WriteParam(aWriter, aParam.mVersion); + WriteParam(aWriter, aParam.mArch); +} + +bool IPC::ParamTraits::Read(MessageReader* aReader, + paramType* aResult) { + return ReadParam(aReader, &aResult->mStart) && + ReadParam(aReader, &aResult->mEnd) && + ReadParam(aReader, &aResult->mOffset) && + ReadParam(aReader, &aResult->mBreakpadId) && + ReadParam(aReader, &aResult->mCodeId) && + ReadParam(aReader, &aResult->mModuleName) && + ReadParam(aReader, &aResult->mModulePath) && + ReadParam(aReader, &aResult->mDebugName) && + ReadParam(aReader, &aResult->mDebugPath) && + ReadParam(aReader, &aResult->mVersion) && + ReadParam(aReader, &aResult->mArch); +} + +void IPC::ParamTraits::Write(MessageWriter* aWriter, + const paramType& aParam) { + paramType& p = const_cast(aParam); + WriteParam(aWriter, p.mEntries); +} + +bool IPC::ParamTraits::Read(MessageReader* aReader, + paramType* aResult) { + return ReadParam(aReader, &aResult->mEntries); +} +#endif // MOZ_GECKO_PROFILER + +void IPC::ParamTraits::Write( + MessageWriter* aWriter, const paramType& aParam) { +#ifdef MOZ_GECKO_PROFILER + WriteParam(aWriter, aParam.mSharedLibraries); +#endif // MOZ_GECKO_PROFILER +} + +bool IPC::ParamTraits::Read( + MessageReader* aReader, paramType* aResult) { +#ifdef MOZ_GECKO_PROFILER + return ReadParam(aReader, &aResult->mSharedLibraries); +#else + return true; +#endif // MOZ_GECKO_PROFILER +} + +} // namespace IPC diff --git a/tools/profiler/core/ProfileBuffer.cpp b/tools/profiler/core/ProfileBuffer.cpp new file mode 100644 index 0000000000..170a4f14b4 --- /dev/null +++ b/tools/profiler/core/ProfileBuffer.cpp @@ -0,0 +1,243 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "ProfileBuffer.h" + +#include "BaseProfiler.h" +#include "js/GCAPI.h" +#include "jsfriendapi.h" +#include "mozilla/MathAlgorithms.h" +#include "nsJSPrincipals.h" +#include "nsScriptSecurityManager.h" + +using namespace mozilla; + +ProfileBuffer::ProfileBuffer(ProfileChunkedBuffer& aBuffer) + : mEntries(aBuffer) { + // Assume the given buffer is in-session. 
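+  // (Roughly speaking, "in-session" means the underlying ProfileChunkedBuffer
+  // currently has chunk storage attached and can accept entries.)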
+ MOZ_ASSERT(mEntries.IsInSession()); +} + +/* static */ +ProfileBufferBlockIndex ProfileBuffer::AddEntry( + ProfileChunkedBuffer& aProfileChunkedBuffer, + const ProfileBufferEntry& aEntry) { + switch (aEntry.GetKind()) { +#define SWITCH_KIND(KIND, TYPE, SIZE) \ + case ProfileBufferEntry::Kind::KIND: { \ + return aProfileChunkedBuffer.PutFrom(&aEntry, 1 + (SIZE)); \ + } + + FOR_EACH_PROFILE_BUFFER_ENTRY_KIND(SWITCH_KIND) + +#undef SWITCH_KIND + default: + MOZ_ASSERT(false, "Unhandled ProfilerBuffer entry KIND"); + return ProfileBufferBlockIndex{}; + } +} + +// Called from signal, call only reentrant functions +uint64_t ProfileBuffer::AddEntry(const ProfileBufferEntry& aEntry) { + return AddEntry(mEntries, aEntry).ConvertToProfileBufferIndex(); +} + +/* static */ +ProfileBufferBlockIndex ProfileBuffer::AddThreadIdEntry( + ProfileChunkedBuffer& aProfileChunkedBuffer, ProfilerThreadId aThreadId) { + return AddEntry(aProfileChunkedBuffer, + ProfileBufferEntry::ThreadId(aThreadId)); +} + +uint64_t ProfileBuffer::AddThreadIdEntry(ProfilerThreadId aThreadId) { + return AddThreadIdEntry(mEntries, aThreadId).ConvertToProfileBufferIndex(); +} + +void ProfileBuffer::CollectCodeLocation( + const char* aLabel, const char* aStr, uint32_t aFrameFlags, + uint64_t aInnerWindowID, const Maybe& aLineNumber, + const Maybe& aColumnNumber, + const Maybe& aCategoryPair) { + AddEntry(ProfileBufferEntry::Label(aLabel)); + AddEntry(ProfileBufferEntry::FrameFlags(uint64_t(aFrameFlags))); + + if (aStr) { + // Store the string using one or more DynamicStringFragment entries. + size_t strLen = strlen(aStr) + 1; // +1 for the null terminator + // If larger than the prescribed limit, we will cut the string and end it + // with an ellipsis. + const bool tooBig = strLen > kMaxFrameKeyLength; + if (tooBig) { + strLen = kMaxFrameKeyLength; + } + char chars[ProfileBufferEntry::kNumChars]; + for (size_t j = 0;; j += ProfileBufferEntry::kNumChars) { + // Store up to kNumChars characters in the entry. + size_t len = ProfileBufferEntry::kNumChars; + const bool last = j + len >= strLen; + if (last) { + // Only the last entry may be smaller than kNumChars. + len = strLen - j; + if (tooBig) { + // That last entry is part of a too-big string, replace the end + // characters with an ellipsis "...". + len = std::max(len, size_t(4)); + chars[len - 4] = '.'; + chars[len - 3] = '.'; + chars[len - 2] = '.'; + chars[len - 1] = '\0'; + // Make sure the memcpy will not overwrite our ellipsis! 
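+          // The '.' '.' '.' '\0' just written occupy positions len-4..len-1
+          // of this fragment; shrinking len by 4 makes the memcpy below copy
+          // only the characters that sit in front of the ellipsis.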
+ len -= 4; + } + } + memcpy(chars, &aStr[j], len); + AddEntry(ProfileBufferEntry::DynamicStringFragment(chars)); + if (last) { + break; + } + } + } + + if (aInnerWindowID) { + AddEntry(ProfileBufferEntry::InnerWindowID(aInnerWindowID)); + } + + if (aLineNumber) { + AddEntry(ProfileBufferEntry::LineNumber(*aLineNumber)); + } + + if (aColumnNumber) { + AddEntry(ProfileBufferEntry::ColumnNumber(*aColumnNumber)); + } + + if (aCategoryPair.isSome()) { + AddEntry(ProfileBufferEntry::CategoryPair(int(*aCategoryPair))); + } +} + +size_t ProfileBuffer::SizeOfExcludingThis(MallocSizeOf aMallocSizeOf) const { + // Measurement of the following members may be added later if DMD finds it + // is worthwhile: + // - memory pointed to by the elements within mEntries + return mEntries.SizeOfExcludingThis(aMallocSizeOf); +} + +size_t ProfileBuffer::SizeOfIncludingThis(MallocSizeOf aMallocSizeOf) const { + return aMallocSizeOf(this) + SizeOfExcludingThis(aMallocSizeOf); +} + +void ProfileBuffer::CollectOverheadStats(double aSamplingTimeMs, + TimeDuration aLocking, + TimeDuration aCleaning, + TimeDuration aCounters, + TimeDuration aThreads) { + double timeUs = aSamplingTimeMs * 1000.0; + if (mFirstSamplingTimeUs == 0.0) { + mFirstSamplingTimeUs = timeUs; + } else { + // Note that we'll have 1 fewer interval than other numbers (because + // we need both ends of an interval to know its duration). The final + // difference should be insignificant over the expected many thousands + // of iterations. + mIntervalsUs.Count(timeUs - mLastSamplingTimeUs); + } + mLastSamplingTimeUs = timeUs; + double locking = aLocking.ToMilliseconds() * 1000.0; + double cleaning = aCleaning.ToMilliseconds() * 1000.0; + double counters = aCounters.ToMilliseconds() * 1000.0; + double threads = aThreads.ToMilliseconds() * 1000.0; + + mOverheadsUs.Count(locking + cleaning + counters + threads); + mLockingsUs.Count(locking); + mCleaningsUs.Count(cleaning); + mCountersUs.Count(counters); + mThreadsUs.Count(threads); + + static const bool sRecordSamplingOverhead = []() { + const char* recordOverheads = getenv("MOZ_PROFILER_RECORD_OVERHEADS"); + return recordOverheads && recordOverheads[0] != '\0'; + }(); + if (sRecordSamplingOverhead) { + AddEntry(ProfileBufferEntry::ProfilerOverheadTime(aSamplingTimeMs)); + AddEntry(ProfileBufferEntry::ProfilerOverheadDuration(locking)); + AddEntry(ProfileBufferEntry::ProfilerOverheadDuration(cleaning)); + AddEntry(ProfileBufferEntry::ProfilerOverheadDuration(counters)); + AddEntry(ProfileBufferEntry::ProfilerOverheadDuration(threads)); + } +} + +ProfilerBufferInfo ProfileBuffer::GetProfilerBufferInfo() const { + return {BufferRangeStart(), + BufferRangeEnd(), + static_cast(*mEntries.BufferLength() / + 8), // 8 bytes per entry. + mIntervalsUs, + mOverheadsUs, + mLockingsUs, + mCleaningsUs, + mCountersUs, + mThreadsUs}; +} + +/* ProfileBufferCollector */ + +void ProfileBufferCollector::CollectNativeLeafAddr(void* aAddr) { + mBuf.AddEntry(ProfileBufferEntry::NativeLeafAddr(aAddr)); +} + +void ProfileBufferCollector::CollectJitReturnAddr(void* aAddr) { + mBuf.AddEntry(ProfileBufferEntry::JitReturnAddr(aAddr)); +} + +void ProfileBufferCollector::CollectWasmFrame(const char* aLabel) { + mBuf.CollectCodeLocation("", aLabel, 0, 0, Nothing(), Nothing(), + Some(JS::ProfilingCategoryPair::JS_Wasm)); +} + +void ProfileBufferCollector::CollectProfilingStackFrame( + const js::ProfilingStackFrame& aFrame) { + // WARNING: this function runs within the profiler's "critical section". 
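+  // ("Critical section" here means the stack being read may belong to a
+  // thread that is suspended or interrupted mid-execution, so this code must
+  // not allocate or take locks it could deadlock on; this is why
+  // ProfileBuffer never allocates after construction, as noted in
+  // ProfileBuffer.h.)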
+ + MOZ_ASSERT(aFrame.isLabelFrame() || + (aFrame.isJsFrame() && !aFrame.isOSRFrame())); + + const char* label = aFrame.label(); + const char* dynamicString = aFrame.dynamicString(); + Maybe line; + Maybe column; + + if (aFrame.isJsFrame()) { + // There are two kinds of JS frames that get pushed onto the ProfilingStack. + // + // - label = "", dynamic string = + // - label = "js::RunScript", dynamic string = nullptr + // + // The line number is only interesting in the first case. + + if (label[0] == '\0') { + MOZ_ASSERT(dynamicString); + + // We call aFrame.script() repeatedly -- rather than storing the result in + // a local variable in order -- to avoid rooting hazards. + if (aFrame.script()) { + if (aFrame.pc()) { + unsigned col = 0; + line = Some(JS_PCToLineNumber(aFrame.script(), aFrame.pc(), &col)); + column = Some(col); + } + } + + } else { + MOZ_ASSERT(strcmp(label, "js::RunScript") == 0 && !dynamicString); + } + } else { + MOZ_ASSERT(aFrame.isLabelFrame()); + } + + mBuf.CollectCodeLocation(label, dynamicString, aFrame.flags(), + aFrame.realmID(), line, column, + Some(aFrame.categoryPair())); +} diff --git a/tools/profiler/core/ProfileBuffer.h b/tools/profiler/core/ProfileBuffer.h new file mode 100644 index 0000000000..5da34909cc --- /dev/null +++ b/tools/profiler/core/ProfileBuffer.h @@ -0,0 +1,260 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef MOZ_PROFILE_BUFFER_H +#define MOZ_PROFILE_BUFFER_H + +#include "GeckoProfiler.h" +#include "ProfileBufferEntry.h" + +#include "mozilla/Maybe.h" +#include "mozilla/PowerOfTwo.h" +#include "mozilla/ProfileBufferChunkManagerSingle.h" +#include "mozilla/ProfileChunkedBuffer.h" + +class ProcessStreamingContext; +class RunningTimes; + +// Class storing most profiling data in a ProfileChunkedBuffer. +// +// This class is used as a queue of entries which, after construction, never +// allocates. This makes it safe to use in the profiler's "critical section". +class ProfileBuffer final { + public: + // ProfileBuffer constructor + // @param aBuffer The in-session ProfileChunkedBuffer to use as buffer + // manager. + explicit ProfileBuffer(mozilla::ProfileChunkedBuffer& aBuffer); + + mozilla::ProfileChunkedBuffer& UnderlyingChunkedBuffer() const { + return mEntries; + } + + bool IsThreadSafe() const { return mEntries.IsThreadSafe(); } + + // Add |aEntry| to the buffer, ignoring what kind of entry it is. + uint64_t AddEntry(const ProfileBufferEntry& aEntry); + + // Add to the buffer a sample start (ThreadId) entry for aThreadId. + // Returns the position of the entry. + uint64_t AddThreadIdEntry(ProfilerThreadId aThreadId); + + void CollectCodeLocation( + const char* aLabel, const char* aStr, uint32_t aFrameFlags, + uint64_t aInnerWindowID, const mozilla::Maybe& aLineNumber, + const mozilla::Maybe& aColumnNumber, + const mozilla::Maybe& aCategoryPair); + + // Maximum size of a frameKey string that we'll handle. + static const size_t kMaxFrameKeyLength = 512; + + // Add JIT frame information to aJITFrameInfo for any JitReturnAddr entries + // that are currently in the buffer at or after aRangeStart, in samples + // for the given thread. 
+ void AddJITInfoForRange(uint64_t aRangeStart, ProfilerThreadId aThreadId, + JSContext* aContext, JITFrameInfo& aJITFrameInfo, + mozilla::ProgressLogger aProgressLogger) const; + + // Stream JSON for samples in the buffer to aWriter, using the supplied + // UniqueStacks object. + // Only streams samples for the given thread ID and which were taken at or + // after aSinceTime. If ID is 0, ignore the stored thread ID; this should only + // be used when the buffer contains only one sample. + // aUniqueStacks needs to contain information about any JIT frames that we + // might encounter in the buffer, before this method is called. In other + // words, you need to have called AddJITInfoForRange for every range that + // might contain JIT frame information before calling this method. + // Return the thread ID of the streamed sample(s), or 0. + ProfilerThreadId StreamSamplesToJSON( + SpliceableJSONWriter& aWriter, ProfilerThreadId aThreadId, + double aSinceTime, UniqueStacks& aUniqueStacks, + mozilla::ProgressLogger aProgressLogger) const; + + void StreamMarkersToJSON(SpliceableJSONWriter& aWriter, + ProfilerThreadId aThreadId, + const mozilla::TimeStamp& aProcessStartTime, + double aSinceTime, UniqueStacks& aUniqueStacks, + mozilla::ProgressLogger aProgressLogger) const; + + // Stream samples and markers from all threads that `aProcessStreamingContext` + // accepts. + void StreamSamplesAndMarkersToJSON( + ProcessStreamingContext& aProcessStreamingContext, + mozilla::ProgressLogger aProgressLogger) const; + + void StreamPausedRangesToJSON(SpliceableJSONWriter& aWriter, + double aSinceTime, + mozilla::ProgressLogger aProgressLogger) const; + void StreamProfilerOverheadToJSON( + SpliceableJSONWriter& aWriter, + const mozilla::TimeStamp& aProcessStartTime, double aSinceTime, + mozilla::ProgressLogger aProgressLogger) const; + void StreamCountersToJSON(SpliceableJSONWriter& aWriter, + const mozilla::TimeStamp& aProcessStartTime, + double aSinceTime, + mozilla::ProgressLogger aProgressLogger) const; + + // Find (via |aLastSample|) the most recent sample for the thread denoted by + // |aThreadId| and clone it, patching in the current time as appropriate. + // Mutate |aLastSample| to point to the newly inserted sample. + // Returns whether duplication was successful. + bool DuplicateLastSample(ProfilerThreadId aThreadId, double aSampleTimeMs, + mozilla::Maybe& aLastSample, + const RunningTimes& aRunningTimes); + + void DiscardSamplesBeforeTime(double aTime); + + // Read an entry in the buffer. + ProfileBufferEntry GetEntry(uint64_t aPosition) const { + return mEntries.ReadAt( + mozilla::ProfileBufferBlockIndex::CreateFromProfileBufferIndex( + aPosition), + [&](mozilla::Maybe&& aMER) { + ProfileBufferEntry entry; + if (aMER.isSome()) { + if (aMER->CurrentBlockIndex().ConvertToProfileBufferIndex() == + aPosition) { + // If we're here, it means `aPosition` pointed at a valid block. + MOZ_RELEASE_ASSERT(aMER->RemainingBytes() <= sizeof(entry)); + aMER->ReadBytes(&entry, aMER->RemainingBytes()); + } else { + // EntryReader at the wrong position, pretend to have read + // everything. 
+ aMER->SetRemainingBytes(0); + } + } + return entry; + }); + } + + size_t SizeOfExcludingThis(mozilla::MallocSizeOf aMallocSizeOf) const; + size_t SizeOfIncludingThis(mozilla::MallocSizeOf aMallocSizeOf) const; + + void CollectOverheadStats(double aSamplingTimeMs, + mozilla::TimeDuration aLocking, + mozilla::TimeDuration aCleaning, + mozilla::TimeDuration aCounters, + mozilla::TimeDuration aThreads); + + ProfilerBufferInfo GetProfilerBufferInfo() const; + + private: + // Add |aEntry| to the provided ProfileChunkedBuffer. + // `static` because it may be used to add an entry to a `ProfileChunkedBuffer` + // that is not attached to a `ProfileBuffer`. + static mozilla::ProfileBufferBlockIndex AddEntry( + mozilla::ProfileChunkedBuffer& aProfileChunkedBuffer, + const ProfileBufferEntry& aEntry); + + // Add a sample start (ThreadId) entry for aThreadId to the provided + // ProfileChunkedBuffer. Returns the position of the entry. + // `static` because it may be used to add an entry to a `ProfileChunkedBuffer` + // that is not attached to a `ProfileBuffer`. + static mozilla::ProfileBufferBlockIndex AddThreadIdEntry( + mozilla::ProfileChunkedBuffer& aProfileChunkedBuffer, + ProfilerThreadId aThreadId); + + // The storage in which this ProfileBuffer stores its entries. + mozilla::ProfileChunkedBuffer& mEntries; + + public: + // `BufferRangeStart()` and `BufferRangeEnd()` return `uint64_t` values + // corresponding to the first entry and past the last entry stored in + // `mEntries`. + // + // The returned values are not guaranteed to be stable, because other threads + // may also be accessing the buffer concurrently. But they will always + // increase, and can therefore give an indication of how far these values have + // *at least* reached. In particular: + // - Entries whose index is strictly less that `BufferRangeStart()` have been + // discarded by now, so any related data may also be safely discarded. + // - It is safe to try and read entries at any index strictly less than + // `BufferRangeEnd()` -- but note that these reads may fail by the time you + // request them, as old entries get overwritten by new ones. + uint64_t BufferRangeStart() const { return mEntries.GetState().mRangeStart; } + uint64_t BufferRangeEnd() const { return mEntries.GetState().mRangeEnd; } + + private: + // Single pre-allocated chunk (to avoid spurious mallocs), used when: + // - Duplicating sleeping stacks (hence scExpectedMaximumStackSize). + // - Adding JIT info. + // - Streaming stacks to JSON. + // Mutable because it's accessed from non-multithreaded const methods. + mutable mozilla::Maybe + mMaybeWorkerChunkManager; + mozilla::ProfileBufferChunkManagerSingle& WorkerChunkManager() const { + if (mMaybeWorkerChunkManager.isNothing()) { + // Only actually allocate it on first use. (Some ProfileBuffers are + // temporary and don't actually need this.) 
+ mMaybeWorkerChunkManager.emplace( + mozilla::ProfileBufferChunk::SizeofChunkMetadata() + + mozilla::ProfileBufferChunkManager::scExpectedMaximumStackSize); + } + return *mMaybeWorkerChunkManager; + } + + // GetStreamingParametersForThreadCallback: + // (ProfilerThreadId) -> Maybe + template + ProfilerThreadId DoStreamSamplesAndMarkersToJSON( + mozilla::FailureLatch& aFailureLatch, + GetStreamingParametersForThreadCallback&& + aGetStreamingParametersForThreadCallback, + double aSinceTime, ProcessStreamingContext* aStreamingContextForMarkers, + mozilla::ProgressLogger aProgressLogger) const; + + double mFirstSamplingTimeUs = 0.0; + double mLastSamplingTimeUs = 0.0; + ProfilerStats mIntervalsUs; + ProfilerStats mOverheadsUs; + ProfilerStats mLockingsUs; + ProfilerStats mCleaningsUs; + ProfilerStats mCountersUs; + ProfilerStats mThreadsUs; +}; + +/** + * Helper type used to implement ProfilerStackCollector. This type is used as + * the collector for MergeStacks by ProfileBuffer. It holds a reference to the + * buffer, as well as additional feature flags which are needed to control the + * data collection strategy + */ +class ProfileBufferCollector final : public ProfilerStackCollector { + public: + ProfileBufferCollector(ProfileBuffer& aBuf, uint64_t aSamplePos, + uint64_t aBufferRangeStart) + : mBuf(aBuf), + mSamplePositionInBuffer(aSamplePos), + mBufferRangeStart(aBufferRangeStart) { + MOZ_ASSERT( + mSamplePositionInBuffer >= mBufferRangeStart, + "The sample position should always be after the buffer range start"); + } + + // Position at which the sample starts in the profiler buffer (which may be + // different from the buffer in which the sample data is collected here). + mozilla::Maybe SamplePositionInBuffer() override { + return mozilla::Some(mSamplePositionInBuffer); + } + + // Profiler buffer's range start (which may be different from the buffer in + // which the sample data is collected here). + mozilla::Maybe BufferRangeStart() override { + return mozilla::Some(mBufferRangeStart); + } + + virtual void CollectNativeLeafAddr(void* aAddr) override; + virtual void CollectJitReturnAddr(void* aAddr) override; + virtual void CollectWasmFrame(const char* aLabel) override; + virtual void CollectProfilingStackFrame( + const js::ProfilingStackFrame& aFrame) override; + + private: + ProfileBuffer& mBuf; + uint64_t mSamplePositionInBuffer; + uint64_t mBufferRangeStart; +}; + +#endif diff --git a/tools/profiler/core/ProfileBufferEntry.cpp b/tools/profiler/core/ProfileBufferEntry.cpp new file mode 100644 index 0000000000..5429eac0b8 --- /dev/null +++ b/tools/profiler/core/ProfileBufferEntry.cpp @@ -0,0 +1,2321 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "ProfileBufferEntry.h" + +#include "mozilla/ProfilerMarkers.h" +#include "platform.h" +#include "ProfileBuffer.h" +#include "ProfiledThreadData.h" +#include "ProfilerBacktrace.h" +#include "ProfilerRustBindings.h" + +#include "js/ProfilingFrameIterator.h" +#include "jsapi.h" +#include "jsfriendapi.h" +#include "mozilla/Logging.h" +#include "mozilla/JSONStringWriteFuncs.h" +#include "mozilla/ScopeExit.h" +#include "mozilla/Sprintf.h" +#include "mozilla/StackWalk.h" +#include "nsThreadUtils.h" +#include "nsXULAppAPI.h" +#include "ProfilerCodeAddressService.h" + +#include +#include + +using namespace mozilla; +using namespace mozilla::literals::ProportionValue_literals; + +//////////////////////////////////////////////////////////////////////// +// BEGIN ProfileBufferEntry + +ProfileBufferEntry::ProfileBufferEntry() + : mKind(Kind::INVALID), mStorage{0, 0, 0, 0, 0, 0, 0, 0} {} + +// aString must be a static string. +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, const char* aString) + : mKind(aKind) { + MOZ_ASSERT(aKind == Kind::Label); + memcpy(mStorage, &aString, sizeof(aString)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, char aChars[kNumChars]) + : mKind(aKind) { + MOZ_ASSERT(aKind == Kind::DynamicStringFragment); + memcpy(mStorage, aChars, kNumChars); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, void* aPtr) : mKind(aKind) { + memcpy(mStorage, &aPtr, sizeof(aPtr)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, double aDouble) + : mKind(aKind) { + memcpy(mStorage, &aDouble, sizeof(aDouble)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, int aInt) : mKind(aKind) { + memcpy(mStorage, &aInt, sizeof(aInt)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, int64_t aInt64) + : mKind(aKind) { + memcpy(mStorage, &aInt64, sizeof(aInt64)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, uint64_t aUint64) + : mKind(aKind) { + memcpy(mStorage, &aUint64, sizeof(aUint64)); +} + +ProfileBufferEntry::ProfileBufferEntry(Kind aKind, ProfilerThreadId aThreadId) + : mKind(aKind) { + static_assert(std::is_trivially_copyable_v); + static_assert(sizeof(aThreadId) <= sizeof(mStorage)); + memcpy(mStorage, &aThreadId, sizeof(aThreadId)); +} + +const char* ProfileBufferEntry::GetString() const { + const char* result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +void* ProfileBufferEntry::GetPtr() const { + void* result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +double ProfileBufferEntry::GetDouble() const { + double result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +int ProfileBufferEntry::GetInt() const { + int result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +int64_t ProfileBufferEntry::GetInt64() const { + int64_t result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +uint64_t ProfileBufferEntry::GetUint64() const { + uint64_t result; + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +ProfilerThreadId ProfileBufferEntry::GetThreadId() const { + ProfilerThreadId result; + static_assert(std::is_trivially_copyable_v); + memcpy(&result, mStorage, sizeof(result)); + return result; +} + +void ProfileBufferEntry::CopyCharsInto(char (&aOutArray)[kNumChars]) const { + memcpy(aOutArray, mStorage, kNumChars); +} + +// END ProfileBufferEntry +//////////////////////////////////////////////////////////////////////// + +struct TypeInfo { + Maybe mKeyedBy; + Maybe mName; + Maybe mLocation; + Maybe mLineNumber; 
+}; + +// As mentioned in ProfileBufferEntry.h, the JSON format contains many +// arrays whose elements are laid out according to various schemas to help +// de-duplication. This RAII class helps write these arrays by keeping track of +// the last non-null element written and adding the appropriate number of null +// elements when writing new non-null elements. It also automatically opens and +// closes an array element on the given JSON writer. +// +// You grant the AutoArraySchemaWriter exclusive access to the JSONWriter and +// the UniqueJSONStrings objects for the lifetime of AutoArraySchemaWriter. Do +// not access them independently while the AutoArraySchemaWriter is alive. +// If you need to add complex objects, call FreeFormElement(), which will give +// you temporary access to the writer. +// +// Example usage: +// +// // Define the schema of elements in this type of array: [FOO, BAR, BAZ] +// enum Schema : uint32_t { +// FOO = 0, +// BAR = 1, +// BAZ = 2 +// }; +// +// AutoArraySchemaWriter writer(someJsonWriter, someUniqueStrings); +// if (shouldWriteFoo) { +// writer.IntElement(FOO, getFoo()); +// } +// ... etc ... +// +// The elements need to be added in-order. +class MOZ_RAII AutoArraySchemaWriter { + public: + explicit AutoArraySchemaWriter(SpliceableJSONWriter& aWriter) + : mJSONWriter(aWriter), mNextFreeIndex(0) { + mJSONWriter.StartArrayElement(); + } + + ~AutoArraySchemaWriter() { mJSONWriter.EndArray(); } + + template + void IntElement(uint32_t aIndex, T aValue) { + static_assert(!std::is_same_v, + "Narrowing uint64 -> int64 conversion not allowed"); + FillUpTo(aIndex); + mJSONWriter.IntElement(static_cast(aValue)); + } + + void DoubleElement(uint32_t aIndex, double aValue) { + FillUpTo(aIndex); + mJSONWriter.DoubleElement(aValue); + } + + void TimeMsElement(uint32_t aIndex, double aTime_ms) { + FillUpTo(aIndex); + mJSONWriter.TimeDoubleMsElement(aTime_ms); + } + + void BoolElement(uint32_t aIndex, bool aValue) { + FillUpTo(aIndex); + mJSONWriter.BoolElement(aValue); + } + + protected: + SpliceableJSONWriter& Writer() { return mJSONWriter; } + + void FillUpTo(uint32_t aIndex) { + MOZ_ASSERT(aIndex >= mNextFreeIndex); + mJSONWriter.NullElements(aIndex - mNextFreeIndex); + mNextFreeIndex = aIndex + 1; + } + + private: + SpliceableJSONWriter& mJSONWriter; + uint32_t mNextFreeIndex; +}; + +// Same as AutoArraySchemaWriter, but this can also write strings (output as +// indexes into the table of unique strings). 
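+// Skipped slots are still filled with nulls: writing element 0 and then
+// element 4 produces [v0, null, null, null, v4], and each string element is
+// written as an index into aStrings' table of unique strings.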
+class MOZ_RAII AutoArraySchemaWithStringsWriter : public AutoArraySchemaWriter { + public: + AutoArraySchemaWithStringsWriter(SpliceableJSONWriter& aWriter, + UniqueJSONStrings& aStrings) + : AutoArraySchemaWriter(aWriter), mStrings(aStrings) {} + + void StringElement(uint32_t aIndex, const Span& aValue) { + FillUpTo(aIndex); + mStrings.WriteElement(Writer(), aValue); + } + + private: + UniqueJSONStrings& mStrings; +}; + +Maybe UniqueStacks::BeginStack(const FrameKey& aFrame) { + if (Maybe frameIndex = GetOrAddFrameIndex(aFrame); frameIndex) { + return Some(StackKey(*frameIndex)); + } + return Nothing{}; +} + +Vector&& +JITFrameInfo::MoveRangesWithNewFailureLatch(FailureLatch& aFailureLatch) && { + aFailureLatch.SetFailureFrom(mLocalFailureLatchSource); + return std::move(mRanges); +} + +UniquePtr&& +JITFrameInfo::MoveUniqueStringsWithNewFailureLatch( + FailureLatch& aFailureLatch) && { + if (mUniqueStrings) { + mUniqueStrings->ChangeFailureLatchAndForwardState(aFailureLatch); + } else { + aFailureLatch.SetFailureFrom(mLocalFailureLatchSource); + } + return std::move(mUniqueStrings); +} + +Maybe UniqueStacks::AppendFrame( + const StackKey& aStack, const FrameKey& aFrame) { + if (Maybe stackIndex = GetOrAddStackIndex(aStack); stackIndex) { + if (Maybe frameIndex = GetOrAddFrameIndex(aFrame); frameIndex) { + return Some(StackKey(aStack, *stackIndex, *frameIndex)); + } + } + return Nothing{}; +} + +JITFrameInfoForBufferRange JITFrameInfoForBufferRange::Clone() const { + JITFrameInfoForBufferRange::JITAddressToJITFramesMap jitAddressToJITFramesMap; + MOZ_RELEASE_ASSERT( + jitAddressToJITFramesMap.reserve(mJITAddressToJITFramesMap.count())); + for (auto iter = mJITAddressToJITFramesMap.iter(); !iter.done(); + iter.next()) { + const mozilla::Vector& srcKeys = iter.get().value(); + mozilla::Vector destKeys; + MOZ_RELEASE_ASSERT(destKeys.appendAll(srcKeys)); + jitAddressToJITFramesMap.putNewInfallible(iter.get().key(), + std::move(destKeys)); + } + + JITFrameInfoForBufferRange::JITFrameToFrameJSONMap jitFrameToFrameJSONMap; + MOZ_RELEASE_ASSERT( + jitFrameToFrameJSONMap.reserve(mJITFrameToFrameJSONMap.count())); + for (auto iter = mJITFrameToFrameJSONMap.iter(); !iter.done(); iter.next()) { + jitFrameToFrameJSONMap.putNewInfallible(iter.get().key(), + iter.get().value()); + } + + return JITFrameInfoForBufferRange{mRangeStart, mRangeEnd, + std::move(jitAddressToJITFramesMap), + std::move(jitFrameToFrameJSONMap)}; +} + +JITFrameInfo::JITFrameInfo(const JITFrameInfo& aOther, + mozilla::ProgressLogger aProgressLogger) + : mUniqueStrings(MakeUniqueFallible( + mLocalFailureLatchSource, *aOther.mUniqueStrings, + aProgressLogger.CreateSubLoggerFromTo( + 0_pc, "Creating JIT frame info unique strings...", 49_pc, + "Created JIT frame info unique strings"))) { + if (!mUniqueStrings) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo allocating mUniqueStrings"); + return; + } + + if (mRanges.reserve(aOther.mRanges.length())) { + for (auto&& [i, progressLogger] : + aProgressLogger.CreateLoopSubLoggersFromTo(50_pc, 100_pc, + aOther.mRanges.length(), + "Copying JIT frame info")) { + mRanges.infallibleAppend(aOther.mRanges[i].Clone()); + } + } else { + mLocalFailureLatchSource.SetFailure("OOM in JITFrameInfo resizing mRanges"); + } +} + +bool UniqueStacks::FrameKey::NormalFrameData::operator==( + const NormalFrameData& aOther) const { + return mLocation == aOther.mLocation && + mRelevantForJS == aOther.mRelevantForJS && + mBaselineInterp == aOther.mBaselineInterp && + mInnerWindowID == 
aOther.mInnerWindowID && mLine == aOther.mLine && + mColumn == aOther.mColumn && mCategoryPair == aOther.mCategoryPair; +} + +bool UniqueStacks::FrameKey::JITFrameData::operator==( + const JITFrameData& aOther) const { + return mCanonicalAddress == aOther.mCanonicalAddress && + mDepth == aOther.mDepth && mRangeIndex == aOther.mRangeIndex; +} + +// Consume aJITFrameInfo by stealing its string table and its JIT frame info +// ranges. The JIT frame info contains JSON which refers to strings from the +// JIT frame info's string table, so our string table needs to have the same +// strings at the same indices. +UniqueStacks::UniqueStacks( + FailureLatch& aFailureLatch, JITFrameInfo&& aJITFrameInfo, + ProfilerCodeAddressService* aCodeAddressService /* = nullptr */) + : mUniqueStrings(std::move(aJITFrameInfo) + .MoveUniqueStringsWithNewFailureLatch(aFailureLatch)), + mCodeAddressService(aCodeAddressService), + mFrameTableWriter(aFailureLatch), + mStackTableWriter(aFailureLatch), + mJITInfoRanges(std::move(aJITFrameInfo) + .MoveRangesWithNewFailureLatch(aFailureLatch)) { + if (!mUniqueStrings) { + SetFailure("Did not get mUniqueStrings from JITFrameInfo"); + return; + } + + mFrameTableWriter.StartBareList(); + mStackTableWriter.StartBareList(); +} + +Maybe UniqueStacks::GetOrAddStackIndex(const StackKey& aStack) { + if (Failed()) { + return Nothing{}; + } + + uint32_t count = mStackToIndexMap.count(); + auto entry = mStackToIndexMap.lookupForAdd(aStack); + if (entry) { + MOZ_ASSERT(entry->value() < count); + return Some(entry->value()); + } + + if (!mStackToIndexMap.add(entry, aStack, count)) { + SetFailure("OOM in UniqueStacks::GetOrAddStackIndex"); + return Nothing{}; + } + StreamStack(aStack); + return Some(count); +} + +Maybe> +UniqueStacks::LookupFramesForJITAddressFromBufferPos(void* aJITAddress, + uint64_t aBufferPos) { + JITFrameInfoForBufferRange* rangeIter = + std::lower_bound(mJITInfoRanges.begin(), mJITInfoRanges.end(), aBufferPos, + [](const JITFrameInfoForBufferRange& aRange, + uint64_t aPos) { return aRange.mRangeEnd < aPos; }); + MOZ_RELEASE_ASSERT( + rangeIter != mJITInfoRanges.end() && + rangeIter->mRangeStart <= aBufferPos && + aBufferPos < rangeIter->mRangeEnd, + "Buffer position of jit address needs to be in one of the ranges"); + + using JITFrameKey = JITFrameInfoForBufferRange::JITFrameKey; + + const JITFrameInfoForBufferRange& jitFrameInfoRange = *rangeIter; + auto jitFrameKeys = + jitFrameInfoRange.mJITAddressToJITFramesMap.lookup(aJITAddress); + if (!jitFrameKeys) { + return Nothing(); + } + + // Map the array of JITFrameKeys to an array of FrameKeys, and ensure that + // each of the FrameKeys exists in mFrameToIndexMap. + Vector frameKeys; + MOZ_RELEASE_ASSERT(frameKeys.initCapacity(jitFrameKeys->value().length())); + for (const JITFrameKey& jitFrameKey : jitFrameKeys->value()) { + FrameKey frameKey(jitFrameKey.mCanonicalAddress, jitFrameKey.mDepth, + rangeIter - mJITInfoRanges.begin()); + uint32_t index = mFrameToIndexMap.count(); + auto entry = mFrameToIndexMap.lookupForAdd(frameKey); + if (!entry) { + // We need to add this frame to our frame table. The JSON for this frame + // already exists in jitFrameInfoRange, we just need to splice it into + // the frame table and give it an index. 
+ auto frameJSON = + jitFrameInfoRange.mJITFrameToFrameJSONMap.lookup(jitFrameKey); + MOZ_RELEASE_ASSERT(frameJSON, "Should have cached JSON for this frame"); + mFrameTableWriter.Splice(frameJSON->value()); + MOZ_RELEASE_ASSERT(mFrameToIndexMap.add(entry, frameKey, index)); + } + MOZ_RELEASE_ASSERT(frameKeys.append(std::move(frameKey))); + } + return Some(std::move(frameKeys)); +} + +Maybe UniqueStacks::GetOrAddFrameIndex(const FrameKey& aFrame) { + if (Failed()) { + return Nothing{}; + } + + uint32_t count = mFrameToIndexMap.count(); + auto entry = mFrameToIndexMap.lookupForAdd(aFrame); + if (entry) { + MOZ_ASSERT(entry->value() < count); + return Some(entry->value()); + } + + if (!mFrameToIndexMap.add(entry, aFrame, count)) { + SetFailure("OOM in UniqueStacks::GetOrAddFrameIndex"); + return Nothing{}; + } + StreamNonJITFrame(aFrame); + return Some(count); +} + +void UniqueStacks::SpliceFrameTableElements(SpliceableJSONWriter& aWriter) { + mFrameTableWriter.EndBareList(); + aWriter.TakeAndSplice(mFrameTableWriter.TakeChunkedWriteFunc()); +} + +void UniqueStacks::SpliceStackTableElements(SpliceableJSONWriter& aWriter) { + mStackTableWriter.EndBareList(); + aWriter.TakeAndSplice(mStackTableWriter.TakeChunkedWriteFunc()); +} + +[[nodiscard]] nsAutoCString UniqueStacks::FunctionNameOrAddress(void* aPC) { + nsAutoCString nameOrAddress; + + if (!mCodeAddressService || + !mCodeAddressService->GetFunction(aPC, nameOrAddress) || + nameOrAddress.IsEmpty()) { + nameOrAddress.AppendASCII("0x"); + // `AppendInt` only knows `uint32_t` or `uint64_t`, but because these are + // just aliases for *two* of (`unsigned`, `unsigned long`, and `unsigned + // long long`), a call with `uintptr_t` could use the third type and + // therefore would be ambiguous. + // So we want to force using exactly `uint32_t` or `uint64_t`, whichever + // matches the size of `uintptr_t`. + // (The outer cast to `uint` should then be a no-op.) + using uint = std::conditional_t; + nameOrAddress.AppendInt(static_cast(reinterpret_cast(aPC)), + 16); + } + + return nameOrAddress; +} + +void UniqueStacks::StreamStack(const StackKey& aStack) { + enum Schema : uint32_t { PREFIX = 0, FRAME = 1 }; + + AutoArraySchemaWriter writer(mStackTableWriter); + if (aStack.mPrefixStackIndex.isSome()) { + writer.IntElement(PREFIX, *aStack.mPrefixStackIndex); + } + writer.IntElement(FRAME, aStack.mFrameIndex); +} + +void UniqueStacks::StreamNonJITFrame(const FrameKey& aFrame) { + if (Failed()) { + return; + } + + using NormalFrameData = FrameKey::NormalFrameData; + + enum Schema : uint32_t { + LOCATION = 0, + RELEVANT_FOR_JS = 1, + INNER_WINDOW_ID = 2, + IMPLEMENTATION = 3, + LINE = 4, + COLUMN = 5, + CATEGORY = 6, + SUBCATEGORY = 7 + }; + + AutoArraySchemaWithStringsWriter writer(mFrameTableWriter, *mUniqueStrings); + + const NormalFrameData& data = aFrame.mData.as(); + writer.StringElement(LOCATION, data.mLocation); + writer.BoolElement(RELEVANT_FOR_JS, data.mRelevantForJS); + + // It's okay to convert uint64_t to double here because DOM always creates IDs + // that are convertible to double. + writer.DoubleElement(INNER_WINDOW_ID, data.mInnerWindowID); + + // The C++ interpreter is the default implementation so we only emit element + // for Baseline Interpreter frames. 
+ if (data.mBaselineInterp) { + writer.StringElement(IMPLEMENTATION, MakeStringSpan("blinterp")); + } + + if (data.mLine.isSome()) { + writer.IntElement(LINE, *data.mLine); + } + if (data.mColumn.isSome()) { + writer.IntElement(COLUMN, *data.mColumn); + } + if (data.mCategoryPair.isSome()) { + const JS::ProfilingCategoryPairInfo& info = + JS::GetProfilingCategoryPairInfo(*data.mCategoryPair); + writer.IntElement(CATEGORY, uint32_t(info.mCategory)); + writer.IntElement(SUBCATEGORY, info.mSubcategoryIndex); + } +} + +static void StreamJITFrame(JSContext* aContext, SpliceableJSONWriter& aWriter, + UniqueJSONStrings& aUniqueStrings, + const JS::ProfiledFrameHandle& aJITFrame) { + enum Schema : uint32_t { + LOCATION = 0, + RELEVANT_FOR_JS = 1, + INNER_WINDOW_ID = 2, + IMPLEMENTATION = 3, + LINE = 4, + COLUMN = 5, + CATEGORY = 6, + SUBCATEGORY = 7 + }; + + AutoArraySchemaWithStringsWriter writer(aWriter, aUniqueStrings); + + writer.StringElement(LOCATION, MakeStringSpan(aJITFrame.label())); + writer.BoolElement(RELEVANT_FOR_JS, false); + + // It's okay to convert uint64_t to double here because DOM always creates IDs + // that are convertible to double. + // Realm ID is the name of innerWindowID inside JS code. + writer.DoubleElement(INNER_WINDOW_ID, aJITFrame.realmID()); + + JS::ProfilingFrameIterator::FrameKind frameKind = aJITFrame.frameKind(); + MOZ_ASSERT(frameKind == JS::ProfilingFrameIterator::Frame_Ion || + frameKind == JS::ProfilingFrameIterator::Frame_Baseline); + writer.StringElement(IMPLEMENTATION, + frameKind == JS::ProfilingFrameIterator::Frame_Ion + ? MakeStringSpan("ion") + : MakeStringSpan("baseline")); + + const JS::ProfilingCategoryPairInfo& info = JS::GetProfilingCategoryPairInfo( + frameKind == JS::ProfilingFrameIterator::Frame_Ion + ? JS::ProfilingCategoryPair::JS_IonMonkey + : JS::ProfilingCategoryPair::JS_Baseline); + writer.IntElement(CATEGORY, uint32_t(info.mCategory)); + writer.IntElement(SUBCATEGORY, info.mSubcategoryIndex); +} + +static nsCString JSONForJITFrame(JSContext* aContext, + const JS::ProfiledFrameHandle& aJITFrame, + UniqueJSONStrings& aUniqueStrings) { + nsCString json; + JSONStringRefWriteFunc jw(json); + SpliceableJSONWriter writer(jw, aUniqueStrings.SourceFailureLatch()); + StreamJITFrame(aContext, writer, aUniqueStrings, aJITFrame); + return json; +} + +void JITFrameInfo::AddInfoForRange( + uint64_t aRangeStart, uint64_t aRangeEnd, JSContext* aCx, + const std::function&)>& + aJITAddressProvider) { + if (mLocalFailureLatchSource.Failed()) { + return; + } + + if (aRangeStart == aRangeEnd) { + return; + } + + MOZ_RELEASE_ASSERT(aRangeStart < aRangeEnd); + + if (!mRanges.empty()) { + const JITFrameInfoForBufferRange& prevRange = mRanges.back(); + MOZ_RELEASE_ASSERT(prevRange.mRangeEnd <= aRangeStart, + "Ranges must be non-overlapping and added in-order."); + } + + using JITFrameKey = JITFrameInfoForBufferRange::JITFrameKey; + + JITFrameInfoForBufferRange::JITAddressToJITFramesMap jitAddressToJITFrameMap; + JITFrameInfoForBufferRange::JITFrameToFrameJSONMap jitFrameToFrameJSONMap; + + aJITAddressProvider([&](void* aJITAddress) { + // Make sure that we have cached data for aJITAddress. 
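+    // Two layers of caching are built here: each JIT return address maps to
+    // its list of (canonical address, inline depth) frame keys, and each
+    // distinct frame key maps to its pre-rendered frame-table JSON, so
+    // addresses seen many times and frames shared between addresses are only
+    // rendered once.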
+ auto addressEntry = jitAddressToJITFrameMap.lookupForAdd(aJITAddress); + if (!addressEntry) { + Vector jitFrameKeys; + for (JS::ProfiledFrameHandle handle : + JS::GetProfiledFrames(aCx, aJITAddress)) { + uint32_t depth = jitFrameKeys.length(); + JITFrameKey jitFrameKey{handle.canonicalAddress(), depth}; + auto frameEntry = jitFrameToFrameJSONMap.lookupForAdd(jitFrameKey); + if (!frameEntry) { + if (!jitFrameToFrameJSONMap.add( + frameEntry, jitFrameKey, + JSONForJITFrame(aCx, handle, *mUniqueStrings))) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo::AddInfoForRange adding jit->frame map"); + return; + } + } + if (!jitFrameKeys.append(jitFrameKey)) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo::AddInfoForRange adding jit frame key"); + return; + } + } + if (!jitAddressToJITFrameMap.add(addressEntry, aJITAddress, + std::move(jitFrameKeys))) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo::AddInfoForRange adding addr->jit map"); + return; + } + } + }); + + if (!mRanges.append(JITFrameInfoForBufferRange{ + aRangeStart, aRangeEnd, std::move(jitAddressToJITFrameMap), + std::move(jitFrameToFrameJSONMap)})) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo::AddInfoForRange adding range"); + return; + } +} + +struct ProfileSample { + uint32_t mStack = 0; + double mTime = 0.0; + Maybe mResponsiveness; + RunningTimes mRunningTimes; +}; + +// Write CPU measurements with "Delta" unit, which is some amount of work that +// happened since the previous sample. +static void WriteDelta(AutoArraySchemaWriter& aSchemaWriter, uint32_t aProperty, + uint64_t aDelta) { + aSchemaWriter.IntElement(aProperty, int64_t(aDelta)); +} + +static void WriteSample(SpliceableJSONWriter& aWriter, + const ProfileSample& aSample) { + enum Schema : uint32_t { + STACK = 0, + TIME = 1, + EVENT_DELAY = 2 +#define RUNNING_TIME_SCHEMA(index, name, unit, jsonProperty) , name + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_SCHEMA) +#undef RUNNING_TIME_SCHEMA + }; + + AutoArraySchemaWriter writer(aWriter); + + writer.IntElement(STACK, aSample.mStack); + + writer.TimeMsElement(TIME, aSample.mTime); + + if (aSample.mResponsiveness.isSome()) { + writer.DoubleElement(EVENT_DELAY, *aSample.mResponsiveness); + } + +#define RUNNING_TIME_STREAM(index, name, unit, jsonProperty) \ + aSample.mRunningTimes.GetJson##name##unit().apply( \ + [&writer](const uint64_t& aValue) { \ + Write##unit(writer, name, aValue); \ + }); + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_STREAM) + +#undef RUNNING_TIME_STREAM +} + +static void StreamMarkerAfterKind( + ProfileBufferEntryReader& aER, + ProcessStreamingContext& aProcessStreamingContext) { + ThreadStreamingContext* threadData = nullptr; + mozilla::base_profiler_markers_detail::DeserializeAfterKindAndStream( + aER, + [&](ProfilerThreadId aThreadId) -> baseprofiler::SpliceableJSONWriter* { + threadData = + aProcessStreamingContext.GetThreadStreamingContext(aThreadId); + return threadData ? 
&threadData->mMarkersDataWriter : nullptr; + }, + [&](ProfileChunkedBuffer& aChunkedBuffer) { + ProfilerBacktrace backtrace("", &aChunkedBuffer); + MOZ_ASSERT(threadData, + "threadData should have been set before calling here"); + backtrace.StreamJSON(threadData->mMarkersDataWriter, + aProcessStreamingContext.ProcessStartTime(), + *threadData->mUniqueStacks); + }, + [&](mozilla::base_profiler_markers_detail::Streaming::DeserializerTag + aTag) { + MOZ_ASSERT(threadData, + "threadData should have been set before calling here"); + + size_t payloadSize = aER.RemainingBytes(); + + ProfileBufferEntryReader::DoubleSpanOfConstBytes spans = + aER.ReadSpans(payloadSize); + if (MOZ_LIKELY(spans.IsSingleSpan())) { + // Only a single span, we can just refer to it directly + // instead of copying it. + profiler::ffi::gecko_profiler_serialize_marker_for_tag( + aTag, spans.mFirstOrOnly.Elements(), payloadSize, + &threadData->mMarkersDataWriter); + } else { + // Two spans, we need to concatenate them by copying. + uint8_t* payloadBuffer = new uint8_t[payloadSize]; + spans.CopyBytesTo(payloadBuffer); + profiler::ffi::gecko_profiler_serialize_marker_for_tag( + aTag, payloadBuffer, payloadSize, + &threadData->mMarkersDataWriter); + delete[] payloadBuffer; + } + }); +} + +class EntryGetter { + public: + explicit EntryGetter( + ProfileChunkedBuffer::Reader& aReader, + mozilla::FailureLatch& aFailureLatch, + mozilla::ProgressLogger aProgressLogger = {}, + uint64_t aInitialReadPos = 0, + ProcessStreamingContext* aStreamingContextForMarkers = nullptr) + : mFailureLatch(aFailureLatch), + mStreamingContextForMarkers(aStreamingContextForMarkers), + mBlockIt( + aReader.At(ProfileBufferBlockIndex::CreateFromProfileBufferIndex( + aInitialReadPos))), + mBlockItEnd(aReader.end()), + mRangeStart(mBlockIt.BufferRangeStart().ConvertToProfileBufferIndex()), + mRangeSize( + double(mBlockIt.BufferRangeEnd().ConvertToProfileBufferIndex() - + mRangeStart)), + mProgressLogger(std::move(aProgressLogger)) { + SetLocalProgress(ProgressLogger::NO_LOCATION_UPDATE); + if (!ReadLegacyOrEnd()) { + // Find and read the next non-legacy entry. + Next(); + } + } + + bool Has() const { + return (!mFailureLatch.Failed()) && (mBlockIt != mBlockItEnd); + } + + const ProfileBufferEntry& Get() const { + MOZ_ASSERT(Has() || mFailureLatch.Failed(), + "Caller should have checked `Has()` before `Get()`"); + return mEntry; + } + + void Next() { + MOZ_ASSERT(Has() || mFailureLatch.Failed(), + "Caller should have checked `Has()` before `Next()`"); + ++mBlockIt; + ReadUntilLegacyOrEnd(); + } + + // Hand off the current iterator to the caller, which may be used to read + // any kind of entries (legacy or modern). + ProfileChunkedBuffer::BlockIterator Iterator() const { return mBlockIt; } + + // After `Iterator()` was used, we can restart from *after* its updated + // position. + void RestartAfter(const ProfileChunkedBuffer::BlockIterator& it) { + mBlockIt = it; + if (!Has()) { + return; + } + Next(); + } + + ProfileBufferBlockIndex CurBlockIndex() const { + return mBlockIt.CurrentBlockIndex(); + } + + uint64_t CurPos() const { + return CurBlockIndex().ConvertToProfileBufferIndex(); + } + + void SetLocalProgress(const char* aLocation) { + mProgressLogger.SetLocalProgress( + ProportionValue{double(CurBlockIndex().ConvertToProfileBufferIndex() - + mRangeStart) / + mRangeSize}, + aLocation); + } + + private: + // Try to read the entry at the current `mBlockIt` position. + // * If we're at the end of the buffer, just return `true`. 
+ // * If there is a "legacy" entry (containing a real `ProfileBufferEntry`), + // read it into `mEntry`, and return `true` as well. + // * Otherwise the entry contains a "modern" type that cannot be read into + // `mEntry`, return `false` (so `EntryGetter` can skip to another entry). + bool ReadLegacyOrEnd() { + if (!Has()) { + return true; + } + // Read the entry "kind", which is always at the start of all entries. + ProfileBufferEntryReader er = *mBlockIt; + auto type = static_cast( + er.ReadObject()); + MOZ_ASSERT(static_cast(type) < + static_cast( + ProfileBufferEntry::Kind::MODERN_LIMIT)); + if (type >= ProfileBufferEntry::Kind::LEGACY_LIMIT) { + if (type == ProfileBufferEntry::Kind::Marker && + mStreamingContextForMarkers) { + StreamMarkerAfterKind(er, *mStreamingContextForMarkers); + if (!Has()) { + return true; + } + SetLocalProgress("Processed marker"); + } + er.SetRemainingBytes(0); + return false; + } + // Here, we have a legacy item, we need to read it from the start. + // Because the above `ReadObject` moved the reader, we ned to reset it to + // the start of the entry before reading the whole entry. + er = *mBlockIt; + er.ReadBytes(&mEntry, er.RemainingBytes()); + return true; + } + + void ReadUntilLegacyOrEnd() { + for (;;) { + if (ReadLegacyOrEnd()) { + // Either we're at the end, or we could read a legacy entry -> Done. + break; + } + // Otherwise loop around until we hit a legacy entry or the end. + ++mBlockIt; + } + SetLocalProgress(ProgressLogger::NO_LOCATION_UPDATE); + } + + mozilla::FailureLatch& mFailureLatch; + + ProcessStreamingContext* const mStreamingContextForMarkers; + + ProfileBufferEntry mEntry; + ProfileChunkedBuffer::BlockIterator mBlockIt; + const ProfileChunkedBuffer::BlockIterator mBlockItEnd; + + // Progress logger, and the data needed to compute the current relative + // position in the buffer. + const mozilla::ProfileBufferIndex mRangeStart; + const double mRangeSize; + mozilla::ProgressLogger mProgressLogger; +}; + +// The following grammar shows legal sequences of profile buffer entries. +// The sequences beginning with a ThreadId entry are known as "samples". +// +// ( +// ( /* Samples */ +// ThreadId +// TimeBeforeCompactStack +// RunningTimes? +// UnresponsivenessDurationMs? +// CompactStack +// /* internally including: +// ( NativeLeafAddr +// | Label FrameFlags? DynamicStringFragment* +// LineNumber? CategoryPair? +// | JitReturnAddr +// )+ +// */ +// ) +// | ( /* Reference to a previous identical sample */ +// ThreadId +// TimeBeforeSameSample +// RunningTimes? +// SameSample +// ) +// | Marker +// | ( /* Counters */ +// CounterId +// Time +// ( +// CounterKey +// Count +// Number? +// )* +// ) +// | CollectionStart +// | CollectionEnd +// | Pause +// | Resume +// | ( ProfilerOverheadTime /* Sampling start timestamp */ +// ProfilerOverheadDuration /* Lock acquisition */ +// ProfilerOverheadDuration /* Expired markers cleaning */ +// ProfilerOverheadDuration /* Counters */ +// ProfilerOverheadDuration /* Threads */ +// ) +// )* +// +// The most complicated part is the stack entry sequence that begins with +// Label. Here are some examples. 
+// +// - ProfilingStack frames without a dynamic string: +// +// Label("js::RunScript") +// CategoryPair(JS::ProfilingCategoryPair::JS) +// +// Label("XREMain::XRE_main") +// LineNumber(4660) +// CategoryPair(JS::ProfilingCategoryPair::OTHER) +// +// Label("ElementRestyler::ComputeStyleChangeFor") +// LineNumber(3003) +// CategoryPair(JS::ProfilingCategoryPair::CSS) +// +// - ProfilingStack frames with a dynamic string: +// +// Label("nsObserverService::NotifyObservers") +// FrameFlags(uint64_t(ProfilingStackFrame::Flags::IS_LABEL_FRAME)) +// DynamicStringFragment("domwindo") +// DynamicStringFragment("wopened") +// LineNumber(291) +// CategoryPair(JS::ProfilingCategoryPair::OTHER) +// +// Label("") +// FrameFlags(uint64_t(ProfilingStackFrame::Flags::IS_JS_FRAME)) +// DynamicStringFragment("closeWin") +// DynamicStringFragment("dow (chr") +// DynamicStringFragment("ome://gl") +// DynamicStringFragment("obal/con") +// DynamicStringFragment("tent/glo") +// DynamicStringFragment("balOverl") +// DynamicStringFragment("ay.js:5)") +// DynamicStringFragment("") # this string holds the closing '\0' +// LineNumber(25) +// CategoryPair(JS::ProfilingCategoryPair::JS) +// +// Label("") +// FrameFlags(uint64_t(ProfilingStackFrame::Flags::IS_JS_FRAME)) +// DynamicStringFragment("bound (s") +// DynamicStringFragment("elf-host") +// DynamicStringFragment("ed:914)") +// LineNumber(945) +// CategoryPair(JS::ProfilingCategoryPair::JS) +// +// - A profiling stack frame with an overly long dynamic string: +// +// Label("") +// FrameFlags(uint64_t(ProfilingStackFrame::Flags::IS_LABEL_FRAME)) +// DynamicStringFragment("(too lon") +// DynamicStringFragment("g)") +// LineNumber(100) +// CategoryPair(JS::ProfilingCategoryPair::NETWORK) +// +// - A wasm JIT frame: +// +// Label("") +// FrameFlags(uint64_t(0)) +// DynamicStringFragment("wasm-fun") +// DynamicStringFragment("ction[87") +// DynamicStringFragment("36] (blo") +// DynamicStringFragment("b:http:/") +// DynamicStringFragment("/webasse") +// DynamicStringFragment("mbly.org") +// DynamicStringFragment("/3dc5759") +// DynamicStringFragment("4-ce58-4") +// DynamicStringFragment("626-975b") +// DynamicStringFragment("-08ad116") +// DynamicStringFragment("30bc1:38") +// DynamicStringFragment("29856)") +// +// - A JS frame in a synchronous sample: +// +// Label("") +// FrameFlags(uint64_t(ProfilingStackFrame::Flags::IS_LABEL_FRAME)) +// DynamicStringFragment("u (https") +// DynamicStringFragment("://perf-") +// DynamicStringFragment("html.io/") +// DynamicStringFragment("ac0da204") +// DynamicStringFragment("aaa44d75") +// DynamicStringFragment("a800.bun") +// DynamicStringFragment("dle.js:2") +// DynamicStringFragment("5)") + +// Because this is a format entirely internal to the Profiler, any parsing +// error indicates a bug in the ProfileBuffer writing or the parser itself, +// or possibly flaky hardware. 
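+// As a concrete illustration of the DynamicStringFragment entries above: each
+// fragment carries at most ProfileBufferEntry::kNumChars characters, and only
+// the final fragment includes the closing '\0'. A consumer can reassemble the
+// string roughly like this hedged sketch (simplified from the real parsing
+// loop in DoStreamSamplesAndMarkersToJSON below; `e` is an EntryGetter
+// positioned on the first fragment):
+//
+//   nsAutoCString reassembled;
+//   while (e.Has() && e.Get().IsDynamicStringFragment()) {
+//     char chars[ProfileBufferEntry::kNumChars];
+//     e.Get().CopyCharsInto(chars);
+//     reassembled.Append(chars, strnlen(chars, ProfileBufferEntry::kNumChars));
+//     e.Next();
+//   }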
+#define ERROR_AND_CONTINUE(msg) \ + { \ + fprintf(stderr, "ProfileBuffer parse error: %s", msg); \ + MOZ_ASSERT(false, msg); \ + continue; \ + } + +struct StreamingParametersForThread { + SpliceableJSONWriter& mWriter; + UniqueStacks& mUniqueStacks; + ThreadStreamingContext::PreviousStackState& mPreviousStackState; + uint32_t& mPreviousStack; + + StreamingParametersForThread( + SpliceableJSONWriter& aWriter, UniqueStacks& aUniqueStacks, + ThreadStreamingContext::PreviousStackState& aPreviousStackState, + uint32_t& aPreviousStack) + : mWriter(aWriter), + mUniqueStacks(aUniqueStacks), + mPreviousStackState(aPreviousStackState), + mPreviousStack(aPreviousStack) {} +}; + +// GetStreamingParametersForThreadCallback: +// (ProfilerThreadId) -> Maybe +template +ProfilerThreadId ProfileBuffer::DoStreamSamplesAndMarkersToJSON( + mozilla::FailureLatch& aFailureLatch, + GetStreamingParametersForThreadCallback&& + aGetStreamingParametersForThreadCallback, + double aSinceTime, ProcessStreamingContext* aStreamingContextForMarkers, + mozilla::ProgressLogger aProgressLogger) const { + UniquePtr dynStrBuf = MakeUnique(kMaxFrameKeyLength); + + return mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when sampler is " + "running"); + + ProfilerThreadId processedThreadId; + + EntryGetter e(*aReader, aFailureLatch, std::move(aProgressLogger), + /* aInitialReadPos */ 0, aStreamingContextForMarkers); + + for (;;) { + // This block skips entries until we find the start of the next sample. + // This is useful in three situations. + // + // - The circular buffer overwrites old entries, so when we start parsing + // we might be in the middle of a sample, and we must skip forward to + // the start of the next sample. + // + // - We skip samples that don't have an appropriate ThreadId or Time. + // + // - We skip range Pause, Resume, CollectionStart, Marker, Counter + // and CollectionEnd entries between samples. + while (e.Has()) { + if (e.Get().IsThreadId()) { + break; + } + e.Next(); + } + + if (!e.Has()) { + break; + } + + // Due to the skip_to_next_sample block above, if we have an entry here it + // must be a ThreadId entry. + MOZ_ASSERT(e.Get().IsThreadId()); + + ProfilerThreadId threadId = e.Get().GetThreadId(); + e.Next(); + + Maybe streamingParameters = + std::forward( + aGetStreamingParametersForThreadCallback)(threadId); + + // Ignore samples that are for the wrong thread. 
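+      // (The callback returns Nothing() for thread ids it is not interested
+      // in -- e.g. StreamSamplesToJSON below only accepts its single target
+      // thread -- so such samples are skipped entirely.)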
+ if (!streamingParameters) { + continue; + } + + SpliceableJSONWriter& writer = streamingParameters->mWriter; + UniqueStacks& uniqueStacks = streamingParameters->mUniqueStacks; + ThreadStreamingContext::PreviousStackState& previousStackState = + streamingParameters->mPreviousStackState; + uint32_t& previousStack = streamingParameters->mPreviousStack; + + auto ReadStack = [&](EntryGetter& e, double time, uint64_t entryPosition, + const Maybe& unresponsiveDuration, + const RunningTimes& runningTimes) { + if (writer.Failed()) { + return; + } + + Maybe maybeStack = + uniqueStacks.BeginStack(UniqueStacks::FrameKey("(root)")); + if (!maybeStack) { + writer.SetFailure("BeginStack failure"); + return; + } + + UniqueStacks::StackKey stack = *maybeStack; + + int numFrames = 0; + while (e.Has()) { + if (e.Get().IsNativeLeafAddr()) { + numFrames++; + + void* pc = e.Get().GetPtr(); + e.Next(); + + nsAutoCString functionNameOrAddress = + uniqueStacks.FunctionNameOrAddress(pc); + + maybeStack = uniqueStacks.AppendFrame( + stack, UniqueStacks::FrameKey(functionNameOrAddress.get())); + if (!maybeStack) { + writer.SetFailure("AppendFrame failure"); + return; + } + stack = *maybeStack; + + } else if (e.Get().IsLabel()) { + numFrames++; + + const char* label = e.Get().GetString(); + e.Next(); + + using FrameFlags = js::ProfilingStackFrame::Flags; + uint32_t frameFlags = 0; + if (e.Has() && e.Get().IsFrameFlags()) { + frameFlags = uint32_t(e.Get().GetUint64()); + e.Next(); + } + + bool relevantForJS = + frameFlags & uint32_t(FrameFlags::RELEVANT_FOR_JS); + + bool isBaselineInterp = + frameFlags & uint32_t(FrameFlags::IS_BLINTERP_FRAME); + + // Copy potential dynamic string fragments into dynStrBuf, so that + // dynStrBuf will then contain the entire dynamic string. + size_t i = 0; + dynStrBuf[0] = '\0'; + while (e.Has()) { + if (e.Get().IsDynamicStringFragment()) { + char chars[ProfileBufferEntry::kNumChars]; + e.Get().CopyCharsInto(chars); + for (char c : chars) { + if (i < kMaxFrameKeyLength) { + dynStrBuf[i] = c; + i++; + } + } + e.Next(); + } else { + break; + } + } + dynStrBuf[kMaxFrameKeyLength - 1] = '\0'; + bool hasDynamicString = (i != 0); + + nsAutoCStringN<1024> frameLabel; + if (label[0] != '\0' && hasDynamicString) { + if (frameFlags & uint32_t(FrameFlags::STRING_TEMPLATE_METHOD)) { + frameLabel.AppendPrintf("%s.%s", label, dynStrBuf.get()); + } else if (frameFlags & + uint32_t(FrameFlags::STRING_TEMPLATE_GETTER)) { + frameLabel.AppendPrintf("get %s.%s", label, dynStrBuf.get()); + } else if (frameFlags & + uint32_t(FrameFlags::STRING_TEMPLATE_SETTER)) { + frameLabel.AppendPrintf("set %s.%s", label, dynStrBuf.get()); + } else { + frameLabel.AppendPrintf("%s %s", label, dynStrBuf.get()); + } + } else if (hasDynamicString) { + frameLabel.Append(dynStrBuf.get()); + } else { + frameLabel.Append(label); + } + + uint64_t innerWindowID = 0; + if (e.Has() && e.Get().IsInnerWindowID()) { + innerWindowID = uint64_t(e.Get().GetUint64()); + e.Next(); + } + + Maybe line; + if (e.Has() && e.Get().IsLineNumber()) { + line = Some(unsigned(e.Get().GetInt())); + e.Next(); + } + + Maybe column; + if (e.Has() && e.Get().IsColumnNumber()) { + column = Some(unsigned(e.Get().GetInt())); + e.Next(); + } + + Maybe categoryPair; + if (e.Has() && e.Get().IsCategoryPair()) { + categoryPair = + Some(JS::ProfilingCategoryPair(uint32_t(e.Get().GetInt()))); + e.Next(); + } + + maybeStack = uniqueStacks.AppendFrame( + stack, + UniqueStacks::FrameKey(std::move(frameLabel), relevantForJS, + isBaselineInterp, innerWindowID, line, + 
column, categoryPair)); + if (!maybeStack) { + writer.SetFailure("AppendFrame failure"); + return; + } + stack = *maybeStack; + + } else if (e.Get().IsJitReturnAddr()) { + numFrames++; + + // A JIT frame may expand to multiple frames due to inlining. + void* pc = e.Get().GetPtr(); + const Maybe>& frameKeys = + uniqueStacks.LookupFramesForJITAddressFromBufferPos( + pc, entryPosition ? entryPosition : e.CurPos()); + MOZ_RELEASE_ASSERT( + frameKeys, + "Attempting to stream samples for a buffer range " + "for which we don't have JITFrameInfo?"); + for (const UniqueStacks::FrameKey& frameKey : *frameKeys) { + maybeStack = uniqueStacks.AppendFrame(stack, frameKey); + if (!maybeStack) { + writer.SetFailure("AppendFrame failure"); + return; + } + stack = *maybeStack; + } + + e.Next(); + + } else { + break; + } + } + + // Even if this stack is considered empty, it contains the root frame, + // which needs to be in the JSON output because following "same samples" + // may refer to it when reusing this sample.mStack. + const Maybe stackIndex = + uniqueStacks.GetOrAddStackIndex(stack); + if (!stackIndex) { + writer.SetFailure("Can't add unique string for stack"); + return; + } + + // And store that possibly-empty stack in case it's followed by "same + // sample" entries. + previousStack = *stackIndex; + previousStackState = (numFrames == 0) + ? ThreadStreamingContext::eStackWasEmpty + : ThreadStreamingContext::eStackWasNotEmpty; + + // Even if too old or empty, we did process a sample for this thread id. + processedThreadId = threadId; + + // Discard samples that are too old. + if (time < aSinceTime) { + return; + } + + if (numFrames == 0 && runningTimes.IsEmpty()) { + // It is possible to have empty stacks if native stackwalking is + // disabled. Skip samples with empty stacks, unless we have useful + // running times. + return; + } + + WriteSample(writer, ProfileSample{*stackIndex, time, + unresponsiveDuration, runningTimes}); + }; // End of `ReadStack(EntryGetter&)` lambda. + + if (e.Has() && e.Get().IsTime()) { + double time = e.Get().GetDouble(); + e.Next(); + // Note: Even if this sample is too old (before aSinceTime), we still + // need to read it, so that its frames are in the tables, in case there + // is a same-sample following it that would be after aSinceTime, which + // would need these frames to be present. + + ReadStack(e, time, 0, Nothing{}, RunningTimes{}); + + e.SetLocalProgress("Processed sample"); + } else if (e.Has() && e.Get().IsTimeBeforeCompactStack()) { + double time = e.Get().GetDouble(); + // Note: Even if this sample is too old (before aSinceTime), we still + // need to read it, so that its frames are in the tables, in case there + // is a same-sample following it that would be after aSinceTime, which + // would need these frames to be present. + + RunningTimes runningTimes; + Maybe unresponsiveDuration; + + ProfileChunkedBuffer::BlockIterator it = e.Iterator(); + for (;;) { + ++it; + if (it.IsAtEnd()) { + break; + } + ProfileBufferEntryReader er = *it; + ProfileBufferEntry::Kind kind = + er.ReadObject(); + + // There may be running times before the CompactStack. + if (kind == ProfileBufferEntry::Kind::RunningTimes) { + er.ReadIntoObject(runningTimes); + continue; + } + + // There may be an UnresponsiveDurationMs before the CompactStack. 
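+            // (When present, it is forwarded to ReadStack as the sample's
+            // responsiveness value and streamed as the "eventDelay" column.)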
+ if (kind == ProfileBufferEntry::Kind::UnresponsiveDurationMs) { + unresponsiveDuration = Some(er.ReadObject()); + continue; + } + + if (kind == ProfileBufferEntry::Kind::CompactStack) { + ProfileChunkedBuffer tempBuffer( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, + WorkerChunkManager()); + er.ReadIntoObject(tempBuffer); + tempBuffer.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "Local ProfileChunkedBuffer cannot be out-of-session"); + // This is a compact stack, it should only contain one sample. + EntryGetter stackEntryGetter(*aReader, aFailureLatch); + ReadStack(stackEntryGetter, time, + it.CurrentBlockIndex().ConvertToProfileBufferIndex(), + unresponsiveDuration, runningTimes); + }); + WorkerChunkManager().Reset(tempBuffer.GetAllChunks()); + break; + } + + if (kind == ProfileBufferEntry::Kind::Marker && + aStreamingContextForMarkers) { + StreamMarkerAfterKind(er, *aStreamingContextForMarkers); + continue; + } + + MOZ_ASSERT(kind >= ProfileBufferEntry::Kind::LEGACY_LIMIT, + "There should be no legacy entries between " + "TimeBeforeCompactStack and CompactStack"); + er.SetRemainingBytes(0); + } + + e.RestartAfter(it); + + e.SetLocalProgress("Processed compact sample"); + } else if (e.Has() && e.Get().IsTimeBeforeSameSample()) { + if (previousStackState == ThreadStreamingContext::eNoStackYet) { + // We don't have any full sample yet, we cannot duplicate a "previous" + // one. This should only happen at most once per thread, for the very + // first sample. + continue; + } + + ProfileSample sample; + + // Keep the same `mStack` as previously output. + // Note that it may be empty, this is checked below before writing it. + sample.mStack = previousStack; + + sample.mTime = e.Get().GetDouble(); + + // Ignore samples that are too old. + if (sample.mTime < aSinceTime) { + e.Next(); + continue; + } + + sample.mResponsiveness = Nothing{}; + + sample.mRunningTimes.Clear(); + + ProfileChunkedBuffer::BlockIterator it = e.Iterator(); + for (;;) { + ++it; + if (it.IsAtEnd()) { + break; + } + ProfileBufferEntryReader er = *it; + ProfileBufferEntry::Kind kind = + er.ReadObject(); + + // There may be running times before the SameSample. + if (kind == ProfileBufferEntry::Kind::RunningTimes) { + er.ReadIntoObject(sample.mRunningTimes); + continue; + } + + if (kind == ProfileBufferEntry::Kind::SameSample) { + if (previousStackState == ThreadStreamingContext::eStackWasEmpty && + sample.mRunningTimes.IsEmpty()) { + // Skip samples with empty stacks, unless we have useful running + // times. 
+ break; + } + WriteSample(writer, sample); + break; + } + + if (kind == ProfileBufferEntry::Kind::Marker && + aStreamingContextForMarkers) { + StreamMarkerAfterKind(er, *aStreamingContextForMarkers); + continue; + } + + MOZ_ASSERT(kind >= ProfileBufferEntry::Kind::LEGACY_LIMIT, + "There should be no legacy entries between " + "TimeBeforeSameSample and SameSample"); + er.SetRemainingBytes(0); + } + + e.RestartAfter(it); + + e.SetLocalProgress("Processed repeated sample"); + } else { + ERROR_AND_CONTINUE("expected a Time entry"); + } + } + + return processedThreadId; + }); +} + +ProfilerThreadId ProfileBuffer::StreamSamplesToJSON( + SpliceableJSONWriter& aWriter, ProfilerThreadId aThreadId, + double aSinceTime, UniqueStacks& aUniqueStacks, + mozilla::ProgressLogger aProgressLogger) const { + ThreadStreamingContext::PreviousStackState previousStackState = + ThreadStreamingContext::eNoStackYet; + uint32_t stack = 0u; +#ifdef DEBUG + int processedCount = 0; +#endif // DEBUG + return DoStreamSamplesAndMarkersToJSON( + aWriter.SourceFailureLatch(), + [&](ProfilerThreadId aReadThreadId) { + Maybe streamingParameters; +#ifdef DEBUG + ++processedCount; + MOZ_ASSERT( + aThreadId.IsSpecified() || + (processedCount == 1 && aReadThreadId.IsSpecified()), + "Unspecified aThreadId should only be used with 1-sample buffer"); +#endif // DEBUG + if (!aThreadId.IsSpecified() || aThreadId == aReadThreadId) { + streamingParameters.emplace(aWriter, aUniqueStacks, + previousStackState, stack); + } + return streamingParameters; + }, + aSinceTime, /* aStreamingContextForMarkers */ nullptr, + std::move(aProgressLogger)); +} + +void ProfileBuffer::StreamSamplesAndMarkersToJSON( + ProcessStreamingContext& aProcessStreamingContext, + mozilla::ProgressLogger aProgressLogger) const { + (void)DoStreamSamplesAndMarkersToJSON( + aProcessStreamingContext.SourceFailureLatch(), + [&](ProfilerThreadId aReadThreadId) { + Maybe streamingParameters; + ThreadStreamingContext* threadData = + aProcessStreamingContext.GetThreadStreamingContext(aReadThreadId); + if (threadData) { + streamingParameters.emplace( + threadData->mSamplesDataWriter, *threadData->mUniqueStacks, + threadData->mPreviousStackState, threadData->mPreviousStack); + } + return streamingParameters; + }, + aProcessStreamingContext.GetSinceTime(), &aProcessStreamingContext, + std::move(aProgressLogger)); +} + +void ProfileBuffer::AddJITInfoForRange( + uint64_t aRangeStart, ProfilerThreadId aThreadId, JSContext* aContext, + JITFrameInfo& aJITFrameInfo, + mozilla::ProgressLogger aProgressLogger) const { + // We can only process JitReturnAddr entries if we have a JSContext. + MOZ_RELEASE_ASSERT(aContext); + + aRangeStart = std::max(aRangeStart, BufferRangeStart()); + aJITFrameInfo.AddInfoForRange( + aRangeStart, BufferRangeEnd(), aContext, + [&](const std::function& aJITAddressConsumer) { + // Find all JitReturnAddr entries in the given range for the given + // thread, and call aJITAddressConsumer with those addresses. + + mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when " + "sampler is running"); + + EntryGetter e(*aReader, aJITFrameInfo.LocalFailureLatchSource(), + std::move(aProgressLogger), aRangeStart); + + while (true) { + // Advance to the next ThreadId entry. 
+ while (e.Has() && !e.Get().IsThreadId()) { + e.Next(); + } + if (!e.Has()) { + break; + } + + MOZ_ASSERT(e.Get().IsThreadId()); + ProfilerThreadId threadId = e.Get().GetThreadId(); + e.Next(); + + // Ignore samples that are for a different thread. + if (threadId != aThreadId) { + continue; + } + + if (e.Has() && e.Get().IsTime()) { + // Legacy stack. + e.Next(); + while (e.Has() && !e.Get().IsThreadId()) { + if (e.Get().IsJitReturnAddr()) { + aJITAddressConsumer(e.Get().GetPtr()); + } + e.Next(); + } + } else if (e.Has() && e.Get().IsTimeBeforeCompactStack()) { + // Compact stack. + ProfileChunkedBuffer::BlockIterator it = e.Iterator(); + for (;;) { + ++it; + if (it.IsAtEnd()) { + break; + } + ProfileBufferEntryReader er = *it; + ProfileBufferEntry::Kind kind = + er.ReadObject(); + if (kind == ProfileBufferEntry::Kind::CompactStack) { + ProfileChunkedBuffer tempBuffer( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, + WorkerChunkManager()); + er.ReadIntoObject(tempBuffer); + tempBuffer.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT( + aReader, + "Local ProfileChunkedBuffer cannot be out-of-session"); + EntryGetter stackEntryGetter( + *aReader, aJITFrameInfo.LocalFailureLatchSource()); + while (stackEntryGetter.Has()) { + if (stackEntryGetter.Get().IsJitReturnAddr()) { + aJITAddressConsumer(stackEntryGetter.Get().GetPtr()); + } + stackEntryGetter.Next(); + } + }); + WorkerChunkManager().Reset(tempBuffer.GetAllChunks()); + break; + } + + MOZ_ASSERT(kind >= ProfileBufferEntry::Kind::LEGACY_LIMIT, + "There should be no legacy entries between " + "TimeBeforeCompactStack and CompactStack"); + er.SetRemainingBytes(0); + } + + e.Next(); + } else if (e.Has() && e.Get().IsTimeBeforeSameSample()) { + // Sample index, nothing to do. + + } else { + ERROR_AND_CONTINUE("expected a Time entry"); + } + } + }); + }); +} + +void ProfileBuffer::StreamMarkersToJSON( + SpliceableJSONWriter& aWriter, ProfilerThreadId aThreadId, + const TimeStamp& aProcessStartTime, double aSinceTime, + UniqueStacks& aUniqueStacks, + mozilla::ProgressLogger aProgressLogger) const { + mEntries.ReadEach([&](ProfileBufferEntryReader& aER) { + auto type = static_cast( + aER.ReadObject()); + MOZ_ASSERT(static_cast(type) < + static_cast( + ProfileBufferEntry::Kind::MODERN_LIMIT)); + if (type == ProfileBufferEntry::Kind::Marker) { + mozilla::base_profiler_markers_detail::DeserializeAfterKindAndStream( + aER, + [&](const ProfilerThreadId& aMarkerThreadId) { + return (!aThreadId.IsSpecified() || aMarkerThreadId == aThreadId) + ? &aWriter + : nullptr; + }, + [&](ProfileChunkedBuffer& aChunkedBuffer) { + ProfilerBacktrace backtrace("", &aChunkedBuffer); + backtrace.StreamJSON(aWriter, aProcessStartTime, aUniqueStacks); + }, + [&](mozilla::base_profiler_markers_detail::Streaming::DeserializerTag + aTag) { + size_t payloadSize = aER.RemainingBytes(); + + ProfileBufferEntryReader::DoubleSpanOfConstBytes spans = + aER.ReadSpans(payloadSize); + if (MOZ_LIKELY(spans.IsSingleSpan())) { + // Only a single span, we can just refer to it directly + // instead of copying it. + profiler::ffi::gecko_profiler_serialize_marker_for_tag( + aTag, spans.mFirstOrOnly.Elements(), payloadSize, &aWriter); + } else { + // Two spans, we need to concatenate them by copying. 
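+            // (Two spans occur when the marker payload straddles a chunk
+            // boundary in the underlying ProfileChunkedBuffer, so a
+            // contiguous copy is made before handing it to the serializer.)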
+ uint8_t* payloadBuffer = new uint8_t[payloadSize]; + spans.CopyBytesTo(payloadBuffer); + profiler::ffi::gecko_profiler_serialize_marker_for_tag( + aTag, payloadBuffer, payloadSize, &aWriter); + delete[] payloadBuffer; + } + }); + } else { + // The entry was not a marker, we need to skip to the end. + aER.SetRemainingBytes(0); + } + }); +} + +void ProfileBuffer::StreamProfilerOverheadToJSON( + SpliceableJSONWriter& aWriter, const TimeStamp& aProcessStartTime, + double aSinceTime, mozilla::ProgressLogger aProgressLogger) const { + mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when sampler is " + "running"); + + EntryGetter e(*aReader, aWriter.SourceFailureLatch(), + std::move(aProgressLogger)); + + enum Schema : uint32_t { + TIME = 0, + LOCKING = 1, + MARKER_CLEANING = 2, + COUNTERS = 3, + THREADS = 4 + }; + + aWriter.StartObjectProperty("profilerOverhead"); + aWriter.StartObjectProperty("samples"); + // Stream all sampling overhead data. We skip other entries, because we + // process them in StreamSamplesToJSON()/etc. + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("time"); + schema.WriteField("locking"); + schema.WriteField("expiredMarkerCleaning"); + schema.WriteField("counters"); + schema.WriteField("threads"); + } + + aWriter.StartArrayProperty("data"); + double firstTime = 0.0; + double lastTime = 0.0; + ProfilerStats intervals, overheads, lockings, cleanings, counters, threads; + while (e.Has()) { + // valid sequence: ProfilerOverheadTime, ProfilerOverheadDuration * 4 + if (e.Get().IsProfilerOverheadTime()) { + double time = e.Get().GetDouble(); + if (time >= aSinceTime) { + e.Next(); + if (!e.Has() || !e.Get().IsProfilerOverheadDuration()) { + ERROR_AND_CONTINUE( + "expected a ProfilerOverheadDuration entry after " + "ProfilerOverheadTime"); + } + double locking = e.Get().GetDouble(); + e.Next(); + if (!e.Has() || !e.Get().IsProfilerOverheadDuration()) { + ERROR_AND_CONTINUE( + "expected a ProfilerOverheadDuration entry after " + "ProfilerOverheadTime,ProfilerOverheadDuration"); + } + double cleaning = e.Get().GetDouble(); + e.Next(); + if (!e.Has() || !e.Get().IsProfilerOverheadDuration()) { + ERROR_AND_CONTINUE( + "expected a ProfilerOverheadDuration entry after " + "ProfilerOverheadTime,ProfilerOverheadDuration*2"); + } + double counter = e.Get().GetDouble(); + e.Next(); + if (!e.Has() || !e.Get().IsProfilerOverheadDuration()) { + ERROR_AND_CONTINUE( + "expected a ProfilerOverheadDuration entry after " + "ProfilerOverheadTime,ProfilerOverheadDuration*3"); + } + double thread = e.Get().GetDouble(); + + if (firstTime == 0.0) { + firstTime = time; + } else { + // Note that we'll have 1 fewer interval than other numbers (because + // we need both ends of an interval to know its duration). The final + // difference should be insignificant over the expected many + // thousands of iterations. 
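+          // (For example, 1000 ProfilerOverheadTime entries yield 999
+          // measured intervals.)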
+ intervals.Count(time - lastTime); + } + lastTime = time; + overheads.Count(locking + cleaning + counter + thread); + lockings.Count(locking); + cleanings.Count(cleaning); + counters.Count(counter); + threads.Count(thread); + + AutoArraySchemaWriter writer(aWriter); + writer.TimeMsElement(TIME, time); + writer.DoubleElement(LOCKING, locking); + writer.DoubleElement(MARKER_CLEANING, cleaning); + writer.DoubleElement(COUNTERS, counter); + writer.DoubleElement(THREADS, thread); + } + } + e.Next(); + } + aWriter.EndArray(); // data + aWriter.EndObject(); // samples + + // Only output statistics if there is at least one full interval (and + // therefore at least two samplings.) + if (intervals.n > 0) { + aWriter.StartObjectProperty("statistics"); + aWriter.DoubleProperty("profiledDuration", lastTime - firstTime); + aWriter.IntProperty("samplingCount", overheads.n); + aWriter.DoubleProperty("overheadDurations", overheads.sum); + aWriter.DoubleProperty("overheadPercentage", + overheads.sum / (lastTime - firstTime)); +#define PROFILER_STATS(name, var) \ + aWriter.DoubleProperty("mean" name, (var).sum / (var).n); \ + aWriter.DoubleProperty("min" name, (var).min); \ + aWriter.DoubleProperty("max" name, (var).max); + PROFILER_STATS("Interval", intervals); + PROFILER_STATS("Overhead", overheads); + PROFILER_STATS("Lockings", lockings); + PROFILER_STATS("Cleaning", cleanings); + PROFILER_STATS("Counter", counters); + PROFILER_STATS("Thread", threads); +#undef PROFILER_STATS + aWriter.EndObject(); // statistics + } + aWriter.EndObject(); // profilerOverhead + }); +} + +struct CounterKeyedSample { + double mTime; + uint64_t mNumber; + int64_t mCount; +}; + +using CounterKeyedSamples = Vector; + +static LazyLogModule sFuzzyfoxLog("Fuzzyfox"); + +using CounterMap = HashMap; + +// HashMap lookup, if not found, a default value is inserted. +// Returns reference to (existing or new) value inside the HashMap. +template +static auto& LookupOrAdd(HashM& aMap, Key&& aKey) { + auto addPtr = aMap.lookupForAdd(aKey); + if (!addPtr) { + MOZ_RELEASE_ASSERT(aMap.add(addPtr, std::forward(aKey), + typename HashM::Entry::ValueType{})); + MOZ_ASSERT(!!addPtr); + } + return addPtr->value(); +} + +void ProfileBuffer::StreamCountersToJSON( + SpliceableJSONWriter& aWriter, const TimeStamp& aProcessStartTime, + double aSinceTime, mozilla::ProgressLogger aProgressLogger) const { + // Because this is a format entirely internal to the Profiler, any parsing + // error indicates a bug in the ProfileBuffer writing or the parser itself, + // or possibly flaky hardware. + + mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when sampler is " + "running"); + + EntryGetter e(*aReader, aWriter.SourceFailureLatch(), + std::move(aProgressLogger)); + + enum Schema : uint32_t { TIME = 0, COUNT = 1, NUMBER = 2 }; + + // Stream all counters. We skip other entries, because we process them in + // StreamSamplesToJSON()/etc. + // + // Valid sequence in the buffer: + // CounterID + // Time + // ( CounterKey Count Number? 
)* + // + // And the JSON (example): + // "counters": { + // "name": "malloc", + // "category": "Memory", + // "description": "Amount of allocated memory", + // "sample_groups": { + // "id": 0, + // "samples": { + // "schema": {"time": 0, "number": 1, "count": 2}, + // "data": [ + // [ + // 16117.033968000002, + // 2446216, + // 6801320 + // ], + // [ + // 16118.037638, + // 2446216, + // 6801320 + // ], + // ], + // } + // } + // }, + + // Build the map of counters and populate it + HashMap counters; + + while (e.Has()) { + // skip all non-Counters, including if we start in the middle of a counter + if (e.Get().IsCounterId()) { + void* id = e.Get().GetPtr(); + CounterMap& counter = LookupOrAdd(counters, id); + e.Next(); + if (!e.Has() || !e.Get().IsTime()) { + ERROR_AND_CONTINUE("expected a Time entry"); + } + double time = e.Get().GetDouble(); + e.Next(); + if (time >= aSinceTime) { + while (e.Has() && e.Get().IsCounterKey()) { + uint64_t key = e.Get().GetUint64(); + CounterKeyedSamples& data = LookupOrAdd(counter, key); + e.Next(); + if (!e.Has() || !e.Get().IsCount()) { + ERROR_AND_CONTINUE("expected a Count entry"); + } + int64_t count = e.Get().GetUint64(); + e.Next(); + uint64_t number; + if (!e.Has() || !e.Get().IsNumber()) { + number = 0; + } else { + number = e.Get().GetInt64(); + e.Next(); + } + CounterKeyedSample sample = {time, number, count}; + MOZ_RELEASE_ASSERT(data.append(sample)); + } + } else { + // skip counter sample - only need to skip the initial counter + // id, then let the loop at the top skip the rest + } + } else { + e.Next(); + } + } + // we have a map of a map of counter entries; dump them to JSON + if (counters.count() == 0) { + return; + } + + aWriter.StartArrayProperty("counters"); + for (auto iter = counters.iter(); !iter.done(); iter.next()) { + CounterMap& counter = iter.get().value(); + const BaseProfilerCount* base_counter = + static_cast(iter.get().key()); + + aWriter.Start(); + aWriter.StringProperty("name", MakeStringSpan(base_counter->mLabel)); + aWriter.StringProperty("category", + MakeStringSpan(base_counter->mCategory)); + aWriter.StringProperty("description", + MakeStringSpan(base_counter->mDescription)); + + aWriter.StartArrayProperty("sample_groups"); + for (auto counter_iter = counter.iter(); !counter_iter.done(); + counter_iter.next()) { + CounterKeyedSamples& samples = counter_iter.get().value(); + uint64_t key = counter_iter.get().key(); + + size_t size = samples.length(); + if (size == 0) { + continue; + } + + bool hasNumber = false; + for (size_t i = 0; i < size; i++) { + if (samples[i].mNumber != 0) { + hasNumber = true; + break; + } + } + + aWriter.StartObjectElement(); + { + aWriter.IntProperty("id", static_cast(key)); + aWriter.StartObjectProperty("samples"); + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("time"); + schema.WriteField("count"); + if (hasNumber) { + schema.WriteField("number"); + } + } + + aWriter.StartArrayProperty("data"); + double previousSkippedTime = 0.0; + uint64_t previousNumber = 0; + int64_t previousCount = 0; + for (size_t i = 0; i < size; i++) { + // Encode as deltas, and only encode if different than the previous + // or next sample; Always write the first and last samples. + if (i == 0 || i == size - 1 || + samples[i].mNumber != previousNumber || + samples[i].mCount != previousCount || + // Ensure we ouput the first 0 before skipping samples. 
+ (i >= 2 && (samples[i - 2].mNumber != previousNumber || + samples[i - 2].mCount != previousCount))) { + if (i != 0 && samples[i].mTime >= samples[i - 1].mTime) { + MOZ_LOG(sFuzzyfoxLog, mozilla::LogLevel::Error, + ("Fuzzyfox Profiler Assertion: %f >= %f", + samples[i].mTime, samples[i - 1].mTime)); + } + MOZ_ASSERT(i == 0 || samples[i].mTime >= samples[i - 1].mTime); + MOZ_ASSERT(samples[i].mNumber >= previousNumber); + MOZ_ASSERT(samples[i].mNumber - previousNumber <= + uint64_t(std::numeric_limits::max())); + + int64_t numberDelta = + static_cast(samples[i].mNumber - previousNumber); + int64_t countDelta = samples[i].mCount - previousCount; + + if (previousSkippedTime != 0.0 && + (numberDelta != 0 || countDelta != 0)) { + // Write the last skipped sample, unless the new one is all + // zeroes (that'd be redundant) This is useful to know when a + // certain value was last sampled, so that the front-end graph + // will be more correct. + AutoArraySchemaWriter writer(aWriter); + writer.TimeMsElement(TIME, previousSkippedTime); + // The deltas are effectively zeroes, since no change happened + // between the last actually-written sample and the last skipped + // one. + writer.IntElement(COUNT, 0); + if (hasNumber) { + writer.IntElement(NUMBER, 0); + } + } + + AutoArraySchemaWriter writer(aWriter); + writer.TimeMsElement(TIME, samples[i].mTime); + writer.IntElement(COUNT, countDelta); + if (hasNumber) { + writer.IntElement(NUMBER, numberDelta); + } + + previousSkippedTime = 0.0; + previousNumber = samples[i].mNumber; + previousCount = samples[i].mCount; + } else { + previousSkippedTime = samples[i].mTime; + } + } + aWriter.EndArray(); // data + aWriter.EndObject(); // samples + } + aWriter.EndObject(); // sample_groups item + } + aWriter.EndArray(); // sample groups + aWriter.End(); // for each counter + } + aWriter.EndArray(); // counters + }); +} + +#undef ERROR_AND_CONTINUE + +static void AddPausedRange(SpliceableJSONWriter& aWriter, const char* aReason, + const Maybe& aStartTime, + const Maybe& aEndTime) { + aWriter.Start(); + if (aStartTime) { + aWriter.TimeDoubleMsProperty("startTime", *aStartTime); + } else { + aWriter.NullProperty("startTime"); + } + if (aEndTime) { + aWriter.TimeDoubleMsProperty("endTime", *aEndTime); + } else { + aWriter.NullProperty("endTime"); + } + aWriter.StringProperty("reason", MakeStringSpan(aReason)); + aWriter.End(); +} + +void ProfileBuffer::StreamPausedRangesToJSON( + SpliceableJSONWriter& aWriter, double aSinceTime, + mozilla::ProgressLogger aProgressLogger) const { + mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when sampler is " + "running"); + + EntryGetter e(*aReader, aWriter.SourceFailureLatch(), + aProgressLogger.CreateSubLoggerFromTo( + 1_pc, "Streaming pauses...", 99_pc, "Streamed pauses")); + + Maybe currentPauseStartTime; + Maybe currentCollectionStartTime; + + while (e.Has()) { + if (e.Get().IsPause()) { + currentPauseStartTime = Some(e.Get().GetDouble()); + } else if (e.Get().IsResume()) { + AddPausedRange(aWriter, "profiler-paused", currentPauseStartTime, + Some(e.Get().GetDouble())); + currentPauseStartTime = Nothing(); + } else if (e.Get().IsCollectionStart()) { + currentCollectionStartTime = Some(e.Get().GetDouble()); + } else if (e.Get().IsCollectionEnd()) { + AddPausedRange(aWriter, "collecting", currentCollectionStartTime, + Some(e.Get().GetDouble())); + currentCollectionStartTime = Nothing(); + } + e.Next(); + } + + if (currentPauseStartTime) { + 
AddPausedRange(aWriter, "profiler-paused", currentPauseStartTime, + Nothing()); + } + if (currentCollectionStartTime) { + AddPausedRange(aWriter, "collecting", currentCollectionStartTime, + Nothing()); + } + }); +} + +bool ProfileBuffer::DuplicateLastSample(ProfilerThreadId aThreadId, + double aSampleTimeMs, + Maybe& aLastSample, + const RunningTimes& aRunningTimes) { + if (!aLastSample) { + return false; + } + + if (mEntries.IsIndexInCurrentChunk(ProfileBufferIndex{*aLastSample})) { + // The last (fully-written) sample is in this chunk, we can refer to it. + + // Note that between now and when we write the SameSample below, another + // chunk could have been started, so the SameSample will in fact refer to a + // block in a previous chunk. This is okay, because: + // - When serializing to JSON, if that chunk is still there, we'll still be + // able to find that old stack, so nothing will be lost. + // - If unfortunately that chunk has been destroyed, we will lose this + // sample. But this will only happen to the first sample (per thread) in + // in the whole JSON output, because the next time we're here to duplicate + // the same sample again, IsIndexInCurrentChunk will say `false` and we + // will fall back to the normal copy or even re-sample. Losing the first + // sample out of many in a whole recording is acceptable. + // + // |---| = chunk, S = Sample, D = Duplicate, s = same sample + // |---S-s-s--| |s-D--s--s-| |s-D--s---s| + // Later, the first chunk is destroyed/recycled: + // |s-D--s--s-| |s-D--s---s| |-... + // Output: ^ ^ ^ ^ + // `-|--|-------|--- Same but no previous -> lost. + // `--|-------|--- Full duplicate sample. + // `-------|--- Same with previous -> okay. + // `--- Same but now we have a previous -> okay! + + AUTO_PROFILER_STATS(DuplicateLastSample_SameSample); + + // Add the thread id first. We don't update `aLastSample` because we are not + // writing a full sample. + (void)AddThreadIdEntry(aThreadId); + + // Copy the new time, to be followed by a SameSample. + AddEntry(ProfileBufferEntry::TimeBeforeSameSample(aSampleTimeMs)); + + // Add running times if they have data. + if (!aRunningTimes.IsEmpty()) { + mEntries.PutObjects(ProfileBufferEntry::Kind::RunningTimes, + aRunningTimes); + } + + // Finish with a SameSample entry. + mEntries.PutObjects(ProfileBufferEntry::Kind::SameSample); + + return true; + } + + AUTO_PROFILER_STATS(DuplicateLastSample_copy); + + ProfileChunkedBuffer tempBuffer( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, WorkerChunkManager()); + + auto retrieveWorkerChunk = MakeScopeExit( + [&]() { WorkerChunkManager().Reset(tempBuffer.GetAllChunks()); }); + + const bool ok = mEntries.Read([&](ProfileChunkedBuffer::Reader* aReader) { + MOZ_ASSERT(aReader, + "ProfileChunkedBuffer cannot be out-of-session when sampler is " + "running"); + + // DuplicateLastSample is only called during profiling, so we don't need a + // progress logger (only useful when capturing the final profile). + EntryGetter e(*aReader, mozilla::FailureLatchInfallibleSource::Singleton(), + ProgressLogger{}, *aLastSample); + + if (e.CurPos() != *aLastSample) { + // The last sample is no longer within the buffer range, so we cannot + // use it. Reset the stored buffer position to Nothing(). + aLastSample.reset(); + return false; + } + + MOZ_RELEASE_ASSERT(e.Has() && e.Get().IsThreadId() && + e.Get().GetThreadId() == aThreadId); + + e.Next(); + + // Go through the whole entry and duplicate it, until we find the next + // one. 
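+    // (For illustration: if the last sample was ThreadId(T)
+    // TimeBeforeCompactStack(t0) CompactStack(S), the loop below writes
+    // TimeBeforeCompactStack(aSampleTimeMs), the optional RunningTimes, and a
+    // byte-for-byte copy of CompactStack(S) into tempBuffer, then stops.)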
+ while (e.Has()) { + switch (e.Get().GetKind()) { + case ProfileBufferEntry::Kind::Pause: + case ProfileBufferEntry::Kind::Resume: + case ProfileBufferEntry::Kind::PauseSampling: + case ProfileBufferEntry::Kind::ResumeSampling: + case ProfileBufferEntry::Kind::CollectionStart: + case ProfileBufferEntry::Kind::CollectionEnd: + case ProfileBufferEntry::Kind::ThreadId: + case ProfileBufferEntry::Kind::TimeBeforeSameSample: + // We're done. + return true; + case ProfileBufferEntry::Kind::Time: + // Copy with new time + AddEntry(tempBuffer, ProfileBufferEntry::Time(aSampleTimeMs)); + break; + case ProfileBufferEntry::Kind::TimeBeforeCompactStack: { + // Copy with new time, followed by a compact stack. + AddEntry(tempBuffer, + ProfileBufferEntry::TimeBeforeCompactStack(aSampleTimeMs)); + + // Add running times if they have data. + if (!aRunningTimes.IsEmpty()) { + tempBuffer.PutObjects(ProfileBufferEntry::Kind::RunningTimes, + aRunningTimes); + } + + // The `CompactStack` *must* be present afterwards, but may not + // immediately follow `TimeBeforeCompactStack` (e.g., some markers + // could be written in-between), so we need to look for it in the + // following entries. + ProfileChunkedBuffer::BlockIterator it = e.Iterator(); + for (;;) { + ++it; + if (it.IsAtEnd()) { + break; + } + ProfileBufferEntryReader er = *it; + auto kind = static_cast( + er.ReadObject()); + MOZ_ASSERT( + static_cast(kind) < + static_cast( + ProfileBufferEntry::Kind::MODERN_LIMIT)); + if (kind == ProfileBufferEntry::Kind::CompactStack) { + // Found our CompactStack, just make a copy of the whole entry. + er = *it; + auto bytes = er.RemainingBytes(); + MOZ_ASSERT(bytes < + ProfileBufferChunkManager::scExpectedMaximumStackSize); + tempBuffer.Put(bytes, [&](Maybe& aEW) { + MOZ_ASSERT(aEW.isSome(), "tempBuffer cannot be out-of-session"); + aEW->WriteFromReader(er, bytes); + }); + // CompactStack marks the end, we're done. + break; + } + + MOZ_ASSERT(kind >= ProfileBufferEntry::Kind::LEGACY_LIMIT, + "There should be no legacy entries between " + "TimeBeforeCompactStack and CompactStack"); + er.SetRemainingBytes(0); + // Here, we have encountered a non-legacy entry that was not the + // CompactStack we're looking for; just continue the search... + } + // We're done. + return true; + } + case ProfileBufferEntry::Kind::CounterKey: + case ProfileBufferEntry::Kind::Number: + case ProfileBufferEntry::Kind::Count: + // Don't copy anything not part of a thread's stack sample + break; + case ProfileBufferEntry::Kind::CounterId: + // CounterId is normally followed by Time - if so, we'd like + // to skip it. If we duplicate Time, it won't hurt anything, just + // waste buffer space (and this can happen if the CounterId has + // fallen off the end of the buffer, but Time (and Number/Count) + // are still in the buffer). + e.Next(); + if (e.Has() && e.Get().GetKind() != ProfileBufferEntry::Kind::Time) { + // this would only happen if there was an invalid sequence + // in the buffer. Don't skip it. + continue; + } + // we've skipped Time + break; + case ProfileBufferEntry::Kind::ProfilerOverheadTime: + // ProfilerOverheadTime is normally followed by + // ProfilerOverheadDuration*4 - if so, we'd like to skip it. Don't + // duplicate, as we are in the middle of a sampling and will soon + // capture its own overhead. + e.Next(); + // A missing Time would only happen if there was an invalid + // sequence in the buffer. Don't skip unexpected entry. 
+ if (e.Has() && + e.Get().GetKind() != + ProfileBufferEntry::Kind::ProfilerOverheadDuration) { + continue; + } + e.Next(); + if (e.Has() && + e.Get().GetKind() != + ProfileBufferEntry::Kind::ProfilerOverheadDuration) { + continue; + } + e.Next(); + if (e.Has() && + e.Get().GetKind() != + ProfileBufferEntry::Kind::ProfilerOverheadDuration) { + continue; + } + e.Next(); + if (e.Has() && + e.Get().GetKind() != + ProfileBufferEntry::Kind::ProfilerOverheadDuration) { + continue; + } + // we've skipped ProfilerOverheadTime and + // ProfilerOverheadDuration*4. + break; + default: { + // Copy anything else we don't know about. + AddEntry(tempBuffer, e.Get()); + break; + } + } + e.Next(); + } + return true; + }); + + if (!ok) { + return false; + } + + // If the buffer was big enough, there won't be any cleared blocks. + if (tempBuffer.GetState().mClearedBlockCount != 0) { + // No need to try to read stack again as it won't fit. Reset the stored + // buffer position to Nothing(). + aLastSample.reset(); + return false; + } + + aLastSample = Some(AddThreadIdEntry(aThreadId)); + + mEntries.AppendContents(tempBuffer); + + return true; +} + +void ProfileBuffer::DiscardSamplesBeforeTime(double aTime) { + // This function does nothing! + // The duration limit will be removed from Firefox, see bug 1632365. + Unused << aTime; +} + +// END ProfileBuffer +//////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/core/ProfileBufferEntry.h b/tools/profiler/core/ProfileBufferEntry.h new file mode 100644 index 0000000000..bfee4923a3 --- /dev/null +++ b/tools/profiler/core/ProfileBufferEntry.h @@ -0,0 +1,532 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef ProfileBufferEntry_h +#define ProfileBufferEntry_h + +#include +#include +#include +#include +#include +#include "gtest/MozGtestFriend.h" +#include "js/ProfilingCategory.h" +#include "mozilla/Attributes.h" +#include "mozilla/HashFunctions.h" +#include "mozilla/HashTable.h" +#include "mozilla/Maybe.h" +#include "mozilla/ProfileBufferEntryKinds.h" +#include "mozilla/ProfileJSONWriter.h" +#include "mozilla/ProfilerUtils.h" +#include "mozilla/UniquePtrExtensions.h" +#include "mozilla/Variant.h" +#include "mozilla/Vector.h" +#include "nsString.h" + +class ProfilerCodeAddressService; +struct JSContext; + +class ProfileBufferEntry { + public: + using KindUnderlyingType = + std::underlying_type_t<::mozilla::ProfileBufferEntryKind>; + using Kind = mozilla::ProfileBufferEntryKind; + + ProfileBufferEntry(); + + static constexpr size_t kNumChars = mozilla::ProfileBufferEntryNumChars; + + private: + // aString must be a static string. 
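+  // (Only the pointer is stored in the entry's 8-byte payload -- see
+  // GetString() -- so the characters must outlive the profile buffer.)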
+ ProfileBufferEntry(Kind aKind, const char* aString); + ProfileBufferEntry(Kind aKind, char aChars[kNumChars]); + ProfileBufferEntry(Kind aKind, void* aPtr); + ProfileBufferEntry(Kind aKind, double aDouble); + ProfileBufferEntry(Kind aKind, int64_t aInt64); + ProfileBufferEntry(Kind aKind, uint64_t aUint64); + ProfileBufferEntry(Kind aKind, int aInt); + ProfileBufferEntry(Kind aKind, ProfilerThreadId aThreadId); + + public: +#define CTOR(KIND, TYPE, SIZE) \ + static ProfileBufferEntry KIND(TYPE aVal) { \ + return ProfileBufferEntry(Kind::KIND, aVal); \ + } + FOR_EACH_PROFILE_BUFFER_ENTRY_KIND(CTOR) +#undef CTOR + + Kind GetKind() const { return mKind; } + +#define IS_KIND(KIND, TYPE, SIZE) \ + bool Is##KIND() const { return mKind == Kind::KIND; } + FOR_EACH_PROFILE_BUFFER_ENTRY_KIND(IS_KIND) +#undef IS_KIND + + private: + FRIEND_TEST(ThreadProfile, InsertOneEntry); + FRIEND_TEST(ThreadProfile, InsertOneEntryWithTinyBuffer); + FRIEND_TEST(ThreadProfile, InsertEntriesNoWrap); + FRIEND_TEST(ThreadProfile, InsertEntriesWrap); + FRIEND_TEST(ThreadProfile, MemoryMeasure); + friend class ProfileBuffer; + + Kind mKind; + uint8_t mStorage[kNumChars]; + + const char* GetString() const; + void* GetPtr() const; + double GetDouble() const; + int GetInt() const; + int64_t GetInt64() const; + uint64_t GetUint64() const; + ProfilerThreadId GetThreadId() const; + void CopyCharsInto(char (&aOutArray)[kNumChars]) const; +}; + +// Packed layout: 1 byte for the tag + 8 bytes for the value. +static_assert(sizeof(ProfileBufferEntry) == 9, "bad ProfileBufferEntry size"); + +// Contains all the information about JIT frames that is needed to stream stack +// frames for JitReturnAddr entries in the profiler buffer. +// Every return address (void*) is mapped to one or more JITFrameKeys, and +// every JITFrameKey is mapped to a JSON string for that frame. +// mRangeStart and mRangeEnd describe the range in the buffer for which this +// mapping is valid. Only JitReturnAddr entries within that buffer range can be +// processed using this JITFrameInfoForBufferRange object. +struct JITFrameInfoForBufferRange final { + JITFrameInfoForBufferRange Clone() const; + + uint64_t mRangeStart; + uint64_t mRangeEnd; // mRangeEnd marks the first invalid index. + + struct JITFrameKey { + bool operator==(const JITFrameKey& aOther) const { + return mCanonicalAddress == aOther.mCanonicalAddress && + mDepth == aOther.mDepth; + } + bool operator!=(const JITFrameKey& aOther) const { + return !(*this == aOther); + } + + void* mCanonicalAddress; + uint32_t mDepth; + }; + struct JITFrameKeyHasher { + using Lookup = JITFrameKey; + + static mozilla::HashNumber hash(const JITFrameKey& aLookup) { + mozilla::HashNumber hash = 0; + hash = mozilla::AddToHash(hash, aLookup.mCanonicalAddress); + hash = mozilla::AddToHash(hash, aLookup.mDepth); + return hash; + } + + static bool match(const JITFrameKey& aKey, const JITFrameKey& aLookup) { + return aKey == aLookup; + } + + static void rekey(JITFrameKey& aKey, const JITFrameKey& aNewKey) { + aKey = aNewKey; + } + }; + + using JITAddressToJITFramesMap = + mozilla::HashMap>; + JITAddressToJITFramesMap mJITAddressToJITFramesMap; + using JITFrameToFrameJSONMap = + mozilla::HashMap; + JITFrameToFrameJSONMap mJITFrameToFrameJSONMap; +}; + +// Contains JITFrameInfoForBufferRange objects for multiple profiler buffer +// ranges. 
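+// A hedged usage sketch (identifiers are illustrative only; see
+// AddInfoForRange below and ProfileBuffer::AddJITInfoForRange for the real
+// callers):
+//
+//   JITFrameInfo info;
+//   info.AddInfoForRange(rangeStart, rangeEnd, cx,
+//                        [&](const std::function<void(void*)>& aConsumer) {
+//                          for (void* jitAddr : jitReturnAddrsInRange) {
+//                            aConsumer(jitAddr);  // duplicates are allowed
+//                          }
+//                        });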
+class JITFrameInfo final { + public: + JITFrameInfo() + : mUniqueStrings(mozilla::MakeUniqueFallible( + mLocalFailureLatchSource)) { + if (!mUniqueStrings) { + mLocalFailureLatchSource.SetFailure( + "OOM in JITFrameInfo allocating mUniqueStrings"); + } + } + + MOZ_IMPLICIT JITFrameInfo(const JITFrameInfo& aOther, + mozilla::ProgressLogger aProgressLogger); + + // Creates a new JITFrameInfoForBufferRange object in mRanges by looking up + // information about the provided JIT return addresses using aCx. + // Addresses are provided like this: + // The caller of AddInfoForRange supplies a function in aJITAddressProvider. + // This function will be called once, synchronously, with an + // aJITAddressConsumer argument, which is a function that needs to be called + // for every address. That function can be called multiple times for the same + // address. + void AddInfoForRange( + uint64_t aRangeStart, uint64_t aRangeEnd, JSContext* aCx, + const std::function&)>& + aJITAddressProvider); + + // Returns whether the information stored in this object is still relevant + // for any entries in the buffer. + bool HasExpired(uint64_t aCurrentBufferRangeStart) const { + if (mRanges.empty()) { + // No information means no relevant information. Allow this object to be + // discarded. + return true; + } + return mRanges.back().mRangeEnd <= aCurrentBufferRangeStart; + } + + mozilla::FailureLatch& LocalFailureLatchSource() { + return mLocalFailureLatchSource; + } + + // The encapsulated data points at the local FailureLatch, so on the way out + // they must be given a new external FailureLatch to start using instead. + mozilla::Vector&& MoveRangesWithNewFailureLatch( + mozilla::FailureLatch& aFailureLatch) &&; + mozilla::UniquePtr&& MoveUniqueStringsWithNewFailureLatch( + mozilla::FailureLatch& aFailureLatch) &&; + + private: + // JITFrameInfo's may exist during profiling, so it carries its own fallible + // FailureLatch. If&when the data below is finally extracted, any error is + // forwarded to the caller. + mozilla::FailureLatchSource mLocalFailureLatchSource; + + // The array of ranges of JIT frame information, sorted by buffer position. + // Ranges are non-overlapping. + // The JSON of the cached frames can contain string indexes, which refer + // to strings in mUniqueStrings. + mozilla::Vector mRanges; + + // The string table which contains strings used in the frame JSON that's + // cached in mRanges. 
+ mozilla::UniquePtr mUniqueStrings; +}; + +class UniqueStacks final : public mozilla::FailureLatch { + public: + struct FrameKey { + explicit FrameKey(const char* aLocation) + : mData(NormalFrameData{nsCString(aLocation), false, false, 0, + mozilla::Nothing(), mozilla::Nothing()}) {} + + FrameKey(nsCString&& aLocation, bool aRelevantForJS, bool aBaselineInterp, + uint64_t aInnerWindowID, const mozilla::Maybe& aLine, + const mozilla::Maybe& aColumn, + const mozilla::Maybe& aCategoryPair) + : mData(NormalFrameData{aLocation, aRelevantForJS, aBaselineInterp, + aInnerWindowID, aLine, aColumn, + aCategoryPair}) {} + + FrameKey(void* aJITAddress, uint32_t aJITDepth, uint32_t aRangeIndex) + : mData(JITFrameData{aJITAddress, aJITDepth, aRangeIndex}) {} + + FrameKey(const FrameKey& aToCopy) = default; + + uint32_t Hash() const; + bool operator==(const FrameKey& aOther) const { + return mData == aOther.mData; + } + + struct NormalFrameData { + bool operator==(const NormalFrameData& aOther) const; + + nsCString mLocation; + bool mRelevantForJS; + bool mBaselineInterp; + uint64_t mInnerWindowID; + mozilla::Maybe mLine; + mozilla::Maybe mColumn; + mozilla::Maybe mCategoryPair; + }; + struct JITFrameData { + bool operator==(const JITFrameData& aOther) const; + + void* mCanonicalAddress; + uint32_t mDepth; + uint32_t mRangeIndex; + }; + mozilla::Variant mData; + }; + + struct FrameKeyHasher { + using Lookup = FrameKey; + + static mozilla::HashNumber hash(const FrameKey& aLookup) { + mozilla::HashNumber hash = 0; + if (aLookup.mData.is()) { + const FrameKey::NormalFrameData& data = + aLookup.mData.as(); + if (!data.mLocation.IsEmpty()) { + hash = mozilla::AddToHash(hash, + mozilla::HashString(data.mLocation.get())); + } + hash = mozilla::AddToHash(hash, data.mRelevantForJS); + hash = mozilla::AddToHash(hash, data.mBaselineInterp); + hash = mozilla::AddToHash(hash, data.mInnerWindowID); + if (data.mLine.isSome()) { + hash = mozilla::AddToHash(hash, *data.mLine); + } + if (data.mColumn.isSome()) { + hash = mozilla::AddToHash(hash, *data.mColumn); + } + if (data.mCategoryPair.isSome()) { + hash = mozilla::AddToHash(hash, + static_cast(*data.mCategoryPair)); + } + } else { + const FrameKey::JITFrameData& data = + aLookup.mData.as(); + hash = mozilla::AddToHash(hash, data.mCanonicalAddress); + hash = mozilla::AddToHash(hash, data.mDepth); + hash = mozilla::AddToHash(hash, data.mRangeIndex); + } + return hash; + } + + static bool match(const FrameKey& aKey, const FrameKey& aLookup) { + return aKey == aLookup; + } + + static void rekey(FrameKey& aKey, const FrameKey& aNewKey) { + aKey = aNewKey; + } + }; + + struct StackKey { + mozilla::Maybe mPrefixStackIndex; + uint32_t mFrameIndex; + + explicit StackKey(uint32_t aFrame) + : mFrameIndex(aFrame), mHash(mozilla::HashGeneric(aFrame)) {} + + StackKey(const StackKey& aPrefix, uint32_t aPrefixStackIndex, + uint32_t aFrame) + : mPrefixStackIndex(mozilla::Some(aPrefixStackIndex)), + mFrameIndex(aFrame), + mHash(mozilla::AddToHash(aPrefix.mHash, aFrame)) {} + + mozilla::HashNumber Hash() const { return mHash; } + + bool operator==(const StackKey& aOther) const { + return mPrefixStackIndex == aOther.mPrefixStackIndex && + mFrameIndex == aOther.mFrameIndex; + } + + private: + mozilla::HashNumber mHash; + }; + + struct StackKeyHasher { + using Lookup = StackKey; + + static mozilla::HashNumber hash(const StackKey& aLookup) { + return aLookup.Hash(); + } + + static bool match(const StackKey& aKey, const StackKey& aLookup) { + return aKey == aLookup; + } + + static void 
rekey(StackKey& aKey, const StackKey& aNewKey) { + aKey = aNewKey; + } + }; + + UniqueStacks(mozilla::FailureLatch& aFailureLatch, + JITFrameInfo&& aJITFrameInfo, + ProfilerCodeAddressService* aCodeAddressService = nullptr); + + // Return a StackKey for aFrame as the stack's root frame (no prefix). + [[nodiscard]] mozilla::Maybe BeginStack(const FrameKey& aFrame); + + // Return a new StackKey that is obtained by appending aFrame to aStack. + [[nodiscard]] mozilla::Maybe AppendFrame(const StackKey& aStack, + const FrameKey& aFrame); + + // Look up frame keys for the given JIT address, and ensure that our frame + // table has entries for the returned frame keys. The JSON for these frames + // is taken from mJITInfoRanges. + // aBufferPosition is needed in order to look up the correct JIT frame info + // object in mJITInfoRanges. + [[nodiscard]] mozilla::Maybe> + LookupFramesForJITAddressFromBufferPos(void* aJITAddress, + uint64_t aBufferPosition); + + [[nodiscard]] mozilla::Maybe GetOrAddFrameIndex( + const FrameKey& aFrame); + [[nodiscard]] mozilla::Maybe GetOrAddStackIndex( + const StackKey& aStack); + + void SpliceFrameTableElements(SpliceableJSONWriter& aWriter); + void SpliceStackTableElements(SpliceableJSONWriter& aWriter); + + [[nodiscard]] UniqueJSONStrings& UniqueStrings() { + MOZ_RELEASE_ASSERT(mUniqueStrings.get()); + return *mUniqueStrings; + } + + // Find the function name at the given PC (if a ProfilerCodeAddressService was + // provided), otherwise just stringify that PC. + [[nodiscard]] nsAutoCString FunctionNameOrAddress(void* aPC); + + FAILURELATCH_IMPL_PROXY(mFrameTableWriter) + + private: + void StreamNonJITFrame(const FrameKey& aFrame); + void StreamStack(const StackKey& aStack); + + mozilla::UniquePtr mUniqueStrings; + + ProfilerCodeAddressService* mCodeAddressService = nullptr; + + SpliceableChunkedJSONWriter mFrameTableWriter; + mozilla::HashMap mFrameToIndexMap; + + SpliceableChunkedJSONWriter mStackTableWriter; + mozilla::HashMap mStackToIndexMap; + + mozilla::Vector mJITInfoRanges; +}; + +// +// Thread profile JSON Format +// -------------------------- +// +// The profile contains much duplicate information. The output JSON of the +// profile attempts to deduplicate strings, frames, and stack prefixes, to cut +// down on size and to increase JSON streaming speed. Deduplicated values are +// streamed as indices into their respective tables. +// +// Further, arrays of objects with the same set of properties (e.g., samples, +// frames) are output as arrays according to a schema instead of an object +// with property names. A property that is not present is represented in the +// array as null or undefined. 
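As a concrete illustration of the schema/data encoding described above, a reader can expand one data row back into a property map by pairing the schema's name-to-column mapping with the row's values. A minimal sketch, using standard containers and assuming purely numeric columns for simplicity (real rows also hold string-table indices and arbitrary JSON):

#include <map>
#include <optional>
#include <string>
#include <vector>

// Property name -> column index, e.g. {"stack": 0, "time": 1, "eventDelay": 2}.
using Schema = std::map<std::string, size_t>;
// One entry of "data"; an absent or null column simply stays unset.
using Row = std::vector<std::optional<double>>;

std::map<std::string, double> ExpandRow(const Schema& aSchema, const Row& aRow) {
  std::map<std::string, double> object;
  for (const auto& [property, column] : aSchema) {
    if (column < aRow.size() && aRow[column]) {
      object[property] = *aRow[column];
    }
  }
  return object;
}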
+// +// The format of the thread profile JSON is shown by the following example +// with 1 sample and 1 marker: +// +// { +// "name": "Foo", +// "tid": 42, +// "samples": +// { +// "schema": +// { +// "stack": 0, /* index into stackTable */ +// "time": 1, /* number */ +// "eventDelay": 2, /* number */ +// "ThreadCPUDelta": 3, /* optional number */ +// }, +// "data": +// [ +// [ 1, 0.0, 0.0 ] /* { stack: 1, time: 0.0, eventDelay: 0.0 } */ +// ] +// }, +// +// "markers": +// { +// "schema": +// { +// "name": 0, /* index into stringTable */ +// "time": 1, /* number */ +// "data": 2 /* arbitrary JSON */ +// }, +// "data": +// [ +// [ 3, 0.1 ] /* { name: 'example marker', time: 0.1 } */ +// ] +// }, +// +// "stackTable": +// { +// "schema": +// { +// "prefix": 0, /* index into stackTable */ +// "frame": 1 /* index into frameTable */ +// }, +// "data": +// [ +// [ null, 0 ], /* (root) */ +// [ 0, 1 ] /* (root) > foo.js */ +// ] +// }, +// +// "frameTable": +// { +// "schema": +// { +// "location": 0, /* index into stringTable */ +// "relevantForJS": 1, /* bool */ +// "innerWindowID": 2, /* inner window ID of global JS `window` object */ +// "implementation": 3, /* index into stringTable */ +// "line": 4, /* number */ +// "column": 5, /* number */ +// "category": 6, /* index into profile.meta.categories */ +// "subcategory": 7 /* index into +// profile.meta.categories[category].subcategories */ +// }, +// "data": +// [ +// [ 0 ], /* { location: '(root)' } */ +// [ 1, null, null, 2 ] /* { location: 'foo.js', +// implementation: 'baseline' } */ +// ] +// }, +// +// "stringTable": +// [ +// "(root)", +// "foo.js", +// "baseline", +// "example marker" +// ] +// } +// +// Process: +// { +// "name": "Bar", +// "pid": 24, +// "threads": +// [ +// <0-N threads from above> +// ], +// "counters": /* includes the memory counter */ +// [ +// { +// "name": "qwerty", +// "category": "uiop", +// "description": "this is qwerty uiop", +// "sample_groups: +// [ +// { +// "id": 42, /* number (thread id, or object identifier (tab), etc) */ +// "samples: +// { +// "schema": +// { +// "time": 1, /* number */ +// "number": 2, /* number (of times the counter was touched) */ +// "count": 3 /* number (total for the counter) */ +// }, +// "data": +// [ +// [ 0.1, 1824, +// 454622 ] /* { time: 0.1, number: 1824, count: 454622 } */ +// ] +// }, +// }, +// /* more sample-group objects with different id's */ +// ] +// }, +// /* more counters */ +// ], +// } +// +#endif /* ndef ProfileBufferEntry_h */ diff --git a/tools/profiler/core/ProfiledThreadData.cpp b/tools/profiler/core/ProfiledThreadData.cpp new file mode 100644 index 0000000000..febda0d85b --- /dev/null +++ b/tools/profiler/core/ProfiledThreadData.cpp @@ -0,0 +1,455 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "ProfiledThreadData.h" + +#include "platform.h" +#include "ProfileBuffer.h" + +#include "mozilla/OriginAttributes.h" +#include "mozilla/Span.h" +#include "nsXULAppAPI.h" + +#if defined(GP_OS_darwin) +# include +#endif + +using namespace mozilla::literals::ProportionValue_literals; + +ProfiledThreadData::ProfiledThreadData( + const mozilla::profiler::ThreadRegistrationInfo& aThreadInfo) + : mThreadInfo(aThreadInfo.Name(), aThreadInfo.ThreadId(), + aThreadInfo.IsMainThread(), aThreadInfo.RegisterTime()) { + MOZ_COUNT_CTOR(ProfiledThreadData); +} + +ProfiledThreadData::ProfiledThreadData( + mozilla::profiler::ThreadRegistrationInfo&& aThreadInfo) + : mThreadInfo(std::move(aThreadInfo)) { + MOZ_COUNT_CTOR(ProfiledThreadData); +} + +ProfiledThreadData::~ProfiledThreadData() { + MOZ_COUNT_DTOR(ProfiledThreadData); +} + +static void StreamTables(UniqueStacks&& aUniqueStacks, JSContext* aCx, + SpliceableJSONWriter& aWriter, + const mozilla::TimeStamp& aProcessStartTime, + mozilla::ProgressLogger aProgressLogger) { + aWriter.StartObjectProperty("stackTable"); + { + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("prefix"); + schema.WriteField("frame"); + } + + aWriter.StartArrayProperty("data"); + { + aProgressLogger.SetLocalProgress(1_pc, "Splicing stack table..."); + aUniqueStacks.SpliceStackTableElements(aWriter); + aProgressLogger.SetLocalProgress(30_pc, "Spliced stack table"); + } + aWriter.EndArray(); + } + aWriter.EndObject(); + + aWriter.StartObjectProperty("frameTable"); + { + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("location"); + schema.WriteField("relevantForJS"); + schema.WriteField("innerWindowID"); + schema.WriteField("implementation"); + schema.WriteField("line"); + schema.WriteField("column"); + schema.WriteField("category"); + schema.WriteField("subcategory"); + } + + aWriter.StartArrayProperty("data"); + { + aProgressLogger.SetLocalProgress(30_pc, "Splicing frame table..."); + aUniqueStacks.SpliceFrameTableElements(aWriter); + aProgressLogger.SetLocalProgress(60_pc, "Spliced frame table"); + } + aWriter.EndArray(); + } + aWriter.EndObject(); + + aWriter.StartArrayProperty("stringTable"); + { + aProgressLogger.SetLocalProgress(60_pc, "Splicing string table..."); + std::move(aUniqueStacks.UniqueStrings()).SpliceStringTableElements(aWriter); + aProgressLogger.SetLocalProgress(90_pc, "Spliced string table"); + } + aWriter.EndArray(); +} + +mozilla::NotNull> +ProfiledThreadData::PrepareUniqueStacks( + const ProfileBuffer& aBuffer, JSContext* aCx, + mozilla::FailureLatch& aFailureLatch, ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + if (mJITFrameInfoForPreviousJSContexts && + mJITFrameInfoForPreviousJSContexts->HasExpired( + aBuffer.BufferRangeStart())) { + mJITFrameInfoForPreviousJSContexts = nullptr; + } + aProgressLogger.SetLocalProgress(1_pc, "Checked JIT frame info presence"); + + // If we have an existing JITFrameInfo in mJITFrameInfoForPreviousJSContexts, + // copy the data from it. + JITFrameInfo jitFrameInfo = + mJITFrameInfoForPreviousJSContexts + ? 
JITFrameInfo(*mJITFrameInfoForPreviousJSContexts, + aProgressLogger.CreateSubLoggerTo( + "Retrieving JIT frame info...", 10_pc, + "Retrieved JIT frame info")) + : JITFrameInfo(); + + if (aCx && mBufferPositionWhenReceivedJSContext) { + aBuffer.AddJITInfoForRange( + *mBufferPositionWhenReceivedJSContext, mThreadInfo.ThreadId(), aCx, + jitFrameInfo, + aProgressLogger.CreateSubLoggerTo("Adding JIT info...", 90_pc, + "Added JIT info")); + } else { + aProgressLogger.SetLocalProgress(90_pc, "No JIT info"); + } + + return mozilla::MakeNotNull>( + aFailureLatch, std::move(jitFrameInfo), aService); +} + +void ProfiledThreadData::StreamJSON( + const ProfileBuffer& aBuffer, JSContext* aCx, SpliceableJSONWriter& aWriter, + const nsACString& aProcessName, const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, double aSinceTime, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + mozilla::NotNull> uniqueStacks = + PrepareUniqueStacks(aBuffer, aCx, aWriter.SourceFailureLatch(), aService, + aProgressLogger.CreateSubLoggerFromTo( + 0_pc, "Preparing unique stacks...", 10_pc, + "Prepared Unique stacks")); + + aWriter.SetUniqueStrings(uniqueStacks->UniqueStrings()); + + aWriter.Start(); + { + StreamSamplesAndMarkers( + mThreadInfo.Name(), mThreadInfo.ThreadId(), aBuffer, aWriter, + aProcessName, aETLDplus1, aProcessStartTime, mThreadInfo.RegisterTime(), + mUnregisterTime, aSinceTime, *uniqueStacks, + aProgressLogger.CreateSubLoggerTo( + 90_pc, + "ProfiledThreadData::StreamJSON: Streamed samples and markers")); + + StreamTables(std::move(*uniqueStacks), aCx, aWriter, aProcessStartTime, + aProgressLogger.CreateSubLoggerTo( + 99_pc, "Streamed tables and trace logger")); + } + aWriter.End(); + + aWriter.ResetUniqueStrings(); +} + +void ProfiledThreadData::StreamJSON( + ThreadStreamingContext&& aThreadStreamingContext, + SpliceableJSONWriter& aWriter, const nsACString& aProcessName, + const nsACString& aETLDplus1, const mozilla::TimeStamp& aProcessStartTime, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + aWriter.Start(); + { + StreamSamplesAndMarkers( + mThreadInfo.Name(), aThreadStreamingContext, aWriter, aProcessName, + aETLDplus1, aProcessStartTime, mThreadInfo.RegisterTime(), + mUnregisterTime, + aProgressLogger.CreateSubLoggerFromTo( + 1_pc, "ProfiledThreadData::StreamJSON(context): Streaming...", + 90_pc, + "ProfiledThreadData::StreamJSON(context): Streamed samples and " + "markers")); + + StreamTables( + std::move(*aThreadStreamingContext.mUniqueStacks), + aThreadStreamingContext.mJSContext, aWriter, aProcessStartTime, + aProgressLogger.CreateSubLoggerTo( + "ProfiledThreadData::StreamJSON(context): Streaming tables...", + 99_pc, "ProfiledThreadData::StreamJSON(context): Streamed tables")); + } + aWriter.End(); +} + +// StreamSamplesDataCallback: (ProgressLogger) -> ProfilerThreadId +// StreamMarkersDataCallback: (ProgressLogger) -> void +// Returns the ProfilerThreadId returned by StreamSamplesDataCallback, which +// should be the thread id of the last sample that was processed (if any; +// otherwise it is left unspecified). This is mostly useful when the caller +// doesn't know where the sample comes from, e.g., when it's a backtrace in a +// marker. 
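The callback contract above can be summarized with a small sketch: the samples callback runs first and reports the thread id it actually streamed, and the markers callback runs second with no return value. Simplified stand-ins are assumed here for ProgressLogger, the JSON writer, and ProfilerThreadId:

#include <cstdint>
#include <utility>

using FakeThreadId = uint64_t;  // stand-in for ProfilerThreadId

template <typename StreamSamplesDataCallback, typename StreamMarkersDataCallback>
FakeThreadId DoStreamSketch(StreamSamplesDataCallback&& aStreamSamples,
                            StreamMarkersDataCallback&& aStreamMarkers) {
  // Samples first; the callback reports which thread the samples came from.
  FakeThreadId processedThreadId =
      std::forward<StreamSamplesDataCallback>(aStreamSamples)();
  // Markers second; nothing needs to be returned.
  std::forward<StreamMarkersDataCallback>(aStreamMarkers)();
  return processedThreadId;
}

// Usage: DoStreamSketch([] { return FakeThreadId{42}; }, [] {});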
+template +ProfilerThreadId DoStreamSamplesAndMarkers( + const char* aName, SpliceableJSONWriter& aWriter, + const nsACString& aProcessName, const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, + const mozilla::TimeStamp& aRegisterTime, + const mozilla::TimeStamp& aUnregisterTime, + mozilla::ProgressLogger aProgressLogger, + StreamSamplesDataCallback&& aStreamSamplesDataCallback, + StreamMarkersDataCallback&& aStreamMarkersDataCallback) { + ProfilerThreadId processedThreadId; + + aWriter.StringProperty("processType", + mozilla::MakeStringSpan(XRE_GetProcessTypeString())); + + aWriter.StringProperty("name", mozilla::MakeStringSpan(aName)); + + // Use given process name (if any), unless we're the parent process. + if (XRE_IsParentProcess()) { + aWriter.StringProperty("processName", "Parent Process"); + } else if (!aProcessName.IsEmpty()) { + aWriter.StringProperty("processName", aProcessName); + } + if (!aETLDplus1.IsEmpty()) { + nsAutoCString originNoSuffix; + mozilla::OriginAttributes attrs; + if (!attrs.PopulateFromOrigin(aETLDplus1, originNoSuffix)) { + aWriter.StringProperty("eTLD+1", aETLDplus1); + } else { + aWriter.StringProperty("eTLD+1", originNoSuffix); + aWriter.BoolProperty("isPrivateBrowsing", attrs.mPrivateBrowsingId > 0); + aWriter.IntProperty("userContextId", attrs.mUserContextId); + } + } + + if (aRegisterTime) { + aWriter.DoubleProperty( + "registerTime", (aRegisterTime - aProcessStartTime).ToMilliseconds()); + } else { + aWriter.NullProperty("registerTime"); + } + + if (aUnregisterTime) { + aWriter.DoubleProperty( + "unregisterTime", + (aUnregisterTime - aProcessStartTime).ToMilliseconds()); + } else { + aWriter.NullProperty("unregisterTime"); + } + + aWriter.StartObjectProperty("samples"); + { + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("stack"); + schema.WriteField("time"); + schema.WriteField("eventDelay"); +#define RUNNING_TIME_FIELD(index, name, unit, jsonProperty) \ + schema.WriteField(#jsonProperty); + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_FIELD) +#undef RUNNING_TIME_FIELD + } + + aWriter.StartArrayProperty("data"); + { + processedThreadId = std::forward( + aStreamSamplesDataCallback)(aProgressLogger.CreateSubLoggerFromTo( + 1_pc, "Streaming samples...", 49_pc, "Streamed samples")); + } + aWriter.EndArray(); + } + aWriter.EndObject(); + + aWriter.StartObjectProperty("markers"); + { + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("name"); + schema.WriteField("startTime"); + schema.WriteField("endTime"); + schema.WriteField("phase"); + schema.WriteField("category"); + schema.WriteField("data"); + } + + aWriter.StartArrayProperty("data"); + { + std::forward(aStreamMarkersDataCallback)( + aProgressLogger.CreateSubLoggerFromTo(50_pc, "Streaming markers...", + 99_pc, "Streamed markers")); + } + aWriter.EndArray(); + } + aWriter.EndObject(); + + // Tech note: If `ToNumber()` returns a uint64_t, the conversion to int64_t is + // "implementation-defined" before C++20. This is acceptable here, because + // this is a one-way conversion to a unique identifier that's used to visually + // separate data by thread on the front-end. 
+ aWriter.IntProperty( + "pid", static_cast(profiler_current_process_id().ToNumber())); + aWriter.IntProperty("tid", + static_cast(processedThreadId.ToNumber())); + + return processedThreadId; +} + +ProfilerThreadId StreamSamplesAndMarkers( + const char* aName, ProfilerThreadId aThreadId, const ProfileBuffer& aBuffer, + SpliceableJSONWriter& aWriter, const nsACString& aProcessName, + const nsACString& aETLDplus1, const mozilla::TimeStamp& aProcessStartTime, + const mozilla::TimeStamp& aRegisterTime, + const mozilla::TimeStamp& aUnregisterTime, double aSinceTime, + UniqueStacks& aUniqueStacks, mozilla::ProgressLogger aProgressLogger) { + return DoStreamSamplesAndMarkers( + aName, aWriter, aProcessName, aETLDplus1, aProcessStartTime, + aRegisterTime, aUnregisterTime, std::move(aProgressLogger), + [&](mozilla::ProgressLogger aSubProgressLogger) { + ProfilerThreadId processedThreadId = aBuffer.StreamSamplesToJSON( + aWriter, aThreadId, aSinceTime, aUniqueStacks, + std::move(aSubProgressLogger)); + return aThreadId.IsSpecified() ? aThreadId : processedThreadId; + }, + [&](mozilla::ProgressLogger aSubProgressLogger) { + aBuffer.StreamMarkersToJSON(aWriter, aThreadId, aProcessStartTime, + aSinceTime, aUniqueStacks, + std::move(aSubProgressLogger)); + }); +} + +void StreamSamplesAndMarkers(const char* aName, + ThreadStreamingContext& aThreadData, + SpliceableJSONWriter& aWriter, + const nsACString& aProcessName, + const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, + const mozilla::TimeStamp& aRegisterTime, + const mozilla::TimeStamp& aUnregisterTime, + mozilla::ProgressLogger aProgressLogger) { + (void)DoStreamSamplesAndMarkers( + aName, aWriter, aProcessName, aETLDplus1, aProcessStartTime, + aRegisterTime, aUnregisterTime, std::move(aProgressLogger), + [&](mozilla::ProgressLogger aSubProgressLogger) { + aWriter.TakeAndSplice( + aThreadData.mSamplesDataWriter.TakeChunkedWriteFunc()); + return aThreadData.mProfiledThreadData.Info().ThreadId(); + }, + [&](mozilla::ProgressLogger aSubProgressLogger) { + aWriter.TakeAndSplice( + aThreadData.mMarkersDataWriter.TakeChunkedWriteFunc()); + }); +} + +void ProfiledThreadData::NotifyAboutToLoseJSContext( + JSContext* aContext, const mozilla::TimeStamp& aProcessStartTime, + ProfileBuffer& aBuffer) { + if (!mBufferPositionWhenReceivedJSContext) { + return; + } + + MOZ_RELEASE_ASSERT(aContext); + + if (mJITFrameInfoForPreviousJSContexts && + mJITFrameInfoForPreviousJSContexts->HasExpired( + aBuffer.BufferRangeStart())) { + mJITFrameInfoForPreviousJSContexts = nullptr; + } + + mozilla::UniquePtr jitFrameInfo = + mJITFrameInfoForPreviousJSContexts + ? 
std::move(mJITFrameInfoForPreviousJSContexts) + : mozilla::MakeUnique(); + + aBuffer.AddJITInfoForRange(*mBufferPositionWhenReceivedJSContext, + mThreadInfo.ThreadId(), aContext, *jitFrameInfo, + mozilla::ProgressLogger{}); + + mJITFrameInfoForPreviousJSContexts = std::move(jitFrameInfo); + mBufferPositionWhenReceivedJSContext = mozilla::Nothing(); +} + +ThreadStreamingContext::ThreadStreamingContext( + ProfiledThreadData& aProfiledThreadData, const ProfileBuffer& aBuffer, + JSContext* aCx, mozilla::FailureLatch& aFailureLatch, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) + : mProfiledThreadData(aProfiledThreadData), + mJSContext(aCx), + mSamplesDataWriter(aFailureLatch), + mMarkersDataWriter(aFailureLatch), + mUniqueStacks(mProfiledThreadData.PrepareUniqueStacks( + aBuffer, aCx, aFailureLatch, aService, + aProgressLogger.CreateSubLoggerFromTo( + 0_pc, "Preparing thread streaming context unique stacks...", + 99_pc, "Prepared thread streaming context Unique stacks"))) { + if (aFailureLatch.Failed()) { + return; + } + mSamplesDataWriter.SetUniqueStrings(mUniqueStacks->UniqueStrings()); + mSamplesDataWriter.StartBareList(); + mMarkersDataWriter.SetUniqueStrings(mUniqueStacks->UniqueStrings()); + mMarkersDataWriter.StartBareList(); +} + +void ThreadStreamingContext::FinalizeWriter() { + mSamplesDataWriter.EndBareList(); + mMarkersDataWriter.EndBareList(); +} + +ProcessStreamingContext::ProcessStreamingContext( + size_t aThreadCount, mozilla::FailureLatch& aFailureLatch, + const mozilla::TimeStamp& aProcessStartTime, double aSinceTime) + : mFailureLatch(aFailureLatch), + mProcessStartTime(aProcessStartTime), + mSinceTime(aSinceTime) { + if (mFailureLatch.Failed()) { + return; + } + if (!mTIDList.initCapacity(aThreadCount)) { + mFailureLatch.SetFailure( + "OOM in ProcessStreamingContext allocating TID list"); + return; + } + if (!mThreadStreamingContextList.initCapacity(aThreadCount)) { + mFailureLatch.SetFailure( + "OOM in ProcessStreamingContext allocating context list"); + mTIDList.clear(); + return; + } +} + +ProcessStreamingContext::~ProcessStreamingContext() { + if (mFailureLatch.Failed()) { + return; + } + MOZ_ASSERT(mTIDList.length() == mThreadStreamingContextList.length()); + MOZ_ASSERT(mTIDList.length() == mTIDList.capacity(), + "Didn't pre-allocate exactly right"); +} + +void ProcessStreamingContext::AddThreadStreamingContext( + ProfiledThreadData& aProfiledThreadData, const ProfileBuffer& aBuffer, + JSContext* aCx, ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + if (mFailureLatch.Failed()) { + return; + } + MOZ_ASSERT(mTIDList.length() == mThreadStreamingContextList.length()); + MOZ_ASSERT(mTIDList.length() < mTIDList.capacity(), + "Didn't pre-allocate enough"); + mTIDList.infallibleAppend(aProfiledThreadData.Info().ThreadId()); + mThreadStreamingContextList.infallibleEmplaceBack( + aProfiledThreadData, aBuffer, aCx, mFailureLatch, aService, + aProgressLogger.CreateSubLoggerFromTo( + 1_pc, "Prepared streaming thread id", 100_pc, + "Added thread streaming context")); +} diff --git a/tools/profiler/core/ProfiledThreadData.h b/tools/profiler/core/ProfiledThreadData.h new file mode 100644 index 0000000000..47ae0c579c --- /dev/null +++ b/tools/profiler/core/ProfiledThreadData.h @@ -0,0 +1,250 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef ProfiledThreadData_h +#define ProfiledThreadData_h + +#include "platform.h" +#include "ProfileBuffer.h" +#include "ProfileBufferEntry.h" + +#include "mozilla/FailureLatch.h" +#include "mozilla/Maybe.h" +#include "mozilla/NotNull.h" +#include "mozilla/ProfileJSONWriter.h" +#include "mozilla/ProfilerThreadRegistrationInfo.h" +#include "mozilla/RefPtr.h" +#include "mozilla/TimeStamp.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Vector.h" +#include "nsStringFwd.h" + +class nsIEventTarget; +class ProfilerCodeAddressService; +struct JSContext; +struct ThreadStreamingContext; + +// This class contains information about a thread that is only relevant while +// the profiler is running, for any threads (both alive and dead) whose thread +// name matches the "thread filter" in the current profiler run. +// ProfiledThreadData objects may be kept alive even after the thread is +// unregistered, as long as there is still data for that thread in the profiler +// buffer. +// +// Accesses to this class are protected by the profiler state lock. +// +// Created as soon as the following are true for the thread: +// - The profiler is running, and +// - the thread matches the profiler's thread filter, and +// - the thread is registered with the profiler. +// So it gets created in response to either (1) the profiler being started (for +// an existing registered thread) or (2) the thread being registered (if the +// profiler is already running). +// +// The thread may be unregistered during the lifetime of ProfiledThreadData. +// If that happens, NotifyUnregistered() is called. +// +// This class is the right place to store buffer positions. Profiler buffer +// positions become invalid if the profiler buffer is destroyed, which happens +// when the profiler is stopped. 
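The buffer-position rule in the comment above follows a simple expiry test: a remembered position is only meaningful while the buffer's valid range still covers it. A minimal sketch of that test, with a plain integer standing in for the profiler's 64-bit buffer positions (an assumption for illustration):

#include <cstdint>
#include <optional>

struct RememberedBufferPosition {
  std::optional<uint64_t> mPosition;

  // True once the buffer's range start has advanced past the stored position
  // (or when nothing is stored), meaning the referenced data has been evicted
  // and this object may be discarded.
  bool HasExpired(uint64_t aCurrentBufferRangeStart) const {
    return !mPosition || *mPosition < aCurrentBufferRangeStart;
  }
};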
+class ProfiledThreadData final { + public: + explicit ProfiledThreadData( + const mozilla::profiler::ThreadRegistrationInfo& aThreadInfo); + explicit ProfiledThreadData( + mozilla::profiler::ThreadRegistrationInfo&& aThreadInfo); + ~ProfiledThreadData(); + + void NotifyUnregistered(uint64_t aBufferPosition) { + mLastSample = mozilla::Nothing(); + MOZ_ASSERT(!mBufferPositionWhenReceivedJSContext, + "JSContext should have been cleared before the thread was " + "unregistered"); + mUnregisterTime = mozilla::TimeStamp::Now(); + mBufferPositionWhenUnregistered = mozilla::Some(aBufferPosition); + mPreviousThreadRunningTimes.Clear(); + } + mozilla::Maybe BufferPositionWhenUnregistered() { + return mBufferPositionWhenUnregistered; + } + + mozilla::Maybe& LastSample() { return mLastSample; } + + mozilla::NotNull> PrepareUniqueStacks( + const ProfileBuffer& aBuffer, JSContext* aCx, + mozilla::FailureLatch& aFailureLatch, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger); + + void StreamJSON(const ProfileBuffer& aBuffer, JSContext* aCx, + SpliceableJSONWriter& aWriter, const nsACString& aProcessName, + const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, + double aSinceTime, ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger); + void StreamJSON(ThreadStreamingContext&& aThreadStreamingContext, + SpliceableJSONWriter& aWriter, const nsACString& aProcessName, + const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger); + + const mozilla::profiler::ThreadRegistrationInfo& Info() const { + return mThreadInfo; + } + + void NotifyReceivedJSContext(uint64_t aCurrentBufferPosition) { + mBufferPositionWhenReceivedJSContext = + mozilla::Some(aCurrentBufferPosition); + } + + // Call this method when the JS entries inside the buffer are about to + // become invalid, i.e., just before JS shutdown. + void NotifyAboutToLoseJSContext(JSContext* aCx, + const mozilla::TimeStamp& aProcessStartTime, + ProfileBuffer& aBuffer); + + RunningTimes& PreviousThreadRunningTimesRef() { + return mPreviousThreadRunningTimes; + } + + private: + // Group A: + // The following fields are interesting for the entire lifetime of a + // ProfiledThreadData object. + + // This thread's thread info. Local copy because the one in ThreadRegistration + // may be destroyed while ProfiledThreadData stays alive. + const mozilla::profiler::ThreadRegistrationInfo mThreadInfo; + + // Contains JSON for JIT frames from any JSContexts that were used for this + // thread in the past. + // Null if this thread has never lost a JSContext or if all samples from + // previous JSContexts have been evicted from the profiler buffer. + mozilla::UniquePtr mJITFrameInfoForPreviousJSContexts; + + // Group B: + // The following fields are only used while this thread is alive and + // registered. They become Nothing() or empty once the thread is unregistered. + + // When sampling, this holds the position in ActivePS::mBuffer of the most + // recent sample for this thread, or Nothing() if there is no sample for this + // thread in the buffer. + mozilla::Maybe mLastSample; + + // Only non-Nothing() if the thread currently has a JSContext. + mozilla::Maybe mBufferPositionWhenReceivedJSContext; + + // RunningTimes at the previous sample if any, or empty. 
+ RunningTimes mPreviousThreadRunningTimes; + + // Group C: + // The following fields are only used once this thread has been unregistered. + + mozilla::Maybe mBufferPositionWhenUnregistered; + mozilla::TimeStamp mUnregisterTime; +}; + +// This class will be used when outputting the profile data for one thread. +struct ThreadStreamingContext { + ProfiledThreadData& mProfiledThreadData; + JSContext* mJSContext; + SpliceableChunkedJSONWriter mSamplesDataWriter; + SpliceableChunkedJSONWriter mMarkersDataWriter; + mozilla::NotNull> mUniqueStacks; + + // These are updated when writing samples, and reused for "same-sample"s. + enum PreviousStackState { eNoStackYet, eStackWasNotEmpty, eStackWasEmpty }; + PreviousStackState mPreviousStackState = eNoStackYet; + uint32_t mPreviousStack = 0; + + ThreadStreamingContext(ProfiledThreadData& aProfiledThreadData, + const ProfileBuffer& aBuffer, JSContext* aCx, + mozilla::FailureLatch& aFailureLatch, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger); + + void FinalizeWriter(); +}; + +// This class will be used when outputting the profile data for all threads. +class ProcessStreamingContext final : public mozilla::FailureLatch { + public: + // Pre-allocate space for `aThreadCount` threads. + ProcessStreamingContext(size_t aThreadCount, + mozilla::FailureLatch& aFailureLatch, + const mozilla::TimeStamp& aProcessStartTime, + double aSinceTime); + + ~ProcessStreamingContext(); + + // Add the streaming context corresponding to each profiled thread. This + // should be called exactly the number of times specified in the constructor. + void AddThreadStreamingContext(ProfiledThreadData& aProfiledThreadData, + const ProfileBuffer& aBuffer, JSContext* aCx, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger); + + // Retrieve the ThreadStreamingContext for a given thread id. + // Returns null if that thread id doesn't correspond to any profiled thread. + ThreadStreamingContext* GetThreadStreamingContext( + const ProfilerThreadId& aThreadId) { + for (size_t i = 0; i < mTIDList.length(); ++i) { + if (mTIDList[i] == aThreadId) { + return &mThreadStreamingContextList[i]; + } + } + return nullptr; + } + + const mozilla::TimeStamp& ProcessStartTime() const { + return mProcessStartTime; + } + + double GetSinceTime() const { return mSinceTime; } + + ThreadStreamingContext* begin() { + return mThreadStreamingContextList.begin(); + }; + ThreadStreamingContext* end() { return mThreadStreamingContextList.end(); }; + + FAILURELATCH_IMPL_PROXY(mFailureLatch) + + private: + // Separate list of thread ids, it's much faster to do a linear search + // here than a vector of bigger items like mThreadStreamingContextList. + mozilla::Vector mTIDList; + // Contexts corresponding to the thread id at the same indexes. + mozilla::Vector mThreadStreamingContextList; + + mozilla::FailureLatch& mFailureLatch; + + const mozilla::TimeStamp mProcessStartTime; + + const double mSinceTime; +}; + +// Stream all samples and markers from aBuffer with the given aThreadId (or 0 +// for everything, which is assumed to be a single backtrace sample.) +// Returns the thread id of the output sample(s), or 0 if none was present. 
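The thread-id return value described above mirrors the selection made in the ProfileBuffer-based overload: a caller-specified id is echoed back, otherwise the id of the single streamed backtrace sample is used. A minimal sketch of that choice, with a plain struct standing in for ProfilerThreadId (an assumption for illustration):

#include <cstdint>

struct TinyThreadId {
  uint64_t mValue = 0;
  bool IsSpecified() const { return mValue != 0; }
};

TinyThreadId PickReportedThreadId(TinyThreadId aRequested,
                                  TinyThreadId aProcessed) {
  return aRequested.IsSpecified() ? aRequested : aProcessed;
}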
+ProfilerThreadId StreamSamplesAndMarkers( + const char* aName, ProfilerThreadId aThreadId, const ProfileBuffer& aBuffer, + SpliceableJSONWriter& aWriter, const nsACString& aProcessName, + const nsACString& aETLDplus1, const mozilla::TimeStamp& aProcessStartTime, + const mozilla::TimeStamp& aRegisterTime, + const mozilla::TimeStamp& aUnregisterTime, double aSinceTime, + UniqueStacks& aUniqueStacks, mozilla::ProgressLogger aProgressLogger); +void StreamSamplesAndMarkers(const char* aName, + ThreadStreamingContext& aThreadData, + SpliceableJSONWriter& aWriter, + const nsACString& aProcessName, + const nsACString& aETLDplus1, + const mozilla::TimeStamp& aProcessStartTime, + const mozilla::TimeStamp& aRegisterTime, + const mozilla::TimeStamp& aUnregisterTime, + mozilla::ProgressLogger aProgressLogger); + +#endif // ProfiledThreadData_h diff --git a/tools/profiler/core/ProfilerBacktrace.cpp b/tools/profiler/core/ProfilerBacktrace.cpp new file mode 100644 index 0000000000..a264d85d64 --- /dev/null +++ b/tools/profiler/core/ProfilerBacktrace.cpp @@ -0,0 +1,101 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "ProfilerBacktrace.h" + +#include "ProfileBuffer.h" +#include "ProfiledThreadData.h" + +#include "mozilla/ProfileJSONWriter.h" + +ProfilerBacktrace::ProfilerBacktrace( + const char* aName, + mozilla::UniquePtr + aProfileChunkedBufferStorage, + mozilla::UniquePtr + aProfileBufferStorageOrNull /* = nullptr */) + : mName(aName), + mOptionalProfileChunkedBufferStorage( + std::move(aProfileChunkedBufferStorage)), + mProfileChunkedBuffer(mOptionalProfileChunkedBufferStorage.get()), + mOptionalProfileBufferStorage(std::move(aProfileBufferStorageOrNull)), + mProfileBuffer(mOptionalProfileBufferStorage.get()) { + MOZ_COUNT_CTOR(ProfilerBacktrace); + if (mProfileBuffer) { + MOZ_RELEASE_ASSERT(mProfileChunkedBuffer, + "If we take ownership of a ProfileBuffer, we must also " + "receive ownership of a ProfileChunkedBuffer"); + MOZ_RELEASE_ASSERT( + mProfileChunkedBuffer == &mProfileBuffer->UnderlyingChunkedBuffer(), + "If we take ownership of a ProfileBuffer, we must also receive " + "ownership of its ProfileChunkedBuffer"); + } + MOZ_ASSERT( + !mProfileChunkedBuffer || !mProfileChunkedBuffer->IsThreadSafe(), + "ProfilerBacktrace only takes a non-thread-safe ProfileChunkedBuffer"); +} + +ProfilerBacktrace::ProfilerBacktrace( + const char* aName, + mozilla::ProfileChunkedBuffer* aExternalProfileChunkedBuffer, + ProfileBuffer* aExternalProfileBuffer) + : mName(aName), + mProfileChunkedBuffer(aExternalProfileChunkedBuffer), + mProfileBuffer(aExternalProfileBuffer) { + MOZ_COUNT_CTOR(ProfilerBacktrace); + if (!mProfileChunkedBuffer) { + if (mProfileBuffer) { + // We don't have a ProfileChunkedBuffer but we have a ProfileBuffer, use + // the latter's ProfileChunkedBuffer. 
+ mProfileChunkedBuffer = &mProfileBuffer->UnderlyingChunkedBuffer(); + MOZ_ASSERT(!mProfileChunkedBuffer->IsThreadSafe(), + "ProfilerBacktrace only takes a non-thread-safe " + "ProfileChunkedBuffer"); + } + } else { + if (mProfileBuffer) { + MOZ_RELEASE_ASSERT( + mProfileChunkedBuffer == &mProfileBuffer->UnderlyingChunkedBuffer(), + "If we reference both ProfileChunkedBuffer and ProfileBuffer, they " + "must already be connected"); + } + MOZ_ASSERT(!mProfileChunkedBuffer->IsThreadSafe(), + "ProfilerBacktrace only takes a non-thread-safe " + "ProfileChunkedBuffer"); + } +} + +ProfilerBacktrace::~ProfilerBacktrace() { MOZ_COUNT_DTOR(ProfilerBacktrace); } + +ProfilerThreadId ProfilerBacktrace::StreamJSON( + SpliceableJSONWriter& aWriter, const mozilla::TimeStamp& aProcessStartTime, + UniqueStacks& aUniqueStacks) { + ProfilerThreadId processedThreadId; + + // Unlike ProfiledThreadData::StreamJSON, we don't need to call + // ProfileBuffer::AddJITInfoForRange because ProfileBuffer does not contain + // any JitReturnAddr entries. For synchronous samples, JIT frames get expanded + // at sample time. + if (mProfileBuffer) { + processedThreadId = StreamSamplesAndMarkers( + mName.c_str(), ProfilerThreadId{}, *mProfileBuffer, aWriter, ""_ns, + ""_ns, aProcessStartTime, + /* aRegisterTime */ mozilla::TimeStamp(), + /* aUnregisterTime */ mozilla::TimeStamp(), + /* aSinceTime */ 0, aUniqueStacks, mozilla::ProgressLogger{}); + } else if (mProfileChunkedBuffer) { + ProfileBuffer profileBuffer(*mProfileChunkedBuffer); + processedThreadId = StreamSamplesAndMarkers( + mName.c_str(), ProfilerThreadId{}, profileBuffer, aWriter, ""_ns, ""_ns, + aProcessStartTime, + /* aRegisterTime */ mozilla::TimeStamp(), + /* aUnregisterTime */ mozilla::TimeStamp(), + /* aSinceTime */ 0, aUniqueStacks, mozilla::ProgressLogger{}); + } + // If there are no buffers, the backtrace is empty and nothing is streamed. + + return processedThreadId; +} diff --git a/tools/profiler/core/ProfilerBacktrace.h b/tools/profiler/core/ProfilerBacktrace.h new file mode 100644 index 0000000000..55811f4422 --- /dev/null +++ b/tools/profiler/core/ProfilerBacktrace.h @@ -0,0 +1,184 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef __PROFILER_BACKTRACE_H +#define __PROFILER_BACKTRACE_H + +#include "ProfileBuffer.h" + +#include "mozilla/ProfileBufferEntrySerialization.h" +#include "mozilla/UniquePtrExtensions.h" + +#include + +class ProfileBuffer; +class ProfilerCodeAddressService; +class ThreadInfo; +class UniqueStacks; + +namespace mozilla { +class ProfileChunkedBuffer; +class TimeStamp; +namespace baseprofiler { +class SpliceableJSONWriter; +} // namespace baseprofiler +} // namespace mozilla + +// ProfilerBacktrace encapsulates a synchronous sample. +// It can work with a ProfileBuffer and/or a ProfileChunkedBuffer (if both, they +// must already be linked together). The ProfileChunkedBuffer contains all the +// data; the ProfileBuffer is not strictly needed, only provide it if it is +// already available at the call site. 
+// And these buffers can either be: +// - owned here, so that the ProfilerBacktrace object can be kept for later +// use), OR +// - referenced through pointers (in cases where the backtrace is immediately +// streamed out, so we only need temporary references to external buffers); +// these pointers may be null for empty backtraces. +class ProfilerBacktrace { + public: + // Take ownership of external buffers and use them to keep, and to stream a + // backtrace. If a ProfileBuffer is given, its underlying chunked buffer must + // be provided as well. + explicit ProfilerBacktrace( + const char* aName, + mozilla::UniquePtr + aProfileChunkedBufferStorage, + mozilla::UniquePtr aProfileBufferStorageOrNull = nullptr); + + // Take pointers to external buffers and use them to stream a backtrace. + // If null, the backtrace is effectively empty. + // If both are provided, they must already be connected. + explicit ProfilerBacktrace( + const char* aName, + mozilla::ProfileChunkedBuffer* aExternalProfileChunkedBufferOrNull = + nullptr, + ProfileBuffer* aExternalProfileBufferOrNull = nullptr); + + ~ProfilerBacktrace(); + + [[nodiscard]] bool IsEmpty() const { + return !mProfileChunkedBuffer || + mozilla::ProfileBufferEntryWriter::Serializer< + mozilla::ProfileChunkedBuffer>::Bytes(*mProfileChunkedBuffer) <= + mozilla::ULEB128Size(0u); + } + + // ProfilerBacktraces' stacks are deduplicated in the context of the + // profile that contains the backtrace as a marker payload. + // + // That is, markers that contain backtraces should not need their own stack, + // frame, and string tables. They should instead reuse their parent + // profile's tables. + ProfilerThreadId StreamJSON( + mozilla::baseprofiler::SpliceableJSONWriter& aWriter, + const mozilla::TimeStamp& aProcessStartTime, UniqueStacks& aUniqueStacks); + + private: + // Used to serialize a ProfilerBacktrace. + friend struct mozilla::ProfileBufferEntryWriter::Serializer< + ProfilerBacktrace>; + friend struct mozilla::ProfileBufferEntryReader::Deserializer< + ProfilerBacktrace>; + + std::string mName; + + // `ProfileChunkedBuffer` in which `mProfileBuffer` stores its data; must be + // located before `mProfileBuffer` so that it's destroyed after. + mozilla::UniquePtr + mOptionalProfileChunkedBufferStorage; + // If null, there is no need to check mProfileBuffer's (if present) underlying + // buffer because this is done when constructed. + mozilla::ProfileChunkedBuffer* mProfileChunkedBuffer; + + mozilla::UniquePtr mOptionalProfileBufferStorage; + ProfileBuffer* mProfileBuffer; +}; + +namespace mozilla { + +// Format: [ UniquePtr | name ] +// Initial len==0 marks a nullptr or empty backtrace. +template <> +struct mozilla::ProfileBufferEntryWriter::Serializer { + static Length Bytes(const ProfilerBacktrace& aBacktrace) { + if (!aBacktrace.mProfileChunkedBuffer) { + // No buffer. + return ULEB128Size(0u); + } + auto bufferBytes = SumBytes(*aBacktrace.mProfileChunkedBuffer); + if (bufferBytes <= ULEB128Size(0u)) { + // Empty buffer. + return ULEB128Size(0u); + } + return bufferBytes + SumBytes(aBacktrace.mName); + } + + static void Write(mozilla::ProfileBufferEntryWriter& aEW, + const ProfilerBacktrace& aBacktrace) { + if (!aBacktrace.mProfileChunkedBuffer || + SumBytes(*aBacktrace.mProfileChunkedBuffer) <= ULEB128Size(0u)) { + // No buffer, or empty buffer. 
+ aEW.WriteULEB128(0u); + return; + } + aEW.WriteObject(*aBacktrace.mProfileChunkedBuffer); + aEW.WriteObject(aBacktrace.mName); + } +}; + +template +struct mozilla::ProfileBufferEntryWriter::Serializer< + mozilla::UniquePtr> { + static Length Bytes( + const mozilla::UniquePtr& aBacktrace) { + if (!aBacktrace) { + // Null backtrace pointer (treated like an empty backtrace). + return ULEB128Size(0u); + } + return SumBytes(*aBacktrace); + } + + static void Write( + mozilla::ProfileBufferEntryWriter& aEW, + const mozilla::UniquePtr& aBacktrace) { + if (!aBacktrace) { + // Null backtrace pointer (treated like an empty backtrace). + aEW.WriteULEB128(0u); + return; + } + aEW.WriteObject(*aBacktrace); + } +}; + +template +struct mozilla::ProfileBufferEntryReader::Deserializer< + mozilla::UniquePtr> { + static void ReadInto( + mozilla::ProfileBufferEntryReader& aER, + mozilla::UniquePtr& aBacktrace) { + aBacktrace = Read(aER); + } + + static mozilla::UniquePtr Read( + mozilla::ProfileBufferEntryReader& aER) { + auto profileChunkedBuffer = + aER.ReadObject>(); + if (!profileChunkedBuffer) { + return nullptr; + } + MOZ_ASSERT( + !profileChunkedBuffer->IsThreadSafe(), + "ProfilerBacktrace only stores non-thread-safe ProfileChunkedBuffers"); + std::string name = aER.ReadObject(); + return UniquePtr{ + new ProfilerBacktrace(name.c_str(), std::move(profileChunkedBuffer))}; + } +}; + +} // namespace mozilla + +#endif // __PROFILER_BACKTRACE_H diff --git a/tools/profiler/core/ProfilerBindings.cpp b/tools/profiler/core/ProfilerBindings.cpp new file mode 100644 index 0000000000..c3af5c5b56 --- /dev/null +++ b/tools/profiler/core/ProfilerBindings.cpp @@ -0,0 +1,386 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +/* FFI functions for Profiler Rust API to call into profiler */ + +#include "ProfilerBindings.h" + +#include "GeckoProfiler.h" + +#include +#include + +void gecko_profiler_register_thread(const char* aName) { + PROFILER_REGISTER_THREAD(aName); +} + +void gecko_profiler_unregister_thread() { PROFILER_UNREGISTER_THREAD(); } + +void gecko_profiler_construct_label(mozilla::AutoProfilerLabel* aAutoLabel, + JS::ProfilingCategoryPair aCategoryPair) { +#ifdef MOZ_GECKO_PROFILER + new (aAutoLabel) mozilla::AutoProfilerLabel( + "", nullptr, aCategoryPair, + uint32_t( + js::ProfilingStackFrame::Flags::LABEL_DETERMINED_BY_CATEGORY_PAIR)); +#endif +} + +void gecko_profiler_destruct_label(mozilla::AutoProfilerLabel* aAutoLabel) { +#ifdef MOZ_GECKO_PROFILER + aAutoLabel->~AutoProfilerLabel(); +#endif +} + +void gecko_profiler_construct_timestamp_now(mozilla::TimeStamp* aTimeStamp) { + new (aTimeStamp) mozilla::TimeStamp(mozilla::TimeStamp::Now()); +} + +void gecko_profiler_clone_timestamp(const mozilla::TimeStamp* aSrcTimeStamp, + mozilla::TimeStamp* aDestTimeStamp) { + new (aDestTimeStamp) mozilla::TimeStamp(*aSrcTimeStamp); +} + +void gecko_profiler_destruct_timestamp(mozilla::TimeStamp* aTimeStamp) { + aTimeStamp->~TimeStamp(); +} + +void gecko_profiler_add_timestamp(const mozilla::TimeStamp* aTimeStamp, + mozilla::TimeStamp* aDestTimeStamp, + double aMicroseconds) { + new (aDestTimeStamp) mozilla::TimeStamp( + *aTimeStamp + mozilla::TimeDuration::FromMicroseconds(aMicroseconds)); +} + +void gecko_profiler_subtract_timestamp(const mozilla::TimeStamp* aTimeStamp, + mozilla::TimeStamp* aDestTimeStamp, + double aMicroseconds) { + new (aDestTimeStamp) mozilla::TimeStamp( + *aTimeStamp - mozilla::TimeDuration::FromMicroseconds(aMicroseconds)); +} + +void gecko_profiler_construct_marker_timing_instant_at( + mozilla::MarkerTiming* aMarkerTiming, const mozilla::TimeStamp* aTime) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct(aMarkerTiming, *aTime, + mozilla::TimeStamp{}, + mozilla::MarkerTiming::Phase::Instant); +#endif +} + +void gecko_profiler_construct_marker_timing_instant_now( + mozilla::MarkerTiming* aMarkerTiming) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct( + aMarkerTiming, mozilla::TimeStamp::Now(), mozilla::TimeStamp{}, + mozilla::MarkerTiming::Phase::Instant); +#endif +} + +void gecko_profiler_construct_marker_timing_interval( + mozilla::MarkerTiming* aMarkerTiming, const mozilla::TimeStamp* aStartTime, + const mozilla::TimeStamp* aEndTime) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct( + aMarkerTiming, *aStartTime, *aEndTime, + mozilla::MarkerTiming::Phase::Interval); +#endif +} + +void gecko_profiler_construct_marker_timing_interval_until_now_from( + mozilla::MarkerTiming* aMarkerTiming, + const mozilla::TimeStamp* aStartTime) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct( + aMarkerTiming, *aStartTime, mozilla::TimeStamp::Now(), + mozilla::MarkerTiming::Phase::Interval); +#endif +} + +void gecko_profiler_construct_marker_timing_interval_start( + mozilla::MarkerTiming* aMarkerTiming, const mozilla::TimeStamp* aTime) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct( + aMarkerTiming, *aTime, mozilla::TimeStamp{}, + 
mozilla::MarkerTiming::Phase::IntervalStart); +#endif +} + +void gecko_profiler_construct_marker_timing_interval_end( + mozilla::MarkerTiming* aMarkerTiming, const mozilla::TimeStamp* aTime) { +#ifdef MOZ_GECKO_PROFILER + static_assert(std::is_trivially_copyable_v); + mozilla::MarkerTiming::UnsafeConstruct( + aMarkerTiming, mozilla::TimeStamp{}, *aTime, + mozilla::MarkerTiming::Phase::IntervalEnd); +#endif +} + +void gecko_profiler_destruct_marker_timing( + mozilla::MarkerTiming* aMarkerTiming) { +#ifdef MOZ_GECKO_PROFILER + aMarkerTiming->~MarkerTiming(); +#endif +} + +void gecko_profiler_construct_marker_schema( + mozilla::MarkerSchema* aMarkerSchema, + const mozilla::MarkerSchema::Location* aLocations, size_t aLength) { +#ifdef MOZ_GECKO_PROFILER + new (aMarkerSchema) mozilla::MarkerSchema(aLocations, aLength); +#endif +} + +void gecko_profiler_construct_marker_schema_with_special_front_end_location( + mozilla::MarkerSchema* aMarkerSchema) { +#ifdef MOZ_GECKO_PROFILER + new (aMarkerSchema) + mozilla::MarkerSchema(mozilla::MarkerSchema::SpecialFrontendLocation{}); +#endif +} + +void gecko_profiler_destruct_marker_schema( + mozilla::MarkerSchema* aMarkerSchema) { +#ifdef MOZ_GECKO_PROFILER + aMarkerSchema->~MarkerSchema(); +#endif +} + +void gecko_profiler_marker_schema_set_chart_label( + mozilla::MarkerSchema* aSchema, const char* aLabel, size_t aLabelLength) { +#ifdef MOZ_GECKO_PROFILER + aSchema->SetChartLabel(std::string(aLabel, aLabelLength)); +#endif +} + +void gecko_profiler_marker_schema_set_tooltip_label( + mozilla::MarkerSchema* aSchema, const char* aLabel, size_t aLabelLength) { +#ifdef MOZ_GECKO_PROFILER + aSchema->SetTooltipLabel(std::string(aLabel, aLabelLength)); +#endif +} + +void gecko_profiler_marker_schema_set_table_label( + mozilla::MarkerSchema* aSchema, const char* aLabel, size_t aLabelLength) { +#ifdef MOZ_GECKO_PROFILER + aSchema->SetTableLabel(std::string(aLabel, aLabelLength)); +#endif +} + +void gecko_profiler_marker_schema_set_all_labels(mozilla::MarkerSchema* aSchema, + const char* aLabel, + size_t aLabelLength) { +#ifdef MOZ_GECKO_PROFILER + aSchema->SetAllLabels(std::string(aLabel, aLabelLength)); +#endif +} + +void gecko_profiler_marker_schema_add_key_format( + mozilla::MarkerSchema* aSchema, const char* aKey, size_t aKeyLength, + mozilla::MarkerSchema::Format aFormat) { +#ifdef MOZ_GECKO_PROFILER + aSchema->AddKeyFormat(std::string(aKey, aKeyLength), aFormat); +#endif +} + +void gecko_profiler_marker_schema_add_key_label_format( + mozilla::MarkerSchema* aSchema, const char* aKey, size_t aKeyLength, + const char* aLabel, size_t aLabelLength, + mozilla::MarkerSchema::Format aFormat) { +#ifdef MOZ_GECKO_PROFILER + aSchema->AddKeyLabelFormat(std::string(aKey, aKeyLength), + std::string(aLabel, aLabelLength), aFormat); +#endif +} + +void gecko_profiler_marker_schema_add_key_format_searchable( + mozilla::MarkerSchema* aSchema, const char* aKey, size_t aKeyLength, + mozilla::MarkerSchema::Format aFormat, + mozilla::MarkerSchema::Searchable aSearchable) { +#ifdef MOZ_GECKO_PROFILER + aSchema->AddKeyFormatSearchable(std::string(aKey, aKeyLength), aFormat, + aSearchable); +#endif +} + +void gecko_profiler_marker_schema_add_key_label_format_searchable( + mozilla::MarkerSchema* aSchema, const char* aKey, size_t aKeyLength, + const char* aLabel, size_t aLabelLength, + mozilla::MarkerSchema::Format aFormat, + mozilla::MarkerSchema::Searchable aSearchable) { +#ifdef MOZ_GECKO_PROFILER + aSchema->AddKeyLabelFormatSearchable(std::string(aKey, aKeyLength), + 
std::string(aLabel, aLabelLength), + aFormat, aSearchable); +#endif +} + +void gecko_profiler_marker_schema_add_static_label_value( + mozilla::MarkerSchema* aSchema, const char* aLabel, size_t aLabelLength, + const char* aValue, size_t aValueLength) { +#ifdef MOZ_GECKO_PROFILER + aSchema->AddStaticLabelValue(std::string(aLabel, aLabelLength), + std::string(aValue, aValueLength)); +#endif +} + +void gecko_profiler_marker_schema_stream( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength, mozilla::MarkerSchema* aMarkerSchema, + void* aStreamedNamesSet) { +#ifdef MOZ_GECKO_PROFILER + auto* streamedNames = static_cast*>(aStreamedNamesSet); + // std::set.insert(T&&) returns a pair, its `second` is true if the element + // was actually inserted (i.e., it was not there yet.). + const bool didInsert = + streamedNames->insert(std::string(aName, aNameLength)).second; + if (didInsert) { + std::move(*aMarkerSchema) + .Stream(*aWriter, mozilla::Span(aName, aNameLength)); + } +#endif +} + +void gecko_profiler_json_writer_int_property( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength, int64_t aValue) { +#ifdef MOZ_GECKO_PROFILER + aWriter->IntProperty(mozilla::Span(aName, aNameLength), aValue); +#endif +} + +void gecko_profiler_json_writer_float_property( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength, double aValue) { +#ifdef MOZ_GECKO_PROFILER + aWriter->DoubleProperty(mozilla::Span(aName, aNameLength), aValue); +#endif +} + +void gecko_profiler_json_writer_bool_property( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength, bool aValue) { +#ifdef MOZ_GECKO_PROFILER + aWriter->BoolProperty(mozilla::Span(aName, aNameLength), aValue); +#endif +} +void gecko_profiler_json_writer_string_property( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength, const char* aValue, size_t aValueLength) { +#ifdef MOZ_GECKO_PROFILER + aWriter->StringProperty(mozilla::Span(aName, aNameLength), + mozilla::Span(aValue, aValueLength)); +#endif +} + +void gecko_profiler_json_writer_null_property( + mozilla::baseprofiler::SpliceableJSONWriter* aWriter, const char* aName, + size_t aNameLength) { +#ifdef MOZ_GECKO_PROFILER + aWriter->NullProperty(mozilla::Span(aName, aNameLength)); +#endif +} + +void gecko_profiler_add_marker_untyped( + const char* aName, size_t aNameLength, + mozilla::baseprofiler::ProfilingCategoryPair aCategoryPair, + mozilla::MarkerTiming* aMarkerTiming, + mozilla::StackCaptureOptions aStackCaptureOptions) { +#ifdef MOZ_GECKO_PROFILER + profiler_add_marker( + mozilla::ProfilerString8View(aName, aNameLength), + mozilla::MarkerCategory{aCategoryPair}, + mozilla::MarkerOptions( + std::move(*aMarkerTiming), + mozilla::MarkerStack::WithCaptureOptions(aStackCaptureOptions))); +#endif +} + +void gecko_profiler_add_marker_text( + const char* aName, size_t aNameLength, + mozilla::baseprofiler::ProfilingCategoryPair aCategoryPair, + mozilla::MarkerTiming* aMarkerTiming, + mozilla::StackCaptureOptions aStackCaptureOptions, const char* aText, + size_t aTextLength) { +#ifdef MOZ_GECKO_PROFILER + profiler_add_marker( + mozilla::ProfilerString8View(aName, aNameLength), + mozilla::MarkerCategory{aCategoryPair}, + mozilla::MarkerOptions( + std::move(*aMarkerTiming), + mozilla::MarkerStack::WithCaptureOptions(aStackCaptureOptions)), + geckoprofiler::markers::TextMarker{}, + 
mozilla::ProfilerString8View(aText, aTextLength)); +#endif +} + +void gecko_profiler_add_marker( + const char* aName, size_t aNameLength, + mozilla::baseprofiler::ProfilingCategoryPair aCategoryPair, + mozilla::MarkerTiming* aMarkerTiming, + mozilla::StackCaptureOptions aStackCaptureOptions, uint8_t aMarkerTag, + const uint8_t* aPayload, size_t aPayloadSize) { +#ifdef MOZ_GECKO_PROFILER + // Copy the marker timing and create the marker option. + mozilla::MarkerOptions markerOptions( + std::move(*aMarkerTiming), + mozilla::MarkerStack::WithCaptureOptions(aStackCaptureOptions)); + + // Currently it's not possible to add a threadId option, but we will + // have it soon. + if (markerOptions.ThreadId().IsUnspecified()) { + // If yet unspecified, set thread to this thread where the marker is added. + markerOptions.Set(mozilla::MarkerThreadId::CurrentThread()); + } + + auto& buffer = profiler_get_core_buffer(); + mozilla::Span payload(aPayload, aPayloadSize); + + mozilla::StackCaptureOptions captureOptions = + markerOptions.Stack().CaptureOptions(); + if (captureOptions != mozilla::StackCaptureOptions::NoStack && + // Do not capture a stack if the NoMarkerStacks feature is set. + profiler_active_without_feature(ProfilerFeature::NoMarkerStacks)) { + // A capture was requested, let's attempt to do it here&now. This avoids a + // lot of allocations that would be necessary if capturing a backtrace + // separately. + // TODO use a local on-stack byte buffer to remove last allocation. + // TODO reduce internal profiler stack levels, see bug 1659872. + mozilla::ProfileBufferChunkManagerSingle chunkManager( + mozilla::ProfileBufferChunkManager::scExpectedMaximumStackSize); + mozilla::ProfileChunkedBuffer chunkedBuffer( + mozilla::ProfileChunkedBuffer::ThreadSafety::WithoutMutex, + chunkManager); + markerOptions.StackRef().UseRequestedBacktrace( + profiler_capture_backtrace_into(chunkedBuffer, captureOptions) + ? &chunkedBuffer + : nullptr); + + // This call must be made from here, while chunkedBuffer is in scope. + buffer.PutObjects( + mozilla::ProfileBufferEntryKind::Marker, markerOptions, + mozilla::ProfilerString8View(aName, aNameLength), + mozilla::MarkerCategory{aCategoryPair}, + mozilla::base_profiler_markers_detail::Streaming::DeserializerTag( + aMarkerTag), + mozilla::MarkerPayloadType::Rust, payload); + return; + } + + buffer.PutObjects( + mozilla::ProfileBufferEntryKind::Marker, markerOptions, + mozilla::ProfilerString8View(aName, aNameLength), + mozilla::MarkerCategory{aCategoryPair}, + mozilla::base_profiler_markers_detail::Streaming::DeserializerTag( + aMarkerTag), + mozilla::MarkerPayloadType::Rust, payload); +#endif +} diff --git a/tools/profiler/core/ProfilerCodeAddressService.cpp b/tools/profiler/core/ProfilerCodeAddressService.cpp new file mode 100644 index 0000000000..5a65e06379 --- /dev/null +++ b/tools/profiler/core/ProfilerCodeAddressService.cpp @@ -0,0 +1,75 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "ProfilerCodeAddressService.h" + +#include "platform.h" +#include "mozilla/StackWalk.h" + +using namespace mozilla; + +#if defined(XP_LINUX) || defined(XP_FREEBSD) +static char* SearchSymbolTable(SymbolTable& aTable, uint32_t aOffset) { + size_t index; + bool exact = + BinarySearch(aTable.mAddrs, 0, aTable.mAddrs.Length(), aOffset, &index); + + if (index == 0 && !exact) { + // Our offset is before the first symbol in the table; no result. + return nullptr; + } + + // Extract the (mangled) symbol name out of the string table. + auto strings = reinterpret_cast(aTable.mBuffer.Elements()); + nsCString symbol; + symbol.Append(strings + aTable.mIndex[index - 1], + aTable.mIndex[index] - aTable.mIndex[index - 1]); + + // First try demangling as a Rust identifier. + char demangled[1024]; + if (!profiler_demangle_rust(symbol.get(), demangled, + ArrayLength(demangled))) { + // Then as a C++ identifier. + DemangleSymbol(symbol.get(), demangled, ArrayLength(demangled)); + } + demangled[ArrayLength(demangled) - 1] = '\0'; + + // Use the mangled name if we didn't successfully demangle. + return strdup(demangled[0] != '\0' ? demangled : symbol.get()); +} +#endif + +bool ProfilerCodeAddressService::GetFunction(const void* aPc, + nsACString& aResult) { + Entry& entry = GetEntry(aPc); + +#if defined(XP_LINUX) || defined(XP_FREEBSD) + // On Linux, most symbols will not be found by the MozDescribeCodeAddress call + // that GetEntry does. So we read the symbol table directly from the ELF + // image. + + // SymbolTable currently assumes library offsets will not be larger than + // 4 GiB. + if (entry.mLOffset <= 0xFFFFFFFF && !entry.mFunction) { + auto p = mSymbolTables.lookupForAdd(entry.mLibrary); + if (!p) { + if (!mSymbolTables.add(p, entry.mLibrary, SymbolTable())) { + MOZ_CRASH("ProfilerCodeAddressService OOM"); + } + profiler_get_symbol_table(entry.mLibrary, nullptr, &p->value()); + } + entry.mFunction = + SearchSymbolTable(p->value(), static_cast(entry.mLOffset)); + } +#endif + + if (!entry.mFunction || entry.mFunction[0] == '\0') { + return false; + } + + aResult = nsDependentCString(entry.mFunction); + return true; +} diff --git a/tools/profiler/core/ProfilerMarkers.cpp b/tools/profiler/core/ProfilerMarkers.cpp new file mode 100644 index 0000000000..7c299678d1 --- /dev/null +++ b/tools/profiler/core/ProfilerMarkers.cpp @@ -0,0 +1,32 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/ProfilerMarkers.h" + +template mozilla::ProfileBufferBlockIndex AddMarkerToBuffer( + mozilla::ProfileChunkedBuffer&, const mozilla::ProfilerString8View&, + const mozilla::MarkerCategory&, mozilla::MarkerOptions&&, + mozilla::baseprofiler::markers::NoPayload); + +template mozilla::ProfileBufferBlockIndex AddMarkerToBuffer( + mozilla::ProfileChunkedBuffer&, const mozilla::ProfilerString8View&, + const mozilla::MarkerCategory&, mozilla::MarkerOptions&&, + mozilla::baseprofiler::markers::TextMarker, const std::string&); + +template mozilla::ProfileBufferBlockIndex profiler_add_marker( + const mozilla::ProfilerString8View&, const mozilla::MarkerCategory&, + mozilla::MarkerOptions&&, mozilla::baseprofiler::markers::TextMarker, + const std::string&); + +template mozilla::ProfileBufferBlockIndex profiler_add_marker( + const mozilla::ProfilerString8View&, const mozilla::MarkerCategory&, + mozilla::MarkerOptions&&, mozilla::baseprofiler::markers::TextMarker, + const nsCString&); + +template mozilla::ProfileBufferBlockIndex profiler_add_marker( + const mozilla::ProfilerString8View&, const mozilla::MarkerCategory&, + mozilla::MarkerOptions&&, mozilla::baseprofiler::markers::Tracing, + const mozilla::ProfilerString8View&); diff --git a/tools/profiler/core/ProfilerThreadRegistration.cpp b/tools/profiler/core/ProfilerThreadRegistration.cpp new file mode 100644 index 0000000000..c81d00573d --- /dev/null +++ b/tools/profiler/core/ProfilerThreadRegistration.cpp @@ -0,0 +1,198 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/ProfilerThreadRegistration.h" + +#include "mozilla/ProfilerMarkers.h" +#include "mozilla/ProfilerThreadRegistry.h" +#include "nsString.h" +#ifdef MOZ_GECKO_PROFILER +# include "platform.h" +#else +# define profiler_mark_thread_awake() +# define profiler_mark_thread_asleep() +#endif + +namespace mozilla::profiler { + +/* static */ +MOZ_THREAD_LOCAL(ThreadRegistration*) ThreadRegistration::tlsThreadRegistration; + +ThreadRegistration::ThreadRegistration(const char* aName, const void* aStackTop) + : mData(aName, aStackTop) { + auto* tls = GetTLS(); + if (MOZ_UNLIKELY(!tls)) { + // No TLS, nothing can be done without it. + return; + } + + if (ThreadRegistration* rootRegistration = tls->get(); rootRegistration) { + // This is a nested ThreadRegistration object, so the thread is already + // registered in the TLS and ThreadRegistry and we don't need to register + // again. + MOZ_ASSERT( + mData.Info().ThreadId() == rootRegistration->mData.Info().ThreadId(), + "Thread being re-registered has changed its TID"); + // TODO: Use new name. This is currently not possible because the + // TLS-stored RegisteredThread's ThreadInfo cannot be changed. + // In the meantime, we record a marker that could be used in the frontend. 
+ PROFILER_MARKER_TEXT("Nested ThreadRegistration()", OTHER_Profiling, + MarkerOptions{}, + ProfilerString8View::WrapNullTerminatedString(aName)); + return; + } + + tls->set(this); + ThreadRegistry::Register(OnThreadRef{*this}); + profiler_mark_thread_awake(); +} + +ThreadRegistration::~ThreadRegistration() { + MOZ_ASSERT(profiler_current_thread_id() == mData.mInfo.ThreadId(), + "ThreadRegistration must be destroyed on its thread"); + MOZ_ASSERT(!mDataMutex.IsLockedOnCurrentThread(), + "Mutex shouldn't be locked here, as it's about to be destroyed " + "in ~ThreadRegistration()"); + auto* tls = GetTLS(); + if (MOZ_UNLIKELY(!tls)) { + // No TLS, nothing can be done without it. + return; + } + + if (ThreadRegistration* rootRegistration = tls->get(); rootRegistration) { + if (rootRegistration != this) { + // `this` is not in the TLS, so it was a nested registration, there is + // nothing to unregister yet. + PROFILER_MARKER_TEXT( + "Nested ~ThreadRegistration()", OTHER_Profiling, MarkerOptions{}, + ProfilerString8View::WrapNullTerminatedString(mData.Info().Name())); + return; + } + + profiler_mark_thread_asleep(); +#ifdef NIGHTLY_BUILD + mData.RecordWakeCount(); +#endif + ThreadRegistry::Unregister(OnThreadRef{*this}); +#ifdef DEBUG + // After ThreadRegistry::Unregister, other threads should not be able to + // find this ThreadRegistration, and shouldn't have kept any reference to + // it across the ThreadRegistry mutex. + MOZ_ASSERT(mDataMutex.TryLock(), + "Mutex shouldn't be locked in any thread, as it's about to be " + "destroyed in ~ThreadRegistration()"); + // Undo the above successful TryLock. + mDataMutex.Unlock(); +#endif // DEBUG + + tls->set(nullptr); + return; + } + + // Already removed from the TLS!? This could happen with improperly-nested + // register/unregister calls, and the first ThreadRegistration has already + // been unregistered. + // We cannot record a marker on this thread because it was already + // unregistered. Send it to the main thread (unless this *is* already the + // main thread, which has been unregistered); this may be useful to catch + // mismatched register/unregister pairs in Firefox. + if (!profiler_is_main_thread()) { + nsAutoCString threadId("thread id: "); + threadId.AppendInt(profiler_current_thread_id().ToNumber()); + threadId.AppendLiteral(", name: \""); + threadId.AppendASCII(mData.Info().Name()); + threadId.AppendLiteral("\""); + PROFILER_MARKER_TEXT( + "~ThreadRegistration() but TLS is empty", OTHER_Profiling, + MarkerOptions(MarkerThreadId::MainThread(), MarkerStack::Capture()), + threadId); + } +} + +/* static */ +ProfilingStack* ThreadRegistration::RegisterThread(const char* aName, + const void* aStackTop) { + auto* tls = GetTLS(); + if (MOZ_UNLIKELY(!tls)) { + // No TLS, nothing can be done without it. + return nullptr; + } + + if (ThreadRegistration* rootRegistration = tls->get(); rootRegistration) { + // Already registered, record the extra depth to ignore the matching + // UnregisterThread. + ++rootRegistration->mOtherRegistrations; + // TODO: Use new name. This is currently not possible because the + // TLS-stored RegisteredThread's ThreadInfo cannot be changed. + // In the meantime, we record a marker that could be used in the frontend. 
+ PROFILER_MARKER_TEXT("Nested ThreadRegistration::RegisterThread()", + OTHER_Profiling, MarkerOptions{}, + ProfilerString8View::WrapNullTerminatedString(aName)); + return &rootRegistration->mData.mProfilingStack; + } + + // Create on heap, it self-registers with the TLS (its effective owner, so + // we can forget the pointer after this), and with the Profiler. + ThreadRegistration* tr = new ThreadRegistration(aName, aStackTop); + tr->mIsOnHeap = true; + return &tr->mData.mProfilingStack; +} + +/* static */ +void ThreadRegistration::UnregisterThread() { + auto* tls = GetTLS(); + if (MOZ_UNLIKELY(!tls)) { + // No TLS, nothing can be done without it. + return; + } + + if (ThreadRegistration* rootRegistration = tls->get(); rootRegistration) { + if (rootRegistration->mOtherRegistrations != 0) { + // This is assumed to be a matching UnregisterThread() for a nested + // RegisterThread(). Decrease depth and we're done. + --rootRegistration->mOtherRegistrations; + // We don't know what name was used in the related RegisterThread(). + PROFILER_MARKER_UNTYPED("Nested ThreadRegistration::UnregisterThread()", + OTHER_Profiling); + return; + } + + if (!rootRegistration->mIsOnHeap) { + // The root registration was not added by `RegisterThread()`, so it + // shouldn't be deleted! + // This could happen if there are un-paired `UnregisterThread` calls when + // the initial registration (still alive) was done on the stack. We don't + // know what name was used in the related RegisterThread(). + PROFILER_MARKER_UNTYPED("Excess ThreadRegistration::UnregisterThread()", + OTHER_Profiling, MarkerStack::Capture()); + return; + } + + // This is the last `UnregisterThread()` that should match the first + // `RegisterThread()` that created this ThreadRegistration on the heap. + // Just delete this root registration, it will de-register itself from the + // TLS (and from the Profiler). + delete rootRegistration; + return; + } + + // There is no known ThreadRegistration for this thread, ignore this + // request. We cannot record a marker on this thread because it was already + // unregistered. Send it to the main thread (unless this *is* already the + // main thread, which has been unregistered); this may be useful to catch + // mismatched register/unregister pairs in Firefox. + if (!profiler_is_main_thread()) { + nsAutoCString threadId("thread id: "); + threadId.AppendInt(profiler_current_thread_id().ToNumber()); + PROFILER_MARKER_TEXT( + "ThreadRegistration::UnregisterThread() but TLS is empty", + OTHER_Profiling, + MarkerOptions(MarkerThreadId::MainThread(), MarkerStack::Capture()), + threadId); + } +} + +} // namespace mozilla::profiler diff --git a/tools/profiler/core/ProfilerThreadRegistrationData.cpp b/tools/profiler/core/ProfilerThreadRegistrationData.cpp new file mode 100644 index 0000000000..e70f9e749a --- /dev/null +++ b/tools/profiler/core/ProfilerThreadRegistrationData.cpp @@ -0,0 +1,303 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/ProfilerThreadRegistrationData.h" + +#include "mozilla/FOGIPC.h" +#include "mozilla/glean/GleanMetrics.h" +#include "mozilla/ProfilerMarkers.h" +#include "js/AllocationRecording.h" +#include "js/ProfilingStack.h" + +#if defined(XP_WIN) +# include +#elif defined(XP_DARWIN) +# include +#endif + +#ifdef NIGHTLY_BUILD +namespace geckoprofiler::markers { + +using namespace mozilla; + +struct ThreadCpuUseMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("ThreadCpuUse"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + ProfilerThreadId aThreadId, + int64_t aCpuTimeMs, int64_t aWakeUps, + const ProfilerString8View& aThreadName) { + aWriter.IntProperty("threadId", static_cast(aThreadId.ToNumber())); + aWriter.IntProperty("time", aCpuTimeMs); + aWriter.IntProperty("wakeups", aWakeUps); + aWriter.StringProperty("label", aThreadName); + } + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.AddKeyLabelFormat("time", "CPU Time", MS::Format::Milliseconds); + schema.AddKeyLabelFormat("wakeups", "Wake ups", MS::Format::Integer); + schema.SetTooltipLabel("{marker.name} - {marker.data.label}"); + schema.SetTableLabel( + "{marker.name} - {marker.data.label}: {marker.data.time} of CPU time, " + "{marker.data.wakeups} wake ups"); + return schema; + } +}; + +} // namespace geckoprofiler::markers +#endif + +namespace mozilla::profiler { + +ThreadRegistrationData::ThreadRegistrationData(const char* aName, + const void* aStackTop) + : mInfo(aName), + mPlatformData(mInfo.ThreadId()), + mStackTop( +#if defined(XP_WIN) + // We don't have to guess on Windows. + reinterpret_cast( + reinterpret_cast(NtCurrentTeb())->StackBase) +#elif defined(XP_DARWIN) + // We don't have to guess on Mac/Darwin. + reinterpret_cast( + pthread_get_stackaddr_np(pthread_self())) +#else + // Otherwise use the given guess. + aStackTop +#endif + ) { +} + +// This is a simplified version of profiler_add_marker that can be easily passed +// into the JS engine. 
+static void profiler_add_js_marker(const char* aMarkerName, + const char* aMarkerText) { + PROFILER_MARKER_TEXT( + mozilla::ProfilerString8View::WrapNullTerminatedString(aMarkerName), JS, + {}, mozilla::ProfilerString8View::WrapNullTerminatedString(aMarkerText)); +} + +static void profiler_add_js_allocation_marker(JS::RecordAllocationInfo&& info) { + if (!profiler_thread_is_being_profiled_for_markers()) { + return; + } + + struct JsAllocationMarker { + static constexpr mozilla::Span MarkerTypeName() { + return mozilla::MakeStringSpan("JS allocation"); + } + static void StreamJSONMarkerData( + mozilla::baseprofiler::SpliceableJSONWriter& aWriter, + const mozilla::ProfilerString16View& aTypeName, + const mozilla::ProfilerString8View& aClassName, + const mozilla::ProfilerString16View& aDescriptiveTypeName, + const mozilla::ProfilerString8View& aCoarseType, uint64_t aSize, + bool aInNursery) { + if (aClassName.Length() != 0) { + aWriter.StringProperty("className", aClassName); + } + if (aTypeName.Length() != 0) { + aWriter.StringProperty("typeName", NS_ConvertUTF16toUTF8(aTypeName)); + } + if (aDescriptiveTypeName.Length() != 0) { + aWriter.StringProperty("descriptiveTypeName", + NS_ConvertUTF16toUTF8(aDescriptiveTypeName)); + } + aWriter.StringProperty("coarseType", aCoarseType); + aWriter.IntProperty("size", aSize); + aWriter.BoolProperty("inNursery", aInNursery); + } + static mozilla::MarkerSchema MarkerTypeDisplay() { + return mozilla::MarkerSchema::SpecialFrontendLocation{}; + } + }; + + profiler_add_marker( + "JS allocation", geckoprofiler::category::JS, + mozilla::MarkerStack::Capture(), JsAllocationMarker{}, + mozilla::ProfilerString16View::WrapNullTerminatedString(info.typeName), + mozilla::ProfilerString8View::WrapNullTerminatedString(info.className), + mozilla::ProfilerString16View::WrapNullTerminatedString( + info.descriptiveTypeName), + mozilla::ProfilerString8View::WrapNullTerminatedString(info.coarseType), + info.size, info.inNursery); +} + +void ThreadRegistrationLockedRWFromAnyThread::SetProfilingFeaturesAndData( + ThreadProfilingFeatures aProfilingFeatures, + ProfiledThreadData* aProfiledThreadData, const PSAutoLock&) { + MOZ_ASSERT(mProfilingFeatures == ThreadProfilingFeatures::NotProfiled); + mProfilingFeatures = aProfilingFeatures; + + MOZ_ASSERT(!mProfiledThreadData); + MOZ_ASSERT(aProfiledThreadData); + mProfiledThreadData = aProfiledThreadData; + + if (mJSContext) { + // The thread is now being profiled, and we already have a JSContext, + // allocate a JsFramesBuffer to allow profiler-unlocked on-thread sampling. + MOZ_ASSERT(!mJsFrameBuffer); + mJsFrameBuffer = new JsFrame[MAX_JS_FRAMES]; + } + + // Check invariants. + MOZ_ASSERT((mProfilingFeatures != ThreadProfilingFeatures::NotProfiled) == + !!mProfiledThreadData); + MOZ_ASSERT((mJSContext && + (mProfilingFeatures != ThreadProfilingFeatures::NotProfiled)) == + !!mJsFrameBuffer); +} + +void ThreadRegistrationLockedRWFromAnyThread::ClearProfilingFeaturesAndData( + const PSAutoLock&) { + mProfilingFeatures = ThreadProfilingFeatures::NotProfiled; + mProfiledThreadData = nullptr; + + if (mJsFrameBuffer) { + delete[] mJsFrameBuffer; + mJsFrameBuffer = nullptr; + } + + // Check invariants. 
+ MOZ_ASSERT((mProfilingFeatures != ThreadProfilingFeatures::NotProfiled) == + !!mProfiledThreadData); + MOZ_ASSERT((mJSContext && + (mProfilingFeatures != ThreadProfilingFeatures::NotProfiled)) == + !!mJsFrameBuffer); +} + +void ThreadRegistrationLockedRWOnThread::SetJSContext(JSContext* aJSContext) { + MOZ_ASSERT(aJSContext && !mJSContext); + + mJSContext = aJSContext; + + if (mProfiledThreadData) { + MOZ_ASSERT((mProfilingFeatures != ThreadProfilingFeatures::NotProfiled) == + !!mProfiledThreadData); + // We now have a JSContext, and the thread is already being profiled, + // allocate a JsFramesBuffer to allow profiler-unlocked on-thread sampling. + MOZ_ASSERT(!mJsFrameBuffer); + mJsFrameBuffer = new JsFrame[MAX_JS_FRAMES]; + } + + // We give the JS engine a non-owning reference to the ProfilingStack. It's + // important that the JS engine doesn't touch this once the thread dies. + js::SetContextProfilingStack(aJSContext, &ProfilingStackRef()); + + // Check invariants. + MOZ_ASSERT((mJSContext && + (mProfilingFeatures != ThreadProfilingFeatures::NotProfiled)) == + !!mJsFrameBuffer); +} + +void ThreadRegistrationLockedRWOnThread::ClearJSContext() { + mJSContext = nullptr; + + if (mJsFrameBuffer) { + delete[] mJsFrameBuffer; + mJsFrameBuffer = nullptr; + } + + // Check invariants. + MOZ_ASSERT((mJSContext && + (mProfilingFeatures != ThreadProfilingFeatures::NotProfiled)) == + !!mJsFrameBuffer); +} + +void ThreadRegistrationLockedRWOnThread::PollJSSampling() { + // We can't start/stop profiling until we have the thread's JSContext. + if (mJSContext) { + // It is possible for mJSSampling to go through the following sequences. + // + // - INACTIVE, ACTIVE_REQUESTED, INACTIVE_REQUESTED, INACTIVE + // + // - ACTIVE, INACTIVE_REQUESTED, ACTIVE_REQUESTED, ACTIVE + // + // Therefore, the if and else branches here aren't always interleaved. + // This is ok because the JS engine can handle that. + // + if (mJSSampling == ACTIVE_REQUESTED) { + mJSSampling = ACTIVE; + js::EnableContextProfilingStack(mJSContext, true); + + if (JSAllocationsEnabled()) { + // TODO - This probability should not be hardcoded. See Bug 1547284. + JS::EnableRecordingAllocations(mJSContext, + profiler_add_js_allocation_marker, 0.01); + } + js::RegisterContextProfilingEventMarker(mJSContext, + profiler_add_js_marker); + + } else if (mJSSampling == INACTIVE_REQUESTED) { + mJSSampling = INACTIVE; + js::EnableContextProfilingStack(mJSContext, false); + + if (JSAllocationsEnabled()) { + JS::DisableRecordingAllocations(mJSContext); + } + } + } +} + +#ifdef NIGHTLY_BUILD +void ThreadRegistrationUnlockedConstReaderAndAtomicRW::RecordWakeCount() const { + baseprofiler::detail::BaseProfilerAutoLock lock(mRecordWakeCountMutex); + + uint64_t newWakeCount = mWakeCount - mAlreadyRecordedWakeCount; + if (newWakeCount == 0 && mSleep != AWAKE) { + // If no new wake-up was counted, and the thread is not marked awake, + // we can be pretty sure there is no CPU activity to record. + // Threads that are never annotated as asleep/awake (typically rust threads) + // start as awake. + return; + } + + uint64_t cpuTimeNs; + if (!GetCpuTimeSinceThreadStartInNs(&cpuTimeNs, PlatformDataCRef())) { + cpuTimeNs = 0; + } + + constexpr uint64_t NS_PER_MS = 1'000'000; + uint64_t cpuTimeMs = cpuTimeNs / NS_PER_MS; + + uint64_t newCpuTimeMs = MOZ_LIKELY(cpuTimeMs > mAlreadyRecordedCpuTimeInMs) + ? cpuTimeMs - mAlreadyRecordedCpuTimeInMs + : 0; + + if (!newWakeCount && !newCpuTimeMs) { + // Nothing to report, avoid computing the Glean friendly thread name. 
+ return; + } + + nsAutoCString threadName(mInfo.Name()); + // Trim the trailing number of threads that are part of a thread pool. + for (size_t length = threadName.Length(); length > 0; --length) { + const char c = threadName.CharAt(length - 1); + if ((c < '0' || c > '9') && c != '#' && c != ' ') { + if (length != threadName.Length()) { + threadName.SetLength(length); + } + break; + } + } + + mozilla::glean::RecordThreadCpuUse(threadName, newCpuTimeMs, newWakeCount); + + // The thread id is provided as part of the payload because this call is + // inside a ThreadRegistration data function, which could be invoked with + // the ThreadRegistry locked. We cannot call any function/option that could + // attempt to lock the ThreadRegistry again, like MarkerThreadId. + PROFILER_MARKER("Thread CPU use", OTHER, {}, ThreadCpuUseMarker, + mInfo.ThreadId(), newCpuTimeMs, newWakeCount, threadName); + mAlreadyRecordedCpuTimeInMs = cpuTimeMs; + mAlreadyRecordedWakeCount += newWakeCount; +} +#endif + +} // namespace mozilla::profiler diff --git a/tools/profiler/core/ProfilerThreadRegistry.cpp b/tools/profiler/core/ProfilerThreadRegistry.cpp new file mode 100644 index 0000000000..cb456471d9 --- /dev/null +++ b/tools/profiler/core/ProfilerThreadRegistry.cpp @@ -0,0 +1,40 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/ProfilerThreadRegistry.h" + +namespace mozilla::profiler { + +/* static */ +ThreadRegistry::RegistryContainer ThreadRegistry::sRegistryContainer; + +/* static */ +ThreadRegistry::RegistryMutex ThreadRegistry::sRegistryMutex; + +#if !defined(MOZ_GECKO_PROFILER) +// When MOZ_GECKO_PROFILER is not defined, the function definitions in +// platform.cpp are not built, causing link errors. So we keep these simple +// definitions here. + +/* static */ +void ThreadRegistry::Register(ThreadRegistration::OnThreadRef aOnThreadRef) { + LockedRegistry lock; + MOZ_RELEASE_ASSERT(sRegistryContainer.append(OffThreadRef{aOnThreadRef})); +} + +/* static */ +void ThreadRegistry::Unregister(ThreadRegistration::OnThreadRef aOnThreadRef) { + LockedRegistry lock; + for (OffThreadRef& thread : sRegistryContainer) { + if (thread.IsPointingAt(*aOnThreadRef.mThreadRegistration)) { + sRegistryContainer.erase(&thread); + break; + } + } +} +#endif // !defined(MOZ_GECKO_PROFILER) + +} // namespace mozilla::profiler diff --git a/tools/profiler/core/ProfilerUtils.cpp b/tools/profiler/core/ProfilerUtils.cpp new file mode 100644 index 0000000000..6a46878ad7 --- /dev/null +++ b/tools/profiler/core/ProfilerUtils.cpp @@ -0,0 +1,118 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// This file implements functions from ProfilerUtils.h on all platforms. +// Functions with platform-specific implementations are separated in #if blocks +// below, with each block being self-contained with all the #includes and +// definitions it needs, to keep platform code easier to maintain in isolation. 
+ +#include "mozilla/ProfilerUtils.h" + +// --------------------------------------------- Windows process & thread ids +#if defined(XP_WIN) + +# include +# include + +ProfilerProcessId profiler_current_process_id() { + return ProfilerProcessId::FromNativeId(_getpid()); +} + +ProfilerThreadId profiler_current_thread_id() { + static_assert(std::is_same_v, + "ProfilerThreadId::NativeType must be exactly the type " + "returned by GetCurrentThreadId()"); + return ProfilerThreadId::FromNativeId(GetCurrentThreadId()); +} + +// --------------------------------------------- Non-Windows process id +#else +// All non-Windows platforms are assumed to be POSIX, which has getpid(). + +# include + +ProfilerProcessId profiler_current_process_id() { + return ProfilerProcessId::FromNativeId(getpid()); +} + +// --------------------------------------------- Non-Windows thread id +// ------------------------------------------------------- macOS +# if defined(XP_MACOSX) + +# include + +ProfilerThreadId profiler_current_thread_id() { + uint64_t tid; + if (pthread_threadid_np(nullptr, &tid) != 0) { + return ProfilerThreadId{}; + } + return ProfilerThreadId::FromNativeId(tid); +} + +// ------------------------------------------------------- Android +// Test Android before Linux, because Linux includes Android. +# elif defined(__ANDROID__) || defined(ANDROID) + +ProfilerThreadId profiler_current_thread_id() { + return ProfilerThreadId::FromNativeId(gettid()); +} + +// ------------------------------------------------------- Linux +# elif defined(XP_LINUX) + +# include + +ProfilerThreadId profiler_current_thread_id() { + // glibc doesn't provide a wrapper for gettid() until 2.30 + return ProfilerThreadId::FromNativeId(syscall(SYS_gettid)); +} + +// ------------------------------------------------------- FreeBSD +# elif defined(XP_FREEBSD) + +# include + +ProfilerThreadId profiler_current_thread_id() { + long id; + if (thr_self(&id) != 0) { + return ProfilerThreadId{}; + } + return ProfilerThreadId::FromNativeId(id); +} + +// ------------------------------------------------------- Others +# else + +ProfilerThreadId profiler_current_thread_id() { + return ProfilerThreadId::FromNativeId(std::this_thread::get_id()); +} + +# endif +#endif // End of non-XP_WIN. + +// --------------------------------------------- Platform-agnostic definitions + +#include "MainThreadUtils.h" +#include "mozilla/Assertions.h" + +static ProfilerThreadId scProfilerMainThreadId; + +void profiler_init_main_thread_id() { + MOZ_ASSERT(NS_IsMainThread()); + mozilla::baseprofiler::profiler_init_main_thread_id(); + if (!scProfilerMainThreadId.IsSpecified()) { + scProfilerMainThreadId = profiler_current_thread_id(); + } +} + +[[nodiscard]] ProfilerThreadId profiler_main_thread_id() { + return scProfilerMainThreadId; +} + +[[nodiscard]] bool profiler_is_main_thread() { + return profiler_current_thread_id() == scProfilerMainThreadId; +} diff --git a/tools/profiler/core/VTuneProfiler.cpp b/tools/profiler/core/VTuneProfiler.cpp new file mode 100644 index 0000000000..58a39c51ee --- /dev/null +++ b/tools/profiler/core/VTuneProfiler.cpp @@ -0,0 +1,80 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifdef XP_WIN +# undef UNICODE +# undef _UNICODE +#endif + +#include "VTuneProfiler.h" +#include "mozilla/Bootstrap.h" +#include + +VTuneProfiler* VTuneProfiler::mInstance = nullptr; + +void VTuneProfiler::Initialize() { + // This is just a 'dirty trick' to find out if the ittnotify DLL was found. + // If it wasn't this function always returns 0, otherwise it returns + // incrementing numbers, if the library was found this wastes 2 events but + // that should be okay. + __itt_event testEvent = + __itt_event_create("Test event", strlen("Test event")); + testEvent = __itt_event_create("Test event 2", strlen("Test event 2")); + + if (testEvent) { + mInstance = new VTuneProfiler(); + } +} + +void VTuneProfiler::Shutdown() {} + +void VTuneProfiler::TraceInternal(const char* aName, TracingKind aKind) { + std::string str(aName); + + auto iter = mStrings.find(str); + + __itt_event event; + if (iter != mStrings.end()) { + event = iter->second; + } else { + event = __itt_event_create(aName, str.length()); + mStrings.insert({str, event}); + } + + if (aKind == TRACING_INTERVAL_START || aKind == TRACING_EVENT) { + // VTune will consider starts not matched with an end to be single point in + // time events. + __itt_event_start(event); + } else { + __itt_event_end(event); + } +} + +void VTuneProfiler::RegisterThreadInternal(const char* aName) { + std::string str(aName); + + if (!str.compare("GeckoMain")) { + // Process main thread. + switch (XRE_GetProcessType()) { + case GeckoProcessType::GeckoProcessType_Default: + __itt_thread_set_name("Main Process"); + break; + case GeckoProcessType::GeckoProcessType_Content: + __itt_thread_set_name("Content Process"); + break; + case GeckoProcessType::GeckoProcessType_GMPlugin: + __itt_thread_set_name("Plugin Process"); + break; + case GeckoProcessType::GeckoProcessType_GPU: + __itt_thread_set_name("GPU Process"); + break; + default: + __itt_thread_set_name("Unknown Process"); + } + return; + } + __itt_thread_set_name(aName); +} diff --git a/tools/profiler/core/VTuneProfiler.h b/tools/profiler/core/VTuneProfiler.h new file mode 100644 index 0000000000..e3abe6b90d --- /dev/null +++ b/tools/profiler/core/VTuneProfiler.h @@ -0,0 +1,78 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef VTuneProfiler_h +#define VTuneProfiler_h + +// The intent here is to add 0 overhead for regular users. In order to build +// the VTune profiler code at all --enable-vtune-instrumentation needs to be +// set as a build option. Even then, when none of the environment variables +// is specified that allow us to find the ittnotify DLL, these functions +// should be minimal overhead. When starting Firefox under VTune, these +// env vars will be automatically defined, otherwise INTEL_LIBITTNOTIFY32/64 +// should be set to point at the ittnotify DLL. +#ifndef MOZ_VTUNE_INSTRUMENTATION + +# define VTUNE_INIT() +# define VTUNE_SHUTDOWN() + +# define VTUNE_TRACING(name, kind) +# define VTUNE_REGISTER_THREAD(name) + +#else + +# include "GeckoProfiler.h" + +// This is the regular Intel header, these functions are actually defined for +// us inside js/src/vtune by an intel C file which actually dynamically resolves +// them to the correct DLL. Through libxul these will 'magically' resolve. 
+# include "vtune/ittnotify.h" + +# include +# include +# include + +class VTuneProfiler { + public: + static void Initialize(); + static void Shutdown(); + + enum TracingKind { + TRACING_EVENT, + TRACING_INTERVAL_START, + TRACING_INTERVAL_END, + }; + + static void Trace(const char* aName, TracingKind aKind) { + if (mInstance) { + mInstance->TraceInternal(aName, aKind); + } + } + static void RegisterThread(const char* aName) { + if (mInstance) { + mInstance->RegisterThreadInternal(aName); + } + } + + private: + void TraceInternal(const char* aName, TracingKind aKind); + void RegisterThreadInternal(const char* aName); + + // This is null when the ittnotify DLL could not be found. + static VTuneProfiler* mInstance; + + std::unordered_map mStrings; +}; + +# define VTUNE_INIT() VTuneProfiler::Initialize() +# define VTUNE_SHUTDOWN() VTuneProfiler::Shutdown() + +# define VTUNE_TRACING(name, kind) VTuneProfiler::Trace(name, kind) +# define VTUNE_REGISTER_THREAD(name) VTuneProfiler::RegisterThread(name) + +#endif + +#endif /* VTuneProfiler_h */ diff --git a/tools/profiler/core/memory_hooks.cpp b/tools/profiler/core/memory_hooks.cpp new file mode 100644 index 0000000000..59e87d607c --- /dev/null +++ b/tools/profiler/core/memory_hooks.cpp @@ -0,0 +1,632 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "memory_hooks.h" + +#include "nscore.h" + +#include "mozilla/Assertions.h" +#include "mozilla/Atomics.h" +#include "mozilla/FastBernoulliTrial.h" +#include "mozilla/IntegerPrintfMacros.h" +#include "mozilla/JSONWriter.h" +#include "mozilla/MemoryReporting.h" +#include "mozilla/PlatformMutex.h" +#include "mozilla/ProfilerCounts.h" +#include "mozilla/ThreadLocal.h" + +#include "GeckoProfiler.h" +#include "prenv.h" +#include "replace_malloc.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef XP_WIN +# include +# include +#else +# include +# include +# include +#endif + +#ifdef ANDROID +# include +#endif + +// The counters start out as a nullptr, and then get initialized only once. They +// are never destroyed, as it would cause race conditions for the memory hooks +// that use the counters. This helps guard against potentially expensive +// operations like using a mutex. +// +// In addition, this is a raw pointer and not a UniquePtr, as the counter +// machinery will try and de-register itself from the profiler. This could +// happen after the profiler and its PSMutex was already destroyed, resulting in +// a crash. +static ProfilerCounterTotal* sCounter; + +// The gBernoulli value starts out as a nullptr, and only gets initialized once. +// It then lives for the entire lifetime of the process. It cannot be deleted +// without additional multi-threaded protections, since if we deleted it during +// profiler_stop then there could be a race between threads already in a +// memory hook that might try to access the value after or during deletion. 
+static mozilla::FastBernoulliTrial* gBernoulli; + +namespace mozilla::profiler { + +//--------------------------------------------------------------------------- +// Utilities +//--------------------------------------------------------------------------- + +// Returns true or or false depending on whether the marker was actually added +// or not. +static bool profiler_add_native_allocation_marker(int64_t aSize, + uintptr_t aMemoryAddress) { + if (!profiler_thread_is_being_profiled_for_markers( + profiler_main_thread_id())) { + return false; + } + + // Because native allocations may be intercepted anywhere, blocking while + // locking the profiler mutex here could end up causing a deadlock if another + // mutex is taken, which the profiler may indirectly need elsewhere. + // See bug 1642726 for such a scenario. + // So instead we bail out if the mutex is already locked. Native allocations + // are statistically sampled anyway, so missing a few because of this is + // acceptable. + if (profiler_is_locked_on_current_thread()) { + return false; + } + + struct NativeAllocationMarker { + static constexpr mozilla::Span MarkerTypeName() { + return mozilla::MakeStringSpan("Native allocation"); + } + static void StreamJSONMarkerData( + mozilla::baseprofiler::SpliceableJSONWriter& aWriter, int64_t aSize, + uintptr_t aMemoryAddress, ProfilerThreadId aThreadId) { + aWriter.IntProperty("size", aSize); + aWriter.IntProperty("memoryAddress", + static_cast(aMemoryAddress)); + // Tech note: If `ToNumber()` returns a uint64_t, the conversion to + // int64_t is "implementation-defined" before C++20. This is acceptable + // here, because this is a one-way conversion to a unique identifier + // that's used to visually separate data by thread on the front-end. + aWriter.IntProperty("threadId", + static_cast(aThreadId.ToNumber())); + } + static mozilla::MarkerSchema MarkerTypeDisplay() { + return mozilla::MarkerSchema::SpecialFrontendLocation{}; + } + }; + + profiler_add_marker("Native allocation", geckoprofiler::category::OTHER, + {MarkerThreadId::MainThread(), MarkerStack::Capture()}, + NativeAllocationMarker{}, aSize, aMemoryAddress, + profiler_current_thread_id()); + return true; +} + +static malloc_table_t gMallocTable; + +// This is only needed because of the |const void*| vs |void*| arg mismatch. +static size_t MallocSizeOf(const void* aPtr) { + return gMallocTable.malloc_usable_size(const_cast(aPtr)); +} + +// The values for the Bernoulli trial are taken from DMD. According to DMD: +// +// In testing, a probability of 0.003 resulted in ~25% of heap blocks getting +// a stack trace and ~80% of heap bytes getting a stack trace. (This is +// possible because big heap blocks are more likely to get a stack trace.) +// +// The random number seeds are arbitrary and were obtained from random.org. +// +// However this value resulted in a lot of slowdown since the profiler stacks +// are pretty heavy to collect. The value was lowered to 10% of the original to +// 0.0003. +static void EnsureBernoulliIsInstalled() { + if (!gBernoulli) { + // This is only installed once. See the gBernoulli definition for more + // information. + gBernoulli = + new FastBernoulliTrial(0.0003, 0x8e26eeee166bc8ca, 0x56820f304a9c9ae0); + } +} + +// This class provides infallible allocations (they abort on OOM) like +// mozalloc's InfallibleAllocPolicy, except that memory hooks are bypassed. This +// policy is used by the HashSet. 
+class InfallibleAllocWithoutHooksPolicy { + static void ExitOnFailure(const void* aP) { + if (!aP) { + MOZ_CRASH("Profiler memory hooks out of memory; aborting"); + } + } + + public: + template + static T* maybe_pod_malloc(size_t aNumElems) { + if (aNumElems & mozilla::tl::MulOverflowMask::value) { + return nullptr; + } + return (T*)gMallocTable.malloc(aNumElems * sizeof(T)); + } + + template + static T* maybe_pod_calloc(size_t aNumElems) { + return (T*)gMallocTable.calloc(aNumElems, sizeof(T)); + } + + template + static T* maybe_pod_realloc(T* aPtr, size_t aOldSize, size_t aNewSize) { + if (aNewSize & mozilla::tl::MulOverflowMask::value) { + return nullptr; + } + return (T*)gMallocTable.realloc(aPtr, aNewSize * sizeof(T)); + } + + template + static T* pod_malloc(size_t aNumElems) { + T* p = maybe_pod_malloc(aNumElems); + ExitOnFailure(p); + return p; + } + + template + static T* pod_calloc(size_t aNumElems) { + T* p = maybe_pod_calloc(aNumElems); + ExitOnFailure(p); + return p; + } + + template + static T* pod_realloc(T* aPtr, size_t aOldSize, size_t aNewSize) { + T* p = maybe_pod_realloc(aPtr, aOldSize, aNewSize); + ExitOnFailure(p); + return p; + } + + template + static void free_(T* aPtr, size_t aSize = 0) { + gMallocTable.free(aPtr); + } + + static void reportAllocOverflow() { ExitOnFailure(nullptr); } + bool checkSimulatedOOM() const { return true; } +}; + +// We can't use mozilla::Mutex because it causes re-entry into the memory hooks. +// Define a custom implementation here. +class Mutex : private ::mozilla::detail::MutexImpl { + public: + Mutex() : ::mozilla::detail::MutexImpl() {} + + void Lock() { ::mozilla::detail::MutexImpl::lock(); } + void Unlock() { ::mozilla::detail::MutexImpl::unlock(); } +}; + +class MutexAutoLock { + MutexAutoLock(const MutexAutoLock&) = delete; + void operator=(const MutexAutoLock&) = delete; + + Mutex& mMutex; + + public: + explicit MutexAutoLock(Mutex& aMutex) : mMutex(aMutex) { mMutex.Lock(); } + ~MutexAutoLock() { mMutex.Unlock(); } +}; + +//--------------------------------------------------------------------------- +// Tracked allocations +//--------------------------------------------------------------------------- + +// The allocation tracker is shared between multiple threads, and is the +// coordinator for knowing when allocations have been tracked. The mutable +// internal state is protected by a mutex, and managed by the methods. +// +// The tracker knows about all the allocations that we have added to the +// profiler. This way, whenever any given piece of memory is freed, we can see +// if it was previously tracked, and we can track its deallocation. + +class AllocationTracker { + // This type tracks all of the allocations that we have captured. This way, we + // can see if a deallocation is inside of this set. We want to provide a + // balanced view into the allocations and deallocations. + typedef mozilla::HashSet, + InfallibleAllocWithoutHooksPolicy> + AllocationSet; + + public: + AllocationTracker() : mAllocations(), mMutex() {} + + void AddMemoryAddress(const void* memoryAddress) { + MutexAutoLock lock(mMutex); + if (!mAllocations.put(memoryAddress)) { + MOZ_CRASH("Out of memory while tracking native allocations."); + }; + } + + void Reset() { + MutexAutoLock lock(mMutex); + mAllocations.clearAndCompact(); + } + + // Returns true when the memory address is found and removed, otherwise that + // memory address is not being tracked and it returns false. 
+ bool RemoveMemoryAddressIfFound(const void* memoryAddress) { + MutexAutoLock lock(mMutex); + + auto ptr = mAllocations.lookup(memoryAddress); + if (ptr) { + // The memory was present. It no longer needs to be tracked. + mAllocations.remove(ptr); + return true; + } + + return false; + } + + private: + AllocationSet mAllocations; + Mutex mMutex MOZ_UNANNOTATED; +}; + +static AllocationTracker* gAllocationTracker; + +static void EnsureAllocationTrackerIsInstalled() { + if (!gAllocationTracker) { + // This is only installed once. + gAllocationTracker = new AllocationTracker(); + } +} + +//--------------------------------------------------------------------------- +// Per-thread blocking of intercepts +//--------------------------------------------------------------------------- + +// On MacOS, and Linux the first __thread/thread_local access calls malloc, +// which leads to an infinite loop. So we use pthread-based TLS instead, which +// somehow doesn't have this problem. +#if !defined(XP_DARWIN) && !defined(XP_LINUX) +# define PROFILER_THREAD_LOCAL(T) MOZ_THREAD_LOCAL(T) +#else +# define PROFILER_THREAD_LOCAL(T) \ + ::mozilla::detail::ThreadLocal +#endif + +// This class is used to determine if allocations on this thread should be +// intercepted or not. +// Creating a ThreadIntercept object on the stack will implicitly block nested +// ones. There are other reasons to block: The feature is off, or we're inside a +// profiler function that is locking a mutex. +class MOZ_RAII ThreadIntercept { + // When set to true, malloc does not intercept additional allocations. This is + // needed because collecting stacks creates new allocations. When blocked, + // these allocations are then ignored by the memory hook. + static PROFILER_THREAD_LOCAL(bool) tlsIsBlocked; + + // This is a quick flag to check and see if the allocations feature is enabled + // or disabled. + static mozilla::Atomic sAllocationsFeatureEnabled; + + // True if this ThreadIntercept has set tlsIsBlocked. + bool mIsBlockingTLS; + + // True if interception is blocked for any reason. + bool mIsBlocked; + + public: + static void Init() { + tlsIsBlocked.infallibleInit(); + // infallibleInit should zero-initialize, which corresponds to `false`. + MOZ_ASSERT(!tlsIsBlocked.get()); + } + + ThreadIntercept() { + // If the allocation interception feature is enabled, and the TLS is not + // blocked yet, we will block the TLS now, and unblock on destruction. + mIsBlockingTLS = sAllocationsFeatureEnabled && !tlsIsBlocked.get(); + if (mIsBlockingTLS) { + MOZ_ASSERT(!tlsIsBlocked.get()); + tlsIsBlocked.set(true); + // Since this is the top-level ThreadIntercept, interceptions are not + // blocked unless the profiler itself holds a locked mutex, in which case + // we don't want to intercept allocations that originate from such a + // profiler call. + mIsBlocked = profiler_is_locked_on_current_thread(); + } else { + // The feature is off, or the TLS was already blocked, then we block this + // interception. + mIsBlocked = true; + } + } + + ~ThreadIntercept() { + if (mIsBlockingTLS) { + MOZ_ASSERT(tlsIsBlocked.get()); + tlsIsBlocked.set(false); + } + } + + // Is this ThreadIntercept effectively blocked? (Feature is off, or this + // ThreadIntercept is nested, or we're inside a locked-Profiler function.) 
+ bool IsBlocked() const { return mIsBlocked; } + + static void EnableAllocationFeature() { sAllocationsFeatureEnabled = true; } + + static void DisableAllocationFeature() { sAllocationsFeatureEnabled = false; } +}; + +PROFILER_THREAD_LOCAL(bool) ThreadIntercept::tlsIsBlocked; + +mozilla::Atomic + ThreadIntercept::sAllocationsFeatureEnabled(false); + +//--------------------------------------------------------------------------- +// malloc/free callbacks +//--------------------------------------------------------------------------- + +static void AllocCallback(void* aPtr, size_t aReqSize) { + if (!aPtr) { + return; + } + + // The first part of this function does not allocate. + size_t actualSize = gMallocTable.malloc_usable_size(aPtr); + if (actualSize > 0) { + sCounter->Add(actualSize); + } + + ThreadIntercept threadIntercept; + if (threadIntercept.IsBlocked()) { + // Either the native allocations feature is not turned on, or we may be + // recursing into a memory hook, return. We'll still collect counter + // information about this allocation, but no stack. + return; + } + + AUTO_PROFILER_LABEL("AllocCallback", PROFILER); + + // Perform a bernoulli trial, which will return true or false based on its + // configured probability. It takes into account the byte size so that + // larger allocations are weighted heavier than smaller allocations. + MOZ_ASSERT(gBernoulli, + "gBernoulli must be properly installed for the memory hooks."); + if ( + // First perform the Bernoulli trial. + gBernoulli->trial(actualSize) && + // Second, attempt to add a marker if the Bernoulli trial passed. + profiler_add_native_allocation_marker( + static_cast(actualSize), + reinterpret_cast(aPtr))) { + MOZ_ASSERT(gAllocationTracker, + "gAllocationTracker must be properly installed for the memory " + "hooks."); + // Only track the memory if the allocation marker was actually added to the + // profiler. + gAllocationTracker->AddMemoryAddress(aPtr); + } + + // We're ignoring aReqSize here +} + +static void FreeCallback(void* aPtr) { + if (!aPtr) { + return; + } + + // The first part of this function does not allocate. + size_t unsignedSize = MallocSizeOf(aPtr); + int64_t signedSize = -(static_cast(unsignedSize)); + sCounter->Add(signedSize); + + ThreadIntercept threadIntercept; + if (threadIntercept.IsBlocked()) { + // Either the native allocations feature is not turned on, or we may be + // recursing into a memory hook, return. We'll still collect counter + // information about this allocation, but no stack. + return; + } + + AUTO_PROFILER_LABEL("FreeCallback", PROFILER); + + // Perform a bernoulli trial, which will return true or false based on its + // configured probability. It takes into account the byte size so that + // larger allocations are weighted heavier than smaller allocations. + MOZ_ASSERT( + gAllocationTracker, + "gAllocationTracker must be properly installed for the memory hooks."); + if (gAllocationTracker->RemoveMemoryAddressIfFound(aPtr)) { + // This size here is negative, indicating a deallocation. + profiler_add_native_allocation_marker(signedSize, + reinterpret_cast(aPtr)); + } +} + +} // namespace mozilla::profiler + +//--------------------------------------------------------------------------- +// malloc/free interception +//--------------------------------------------------------------------------- + +using namespace mozilla::profiler; + +static void* replace_malloc(size_t aSize) { + // This must be a call to malloc from outside. Intercept it. 
+ void* ptr = gMallocTable.malloc(aSize); + AllocCallback(ptr, aSize); + return ptr; +} + +static void* replace_calloc(size_t aCount, size_t aSize) { + void* ptr = gMallocTable.calloc(aCount, aSize); + AllocCallback(ptr, aCount * aSize); + return ptr; +} + +static void* replace_realloc(void* aOldPtr, size_t aSize) { + // If |aOldPtr| is nullptr, the call is equivalent to |malloc(aSize)|. + if (!aOldPtr) { + return replace_malloc(aSize); + } + + FreeCallback(aOldPtr); + void* ptr = gMallocTable.realloc(aOldPtr, aSize); + if (ptr) { + AllocCallback(ptr, aSize); + } else { + // If realloc fails, we undo the prior operations by re-inserting the old + // pointer into the live block table. We don't have to do anything with the + // dead block list because the dead block hasn't yet been inserted. The + // block will end up looking like it was allocated for the first time here, + // which is untrue, and the slop bytes will be zero, which may be untrue. + // But this case is rare and doing better isn't worth the effort. + AllocCallback(aOldPtr, gMallocTable.malloc_usable_size(aOldPtr)); + } + return ptr; +} + +static void* replace_memalign(size_t aAlignment, size_t aSize) { + void* ptr = gMallocTable.memalign(aAlignment, aSize); + AllocCallback(ptr, aSize); + return ptr; +} + +static void replace_free(void* aPtr) { + FreeCallback(aPtr); + gMallocTable.free(aPtr); +} + +static void* replace_moz_arena_malloc(arena_id_t aArena, size_t aSize) { + void* ptr = gMallocTable.moz_arena_malloc(aArena, aSize); + AllocCallback(ptr, aSize); + return ptr; +} + +static void* replace_moz_arena_calloc(arena_id_t aArena, size_t aCount, + size_t aSize) { + void* ptr = gMallocTable.moz_arena_calloc(aArena, aCount, aSize); + AllocCallback(ptr, aCount * aSize); + return ptr; +} + +static void* replace_moz_arena_realloc(arena_id_t aArena, void* aPtr, + size_t aSize) { + void* ptr = gMallocTable.moz_arena_realloc(aArena, aPtr, aSize); + AllocCallback(ptr, aSize); + return ptr; +} + +static void replace_moz_arena_free(arena_id_t aArena, void* aPtr) { + FreeCallback(aPtr); + gMallocTable.moz_arena_free(aArena, aPtr); +} + +static void* replace_moz_arena_memalign(arena_id_t aArena, size_t aAlignment, + size_t aSize) { + void* ptr = gMallocTable.moz_arena_memalign(aArena, aAlignment, aSize); + AllocCallback(ptr, aSize); + return ptr; +} + +// we have to replace these or jemalloc will assume we don't implement any +// of the arena replacements! +static arena_id_t replace_moz_create_arena_with_params( + arena_params_t* aParams) { + return gMallocTable.moz_create_arena_with_params(aParams); +} + +static void replace_moz_dispose_arena(arena_id_t aArenaId) { + return gMallocTable.moz_dispose_arena(aArenaId); +} + +static void replace_moz_set_max_dirty_page_modifier(int32_t aModifier) { + return gMallocTable.moz_set_max_dirty_page_modifier(aModifier); +} + +// Must come after all the replace_* funcs +void replace_init(malloc_table_t* aMallocTable, ReplaceMallocBridge** aBridge) { + gMallocTable = *aMallocTable; +#define MALLOC_FUNCS (MALLOC_FUNCS_MALLOC_BASE | MALLOC_FUNCS_ARENA) +#define MALLOC_DECL(name, ...) 
aMallocTable->name = replace_##name; +#include "malloc_decls.h" +} + +void profiler_replace_remove() {} + +namespace mozilla::profiler { +//--------------------------------------------------------------------------- +// Initialization +//--------------------------------------------------------------------------- + +BaseProfilerCount* install_memory_hooks() { + if (!sCounter) { + sCounter = new ProfilerCounterTotal("malloc", "Memory", + "Amount of allocated memory"); + // Also initialize the ThreadIntercept, even if native allocation tracking + // won't be turned on. This way the TLS will be initialized. + ThreadIntercept::Init(); + } else { + sCounter->Clear(); + } + jemalloc_replace_dynamic(replace_init); + return sCounter; +} + +// Remove the hooks, but leave the sCounter machinery. Deleting the counter +// would race with any existing memory hooks that are currently running. Rather +// than adding overhead here of mutexes it's cheaper for the performance to just +// leak these values. +void remove_memory_hooks() { jemalloc_replace_dynamic(nullptr); } + +void enable_native_allocations() { + // The bloat log tracks allocations and deallocations. This can conflict + // with the memory hook machinery, as the bloat log creates its own + // allocations. This means we can re-enter inside the bloat log machinery. At + // this time, the bloat log does not know about cannot handle the native + // allocation feature. + // + // At the time of this writing, we hit this assertion: + // IsIdle(oldState) || IsRead(oldState) in Checker::StartReadOp() + // + // #01: GetBloatEntry(char const*, unsigned int) + // #02: NS_LogCtor + // #03: profiler_get_backtrace() + // #04: profiler_add_native_allocation_marker(long long) + // #05: mozilla::profiler::AllocCallback(void*, unsigned long) + // #06: replace_calloc(unsigned long, unsigned long) + // #07: PLDHashTable::ChangeTable(int) + // #08: PLDHashTable::Add(void const*, std::nothrow_t const&) + // #09: nsBaseHashtable, ... + // #10: GetBloatEntry(char const*, unsigned int) + // #11: NS_LogCtor + // #12: profiler_get_backtrace() + // ... + MOZ_ASSERT(!PR_GetEnv("XPCOM_MEM_BLOAT_LOG"), + "The bloat log feature is not compatible with the native " + "allocations instrumentation."); + + EnsureBernoulliIsInstalled(); + EnsureAllocationTrackerIsInstalled(); + ThreadIntercept::EnableAllocationFeature(); +} + +// This is safe to call even if native allocations hasn't been enabled. +void disable_native_allocations() { + ThreadIntercept::DisableAllocationFeature(); + if (gAllocationTracker) { + gAllocationTracker->Reset(); + } +} + +} // namespace mozilla::profiler diff --git a/tools/profiler/core/memory_hooks.h b/tools/profiler/core/memory_hooks.h new file mode 100644 index 0000000000..a6ace771dd --- /dev/null +++ b/tools/profiler/core/memory_hooks.h @@ -0,0 +1,25 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef memory_hooks_h +#define memory_hooks_h + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) +class BaseProfilerCount; + +namespace mozilla { +namespace profiler { + +BaseProfilerCount* install_memory_hooks(); +void remove_memory_hooks(); +void enable_native_allocations(); +void disable_native_allocations(); + +} // namespace profiler +} // namespace mozilla +#endif + +#endif diff --git a/tools/profiler/core/platform-linux-android.cpp b/tools/profiler/core/platform-linux-android.cpp new file mode 100644 index 0000000000..6bcb9cf38b --- /dev/null +++ b/tools/profiler/core/platform-linux-android.cpp @@ -0,0 +1,636 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google, Inc. nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// This file is used for both Linux and Android as well as FreeBSD. + +#include +#include + +#include +#if defined(GP_OS_freebsd) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +// Ubuntu Dapper requires memory pages to be marked as +// executable. Otherwise, OS raises an exception when executing code +// in that page. +#include // mmap & munmap +#include // mmap & munmap +#include // open +#include // open +#include // sysconf +#include +#ifdef __GLIBC__ +# include // backtrace, backtrace_symbols +#endif // def __GLIBC__ +#include // index +#include +#include + +#include "prenv.h" +#include "mozilla/PodOperations.h" +#include "mozilla/DebugOnly.h" +#if defined(GP_OS_linux) || defined(GP_OS_android) +# include "common/linux/breakpad_getcontext.h" +#endif + +#include +#include + +using namespace mozilla; + +static void PopulateRegsFromContext(Registers& aRegs, ucontext_t* aContext) { + aRegs.mContext = aContext; + mcontext_t& mcontext = aContext->uc_mcontext; + + // Extracting the sample from the context is extremely machine dependent. 
+#if defined(GP_PLAT_x86_linux) || defined(GP_PLAT_x86_android) + aRegs.mPC = reinterpret_cast
(mcontext.gregs[REG_EIP]); + aRegs.mSP = reinterpret_cast
(mcontext.gregs[REG_ESP]); + aRegs.mFP = reinterpret_cast
(mcontext.gregs[REG_EBP]); + aRegs.mLR = 0; +#elif defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_amd64_android) + aRegs.mPC = reinterpret_cast
(mcontext.gregs[REG_RIP]); + aRegs.mSP = reinterpret_cast
(mcontext.gregs[REG_RSP]); + aRegs.mFP = reinterpret_cast
(mcontext.gregs[REG_RBP]); + aRegs.mLR = 0; +#elif defined(GP_PLAT_amd64_freebsd) + aRegs.mPC = reinterpret_cast
(mcontext.mc_rip); + aRegs.mSP = reinterpret_cast
(mcontext.mc_rsp); + aRegs.mFP = reinterpret_cast
(mcontext.mc_rbp); + aRegs.mLR = 0; +#elif defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android) + aRegs.mPC = reinterpret_cast
(mcontext.arm_pc); + aRegs.mSP = reinterpret_cast
(mcontext.arm_sp); + aRegs.mFP = reinterpret_cast
(mcontext.arm_fp); + aRegs.mLR = reinterpret_cast
(mcontext.arm_lr); +#elif defined(GP_PLAT_arm64_linux) || defined(GP_PLAT_arm64_android) + aRegs.mPC = reinterpret_cast
(mcontext.pc); + aRegs.mSP = reinterpret_cast
(mcontext.sp); + aRegs.mFP = reinterpret_cast
(mcontext.regs[29]); + aRegs.mLR = reinterpret_cast
(mcontext.regs[30]); +#elif defined(GP_PLAT_arm64_freebsd) + aRegs.mPC = reinterpret_cast<Address>
(mcontext.mc_gpregs.gp_elr); + aRegs.mSP = reinterpret_cast<Address>
(mcontext.mc_gpregs.gp_sp); + aRegs.mFP = reinterpret_cast<Address>
(mcontext.mc_gpregs.gp_x[29]); + aRegs.mLR = reinterpret_cast<Address>
(mcontext.mc_gpregs.gp_lr); +#elif defined(GP_PLAT_mips64_linux) || defined(GP_PLAT_mips64_android) + aRegs.mPC = reinterpret_cast<Address>
(mcontext.pc); + aRegs.mSP = reinterpret_cast<Address>
(mcontext.gregs[29]); + aRegs.mFP = reinterpret_cast<Address>
(mcontext.gregs[30]); + +#else +# error "bad platform" +#endif +} + +#if defined(GP_OS_android) +# define SYS_tgkill __NR_tgkill +#endif + +#if defined(GP_OS_linux) || defined(GP_OS_android) +int tgkill(pid_t tgid, pid_t tid, int signalno) { + return syscall(SYS_tgkill, tgid, tid, signalno); +} +#endif + +#if defined(GP_OS_freebsd) +# define tgkill thr_kill2 +#endif + +mozilla::profiler::PlatformData::PlatformData(ProfilerThreadId aThreadId) { + MOZ_ASSERT(aThreadId == profiler_current_thread_id()); + if (clockid_t clockid; pthread_getcpuclockid(pthread_self(), &clockid) == 0) { + mClockId = Some(clockid); + } +} + +mozilla::profiler::PlatformData::~PlatformData() = default; + +//////////////////////////////////////////////////////////////////////// +// BEGIN Sampler target specifics + +// The only way to reliably interrupt a Linux thread and inspect its register +// and stack state is by sending a signal to it, and doing the work inside the +// signal handler. But we don't want to run much code inside the signal +// handler, since POSIX severely restricts what we can do in signal handlers. +// So we use a system of semaphores to suspend the thread and allow the +// sampler thread to do all the work of unwinding and copying out whatever +// data it wants. +// +// A four-message protocol is used to reliably suspend and later resume the +// thread to be sampled (the samplee): +// +// Sampler (signal sender) thread Samplee (thread to be sampled) +// +// Prepare the SigHandlerCoordinator +// and point sSigHandlerCoordinator at it +// +// send SIGPROF to samplee ------- MSG 1 ----> (enter signal handler) +// wait(mMessage2) Copy register state +// into sSigHandlerCoordinator +// <------ MSG 2 ----- post(mMessage2) +// Samplee is now suspended. wait(mMessage3) +// Examine its stack/register +// state at leisure +// +// Release samplee: +// post(mMessage3) ------- MSG 3 -----> +// wait(mMessage4) Samplee now resumes. Tell +// the sampler that we are done. +// <------ MSG 4 ------ post(mMessage4) +// Now we know the samplee's signal (leave signal handler) +// handler has finished using +// sSigHandlerCoordinator. We can +// safely reuse it for some other thread. +// + +// A type used to coordinate between the sampler (signal sending) thread and +// the thread currently being sampled (the samplee, which receives the +// signals). +// +// The first message is sent using a SIGPROF signal delivery. The subsequent +// three are sent using sem_wait/sem_post pairs. They are named accordingly +// in the following struct. +struct SigHandlerCoordinator { + SigHandlerCoordinator() { + PodZero(&mUContext); + int r = sem_init(&mMessage2, /* pshared */ 0, 0); + r |= sem_init(&mMessage3, /* pshared */ 0, 0); + r |= sem_init(&mMessage4, /* pshared */ 0, 0); + MOZ_ASSERT(r == 0); + (void)r; + } + + ~SigHandlerCoordinator() { + int r = sem_destroy(&mMessage2); + r |= sem_destroy(&mMessage3); + r |= sem_destroy(&mMessage4); + MOZ_ASSERT(r == 0); + (void)r; + } + + sem_t mMessage2; // To sampler: "context is in sSigHandlerCoordinator" + sem_t mMessage3; // To samplee: "resume" + sem_t mMessage4; // To sampler: "finished with sSigHandlerCoordinator" + ucontext_t mUContext; // Context at signal +}; + +struct SigHandlerCoordinator* Sampler::sSigHandlerCoordinator = nullptr; + +static void SigprofHandler(int aSignal, siginfo_t* aInfo, void* aContext) { + // Avoid TSan warning about clobbering errno. 
+ int savedErrno = errno; + + MOZ_ASSERT(aSignal == SIGPROF); + MOZ_ASSERT(Sampler::sSigHandlerCoordinator); + + // By sending us this signal, the sampler thread has sent us message 1 in + // the comment above, with the meaning "|sSigHandlerCoordinator| is ready + // for use, please copy your register context into it." + Sampler::sSigHandlerCoordinator->mUContext = + *static_cast(aContext); + + // Send message 2: tell the sampler thread that the context has been copied + // into |sSigHandlerCoordinator->mUContext|. sem_post can never fail by + // being interrupted by a signal, so there's no loop around this call. + int r = sem_post(&Sampler::sSigHandlerCoordinator->mMessage2); + MOZ_ASSERT(r == 0); + + // At this point, the sampler thread assumes we are suspended, so we must + // not touch any global state here. + + // Wait for message 3: the sampler thread tells us to resume. + while (true) { + r = sem_wait(&Sampler::sSigHandlerCoordinator->mMessage3); + if (r == -1 && errno == EINTR) { + // Interrupted by a signal. Try again. + continue; + } + // We don't expect any other kind of failure + MOZ_ASSERT(r == 0); + break; + } + + // Send message 4: tell the sampler thread that we are finished accessing + // |sSigHandlerCoordinator|. After this point it is not safe to touch + // |sSigHandlerCoordinator|. + r = sem_post(&Sampler::sSigHandlerCoordinator->mMessage4); + MOZ_ASSERT(r == 0); + + errno = savedErrno; +} + +Sampler::Sampler(PSLockRef aLock) + : mMyPid(profiler_current_process_id()), + // We don't know what the sampler thread's ID will be until it runs, so + // set mSamplerTid to a dummy value and fill it in for real in + // SuspendAndSampleAndResumeThread(). + mSamplerTid{} { +#if defined(USE_EHABI_STACKWALK) + mozilla::EHABIStackWalkInit(); +#endif + + // NOTE: We don't initialize LUL here, instead initializing it in + // SamplerThread's constructor. This is because with the + // profiler_suspend_and_sample_thread entry point, we want to be able to + // sample without waiting for LUL to be initialized. + + // Request profiling signals. + struct sigaction sa; + sa.sa_sigaction = SigprofHandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART | SA_SIGINFO; + if (sigaction(SIGPROF, &sa, &mOldSigprofHandler) != 0) { + MOZ_CRASH("Error installing SIGPROF handler in the profiler"); + } +} + +void Sampler::Disable(PSLockRef aLock) { + // Restore old signal handler. This is global state so it's important that + // we do it now, while gPSMutex is locked. 
+ sigaction(SIGPROF, &mOldSigprofHandler, 0); +} + +static void StreamMetaPlatformSampleUnits(PSLockRef aLock, + SpliceableJSONWriter& aWriter) { + aWriter.StringProperty("threadCPUDelta", "ns"); +} + +/* static */ +uint64_t RunningTimes::ConvertRawToJson(uint64_t aRawValue) { + return aRawValue; +} + +namespace mozilla::profiler { +bool GetCpuTimeSinceThreadStartInNs( + uint64_t* aResult, const mozilla::profiler::PlatformData& aPlatformData) { + Maybe maybeCid = aPlatformData.GetClockId(); + if (MOZ_UNLIKELY(!maybeCid)) { + return false; + } + + timespec t; + if (clock_gettime(*maybeCid, &t) != 0) { + return false; + } + + *aResult = uint64_t(t.tv_sec) * 1'000'000'000u + uint64_t(t.tv_nsec); + return true; +} +} // namespace mozilla::profiler + +static RunningTimes GetProcessRunningTimesDiff( + PSLockRef aLock, RunningTimes& aPreviousRunningTimesToBeUpdated) { + AUTO_PROFILER_STATS(GetProcessRunningTimes); + + RunningTimes newRunningTimes; + { + AUTO_PROFILER_STATS(GetProcessRunningTimes_clock_gettime); + if (timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) == 0) { + newRunningTimes.SetThreadCPUDelta(uint64_t(ts.tv_sec) * 1'000'000'000u + + uint64_t(ts.tv_nsec)); + } + newRunningTimes.SetPostMeasurementTimeStamp(TimeStamp::Now()); + }; + + const RunningTimes diff = newRunningTimes - aPreviousRunningTimesToBeUpdated; + aPreviousRunningTimesToBeUpdated = newRunningTimes; + return diff; +} + +static RunningTimes GetThreadRunningTimesDiff( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + AUTO_PROFILER_STATS(GetRunningTimes_clock_gettime_thread); + + const mozilla::profiler::PlatformData& platformData = + aThreadData.PlatformDataCRef(); + Maybe maybeCid = platformData.GetClockId(); + + if (MOZ_UNLIKELY(!maybeCid)) { + // No clock id -> Nothing to measure apart from the timestamp. + RunningTimes emptyRunningTimes; + emptyRunningTimes.SetPostMeasurementTimeStamp(TimeStamp::Now()); + return emptyRunningTimes; + } + + const RunningTimes newRunningTimes = GetRunningTimesWithTightTimestamp( + [cid = *maybeCid](RunningTimes& aRunningTimes) { + AUTO_PROFILER_STATS(GetRunningTimes_clock_gettime); + if (timespec ts; clock_gettime(cid, &ts) == 0) { + aRunningTimes.ResetThreadCPUDelta( + uint64_t(ts.tv_sec) * 1'000'000'000u + uint64_t(ts.tv_nsec)); + } else { + aRunningTimes.ClearThreadCPUDelta(); + } + }); + + ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(aLock); + MOZ_ASSERT(profiledThreadData); + RunningTimes& previousRunningTimes = + profiledThreadData->PreviousThreadRunningTimesRef(); + const RunningTimes diff = newRunningTimes - previousRunningTimes; + previousRunningTimes = newRunningTimes; + return diff; +} + +static void DiscardSuspendedThreadRunningTimes( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + AUTO_PROFILER_STATS(DiscardSuspendedThreadRunningTimes); + + // On Linux, suspending a thread uses a signal that makes that thread work + // to handle it. So we want to discard any added running time since the call + // to GetThreadRunningTimesDiff, which is done by overwriting the thread's + // PreviousThreadRunningTimesRef() with the current running time now. + + const mozilla::profiler::PlatformData& platformData = + aThreadData.PlatformDataCRef(); + Maybe maybeCid = platformData.GetClockId(); + + if (MOZ_UNLIKELY(!maybeCid)) { + // No clock id -> Nothing to measure. 
+ return; + } + + ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(aLock); + MOZ_ASSERT(profiledThreadData); + RunningTimes& previousRunningTimes = + profiledThreadData->PreviousThreadRunningTimesRef(); + + if (timespec ts; clock_gettime(*maybeCid, &ts) == 0) { + previousRunningTimes.ResetThreadCPUDelta( + uint64_t(ts.tv_sec) * 1'000'000'000u + uint64_t(ts.tv_nsec)); + } else { + previousRunningTimes.ClearThreadCPUDelta(); + } +} + +template +void Sampler::SuspendAndSampleAndResumeThread( + PSLockRef aLock, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const TimeStamp& aNow, const Func& aProcessRegs) { + // Only one sampler thread can be sampling at once. So we expect to have + // complete control over |sSigHandlerCoordinator|. + MOZ_ASSERT(!sSigHandlerCoordinator); + + if (!mSamplerTid.IsSpecified()) { + mSamplerTid = profiler_current_thread_id(); + } + ProfilerThreadId sampleeTid = aThreadData.Info().ThreadId(); + MOZ_RELEASE_ASSERT(sampleeTid != mSamplerTid); + + //----------------------------------------------------------------// + // Suspend the samplee thread and get its context. + + SigHandlerCoordinator coord; // on sampler thread's stack + sSigHandlerCoordinator = &coord; + + // Send message 1 to the samplee (the thread to be sampled), by + // signalling at it. + // This could fail if the thread doesn't exist anymore. + int r = tgkill(mMyPid.ToNumber(), sampleeTid.ToNumber(), SIGPROF); + if (r == 0) { + // Wait for message 2 from the samplee, indicating that the context + // is available and that the thread is suspended. + while (true) { + r = sem_wait(&sSigHandlerCoordinator->mMessage2); + if (r == -1 && errno == EINTR) { + // Interrupted by a signal. Try again. + continue; + } + // We don't expect any other kind of failure. + MOZ_ASSERT(r == 0); + break; + } + + //----------------------------------------------------------------// + // Sample the target thread. + + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + // + // The profiler's "critical section" begins here. In the critical section, + // we must not do any dynamic memory allocation, nor try to acquire any lock + // or any other unshareable resource. This is because the thread to be + // sampled has been suspended at some entirely arbitrary point, and we have + // no idea which unsharable resources (locks, essentially) it holds. So any + // attempt to acquire any lock, including the implied locks used by the + // malloc implementation, risks deadlock. This includes TimeStamp::Now(), + // which gets a lock on Windows. + + // The samplee thread is now frozen and sSigHandlerCoordinator->mUContext is + // valid. We can poke around in it and unwind its stack as we like. + + // Extract the current register values. + Registers regs; + PopulateRegsFromContext(regs, &sSigHandlerCoordinator->mUContext); + aProcessRegs(regs, aNow); + + //----------------------------------------------------------------// + // Resume the target thread. + + // Send message 3 to the samplee, which tells it to resume. + r = sem_post(&sSigHandlerCoordinator->mMessage3); + MOZ_ASSERT(r == 0); + + // Wait for message 4 from the samplee, which tells us that it has + // finished with |sSigHandlerCoordinator|. + while (true) { + r = sem_wait(&sSigHandlerCoordinator->mMessage4); + if (r == -1 && errno == EINTR) { + continue; + } + MOZ_ASSERT(r == 0); + break; + } + + // The profiler's critical section ends here. 
After this point, none of the + // critical section limitations documented above apply. + // + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + } + + // This isn't strictly necessary, but doing so does help pick up anomalies + // in which the signal handler is running when it shouldn't be. + sSigHandlerCoordinator = nullptr; +} + +// END Sampler target specifics +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// BEGIN SamplerThread target specifics + +static void* ThreadEntry(void* aArg) { + auto thread = static_cast(aArg); + thread->Run(); + return nullptr; +} + +SamplerThread::SamplerThread(PSLockRef aLock, uint32_t aActivityGeneration, + double aIntervalMilliseconds, uint32_t aFeatures) + : mSampler(aLock), + mActivityGeneration(aActivityGeneration), + mIntervalMicroseconds( + std::max(1, int(floor(aIntervalMilliseconds * 1000 + 0.5)))) { +#if defined(USE_LUL_STACKWALK) + lul::LUL* lul = CorePS::Lul(); + if (!lul && ProfilerFeature::HasStackWalk(aFeatures)) { + CorePS::SetLul(MakeUnique(logging_sink_for_LUL)); + // Read all the unwind info currently available. + lul = CorePS::Lul(); + read_procmaps(lul); + + // Switch into unwind mode. After this point, we can't add or remove any + // unwind info to/from this LUL instance. The only thing we can do with + // it is Unwind() calls. + lul->EnableUnwinding(); + + // Has a test been requested? + if (PR_GetEnv("MOZ_PROFILER_LUL_TEST")) { + int nTests = 0, nTestsPassed = 0; + RunLulUnitTests(&nTests, &nTestsPassed, lul); + } + } +#endif + + // Start the sampling thread. It repeatedly sends a SIGPROF signal. Sending + // the signal ourselves instead of relying on itimer provides much better + // accuracy. + // + // At least 350 KiB of stack space are needed when built with TSAN. This + // includes lul::N_STACK_BYTES plus whatever else is needed for the sampler + // thread. Set the stack size to 800 KiB to keep a safe margin above that. + pthread_attr_t attr; + if (pthread_attr_init(&attr) != 0 || + pthread_attr_setstacksize(&attr, 800 * 1024) != 0 || + pthread_create(&mThread, &attr, ThreadEntry, this) != 0) { + MOZ_CRASH("pthread_create failed"); + } + pthread_attr_destroy(&attr); +} + +SamplerThread::~SamplerThread() { + pthread_join(mThread, nullptr); + // Just in the unlikely case some callbacks were added between the end of the + // thread and now. + InvokePostSamplingCallbacks(std::move(mPostSamplingCallbackList), + SamplingState::JustStopped); +} + +void SamplerThread::SleepMicro(uint32_t aMicroseconds) { + if (aMicroseconds >= 1000000) { + // Use usleep for larger intervals, because the nanosleep + // code below only supports intervals < 1 second. + MOZ_ALWAYS_TRUE(!::usleep(aMicroseconds)); + return; + } + + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = aMicroseconds * 1000UL; + + int rv = ::nanosleep(&ts, &ts); + + while (rv != 0 && errno == EINTR) { + // Keep waiting in case of interrupt. + // nanosleep puts the remaining time back into ts. + rv = ::nanosleep(&ts, &ts); + } + + MOZ_ASSERT(!rv, "nanosleep call failed"); +} + +void SamplerThread::Stop(PSLockRef aLock) { + // Restore old signal handler. This is global state so it's important that + // we do it now, while gPSMutex is locked. It's safe to do this now even + // though this SamplerThread is still alive, because the next time the main + // loop of Run() iterates it won't get past the mActivityGeneration check, + // and so won't send any signals. 
+ mSampler.Disable(aLock); +} + +// END SamplerThread target specifics +//////////////////////////////////////////////////////////////////////// + +#if defined(GP_OS_linux) || defined(GP_OS_freebsd) + +// We use pthread_atfork() to temporarily disable signal delivery during any +// fork() call. Without that, fork() can be repeatedly interrupted by signal +// delivery, requiring it to be repeatedly restarted, which can lead to *long* +// delays. See bug 837390. +// +// We provide no paf_child() function to run in the child after forking. This +// is fine because we always immediately exec() after fork(), and exec() +// clobbers all process state. Also, we don't want the sampler to resume in the +// child process between fork() and exec(), it would be wasteful. +// +// Unfortunately all this is only doable on non-Android because Bionic doesn't +// have pthread_atfork. + +// In the parent, before the fork, increase gSkipSampling to ensure that +// profiler sampling loops will be skipped. There could be one in progress now, +// causing a small delay, but further sampling will be skipped, allowing `fork` +// to complete. +static void paf_prepare() { ++gSkipSampling; } + +// In the parent, after the fork, decrease gSkipSampling to let the sampler +// resume sampling (unless other places have made it non-zero as well). +static void paf_parent() { --gSkipSampling; } + +static void PlatformInit(PSLockRef aLock) { + // Set up the fork handlers. + pthread_atfork(paf_prepare, paf_parent, nullptr); +} + +#else + +static void PlatformInit(PSLockRef aLock) {} + +#endif + +#if defined(HAVE_NATIVE_UNWIND) +# define REGISTERS_SYNC_POPULATE(regs) \ + if (!getcontext(®s.mContextSyncStorage)) { \ + PopulateRegsFromContext(regs, ®s.mContextSyncStorage); \ + } +#endif diff --git a/tools/profiler/core/platform-macos.cpp b/tools/profiler/core/platform-macos.cpp new file mode 100644 index 0000000000..b69a346d64 --- /dev/null +++ b/tools/profiler/core/platform-macos.cpp @@ -0,0 +1,297 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// this port is based off of v8 svn revision 9837 + +mozilla::profiler::PlatformData::PlatformData(ProfilerThreadId aThreadId) + : mProfiledThread(mach_thread_self()) {} + +mozilla::profiler::PlatformData::~PlatformData() { + // Deallocate Mach port for thread. + mach_port_deallocate(mach_task_self(), mProfiledThread); +} + +//////////////////////////////////////////////////////////////////////// +// BEGIN Sampler target specifics + +Sampler::Sampler(PSLockRef aLock) {} + +void Sampler::Disable(PSLockRef aLock) {} + +static void StreamMetaPlatformSampleUnits(PSLockRef aLock, + SpliceableJSONWriter& aWriter) { + // Microseconds. 
+ aWriter.StringProperty("threadCPUDelta", "\u00B5s"); +} + +/* static */ +uint64_t RunningTimes::ConvertRawToJson(uint64_t aRawValue) { + return aRawValue; +} + +namespace mozilla::profiler { +bool GetCpuTimeSinceThreadStartInNs( + uint64_t* aResult, const mozilla::profiler::PlatformData& aPlatformData) { + thread_extended_info_data_t threadInfoData; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + if (thread_info(aPlatformData.ProfiledThread(), THREAD_EXTENDED_INFO, + (thread_info_t)&threadInfoData, &count) != KERN_SUCCESS) { + return false; + } + + *aResult = threadInfoData.pth_user_time + threadInfoData.pth_system_time; + return true; +} +} // namespace mozilla::profiler + +static RunningTimes GetProcessRunningTimesDiff( + PSLockRef aLock, RunningTimes& aPreviousRunningTimesToBeUpdated) { + AUTO_PROFILER_STATS(GetProcessRunningTimes); + + RunningTimes newRunningTimes; + { + AUTO_PROFILER_STATS(GetProcessRunningTimes_task_info); + + static const auto pid = getpid(); + struct proc_taskinfo pti; + if ((unsigned long)proc_pidinfo(pid, PROC_PIDTASKINFO, 0, &pti, + PROC_PIDTASKINFO_SIZE) >= + PROC_PIDTASKINFO_SIZE) { + newRunningTimes.SetThreadCPUDelta(pti.pti_total_user + + pti.pti_total_system); + } + newRunningTimes.SetPostMeasurementTimeStamp(TimeStamp::Now()); + }; + + const RunningTimes diff = newRunningTimes - aPreviousRunningTimesToBeUpdated; + aPreviousRunningTimesToBeUpdated = newRunningTimes; + return diff; +} + +static RunningTimes GetThreadRunningTimesDiff( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + AUTO_PROFILER_STATS(GetRunningTimes); + + const mozilla::profiler::PlatformData& platformData = + aThreadData.PlatformDataCRef(); + + const RunningTimes newRunningTimes = GetRunningTimesWithTightTimestamp( + [&platformData](RunningTimes& aRunningTimes) { + AUTO_PROFILER_STATS(GetRunningTimes_thread_info); + thread_basic_info_data_t threadBasicInfo; + mach_msg_type_number_t basicCount = THREAD_BASIC_INFO_COUNT; + if (thread_info(platformData.ProfiledThread(), THREAD_BASIC_INFO, + reinterpret_cast(&threadBasicInfo), + &basicCount) == KERN_SUCCESS && + basicCount == THREAD_BASIC_INFO_COUNT) { + uint64_t userTimeUs = + uint64_t(threadBasicInfo.user_time.seconds) * + uint64_t(USEC_PER_SEC) + + uint64_t(threadBasicInfo.user_time.microseconds); + uint64_t systemTimeUs = + uint64_t(threadBasicInfo.system_time.seconds) * + uint64_t(USEC_PER_SEC) + + uint64_t(threadBasicInfo.system_time.microseconds); + aRunningTimes.ResetThreadCPUDelta(userTimeUs + systemTimeUs); + } else { + aRunningTimes.ClearThreadCPUDelta(); + } + }); + + ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(aLock); + MOZ_ASSERT(profiledThreadData); + RunningTimes& previousRunningTimes = + profiledThreadData->PreviousThreadRunningTimesRef(); + const RunningTimes diff = newRunningTimes - previousRunningTimes; + previousRunningTimes = newRunningTimes; + return diff; +} + +static void DiscardSuspendedThreadRunningTimes( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + // Nothing to do! + // On macOS, suspending a thread doesn't make that thread work. 
+} + +template +void Sampler::SuspendAndSampleAndResumeThread( + PSLockRef aLock, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const TimeStamp& aNow, const Func& aProcessRegs) { + thread_act_t samplee_thread = aThreadData.PlatformDataCRef().ProfiledThread(); + + //----------------------------------------------------------------// + // Suspend the samplee thread and get its context. + + // We're using thread_suspend on OS X because pthread_kill (which is what we + // at one time used on Linux) has less consistent performance and causes + // strange crashes, see bug 1166778 and bug 1166808. thread_suspend + // is also just a lot simpler to use. + + if (KERN_SUCCESS != thread_suspend(samplee_thread)) { + return; + } + + //----------------------------------------------------------------// + // Sample the target thread. + + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + // + // The profiler's "critical section" begins here. We must be very careful + // what we do here, or risk deadlock. See the corresponding comment in + // platform-linux-android.cpp for details. + +#if defined(__x86_64__) + thread_state_flavor_t flavor = x86_THREAD_STATE64; + x86_thread_state64_t state; + mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; +# if __DARWIN_UNIX03 +# define REGISTER_FIELD(name) __r##name +# else +# define REGISTER_FIELD(name) r##name +# endif // __DARWIN_UNIX03 +#elif defined(__aarch64__) + thread_state_flavor_t flavor = ARM_THREAD_STATE64; + arm_thread_state64_t state; + mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; +# if __DARWIN_UNIX03 +# define REGISTER_FIELD(name) __##name +# else +# define REGISTER_FIELD(name) name +# endif // __DARWIN_UNIX03 +#else +# error "unknown architecture" +#endif + + if (thread_get_state(samplee_thread, flavor, + reinterpret_cast(&state), + &count) == KERN_SUCCESS) { + Registers regs; +#if defined(__x86_64__) + regs.mPC = reinterpret_cast
<Address>(state.REGISTER_FIELD(ip)); + regs.mSP = reinterpret_cast<Address>
(state.REGISTER_FIELD(sp)); + regs.mFP = reinterpret_cast<Address>
(state.REGISTER_FIELD(bp)); + regs.mLR = 0; +#elif defined(__aarch64__) + regs.mPC = reinterpret_cast<Address>
(state.REGISTER_FIELD(pc)); + regs.mSP = reinterpret_cast<Address>
(state.REGISTER_FIELD(sp)); + regs.mFP = reinterpret_cast<Address>
(state.REGISTER_FIELD(fp)); + regs.mLR = reinterpret_cast<Address>
(state.REGISTER_FIELD(lr)); +#else +# error "unknown architecture" +#endif + + aProcessRegs(regs, aNow); + } + +#undef REGISTER_FIELD + + //----------------------------------------------------------------// + // Resume the target thread. + + thread_resume(samplee_thread); + + // The profiler's critical section ends here. + // + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +} + +// END Sampler target specifics +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// BEGIN SamplerThread target specifics + +static void* ThreadEntry(void* aArg) { + auto thread = static_cast(aArg); + thread->Run(); + return nullptr; +} + +SamplerThread::SamplerThread(PSLockRef aLock, uint32_t aActivityGeneration, + double aIntervalMilliseconds, uint32_t aFeatures) + : mSampler(aLock), + mActivityGeneration(aActivityGeneration), + mIntervalMicroseconds( + std::max(1, int(floor(aIntervalMilliseconds * 1000 + 0.5)))), + mThread{nullptr} { + pthread_attr_t* attr_ptr = nullptr; + if (pthread_create(&mThread, attr_ptr, ThreadEntry, this) != 0) { + MOZ_CRASH("pthread_create failed"); + } +} + +SamplerThread::~SamplerThread() { + pthread_join(mThread, nullptr); + // Just in the unlikely case some callbacks were added between the end of the + // thread and now. + InvokePostSamplingCallbacks(std::move(mPostSamplingCallbackList), + SamplingState::JustStopped); +} + +void SamplerThread::SleepMicro(uint32_t aMicroseconds) { + usleep(aMicroseconds); + // FIXME: the OSX 10.12 page for usleep says "The usleep() function is + // obsolescent. Use nanosleep(2) instead." This implementation could be + // merged with the linux-android version. Also, this doesn't handle the + // case where the usleep call is interrupted by a signal. +} + +void SamplerThread::Stop(PSLockRef aLock) { mSampler.Disable(aLock); } + +// END SamplerThread target specifics +//////////////////////////////////////////////////////////////////////// + +static void PlatformInit(PSLockRef aLock) {} + +// clang-format off +#if defined(HAVE_NATIVE_UNWIND) +// Derive the stack pointer from the frame pointer. The 0x10 offset is +// 8 bytes for the previous frame pointer and 8 bytes for the return +// address both stored on the stack after at the beginning of the current +// frame. +# define REGISTERS_SYNC_POPULATE(regs) \ + regs.mSP = reinterpret_cast
<Address>(__builtin_frame_address(0)) + 0x10; \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wframe-address\"") \ + regs.mFP = reinterpret_cast<Address>
(__builtin_frame_address(1)); \ + _Pragma("GCC diagnostic pop") \ + regs.mPC = reinterpret_cast<Address>
( \ + __builtin_extract_return_addr(__builtin_return_address(0))); \ + regs.mLR = 0; +#endif +// clang-format on diff --git a/tools/profiler/core/platform-win32.cpp b/tools/profiler/core/platform-win32.cpp new file mode 100644 index 0000000000..5e10e04c89 --- /dev/null +++ b/tools/profiler/core/platform-win32.cpp @@ -0,0 +1,496 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google, Inc. nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +#include +#include +#include + +#include "nsWindowsDllInterceptor.h" +#include "mozilla/StackWalk_windows.h" +#include "mozilla/WindowsVersion.h" + +#include + +static void PopulateRegsFromContext(Registers& aRegs, CONTEXT* aContext) { +#if defined(GP_ARCH_amd64) + aRegs.mPC = reinterpret_cast
<Address>(aContext->Rip); + aRegs.mSP = reinterpret_cast<Address>
(aContext->Rsp); + aRegs.mFP = reinterpret_cast<Address>
(aContext->Rbp); + aRegs.mLR = 0; +#elif defined(GP_ARCH_x86) + aRegs.mPC = reinterpret_cast<Address>
(aContext->Eip); + aRegs.mSP = reinterpret_cast<Address>
(aContext->Esp); + aRegs.mFP = reinterpret_cast<Address>
(aContext->Ebp); + aRegs.mLR = 0; +#elif defined(GP_ARCH_arm64) + aRegs.mPC = reinterpret_cast<Address>
(aContext->Pc); + aRegs.mSP = reinterpret_cast<Address>
(aContext->Sp); + aRegs.mFP = reinterpret_cast<Address>
(aContext->Fp); + aRegs.mLR = reinterpret_cast<Address>
(aContext->Lr); +#else +# error "bad arch" +#endif +} + +// Gets a real (i.e. not pseudo) handle for the current thread, with the +// permissions needed for profiling. +// @return a real HANDLE for the current thread. +static HANDLE GetRealCurrentThreadHandleForProfiling() { + HANDLE realCurrentThreadHandle; + if (!::DuplicateHandle( + ::GetCurrentProcess(), ::GetCurrentThread(), ::GetCurrentProcess(), + &realCurrentThreadHandle, + THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, + FALSE, 0)) { + return nullptr; + } + + return realCurrentThreadHandle; +} + +static_assert( + std::is_same_v); + +mozilla::profiler::PlatformData::PlatformData(ProfilerThreadId aThreadId) + : mProfiledThread(GetRealCurrentThreadHandleForProfiling()) { + MOZ_ASSERT(aThreadId == ProfilerThreadId::FromNumber(::GetCurrentThreadId())); +} + +mozilla::profiler::PlatformData::~PlatformData() { + if (mProfiledThread) { + CloseHandle(mProfiledThread); + mProfiledThread = nullptr; + } +} + +static const HANDLE kNoThread = INVALID_HANDLE_VALUE; + +//////////////////////////////////////////////////////////////////////// +// BEGIN Sampler target specifics + +Sampler::Sampler(PSLockRef aLock) {} + +void Sampler::Disable(PSLockRef aLock) {} + +static void StreamMetaPlatformSampleUnits(PSLockRef aLock, + SpliceableJSONWriter& aWriter) { + static const Span units = + (GetCycleTimeFrequencyMHz() != 0) ? MakeStringSpan("ns") + : MakeStringSpan("variable CPU cycles"); + aWriter.StringProperty("threadCPUDelta", units); +} + +/* static */ +uint64_t RunningTimes::ConvertRawToJson(uint64_t aRawValue) { + static const uint64_t cycleTimeFrequencyMHz = GetCycleTimeFrequencyMHz(); + if (cycleTimeFrequencyMHz == 0u) { + return aRawValue; + } + + constexpr uint64_t GHZ_PER_MHZ = 1'000u; + // To get ns, we need to divide cycles by a frequency in GHz, i.e.: + // cycles / (f_MHz / GHZ_PER_MHZ). To avoid losing the integer precision of + // f_MHz, this is computed as (cycles * GHZ_PER_MHZ) / f_MHz. + // Adding GHZ_PER_MHZ/2 to (cycles * GHZ_PER_MHZ) will round to nearest when + // the result of the division is truncated. 
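// (Editor's illustrative note, not part of the upstream patch: a quick worked
// example of the cycles-to-nanoseconds conversion below, assuming a
// hypothetical cycle counter frequency of 2500 MHz (2.5 GHz). A raw value of
// 10 cycles gives (10 * 1000 + 500) / 2500 = 4 ns, which matches
// 10 cycles / 2.5 GHz = 4 ns.)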
+ return (aRawValue * GHZ_PER_MHZ + (GHZ_PER_MHZ / 2u)) / cycleTimeFrequencyMHz; +} + +static inline uint64_t ToNanoSeconds(const FILETIME& aFileTime) { + // FILETIME values are 100-nanoseconds units, converting + ULARGE_INTEGER usec = {{aFileTime.dwLowDateTime, aFileTime.dwHighDateTime}}; + return usec.QuadPart * 100; +} + +namespace mozilla::profiler { +bool GetCpuTimeSinceThreadStartInNs( + uint64_t* aResult, const mozilla::profiler::PlatformData& aPlatformData) { + const HANDLE profiledThread = aPlatformData.ProfiledThread(); + int frequencyInMHz = GetCycleTimeFrequencyMHz(); + if (frequencyInMHz) { + uint64_t cpuCycleCount; + if (!QueryThreadCycleTime(profiledThread, &cpuCycleCount)) { + return false; + } + + constexpr uint64_t USEC_PER_NSEC = 1000L; + *aResult = cpuCycleCount * USEC_PER_NSEC / frequencyInMHz; + return true; + } + + FILETIME createTime, exitTime, kernelTime, userTime; + if (!GetThreadTimes(profiledThread, &createTime, &exitTime, &kernelTime, + &userTime)) { + return false; + } + + *aResult = ToNanoSeconds(kernelTime) + ToNanoSeconds(userTime); + return true; +} +} // namespace mozilla::profiler + +static RunningTimes GetProcessRunningTimesDiff( + PSLockRef aLock, RunningTimes& aPreviousRunningTimesToBeUpdated) { + AUTO_PROFILER_STATS(GetProcessRunningTimes); + + static const HANDLE processHandle = GetCurrentProcess(); + + RunningTimes newRunningTimes; + { + AUTO_PROFILER_STATS(GetProcessRunningTimes_QueryProcessCycleTime); + if (ULONG64 cycles; QueryProcessCycleTime(processHandle, &cycles) != 0) { + newRunningTimes.SetThreadCPUDelta(cycles); + } + newRunningTimes.SetPostMeasurementTimeStamp(TimeStamp::Now()); + }; + + const RunningTimes diff = newRunningTimes - aPreviousRunningTimesToBeUpdated; + aPreviousRunningTimesToBeUpdated = newRunningTimes; + return diff; +} + +static RunningTimes GetThreadRunningTimesDiff( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + AUTO_PROFILER_STATS(GetThreadRunningTimes); + + const mozilla::profiler::PlatformData& platformData = + aThreadData.PlatformDataCRef(); + const HANDLE profiledThread = platformData.ProfiledThread(); + + const RunningTimes newRunningTimes = GetRunningTimesWithTightTimestamp( + [profiledThread](RunningTimes& aRunningTimes) { + AUTO_PROFILER_STATS(GetThreadRunningTimes_QueryThreadCycleTime); + if (ULONG64 cycles; + QueryThreadCycleTime(profiledThread, &cycles) != 0) { + aRunningTimes.ResetThreadCPUDelta(cycles); + } else { + aRunningTimes.ClearThreadCPUDelta(); + } + }); + + ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(aLock); + MOZ_ASSERT(profiledThreadData); + RunningTimes& previousRunningTimes = + profiledThreadData->PreviousThreadRunningTimesRef(); + const RunningTimes diff = newRunningTimes - previousRunningTimes; + previousRunningTimes = newRunningTimes; + return diff; +} + +static void DiscardSuspendedThreadRunningTimes( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData) { + AUTO_PROFILER_STATS(DiscardSuspendedThreadRunningTimes); + + // On Windows, suspending a thread makes that thread work a little bit. So we + // want to discard any added running time since the call to + // GetThreadRunningTimesDiff, which is done by overwriting the thread's + // PreviousThreadRunningTimesRef() with the current running time now. 
+ + const mozilla::profiler::PlatformData& platformData = + aThreadData.PlatformDataCRef(); + const HANDLE profiledThread = platformData.ProfiledThread(); + + ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(aLock); + MOZ_ASSERT(profiledThreadData); + RunningTimes& previousRunningTimes = + profiledThreadData->PreviousThreadRunningTimesRef(); + + if (ULONG64 cycles; QueryThreadCycleTime(profiledThread, &cycles) != 0) { + previousRunningTimes.ResetThreadCPUDelta(cycles); + } else { + previousRunningTimes.ClearThreadCPUDelta(); + } +} + +template +void Sampler::SuspendAndSampleAndResumeThread( + PSLockRef aLock, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const TimeStamp& aNow, const Func& aProcessRegs) { + HANDLE profiled_thread = aThreadData.PlatformDataCRef().ProfiledThread(); + if (profiled_thread == nullptr) { + return; + } + + // Context used for sampling the register state of the profiled thread. + CONTEXT context; + memset(&context, 0, sizeof(context)); + + //----------------------------------------------------------------// + // Suspend the samplee thread and get its context. + + static const DWORD kSuspendFailed = static_cast(-1); + if (SuspendThread(profiled_thread) == kSuspendFailed) { + return; + } + + // SuspendThread is asynchronous, so the thread may still be running. + // Call GetThreadContext first to ensure the thread is really suspended. + // See https://blogs.msdn.microsoft.com/oldnewthing/20150205-00/?p=44743. + + // Using only CONTEXT_CONTROL is faster but on 64-bit it causes crashes in + // RtlVirtualUnwind (see bug 1120126) so we set all the flags. +#if defined(GP_ARCH_amd64) + context.ContextFlags = CONTEXT_FULL; +#else + context.ContextFlags = CONTEXT_CONTROL; +#endif + if (!GetThreadContext(profiled_thread, &context)) { + ResumeThread(profiled_thread); + return; + } + + //----------------------------------------------------------------// + // Sample the target thread. + + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + // + // The profiler's "critical section" begins here. We must be very careful + // what we do here, or risk deadlock. See the corresponding comment in + // platform-linux-android.cpp for details. + + Registers regs; + PopulateRegsFromContext(regs, &context); + aProcessRegs(regs, aNow); + + //----------------------------------------------------------------// + // Resume the target thread. + + ResumeThread(profiled_thread); + + // The profiler's critical section ends here. 
+ // + // WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +} + +// END Sampler target specifics +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// BEGIN SamplerThread target specifics + +static unsigned int __stdcall ThreadEntry(void* aArg) { + auto thread = static_cast(aArg); + thread->Run(); + return 0; +} + +static unsigned int __stdcall UnregisteredThreadSpyEntry(void* aArg) { + auto thread = static_cast(aArg); + thread->RunUnregisteredThreadSpy(); + return 0; +} + +SamplerThread::SamplerThread(PSLockRef aLock, uint32_t aActivityGeneration, + double aIntervalMilliseconds, uint32_t aFeatures) + : mSampler(aLock), + mActivityGeneration(aActivityGeneration), + mIntervalMicroseconds( + std::max(1, int(floor(aIntervalMilliseconds * 1000 + 0.5)))), + mNoTimerResolutionChange( + ProfilerFeature::HasNoTimerResolutionChange(aFeatures)) { + if ((!mNoTimerResolutionChange) && (mIntervalMicroseconds < 10 * 1000)) { + // By default the timer resolution (which tends to be 1/64Hz, around 16ms) + // is not changed. However, if the requested interval is sufficiently low, + // the resolution will be adjusted to match. Note that this affects all + // timers in Firefox, and could therefore hide issues while profiling. This + // change may be prevented with the "notimerresolutionchange" feature. + ::timeBeginPeriod(mIntervalMicroseconds / 1000); + } + + if (ProfilerFeature::HasUnregisteredThreads(aFeatures)) { + // Sampler&spy threads are not running yet, so it's safe to modify + // mSpyingState without locking the monitor. + mSpyingState = SpyingState::Spy_Initializing; + mUnregisteredThreadSpyThread = reinterpret_cast( + _beginthreadex(nullptr, + /* stack_size */ 0, UnregisteredThreadSpyEntry, this, + /* initflag */ 0, nullptr)); + if (mUnregisteredThreadSpyThread == 0) { + MOZ_CRASH("_beginthreadex failed"); + } + } + + // Create a new thread. It is important to use _beginthreadex() instead of + // the Win32 function CreateThread(), because the CreateThread() does not + // initialize thread-specific structures in the C runtime library. + mThread = reinterpret_cast(_beginthreadex(nullptr, + /* stack_size */ 0, + ThreadEntry, this, + /* initflag */ 0, nullptr)); + if (mThread == 0) { + MOZ_CRASH("_beginthreadex failed"); + } +} + +SamplerThread::~SamplerThread() { + if (mUnregisteredThreadSpyThread) { + { + // Make sure the spying thread is not actively working, because the win32 + // function it's using could deadlock with WaitForSingleObject below. + MonitorAutoLock spyingStateLock{mSpyingStateMonitor}; + while (mSpyingState != SpyingState::Spy_Waiting && + mSpyingState != SpyingState::SamplerToSpy_Start) { + spyingStateLock.Wait(); + } + + mSpyingState = SpyingState::MainToSpy_Shutdown; + spyingStateLock.NotifyAll(); + + do { + spyingStateLock.Wait(); + } while (mSpyingState != SpyingState::SpyToMain_ShuttingDown); + } + + WaitForSingleObject(mUnregisteredThreadSpyThread, INFINITE); + + // Close our own handle for the thread. + if (mUnregisteredThreadSpyThread != kNoThread) { + CloseHandle(mUnregisteredThreadSpyThread); + } + } + + WaitForSingleObject(mThread, INFINITE); + + // Close our own handle for the thread. + if (mThread != kNoThread) { + CloseHandle(mThread); + } + + // Just in the unlikely case some callbacks were added between the end of the + // thread and now. 
+ InvokePostSamplingCallbacks(std::move(mPostSamplingCallbackList), + SamplingState::JustStopped); +} + +void SamplerThread::RunUnregisteredThreadSpy() { + // TODO: Consider registering this thread. + // Pros: Remove from list of unregistered threads; Not useful to profiling + // Firefox itself. + // Cons: Doesn't appear in the profile, so users may miss the expensive CPU + // cost of this work on Windows. + PR_SetCurrentThreadName("UnregisteredThreadSpy"); + + while (true) { + { + MonitorAutoLock spyingStateLock{mSpyingStateMonitor}; + // Either this is the first loop, or we're looping after working. + MOZ_ASSERT(mSpyingState == SpyingState::Spy_Initializing || + mSpyingState == SpyingState::Spy_Working); + + // Let everyone know we're waiting, and then wait. + mSpyingState = SpyingState::Spy_Waiting; + mSpyingStateMonitor.NotifyAll(); + do { + spyingStateLock.Wait(); + } while (mSpyingState == SpyingState::Spy_Waiting); + + if (mSpyingState == SpyingState::MainToSpy_Shutdown) { + mSpyingState = SpyingState::SpyToMain_ShuttingDown; + mSpyingStateMonitor.NotifyAll(); + break; + } + + MOZ_ASSERT(mSpyingState == SpyingState::SamplerToSpy_Start); + mSpyingState = SpyingState::Spy_Working; + } + + // Do the work without lock, so other threads can read the current state. + SpyOnUnregisteredThreads(); + } +} + +void SamplerThread::SleepMicro(uint32_t aMicroseconds) { + // For now, keep the old behaviour of minimum Sleep(1), even for + // smaller-than-usual sleeps after an overshoot, unless the user has + // explicitly opted into a sub-millisecond profiler interval. + if (mIntervalMicroseconds >= 1000) { + ::Sleep(std::max(1u, aMicroseconds / 1000)); + } else { + TimeStamp start = TimeStamp::Now(); + TimeStamp end = start + TimeDuration::FromMicroseconds(aMicroseconds); + + // First, sleep for as many whole milliseconds as possible. + if (aMicroseconds >= 1000) { + ::Sleep(aMicroseconds / 1000); + } + + // Then, spin until enough time has passed. + while (TimeStamp::Now() < end) { + YieldProcessor(); + } + } +} + +void SamplerThread::Stop(PSLockRef aLock) { + if ((!mNoTimerResolutionChange) && (mIntervalMicroseconds < 10 * 1000)) { + // Disable any timer resolution changes we've made. Do it now while + // gPSMutex is locked, i.e. before any other SamplerThread can be created + // and call ::timeBeginPeriod(). + // + // It's safe to do this now even though this SamplerThread is still alive, + // because the next time the main loop of Run() iterates it won't get past + // the mActivityGeneration check, and so it won't make any more ::Sleep() + // calls. + ::timeEndPeriod(mIntervalMicroseconds / 1000); + } + + mSampler.Disable(aLock); +} + +// END SamplerThread target specifics +//////////////////////////////////////////////////////////////////////// + +static void PlatformInit(PSLockRef aLock) {} + +#if defined(HAVE_NATIVE_UNWIND) +# define REGISTERS_SYNC_POPULATE(regs) \ + CONTEXT context; \ + RtlCaptureContext(&context); \ + PopulateRegsFromContext(regs, &context); +#endif + +#if defined(GP_PLAT_amd64_windows) + +// Use InitializeWin64ProfilerHooks from the base profiler. 
+ +namespace mozilla { +namespace baseprofiler { +MFBT_API void InitializeWin64ProfilerHooks(); +} // namespace baseprofiler +} // namespace mozilla + +using mozilla::baseprofiler::InitializeWin64ProfilerHooks; + +#endif // defined(GP_PLAT_amd64_windows) diff --git a/tools/profiler/core/platform.cpp b/tools/profiler/core/platform.cpp new file mode 100644 index 0000000000..8950c48b58 --- /dev/null +++ b/tools/profiler/core/platform.cpp @@ -0,0 +1,7067 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// There are three kinds of samples done by the profiler. +// +// - A "periodic" sample is the most complex kind. It is done in response to a +// timer while the profiler is active. It involves writing a stack trace plus +// a variety of other values (memory measurements, responsiveness +// measurements, markers, etc.) into the main ProfileBuffer. The sampling is +// done from off-thread, and so SuspendAndSampleAndResumeThread() is used to +// get the register values. +// +// - A "synchronous" sample is a simpler kind. It is done in response to an API +// call (profiler_get_backtrace()). It involves writing a stack trace and +// little else into a temporary ProfileBuffer, and wrapping that up in a +// ProfilerBacktrace that can be subsequently used in a marker. The sampling +// is done on-thread, and so REGISTERS_SYNC_POPULATE() is used to get the +// register values. +// +// - A "backtrace" sample is the simplest kind. It is done in response to an +// API call (profiler_suspend_and_sample_thread()). It involves getting a +// stack trace via a ProfilerStackCollector; it does not write to a +// ProfileBuffer. The sampling is done from off-thread, and so uses +// SuspendAndSampleAndResumeThread() to get the register values. 
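// (Editor's illustrative sketch, not part of the upstream patch: roughly how a
// "synchronous" sample as described above is captured on-thread and then
// attached to a marker. profiler_get_backtrace() is named in the comment above;
// the specific marker macro, category, and argument shapes used here are
// assumptions.)
//
//   UniqueProfilerBacktrace bt = profiler_get_backtrace();
//   PROFILER_MARKER_TEXT("MyEvent", OTHER,
//                        MarkerStack::TakeBacktrace(std::move(bt)),
//                        "details");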
+ +#include "platform.h" + +#include "GeckoProfiler.h" +#include "GeckoProfilerReporter.h" +#include "PageInformation.h" +#include "PowerCounters.h" +#include "ProfileBuffer.h" +#include "ProfiledThreadData.h" +#include "ProfilerBacktrace.h" +#include "ProfilerChild.h" +#include "ProfilerCodeAddressService.h" +#include "ProfilerControl.h" +#include "ProfilerIOInterposeObserver.h" +#include "ProfilerParent.h" +#include "ProfilerRustBindings.h" +#include "mozilla/MozPromise.h" +#include "shared-libraries.h" +#include "VTuneProfiler.h" + +#include "js/ProfilingFrameIterator.h" +#include "memory_hooks.h" +#include "mozilla/ArrayUtils.h" +#include "mozilla/AutoProfilerLabel.h" +#include "mozilla/BaseAndGeckoProfilerDetail.h" +#include "mozilla/ExtensionPolicyService.h" +#include "mozilla/extensions/WebExtensionPolicy.h" +#include "mozilla/glean/GleanMetrics.h" +#include "mozilla/Monitor.h" +#include "mozilla/Preferences.h" +#include "mozilla/Printf.h" +#include "mozilla/ProcInfo.h" +#include "mozilla/ProfileBufferChunkManagerSingle.h" +#include "mozilla/ProfileBufferChunkManagerWithLocalLimit.h" +#include "mozilla/ProfileChunkedBuffer.h" +#include "mozilla/SchedulerGroup.h" +#include "mozilla/Services.h" +#include "mozilla/StackWalk.h" +#ifdef XP_WIN +# include "mozilla/StackWalkThread.h" +#endif +#include "mozilla/StaticPtr.h" +#include "mozilla/ThreadLocal.h" +#include "mozilla/TimeStamp.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Vector.h" +#include "BaseProfiler.h" +#include "nsDirectoryServiceDefs.h" +#include "nsDirectoryServiceUtils.h" +#include "nsIDocShell.h" +#include "nsIHttpProtocolHandler.h" +#include "nsIObserverService.h" +#include "nsIPropertyBag2.h" +#include "nsIXULAppInfo.h" +#include "nsIXULRuntime.h" +#include "nsJSPrincipals.h" +#include "nsMemoryReporterManager.h" +#include "nsPIDOMWindow.h" +#include "nsProfilerStartParams.h" +#include "nsScriptSecurityManager.h" +#include "nsSystemInfo.h" +#include "nsThreadUtils.h" +#include "nsXULAppAPI.h" +#include "Tracing.h" +#include "prdtoa.h" +#include "prtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(GP_OS_android) +# include "JavaExceptions.h" +# include "mozilla/java/GeckoJavaSamplerNatives.h" +# include "mozilla/jni/Refs.h" +#endif + +#if defined(GP_OS_darwin) +# include "nsCocoaFeatures.h" +#endif + +#if defined(GP_PLAT_amd64_darwin) +# include +#endif + +#if defined(GP_OS_windows) +# include + +// GetThreadInformation is not available on Windows 7. +WINBASEAPI +BOOL WINAPI GetThreadInformation( + _In_ HANDLE hThread, _In_ THREAD_INFORMATION_CLASS ThreadInformationClass, + _Out_writes_bytes_(ThreadInformationSize) LPVOID ThreadInformation, + _In_ DWORD ThreadInformationSize); + +#endif + +// Win32 builds always have frame pointers, so FramePointerStackWalk() always +// works. +#if defined(GP_PLAT_x86_windows) +# define HAVE_NATIVE_UNWIND +# define USE_FRAME_POINTER_STACK_WALK +#endif + +// Win64 builds always omit frame pointers, so we use the slower +// MozStackWalk(), which works in that case. +#if defined(GP_PLAT_amd64_windows) +# define HAVE_NATIVE_UNWIND +# define USE_MOZ_STACK_WALK +#endif + +// AArch64 Win64 doesn't seem to use frame pointers, so we use the slower +// MozStackWalk(). +#if defined(GP_PLAT_arm64_windows) +# define HAVE_NATIVE_UNWIND +# define USE_MOZ_STACK_WALK +#endif + +// Mac builds use FramePointerStackWalk(). 
Even if we build without +// frame pointers, we'll still get useful stacks in system libraries +// because those always have frame pointers. +// We don't use MozStackWalk() on Mac. +#if defined(GP_OS_darwin) +# define HAVE_NATIVE_UNWIND +# define USE_FRAME_POINTER_STACK_WALK +#endif + +// Android builds use the ARM Exception Handling ABI to unwind. +#if defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android) +# define HAVE_NATIVE_UNWIND +# define USE_EHABI_STACKWALK +# include "EHABIStackWalk.h" +#endif + +// Linux/BSD builds use LUL, which uses DWARF info to unwind stacks. +#if defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_x86_linux) || \ + defined(GP_PLAT_amd64_android) || defined(GP_PLAT_x86_android) || \ + defined(GP_PLAT_mips64_linux) || defined(GP_PLAT_arm64_linux) || \ + defined(GP_PLAT_arm64_android) || defined(GP_PLAT_amd64_freebsd) || \ + defined(GP_PLAT_arm64_freebsd) +# define HAVE_NATIVE_UNWIND +# define USE_LUL_STACKWALK +# include "lul/LulMain.h" +# include "lul/platform-linux-lul.h" + +// On linux we use LUL for periodic samples and synchronous samples, but we use +// FramePointerStackWalk for backtrace samples when MOZ_PROFILING is enabled. +// (See the comment at the top of the file for a definition of +// periodic/synchronous/backtrace.). +// +// FramePointerStackWalk can produce incomplete stacks when the current entry is +// in a shared library without framepointers, however LUL can take a long time +// to initialize, which is undesirable for consumers of +// profiler_suspend_and_sample_thread like the Background Hang Reporter. +# if defined(MOZ_PROFILING) +# define USE_FRAME_POINTER_STACK_WALK +# endif +#endif + +// We can only stackwalk without expensive initialization on platforms which +// support FramePointerStackWalk or MozStackWalk. LUL Stackwalking requires +// initializing LUL, and EHABIStackWalk requires initializing EHABI, both of +// which can be expensive. +#if defined(USE_FRAME_POINTER_STACK_WALK) || defined(USE_MOZ_STACK_WALK) +# define HAVE_FASTINIT_NATIVE_UNWIND +#endif + +#ifdef MOZ_VALGRIND +# include +#else +# define VALGRIND_MAKE_MEM_DEFINED(_addr, _len) ((void)0) +#endif + +#if defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd) +# include +#endif + +using namespace mozilla; +using namespace mozilla::literals::ProportionValue_literals; + +using mozilla::profiler::detail::RacyFeatures; +using ThreadRegistration = mozilla::profiler::ThreadRegistration; +using ThreadRegistrationInfo = mozilla::profiler::ThreadRegistrationInfo; +using ThreadRegistry = mozilla::profiler::ThreadRegistry; + +LazyLogModule gProfilerLog("prof"); + +ProfileChunkedBuffer& profiler_get_core_buffer() { + // Defer to the Base Profiler in mozglue to create the core buffer if needed, + // and keep a reference here, for quick access in xul. 
+ static ProfileChunkedBuffer& sProfileChunkedBuffer = + baseprofiler::profiler_get_core_buffer(); + return sProfileChunkedBuffer; +} + +mozilla::Atomic gSkipSampling; + +#if defined(GP_OS_android) +class GeckoJavaSampler + : public java::GeckoJavaSampler::Natives { + private: + GeckoJavaSampler(); + + public: + static double GetProfilerTime() { + if (!profiler_is_active()) { + return 0.0; + } + return profiler_time(); + }; + + static void JavaStringArrayToCharArray(jni::ObjectArray::Param& aJavaArray, + Vector& aCharArray, + JNIEnv* aJni) { + int arraySize = aJavaArray->Length(); + for (int i = 0; i < arraySize; i++) { + jstring javaString = + (jstring)(aJni->GetObjectArrayElement(aJavaArray.Get(), i)); + const char* filterString = aJni->GetStringUTFChars(javaString, 0); + // FIXME. These strings are leaked. + MOZ_RELEASE_ASSERT(aCharArray.append(filterString)); + } + } + + static void StartProfiler(jni::ObjectArray::Param aFiltersArray, + jni::ObjectArray::Param aFeaturesArray) { + JNIEnv* jni = jni::GetEnvForThread(); + Vector filtersTemp; + Vector featureStringArray; + + JavaStringArrayToCharArray(aFiltersArray, filtersTemp, jni); + JavaStringArrayToCharArray(aFeaturesArray, featureStringArray, jni); + + uint32_t features = 0; + features = ParseFeaturesFromStringArray(featureStringArray.begin(), + featureStringArray.length()); + + // 128 * 1024 * 1024 is the entries preset that is given in + // devtools/client/performance-new/shared/background.jsm.js + profiler_start(PowerOfTwo32(128 * 1024 * 1024), 5.0, features, + filtersTemp.begin(), filtersTemp.length(), 0, Nothing()); + } + + static void StopProfiler(jni::Object::Param aGeckoResult) { + auto result = java::GeckoResult::LocalRef(aGeckoResult); + profiler_pause(); + nsCOMPtr nsProfiler( + do_GetService("@mozilla.org/tools/profiler;1")); + nsProfiler->GetProfileDataAsGzippedArrayBufferAndroid(0)->Then( + GetMainThreadSerialEventTarget(), __func__, + [result](FallibleTArray compressedProfile) { + result->Complete(jni::ByteArray::New( + reinterpret_cast(compressedProfile.Elements()), + compressedProfile.Length())); + + // Done with capturing a profile. Stop the profiler. + profiler_stop(); + }, + [result](nsresult aRv) { + char errorString[9]; + sprintf(errorString, "%08x", aRv); + result->CompleteExceptionally( + mozilla::java::sdk::IllegalStateException::New(errorString) + .Cast()); + + // Failed to capture a profile. Stop the profiler. + profiler_stop(); + }); + } +}; +#endif + +constexpr static bool ValidateFeatures() { + int expectedFeatureNumber = 0; + + // Feature numbers should start at 0 and increase by 1 each. +#define CHECK_FEATURE(n_, str_, Name_, desc_) \ + if ((n_) != expectedFeatureNumber) { \ + return false; \ + } \ + ++expectedFeatureNumber; + + PROFILER_FOR_EACH_FEATURE(CHECK_FEATURE) + +#undef CHECK_FEATURE + + return true; +} + +static_assert(ValidateFeatures(), "Feature list is invalid"); + +// Return all features that are available on this platform. +static uint32_t AvailableFeatures() { + uint32_t features = 0; + +#define ADD_FEATURE(n_, str_, Name_, desc_) \ + ProfilerFeature::Set##Name_(features); + + // Add all the possible features. + PROFILER_FOR_EACH_FEATURE(ADD_FEATURE) + +#undef ADD_FEATURE + + // Now remove features not supported on this platform/configuration. 
+#if !defined(GP_OS_android) + ProfilerFeature::ClearJava(features); +#endif +#if !defined(HAVE_NATIVE_UNWIND) + ProfilerFeature::ClearStackWalk(features); +#endif +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + if (getenv("XPCOM_MEM_BLOAT_LOG")) { + NS_WARNING("XPCOM_MEM_BLOAT_LOG is set, disabling native allocations."); + // The memory hooks are available, but the bloat log is enabled, which is + // not compatible with the native allocations tracking. See the comment in + // enable_native_allocations() (tools/profiler/core/memory_hooks.cpp) for + // more information. + ProfilerFeature::ClearNativeAllocations(features); + } +#else + // The memory hooks are not available. + ProfilerFeature::ClearNativeAllocations(features); +#endif + +#if !defined(GP_OS_windows) + ProfilerFeature::ClearNoTimerResolutionChange(features); +#endif + + return features; +} + +// Default features common to all contexts (even if not available). +static constexpr uint32_t DefaultFeatures() { + return ProfilerFeature::Java | ProfilerFeature::JS | + ProfilerFeature::StackWalk | ProfilerFeature::CPUUtilization | + ProfilerFeature::Screenshots | ProfilerFeature::ProcessCPU; +} + +// Extra default features when MOZ_PROFILER_STARTUP is set (even if not +// available). +static constexpr uint32_t StartupExtraDefaultFeatures() { + // Enable file I/Os by default for startup profiles as startup is heavy on + // I/O operations. + return ProfilerFeature::FileIOAll | ProfilerFeature::IPCMessages; +} + +Json::String ToCompactString(const Json::Value& aJsonValue) { + Json::StreamWriterBuilder builder; + // No indentations, and no newlines. + builder["indentation"] = ""; + // This removes spaces after colons. + builder["enableYAMLCompatibility"] = false; + // Only 6 digits after the decimal point; timestamps in ms have ns precision. + builder["precision"] = 6; + builder["precisionType"] = "decimal"; + + return Json::writeString(builder, aJsonValue); +} + +/* static */ mozilla::baseprofiler::detail::BaseProfilerMutex + ProfilingLog::gMutex; +/* static */ mozilla::UniquePtr ProfilingLog::gLog; + +/* static */ void ProfilingLog::Init() { + mozilla::baseprofiler::detail::BaseProfilerAutoLock lock{gMutex}; + MOZ_ASSERT(!gLog); + gLog = mozilla::MakeUniqueFallible(Json::objectValue); + if (gLog) { + (*gLog)[Json::StaticString{"profilingLogBegin" TIMESTAMP_JSON_SUFFIX}] = + ProfilingLog::Timestamp(); + } +} + +/* static */ void ProfilingLog::Destroy() { + mozilla::baseprofiler::detail::BaseProfilerAutoLock lock{gMutex}; + MOZ_ASSERT(gLog); + gLog = nullptr; +} + +/* static */ bool ProfilingLog::IsLockedOnCurrentThread() { + return gMutex.IsLockedOnCurrentThread(); +} + +// RAII class to lock the profiler mutex. +// It provides a mechanism to determine if it is locked or not in order for +// memory hooks to avoid re-entering the profiler locked state. +// Locking order: Profiler, ThreadRegistry, ThreadRegistration. +class MOZ_RAII PSAutoLock { + public: + PSAutoLock() + : mLock([]() -> mozilla::baseprofiler::detail::BaseProfilerMutex& { + // In DEBUG builds, *before* we attempt to lock gPSMutex, we want to + // check that the ThreadRegistry, ThreadRegistration, and ProfilingLog + // mutexes are *not* locked on this thread, to avoid inversion + // deadlocks. 
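
// --- Illustrative aside (not part of this patch): the assertions that follow
// enforce a lock order ("don't take the profiler mutex while already holding
// one of the inner mutexes on this thread"). A minimal standalone sketch of
// that pattern, using std::mutex and an owner-thread check instead of
// BaseProfilerMutex, might look like this:
#include <atomic>
#include <cassert>
#include <mutex>
#include <thread>

// A mutex that remembers its owning thread, so callers can assert
// "not already held by me" before taking an outer lock.
class CheckedMutex {
 public:
  void Lock() {
    mMutex.lock();
    mOwner.store(std::this_thread::get_id(), std::memory_order_relaxed);
  }
  void Unlock() {
    mOwner.store(std::thread::id{}, std::memory_order_relaxed);
    mMutex.unlock();
  }
  bool LockedOnCurrentThread() const {
    return mOwner.load(std::memory_order_relaxed) == std::this_thread::get_id();
  }

 private:
  std::mutex mMutex;
  std::atomic<std::thread::id> mOwner{std::thread::id{}};
};

CheckedMutex gInnerMutex;  // Must only be taken *after* gOuterMutex.
CheckedMutex gOuterMutex;  // The outermost lock in this sketch.

// RAII guard for gOuterMutex that enforces the ordering by asserting the
// inner lock is not already held on this thread (which would risk deadlock).
class OuterAutoLock {
 public:
  OuterAutoLock() {
    assert(!gInnerMutex.LockedOnCurrentThread());
    gOuterMutex.Lock();
  }
  ~OuterAutoLock() { gOuterMutex.Unlock(); }
};

int main() {
  OuterAutoLock lock;  // OK: inner lock not held, ordering respected.
  return 0;
}
// --- End of aside.
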
+ MOZ_ASSERT(!ThreadRegistry::IsRegistryMutexLockedOnCurrentThread()); + MOZ_ASSERT(!ThreadRegistration::IsDataMutexLockedOnCurrentThread()); + MOZ_ASSERT(!ProfilingLog::IsLockedOnCurrentThread()); + return gPSMutex; + }()) {} + + PSAutoLock(const PSAutoLock&) = delete; + void operator=(const PSAutoLock&) = delete; + + static bool IsLockedOnCurrentThread() { + return gPSMutex.IsLockedOnCurrentThread(); + } + + private: + static mozilla::baseprofiler::detail::BaseProfilerMutex gPSMutex; + mozilla::baseprofiler::detail::BaseProfilerAutoLock mLock; +}; + +/* static */ mozilla::baseprofiler::detail::BaseProfilerMutex + PSAutoLock::gPSMutex{"Gecko Profiler mutex"}; + +// Only functions that take a PSLockRef arg can access CorePS's and ActivePS's +// fields. +typedef const PSAutoLock& PSLockRef; + +#define PS_GET(type_, name_) \ + static type_ name_(PSLockRef) { \ + MOZ_ASSERT(sInstance); \ + return sInstance->m##name_; \ + } + +#define PS_GET_LOCKLESS(type_, name_) \ + static type_ name_() { \ + MOZ_ASSERT(sInstance); \ + return sInstance->m##name_; \ + } + +#define PS_GET_AND_SET(type_, name_) \ + PS_GET(type_, name_) \ + static void Set##name_(PSLockRef, type_ a##name_) { \ + MOZ_ASSERT(sInstance); \ + sInstance->m##name_ = a##name_; \ + } + +static constexpr size_t MAX_JS_FRAMES = + mozilla::profiler::ThreadRegistrationData::MAX_JS_FRAMES; +using JsFrame = mozilla::profiler::ThreadRegistrationData::JsFrame; +using JsFrameBuffer = mozilla::profiler::ThreadRegistrationData::JsFrameBuffer; + +// All functions in this file can run on multiple threads unless they have an +// NS_IsMainThread() assertion. + +// This class contains the profiler's core global state, i.e. that which is +// valid even when the profiler is not active. Most profile operations can't do +// anything useful when this class is not instantiated, so we release-assert +// its non-nullness in all such operations. +// +// Accesses to CorePS are guarded by gPSMutex. Getters and setters take a +// PSAutoLock reference as an argument as proof that the gPSMutex is currently +// locked. This makes it clear when gPSMutex is locked and helps avoid +// accidental unlocked accesses to global state. There are ways to circumvent +// this mechanism, but please don't do so without *very* good reason and a +// detailed explanation. +// +// The exceptions to this rule: +// +// - mProcessStartTime, because it's immutable; +class CorePS { + private: + CorePS() + : mProcessStartTime(TimeStamp::ProcessCreation()) +#ifdef USE_LUL_STACKWALK + , + mLul(nullptr) +#endif + { + MOZ_ASSERT(NS_IsMainThread(), + "CorePS must be created from the main thread"); + } + + ~CorePS() { +#ifdef USE_LUL_STACKWALK + delete sInstance->mLul; +#endif + } + + public: + static void Create(PSLockRef aLock) { + MOZ_ASSERT(!sInstance); + sInstance = new CorePS(); + } + + static void Destroy(PSLockRef aLock) { + MOZ_ASSERT(sInstance); + delete sInstance; + sInstance = nullptr; + } + + // Unlike ActivePS::Exists(), CorePS::Exists() can be called without gPSMutex + // being locked. This is because CorePS is instantiated so early on the main + // thread that we don't have to worry about it being racy. 
+ static bool Exists() { return !!sInstance; } + + static void AddSizeOf(PSLockRef, MallocSizeOf aMallocSizeOf, + size_t& aProfSize, size_t& aLulSize) { + MOZ_ASSERT(sInstance); + + aProfSize += aMallocSizeOf(sInstance); + + aProfSize += ThreadRegistry::SizeOfIncludingThis(aMallocSizeOf); + + for (auto& registeredPage : sInstance->mRegisteredPages) { + aProfSize += registeredPage->SizeOfIncludingThis(aMallocSizeOf); + } + + // Measurement of the following things may be added later if DMD finds it + // is worthwhile: + // - CorePS::mRegisteredPages itself (its elements' children are + // measured above) + +#if defined(USE_LUL_STACKWALK) + if (lul::LUL* lulPtr = sInstance->mLul; lulPtr) { + aLulSize += lulPtr->SizeOfIncludingThis(aMallocSizeOf); + } +#endif + } + + // No PSLockRef is needed for this field because it's immutable. + PS_GET_LOCKLESS(TimeStamp, ProcessStartTime) + + PS_GET(JsFrameBuffer&, JsFrames) + + PS_GET(Vector>&, RegisteredPages) + + static void AppendRegisteredPage(PSLockRef, + RefPtr&& aRegisteredPage) { + MOZ_ASSERT(sInstance); + struct RegisteredPageComparator { + PageInformation* aA; + bool operator()(PageInformation* aB) const { return aA->Equals(aB); } + }; + + auto foundPageIter = std::find_if( + sInstance->mRegisteredPages.begin(), sInstance->mRegisteredPages.end(), + RegisteredPageComparator{aRegisteredPage.get()}); + + if (foundPageIter != sInstance->mRegisteredPages.end()) { + if ((*foundPageIter)->Url().EqualsLiteral("about:blank")) { + // When a BrowsingContext is loaded, the first url loaded in it will be + // about:blank, and if the principal matches, the first document loaded + // in it will share an inner window. That's why we should delete the + // intermittent about:blank if they share the inner window. + sInstance->mRegisteredPages.erase(foundPageIter); + } else { + // Do not register the same page again. + return; + } + } + + MOZ_RELEASE_ASSERT( + sInstance->mRegisteredPages.append(std::move(aRegisteredPage))); + } + + static void RemoveRegisteredPage(PSLockRef, + uint64_t aRegisteredInnerWindowID) { + MOZ_ASSERT(sInstance); + // Remove RegisteredPage from mRegisteredPages by given inner window ID. + sInstance->mRegisteredPages.eraseIf([&](const RefPtr& rd) { + return rd->InnerWindowID() == aRegisteredInnerWindowID; + }); + } + + static void ClearRegisteredPages(PSLockRef) { + MOZ_ASSERT(sInstance); + sInstance->mRegisteredPages.clear(); + } + + PS_GET(const Vector&, Counters) + + static void AppendCounter(PSLockRef, BaseProfilerCount* aCounter) { + MOZ_ASSERT(sInstance); + // we don't own the counter; they may be stored in static objects + MOZ_RELEASE_ASSERT(sInstance->mCounters.append(aCounter)); + } + + static void RemoveCounter(PSLockRef, BaseProfilerCount* aCounter) { + // we may be called to remove a counter after the profiler is stopped or + // late in shutdown. 
+ if (sInstance) { + auto* counter = std::find(sInstance->mCounters.begin(), + sInstance->mCounters.end(), aCounter); + MOZ_RELEASE_ASSERT(counter != sInstance->mCounters.end()); + sInstance->mCounters.erase(counter); + } + } + +#ifdef USE_LUL_STACKWALK + static lul::LUL* Lul() { + MOZ_RELEASE_ASSERT(sInstance); + return sInstance->mLul; + } + static void SetLul(UniquePtr aLul) { + MOZ_RELEASE_ASSERT(sInstance); + MOZ_RELEASE_ASSERT( + sInstance->mLul.compareExchange(nullptr, aLul.release())); + } +#endif + + PS_GET_AND_SET(const nsACString&, ProcessName) + PS_GET_AND_SET(const nsACString&, ETLDplus1) + + private: + // The singleton instance + static CorePS* sInstance; + + // The time that the process started. + const TimeStamp mProcessStartTime; + + // Info on all the registered pages. + // InnerWindowIDs in mRegisteredPages are unique. + Vector> mRegisteredPages; + + // Non-owning pointers to all active counters + Vector mCounters; + +#ifdef USE_LUL_STACKWALK + // LUL's state. Null prior to the first activation, non-null thereafter. + // Owned by this CorePS. + mozilla::Atomic mLul; +#endif + + // Process name, provided by child process initialization code. + nsAutoCString mProcessName; + // Private name, provided by child process initialization code (eTLD+1 in + // fission) + nsAutoCString mETLDplus1; + + // This memory buffer is used by the MergeStacks mechanism. Previously it was + // stack allocated, but this led to a stack overflow, as it was too much + // memory. Here the buffer can be pre-allocated, and shared with the + // MergeStacks feature as needed. MergeStacks is only run while holding the + // lock, so it is safe to have only one instance allocated for all of the + // threads. + JsFrameBuffer mJsFrames; +}; + +CorePS* CorePS::sInstance = nullptr; + +void locked_profiler_add_sampled_counter(PSLockRef aLock, + BaseProfilerCount* aCounter) { + CorePS::AppendCounter(aLock, aCounter); +} + +void locked_profiler_remove_sampled_counter(PSLockRef aLock, + BaseProfilerCount* aCounter) { + // Note: we don't enforce a final sample, though we could do so if the + // profiler was active + CorePS::RemoveCounter(aLock, aCounter); +} + +class SamplerThread; + +static SamplerThread* NewSamplerThread(PSLockRef aLock, uint32_t aGeneration, + double aInterval, uint32_t aFeatures); + +struct LiveProfiledThreadData { + UniquePtr mProfiledThreadData; +}; + +// The buffer size is provided as a number of "entries", this is their size in +// bytes. +constexpr static uint32_t scBytesPerEntry = 8; + +// This class contains the profiler's global state that is valid only when the +// profiler is active. When not instantiated, the profiler is inactive. +// +// Accesses to ActivePS are guarded by gPSMutex, in much the same fashion as +// CorePS. +// +class ActivePS { + private: + // We need to decide how many chunks of what size we want to fit in the given + // total maximum capacity for this process, in the (likely) context of + // multiple processes doing the same choice and having an inter-process + // mechanism to control the overal memory limit. + + // Minimum chunk size allowed, enough for at least one stack. + constexpr static uint32_t scMinimumChunkSize = + 2 * ProfileBufferChunkManager::scExpectedMaximumStackSize; + + // Ideally we want at least 2 unreleased chunks to work with (1 current and 1 + // next), and 2 released chunks (so that one can be recycled when old, leaving + // one with some data). 
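
// --- Illustrative aside (not part of this patch): the capacity is requested
// in 8-byte "entries", spread over at least the minimum number of chunks, and
// each chunk is capped at the maximum chunk size. A standalone sketch of that
// arithmetic follows; the 64 KiB "expected maximum stack size" is an assumed
// stand-in, not a value taken from this patch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr uint32_t kBytesPerEntry = 8;                 // One entry is 8 bytes.
constexpr uint32_t kExpectedMaxStackSize = 64 * 1024;  // Assumed for the sketch.
constexpr uint32_t kMinChunkSize = 2 * kExpectedMaxStackSize;
constexpr uint32_t kMinNumberOfChunks = 4;  // 2 unreleased + 2 released.
constexpr uint32_t kMaxChunkSize = 1024 * 1024;

constexpr uint32_t kMinBufferEntries =
    kMinNumberOfChunks * kMinChunkSize / kBytesPerEntry;
constexpr uint32_t kMaxBufferEntries =
    (2u * 1024u * 1024u * 1024u) / kBytesPerEntry;  // 2 GiB cap.

constexpr uint32_t ClampEntries(uint32_t aEntries) {
  return std::clamp(aEntries, kMinBufferEntries, kMaxBufferEntries);
}

constexpr uint32_t ChunkSizeForEntries(uint32_t aEntries) {
  // Spread the requested capacity over the minimum number of chunks, but never
  // exceed the per-chunk maximum.
  return uint32_t(std::min<uint64_t>(
      uint64_t(ClampEntries(aEntries)) * kBytesPerEntry / kMinNumberOfChunks,
      kMaxChunkSize));
}

int main() {
  // A 1M-entry request: 1M * 8 / 4 = 2 MiB per chunk, capped at 1 MiB.
  std::printf("chunk size for 1M entries: %u bytes\n",
              ChunkSizeForEntries(1024 * 1024));
  return 0;
}
// --- End of aside.
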
+ constexpr static uint32_t scMinimumNumberOfChunks = 4; + + // And we want to limit chunks to a maximum size, which is a compromise + // between: + // - A big size, which helps with reducing the rate of allocations and IPCs. + // - A small size, which helps with equalizing the duration of recorded data + // (as the inter-process controller will discard the oldest chunks in all + // Firefox processes). + constexpr static uint32_t scMaximumChunkSize = 1024 * 1024; + + public: + // We should be able to store at least the minimum number of the smallest- + // possible chunks. + constexpr static uint32_t scMinimumBufferSize = + scMinimumNumberOfChunks * scMinimumChunkSize; + // Note: Keep in sync with GeckoThread.maybeStartGeckoProfiler: + // https://searchfox.org/mozilla-central/source/mobile/android/geckoview/src/main/java/org/mozilla/gecko/GeckoThread.java + constexpr static uint32_t scMinimumBufferEntries = + scMinimumBufferSize / scBytesPerEntry; + + // Limit to 2GiB. + constexpr static uint32_t scMaximumBufferSize = 2u * 1024u * 1024u * 1024u; + constexpr static uint32_t scMaximumBufferEntries = + scMaximumBufferSize / scBytesPerEntry; + + constexpr static uint32_t ClampToAllowedEntries(uint32_t aEntries) { + if (aEntries <= scMinimumBufferEntries) { + return scMinimumBufferEntries; + } + if (aEntries >= scMaximumBufferEntries) { + return scMaximumBufferEntries; + } + return aEntries; + } + + private: + constexpr static uint32_t ChunkSizeForEntries(uint32_t aEntries) { + return uint32_t(std::min(size_t(ClampToAllowedEntries(aEntries)) * + scBytesPerEntry / scMinimumNumberOfChunks, + size_t(scMaximumChunkSize))); + } + + static uint32_t AdjustFeatures(uint32_t aFeatures, uint32_t aFilterCount) { + // Filter out any features unavailable in this platform/configuration. + aFeatures &= AvailableFeatures(); + + // Some features imply others. + if (aFeatures & ProfilerFeature::FileIOAll) { + aFeatures |= ProfilerFeature::MainThreadIO | ProfilerFeature::FileIO; + } else if (aFeatures & ProfilerFeature::FileIO) { + aFeatures |= ProfilerFeature::MainThreadIO; + } + + if (aFeatures & ProfilerFeature::CPUAllThreads) { + aFeatures |= ProfilerFeature::CPUUtilization; + } + + return aFeatures; + } + + bool ShouldInterposeIOs() { + return ProfilerFeature::HasMainThreadIO(mFeatures) || + ProfilerFeature::HasFileIO(mFeatures) || + ProfilerFeature::HasFileIOAll(mFeatures); + } + + ActivePS( + PSLockRef aLock, const TimeStamp& aProfilingStartTime, + PowerOfTwo32 aCapacity, double aInterval, uint32_t aFeatures, + const char** aFilters, uint32_t aFilterCount, uint64_t aActiveTabID, + const Maybe& aDuration, + UniquePtr aChunkManagerOrNull) + : mProfilingStartTime(aProfilingStartTime), + mGeneration(sNextGeneration++), + mCapacity(aCapacity), + mDuration(aDuration), + mInterval(aInterval), + mFeatures(AdjustFeatures(aFeatures, aFilterCount)), + mActiveTabID(aActiveTabID), + mProfileBufferChunkManager( + aChunkManagerOrNull + ? std::move(aChunkManagerOrNull) + : MakeUnique( + size_t(ClampToAllowedEntries(aCapacity.Value())) * + scBytesPerEntry, + ChunkSizeForEntries(aCapacity.Value()))), + mProfileBuffer([this]() -> ProfileChunkedBuffer& { + ProfileChunkedBuffer& coreBuffer = profiler_get_core_buffer(); + coreBuffer.SetChunkManagerIfDifferent(*mProfileBufferChunkManager); + return coreBuffer; + }()), + mMaybeProcessCPUCounter(ProfilerFeature::HasProcessCPU(aFeatures) + ? 
new ProcessCPUCounter(aLock) + : nullptr), + mMaybePowerCounters(nullptr), + // The new sampler thread doesn't start sampling immediately because the + // main loop within Run() is blocked until this function's caller + // unlocks gPSMutex. + mSamplerThread( + NewSamplerThread(aLock, mGeneration, aInterval, aFeatures)), + mIsPaused(false), + mIsSamplingPaused(false) { + ProfilingLog::Init(); + + // Deep copy and lower-case aFilters. + MOZ_ALWAYS_TRUE(mFilters.resize(aFilterCount)); + MOZ_ALWAYS_TRUE(mFiltersLowered.resize(aFilterCount)); + for (uint32_t i = 0; i < aFilterCount; ++i) { + mFilters[i] = aFilters[i]; + mFiltersLowered[i].reserve(mFilters[i].size()); + std::transform(mFilters[i].cbegin(), mFilters[i].cend(), + std::back_inserter(mFiltersLowered[i]), ::tolower); + } + +#if !defined(RELEASE_OR_BETA) + if (ShouldInterposeIOs()) { + // We need to register the observer on the main thread, because we want + // to observe IO that happens on the main thread. + // IOInterposer needs to be initialized before calling + // IOInterposer::Register or our observer will be silently dropped. + if (NS_IsMainThread()) { + IOInterposer::Init(); + IOInterposer::Register(IOInterposeObserver::OpAll, + &ProfilerIOInterposeObserver::GetInstance()); + } else { + NS_DispatchToMainThread( + NS_NewRunnableFunction("ActivePS::ActivePS", []() { + // Note: This could theoretically happen after ActivePS gets + // destroyed, but it's ok: + // - The Observer always checks that the profiler is (still) + // active before doing its work. + // - The destruction should happen on the same thread as this + // construction, so the un-registration will also be dispatched + // and queued on the main thread, and run after this. + IOInterposer::Init(); + IOInterposer::Register( + IOInterposeObserver::OpAll, + &ProfilerIOInterposeObserver::GetInstance()); + })); + } + } +#endif + + if (ProfilerFeature::HasPower(aFeatures)) { + mMaybePowerCounters = new PowerCounters(); + for (const auto& powerCounter : mMaybePowerCounters->GetCounters()) { + locked_profiler_add_sampled_counter(aLock, powerCounter); + } + } + } + + ~ActivePS() { + MOZ_ASSERT( + !mMaybeProcessCPUCounter, + "mMaybeProcessCPUCounter should have been deleted before ~ActivePS()"); + MOZ_ASSERT( + !mMaybePowerCounters, + "mMaybePowerCounters should have been deleted before ~ActivePS()"); + +#if !defined(RELEASE_OR_BETA) + if (ShouldInterposeIOs()) { + // We need to unregister the observer on the main thread, because that's + // where we've registered it. + if (NS_IsMainThread()) { + IOInterposer::Unregister(IOInterposeObserver::OpAll, + &ProfilerIOInterposeObserver::GetInstance()); + } else { + NS_DispatchToMainThread( + NS_NewRunnableFunction("ActivePS::~ActivePS", []() { + IOInterposer::Unregister( + IOInterposeObserver::OpAll, + &ProfilerIOInterposeObserver::GetInstance()); + })); + } + } +#endif + if (mProfileBufferChunkManager) { + // We still control the chunk manager, remove it from the core buffer. 
+ profiler_get_core_buffer().ResetChunkManager(); + } + + ProfilingLog::Destroy(); + } + + bool ThreadSelected(const char* aThreadName) { + if (mFiltersLowered.empty()) { + return true; + } + + std::string name = aThreadName; + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + + for (const auto& filter : mFiltersLowered) { + if (filter == "*") { + return true; + } + + // Crude, non UTF-8 compatible, case insensitive substring search + if (name.find(filter) != std::string::npos) { + return true; + } + + // If the filter is "pid:", profile all threads. + if (mozilla::profiler::detail::FilterHasPid(filter.c_str())) { + return true; + } + } + + return false; + } + + public: + static void Create( + PSLockRef aLock, const TimeStamp& aProfilingStartTime, + PowerOfTwo32 aCapacity, double aInterval, uint32_t aFeatures, + const char** aFilters, uint32_t aFilterCount, uint64_t aActiveTabID, + const Maybe& aDuration, + UniquePtr aChunkManagerOrNull) { + MOZ_ASSERT(!sInstance); + sInstance = new ActivePS(aLock, aProfilingStartTime, aCapacity, aInterval, + aFeatures, aFilters, aFilterCount, aActiveTabID, + aDuration, std::move(aChunkManagerOrNull)); + } + + [[nodiscard]] static SamplerThread* Destroy(PSLockRef aLock) { + MOZ_ASSERT(sInstance); + if (sInstance->mMaybeProcessCPUCounter) { + locked_profiler_remove_sampled_counter( + aLock, sInstance->mMaybeProcessCPUCounter); + delete sInstance->mMaybeProcessCPUCounter; + sInstance->mMaybeProcessCPUCounter = nullptr; + } + + if (sInstance->mMaybePowerCounters) { + for (const auto& powerCounter : + sInstance->mMaybePowerCounters->GetCounters()) { + locked_profiler_remove_sampled_counter(aLock, powerCounter); + } + delete sInstance->mMaybePowerCounters; + sInstance->mMaybePowerCounters = nullptr; + } + + auto samplerThread = sInstance->mSamplerThread; + delete sInstance; + sInstance = nullptr; + + return samplerThread; + } + + static bool Exists(PSLockRef) { return !!sInstance; } + + static bool Equals(PSLockRef, PowerOfTwo32 aCapacity, + const Maybe& aDuration, double aInterval, + uint32_t aFeatures, const char** aFilters, + uint32_t aFilterCount, uint64_t aActiveTabID) { + MOZ_ASSERT(sInstance); + if (sInstance->mCapacity != aCapacity || + sInstance->mDuration != aDuration || + sInstance->mInterval != aInterval || + sInstance->mFeatures != aFeatures || + sInstance->mFilters.length() != aFilterCount || + sInstance->mActiveTabID != aActiveTabID) { + return false; + } + + for (uint32_t i = 0; i < sInstance->mFilters.length(); ++i) { + if (strcmp(sInstance->mFilters[i].c_str(), aFilters[i]) != 0) { + return false; + } + } + return true; + } + + static size_t SizeOf(PSLockRef, MallocSizeOf aMallocSizeOf) { + MOZ_ASSERT(sInstance); + + size_t n = aMallocSizeOf(sInstance); + + n += sInstance->mProfileBuffer.SizeOfExcludingThis(aMallocSizeOf); + + // Measurement of the following members may be added later if DMD finds it + // is worthwhile: + // - mLiveProfiledThreads (both the array itself, and the contents) + // - mDeadProfiledThreads (both the array itself, and the contents) + // + + return n; + } + + static ThreadProfilingFeatures ProfilingFeaturesForThread( + PSLockRef aLock, const ThreadRegistrationInfo& aInfo) { + MOZ_ASSERT(sInstance); + if (sInstance->ThreadSelected(aInfo.Name())) { + // This thread was selected by the user, record everything. 
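
// --- Illustrative aside (not part of this patch): ThreadSelected(), used just
// above, reduces to a case-insensitive substring match of the thread name
// against the lowered filter list, with "*" matching everything. A standalone
// sketch of that matching (omitting the "pid:" special case) follows:
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>
#include <vector>

static std::string Lowered(std::string aText) {
  std::transform(aText.begin(), aText.end(), aText.begin(),
                 [](unsigned char c) { return char(std::tolower(c)); });
  return aText;
}

static bool ThreadSelectedSketch(const std::string& aThreadName,
                                 const std::vector<std::string>& aLoweredFilters) {
  if (aLoweredFilters.empty()) {
    return true;  // No filters means every thread is selected.
  }
  const std::string name = Lowered(aThreadName);
  for (const std::string& filter : aLoweredFilters) {
    if (filter == "*" || name.find(filter) != std::string::npos) {
      return true;
    }
  }
  return false;
}

int main() {
  const std::vector<std::string> filters{"geckomain", "compositor"};
  std::printf("%d\n", ThreadSelectedSketch("GeckoMain", filters));      // 1
  std::printf("%d\n", ThreadSelectedSketch("StyleThread #3", filters)); // 0
  return 0;
}
// --- End of aside.
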
+ return ThreadProfilingFeatures::Any; + } + ThreadProfilingFeatures features = ThreadProfilingFeatures::NotProfiled; + if (ActivePS::FeatureCPUAllThreads(aLock)) { + features = Combine(features, ThreadProfilingFeatures::CPUUtilization); + } + if (ActivePS::FeatureSamplingAllThreads(aLock)) { + features = Combine(features, ThreadProfilingFeatures::Sampling); + } + if (ActivePS::FeatureMarkersAllThreads(aLock)) { + features = Combine(features, ThreadProfilingFeatures::Markers); + } + return features; + } + + [[nodiscard]] static bool AppendPostSamplingCallback( + PSLockRef, PostSamplingCallback&& aCallback); + + // Writes out the current active configuration of the profile. + static void WriteActiveConfiguration( + PSLockRef aLock, JSONWriter& aWriter, + const Span& aPropertyName = MakeStringSpan("")) { + if (!sInstance) { + if (!aPropertyName.empty()) { + aWriter.NullProperty(aPropertyName); + } else { + aWriter.NullElement(); + } + return; + }; + + if (!aPropertyName.empty()) { + aWriter.StartObjectProperty(aPropertyName); + } else { + aWriter.StartObjectElement(); + } + + { + aWriter.StartArrayProperty("features"); +#define WRITE_ACTIVE_FEATURES(n_, str_, Name_, desc_) \ + if (profiler_feature_active(ProfilerFeature::Name_)) { \ + aWriter.StringElement(str_); \ + } + + PROFILER_FOR_EACH_FEATURE(WRITE_ACTIVE_FEATURES) +#undef WRITE_ACTIVE_FEATURES + aWriter.EndArray(); + } + { + aWriter.StartArrayProperty("threads"); + for (const auto& filter : sInstance->mFilters) { + aWriter.StringElement(filter); + } + aWriter.EndArray(); + } + { + // Now write all the simple values. + + // The interval is also available on profile.meta.interval + aWriter.DoubleProperty("interval", sInstance->mInterval); + aWriter.IntProperty("capacity", sInstance->mCapacity.Value()); + if (sInstance->mDuration) { + aWriter.DoubleProperty("duration", sInstance->mDuration.value()); + } + // Here, we are converting uint64_t to double. Tab IDs are + // being created using `nsContentUtils::GenerateProcessSpecificId`, which + // is specifically designed to only use 53 of the 64 bits to be lossless + // when passed into and out of JS as a double. + aWriter.DoubleProperty("activeTabID", sInstance->mActiveTabID); + } + aWriter.EndObject(); + } + + PS_GET_LOCKLESS(TimeStamp, ProfilingStartTime) + + PS_GET(uint32_t, Generation) + + PS_GET(PowerOfTwo32, Capacity) + + PS_GET(Maybe, Duration) + + PS_GET(double, Interval) + + PS_GET(uint32_t, Features) + + PS_GET(uint64_t, ActiveTabID) + +#define PS_GET_FEATURE(n_, str_, Name_, desc_) \ + static bool Feature##Name_(PSLockRef) { \ + MOZ_ASSERT(sInstance); \ + return ProfilerFeature::Has##Name_(sInstance->mFeatures); \ + } + + PROFILER_FOR_EACH_FEATURE(PS_GET_FEATURE) + +#undef PS_GET_FEATURE + + static uint32_t JSFlags(PSLockRef aLock) { + uint32_t Flags = 0; + Flags |= + FeatureJS(aLock) ? uint32_t(JSInstrumentationFlags::StackSampling) : 0; + + Flags |= FeatureJSAllocations(aLock) + ? uint32_t(JSInstrumentationFlags::Allocations) + : 0; + return Flags; + } + + PS_GET(const Vector&, Filters) + PS_GET(const Vector&, FiltersLowered) + + // Not using PS_GET, because only the "Controlled" interface of + // `mProfileBufferChunkManager` should be exposed here. 
+ static ProfileBufferChunkManagerWithLocalLimit& ControlledChunkManager( + PSLockRef) { + MOZ_ASSERT(sInstance); + MOZ_ASSERT(sInstance->mProfileBufferChunkManager); + return *sInstance->mProfileBufferChunkManager; + } + + static void FulfillChunkRequests(PSLockRef) { + MOZ_ASSERT(sInstance); + if (sInstance->mProfileBufferChunkManager) { + sInstance->mProfileBufferChunkManager->FulfillChunkRequests(); + } + } + + static ProfileBuffer& Buffer(PSLockRef) { + MOZ_ASSERT(sInstance); + return sInstance->mProfileBuffer; + } + + static const Vector& LiveProfiledThreads(PSLockRef) { + MOZ_ASSERT(sInstance); + return sInstance->mLiveProfiledThreads; + } + + struct ProfiledThreadListElement { + TimeStamp mRegisterTime; + JSContext* mJSContext; // Null for unregistered threads. + ProfiledThreadData* mProfiledThreadData; + }; + using ProfiledThreadList = Vector; + + // Returns a ProfiledThreadList with all threads that should be included in a + // profile, both for threads that are still registered, and for threads that + // have been unregistered but still have data in the buffer. + // The returned array is sorted by thread register time. + // Do not hold on to the return value past LockedRegistry. + static ProfiledThreadList ProfiledThreads( + ThreadRegistry::LockedRegistry& aLockedRegistry, PSLockRef aLock) { + MOZ_ASSERT(sInstance); + ProfiledThreadList array; + MOZ_RELEASE_ASSERT( + array.initCapacity(sInstance->mLiveProfiledThreads.length() + + sInstance->mDeadProfiledThreads.length())); + + for (ThreadRegistry::OffThreadRef offThreadRef : aLockedRegistry) { + ProfiledThreadData* profiledThreadData = + offThreadRef.UnlockedRWForLockedProfilerRef().GetProfiledThreadData( + aLock); + if (!profiledThreadData) { + // This thread was not profiled, continue with the next one. + continue; + } + ThreadRegistry::OffThreadRef::RWFromAnyThreadWithLock lockedThreadData = + offThreadRef.GetLockedRWFromAnyThread(); + MOZ_RELEASE_ASSERT(array.append(ProfiledThreadListElement{ + profiledThreadData->Info().RegisterTime(), + lockedThreadData->GetJSContext(), profiledThreadData})); + } + + for (auto& t : sInstance->mDeadProfiledThreads) { + MOZ_RELEASE_ASSERT(array.append(ProfiledThreadListElement{ + t->Info().RegisterTime(), (JSContext*)nullptr, t.get()})); + } + + std::sort(array.begin(), array.end(), + [](const ProfiledThreadListElement& a, + const ProfiledThreadListElement& b) { + return a.mRegisterTime < b.mRegisterTime; + }); + return array; + } + + static Vector> ProfiledPages(PSLockRef aLock) { + MOZ_ASSERT(sInstance); + Vector> array; + for (auto& d : CorePS::RegisteredPages(aLock)) { + MOZ_RELEASE_ASSERT(array.append(d)); + } + for (auto& d : sInstance->mDeadProfiledPages) { + MOZ_RELEASE_ASSERT(array.append(d)); + } + // We don't need to sort the pages like threads since we won't show them + // as a list. + return array; + } + + static ProfiledThreadData* AddLiveProfiledThread( + PSLockRef, UniquePtr&& aProfiledThreadData) { + MOZ_ASSERT(sInstance); + MOZ_RELEASE_ASSERT(sInstance->mLiveProfiledThreads.append( + LiveProfiledThreadData{std::move(aProfiledThreadData)})); + + // Return a weak pointer to the ProfiledThreadData object. 
+ return sInstance->mLiveProfiledThreads.back().mProfiledThreadData.get(); + } + + static void UnregisterThread(PSLockRef aLockRef, + ProfiledThreadData* aProfiledThreadData) { + MOZ_ASSERT(sInstance); + + DiscardExpiredDeadProfiledThreads(aLockRef); + + // Find the right entry in the mLiveProfiledThreads array and remove the + // element, moving the ProfiledThreadData object for the thread into the + // mDeadProfiledThreads array. + for (size_t i = 0; i < sInstance->mLiveProfiledThreads.length(); i++) { + LiveProfiledThreadData& thread = sInstance->mLiveProfiledThreads[i]; + if (thread.mProfiledThreadData == aProfiledThreadData) { + thread.mProfiledThreadData->NotifyUnregistered( + sInstance->mProfileBuffer.BufferRangeEnd()); + MOZ_RELEASE_ASSERT(sInstance->mDeadProfiledThreads.append( + std::move(thread.mProfiledThreadData))); + sInstance->mLiveProfiledThreads.erase( + &sInstance->mLiveProfiledThreads[i]); + return; + } + } + } + + // This is a counter to collect process CPU utilization during profiling. + // It cannot be a raw `ProfilerCounter` because we need to manually add/remove + // it while the profiler lock is already held. + class ProcessCPUCounter final : public BaseProfilerCount { + public: + explicit ProcessCPUCounter(PSLockRef aLock) + : BaseProfilerCount("processCPU", &mCounter, nullptr, "CPU", + "Process CPU utilization") { + // Adding on construction, so it's ready before the sampler starts. + locked_profiler_add_sampled_counter(aLock, this); + // Note: Removed from ActivePS::Destroy, because a lock is needed. + } + + void Add(int64_t aNumber) { mCounter += aNumber; } + + private: + ProfilerAtomicSigned mCounter; + }; + PS_GET(ProcessCPUCounter*, MaybeProcessCPUCounter); + + PS_GET(PowerCounters*, MaybePowerCounters); + + PS_GET_AND_SET(bool, IsPaused) + + // True if sampling is paused (though generic `SetIsPaused()` or specific + // `SetIsSamplingPaused()`). + static bool IsSamplingPaused(PSLockRef lock) { + MOZ_ASSERT(sInstance); + return IsPaused(lock) || sInstance->mIsSamplingPaused; + } + + static void SetIsSamplingPaused(PSLockRef, bool aIsSamplingPaused) { + MOZ_ASSERT(sInstance); + sInstance->mIsSamplingPaused = aIsSamplingPaused; + } + + static void DiscardExpiredDeadProfiledThreads(PSLockRef) { + MOZ_ASSERT(sInstance); + uint64_t bufferRangeStart = sInstance->mProfileBuffer.BufferRangeStart(); + // Discard any dead threads that were unregistered before bufferRangeStart. + sInstance->mDeadProfiledThreads.eraseIf( + [bufferRangeStart]( + const UniquePtr& aProfiledThreadData) { + Maybe bufferPosition = + aProfiledThreadData->BufferPositionWhenUnregistered(); + MOZ_RELEASE_ASSERT(bufferPosition, + "should have unregistered this thread"); + return *bufferPosition < bufferRangeStart; + }); + } + + static void UnregisterPage(PSLockRef aLock, + uint64_t aRegisteredInnerWindowID) { + MOZ_ASSERT(sInstance); + auto& registeredPages = CorePS::RegisteredPages(aLock); + for (size_t i = 0; i < registeredPages.length(); i++) { + RefPtr& page = registeredPages[i]; + if (page->InnerWindowID() == aRegisteredInnerWindowID) { + page->NotifyUnregistered(sInstance->mProfileBuffer.BufferRangeEnd()); + MOZ_RELEASE_ASSERT( + sInstance->mDeadProfiledPages.append(std::move(page))); + registeredPages.erase(®isteredPages[i--]); + } + } + } + + static void DiscardExpiredPages(PSLockRef) { + MOZ_ASSERT(sInstance); + uint64_t bufferRangeStart = sInstance->mProfileBuffer.BufferRangeStart(); + // Discard any dead pages that were unregistered before + // bufferRangeStart. 
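
// --- Illustrative aside (not part of this patch): both the dead-thread and
// dead-page cleanups above follow the same rule: a record kept only for its
// buffer contents can be dropped once the buffer's range start has moved past
// the position recorded when the thread or page was unregistered. A standalone
// sketch of that retention rule:
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct DeadRecord {
  uint64_t mBufferPositionWhenUnregistered;
};

static void DiscardExpired(std::vector<DeadRecord>& aDead,
                           uint64_t aBufferRangeStart) {
  aDead.erase(std::remove_if(aDead.begin(), aDead.end(),
                             [aBufferRangeStart](const DeadRecord& aRecord) {
                               return aRecord.mBufferPositionWhenUnregistered <
                                      aBufferRangeStart;
                             }),
              aDead.end());
}

int main() {
  std::vector<DeadRecord> dead{{100}, {2500}, {90000}};
  DiscardExpired(dead, /* aBufferRangeStart */ 3000);
  std::printf("%zu record(s) kept\n", dead.size());  // Only {90000} survives.
  return 0;
}
// --- End of aside.
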
+ sInstance->mDeadProfiledPages.eraseIf( + [bufferRangeStart](const RefPtr& aProfiledPage) { + Maybe bufferPosition = + aProfiledPage->BufferPositionWhenUnregistered(); + MOZ_RELEASE_ASSERT(bufferPosition, + "should have unregistered this page"); + return *bufferPosition < bufferRangeStart; + }); + } + + static void ClearUnregisteredPages(PSLockRef) { + MOZ_ASSERT(sInstance); + sInstance->mDeadProfiledPages.clear(); + } + + static void ClearExpiredExitProfiles(PSLockRef) { + MOZ_ASSERT(sInstance); + uint64_t bufferRangeStart = sInstance->mProfileBuffer.BufferRangeStart(); + // Discard exit profiles that were gathered before our buffer RangeStart. + // If we have started to overwrite our data from when the Base profile was + // added, we should get rid of that Base profile because it's now older than + // our oldest Gecko profile data. + // + // When adding: (In practice the starting buffer should be empty) + // v Start == End + // | <-- Buffer range, initially empty. + // ^ mGeckoIndexWhenBaseProfileAdded < Start FALSE -> keep it + // + // Later, still in range: + // v Start v End + // |=========| <-- Buffer range growing. + // ^ mGeckoIndexWhenBaseProfileAdded < Start FALSE -> keep it + // + // Even later, now out of range: + // v Start v End + // |============| <-- Buffer range full and sliding. + // ^ mGeckoIndexWhenBaseProfileAdded < Start TRUE! -> Discard it + if (sInstance->mBaseProfileThreads && + sInstance->mGeckoIndexWhenBaseProfileAdded + .ConvertToProfileBufferIndex() < + profiler_get_core_buffer().GetState().mRangeStart) { + DEBUG_LOG("ClearExpiredExitProfiles() - Discarding base profile %p", + sInstance->mBaseProfileThreads.get()); + sInstance->mBaseProfileThreads.reset(); + } + sInstance->mExitProfiles.eraseIf( + [bufferRangeStart](const ExitProfile& aExitProfile) { + return aExitProfile.mBufferPositionAtGatherTime < bufferRangeStart; + }); + } + + static void AddBaseProfileThreads(PSLockRef aLock, + UniquePtr aBaseProfileThreads) { + MOZ_ASSERT(sInstance); + DEBUG_LOG("AddBaseProfileThreads(%p)", aBaseProfileThreads.get()); + sInstance->mBaseProfileThreads = std::move(aBaseProfileThreads); + sInstance->mGeckoIndexWhenBaseProfileAdded = + ProfileBufferBlockIndex::CreateFromProfileBufferIndex( + profiler_get_core_buffer().GetState().mRangeEnd); + } + + static UniquePtr MoveBaseProfileThreads(PSLockRef aLock) { + MOZ_ASSERT(sInstance); + + ClearExpiredExitProfiles(aLock); + + DEBUG_LOG("MoveBaseProfileThreads() - Consuming base profile %p", + sInstance->mBaseProfileThreads.get()); + return std::move(sInstance->mBaseProfileThreads); + } + + static void AddExitProfile(PSLockRef aLock, const nsACString& aExitProfile) { + MOZ_ASSERT(sInstance); + + ClearExpiredExitProfiles(aLock); + + MOZ_RELEASE_ASSERT(sInstance->mExitProfiles.append(ExitProfile{ + nsCString(aExitProfile), sInstance->mProfileBuffer.BufferRangeEnd()})); + } + + static Vector MoveExitProfiles(PSLockRef aLock) { + MOZ_ASSERT(sInstance); + + ClearExpiredExitProfiles(aLock); + + Vector profiles; + MOZ_RELEASE_ASSERT( + profiles.initCapacity(sInstance->mExitProfiles.length())); + for (auto& profile : sInstance->mExitProfiles) { + MOZ_RELEASE_ASSERT(profiles.append(std::move(profile.mJSON))); + } + sInstance->mExitProfiles.clear(); + return profiles; + } + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + static void SetMemoryCounter(const BaseProfilerCount* aMemoryCounter) { + MOZ_ASSERT(sInstance); + + sInstance->mMemoryCounter = aMemoryCounter; + } + + static bool IsMemoryCounter(const 
BaseProfilerCount* aMemoryCounter) { + MOZ_ASSERT(sInstance); + + return sInstance->mMemoryCounter == aMemoryCounter; + } +#endif + + private: + // The singleton instance. + static ActivePS* sInstance; + + const TimeStamp mProfilingStartTime; + + // We need to track activity generations. If we didn't we could have the + // following scenario. + // + // - profiler_stop() locks gPSMutex, de-instantiates ActivePS, unlocks + // gPSMutex, deletes the SamplerThread (which does a join). + // + // - profiler_start() runs on a different thread, locks gPSMutex, + // re-instantiates ActivePS, unlocks gPSMutex -- all before the join + // completes. + // + // - SamplerThread::Run() locks gPSMutex, sees that ActivePS is instantiated, + // and continues as if the start/stop pair didn't occur. Also + // profiler_stop() is stuck, unable to finish. + // + // By checking ActivePS *and* the generation, we can avoid this scenario. + // sNextGeneration is used to track the next generation number; it is static + // because it must persist across different ActivePS instantiations. + const uint32_t mGeneration; + static uint32_t sNextGeneration; + + // The maximum number of entries in mProfileBuffer. + const PowerOfTwo32 mCapacity; + + // The maximum duration of entries in mProfileBuffer, in seconds. + const Maybe mDuration; + + // The interval between samples, measured in milliseconds. + const double mInterval; + + // The profile features that are enabled. + const uint32_t mFeatures; + + // Substrings of names of threads we want to profile. + Vector mFilters; + Vector mFiltersLowered; + + // ID of the active browser screen's active tab. + // It's being used to determine the profiled tab. It's "0" if we failed to + // get the ID. + const uint64_t mActiveTabID; + + // The chunk manager used by `mProfileBuffer` below. + // May become null if it gets transferred ouf of the Gecko Profiler. + UniquePtr mProfileBufferChunkManager; + + // The buffer into which all samples are recorded. + ProfileBuffer mProfileBuffer; + + // ProfiledThreadData objects for any threads that were profiled at any point + // during this run of the profiler: + // - mLiveProfiledThreads contains all threads that are still registered, and + // - mDeadProfiledThreads contains all threads that have already been + // unregistered but for which there is still data in the profile buffer. + Vector mLiveProfiledThreads; + Vector> mDeadProfiledThreads; + + // Info on all the dead pages. + // Registered pages are being moved to this array after unregistration. + // We are keeping them in case we need them in the profile data. + // We are removing them when we ensure that we won't need them anymore. + Vector> mDeadProfiledPages; + + // Used to collect process CPU utilization values, if the feature is on. + ProcessCPUCounter* mMaybeProcessCPUCounter; + + // Used to collect power use data, if the power feature is on. + PowerCounters* mMaybePowerCounters; + + // The current sampler thread. This class is not responsible for destroying + // the SamplerThread object; the Destroy() method returns it so the caller + // can destroy it. + SamplerThread* const mSamplerThread; + + // Is the profiler fully paused? + bool mIsPaused; + + // Is the profiler periodic sampling paused? + bool mIsSamplingPaused; + + // Optional startup profile thread array from BaseProfiler. 
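
// --- Illustrative aside (not part of this patch): the mGeneration /
// sNextGeneration members described above exist so a sampler created for one
// profiling session cannot mistake a later session for its own. A standalone
// sketch of that check (single-threaded here; the real code does all of this
// under gPSMutex):
#include <cstdint>
#include <cstdio>

struct ActiveSession {
  uint32_t mGeneration;
};

static ActiveSession* gActive = nullptr;
static uint32_t gNextGeneration = 0;  // Persists across sessions.

static ActiveSession* StartSession() {
  gActive = new ActiveSession{gNextGeneration++};
  return gActive;
}

static void StopSession() {
  delete gActive;
  gActive = nullptr;
}

// Called by a sampler that was created for `aMyGeneration`.
static bool ShouldKeepSampling(uint32_t aMyGeneration) {
  // Checking only `gActive != nullptr` would wrongly keep an old sampler
  // alive if its session was stopped and a new one started in between.
  return gActive && gActive->mGeneration == aMyGeneration;
}

int main() {
  const uint32_t myGeneration = StartSession()->mGeneration;  // generation 0
  StopSession();
  StartSession();                                             // generation 1
  std::printf("%d\n", ShouldKeepSampling(myGeneration));      // 0: old sampler exits.
  StopSession();
  return 0;
}
// --- End of aside.
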
+ UniquePtr mBaseProfileThreads; + ProfileBufferBlockIndex mGeckoIndexWhenBaseProfileAdded; + + struct ExitProfile { + nsCString mJSON; + uint64_t mBufferPositionAtGatherTime; + }; + Vector mExitProfiles; + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + Atomic mMemoryCounter; +#endif +}; + +ActivePS* ActivePS::sInstance = nullptr; +uint32_t ActivePS::sNextGeneration = 0; + +#undef PS_GET +#undef PS_GET_LOCKLESS +#undef PS_GET_AND_SET + +using ProfilerStateChangeMutex = + mozilla::baseprofiler::detail::BaseProfilerMutex; +using ProfilerStateChangeLock = + mozilla::baseprofiler::detail::BaseProfilerAutoLock; +static ProfilerStateChangeMutex gProfilerStateChangeMutex; + +struct IdentifiedProfilingStateChangeCallback { + ProfilingStateSet mProfilingStateSet; + ProfilingStateChangeCallback mProfilingStateChangeCallback; + uintptr_t mUniqueIdentifier; + + explicit IdentifiedProfilingStateChangeCallback( + ProfilingStateSet aProfilingStateSet, + ProfilingStateChangeCallback&& aProfilingStateChangeCallback, + uintptr_t aUniqueIdentifier) + : mProfilingStateSet(aProfilingStateSet), + mProfilingStateChangeCallback(aProfilingStateChangeCallback), + mUniqueIdentifier(aUniqueIdentifier) {} +}; +using IdentifiedProfilingStateChangeCallbackUPtr = + UniquePtr; + +static Vector + mIdentifiedProfilingStateChangeCallbacks; + +void profiler_add_state_change_callback( + ProfilingStateSet aProfilingStateSet, + ProfilingStateChangeCallback&& aCallback, + uintptr_t aUniqueIdentifier /* = 0 */) { + MOZ_ASSERT(!PSAutoLock::IsLockedOnCurrentThread()); + ProfilerStateChangeLock lock(gProfilerStateChangeMutex); + +#ifdef DEBUG + // Check if a non-zero id is not already used. Bug forgive it in non-DEBUG + // builds; in the worst case they may get removed too early. + if (aUniqueIdentifier != 0) { + for (const IdentifiedProfilingStateChangeCallbackUPtr& idedCallback : + mIdentifiedProfilingStateChangeCallbacks) { + MOZ_ASSERT(idedCallback->mUniqueIdentifier != aUniqueIdentifier); + } + } +#endif // DEBUG + + if (aProfilingStateSet.contains(ProfilingState::AlreadyActive) && + profiler_is_active()) { + aCallback(ProfilingState::AlreadyActive); + } + + (void)mIdentifiedProfilingStateChangeCallbacks.append( + MakeUnique( + aProfilingStateSet, std::move(aCallback), aUniqueIdentifier)); +} + +// Remove the callback with the given identifier. +void profiler_remove_state_change_callback(uintptr_t aUniqueIdentifier) { + MOZ_ASSERT(aUniqueIdentifier != 0); + if (aUniqueIdentifier == 0) { + // Forgive zero in non-DEBUG builds. 
+ return; + } + + MOZ_ASSERT(!PSAutoLock::IsLockedOnCurrentThread()); + ProfilerStateChangeLock lock(gProfilerStateChangeMutex); + + mIdentifiedProfilingStateChangeCallbacks.eraseIf( + [aUniqueIdentifier]( + const IdentifiedProfilingStateChangeCallbackUPtr& aIdedCallback) { + if (aIdedCallback->mUniqueIdentifier != aUniqueIdentifier) { + return false; + } + if (aIdedCallback->mProfilingStateSet.contains( + ProfilingState::RemovingCallback)) { + aIdedCallback->mProfilingStateChangeCallback( + ProfilingState::RemovingCallback); + } + return true; + }); +} + +static void invoke_profiler_state_change_callbacks( + ProfilingState aProfilingState) { + MOZ_ASSERT(!PSAutoLock::IsLockedOnCurrentThread()); + ProfilerStateChangeLock lock(gProfilerStateChangeMutex); + + for (const IdentifiedProfilingStateChangeCallbackUPtr& idedCallback : + mIdentifiedProfilingStateChangeCallbacks) { + if (idedCallback->mProfilingStateSet.contains(aProfilingState)) { + idedCallback->mProfilingStateChangeCallback(aProfilingState); + } + } +} + +Atomic RacyFeatures::sActiveAndFeatures(0); + +// The name of the main thread. +static const char* const kMainThreadName = "GeckoMain"; + +//////////////////////////////////////////////////////////////////////// +// BEGIN sampling/unwinding code + +// The registers used for stack unwinding and a few other sampling purposes. +// The ctor does nothing; users are responsible for filling in the fields. +class Registers { + public: + Registers() : mPC{nullptr}, mSP{nullptr}, mFP{nullptr}, mLR{nullptr} {} + + void Clear() { memset(this, 0, sizeof(*this)); } + + // These fields are filled in by + // Sampler::SuspendAndSampleAndResumeThread() for periodic and backtrace + // samples, and by REGISTERS_SYNC_POPULATE for synchronous samples. + Address mPC; // Instruction pointer. + Address mSP; // Stack pointer. + Address mFP; // Frame pointer. + Address mLR; // ARM link register. +#if defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd) + // This contains all the registers, which means it duplicates the four fields + // above. This is ok. + ucontext_t* mContext; // The context from the signal handler or below. + ucontext_t mContextSyncStorage; // Storage for sync stack unwinding. +#endif +}; + +// Setting MAX_NATIVE_FRAMES too high risks the unwinder wasting a lot of time +// looping on corrupted stacks. +static const size_t MAX_NATIVE_FRAMES = 1024; + +struct NativeStack { + void* mPCs[MAX_NATIVE_FRAMES]; + void* mSPs[MAX_NATIVE_FRAMES]; + size_t mCount; // Number of frames filled. + + NativeStack() : mPCs(), mSPs(), mCount(0) {} +}; + +Atomic WALKING_JS_STACK(false); + +struct AutoWalkJSStack { + bool walkAllowed; + + AutoWalkJSStack() : walkAllowed(false) { + walkAllowed = WALKING_JS_STACK.compareExchange(false, true); + } + + ~AutoWalkJSStack() { + if (walkAllowed) { + WALKING_JS_STACK = false; + } + } +}; + +class StackWalkControl { + public: + struct ResumePoint { + // If lost, the stack walker should resume at these values. + void* resumeSp; // If null, stop the walker here, don't resume again. + void* resumeBp; + void* resumePc; + }; + +#if ((defined(USE_MOZ_STACK_WALK) || defined(USE_FRAME_POINTER_STACK_WALK)) && \ + defined(GP_ARCH_amd64)) + public: + static constexpr bool scIsSupported = true; + + void Clear() { mResumePointCount = 0; } + + size_t ResumePointCount() const { return mResumePointCount; } + + static constexpr size_t MaxResumePointCount() { + return scMaxResumePointCount; + } + + // Add a resume point. 
Note that adding anything past MaxResumePointCount() + // would silently fail. In practice this means that stack walking may still + // lose native frames. + void AddResumePoint(ResumePoint&& aResumePoint) { + // If SP is null, we expect BP and PC to also be null. + MOZ_ASSERT_IF(!aResumePoint.resumeSp, !aResumePoint.resumeBp); + MOZ_ASSERT_IF(!aResumePoint.resumeSp, !aResumePoint.resumePc); + + // If BP and/or PC are not null, SP must not be null. (But we allow BP/PC to + // be null even if SP is not null.) + MOZ_ASSERT_IF(aResumePoint.resumeBp, aResumePoint.resumeSp); + MOZ_ASSERT_IF(aResumePoint.resumePc, aResumePoint.resumeSp); + + if (mResumePointCount < scMaxResumePointCount) { + mResumePoint[mResumePointCount] = std::move(aResumePoint); + ++mResumePointCount; + } + } + + // Only allow non-modifying range-for loops. + const ResumePoint* begin() const { return &mResumePoint[0]; } + const ResumePoint* end() const { return &mResumePoint[mResumePointCount]; } + + // Find the next resume point that would be a caller of the function with the + // given SP; i.e., the resume point with the closest resumeSp > aSp. + const ResumePoint* GetResumePointCallingSp(void* aSp) const { + const ResumePoint* callingResumePoint = nullptr; + for (const ResumePoint& resumePoint : *this) { + if (resumePoint.resumeSp && // This is a potential resume point. + resumePoint.resumeSp > aSp && // It is a caller of the given SP. + (!callingResumePoint || // This is the first candidate. + resumePoint.resumeSp < callingResumePoint->resumeSp) // Or better. + ) { + callingResumePoint = &resumePoint; + } + } + return callingResumePoint; + } + + private: + size_t mResumePointCount = 0; + static constexpr size_t scMaxResumePointCount = 32; + ResumePoint mResumePoint[scMaxResumePointCount]; + +#else + public: + static constexpr bool scIsSupported = false; + // Discarded constexpr-if statements are still checked during compilation, + // these declarations are necessary for that, even if not actually used. + void Clear(); + size_t ResumePointCount(); + static constexpr size_t MaxResumePointCount(); + void AddResumePoint(ResumePoint&& aResumePoint); + const ResumePoint* begin() const; + const ResumePoint* end() const; + const ResumePoint* GetResumePointCallingSp(void* aSp) const; +#endif +}; + +// Make a copy of the JS stack into a JSFrame array, and return the number of +// copied frames. +// This copy is necessary since, like the native stack, the JS stack is iterated +// youngest-to-oldest and we need to iterate oldest-to-youngest in MergeStacks. +static uint32_t ExtractJsFrames( + bool aIsSynchronous, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, ProfilerStackCollector& aCollector, + JsFrameBuffer aJsFrames, StackWalkControl* aStackWalkControlIfSupported) { + MOZ_ASSERT(aJsFrames, + "ExtractJsFrames should only be called if there is a " + "JsFrameBuffer to fill."); + + uint32_t jsFramesCount = 0; + + // Only walk jit stack if profiling frame iterator is turned on. 
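
// --- Illustrative aside (not part of this patch): GetResumePointCallingSp()
// above picks, among the recorded resume points, the one whose stack pointer
// is the closest *caller* of a given SP: stacks grow downwards, so callers
// live at higher addresses, and "closest" means the smallest such address. A
// standalone sketch of that selection:
#include <cstddef>
#include <cstdio>

struct ResumePoint {
  void* resumeSp;  // Null means "stop walking here".
  void* resumePc;
};

static const ResumePoint* GetCallingResumePoint(void* aSp,
                                                const ResumePoint* aPoints,
                                                size_t aCount) {
  const ResumePoint* best = nullptr;
  for (size_t i = 0; i < aCount; ++i) {
    const ResumePoint& point = aPoints[i];
    if (point.resumeSp &&                              // Usable resume point.
        point.resumeSp > aSp &&                        // A caller of aSp.
        (!best || point.resumeSp < best->resumeSp)) {  // The closest caller.
      best = &point;
    }
  }
  return best;
}

int main() {
  char stack[64];  // Stand-in addresses; only their relative order matters.
  ResumePoint points[] = {{&stack[48], nullptr}, {&stack[16], nullptr}};
  const ResumePoint* picked = GetCallingResumePoint(&stack[8], points, 2);
  std::printf("picked the closer caller: %d\n", picked == &points[1]);  // 1
  return 0;
}
// --- End of aside.
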
+ JSContext* context = aThreadData.GetJSContext(); + if (context && JS::IsProfilingEnabledForContext(context)) { + AutoWalkJSStack autoWalkJSStack; + + if (autoWalkJSStack.walkAllowed) { + JS::ProfilingFrameIterator::RegisterState registerState; + registerState.pc = aRegs.mPC; + registerState.sp = aRegs.mSP; + registerState.lr = aRegs.mLR; + registerState.fp = aRegs.mFP; + + // Non-periodic sampling passes Nothing() as the buffer write position to + // ProfilingFrameIterator to avoid incorrectly resetting the buffer + // position of sampled JIT frames inside the JS engine. + Maybe samplePosInBuffer; + if (!aIsSynchronous) { + // aCollector.SamplePositionInBuffer() will return Nothing() when + // profiler_suspend_and_sample_thread is called from the background hang + // reporter. + samplePosInBuffer = aCollector.SamplePositionInBuffer(); + } + + for (JS::ProfilingFrameIterator jsIter(context, registerState, + samplePosInBuffer); + !jsIter.done(); ++jsIter) { + if (aIsSynchronous || jsIter.isWasm()) { + jsFramesCount += + jsIter.extractStack(aJsFrames, jsFramesCount, MAX_JS_FRAMES); + if (jsFramesCount == MAX_JS_FRAMES) { + break; + } + } else { + Maybe frame = + jsIter.getPhysicalFrameWithoutLabel(); + if (frame.isSome()) { + aJsFrames[jsFramesCount++] = std::move(frame).ref(); + if (jsFramesCount == MAX_JS_FRAMES) { + break; + } + } + } + + if constexpr (StackWalkControl::scIsSupported) { + if (aStackWalkControlIfSupported) { + jsIter.getCppEntryRegisters().apply( + [&](const JS::ProfilingFrameIterator::RegisterState& + aCppEntry) { + StackWalkControl::ResumePoint resumePoint; + resumePoint.resumeSp = aCppEntry.sp; + resumePoint.resumeBp = aCppEntry.fp; + resumePoint.resumePc = aCppEntry.pc; + aStackWalkControlIfSupported->AddResumePoint( + std::move(resumePoint)); + }); + } + } else { + MOZ_ASSERT(!aStackWalkControlIfSupported, + "aStackWalkControlIfSupported should be null when " + "!StackWalkControl::scIsSupported"); + (void)aStackWalkControlIfSupported; + } + } + } + } + + return jsFramesCount; +} + +// Merges the profiling stack, native stack, and JS stack, outputting the +// details to aCollector. +static void MergeStacks( + uint32_t aFeatures, bool aIsSynchronous, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, const NativeStack& aNativeStack, + ProfilerStackCollector& aCollector, JsFrame* aJsFrames, + uint32_t aJsFramesCount) { + // WARNING: this function runs within the profiler's "critical section". + // WARNING: this function might be called while the profiler is inactive, and + // cannot rely on ActivePS. + + MOZ_ASSERT_IF(!aJsFrames, aJsFramesCount == 0); + + const ProfilingStack& profilingStack = aThreadData.ProfilingStackCRef(); + const js::ProfilingStackFrame* profilingStackFrames = profilingStack.frames; + uint32_t profilingStackFrameCount = profilingStack.stackSize(); + + // While the profiling stack array is ordered oldest-to-youngest, the JS and + // native arrays are ordered youngest-to-oldest. We must add frames to aInfo + // oldest-to-youngest. Thus, iterate over the profiling stack forwards and JS + // and native arrays backwards. Note: this means the terminating condition + // jsIndex and nativeIndex is being < 0. + uint32_t profilingStackIndex = 0; + int32_t jsIndex = aJsFramesCount - 1; + int32_t nativeIndex = aNativeStack.mCount - 1; + + uint8_t* lastLabelFrameStackAddr = nullptr; + uint8_t* jitEndStackAddr = nullptr; + + // Iterate as long as there is at least one frame remaining. 
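
// --- Illustrative aside (not part of this patch): the loop below repeatedly
// emits whichever remaining frame is the oldest, using stack addresses as the
// ordering key (the stack grows down, so older frames have higher addresses).
// A standalone sketch of the same merge over two sources, one stored
// oldest-to-youngest and one youngest-to-oldest:
#include <cstdint>
#include <cstdio>
#include <vector>

struct Frame {
  const char* label;
  uintptr_t stackAddress;
};

static std::vector<Frame> MergeOldestFirst(
    const std::vector<Frame>& aLabelFrames,     // Ordered oldest to youngest.
    const std::vector<Frame>& aNativeFrames) {  // Ordered youngest to oldest.
  std::vector<Frame> merged;
  size_t labelIndex = 0;
  int nativeIndex = int(aNativeFrames.size()) - 1;  // Walk backwards.
  while (labelIndex < aLabelFrames.size() || nativeIndex >= 0) {
    const uintptr_t labelAddr = labelIndex < aLabelFrames.size()
                                    ? aLabelFrames[labelIndex].stackAddress
                                    : 0;
    const uintptr_t nativeAddr =
        nativeIndex >= 0 ? aNativeFrames[size_t(nativeIndex)].stackAddress : 0;
    if (labelAddr >= nativeAddr) {
      merged.push_back(aLabelFrames[labelIndex++]);  // Label frame is older.
    } else {
      merged.push_back(aNativeFrames[size_t(nativeIndex--)]);
    }
  }
  return merged;
}

int main() {
  std::vector<Frame> labels{{"main", 0x9000}, {"Paint", 0x5000}};
  std::vector<Frame> native{
      {"memcpy", 0x4000}, {"DrawRect", 0x4800}, {"libc_start", 0x9800}};
  for (const Frame& frame : MergeOldestFirst(labels, native)) {
    std::printf("%s\n", frame.label);
  }
  // Prints: libc_start, main, Paint, DrawRect, memcpy (oldest to youngest).
  return 0;
}
// --- End of aside.
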
+ while (profilingStackIndex != profilingStackFrameCount || jsIndex >= 0 || + nativeIndex >= 0) { + // There are 1 to 3 frames available. Find and add the oldest. + uint8_t* profilingStackAddr = nullptr; + uint8_t* jsStackAddr = nullptr; + uint8_t* nativeStackAddr = nullptr; + uint8_t* jsActivationAddr = nullptr; + + if (profilingStackIndex != profilingStackFrameCount) { + const js::ProfilingStackFrame& profilingStackFrame = + profilingStackFrames[profilingStackIndex]; + + if (profilingStackFrame.isLabelFrame() || + profilingStackFrame.isSpMarkerFrame()) { + lastLabelFrameStackAddr = (uint8_t*)profilingStackFrame.stackAddress(); + } + + // Skip any JS_OSR frames. Such frames are used when the JS interpreter + // enters a jit frame on a loop edge (via on-stack-replacement, or OSR). + // To avoid both the profiling stack frame and jit frame being recorded + // (and showing up twice), the interpreter marks the interpreter + // profiling stack frame as JS_OSR to ensure that it doesn't get counted. + if (profilingStackFrame.isOSRFrame()) { + profilingStackIndex++; + continue; + } + + MOZ_ASSERT(lastLabelFrameStackAddr); + profilingStackAddr = lastLabelFrameStackAddr; + } + + if (jsIndex >= 0) { + jsStackAddr = (uint8_t*)aJsFrames[jsIndex].stackAddress; + jsActivationAddr = (uint8_t*)aJsFrames[jsIndex].activation; + } + + if (nativeIndex >= 0) { + nativeStackAddr = (uint8_t*)aNativeStack.mSPs[nativeIndex]; + } + + // If there's a native stack frame which has the same SP as a profiling + // stack frame, pretend we didn't see the native stack frame. Ditto for a + // native stack frame which has the same SP as a JS stack frame. In effect + // this means profiling stack frames or JS frames trump conflicting native + // frames. + if (nativeStackAddr && (profilingStackAddr == nativeStackAddr || + jsStackAddr == nativeStackAddr)) { + nativeStackAddr = nullptr; + nativeIndex--; + MOZ_ASSERT(profilingStackAddr || jsStackAddr); + } + + // Sanity checks. + MOZ_ASSERT_IF(profilingStackAddr, + profilingStackAddr != jsStackAddr && + profilingStackAddr != nativeStackAddr); + MOZ_ASSERT_IF(jsStackAddr, jsStackAddr != profilingStackAddr && + jsStackAddr != nativeStackAddr); + MOZ_ASSERT_IF(nativeStackAddr, nativeStackAddr != profilingStackAddr && + nativeStackAddr != jsStackAddr); + + // Check to see if profiling stack frame is top-most. + if (profilingStackAddr > jsStackAddr && + profilingStackAddr > nativeStackAddr) { + MOZ_ASSERT(profilingStackIndex < profilingStackFrameCount); + const js::ProfilingStackFrame& profilingStackFrame = + profilingStackFrames[profilingStackIndex]; + + // Sp marker frames are just annotations and should not be recorded in + // the profile. + if (!profilingStackFrame.isSpMarkerFrame()) { + // The JIT only allows the top-most frame to have a nullptr pc. + MOZ_ASSERT_IF( + profilingStackFrame.isJsFrame() && profilingStackFrame.script() && + !profilingStackFrame.pc(), + &profilingStackFrame == + &profilingStack.frames[profilingStack.stackSize() - 1]); + if (aIsSynchronous && profilingStackFrame.categoryPair() == + JS::ProfilingCategoryPair::PROFILER) { + // For stacks captured synchronously (ie. marker stacks), stop + // walking the stack as soon as we enter the profiler category, + // to avoid showing profiler internal code in marker stacks. 
+ return; + } + aCollector.CollectProfilingStackFrame(profilingStackFrame); + } + profilingStackIndex++; + continue; + } + + // Check to see if JS jit stack frame is top-most + if (jsStackAddr > nativeStackAddr) { + MOZ_ASSERT(jsIndex >= 0); + const JS::ProfilingFrameIterator::Frame& jsFrame = aJsFrames[jsIndex]; + jitEndStackAddr = (uint8_t*)jsFrame.endStackAddress; + // Stringifying non-wasm JIT frames is delayed until streaming time. To + // re-lookup the entry in the JitcodeGlobalTable, we need to store the + // JIT code address (OptInfoAddr) in the circular buffer. + // + // Note that we cannot do this when we are sychronously sampling the + // current thread; that is, when called from profiler_get_backtrace. The + // captured backtrace is usually externally stored for an indeterminate + // amount of time, such as in nsRefreshDriver. Problematically, the + // stored backtrace may be alive across a GC during which the profiler + // itself is disabled. In that case, the JS engine is free to discard its + // JIT code. This means that if we inserted such OptInfoAddr entries into + // the buffer, nsRefreshDriver would now be holding on to a backtrace + // with stale JIT code return addresses. + if (aIsSynchronous || + jsFrame.kind == JS::ProfilingFrameIterator::Frame_Wasm) { + aCollector.CollectWasmFrame(jsFrame.label); + } else if (jsFrame.kind == + JS::ProfilingFrameIterator::Frame_BaselineInterpreter) { + // Materialize a ProfilingStackFrame similar to the C++ Interpreter. We + // also set the IS_BLINTERP_FRAME flag to differentiate though. + JSScript* script = jsFrame.interpreterScript; + jsbytecode* pc = jsFrame.interpreterPC(); + js::ProfilingStackFrame stackFrame; + constexpr uint32_t ExtraFlags = + uint32_t(js::ProfilingStackFrame::Flags::IS_BLINTERP_FRAME); + stackFrame.initJsFrame("", jsFrame.label, script, pc, + jsFrame.realmID); + aCollector.CollectProfilingStackFrame(stackFrame); + } else { + MOZ_ASSERT(jsFrame.kind == JS::ProfilingFrameIterator::Frame_Ion || + jsFrame.kind == JS::ProfilingFrameIterator::Frame_Baseline); + aCollector.CollectJitReturnAddr(jsFrame.returnAddress()); + } + + jsIndex--; + continue; + } + + // If we reach here, there must be a native stack frame and it must be the + // greatest frame. + if (nativeStackAddr && + // If the latest JS frame was JIT, this could be the native frame that + // corresponds to it. In that case, skip the native frame, because + // there's no need for the same frame to be present twice in the stack. + // The JS frame can be considered the symbolicated version of the native + // frame. + (!jitEndStackAddr || nativeStackAddr < jitEndStackAddr) && + // This might still be a JIT operation, check to make sure that is not + // in range of the NEXT JavaScript's stacks' activation address. + (!jsActivationAddr || nativeStackAddr > jsActivationAddr)) { + MOZ_ASSERT(nativeIndex >= 0); + void* addr = (void*)aNativeStack.mPCs[nativeIndex]; + aCollector.CollectNativeLeafAddr(addr); + } + if (nativeIndex >= 0) { + nativeIndex--; + } + } + + // Update the JS context with the current profile sample buffer generation. + // + // Only do this for periodic samples. We don't want to do this for + // synchronous samples, and we also don't want to do it for calls to + // profiler_suspend_and_sample_thread() from the background hang reporter - + // in that case, aCollector.BufferRangeStart() will return Nothing(). 
+ if (!aIsSynchronous) { + aCollector.BufferRangeStart().apply( + [&aThreadData](uint64_t aBufferRangeStart) { + JSContext* context = aThreadData.GetJSContext(); + if (context) { + JS::SetJSContextProfilerSampleBufferRangeStart(context, + aBufferRangeStart); + } + }); + } +} + +#if defined(USE_FRAME_POINTER_STACK_WALK) || defined(USE_MOZ_STACK_WALK) +static void StackWalkCallback(uint32_t aFrameNumber, void* aPC, void* aSP, + void* aClosure) { + NativeStack* nativeStack = static_cast(aClosure); + MOZ_ASSERT(nativeStack->mCount < MAX_NATIVE_FRAMES); + nativeStack->mSPs[nativeStack->mCount] = aSP; + nativeStack->mPCs[nativeStack->mCount] = aPC; + nativeStack->mCount++; +} +#endif + +#if defined(USE_FRAME_POINTER_STACK_WALK) +static void DoFramePointerBacktrace( + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, NativeStack& aNativeStack, + StackWalkControl* aStackWalkControlIfSupported) { + // WARNING: this function runs within the profiler's "critical section". + // WARNING: this function might be called while the profiler is inactive, and + // cannot rely on ActivePS. + + // Make a local copy of the Registers, to allow modifications. + Registers regs = aRegs; + + // Start with the current function. We use 0 as the frame number here because + // the FramePointerStackWalk() call below will use 1..N. This is a bit weird + // but it doesn't matter because StackWalkCallback() doesn't use the frame + // number argument. + StackWalkCallback(/* frameNum */ 0, regs.mPC, regs.mSP, &aNativeStack); + + const void* const stackEnd = aThreadData.StackTop(); + + // This is to check forward-progress after using a resume point. + void* previousResumeSp = nullptr; + + for (;;) { + if (!(regs.mSP && regs.mSP <= regs.mFP && regs.mFP <= stackEnd)) { + break; + } + FramePointerStackWalk(StackWalkCallback, + uint32_t(MAX_NATIVE_FRAMES - aNativeStack.mCount), + &aNativeStack, reinterpret_cast(regs.mFP), + const_cast(stackEnd)); + + if constexpr (!StackWalkControl::scIsSupported) { + break; + } else { + if (aNativeStack.mCount >= MAX_NATIVE_FRAMES) { + // No room to add more frames. + break; + } + if (!aStackWalkControlIfSupported || + aStackWalkControlIfSupported->ResumePointCount() == 0) { + // No resume information. + break; + } + void* lastSP = aNativeStack.mSPs[aNativeStack.mCount - 1]; + if (previousResumeSp && + ((uintptr_t)lastSP <= (uintptr_t)previousResumeSp)) { + // No progress after the previous resume point. + break; + } + const StackWalkControl::ResumePoint* resumePoint = + aStackWalkControlIfSupported->GetResumePointCallingSp(lastSP); + if (!resumePoint) { + break; + } + void* sp = resumePoint->resumeSp; + if (!sp) { + // Null SP in a resume point means we stop here. + break; + } + void* pc = resumePoint->resumePc; + StackWalkCallback(/* frameNum */ aNativeStack.mCount, pc, sp, + &aNativeStack); + ++aNativeStack.mCount; + if (aNativeStack.mCount >= MAX_NATIVE_FRAMES) { + break; + } + // Prepare context to resume stack walking. + regs.mPC = (Address)pc; + regs.mSP = (Address)sp; + regs.mFP = (Address)resumePoint->resumeBp; + + previousResumeSp = sp; + } + } +} +#endif + +#if defined(USE_MOZ_STACK_WALK) +static void DoMozStackWalkBacktrace( + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, NativeStack& aNativeStack, + StackWalkControl* aStackWalkControlIfSupported) { + // WARNING: this function runs within the profiler's "critical section". 
+ // WARNING: this function might be called while the profiler is inactive, and + // cannot rely on ActivePS. + + // Start with the current function. We use 0 as the frame number here because + // the MozStackWalkThread() call below will use 1..N. This is a bit weird but + // it doesn't matter because StackWalkCallback() doesn't use the frame number + // argument. + StackWalkCallback(/* frameNum */ 0, aRegs.mPC, aRegs.mSP, &aNativeStack); + + HANDLE thread = aThreadData.PlatformDataCRef().ProfiledThread(); + MOZ_ASSERT(thread); + + CONTEXT context_buf; + CONTEXT* context = nullptr; + if constexpr (StackWalkControl::scIsSupported) { + context = &context_buf; + memset(&context_buf, 0, sizeof(CONTEXT)); + context_buf.ContextFlags = CONTEXT_FULL; +# if defined(_M_AMD64) + context_buf.Rsp = (DWORD64)aRegs.mSP; + context_buf.Rbp = (DWORD64)aRegs.mFP; + context_buf.Rip = (DWORD64)aRegs.mPC; +# else + static_assert(!StackWalkControl::scIsSupported, + "Mismatched support between StackWalkControl and " + "DoMozStackWalkBacktrace"); +# endif + } else { + context = nullptr; + } + + // This is to check forward-progress after using a resume point. + void* previousResumeSp = nullptr; + + for (;;) { + MozStackWalkThread(StackWalkCallback, + uint32_t(MAX_NATIVE_FRAMES - aNativeStack.mCount), + &aNativeStack, thread, context); + + if constexpr (!StackWalkControl::scIsSupported) { + break; + } else { + if (aNativeStack.mCount >= MAX_NATIVE_FRAMES) { + // No room to add more frames. + break; + } + if (!aStackWalkControlIfSupported || + aStackWalkControlIfSupported->ResumePointCount() == 0) { + // No resume information. + break; + } + void* lastSP = aNativeStack.mSPs[aNativeStack.mCount - 1]; + if (previousResumeSp && + ((uintptr_t)lastSP <= (uintptr_t)previousResumeSp)) { + // No progress after the previous resume point. + break; + } + const StackWalkControl::ResumePoint* resumePoint = + aStackWalkControlIfSupported->GetResumePointCallingSp(lastSP); + if (!resumePoint) { + break; + } + void* sp = resumePoint->resumeSp; + if (!sp) { + // Null SP in a resume point means we stop here. + break; + } + void* pc = resumePoint->resumePc; + StackWalkCallback(/* frameNum */ aNativeStack.mCount, pc, sp, + &aNativeStack); + ++aNativeStack.mCount; + if (aNativeStack.mCount >= MAX_NATIVE_FRAMES) { + break; + } + // Prepare context to resume stack walking. + memset(&context_buf, 0, sizeof(CONTEXT)); + context_buf.ContextFlags = CONTEXT_FULL; +# if defined(_M_AMD64) + context_buf.Rsp = (DWORD64)sp; + context_buf.Rbp = (DWORD64)resumePoint->resumeBp; + context_buf.Rip = (DWORD64)pc; +# else + static_assert(!StackWalkControl::scIsSupported, + "Mismatched support between StackWalkControl and " + "DoMozStackWalkBacktrace"); +# endif + previousResumeSp = sp; + } + } +} +#endif + +#ifdef USE_EHABI_STACKWALK +static void DoEHABIBacktrace( + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, NativeStack& aNativeStack, + StackWalkControl* aStackWalkControlIfSupported) { + // WARNING: this function runs within the profiler's "critical section". + // WARNING: this function might be called while the profiler is inactive, and + // cannot rely on ActivePS. + + aNativeStack.mCount = EHABIStackWalk( + aRegs.mContext->uc_mcontext, const_cast(aThreadData.StackTop()), + aNativeStack.mSPs, aNativeStack.mPCs, MAX_NATIVE_FRAMES); + (void)aStackWalkControlIfSupported; // TODO: Implement. +} +#endif + +#ifdef USE_LUL_STACKWALK + +// See the comment at the callsite for why this function is necessary. 
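+// (MOZ_ASAN_IGNORE is assumed to be an attribute in the spirit of
+// no_sanitize("address"): it keeps ASAN from instrumenting this function's own
+// accesses, but any instrumented function it calls, memcpy() included, would
+// still be checked, hence the hand-rolled copy loop below.)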
+# if defined(MOZ_HAVE_ASAN_IGNORE) +MOZ_ASAN_IGNORE static void ASAN_memcpy(void* aDst, const void* aSrc, + size_t aLen) { + // The obvious thing to do here is call memcpy(). However, although + // ASAN_memcpy() is not instrumented by ASAN, memcpy() still is, and the + // false positive still manifests! So we must implement memcpy() ourselves + // within this function. + char* dst = static_cast(aDst); + const char* src = static_cast(aSrc); + + for (size_t i = 0; i < aLen; i++) { + dst[i] = src[i]; + } +} +# endif + +static void DoLULBacktrace( + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, NativeStack& aNativeStack, + StackWalkControl* aStackWalkControlIfSupported) { + // WARNING: this function runs within the profiler's "critical section". + // WARNING: this function might be called while the profiler is inactive, and + // cannot rely on ActivePS. + + (void)aStackWalkControlIfSupported; // TODO: Implement. + + const mcontext_t* mc = &aRegs.mContext->uc_mcontext; + + lul::UnwindRegs startRegs; + memset(&startRegs, 0, sizeof(startRegs)); + +# if defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_amd64_android) + startRegs.xip = lul::TaggedUWord(mc->gregs[REG_RIP]); + startRegs.xsp = lul::TaggedUWord(mc->gregs[REG_RSP]); + startRegs.xbp = lul::TaggedUWord(mc->gregs[REG_RBP]); +# elif defined(GP_PLAT_amd64_freebsd) + startRegs.xip = lul::TaggedUWord(mc->mc_rip); + startRegs.xsp = lul::TaggedUWord(mc->mc_rsp); + startRegs.xbp = lul::TaggedUWord(mc->mc_rbp); +# elif defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android) + startRegs.r15 = lul::TaggedUWord(mc->arm_pc); + startRegs.r14 = lul::TaggedUWord(mc->arm_lr); + startRegs.r13 = lul::TaggedUWord(mc->arm_sp); + startRegs.r12 = lul::TaggedUWord(mc->arm_ip); + startRegs.r11 = lul::TaggedUWord(mc->arm_fp); + startRegs.r7 = lul::TaggedUWord(mc->arm_r7); +# elif defined(GP_PLAT_arm64_linux) || defined(GP_PLAT_arm64_android) + startRegs.pc = lul::TaggedUWord(mc->pc); + startRegs.x29 = lul::TaggedUWord(mc->regs[29]); + startRegs.x30 = lul::TaggedUWord(mc->regs[30]); + startRegs.sp = lul::TaggedUWord(mc->sp); +# elif defined(GP_PLAT_arm64_freebsd) + startRegs.pc = lul::TaggedUWord(mc->mc_gpregs.gp_elr); + startRegs.x29 = lul::TaggedUWord(mc->mc_gpregs.gp_x[29]); + startRegs.x30 = lul::TaggedUWord(mc->mc_gpregs.gp_lr); + startRegs.sp = lul::TaggedUWord(mc->mc_gpregs.gp_sp); +# elif defined(GP_PLAT_x86_linux) || defined(GP_PLAT_x86_android) + startRegs.xip = lul::TaggedUWord(mc->gregs[REG_EIP]); + startRegs.xsp = lul::TaggedUWord(mc->gregs[REG_ESP]); + startRegs.xbp = lul::TaggedUWord(mc->gregs[REG_EBP]); +# elif defined(GP_PLAT_mips64_linux) + startRegs.pc = lul::TaggedUWord(mc->pc); + startRegs.sp = lul::TaggedUWord(mc->gregs[29]); + startRegs.fp = lul::TaggedUWord(mc->gregs[30]); +# else +# error "Unknown plat" +# endif + + // Copy up to N_STACK_BYTES from rsp-REDZONE upwards, but not going past the + // stack's registered top point. Do some basic validity checks too. This + // assumes that the TaggedUWord holding the stack pointer value is valid, but + // it should be, since it was constructed that way in the code just above. + + // We could construct |stackImg| so that LUL reads directly from the stack in + // question, rather than from a copy of it. That would reduce overhead and + // space use a bit. 
However, it gives a problem with dynamic analysis tools + // (ASan, TSan, Valgrind) which is that such tools will report invalid or + // racing memory accesses, and such accesses will be reported deep inside LUL. + // By taking a copy here, we can either sanitise the copy (for Valgrind) or + // copy it using an unchecked memcpy (for ASan, TSan). That way we don't have + // to try and suppress errors inside LUL. + // + // N_STACK_BYTES is set to 160KB. This is big enough to hold all stacks + // observed in some minutes of testing, whilst keeping the size of this + // function (DoNativeBacktrace)'s frame reasonable. Most stacks observed in + // practice are small, 4KB or less, and so the copy costs are insignificant + // compared to other profiler overhead. + // + // |stackImg| is allocated on this (the sampling thread's) stack. That + // implies that the frame for this function is at least N_STACK_BYTES large. + // In general it would be considered unacceptable to have such a large frame + // on a stack, but it only exists for the unwinder thread, and so is not + // expected to be a problem. Allocating it on the heap is troublesome because + // this function runs whilst the sampled thread is suspended, so any heap + // allocation risks deadlock. Allocating it as a global variable is not + // thread safe, which would be a problem if we ever allow multiple sampler + // threads. Hence allocating it on the stack seems to be the least-worst + // option. + + lul::StackImage stackImg; + + { +# if defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_amd64_android) || \ + defined(GP_PLAT_amd64_freebsd) + uintptr_t rEDZONE_SIZE = 128; + uintptr_t start = startRegs.xsp.Value() - rEDZONE_SIZE; +# elif defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android) + uintptr_t rEDZONE_SIZE = 0; + uintptr_t start = startRegs.r13.Value() - rEDZONE_SIZE; +# elif defined(GP_PLAT_arm64_linux) || defined(GP_PLAT_arm64_android) || \ + defined(GP_PLAT_arm64_freebsd) + uintptr_t rEDZONE_SIZE = 0; + uintptr_t start = startRegs.sp.Value() - rEDZONE_SIZE; +# elif defined(GP_PLAT_x86_linux) || defined(GP_PLAT_x86_android) + uintptr_t rEDZONE_SIZE = 0; + uintptr_t start = startRegs.xsp.Value() - rEDZONE_SIZE; +# elif defined(GP_PLAT_mips64_linux) + uintptr_t rEDZONE_SIZE = 0; + uintptr_t start = startRegs.sp.Value() - rEDZONE_SIZE; +# else +# error "Unknown plat" +# endif + uintptr_t end = reinterpret_cast(aThreadData.StackTop()); + uintptr_t ws = sizeof(void*); + start &= ~(ws - 1); + end &= ~(ws - 1); + uintptr_t nToCopy = 0; + if (start < end) { + nToCopy = end - start; + if (nToCopy >= 1024u * 1024u) { + // start is abnormally far from end, possibly due to some special code + // that uses a separate stack elsewhere (e.g.: rr). In this case we just + // give up on this sample. + nToCopy = 0; + } else if (nToCopy > lul::N_STACK_BYTES) { + nToCopy = lul::N_STACK_BYTES; + } + } + MOZ_ASSERT(nToCopy <= lul::N_STACK_BYTES); + stackImg.mLen = nToCopy; + stackImg.mStartAvma = start; + if (nToCopy > 0) { + // If this is a vanilla memcpy(), ASAN makes the following complaint: + // + // ERROR: AddressSanitizer: stack-buffer-underflow ... + // ... + // HINT: this may be a false positive if your program uses some custom + // stack unwind mechanism or swapcontext + // + // This code is very much a custom stack unwind mechanism! So we use an + // alternative memcpy() implementation that is ignored by ASAN. 
+# if defined(MOZ_HAVE_ASAN_IGNORE) + ASAN_memcpy(&stackImg.mContents[0], (void*)start, nToCopy); +# else + memcpy(&stackImg.mContents[0], (void*)start, nToCopy); +# endif + (void)VALGRIND_MAKE_MEM_DEFINED(&stackImg.mContents[0], nToCopy); + } + } + + size_t framePointerFramesAcquired = 0; + lul::LUL* lul = CorePS::Lul(); + MOZ_RELEASE_ASSERT(lul); + lul->Unwind(reinterpret_cast(aNativeStack.mPCs), + reinterpret_cast(aNativeStack.mSPs), + &aNativeStack.mCount, &framePointerFramesAcquired, + MAX_NATIVE_FRAMES, &startRegs, &stackImg); + + // Update stats in the LUL stats object. Unfortunately this requires + // three global memory operations. + lul->mStats.mContext += 1; + lul->mStats.mCFI += aNativeStack.mCount - 1 - framePointerFramesAcquired; + lul->mStats.mFP += framePointerFramesAcquired; +} + +#endif + +#ifdef HAVE_NATIVE_UNWIND +static void DoNativeBacktrace( + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, NativeStack& aNativeStack, + StackWalkControl* aStackWalkControlIfSupported) { + // This method determines which stackwalker is used for periodic and + // synchronous samples. (Backtrace samples are treated differently, see + // profiler_suspend_and_sample_thread() for details). The only part of the + // ordering that matters is that LUL must precede FRAME_POINTER, because on + // Linux they can both be present. +# if defined(USE_LUL_STACKWALK) + DoLULBacktrace(aThreadData, aRegs, aNativeStack, + aStackWalkControlIfSupported); +# elif defined(USE_EHABI_STACKWALK) + DoEHABIBacktrace(aThreadData, aRegs, aNativeStack, + aStackWalkControlIfSupported); +# elif defined(USE_FRAME_POINTER_STACK_WALK) + DoFramePointerBacktrace(aThreadData, aRegs, aNativeStack, + aStackWalkControlIfSupported); +# elif defined(USE_MOZ_STACK_WALK) + DoMozStackWalkBacktrace(aThreadData, aRegs, aNativeStack, + aStackWalkControlIfSupported); +# else +# error "Invalid configuration" +# endif +} +#endif + +// Writes some components shared by periodic and synchronous profiles to +// ActivePS's ProfileBuffer. (This should only be called from DoSyncSample() +// and DoPeriodicSample().) +// +// The grammar for entry sequences is in a comment above +// ProfileBuffer::StreamSamplesToJSON. +static inline void DoSharedSample( + bool aIsSynchronous, uint32_t aFeatures, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + JsFrame* aJsFrames, const Registers& aRegs, uint64_t aSamplePos, + uint64_t aBufferRangeStart, ProfileBuffer& aBuffer, + StackCaptureOptions aCaptureOptions = StackCaptureOptions::Full) { + // WARNING: this function runs within the profiler's "critical section". + + MOZ_ASSERT(!aBuffer.IsThreadSafe(), + "Mutexes cannot be used inside this critical section"); + + ProfileBufferCollector collector(aBuffer, aSamplePos, aBufferRangeStart); + StackWalkControl* stackWalkControlIfSupported = nullptr; +#if defined(HAVE_NATIVE_UNWIND) + const bool captureNative = ProfilerFeature::HasStackWalk(aFeatures) && + aCaptureOptions == StackCaptureOptions::Full; + StackWalkControl stackWalkControl; + if constexpr (StackWalkControl::scIsSupported) { + if (captureNative) { + stackWalkControlIfSupported = &stackWalkControl; + } + } +#endif // defined(HAVE_NATIVE_UNWIND) + const uint32_t jsFramesCount = + aJsFrames ? 
ExtractJsFrames(aIsSynchronous, aThreadData, aRegs, collector, + aJsFrames, stackWalkControlIfSupported) + : 0; + NativeStack nativeStack; +#if defined(HAVE_NATIVE_UNWIND) + if (captureNative) { + DoNativeBacktrace(aThreadData, aRegs, nativeStack, + stackWalkControlIfSupported); + + MergeStacks(aFeatures, aIsSynchronous, aThreadData, aRegs, nativeStack, + collector, aJsFrames, jsFramesCount); + } else +#endif + { + MergeStacks(aFeatures, aIsSynchronous, aThreadData, aRegs, nativeStack, + collector, aJsFrames, jsFramesCount); + + // We can't walk the whole native stack, but we can record the top frame. + if (aCaptureOptions == StackCaptureOptions::Full) { + aBuffer.AddEntry(ProfileBufferEntry::NativeLeafAddr((void*)aRegs.mPC)); + } + } +} + +// Writes the components of a synchronous sample to the given ProfileBuffer. +static void DoSyncSample( + uint32_t aFeatures, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const TimeStamp& aNow, const Registers& aRegs, ProfileBuffer& aBuffer, + StackCaptureOptions aCaptureOptions) { + // WARNING: this function runs within the profiler's "critical section". + + MOZ_ASSERT(aCaptureOptions != StackCaptureOptions::NoStack, + "DoSyncSample should not be called when no capture is needed"); + + const uint64_t bufferRangeStart = aBuffer.BufferRangeStart(); + + const uint64_t samplePos = + aBuffer.AddThreadIdEntry(aThreadData.Info().ThreadId()); + + TimeDuration delta = aNow - CorePS::ProcessStartTime(); + aBuffer.AddEntry(ProfileBufferEntry::Time(delta.ToMilliseconds())); + + if (!aThreadData.GetJSContext()) { + // No JSContext, there is no JS frame buffer (and no need for it). + DoSharedSample(/* aIsSynchronous = */ true, aFeatures, aThreadData, + /* aJsFrames = */ nullptr, aRegs, samplePos, + bufferRangeStart, aBuffer, aCaptureOptions); + } else { + // JSContext is present, we need to lock the thread data to access the JS + // frame buffer. + ThreadRegistration::WithOnThreadRef([&](ThreadRegistration::OnThreadRef + aOnThreadRef) { + aOnThreadRef.WithConstLockedRWOnThread( + [&](const ThreadRegistration::LockedRWOnThread& aLockedThreadData) { + DoSharedSample(/* aIsSynchronous = */ true, aFeatures, aThreadData, + aLockedThreadData.GetJsFrameBuffer(), aRegs, + samplePos, bufferRangeStart, aBuffer, + aCaptureOptions); + }); + }); + } +} + +// Writes the components of a periodic sample to ActivePS's ProfileBuffer. +// The ThreadId entry is already written in the main ProfileBuffer, its location +// is `aSamplePos`, we can write the rest to `aBuffer` (which may be different). +static inline void DoPeriodicSample( + PSLockRef aLock, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const Registers& aRegs, uint64_t aSamplePos, uint64_t aBufferRangeStart, + ProfileBuffer& aBuffer) { + // WARNING: this function runs within the profiler's "critical section". + + MOZ_RELEASE_ASSERT(ActivePS::Exists(aLock)); + + JsFrameBuffer& jsFrames = CorePS::JsFrames(aLock); + DoSharedSample(/* aIsSynchronous = */ false, ActivePS::Features(aLock), + aThreadData, jsFrames, aRegs, aSamplePos, aBufferRangeStart, + aBuffer); +} + +// END sampling/unwinding code +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// BEGIN saving/streaming code + +const static uint64_t kJS_MAX_SAFE_UINTEGER = +9007199254740991ULL; + +static int64_t SafeJSInteger(uint64_t aValue) { + return aValue <= kJS_MAX_SAFE_UINTEGER ? 
int64_t(aValue) : -1; +} + +static void AddSharedLibraryInfoToStream(JSONWriter& aWriter, + const SharedLibrary& aLib) { + aWriter.StartObjectElement(); + aWriter.IntProperty("start", SafeJSInteger(aLib.GetStart())); + aWriter.IntProperty("end", SafeJSInteger(aLib.GetEnd())); + aWriter.IntProperty("offset", SafeJSInteger(aLib.GetOffset())); + aWriter.StringProperty("name", NS_ConvertUTF16toUTF8(aLib.GetModuleName())); + aWriter.StringProperty("path", NS_ConvertUTF16toUTF8(aLib.GetModulePath())); + aWriter.StringProperty("debugName", + NS_ConvertUTF16toUTF8(aLib.GetDebugName())); + aWriter.StringProperty("debugPath", + NS_ConvertUTF16toUTF8(aLib.GetDebugPath())); + aWriter.StringProperty("breakpadId", aLib.GetBreakpadId()); + aWriter.StringProperty("codeId", aLib.GetCodeId()); + aWriter.StringProperty("arch", aLib.GetArch()); + aWriter.EndObject(); +} + +void AppendSharedLibraries(JSONWriter& aWriter, + const SharedLibraryInfo& aInfo) { + for (size_t i = 0; i < aInfo.GetSize(); i++) { + AddSharedLibraryInfoToStream(aWriter, aInfo.GetEntry(i)); + } +} + +static void StreamCategories(SpliceableJSONWriter& aWriter) { + // Same order as ProfilingCategory. Format: + // [ + // { + // name: "Idle", + // color: "transparent", + // subcategories: ["Other"], + // }, + // { + // name: "Other", + // color: "grey", + // subcategories: [ + // "JSM loading", + // "Subprocess launching", + // "DLL loading" + // ] + // }, + // ... + // ] + +#define CATEGORY_JSON_BEGIN_CATEGORY(name, labelAsString, color) \ + aWriter.Start(); \ + aWriter.StringProperty("name", labelAsString); \ + aWriter.StringProperty("color", color); \ + aWriter.StartArrayProperty("subcategories"); +#define CATEGORY_JSON_SUBCATEGORY(supercategory, name, labelAsString) \ + aWriter.StringElement(labelAsString); +#define CATEGORY_JSON_END_CATEGORY \ + aWriter.EndArray(); \ + aWriter.EndObject(); + + MOZ_PROFILING_CATEGORY_LIST(CATEGORY_JSON_BEGIN_CATEGORY, + CATEGORY_JSON_SUBCATEGORY, + CATEGORY_JSON_END_CATEGORY) + +#undef CATEGORY_JSON_BEGIN_CATEGORY +#undef CATEGORY_JSON_SUBCATEGORY +#undef CATEGORY_JSON_END_CATEGORY +} + +static void StreamMarkerSchema(SpliceableJSONWriter& aWriter) { + // Get an array view with all registered marker-type-specific functions. + base_profiler_markers_detail::Streaming::LockedMarkerTypeFunctionsList + markerTypeFunctionsArray; + // List of streamed marker names, this is used to spot duplicates. + std::set names; + // Stream the display schema for each different one. (Duplications may come + // from the same code potentially living in different libraries.) + for (const auto& markerTypeFunctions : markerTypeFunctionsArray) { + auto name = markerTypeFunctions.mMarkerTypeNameFunction(); + // std::set.insert(T&&) returns a pair, its `second` is true if the element + // was actually inserted (i.e., it was not there yet.) + const bool didInsert = + names.insert(std::string(name.data(), name.size())).second; + if (didInsert) { + markerTypeFunctions.mMarkerSchemaFunction().Stream(aWriter, name); + } + } + + // Now stream the Rust marker schemas. Passing the names set as a void pointer + // as well, so we can continue checking if the schemes are added already in + // the Rust side. + profiler::ffi::gecko_profiler_stream_marker_schemas( + &aWriter, static_cast(&names)); +} + +// Some meta information that is better recorded before streaming the profile. +// This is *not* intended to be cached, as some values could change between +// profiling sessions. 
+struct PreRecordedMetaInformation { + bool mAsyncStacks; + + // This struct should only live on the stack, so it's fine to use Auto + // strings. + nsAutoCString mHttpPlatform; + nsAutoCString mHttpOscpu; + nsAutoCString mHttpMisc; + + nsAutoCString mRuntimeABI; + nsAutoCString mRuntimeToolkit; + + nsAutoCString mAppInfoProduct; + nsAutoCString mAppInfoAppBuildID; + nsAutoCString mAppInfoSourceURL; + + int32_t mProcessInfoCpuCount; + int32_t mProcessInfoCpuCores; + nsAutoCString mProcessInfoCpuName; +}; + +// This function should be called out of the profiler lock. +// It gathers non-trivial data that doesn't require the profiler to stop, or for +// which the request could theoretically deadlock if the profiler is locked. +static PreRecordedMetaInformation PreRecordMetaInformation() { + MOZ_ASSERT(!PSAutoLock::IsLockedOnCurrentThread()); + + PreRecordedMetaInformation info = {}; // Aggregate-init all fields. + + if (!NS_IsMainThread()) { + // Leave these properties out if we're not on the main thread. + // At the moment, the only case in which this function is called on a + // background thread is if we're in a content process and are going to + // send this profile to the parent process. In that case, the parent + // process profile's "meta" object already has the rest of the properties, + // and the parent process profile is dumped on that process's main thread. + return info; + } + + info.mAsyncStacks = Preferences::GetBool("javascript.options.asyncstack"); + + nsresult res; + + if (nsCOMPtr http = + do_GetService(NS_NETWORK_PROTOCOL_CONTRACTID_PREFIX "http", &res); + !NS_FAILED(res) && http) { + Unused << http->GetPlatform(info.mHttpPlatform); + +#if defined(GP_OS_darwin) + // On Mac, the http "oscpu" is capped at 10.15, so we need to get the real + // OS version directly. + int major = 0; + int minor = 0; + int bugfix = 0; + nsCocoaFeatures::GetSystemVersion(major, minor, bugfix); + if (major != 0) { + info.mHttpOscpu.AppendLiteral("macOS "); + info.mHttpOscpu.AppendInt(major); + info.mHttpOscpu.AppendLiteral("."); + info.mHttpOscpu.AppendInt(minor); + info.mHttpOscpu.AppendLiteral("."); + info.mHttpOscpu.AppendInt(bugfix); + } else +#endif +#if defined(GP_OS_windows) + // On Windows, the http "oscpu" is capped at Windows 10, so we need to get + // the real OS version directly. + OSVERSIONINFO ovi = {sizeof(OSVERSIONINFO)}; + if (GetVersionEx(&ovi)) { + info.mHttpOscpu.AppendLiteral("Windows "); + // The major version returned for Windows 11 is 10, but we can + // identify it from the build number. + info.mHttpOscpu.AppendInt( + ovi.dwBuildNumber >= 22000 ? 11 : int32_t(ovi.dwMajorVersion)); + info.mHttpOscpu.AppendLiteral("."); + info.mHttpOscpu.AppendInt(int32_t(ovi.dwMinorVersion)); +# if defined(_ARM64_) + info.mHttpOscpu.AppendLiteral(" Arm64"); +# endif + info.mHttpOscpu.AppendLiteral("; build="); + info.mHttpOscpu.AppendInt(int32_t(ovi.dwBuildNumber)); + } else +#endif + { + Unused << http->GetOscpu(info.mHttpOscpu); + } + + // Firefox version is capped to 109.0 in the http "misc" field due to some + // webcompat issues (Bug 1805967). We need to put the real version instead. 
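+    // The two appends below build a value of the form "rv:<MOZILLA_UAVERSION>",
+    // e.g. "rv:115.0" for a 115-based build (the exact text depends on the
+    // MOZILLA_UAVERSION macro at build time).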
+ info.mHttpMisc.AssignLiteral("rv:"); + info.mHttpMisc.AppendLiteral(MOZILLA_UAVERSION); + } + + if (nsCOMPtr runtime = + do_GetService("@mozilla.org/xre/runtime;1"); + runtime) { + Unused << runtime->GetXPCOMABI(info.mRuntimeABI); + Unused << runtime->GetWidgetToolkit(info.mRuntimeToolkit); + } + + if (nsCOMPtr appInfo = + do_GetService("@mozilla.org/xre/app-info;1"); + appInfo) { + Unused << appInfo->GetName(info.mAppInfoProduct); + Unused << appInfo->GetAppBuildID(info.mAppInfoAppBuildID); + Unused << appInfo->GetSourceURL(info.mAppInfoSourceURL); + } + + ProcessInfo processInfo = {}; // Aggregate-init all fields to false/zeroes. + if (NS_SUCCEEDED(CollectProcessInfo(processInfo))) { + info.mProcessInfoCpuCount = processInfo.cpuCount; + info.mProcessInfoCpuCores = processInfo.cpuCores; + info.mProcessInfoCpuName = processInfo.cpuName; + } + + return info; +} + +// Implemented in platform-specific cpps, to add object properties describing +// the units of CPU measurements in samples. +static void StreamMetaPlatformSampleUnits(PSLockRef aLock, + SpliceableJSONWriter& aWriter); + +static void StreamMetaJSCustomObject( + PSLockRef aLock, SpliceableJSONWriter& aWriter, bool aIsShuttingDown, + const PreRecordedMetaInformation& aPreRecordedMetaInformation) { + MOZ_RELEASE_ASSERT(CorePS::Exists() && ActivePS::Exists(aLock)); + + aWriter.IntProperty("version", 27); + + // The "startTime" field holds the number of milliseconds since midnight + // January 1, 1970 GMT. This grotty code computes (Now - (Now - + // ProcessStartTime)) to convert CorePS::ProcessStartTime() into that form. + // Note: This is the only absolute time in the profile! All other timestamps + // are relative to this startTime. + TimeDuration delta = TimeStamp::Now() - CorePS::ProcessStartTime(); + aWriter.DoubleProperty( + "startTime", + static_cast(PR_Now() / 1000.0 - delta.ToMilliseconds())); + + aWriter.DoubleProperty("profilingStartTime", (ActivePS::ProfilingStartTime() - + CorePS::ProcessStartTime()) + .ToMilliseconds()); + + if (const TimeStamp contentEarliestTime = + ActivePS::Buffer(aLock) + .UnderlyingChunkedBuffer() + .GetEarliestChunkStartTimeStamp(); + !contentEarliestTime.IsNull()) { + aWriter.DoubleProperty( + "contentEarliestTime", + (contentEarliestTime - CorePS::ProcessStartTime()).ToMilliseconds()); + } else { + aWriter.NullProperty("contentEarliestTime"); + } + + const double profilingEndTime = profiler_time(); + aWriter.DoubleProperty("profilingEndTime", profilingEndTime); + + if (aIsShuttingDown) { + aWriter.DoubleProperty("shutdownTime", profilingEndTime); + } else { + aWriter.NullProperty("shutdownTime"); + } + + aWriter.StartArrayProperty("categories"); + StreamCategories(aWriter); + aWriter.EndArray(); + + aWriter.StartArrayProperty("markerSchema"); + StreamMarkerSchema(aWriter); + aWriter.EndArray(); + + ActivePS::WriteActiveConfiguration(aLock, aWriter, + MakeStringSpan("configuration")); + + if (!NS_IsMainThread()) { + // Leave the rest of the properties out if we're not on the main thread. + // At the moment, the only case in which this function is called on a + // background thread is if we're in a content process and are going to + // send this profile to the parent process. In that case, the parent + // process profile's "meta" object already has the rest of the properties, + // and the parent process profile is dumped on that process's main thread. 
+ return; + } + + aWriter.DoubleProperty("interval", ActivePS::Interval(aLock)); + aWriter.IntProperty("stackwalk", ActivePS::FeatureStackWalk(aLock)); + +#ifdef DEBUG + aWriter.IntProperty("debug", 1); +#else + aWriter.IntProperty("debug", 0); +#endif + + aWriter.IntProperty("gcpoison", JS::IsGCPoisoning() ? 1 : 0); + + aWriter.IntProperty("asyncstack", aPreRecordedMetaInformation.mAsyncStacks); + + aWriter.IntProperty("processType", XRE_GetProcessType()); + + aWriter.StringProperty("updateChannel", MOZ_STRINGIFY(MOZ_UPDATE_CHANNEL)); + + if (!aPreRecordedMetaInformation.mHttpPlatform.IsEmpty()) { + aWriter.StringProperty("platform", + aPreRecordedMetaInformation.mHttpPlatform); + } + if (!aPreRecordedMetaInformation.mHttpOscpu.IsEmpty()) { + aWriter.StringProperty("oscpu", aPreRecordedMetaInformation.mHttpOscpu); + } + if (!aPreRecordedMetaInformation.mHttpMisc.IsEmpty()) { + aWriter.StringProperty("misc", aPreRecordedMetaInformation.mHttpMisc); + } + + if (!aPreRecordedMetaInformation.mRuntimeABI.IsEmpty()) { + aWriter.StringProperty("abi", aPreRecordedMetaInformation.mRuntimeABI); + } + if (!aPreRecordedMetaInformation.mRuntimeToolkit.IsEmpty()) { + aWriter.StringProperty("toolkit", + aPreRecordedMetaInformation.mRuntimeToolkit); + } + + if (!aPreRecordedMetaInformation.mAppInfoProduct.IsEmpty()) { + aWriter.StringProperty("product", + aPreRecordedMetaInformation.mAppInfoProduct); + } + if (!aPreRecordedMetaInformation.mAppInfoAppBuildID.IsEmpty()) { + aWriter.StringProperty("appBuildID", + aPreRecordedMetaInformation.mAppInfoAppBuildID); + } + if (!aPreRecordedMetaInformation.mAppInfoSourceURL.IsEmpty()) { + aWriter.StringProperty("sourceURL", + aPreRecordedMetaInformation.mAppInfoSourceURL); + } + + if (!aPreRecordedMetaInformation.mProcessInfoCpuName.IsEmpty()) { + aWriter.StringProperty("CPUName", + aPreRecordedMetaInformation.mProcessInfoCpuName); + } + if (aPreRecordedMetaInformation.mProcessInfoCpuCores > 0) { + aWriter.IntProperty("physicalCPUs", + aPreRecordedMetaInformation.mProcessInfoCpuCores); + } + if (aPreRecordedMetaInformation.mProcessInfoCpuCount > 0) { + aWriter.IntProperty("logicalCPUs", + aPreRecordedMetaInformation.mProcessInfoCpuCount); + } + +#if defined(GP_OS_android) + jni::String::LocalRef deviceInformation = + java::GeckoJavaSampler::GetDeviceInformation(); + aWriter.StringProperty("device", deviceInformation->ToCString()); +#endif + + aWriter.StartObjectProperty("sampleUnits"); + { + aWriter.StringProperty("time", "ms"); + aWriter.StringProperty("eventDelay", "ms"); + StreamMetaPlatformSampleUnits(aLock, aWriter); + } + aWriter.EndObject(); + + // We should avoid collecting extension metadata for profiler when there is no + // observer service, since a ExtensionPolicyService could not be created then. 
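+  // The block below writes a compact table: a "schema" object mapping column
+  // names to indices, then a "data" array of rows. Rough sketch of the
+  // expected output, with placeholder values and field order taken from the
+  // WriteField calls:
+  //   "extensions": { "schema": { "id": 0, "name": 1, "baseURL": 2 },
+  //                   "data": [["some@extension.id", "Some Name",
+  //                             "moz-extension://<uuid>/"], ...] }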
+ if (nsCOMPtr os = services::GetObserverService()) { + aWriter.StartObjectProperty("extensions"); + { + { + JSONSchemaWriter schema(aWriter); + schema.WriteField("id"); + schema.WriteField("name"); + schema.WriteField("baseURL"); + } + + aWriter.StartArrayProperty("data"); + { + nsTArray> exts; + ExtensionPolicyService::GetSingleton().GetAll(exts); + + for (auto& ext : exts) { + aWriter.StartArrayElement(); + + nsAutoString id; + ext->GetId(id); + aWriter.StringElement(NS_ConvertUTF16toUTF8(id)); + + aWriter.StringElement(NS_ConvertUTF16toUTF8(ext->Name())); + + auto url = ext->GetURL(u""_ns); + if (url.isOk()) { + aWriter.StringElement(NS_ConvertUTF16toUTF8(url.unwrap())); + } + + aWriter.EndArray(); + } + } + aWriter.EndArray(); + } + aWriter.EndObject(); + } +} + +static void StreamPages(PSLockRef aLock, SpliceableJSONWriter& aWriter) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + ActivePS::DiscardExpiredPages(aLock); + for (const auto& page : ActivePS::ProfiledPages(aLock)) { + page->StreamJSON(aWriter); + } +} + +#if defined(GP_OS_android) +template +static bool StartsWith(const nsACString& string, const char (&prefix)[N]) { + if (N - 1 > string.Length()) { + return false; + } + return memcmp(string.Data(), prefix, N - 1) == 0; +} + +static JS::ProfilingCategoryPair InferJavaCategory(nsACString& aName) { + if (aName.EqualsLiteral("android.os.MessageQueue.nativePollOnce()")) { + return JS::ProfilingCategoryPair::IDLE; + } + if (aName.EqualsLiteral("java.lang.Object.wait()")) { + return JS::ProfilingCategoryPair::JAVA_BLOCKED; + } + if (StartsWith(aName, "android.") || StartsWith(aName, "com.android.")) { + return JS::ProfilingCategoryPair::JAVA_ANDROID; + } + if (StartsWith(aName, "mozilla.") || StartsWith(aName, "org.mozilla.")) { + return JS::ProfilingCategoryPair::JAVA_MOZILLA; + } + if (StartsWith(aName, "java.") || StartsWith(aName, "sun.") || + StartsWith(aName, "com.sun.")) { + return JS::ProfilingCategoryPair::JAVA_LANGUAGE; + } + if (StartsWith(aName, "kotlin.") || StartsWith(aName, "kotlinx.")) { + return JS::ProfilingCategoryPair::JAVA_KOTLIN; + } + if (StartsWith(aName, "androidx.")) { + return JS::ProfilingCategoryPair::JAVA_ANDROIDX; + } + return JS::ProfilingCategoryPair::OTHER; +} + +// Marker type for Java markers without any details. +struct JavaMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("Java"); + } + static void StreamJSONMarkerData( + baseprofiler::SpliceableJSONWriter& aWriter) {} + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::TimelineOverview, MS::Location::MarkerChart, + MS::Location::MarkerTable}; + schema.SetAllLabels("{marker.name}"); + return schema; + } +}; + +// Marker type for Java markers with a detail field. +struct JavaMarkerWithDetails { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("JavaWithDetails"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + const ProfilerString8View& aText) { + // This (currently) needs to be called "name" to be searchable on the + // front-end. 
+ aWriter.StringProperty("name", aText); + } + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::TimelineOverview, MS::Location::MarkerChart, + MS::Location::MarkerTable}; + schema.SetTooltipLabel("{marker.name}"); + schema.SetChartLabel("{marker.data.name}"); + schema.SetTableLabel("{marker.name} - {marker.data.name}"); + schema.AddKeyLabelFormatSearchable("name", "Details", MS::Format::String, + MS::Searchable::Searchable); + return schema; + } +}; + +static void CollectJavaThreadProfileData( + nsTArray& javaThreads, + ProfileBuffer& aProfileBuffer) { + // Retrieve metadata about the threads. + const auto threadCount = java::GeckoJavaSampler::GetRegisteredThreadCount(); + for (int i = 0; i < threadCount; i++) { + javaThreads.AppendElement( + java::GeckoJavaSampler::GetRegisteredThreadInfo(i)); + } + + // locked_profiler_start uses sample count is 1000 for Java thread. + // This entry size is enough now, but we might have to estimate it + // if we can customize it + // Pass the samples + int sampleId = 0; + while (true) { + const auto threadId = java::GeckoJavaSampler::GetThreadId(sampleId); + double sampleTime = java::GeckoJavaSampler::GetSampleTime(sampleId); + if (threadId == 0 || sampleTime == 0.0) { + break; + } + + aProfileBuffer.AddThreadIdEntry(ProfilerThreadId::FromNumber(threadId)); + aProfileBuffer.AddEntry(ProfileBufferEntry::Time(sampleTime)); + int frameId = 0; + while (true) { + jni::String::LocalRef frameName = + java::GeckoJavaSampler::GetFrameName(sampleId, frameId++); + if (!frameName) { + break; + } + nsCString frameNameString = frameName->ToCString(); + + auto categoryPair = InferJavaCategory(frameNameString); + aProfileBuffer.CollectCodeLocation("", frameNameString.get(), 0, 0, + Nothing(), Nothing(), + Some(categoryPair)); + } + sampleId++; + } + + // Pass the markers now + while (true) { + // Gets the data from the Android UI thread only. + java::GeckoJavaSampler::Marker::LocalRef marker = + java::GeckoJavaSampler::PollNextMarker(); + if (!marker) { + // All markers are transferred. + break; + } + + // Get all the marker information from the Java thread using JNI. + const auto threadId = ProfilerThreadId::FromNumber(marker->GetThreadId()); + nsCString markerName = marker->GetMarkerName()->ToCString(); + jni::String::LocalRef text = marker->GetMarkerText(); + TimeStamp startTime = + CorePS::ProcessStartTime() + + TimeDuration::FromMilliseconds(marker->GetStartTime()); + + double endTimeMs = marker->GetEndTime(); + // A marker can be either a duration with start and end, or a point in time + // with only startTime. If endTime is 0, this means it's a point in time. + TimeStamp endTime = endTimeMs == 0 + ? startTime + : CorePS::ProcessStartTime() + + TimeDuration::FromMilliseconds(endTimeMs); + MarkerTiming timing = endTimeMs == 0 + ? MarkerTiming::InstantAt(startTime) + : MarkerTiming::Interval(startTime, endTime); + + if (!text) { + // This marker doesn't have a text. + AddMarkerToBuffer(aProfileBuffer.UnderlyingChunkedBuffer(), markerName, + geckoprofiler::category::JAVA_ANDROID, + {MarkerThreadId(threadId), std::move(timing)}, + JavaMarker{}); + } else { + // This marker has a text. 
+ AddMarkerToBuffer(aProfileBuffer.UnderlyingChunkedBuffer(), markerName, + geckoprofiler::category::JAVA_ANDROID, + {MarkerThreadId(threadId), std::move(timing)}, + JavaMarkerWithDetails{}, text->ToCString()); + } + } +} +#endif + +UniquePtr +profiler_code_address_service_for_presymbolication() { + static const bool preSymbolicate = []() { + const char* symbolicate = getenv("MOZ_PROFILER_SYMBOLICATE"); + return symbolicate && symbolicate[0] != '\0'; + }(); + return preSymbolicate ? MakeUnique() : nullptr; +} + +static ProfilerResult +locked_profiler_stream_json_for_this_process( + PSLockRef aLock, SpliceableJSONWriter& aWriter, double aSinceTime, + const PreRecordedMetaInformation& aPreRecordedMetaInformation, + bool aIsShuttingDown, ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + LOG("locked_profiler_stream_json_for_this_process"); + +#ifdef DEBUG + PRIntervalTime slowWithSleeps = 0; + if (!XRE_IsParentProcess()) { + for (const auto& filter : ActivePS::Filters(aLock)) { + if (filter == "test-debug-child-slow-json") { + LOG("test-debug-child-slow-json"); + // There are 10 slow-downs below, each will sleep 250ms, for a total of + // 2.5s, which should trigger the first progress request after 1s, and + // the next progress which will have advanced further, so this profile + // shouldn't get dropped. + slowWithSleeps = PR_MillisecondsToInterval(250); + } else if (filter == "test-debug-child-very-slow-json") { + LOG("test-debug-child-very-slow-json"); + // Wait for more than 2s without any progress, which should get this + // profile discarded. + PR_Sleep(PR_SecondsToInterval(5)); + } + } + } +# define SLOW_DOWN_FOR_TESTING() \ + if (slowWithSleeps != 0) { \ + DEBUG_LOG("progress=%.0f%%, sleep...", \ + aProgressLogger.GetGlobalProgress().ToDouble() * 100.0); \ + PR_Sleep(slowWithSleeps); \ + } +#else // #ifdef DEBUG +# define SLOW_DOWN_FOR_TESTING() /* No slow-downs */ +#endif // #ifdef DEBUG #else + + MOZ_RELEASE_ASSERT(CorePS::Exists() && ActivePS::Exists(aLock)); + + AUTO_PROFILER_STATS(locked_profiler_stream_json_for_this_process); + + const double collectionStartMs = profiler_time(); + + ProfileBuffer& buffer = ActivePS::Buffer(aLock); + + aProgressLogger.SetLocalProgress(1_pc, "Locked profile buffer"); + + SLOW_DOWN_FOR_TESTING(); + + // If there is a set "Window length", discard older data. + Maybe durationS = ActivePS::Duration(aLock); + if (durationS.isSome()) { + const double durationStartMs = collectionStartMs - *durationS * 1000; + buffer.DiscardSamplesBeforeTime(durationStartMs); + } + aProgressLogger.SetLocalProgress(2_pc, "Discarded old data"); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + +#if defined(GP_OS_android) + // Java thread profile data should be collected before serializing the meta + // object. This is because Java thread adds some markers with marker schema + // objects. And these objects should be added before the serialization of the + // `profile.meta.markerSchema` array, so these marker schema objects can also + // be serialized properly. That's why java thread profile data needs to be + // done before everything. + + // We are allocating it chunk by chunk. So this will not allocate 64 MiB + // at once. This size should be more than enough for java threads. + // This buffer is being created for each process but Android has + // relatively fewer processes compared to desktop, so it's okay here. 
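+  // (The two arguments passed below are taken to be the overall byte limit and
+  // the size of each chunk, i.e. a 64 MiB buffer filled 1 MiB at a time; see
+  // ProfileBufferChunkManagerWithLocalLimit for the authoritative meaning.)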
+ mozilla::ProfileBufferChunkManagerWithLocalLimit javaChunkManager( + 64 * 1024 * 1024, 1024 * 1024); + ProfileChunkedBuffer javaBufferManager( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, javaChunkManager); + ProfileBuffer javaBuffer(javaBufferManager); + + nsTArray javaThreads; + + if (ActivePS::FeatureJava(aLock)) { + CollectJavaThreadProfileData(javaThreads, javaBuffer); + aProgressLogger.SetLocalProgress(3_pc, "Collected Java thread"); + } +#endif + + // Put shared library info + aWriter.StartArrayProperty("libs"); + SharedLibraryInfo sharedLibraryInfo = SharedLibraryInfo::GetInfoForSelf(); + sharedLibraryInfo.SortByAddress(); + AppendSharedLibraries(aWriter, sharedLibraryInfo); + aWriter.EndArray(); + aProgressLogger.SetLocalProgress(4_pc, "Wrote library information"); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + + // Put meta data + aWriter.StartObjectProperty("meta"); + { + StreamMetaJSCustomObject(aLock, aWriter, aIsShuttingDown, + aPreRecordedMetaInformation); + } + aWriter.EndObject(); + aProgressLogger.SetLocalProgress(5_pc, "Wrote profile metadata"); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + + // Put page data + aWriter.StartArrayProperty("pages"); + { StreamPages(aLock, aWriter); } + aWriter.EndArray(); + aProgressLogger.SetLocalProgress(6_pc, "Wrote pages"); + + buffer.StreamProfilerOverheadToJSON( + aWriter, CorePS::ProcessStartTime(), aSinceTime, + aProgressLogger.CreateSubLoggerTo(10_pc, "Wrote profiler overheads")); + + buffer.StreamCountersToJSON( + aWriter, CorePS::ProcessStartTime(), aSinceTime, + aProgressLogger.CreateSubLoggerTo(14_pc, "Wrote counters")); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + + // Lists the samples for each thread profile + aWriter.StartArrayProperty("threads"); + { + ActivePS::DiscardExpiredDeadProfiledThreads(aLock); + aProgressLogger.SetLocalProgress(15_pc, "Discarded expired profiles"); + + ThreadRegistry::LockedRegistry lockedRegistry; + ActivePS::ProfiledThreadList threads = + ActivePS::ProfiledThreads(lockedRegistry, aLock); + + const uint32_t threadCount = uint32_t(threads.length()); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + + // Prepare the streaming context for each thread. + ProcessStreamingContext processStreamingContext( + threadCount, aWriter.SourceFailureLatch(), CorePS::ProcessStartTime(), + aSinceTime); + for (auto&& [i, progressLogger] : aProgressLogger.CreateLoopSubLoggersTo( + 20_pc, threadCount, "Preparing thread streaming contexts...")) { + ActivePS::ProfiledThreadListElement& thread = threads[i]; + MOZ_RELEASE_ASSERT(thread.mProfiledThreadData); + processStreamingContext.AddThreadStreamingContext( + *thread.mProfiledThreadData, buffer, thread.mJSContext, aService, + std::move(progressLogger)); + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + } + + SLOW_DOWN_FOR_TESTING(); + + // Read the buffer once, and extract all samples and markers that the + // context expects. + buffer.StreamSamplesAndMarkersToJSON( + processStreamingContext, aProgressLogger.CreateSubLoggerTo( + "Processing samples and markers...", 80_pc, + "Processed samples and markers")); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + SLOW_DOWN_FOR_TESTING(); + + // Stream each thread from the pre-filled context. 
+ ThreadStreamingContext* const contextListBegin = + processStreamingContext.begin(); + MOZ_ASSERT(uint32_t(processStreamingContext.end() - contextListBegin) == + threadCount); + for (auto&& [i, progressLogger] : aProgressLogger.CreateLoopSubLoggersTo( + 92_pc, threadCount, "Streaming threads...")) { + ThreadStreamingContext& threadStreamingContext = contextListBegin[i]; + threadStreamingContext.FinalizeWriter(); + threadStreamingContext.mProfiledThreadData.StreamJSON( + std::move(threadStreamingContext), aWriter, + CorePS::ProcessName(aLock), CorePS::ETLDplus1(aLock), + CorePS::ProcessStartTime(), aService, std::move(progressLogger)); + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + } + aProgressLogger.SetLocalProgress(92_pc, "Wrote samples and markers"); + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(aLock)) { + for (java::GeckoJavaSampler::ThreadInfo::LocalRef& threadInfo : + javaThreads) { + ProfiledThreadData threadData(ThreadRegistrationInfo{ + threadInfo->GetName()->ToCString().BeginReading(), + ProfilerThreadId::FromNumber(threadInfo->GetId()), false, + CorePS::ProcessStartTime()}); + + threadData.StreamJSON( + javaBuffer, nullptr, aWriter, CorePS::ProcessName(aLock), + CorePS::ETLDplus1(aLock), CorePS::ProcessStartTime(), aSinceTime, + nullptr, + aProgressLogger.CreateSubLoggerTo("Streaming Java thread...", 96_pc, + "Streamed Java thread")); + } + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + } else { + aProgressLogger.SetLocalProgress(96_pc, "No Java thread"); + } +#endif + + UniquePtr baseProfileThreads = + ActivePS::MoveBaseProfileThreads(aLock); + if (baseProfileThreads) { + aWriter.Splice(MakeStringSpan(baseProfileThreads.get())); + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + aProgressLogger.SetLocalProgress(97_pc, "Wrote baseprofiler data"); + } else { + aProgressLogger.SetLocalProgress(97_pc, "No baseprofiler data"); + } + } + aWriter.EndArray(); + + SLOW_DOWN_FOR_TESTING(); + + aWriter.StartArrayProperty("pausedRanges"); + { + buffer.StreamPausedRangesToJSON( + aWriter, aSinceTime, + aProgressLogger.CreateSubLoggerTo("Streaming pauses...", 99_pc, + "Streamed pauses")); + } + aWriter.EndArray(); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + + ProfilingLog::Access([&](Json::Value& aProfilingLogObject) { + aProfilingLogObject[Json::StaticString{ + "profilingLogEnd" TIMESTAMP_JSON_SUFFIX}] = ProfilingLog::Timestamp(); + + aWriter.StartObjectProperty("profilingLog"); + { + nsAutoCString pid; + pid.AppendInt(int64_t(profiler_current_process_id().ToNumber())); + Json::String logString = ToCompactString(aProfilingLogObject); + aWriter.SplicedJSONProperty(pid, logString); + } + aWriter.EndObject(); + }); + + const double collectionEndMs = profiler_time(); + + // Record timestamps for the collection into the buffer, so that consumers + // know why we didn't collect any samples for its duration. + // We put these entries into the buffer after we've collected the profile, + // so they'll be visible for the *next* profile collection (if they haven't + // been overwritten due to buffer wraparound by then). 
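+  // (Presumably these collection ranges are what StreamPausedRangesToJSON(),
+  // used for the "pausedRanges" array above, reports the next time a profile
+  // is streamed.)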
+ buffer.AddEntry(ProfileBufferEntry::CollectionStart(collectionStartMs)); + buffer.AddEntry(ProfileBufferEntry::CollectionEnd(collectionEndMs)); + +#ifdef DEBUG + if (slowWithSleeps != 0) { + LOG("locked_profiler_stream_json_for_this_process done"); + } +#endif // DEBUG + + return ProfileGenerationAdditionalInformation{std::move(sharedLibraryInfo)}; +} + +// Keep this internal function non-static, so it may be used by tests. +ProfilerResult +do_profiler_stream_json_for_this_process( + SpliceableJSONWriter& aWriter, double aSinceTime, bool aIsShuttingDown, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + LOG("profiler_stream_json_for_this_process"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + const auto preRecordedMetaInformation = PreRecordMetaInformation(); + + aProgressLogger.SetLocalProgress(2_pc, "PreRecordMetaInformation done"); + + if (profiler_is_active()) { + invoke_profiler_state_change_callbacks(ProfilingState::GeneratingProfile); + } + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return Err(ProfilerError::IsInactive); + } + + ProfileGenerationAdditionalInformation additionalInfo; + MOZ_TRY_VAR( + additionalInfo, + locked_profiler_stream_json_for_this_process( + lock, aWriter, aSinceTime, preRecordedMetaInformation, + aIsShuttingDown, aService, + aProgressLogger.CreateSubLoggerFromTo( + 3_pc, "locked_profiler_stream_json_for_this_process started", + 100_pc, "locked_profiler_stream_json_for_this_process done"))); + + if (aWriter.Failed()) { + return Err(ProfilerError::JsonGenerationFailed); + } + return additionalInfo; +} + +ProfilerResult +profiler_stream_json_for_this_process(SpliceableJSONWriter& aWriter, + double aSinceTime, bool aIsShuttingDown, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + MOZ_RELEASE_ASSERT( + !XRE_IsParentProcess() || NS_IsMainThread(), + "In the parent process, profiles should only be generated from the main " + "thread, otherwise they will be incomplete."); + + ProfileGenerationAdditionalInformation additionalInfo; + MOZ_TRY_VAR(additionalInfo, do_profiler_stream_json_for_this_process( + aWriter, aSinceTime, aIsShuttingDown, + aService, std::move(aProgressLogger))); + + return additionalInfo; +} + +// END saving/streaming code +//////////////////////////////////////////////////////////////////////// + +static char FeatureCategory(uint32_t aFeature) { + if (aFeature & DefaultFeatures()) { + if (aFeature & AvailableFeatures()) { + return 'D'; + } + return 'd'; + } + + if (aFeature & StartupExtraDefaultFeatures()) { + if (aFeature & AvailableFeatures()) { + return 'S'; + } + return 's'; + } + + if (aFeature & AvailableFeatures()) { + return '-'; + } + return 'x'; +} + +static void PrintUsage() { + MOZ_RELEASE_ASSERT(NS_IsMainThread()); + + printf( + "\n" + "Profiler environment variable usage:\n" + "\n" + " MOZ_PROFILER_HELP\n" + " If set to any value, prints this message.\n" + " Use MOZ_BASE_PROFILER_HELP for BaseProfiler help.\n" + "\n" + " MOZ_LOG\n" + " Enables logging. 
The levels of logging available are\n" + " 'prof:3' (least verbose), 'prof:4', 'prof:5' (most verbose).\n" + "\n" + " MOZ_PROFILER_STARTUP\n" + " If set to any value other than '' or '0'/'N'/'n', starts the\n" + " profiler immediately on start-up.\n" + " Useful if you want profile code that runs very early.\n" + "\n" + " MOZ_PROFILER_STARTUP_ENTRIES=<%u..%u>\n" + " If MOZ_PROFILER_STARTUP is set, specifies the number of entries per\n" + " process in the profiler's circular buffer when the profiler is first\n" + " started.\n" + " If unset, the platform default is used:\n" + " %u entries per process, or %u when MOZ_PROFILER_STARTUP is set.\n" + " (%u bytes per entry -> %u or %u total bytes per process)\n" + " Optional units in bytes: KB, KiB, MB, MiB, GB, GiB\n" + "\n" + " MOZ_PROFILER_STARTUP_DURATION=<1..>\n" + " If MOZ_PROFILER_STARTUP is set, specifies the maximum life time of\n" + " entries in the the profiler's circular buffer when the profiler is\n" + " first started, in seconds.\n" + " If unset, the life time of the entries will only be restricted by\n" + " MOZ_PROFILER_STARTUP_ENTRIES (or its default value), and no\n" + " additional time duration restriction will be applied.\n" + "\n" + " MOZ_PROFILER_STARTUP_INTERVAL=<1..%d>\n" + " If MOZ_PROFILER_STARTUP is set, specifies the sample interval,\n" + " measured in milliseconds, when the profiler is first started.\n" + " If unset, the platform default is used.\n" + "\n" + " MOZ_PROFILER_STARTUP_FEATURES_BITFIELD=\n" + " If MOZ_PROFILER_STARTUP is set, specifies the profiling features, as\n" + " the integer value of the features bitfield.\n" + " If unset, the value from MOZ_PROFILER_STARTUP_FEATURES is used.\n" + "\n" + " MOZ_PROFILER_STARTUP_FEATURES=\n" + " If MOZ_PROFILER_STARTUP is set, specifies the profiling features, as\n" + " a comma-separated list of strings.\n" + " Ignored if MOZ_PROFILER_STARTUP_FEATURES_BITFIELD is set.\n" + " If unset, the platform default is used.\n" + "\n" + " Features: (x=unavailable, D/d=default/unavailable,\n" + " S/s=MOZ_PROFILER_STARTUP extra default/unavailable)\n", + unsigned(ActivePS::scMinimumBufferEntries), + unsigned(ActivePS::scMaximumBufferEntries), + unsigned(PROFILER_DEFAULT_ENTRIES.Value()), + unsigned(PROFILER_DEFAULT_STARTUP_ENTRIES.Value()), + unsigned(scBytesPerEntry), + unsigned(PROFILER_DEFAULT_ENTRIES.Value() * scBytesPerEntry), + unsigned(PROFILER_DEFAULT_STARTUP_ENTRIES.Value() * scBytesPerEntry), + PROFILER_MAX_INTERVAL); + +#define PRINT_FEATURE(n_, str_, Name_, desc_) \ + printf(" %c %7u: \"%s\" (%s)\n", FeatureCategory(ProfilerFeature::Name_), \ + ProfilerFeature::Name_, str_, desc_); + + PROFILER_FOR_EACH_FEATURE(PRINT_FEATURE) + +#undef PRINT_FEATURE + + printf( + " - \"default\" (All above D+S defaults)\n" + "\n" + " MOZ_PROFILER_STARTUP_FILTERS=\n" + " If MOZ_PROFILER_STARTUP is set, specifies the thread filters, as a\n" + " comma-separated list of strings. A given thread will be sampled if\n" + " any of the filters is a case-insensitive substring of the thread\n" + " name. 
If unset, a default is used.\n" + "\n" + " MOZ_PROFILER_STARTUP_ACTIVE_TAB_ID=\n" + " This variable is used to propagate the activeTabID of\n" + " the profiler init params to subprocesses.\n" + "\n" + " MOZ_PROFILER_SHUTDOWN=\n" + " If set, the profiler saves a profile to the named file on shutdown.\n" + " If the Filename contains \"%%p\", this will be replaced with the'\n" + " process id of the parent process.\n" + "\n" + " MOZ_PROFILER_SYMBOLICATE\n" + " If set, the profiler will pre-symbolicate profiles.\n" + " *Note* This will add a significant pause when gathering data, and\n" + " is intended mainly for local development.\n" + "\n" + " MOZ_PROFILER_LUL_TEST\n" + " If set to any value, runs LUL unit tests at startup.\n" + "\n" + " This platform %s native unwinding.\n" + "\n", +#if defined(HAVE_NATIVE_UNWIND) + "supports" +#else + "does not support" +#endif + ); +} + +//////////////////////////////////////////////////////////////////////// +// BEGIN Sampler + +#if defined(GP_OS_linux) || defined(GP_OS_android) +struct SigHandlerCoordinator; +#endif + +// Sampler performs setup and teardown of the state required to sample with the +// profiler. Sampler may exist when ActivePS is not present. +// +// SuspendAndSampleAndResumeThread must only be called from a single thread, +// and must not sample the thread it is being called from. A separate Sampler +// instance must be used for each thread which wants to capture samples. + +// WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +// +// With the exception of SamplerThread, all Sampler objects must be Disable-d +// before releasing the lock which was used to create them. This avoids races +// on linux with the SIGPROF signal handler. + +class Sampler { + public: + // Sets up the profiler such that it can begin sampling. + explicit Sampler(PSLockRef aLock); + + // Disable the sampler, restoring it to its previous state. This must be + // called once, and only once, before the Sampler is destroyed. + void Disable(PSLockRef aLock); + + // This method suspends and resumes the samplee thread. It calls the passed-in + // function-like object aProcessRegs (passing it a populated |const + // Registers&| arg) while the samplee thread is suspended. Note that + // the aProcessRegs function must be very careful not to do anything that + // requires a lock, since we may have interrupted the thread at any point. + // As an example, you can't call TimeStamp::Now() since on windows it + // takes a lock on the performance counter. + // + // Func must be a function-like object of type `void()`. + template + void SuspendAndSampleAndResumeThread( + PSLockRef aLock, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + const TimeStamp& aNow, const Func& aProcessRegs); + + private: +#if defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd) + // Used to restore the SIGPROF handler when ours is removed. + struct sigaction mOldSigprofHandler; + + // This process' ID. Needed as an argument for tgkill in + // SuspendAndSampleAndResumeThread. + ProfilerProcessId mMyPid; + + // The sampler thread's ID. Used to assert that it is not sampling itself, + // which would lead to deadlock. + ProfilerThreadId mSamplerTid; + + public: + // This is the one-and-only variable used to communicate between the sampler + // thread and the samplee thread's signal handler. It's static because the + // samplee thread's signal handler is static. 
+ static struct SigHandlerCoordinator* sSigHandlerCoordinator; +#endif +}; + +// END Sampler +//////////////////////////////////////////////////////////////////////// + +// Platform-specific function that retrieves per-thread CPU measurements. +static RunningTimes GetThreadRunningTimesDiff( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData); +// Platform-specific function that *may* discard CPU measurements since the +// previous call to GetThreadRunningTimesDiff, if the way to suspend threads on +// this platform may add running times to that thread. +// No-op otherwise, i.e. if suspending a thread does not add running time to it. +static void DiscardSuspendedThreadRunningTimes( + PSLockRef aLock, + ThreadRegistration::UnlockedRWForLockedProfiler& aThreadData); + +// Platform-specific function that retrieves process CPU measurements. +static RunningTimes GetProcessRunningTimesDiff( + PSLockRef aLock, RunningTimes& aPreviousRunningTimesToBeUpdated); + +// Template function to be used by `GetThreadRunningTimesDiff()` (unless some +// platform has a better way to achieve this). +// It helps perform CPU measurements and tie them to a timestamp, such that the +// measurements and timestamp are very close together. +// This is necessary, because the relative CPU usage is computed by dividing +// consecutive CPU measurements by their timestamp difference; if there was an +// unexpected big gap, it could skew this computation and produce impossible +// spikes that would hide the rest of the data. See bug 1685938 for more info. +// Note that this may call the measurement function more than once; it is +// assumed to normally be fast. +// This was verified experimentally, but there is currently no regression +// testing for it; see follow-up bug 1687402. +template +RunningTimes GetRunningTimesWithTightTimestamp( + GetCPURunningTimesFunction&& aGetCPURunningTimesFunction) { + // Once per process, compute a threshold over which running times and their + // timestamp are considered too far apart. + static const TimeDuration scMaxRunningTimesReadDuration = [&]() { + // Run the main CPU measurements + timestamp a number of times and capture + // their durations. + constexpr int loops = 128; + TimeDuration durations[loops]; + RunningTimes runningTimes; + TimeStamp before = TimeStamp::Now(); + for (int i = 0; i < loops; ++i) { + AUTO_PROFILER_STATS(GetRunningTimes_MaxRunningTimesReadDuration); + aGetCPURunningTimesFunction(runningTimes); + const TimeStamp after = TimeStamp::Now(); + durations[i] = after - before; + before = after; + } + // Move the median duration to the middle. + std::nth_element(&durations[0], &durations[loops / 2], &durations[loops]); + // Use median*8 as cut-off point. + // Typical durations should be around a microsecond, the cut-off should then + // be around 10 microseconds, well below the expected minimum inter-sample + // interval (observed as a few milliseconds), so overall this should keep + // cpu/interval spikes to a minimum. + return durations[loops / 2] * 8; + }(); + + // Record CPU measurements between two timestamps. + RunningTimes runningTimes; + TimeStamp before = TimeStamp::Now(); + aGetCPURunningTimesFunction(runningTimes); + TimeStamp after = TimeStamp::Now(); + const TimeDuration duration = after - before; + + // In most cases, the above should be quick enough. 
But if not (e.g., because + // of an OS context switch), repeat once: + if (MOZ_UNLIKELY(duration > scMaxRunningTimesReadDuration)) { + AUTO_PROFILER_STATS(GetRunningTimes_REDO); + RunningTimes runningTimes2; + aGetCPURunningTimesFunction(runningTimes2); + TimeStamp after2 = TimeStamp::Now(); + const TimeDuration duration2 = after2 - after; + if (duration2 < duration) { + // We did it faster, use the new results. (But it could still be slower + // than expected, see note below for why it's acceptable.) + // This must stay *after* the CPU measurements. + runningTimes2.SetPostMeasurementTimeStamp(after2); + return runningTimes2; + } + // Otherwise use the initial results, they were slow, but faster than the + // second attempt. + // This means that something bad happened twice in a row on the same thread! + // So trying more times would be unlikely to get much better, and would be + // more expensive than the precision is worth. + // At worst, it means that a spike of activity may be reported in the next + // time slice. But in the end, the cumulative work is conserved, so it + // should still be visible at about the correct time in the graph. + AUTO_PROFILER_STATS(GetRunningTimes_RedoWasWorse); + } + + // This must stay *after* the CPU measurements. + runningTimes.SetPostMeasurementTimeStamp(after); + + return runningTimes; +} + +//////////////////////////////////////////////////////////////////////// +// BEGIN SamplerThread + +// The sampler thread controls sampling and runs whenever the profiler is +// active. It periodically runs through all registered threads, finds those +// that should be sampled, then pauses and samples them. + +class SamplerThread { + public: + // Creates a sampler thread, but doesn't start it. + SamplerThread(PSLockRef aLock, uint32_t aActivityGeneration, + double aIntervalMilliseconds, uint32_t aFeatures); + ~SamplerThread(); + + // This runs on (is!) the sampler thread. + void Run(); + +#if defined(GP_OS_windows) + // This runs on (is!) the thread to spy on unregistered threads. + void RunUnregisteredThreadSpy(); +#endif + + // This runs on the main thread. + void Stop(PSLockRef aLock); + + void AppendPostSamplingCallback(PSLockRef, PostSamplingCallback&& aCallback) { + // We are under lock, so it's safe to just modify the list pointer. + // Also this means the sampler has not started its run yet, so any callback + // added now will be invoked at the end of the next loop; this guarantees + // that the callback will be invoked after at least one full sampling loop. + mPostSamplingCallbackList = MakeUnique( + std::move(mPostSamplingCallbackList), std::move(aCallback)); + } + + private: + void SpyOnUnregisteredThreads(); + + // Item containing a post-sampling callback, and a tail-list of more items. + // Using a linked list means no need to move items when adding more, and + // "stealing" the whole list is one pointer move. + struct PostSamplingCallbackListItem { + UniquePtr mPrev; + PostSamplingCallback mCallback; + + PostSamplingCallbackListItem(UniquePtr aPrev, + PostSamplingCallback&& aCallback) + : mPrev(std::move(aPrev)), mCallback(std::move(aCallback)) {} + }; + + [[nodiscard]] UniquePtr + TakePostSamplingCallbacks(PSLockRef) { + return std::move(mPostSamplingCallbackList); + } + + static void InvokePostSamplingCallbacks( + UniquePtr aCallbacks, + SamplingState aSamplingState) { + if (!aCallbacks) { + return; + } + // We want to drill down to the last element in this list, which is the + // oldest one, so that we invoke them in FIFO order. 
+ // We don't expect many callbacks, so it's safe to recurse. Note that we're + // moving-from the UniquePtr, so the tail will implicitly get destroyed. + InvokePostSamplingCallbacks(std::move(aCallbacks->mPrev), aSamplingState); + // We are going to destroy this item, so we can safely move-from the + // callback before calling it (in case it has an rvalue-ref-qualified call + // operator). + std::move(aCallbacks->mCallback)(aSamplingState); + // It may be tempting for a future maintainer to change aCallbacks into an + // rvalue reference; this will remind them not to do that! + static_assert( + std::is_same_v>, + "We need to capture the list by-value, to implicitly destroy it"); + } + + // This suspends the calling thread for the given number of microseconds. + // Best effort timing. + void SleepMicro(uint32_t aMicroseconds); + + // The sampler used to suspend and sample threads. + Sampler mSampler; + + // The activity generation, for detecting when the sampler thread must stop. + const uint32_t mActivityGeneration; + + // The interval between samples, measured in microseconds. + const int mIntervalMicroseconds; + + // The OS-specific handle for the sampler thread. +#if defined(GP_OS_windows) + HANDLE mThread; + HANDLE mUnregisteredThreadSpyThread = nullptr; + enum class SpyingState { + NoSpying, + Spy_Initializing, + // Spy is waiting for SamplerToSpy_Start or MainToSpy_Shutdown. + Spy_Waiting, + // Sampler requests spy to start working. May be pre-empted by + // MainToSpy_Shutdown. + SamplerToSpy_Start, + // Spy is currently working, cannot be interrupted, only the spy is allowed + // to change the state again. + Spy_Working, + // Main control requests spy to shut down. + MainToSpy_Shutdown, + // Spy notified main control that it's out of the loop, about to exit. + SpyToMain_ShuttingDown + }; + SpyingState mSpyingState = SpyingState::NoSpying; + // The sampler will increment this while the spy is working, then while the + // spy is waiting the sampler will decrement it until <=0 before starting the + // spy. This will ensure that the work doesn't take more than 50% of a CPU + // core. + int mDelaySpyStart = 0; + Monitor mSpyingStateMonitor MOZ_UNANNOTATED{ + "SamplerThread::mSpyingStateMonitor"}; +#elif defined(GP_OS_darwin) || defined(GP_OS_linux) || \ + defined(GP_OS_android) || defined(GP_OS_freebsd) + pthread_t mThread; +#endif + + // Post-sampling callbacks are kept in a simple linked list, which will be + // stolen by the sampler thread at the end of its next run. + UniquePtr mPostSamplingCallbackList; + +#if defined(GP_OS_windows) + bool mNoTimerResolutionChange = true; +#endif + + struct SpiedThread { + base::ProcessId mThreadId; + nsCString mName; + uint64_t mCPUTimeNs; + + SpiedThread(base::ProcessId aThreadId, const nsACString& aName, + uint64_t aCPUTimeNs) + : mThreadId(aThreadId), mName(aName), mCPUTimeNs(aCPUTimeNs) {} + + // Comparisons with just a thread id, for easy searching in an array. + friend bool operator==(const SpiedThread& aSpiedThread, + base::ProcessId aThreadId) { + return aSpiedThread.mThreadId == aThreadId; + } + friend bool operator==(base::ProcessId aThreadId, + const SpiedThread& aSpiedThread) { + return aThreadId == aSpiedThread.mThreadId; + } + }; + + // Time at which mSpiedThreads was previously updated. Null before 1st update. + TimeStamp mLastSpying; + // Unregistered threads that have been found, and are being spied on. 
+ using SpiedThreads = AutoTArray; + SpiedThreads mSpiedThreads; + + SamplerThread(const SamplerThread&) = delete; + void operator=(const SamplerThread&) = delete; +}; + +// [[nodiscard]] static +bool ActivePS::AppendPostSamplingCallback(PSLockRef aLock, + PostSamplingCallback&& aCallback) { + if (!sInstance || !sInstance->mSamplerThread) { + return false; + } + sInstance->mSamplerThread->AppendPostSamplingCallback(aLock, + std::move(aCallback)); + return true; +} + +// This function is required because we need to create a SamplerThread within +// ActivePS's constructor, but SamplerThread is defined after ActivePS. It +// could probably be removed by moving some code around. +static SamplerThread* NewSamplerThread(PSLockRef aLock, uint32_t aGeneration, + double aInterval, uint32_t aFeatures) { + return new SamplerThread(aLock, aGeneration, aInterval, aFeatures); +} + +// This function is the sampler thread. This implementation is used for all +// targets. +void SamplerThread::Run() { + PR_SetCurrentThreadName("SamplerThread"); + + // Features won't change during this SamplerThread's lifetime, so we can read + // them once and store them locally. + const uint32_t features = []() -> uint32_t { + PSAutoLock lock; + if (!ActivePS::Exists(lock)) { + // If there is no active profiler, it doesn't matter what we return, + // because this thread will exit before any feature is used. + return 0; + } + return ActivePS::Features(lock); + }(); + + // Not *no*-stack-sampling means we do want stack sampling. + const bool stackSampling = !ProfilerFeature::HasNoStackSampling(features); + + const bool cpuUtilization = ProfilerFeature::HasCPUUtilization(features); + + // Use local ProfileBuffer and underlying buffer to capture the stack. + // (This is to avoid touching the core buffer lock while a thread is + // suspended, because that thread could be working with the core buffer as + // well. + mozilla::ProfileBufferChunkManagerSingle localChunkManager( + ProfileBufferChunkManager::scExpectedMaximumStackSize); + ProfileChunkedBuffer localBuffer( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, localChunkManager); + ProfileBuffer localProfileBuffer(localBuffer); + + // Will be kept between collections, to know what each collection does. + auto previousState = localBuffer.GetState(); + + // This will be filled at every loop, to be used by the next loop to compute + // the CPU utilization between samples. + RunningTimes processRunningTimes; + + // This will be set inside the loop, from inside the lock scope, to capture + // all callbacks added before that, but none after the lock is released. + UniquePtr postSamplingCallbacks; + // This will be set inside the loop, before invoking callbacks outside. + SamplingState samplingState{}; + + const TimeDuration sampleInterval = + TimeDuration::FromMicroseconds(mIntervalMicroseconds); + const uint32_t minimumIntervalSleepUs = + static_cast(mIntervalMicroseconds / 4); + + // This is the scheduled time at which each sampling loop should start. + // It will determine the ideal next sampling start by adding the expected + // interval, unless when sampling runs late -- See end of while() loop. + TimeStamp scheduledSampleStart = TimeStamp::Now(); + + while (true) { + const TimeStamp sampleStart = TimeStamp::Now(); + + // This scope is for |lock|. It ends before we sleep below. + { + // There should be no local callbacks left from a previous loop. 
+ MOZ_ASSERT(!postSamplingCallbacks); + + PSAutoLock lock; + TimeStamp lockAcquired = TimeStamp::Now(); + + // Move all the post-sampling callbacks locally, so that new ones cannot + // sneak in between the end of the lock scope and the invocation after it. + postSamplingCallbacks = TakePostSamplingCallbacks(lock); + + if (!ActivePS::Exists(lock)) { + // Exit the `while` loop, including the lock scope, before invoking + // callbacks and returning. + samplingState = SamplingState::JustStopped; + break; + } + + // At this point profiler_stop() might have been called, and + // profiler_start() might have been called on another thread. If this + // happens the generation won't match. + if (ActivePS::Generation(lock) != mActivityGeneration) { + samplingState = SamplingState::JustStopped; + // Exit the `while` loop, including the lock scope, before invoking + // callbacks and returning. + break; + } + + ActivePS::ClearExpiredExitProfiles(lock); + + TimeStamp expiredMarkersCleaned = TimeStamp::Now(); + + if (int(gSkipSampling) <= 0 && !ActivePS::IsSamplingPaused(lock)) { + double sampleStartDeltaMs = + (sampleStart - CorePS::ProcessStartTime()).ToMilliseconds(); + ProfileBuffer& buffer = ActivePS::Buffer(lock); + + // Before sampling counters, update the process CPU counter if active. + if (ActivePS::ProcessCPUCounter* processCPUCounter = + ActivePS::MaybeProcessCPUCounter(lock); + processCPUCounter) { + RunningTimes processRunningTimesDiff = + GetProcessRunningTimesDiff(lock, processRunningTimes); + Maybe cpu = processRunningTimesDiff.GetJsonThreadCPUDelta(); + if (cpu) { + processCPUCounter->Add(static_cast(*cpu)); + } + } + + if (PowerCounters* powerCounters = ActivePS::MaybePowerCounters(lock); + powerCounters) { + powerCounters->Sample(); + } + + // handle per-process generic counters + const Vector& counters = CorePS::Counters(lock); + for (auto& counter : counters) { + if (auto sample = counter->Sample(); sample.isSampleNew) { + // create Buffer entries for each counter + buffer.AddEntry(ProfileBufferEntry::CounterId(counter)); + buffer.AddEntry(ProfileBufferEntry::Time(sampleStartDeltaMs)); +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + if (ActivePS::IsMemoryCounter(counter)) { + // For the memory counter, substract the size of our buffer to + // avoid giving the misleading impression that the memory use + // keeps on growing when it's just the profiler session that's + // using a larger buffer as it gets longer. + sample.count -= static_cast( + ActivePS::ControlledChunkManager(lock).TotalSize()); + } +#endif + // In the future, we may support keyed counters - for example, + // counters with a key which is a thread ID. For "simple" counters + // we'll just use a key of 0. + buffer.AddEntry(ProfileBufferEntry::CounterKey(0)); + buffer.AddEntry(ProfileBufferEntry::Count(sample.count)); + if (sample.number) { + buffer.AddEntry(ProfileBufferEntry::Number(sample.number)); + } + } + } + TimeStamp countersSampled = TimeStamp::Now(); + + if (stackSampling || cpuUtilization) { + samplingState = SamplingState::SamplingCompleted; + + // Prevent threads from ending (or starting) and allow access to all + // OffThreadRef's. 
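+          // In outline, the per-thread work below does the following (a
+          // pseudocode summary of the code that follows, for orientation
+          // only; it is not exhaustive):
+          //
+          //   for each thread in the registry:
+          //     skip it unless it is profiled with CPU and/or stack sampling
+          //     runningTimesDiff = CPU delta since the last loop (or just a timestamp)
+          //     if stack sampling and (asleep-duplicate is allowed or CPU delta == 0):
+          //       duplicate the previous sample and continue
+          //     add ThreadId and TimeBeforeCompactStack entries to the core buffer
+          //     store RunningTimes / eventDelay data, if any
+          //     suspend the thread, capture its stack into the local buffer, resume it
+          //     copy the captured stack into the core buffer as a CompactStack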
+ ThreadRegistry::LockedRegistry lockedRegistry; + + for (ThreadRegistry::OffThreadRef offThreadRef : lockedRegistry) { + ThreadRegistration::UnlockedRWForLockedProfiler& + unlockedThreadData = + offThreadRef.UnlockedRWForLockedProfilerRef(); + ProfiledThreadData* profiledThreadData = + unlockedThreadData.GetProfiledThreadData(lock); + if (!profiledThreadData) { + // This thread is not being profiled, continue with the next one. + continue; + } + + const ThreadProfilingFeatures whatToProfile = + unlockedThreadData.ProfilingFeatures(); + const bool threadCPUUtilization = + cpuUtilization && + DoFeaturesIntersect(whatToProfile, + ThreadProfilingFeatures::CPUUtilization); + const bool threadStackSampling = + stackSampling && + DoFeaturesIntersect(whatToProfile, + ThreadProfilingFeatures::Sampling); + if (!threadCPUUtilization && !threadStackSampling) { + // Nothing to profile on this thread, continue with the next one. + continue; + } + + const ProfilerThreadId threadId = + unlockedThreadData.Info().ThreadId(); + + const RunningTimes runningTimesDiff = [&]() { + if (!threadCPUUtilization) { + // If we don't need CPU measurements, we only need a timestamp. + return RunningTimes(TimeStamp::Now()); + } + return GetThreadRunningTimesDiff(lock, unlockedThreadData); + }(); + + const TimeStamp& now = runningTimesDiff.PostMeasurementTimeStamp(); + double threadSampleDeltaMs = + (now - CorePS::ProcessStartTime()).ToMilliseconds(); + + // If the thread is asleep and has been sampled before in the same + // sleep episode, or otherwise(*) if there was zero CPU activity + // since the previous sampling, find and copy the previous sample, + // as that's cheaper than taking a new sample. + // (*) Tech note: The asleep check is done first and always, because + // it is more reliable, and knows if it's the first asleep + // sample, which cannot be duplicated; if the test was the other + // way around, it could find zero CPU and then short-circuit + // that state-changing second-asleep-check operation, which + // could result in an unneeded sample. + // However we're using current running times (instead of copying the + // old ones) because some work could have happened. + if (threadStackSampling && + (unlockedThreadData.CanDuplicateLastSampleDueToSleep() || + runningTimesDiff.GetThreadCPUDelta() == Some(uint64_t(0)))) { + const bool dup_ok = ActivePS::Buffer(lock).DuplicateLastSample( + threadId, threadSampleDeltaMs, + profiledThreadData->LastSample(), runningTimesDiff); + if (dup_ok) { + continue; + } + } + + AUTO_PROFILER_STATS(gecko_SamplerThread_Run_DoPeriodicSample); + + // Record the global profiler buffer's range start now, before + // adding the first entry for this thread's sample. + const uint64_t bufferRangeStart = buffer.BufferRangeStart(); + + // Add the thread ID now, so we know its position in the main + // buffer, which is used by some JS data. + // (DoPeriodicSample only knows about the temporary local buffer.) + const uint64_t samplePos = buffer.AddThreadIdEntry(threadId); + profiledThreadData->LastSample() = Some(samplePos); + + // Also add the time, so it's always there after the thread ID, as + // expected by the parser. (Other stack data is optional.) + buffer.AddEntry(ProfileBufferEntry::TimeBeforeCompactStack( + threadSampleDeltaMs)); + + Maybe unresponsiveDuration_ms; + + // If we have RunningTimes data, store it before the CompactStack. + // Note: It is not stored inside the CompactStack so that it doesn't + // get incorrectly duplicated when the thread is sleeping. 
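+            // Taken together with the entries added above and below, a
+            // completed sample for one thread ends up in the core buffer
+            // roughly as (illustrative layout, not an exhaustive list of
+            // entry kinds):
+            //
+            //   ThreadId
+            //   TimeBeforeCompactStack
+            //   [RunningTimes]            (only when CPU data was measured)
+            //   [UnresponsiveDurationMs]  (only when eventDelay was computed)
+            //   CompactStack              (possibly empty if storing failed)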
+ if (!runningTimesDiff.IsEmpty()) { + profiler_get_core_buffer().PutObjects( + ProfileBufferEntry::Kind::RunningTimes, runningTimesDiff); + } + + if (threadStackSampling) { + ThreadRegistry::OffThreadRef::RWFromAnyThreadWithLock + lockedThreadData = offThreadRef.GetLockedRWFromAnyThread(); + // Suspend the thread and collect its stack data in the local + // buffer. + mSampler.SuspendAndSampleAndResumeThread( + lock, lockedThreadData.DataCRef(), now, + [&](const Registers& aRegs, const TimeStamp& aNow) { + DoPeriodicSample(lock, lockedThreadData.DataCRef(), aRegs, + samplePos, bufferRangeStart, + localProfileBuffer); + + // For "eventDelay", we want the input delay - but if + // there are no events in the input queue (or even if there + // are), we're interested in how long the delay *would* be + // for an input event now, which would be the time to finish + // the current event + the delay caused by any events + // already in the input queue (plus any High priority + // events). Events at lower priorities (in a + // PrioritizedEventQueue) than Input count for input delay + // only for the duration that they're running, since when + // they finish, any queued input event would run. + // + // Unless we record the time state of all events and queue + // states at all times, this is hard to precisely calculate, + // but we can approximate it well in post-processing with + // RunningEventDelay and RunningEventStart. + // + // RunningEventDelay is the time duration the event was + // queued before starting execution. RunningEventStart is + // the time the event started. (Note: since we care about + // Input event delays on MainThread, for + // PrioritizedEventQueues we return 0 for RunningEventDelay + // if the currently running event has a lower priority than + // Input (since Input events won't queue behind them). + // + // To directly measure this we would need to record the time + // at which the newest event currently in each queue at time + // X (the sample time) finishes running. This of course + // would require looking into the future, or recording all + // this state and then post-processing it later. If we were + // to trace every event start and end we could do this, but + // it would have significant overhead to do so (and buffer + // usage). From a recording of RunningEventDelays and + // RunningEventStarts we can infer the actual delay: + // + // clang-format off + // Event queue: D : C : B : A + // Time inserted (ms): 40 : 20 : 10 : 0 + // Run Time (ms): 30 : 100 : 40 : 30 + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 + // [A||||||||||||] + // ----------[B|||||||||||||||||] + // -------------------------[C|||||||||||||||||||||||||||||||||||||||||||||||] + // -----------------------------------------------------------------[D|||||||||...] 
+ // + // Calculate the delay of a new event added at time t: (run every sample) + // TimeSinceRunningEventBlockedInputEvents = RunningEventDelay + (now - RunningEventStart); + // effective_submission = now - TimeSinceRunningEventBlockedInputEvents; + // delta = (now - last_sample_time); + // last_sample_time = now; + // for (t=effective_submission to now) { + // delay[t] += delta; + // } + // + // Can be reduced in overhead by: + // TimeSinceRunningEventBlockedInputEvents = RunningEventDelay + (now - RunningEventStart); + // effective_submission = now - TimeSinceRunningEventBlockedInputEvents; + // if (effective_submission != last_submission) { + // delta = (now - last_submision); + // // this loop should be made to match each sample point in the range + // // intead of assuming 1ms sampling as this pseudocode does + // for (t=last_submission to effective_submission-1) { + // delay[t] += delta; + // delta -= 1; // assumes 1ms; adjust as needed to match for() + // } + // last_submission = effective_submission; + // } + // + // Time Head of queue Running Event RunningEventDelay Delay of Effective Started Calc (submission->now add 10ms) Final + // hypothetical Submission Running @ result + // event E + // 0 Empty A 0 30 0 0 @0=10 30 + // 10 B A 0 60 0 0 @0=20, @10=10 60 + // 20 B A 0 150 0 0 @0=30, @10=20, @20=10 150 + // 30 C B 20 140 10 30 @10=20, @20=10, @30=0 140 + // 40 C B 20 160 @10=30, @20=20... 160 + // 50 C B 20 150 150 + // 60 C B 20 140 @10=50, @20=40... 140 + // 70 D C 50 130 20 70 @20=50, @30=40... 130 + // ... + // 160 D C 50 40 @20=140, @30=130... 40 + // 170 D 140 30 40 @40=140, @50=130... (rounding) 30 + // 180 D 140 20 40 @40=150 20 + // 190 D 140 10 40 @40=160 10 + // 200 0 0 NA 0 + // + // Function Delay(t) = the time between t and the time at which a hypothetical + // event e would start executing, if e was enqueued at time t. + // + // Delay(-1) = 0 // Before A was enqueued. No wait time, can start running + // // instantly. + // Delay(0) = 30 // The hypothetical event e got enqueued just after A got + // // enqueued. It can start running at 30, when A is done. + // Delay(5) = 25 + // Delay(10) = 60 // Can start running at 70, after both A and B are done. + // Delay(19) = 51 + // Delay(20) = 150 // Can start running at 170, after A, B & C. + // Delay(25) = 145 + // Delay(30) = 170 // Can start running at 200, after A, B, C & D. + // Delay(120) = 80 + // Delay(200) = 0 // (assuming nothing was enqueued after D) + // + // For every event that gets enqueued, the Delay time will go up by the + // event's running time at the time at which the event is enqueued. + // The Delay function will be a sawtooth of the following shape: + // + // |\ |... 
+ // | \ | + // |\ | \ | + // | \ | \ | + // |\ | \ | \ | + // |\ | \| \| \ | + // | \| \ | + // _| \____| + // + // + // A more complex example with a PrioritizedEventQueue: + // + // Event queue: D : C : B : A + // Time inserted (ms): 40 : 20 : 10 : 0 + // Run Time (ms): 30 : 100 : 40 : 30 + // Priority: Input: Norm: Norm: Norm + // + // 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 + // [A||||||||||||] + // ----------[B|||||||||||||||||] + // ----------------------------------------[C|||||||||||||||||||||||||||||||||||||||||||||||] + // ---------------[D||||||||||||] + // + // + // Time Head of queue Running Event RunningEventDelay Delay of Effective Started Calc (submission->now add 10ms) Final + // hypothetical Submission Running @ result + // event + // 0 Empty A 0 30 0 0 @0=10 30 + // 10 B A 0 20 0 0 @0=20, @10=10 20 + // 20 B A 0 10 0 0 @0=30, @10=20, @20=10 10 + // 30 C B 0 40 30 30 @30=10 40 + // 40 C B 0 60 30 @40=10, @30=20 60 + // 50 C B 0 50 30 @50=10, @40=20, @30=30 50 + // 60 C B 0 40 30 @60=10, @50=20, @40=30, @30=40 40 + // 70 C D 30 30 40 70 @60=20, @50=30, @40=40 30 + // 80 C D 30 20 40 70 ...@50=40, @40=50 20 + // 90 C D 30 10 40 70 ...@60=40, @50=50, @40=60 10 + // 100 C 0 100 100 100 @100=10 100 + // 110 C 0 90 100 100 @110=10, @100=20 90 + + // + // For PrioritizedEventQueue, the definition of the Delay(t) function is adjusted: the hypothetical event e has Input priority. + // Delay(-1) = 0 // Before A was enqueued. No wait time, can start running + // // instantly. + // Delay(0) = 30 // The hypothetical input event e got enqueued just after A got + // // enqueued. It can start running at 30, when A is done. + // Delay(5) = 25 + // Delay(10) = 20 + // Delay(25) = 5 // B has been queued, but e does not need to wait for B because e has Input priority and B does not. + // // So e can start running at 30, when A is done. + // Delay(30) = 40 // Can start running at 70, after B is done. + // Delay(40) = 60 // Can start at 100, after B and D are done (D is Input Priority) + // Delay(80) = 20 + // Delay(100) = 100 // Wait for C to finish + + // clang-format on + // + // Alternatively we could insert (recycled instead of + // allocated/freed) input events at every sample period + // (1ms...), and use them to back-calculate the delay. This + // might also be somewhat expensive, and would require + // guessing at the maximum delay, which would likely be in + // the seconds, and so you'd need 1000's of pre-allocated + // events per queue per thread - so there would be a memory + // impact as well. + + TimeDuration currentEventDelay; + TimeDuration currentEventRunning; + lockedThreadData->GetRunningEventDelay( + aNow, currentEventDelay, currentEventRunning); + + // Note: eventDelay is a different definition of + // responsiveness than the 16ms event injection. + + // Don't suppress 0's for now; that can be a future + // optimization. We probably want one zero to be stored + // before we start suppressing, which would be more + // complex. + unresponsiveDuration_ms = + Some(currentEventDelay.ToMilliseconds() + + currentEventRunning.ToMilliseconds()); + }); + + if (cpuUtilization) { + // Suspending the thread for sampling could have added some + // running time to it, discard any since the call to + // GetThreadRunningTimesDiff above. + DiscardSuspendedThreadRunningTimes(lock, unlockedThreadData); + } + + // If we got eventDelay data, store it before the CompactStack. 
+ // Note: It is not stored inside the CompactStack so that it + // doesn't get incorrectly duplicated when the thread is sleeping. + if (unresponsiveDuration_ms.isSome()) { + profiler_get_core_buffer().PutObjects( + ProfileBufferEntry::Kind::UnresponsiveDurationMs, + *unresponsiveDuration_ms); + } + } + + // There *must* be a CompactStack after a TimeBeforeCompactStack; + // but note that other entries may have been concurrently inserted + // between the TimeBeforeCompactStack above and now. If the captured + // sample from `DoPeriodicSample` is complete, copy it into the + // global buffer, otherwise add an empty one to satisfy the parser + // that expects one. + auto state = localBuffer.GetState(); + if (NS_WARN_IF(state.mFailedPutBytes != + previousState.mFailedPutBytes)) { + LOG("Stack sample too big for local storage, failed to store %u " + "bytes", + unsigned(state.mFailedPutBytes - + previousState.mFailedPutBytes)); + // There *must* be a CompactStack after a TimeBeforeCompactStack, + // even an empty one. + profiler_get_core_buffer().PutObjects( + ProfileBufferEntry::Kind::CompactStack, + UniquePtr(nullptr)); + } else if (state.mRangeEnd - previousState.mRangeEnd >= + *profiler_get_core_buffer().BufferLength()) { + LOG("Stack sample too big for profiler storage, needed %u bytes", + unsigned(state.mRangeEnd - previousState.mRangeEnd)); + // There *must* be a CompactStack after a TimeBeforeCompactStack, + // even an empty one. + profiler_get_core_buffer().PutObjects( + ProfileBufferEntry::Kind::CompactStack, + UniquePtr(nullptr)); + } else { + profiler_get_core_buffer().PutObjects( + ProfileBufferEntry::Kind::CompactStack, localBuffer); + } + + // Clean up for the next run. + localBuffer.Clear(); + previousState = localBuffer.GetState(); + } + } else { + samplingState = SamplingState::NoStackSamplingCompleted; + } + +#if defined(USE_LUL_STACKWALK) + // The LUL unwind object accumulates frame statistics. Periodically we + // should poke it to give it a chance to print those statistics. This + // involves doing I/O (fprintf, __android_log_print, etc.) and so + // can't safely be done from the critical section inside + // SuspendAndSampleAndResumeThread, which is why it is done here. + lul::LUL* lul = CorePS::Lul(); + if (lul) { + lul->MaybeShowStats(); + } +#endif + TimeStamp threadsSampled = TimeStamp::Now(); + + { + AUTO_PROFILER_STATS(Sampler_FulfillChunkRequests); + ActivePS::FulfillChunkRequests(lock); + } + + buffer.CollectOverheadStats(sampleStartDeltaMs, + lockAcquired - sampleStart, + expiredMarkersCleaned - lockAcquired, + countersSampled - expiredMarkersCleaned, + threadsSampled - countersSampled); + } else { + samplingState = SamplingState::SamplingPaused; + } + } + // gPSMutex is not held after this point. + + // Invoke end-of-sampling callbacks outside of the locked scope. + InvokePostSamplingCallbacks(std::move(postSamplingCallbacks), + samplingState); + + ProfilerChild::ProcessPendingUpdate(); + + if (ProfilerFeature::HasUnregisteredThreads(features)) { +#if defined(GP_OS_windows) + { + MonitorAutoLock spyingStateLock{mSpyingStateMonitor}; + switch (mSpyingState) { + case SpyingState::SamplerToSpy_Start: + case SpyingState::Spy_Working: + // If the spy is working (or about to work), record this loop + // iteration to delay the next start. + ++mDelaySpyStart; + break; + case SpyingState::Spy_Waiting: + // The Spy is idle, waiting for instructions. Should we delay? 
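+          // Worked example of this throttling (see the mDelaySpyStart comment
+          // in the class definition): if the previous spying pass spanned 3
+          // sampling loops, mDelaySpyStart was incremented to 3 while the spy
+          // was working, so the sampler now decrements it across 3 further
+          // loops before restarting the spy; spying therefore occupies at
+          // most about half of the sampling loop iterations.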
+ if (--mDelaySpyStart <= 0) { + mDelaySpyStart = 0; + mSpyingState = SpyingState::SamplerToSpy_Start; + mSpyingStateMonitor.NotifyAll(); + } + break; + default: + // Otherwise the spy should be initializing or shutting down. + MOZ_ASSERT(mSpyingState == SpyingState::Spy_Initializing || + mSpyingState == SpyingState::MainToSpy_Shutdown || + mSpyingState == SpyingState::SpyToMain_ShuttingDown); + break; + } + } +#else + // On non-Windows platforms, this is fast enough to run in this thread, + // each sampling loop. + SpyOnUnregisteredThreads(); +#endif + } + + // We expect the next sampling loop to start `sampleInterval` after this + // loop here was scheduled to start. + scheduledSampleStart += sampleInterval; + + // Try to sleep until we reach that next scheduled time. + const TimeStamp beforeSleep = TimeStamp::Now(); + if (scheduledSampleStart >= beforeSleep) { + // There is still time before the next scheduled sample time. + const uint32_t sleepTimeUs = static_cast( + (scheduledSampleStart - beforeSleep).ToMicroseconds()); + if (sleepTimeUs >= minimumIntervalSleepUs) { + SleepMicro(sleepTimeUs); + } else { + // If we're too close to that time, sleep the minimum amount of time. + // Note that the next scheduled start is not shifted, so at the end of + // the next loop, sleep may again be adjusted to get closer to schedule. + SleepMicro(minimumIntervalSleepUs); + } + } else { + // This sampling loop ended after the next sampling should have started! + // There is little point to try and keep up to schedule now, it would + // require more work, while it's likely we're late because the system is + // already busy. Try and restart a normal schedule from now. + scheduledSampleStart = beforeSleep + sampleInterval; + SleepMicro(static_cast(sampleInterval.ToMicroseconds())); + } + } + + // End of `while` loop. We can only be here from a `break` inside the loop. + InvokePostSamplingCallbacks(std::move(postSamplingCallbacks), samplingState); +} + +namespace geckoprofiler::markers { + +struct UnregisteredThreadLifetimeMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("UnregisteredThreadLifetime"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + base::ProcessId aThreadId, + const ProfilerString8View& aName, + const ProfilerString8View& aEndEvent) { + aWriter.IntProperty("Thread Id", aThreadId); + aWriter.StringProperty("Thread Name", aName.Length() != 0 + ? 
aName.AsSpan() + : MakeStringSpan("~Unnamed~")); + if (aEndEvent.Length() != 0) { + aWriter.StringProperty("End Event", aEndEvent); + } + } + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.AddKeyFormatSearchable("Thread Id", MS::Format::Integer, + MS::Searchable::Searchable); + schema.AddKeyFormatSearchable("Thread Name", MS::Format::String, + MS::Searchable::Searchable); + schema.AddKeyFormat("End Event", MS::Format::String); + schema.AddStaticLabelValue( + "Note", + "Start and end are approximate, based on first and last appearances."); + schema.SetChartLabel( + "{marker.data.Thread Name} (tid {marker.data.Thread Id})"); + schema.SetTableLabel("{marker.name} lifetime"); + return schema; + } +}; + +struct UnregisteredThreadCPUMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("UnregisteredThreadCPU"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + base::ProcessId aThreadId, + int64_t aCPUDiffNs, const TimeStamp& aStart, + const TimeStamp& aEnd) { + aWriter.IntProperty("Thread Id", aThreadId); + aWriter.IntProperty("CPU Time", aCPUDiffNs); + aWriter.DoubleProperty( + "CPU Utilization", + double(aCPUDiffNs) / ((aEnd - aStart).ToMicroseconds() * 1000.0)); + } + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.AddKeyFormatSearchable("Thread Id", MS::Format::Integer, + MS::Searchable::Searchable); + schema.AddKeyFormat("CPU Time", MS::Format::Nanoseconds); + schema.AddKeyFormat("CPU Utilization", MS::Format::Percentage); + schema.SetChartLabel("{marker.data.CPU Utilization}"); + schema.SetTableLabel( + "{marker.name} - Activity: {marker.data.CPU Utilization}"); + return schema; + } +}; + +} // namespace geckoprofiler::markers + +static bool IsThreadIdRegistered(ProfilerThreadId aThreadId) { + ThreadRegistry::LockedRegistry lockedRegistry; + const auto registryEnd = lockedRegistry.end(); + return std::find_if( + lockedRegistry.begin(), registryEnd, + [aThreadId](const ThreadRegistry::OffThreadRef& aOffThreadRef) { + return aOffThreadRef.UnlockedConstReaderCRef() + .Info() + .ThreadId() == aThreadId; + }) != registryEnd; +} + +static nsAutoCString MakeThreadInfoMarkerName(base::ProcessId aThreadId, + const nsACString& aName) { + nsAutoCString markerName{"tid "}; + markerName.AppendInt(int64_t(aThreadId)); + if (!aName.IsEmpty()) { + markerName.AppendLiteral(" "); + markerName.Append(aName); + } + return markerName; +} + +void SamplerThread::SpyOnUnregisteredThreads() { + const TimeStamp unregisteredThreadSearchStart = TimeStamp::Now(); + + const base::ProcessId currentProcessId = + base::ProcessId(profiler_current_process_id().ToNumber()); + nsTArray request(1); + request.EmplaceBack( + /* aPid = */ currentProcessId, + /* aProcessType = */ ProcType::Unknown, + /* aOrigin = */ ""_ns, + /* aWindowInfo = */ nsTArray{}, + /* aUtilityInfo = */ nsTArray{}, + /* aChild = */ 0 +#ifdef XP_MACOSX + , + /* aChildTask = */ MACH_PORT_NULL +#endif // XP_MACOSX + ); + + const ProcInfoPromise::ResolveOrRejectValue procInfoOrError = + GetProcInfoSync(std::move(request)); + + if (!procInfoOrError.IsResolve()) { + PROFILER_MARKER_TEXT("Failed unregistered thread search", PROFILER, + MarkerOptions(MarkerThreadId::MainThread(), + MarkerTiming::IntervalUntilNowFrom( + unregisteredThreadSearchStart)), + "Could not retrieve any process information"); + return; + } + 
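+  // From here on GetProcInfoSync succeeded: the result is a hashmap keyed by
+  // process id, and the entry for currentProcessId carries, among other
+  // fields, a `threads` array providing each thread's tid, name and
+  // cumulative CPU time, which is what the spying below relies on.
+  // (Descriptive summary based on the accesses below, not a full description
+  // of ProcInfo.)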
+ const auto& procInfoHashMap = procInfoOrError.ResolveValue(); + // Expecting the requested (current) process information to be present in the + // hashmap. + const auto& procInfoPtr = + procInfoHashMap.readonlyThreadsafeLookup(currentProcessId); + if (!procInfoPtr) { + PROFILER_MARKER_TEXT("Failed unregistered thread search", PROFILER, + MarkerOptions(MarkerThreadId::MainThread(), + MarkerTiming::IntervalUntilNowFrom( + unregisteredThreadSearchStart)), + "Could not retrieve information about this process"); + return; + } + + // Record the time spent so far, which is OS-bound... + PROFILER_MARKER_TEXT("Unregistered thread search", PROFILER, + MarkerOptions(MarkerThreadId::MainThread(), + MarkerTiming::IntervalUntilNowFrom( + unregisteredThreadSearchStart)), + "Work to discover threads"); + + // ... and record the time needed to process the data, which we can control. + AUTO_PROFILER_MARKER_TEXT( + "Unregistered thread search", PROFILER, + MarkerOptions(MarkerThreadId::MainThread()), + "Work to process discovered threads and record unregistered ones"_ns); + + const Span threads = procInfoPtr->value().threads; + + // mLastSpying timestamp should be null only at the beginning of a session, + // when mSpiedThreads is still empty. + MOZ_ASSERT_IF(mLastSpying.IsNull(), mSpiedThreads.IsEmpty()); + + const TimeStamp previousSpying = std::exchange(mLastSpying, TimeStamp::Now()); + + // Find threads that were spied on but are not present anymore. + const auto threadsBegin = threads.begin(); + const auto threadsEnd = threads.end(); + for (size_t spiedThreadIndexPlus1 = mSpiedThreads.Length(); + spiedThreadIndexPlus1 != 0; --spiedThreadIndexPlus1) { + const SpiedThread& spiedThread = mSpiedThreads[spiedThreadIndexPlus1 - 1]; + if (std::find_if(threadsBegin, threadsEnd, + [spiedTid = spiedThread.mThreadId]( + const mozilla::ThreadInfo& aThreadInfo) { + return aThreadInfo.tid == spiedTid; + }) == threadsEnd) { + // This spied thread is gone. + PROFILER_MARKER( + MakeThreadInfoMarkerName(spiedThread.mThreadId, spiedThread.mName), + PROFILER, + MarkerOptions( + MarkerThreadId::MainThread(), + // Place the end between this update and the previous one. + MarkerTiming::IntervalEnd(previousSpying + + (mLastSpying - previousSpying) / + int64_t(2))), + UnregisteredThreadLifetimeMarker, spiedThread.mThreadId, + spiedThread.mName, "Thread disappeared"); + + // Don't spy on it anymore, assuming it won't come back. + mSpiedThreads.RemoveElementAt(spiedThreadIndexPlus1 - 1); + } + } + + for (const mozilla::ThreadInfo& threadInfo : threads) { + // Index of this encountered thread in mSpiedThreads, or NoIndex. + size_t spiedThreadIndex = mSpiedThreads.IndexOf(threadInfo.tid); + if (IsThreadIdRegistered(ProfilerThreadId::FromNumber(threadInfo.tid))) { + // This thread id is already officially registered. + if (spiedThreadIndex != SpiedThreads::NoIndex) { + // This now-registered thread was previously being spied. + SpiedThread& spiedThread = mSpiedThreads[spiedThreadIndex]; + PROFILER_MARKER( + MakeThreadInfoMarkerName(spiedThread.mThreadId, spiedThread.mName), + PROFILER, + MarkerOptions( + MarkerThreadId::MainThread(), + // Place the end between this update and the previous one. + // TODO: Find the real time from the thread registration? + MarkerTiming::IntervalEnd(previousSpying + + (mLastSpying - previousSpying) / + int64_t(2))), + UnregisteredThreadLifetimeMarker, spiedThread.mThreadId, + spiedThread.mName, "Thread registered itself"); + + // Remove from mSpiedThreads, since it can be profiled normally. 
+ mSpiedThreads.RemoveElement(threadInfo.tid); + } + } else { + // This thread id is not registered. + if (spiedThreadIndex == SpiedThreads::NoIndex) { + // This unregistered thread has not been spied yet, store it now. + NS_ConvertUTF16toUTF8 name(threadInfo.name); + mSpiedThreads.EmplaceBack(threadInfo.tid, name, threadInfo.cpuTime); + + PROFILER_MARKER( + MakeThreadInfoMarkerName(threadInfo.tid, name), PROFILER, + MarkerOptions( + MarkerThreadId::MainThread(), + // Place the start between this update and the previous one (or + // the start of this search if it's the first one). + MarkerTiming::IntervalStart( + mLastSpying - + (mLastSpying - (previousSpying.IsNull() + ? unregisteredThreadSearchStart + : previousSpying)) / + int64_t(2))), + UnregisteredThreadLifetimeMarker, threadInfo.tid, name, + /* aEndEvent */ ""); + } else { + // This unregistered thread was already being spied, record its work. + SpiedThread& spiedThread = mSpiedThreads[spiedThreadIndex]; + int64_t diffCPUTimeNs = + int64_t(threadInfo.cpuTime) - int64_t(spiedThread.mCPUTimeNs); + spiedThread.mCPUTimeNs = threadInfo.cpuTime; + if (diffCPUTimeNs != 0) { + PROFILER_MARKER( + MakeThreadInfoMarkerName(threadInfo.tid, spiedThread.mName), + PROFILER, + MarkerOptions( + MarkerThreadId::MainThread(), + MarkerTiming::Interval(previousSpying, mLastSpying)), + UnregisteredThreadCPUMarker, threadInfo.tid, diffCPUTimeNs, + previousSpying, mLastSpying); + } + } + } + } + + PROFILER_MARKER_TEXT("Unregistered thread search", PROFILER, + MarkerOptions(MarkerThreadId::MainThread(), + MarkerTiming::IntervalUntilNowFrom( + unregisteredThreadSearchStart)), + "Work to discover and record unregistered threads"); +} + +// We #include these files directly because it means those files can use +// declarations from this file trivially. These provide target-specific +// implementations of all SamplerThread methods except Run(). +#if defined(GP_OS_windows) +# include "platform-win32.cpp" +#elif defined(GP_OS_darwin) +# include "platform-macos.cpp" +#elif defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd) +# include "platform-linux-android.cpp" +#else +# error "bad platform" +#endif + +// END SamplerThread +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// BEGIN externally visible functions + +MOZ_DEFINE_MALLOC_SIZE_OF(GeckoProfilerMallocSizeOf) + +NS_IMETHODIMP +GeckoProfilerReporter::CollectReports(nsIHandleReportCallback* aHandleReport, + nsISupports* aData, bool aAnonymize) { + MOZ_RELEASE_ASSERT(NS_IsMainThread()); + + size_t profSize = 0; + size_t lulSize = 0; + + { + PSAutoLock lock; + + if (CorePS::Exists()) { + CorePS::AddSizeOf(lock, GeckoProfilerMallocSizeOf, profSize, lulSize); + } + + if (ActivePS::Exists(lock)) { + profSize += ActivePS::SizeOf(lock, GeckoProfilerMallocSizeOf); + } + } + + MOZ_COLLECT_REPORT( + "explicit/profiler/profiler-state", KIND_HEAP, UNITS_BYTES, profSize, + "Memory used by the Gecko Profiler's global state (excluding memory used " + "by LUL)."); + +#if defined(USE_LUL_STACKWALK) + MOZ_COLLECT_REPORT( + "explicit/profiler/lul", KIND_HEAP, UNITS_BYTES, lulSize, + "Memory used by LUL, a stack unwinder used by the Gecko Profiler."); +#endif + + return NS_OK; +} + +NS_IMPL_ISUPPORTS(GeckoProfilerReporter, nsIMemoryReporter) + +static uint32_t ParseFeature(const char* aFeature, bool aIsStartup) { + if (strcmp(aFeature, "default") == 0) { + return (aIsStartup ? 
(DefaultFeatures() | StartupExtraDefaultFeatures()) + : DefaultFeatures()) & + AvailableFeatures(); + } + +#define PARSE_FEATURE_BIT(n_, str_, Name_, desc_) \ + if (strcmp(aFeature, str_) == 0) { \ + return ProfilerFeature::Name_; \ + } + + PROFILER_FOR_EACH_FEATURE(PARSE_FEATURE_BIT) + +#undef PARSE_FEATURE_BIT + + printf("\nUnrecognized feature \"%s\".\n\n", aFeature); + // Since we may have an old feature we don't implement anymore, don't exit. + PrintUsage(); + return 0; +} + +uint32_t ParseFeaturesFromStringArray(const char** aFeatures, + uint32_t aFeatureCount, + bool aIsStartup /* = false */) { + uint32_t features = 0; + for (size_t i = 0; i < aFeatureCount; i++) { + features |= ParseFeature(aFeatures[i], aIsStartup); + } + return features; +} + +static ProfilingStack* locked_register_thread( + PSLockRef aLock, ThreadRegistry::OffThreadRef aOffThreadRef) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + VTUNE_REGISTER_THREAD(aOffThreadRef.UnlockedConstReaderCRef().Info().Name()); + + if (ActivePS::Exists(aLock)) { + ThreadProfilingFeatures threadProfilingFeatures = + ActivePS::ProfilingFeaturesForThread( + aLock, aOffThreadRef.UnlockedConstReaderCRef().Info()); + if (threadProfilingFeatures != ThreadProfilingFeatures::NotProfiled) { + ThreadRegistry::OffThreadRef::RWFromAnyThreadWithLock + lockedRWFromAnyThread = aOffThreadRef.GetLockedRWFromAnyThread(); + + ProfiledThreadData* profiledThreadData = ActivePS::AddLiveProfiledThread( + aLock, MakeUnique( + aOffThreadRef.UnlockedConstReaderCRef().Info())); + lockedRWFromAnyThread->SetProfilingFeaturesAndData( + threadProfilingFeatures, profiledThreadData, aLock); + + if (ActivePS::FeatureJS(aLock)) { + lockedRWFromAnyThread->StartJSSampling(ActivePS::JSFlags(aLock)); + if (ThreadRegistration::LockedRWOnThread* lockedRWOnThread = + lockedRWFromAnyThread.GetLockedRWOnThread(); + lockedRWOnThread) { + // We can manually poll the current thread so it starts sampling + // immediately. + lockedRWOnThread->PollJSSampling(); + } + if (lockedRWFromAnyThread->GetJSContext()) { + profiledThreadData->NotifyReceivedJSContext( + ActivePS::Buffer(aLock).BufferRangeEnd()); + } + } + } + } + + return &aOffThreadRef.UnlockedConstReaderAndAtomicRWRef().ProfilingStackRef(); +} + +static void NotifyObservers(const char* aTopic, + nsISupports* aSubject = nullptr) { + if (!NS_IsMainThread()) { + // Dispatch a task to the main thread that notifies observers. + // If NotifyObservers is called both on and off the main thread within a + // short time, the order of the notifications can be different from the + // order of the calls to NotifyObservers. + // Getting the order 100% right isn't that important at the moment, because + // these notifications are only observed in the parent process, where the + // profiler_* functions are currently only called on the main thread. 
+ nsCOMPtr subject = aSubject; + NS_DispatchToMainThread(NS_NewRunnableFunction( + "NotifyObservers", [=] { NotifyObservers(aTopic, subject); })); + return; + } + + if (nsCOMPtr os = services::GetObserverService()) { + os->NotifyObservers(aSubject, aTopic, nullptr); + } +} + +[[nodiscard]] static RefPtr NotifyProfilerStarted( + const PowerOfTwo32& aCapacity, const Maybe& aDuration, + double aInterval, uint32_t aFeatures, const char** aFilters, + uint32_t aFilterCount, uint64_t aActiveTabID) { + nsTArray filtersArray; + for (size_t i = 0; i < aFilterCount; ++i) { + filtersArray.AppendElement(aFilters[i]); + } + + nsCOMPtr params = new nsProfilerStartParams( + aCapacity.Value(), aDuration, aInterval, aFeatures, + std::move(filtersArray), aActiveTabID); + + RefPtr startPromise = ProfilerParent::ProfilerStarted(params); + NotifyObservers("profiler-started", params); + return startPromise; +} + +static void locked_profiler_start(PSLockRef aLock, PowerOfTwo32 aCapacity, + double aInterval, uint32_t aFeatures, + const char** aFilters, uint32_t aFilterCount, + uint64_t aActiveTabID, + const Maybe& aDuration); + +// This basically duplicates AutoProfilerLabel's constructor. +static void* MozGlueLabelEnter(const char* aLabel, const char* aDynamicString, + void* aSp) { + ThreadRegistration::OnThreadPtr onThreadPtr = + ThreadRegistration::GetOnThreadPtr(); + if (!onThreadPtr) { + return nullptr; + } + ProfilingStack& profilingStack = + onThreadPtr->UnlockedConstReaderAndAtomicRWRef().ProfilingStackRef(); + profilingStack.pushLabelFrame(aLabel, aDynamicString, aSp, + JS::ProfilingCategoryPair::OTHER); + return &profilingStack; +} + +// This basically duplicates AutoProfilerLabel's destructor. +static void MozGlueLabelExit(void* aProfilingStack) { + if (aProfilingStack) { + reinterpret_cast(aProfilingStack)->pop(); + } +} + +static Vector SplitAtCommas(const char* aString, + UniquePtr& aStorage) { + size_t len = strlen(aString); + aStorage = MakeUnique(len + 1); + PodCopy(aStorage.get(), aString, len + 1); + + // Iterate over all characters in aStorage and split at commas, by + // overwriting commas with the null char. + Vector array; + size_t currentElementStart = 0; + for (size_t i = 0; i <= len; i++) { + if (aStorage[i] == ',') { + aStorage[i] = '\0'; + } + if (aStorage[i] == '\0') { + // Only add non-empty elements, otherwise ParseFeatures would later + // complain about unrecognized features. 
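+      // For example (illustrative only): SplitAtCommas("js,,stackwalk,", storage)
+      // yields {"js", "stackwalk"}; the empty pieces around the doubled and
+      // trailing commas are dropped by the check below.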
+ if (currentElementStart != i) { + MOZ_RELEASE_ASSERT(array.append(&aStorage[currentElementStart])); + } + currentElementStart = i + 1; + } + } + return array; +} + +void profiler_init_threadmanager() { + LOG("profiler_init_threadmanager"); + + ThreadRegistration::WithOnThreadRef( + [](ThreadRegistration::OnThreadRef aOnThreadRef) { + aOnThreadRef.WithLockedRWOnThread( + [](ThreadRegistration::LockedRWOnThread& aThreadData) { + if (!aThreadData.GetEventTarget()) { + aThreadData.ResetMainThread(NS_GetCurrentThreadNoCreate()); + } + }); + }); +} + +static const char* get_size_suffix(const char* str) { + const char* ptr = str; + + while (isdigit(*ptr)) { + ptr++; + } + + return ptr; +} + +void profiler_init(void* aStackTop) { + LOG("profiler_init"); + + profiler_init_main_thread_id(); + + VTUNE_INIT(); + + MOZ_RELEASE_ASSERT(!CorePS::Exists()); + + if (getenv("MOZ_PROFILER_HELP")) { + PrintUsage(); + exit(0); + } + + SharedLibraryInfo::Initialize(); + + uint32_t features = DefaultFeatures() & AvailableFeatures(); + + UniquePtr filterStorage; + + Vector filters; + MOZ_RELEASE_ASSERT(filters.append("GeckoMain")); + MOZ_RELEASE_ASSERT(filters.append("Compositor")); + MOZ_RELEASE_ASSERT(filters.append("Renderer")); + MOZ_RELEASE_ASSERT(filters.append("DOM Worker")); + + PowerOfTwo32 capacity = PROFILER_DEFAULT_ENTRIES; + Maybe duration = Nothing(); + double interval = PROFILER_DEFAULT_INTERVAL; + uint64_t activeTabID = PROFILER_DEFAULT_ACTIVE_TAB_ID; + + ThreadRegistration::RegisterThread(kMainThreadName, aStackTop); + + { + PSAutoLock lock; + + // We've passed the possible failure point. Instantiate CorePS, which + // indicates that the profiler has initialized successfully. + CorePS::Create(lock); + + // Make sure threads already in the ThreadRegistry (like the main thread) + // get registered in CorePS as well. + { + ThreadRegistry::LockedRegistry lockedRegistry; + for (ThreadRegistry::OffThreadRef offThreadRef : lockedRegistry) { + locked_register_thread(lock, offThreadRef); + } + } + + // Platform-specific initialization. + PlatformInit(lock); + +#if defined(GP_OS_android) + if (jni::IsAvailable()) { + GeckoJavaSampler::Init(); + } +#endif + + // (Linux-only) We could create CorePS::mLul and read unwind info into it + // at this point. That would match the lifetime implied by destruction of + // it in profiler_shutdown() just below. However, that gives a big delay on + // startup, even if no profiling is actually to be done. So, instead, it is + // created on demand at the first call to PlatformStart(). + + const char* startupEnv = getenv("MOZ_PROFILER_STARTUP"); + if (!startupEnv || startupEnv[0] == '\0' || + ((startupEnv[0] == '0' || startupEnv[0] == 'N' || + startupEnv[0] == 'n') && + startupEnv[1] == '\0')) { + return; + } + + LOG("- MOZ_PROFILER_STARTUP is set"); + + // Startup default capacity may be different. 
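+    // For instance (illustrative arithmetic; assumes scBytesPerEntry is 8):
+    // MOZ_PROFILER_STARTUP_ENTRIES=4MB is read below as
+    // 4 * (1000 * 1000) / 8 = 500000 requested entries, which is then clamped
+    // to the allowed range and converted to a power of two.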
+ capacity = PROFILER_DEFAULT_STARTUP_ENTRIES; + + const char* startupCapacity = getenv("MOZ_PROFILER_STARTUP_ENTRIES"); + if (startupCapacity && startupCapacity[0] != '\0') { + errno = 0; + long capacityLong = strtol(startupCapacity, nullptr, 10); + std::string_view sizeSuffix = get_size_suffix(startupCapacity); + + if (sizeSuffix == "KB") { + capacityLong *= 1000 / scBytesPerEntry; + } else if (sizeSuffix == "KiB") { + capacityLong *= 1024 / scBytesPerEntry; + } else if (sizeSuffix == "MB") { + capacityLong *= (1000 * 1000) / scBytesPerEntry; + } else if (sizeSuffix == "MiB") { + capacityLong *= (1024 * 1024) / scBytesPerEntry; + } else if (sizeSuffix == "GB") { + capacityLong *= (1000 * 1000 * 1000) / scBytesPerEntry; + } else if (sizeSuffix == "GiB") { + capacityLong *= (1024 * 1024 * 1024) / scBytesPerEntry; + } else if (!sizeSuffix.empty()) { + LOG("- MOZ_PROFILER_STARTUP_ENTRIES unit must be one of the " + "following: KB, KiB, MB, MiB, GB, GiB"); + PrintUsage(); + exit(1); + } + + // `long` could be 32 or 64 bits, so we force a 64-bit comparison with + // the maximum 32-bit signed number (as more than that is clamped down to + // 2^31 anyway). + if (errno == 0 && capacityLong > 0 && + static_cast(capacityLong) <= + static_cast(INT32_MAX)) { + capacity = PowerOfTwo32(ActivePS::ClampToAllowedEntries( + static_cast(capacityLong))); + LOG("- MOZ_PROFILER_STARTUP_ENTRIES = %u", unsigned(capacity.Value())); + } else { + LOG("- MOZ_PROFILER_STARTUP_ENTRIES not a valid integer: %s", + startupCapacity); + PrintUsage(); + exit(1); + } + } + + const char* startupDuration = getenv("MOZ_PROFILER_STARTUP_DURATION"); + if (startupDuration && startupDuration[0] != '\0') { + errno = 0; + double durationVal = PR_strtod(startupDuration, nullptr); + if (errno == 0 && durationVal >= 0.0) { + if (durationVal > 0.0) { + duration = Some(durationVal); + } + LOG("- MOZ_PROFILER_STARTUP_DURATION = %f", durationVal); + } else { + LOG("- MOZ_PROFILER_STARTUP_DURATION not a valid float: %s", + startupDuration); + PrintUsage(); + exit(1); + } + } + + const char* startupInterval = getenv("MOZ_PROFILER_STARTUP_INTERVAL"); + if (startupInterval && startupInterval[0] != '\0') { + errno = 0; + interval = PR_strtod(startupInterval, nullptr); + if (errno == 0 && interval > 0.0 && interval <= PROFILER_MAX_INTERVAL) { + LOG("- MOZ_PROFILER_STARTUP_INTERVAL = %f", interval); + } else { + LOG("- MOZ_PROFILER_STARTUP_INTERVAL not a valid float: %s", + startupInterval); + PrintUsage(); + exit(1); + } + } + + features |= StartupExtraDefaultFeatures() & AvailableFeatures(); + + const char* startupFeaturesBitfield = + getenv("MOZ_PROFILER_STARTUP_FEATURES_BITFIELD"); + if (startupFeaturesBitfield && startupFeaturesBitfield[0] != '\0') { + errno = 0; + features = strtol(startupFeaturesBitfield, nullptr, 10); + if (errno == 0) { + LOG("- MOZ_PROFILER_STARTUP_FEATURES_BITFIELD = %d", features); + } else { + LOG("- MOZ_PROFILER_STARTUP_FEATURES_BITFIELD not a valid integer: %s", + startupFeaturesBitfield); + PrintUsage(); + exit(1); + } + } else { + const char* startupFeatures = getenv("MOZ_PROFILER_STARTUP_FEATURES"); + if (startupFeatures) { + // Interpret startupFeatures as a list of feature strings, separated by + // commas. 
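+      // e.g. MOZ_PROFILER_STARTUP_FEATURES="js,stackwalk" (names shown only
+      // as examples; the authoritative list is printed by MOZ_PROFILER_HELP)
+      // is split with SplitAtCommas() and mapped onto ProfilerFeature bits by
+      // ParseFeaturesFromStringArray() below.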
+ UniquePtr featureStringStorage; + Vector featureStringArray = + SplitAtCommas(startupFeatures, featureStringStorage); + features = ParseFeaturesFromStringArray(featureStringArray.begin(), + featureStringArray.length(), + /* aIsStartup */ true); + LOG("- MOZ_PROFILER_STARTUP_FEATURES = %d", features); + } + } + + const char* startupFilters = getenv("MOZ_PROFILER_STARTUP_FILTERS"); + if (startupFilters && startupFilters[0] != '\0') { + filters = SplitAtCommas(startupFilters, filterStorage); + LOG("- MOZ_PROFILER_STARTUP_FILTERS = %s", startupFilters); + + if (mozilla::profiler::detail::FiltersExcludePid(filters)) { + LOG(" -> This process is excluded and won't be profiled"); + return; + } + } + + const char* startupActiveTabID = + getenv("MOZ_PROFILER_STARTUP_ACTIVE_TAB_ID"); + if (startupActiveTabID && startupActiveTabID[0] != '\0') { + std::istringstream iss(startupActiveTabID); + iss >> activeTabID; + if (!iss.fail()) { + LOG("- MOZ_PROFILER_STARTUP_ACTIVE_TAB_ID = %" PRIu64, activeTabID); + } else { + LOG("- MOZ_PROFILER_STARTUP_ACTIVE_TAB_ID not a valid " + "uint64_t: %s", + startupActiveTabID); + PrintUsage(); + exit(1); + } + } + + locked_profiler_start(lock, capacity, interval, features, filters.begin(), + filters.length(), activeTabID, duration); + } + + // The GeckoMain thread registration happened too early to record a marker, + // so let's record it again now. + profiler_mark_thread_awake(); + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + // Start counting memory allocations (outside of lock because this may call + // profiler_add_sampled_counter which would attempt to take the lock.) + ActivePS::SetMemoryCounter(mozilla::profiler::install_memory_hooks()); +#endif + + invoke_profiler_state_change_callbacks(ProfilingState::Started); + + // We do this with gPSMutex unlocked. The comment in profiler_stop() explains + // why. + Unused << NotifyProfilerStarted(capacity, duration, interval, features, + filters.begin(), filters.length(), 0); +} + +static void locked_profiler_save_profile_to_file( + PSLockRef aLock, const char* aFilename, + const PreRecordedMetaInformation& aPreRecordedMetaInformation, + bool aIsShuttingDown); + +static SamplerThread* locked_profiler_stop(PSLockRef aLock); + +void profiler_shutdown(IsFastShutdown aIsFastShutdown) { + LOG("profiler_shutdown"); + + VTUNE_SHUTDOWN(); + + MOZ_RELEASE_ASSERT(NS_IsMainThread()); + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + if (profiler_is_active()) { + invoke_profiler_state_change_callbacks(ProfilingState::Stopping); + } + invoke_profiler_state_change_callbacks(ProfilingState::ShuttingDown); + + const auto preRecordedMetaInformation = PreRecordMetaInformation(); + + ProfilerParent::ProfilerWillStopIfStarted(); + + // If the profiler is active we must get a handle to the SamplerThread before + // ActivePS is destroyed, in order to delete it. + SamplerThread* samplerThread = nullptr; + { + PSAutoLock lock; + + // Save the profile on shutdown if requested. + if (ActivePS::Exists(lock)) { + const char* filename = getenv("MOZ_PROFILER_SHUTDOWN"); + if (filename && filename[0] != '\0') { + locked_profiler_save_profile_to_file(lock, filename, + preRecordedMetaInformation, + /* aIsShuttingDown */ true); + } + if (aIsFastShutdown == IsFastShutdown::Yes) { + return; + } + + samplerThread = locked_profiler_stop(lock); + } else if (aIsFastShutdown == IsFastShutdown::Yes) { + return; + } + + CorePS::Destroy(lock); + } + + // We do these operations with gPSMutex unlocked. 
The comments in + // profiler_stop() explain why. + if (samplerThread) { + Unused << ProfilerParent::ProfilerStopped(); + NotifyObservers("profiler-stopped"); + delete samplerThread; + } + + // Reverse the registration done in profiler_init. + ThreadRegistration::UnregisterThread(); +} + +static bool WriteProfileToJSONWriter(SpliceableChunkedJSONWriter& aWriter, + double aSinceTime, bool aIsShuttingDown, + ProfilerCodeAddressService* aService, + mozilla::ProgressLogger aProgressLogger) { + LOG("WriteProfileToJSONWriter"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + aWriter.Start(); + { + auto rv = profiler_stream_json_for_this_process( + aWriter, aSinceTime, aIsShuttingDown, aService, + aProgressLogger.CreateSubLoggerFromTo( + 0_pc, + "WriteProfileToJSONWriter: " + "profiler_stream_json_for_this_process started", + 100_pc, + "WriteProfileToJSONWriter: " + "profiler_stream_json_for_this_process done")); + + if (rv.isErr()) { + return false; + } + + // Don't include profiles from other processes because this is a + // synchronous function. + aWriter.StartArrayProperty("processes"); + aWriter.EndArray(); + } + aWriter.End(); + return !aWriter.Failed(); +} + +void profiler_set_process_name(const nsACString& aProcessName, + const nsACString* aETLDplus1) { + LOG("profiler_set_process_name(\"%s\", \"%s\")", aProcessName.Data(), + aETLDplus1 ? aETLDplus1->Data() : ""); + PSAutoLock lock; + CorePS::SetProcessName(lock, aProcessName); + if (aETLDplus1) { + CorePS::SetETLDplus1(lock, *aETLDplus1); + } +} + +UniquePtr profiler_get_profile(double aSinceTime, + bool aIsShuttingDown) { + LOG("profiler_get_profile"); + + UniquePtr service = + profiler_code_address_service_for_presymbolication(); + + FailureLatchSource failureLatch; + SpliceableChunkedJSONWriter b{failureLatch}; + if (!WriteProfileToJSONWriter(b, aSinceTime, aIsShuttingDown, service.get(), + ProgressLogger{})) { + return nullptr; + } + return b.ChunkedWriteFunc().CopyData(); +} + +[[nodiscard]] bool profiler_get_profile_json( + SpliceableChunkedJSONWriter& aSpliceableChunkedJSONWriter, + double aSinceTime, bool aIsShuttingDown, + mozilla::ProgressLogger aProgressLogger) { + LOG("profiler_get_profile_json"); + + UniquePtr service = + profiler_code_address_service_for_presymbolication(); + + return WriteProfileToJSONWriter( + aSpliceableChunkedJSONWriter, aSinceTime, aIsShuttingDown, service.get(), + aProgressLogger.CreateSubLoggerFromTo( + 0.1_pc, "profiler_get_profile_json: WriteProfileToJSONWriter started", + 99.9_pc, "profiler_get_profile_json: WriteProfileToJSONWriter done")); +} + +void profiler_get_start_params(int* aCapacity, Maybe* aDuration, + double* aInterval, uint32_t* aFeatures, + Vector* aFilters, + uint64_t* aActiveTabID) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + if (NS_WARN_IF(!aCapacity) || NS_WARN_IF(!aDuration) || + NS_WARN_IF(!aInterval) || NS_WARN_IF(!aFeatures) || + NS_WARN_IF(!aFilters)) { + return; + } + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + *aCapacity = 0; + *aDuration = Nothing(); + *aInterval = 0; + *aFeatures = 0; + *aActiveTabID = 0; + aFilters->clear(); + return; + } + + *aCapacity = ActivePS::Capacity(lock).Value(); + *aDuration = ActivePS::Duration(lock); + *aInterval = ActivePS::Interval(lock); + *aFeatures = ActivePS::Features(lock); + *aActiveTabID = ActivePS::ActiveTabID(lock); + + const Vector& filters = ActivePS::Filters(lock); + MOZ_ALWAYS_TRUE(aFilters->resize(filters.length())); + for (uint32_t i = 0; i < filters.length(); ++i) { + (*aFilters)[i] = filters[i].c_str(); + } +} + 
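+// Returns the chunk manager feeding the current profiling session's buffer,
+// or nullptr when no session is active; callers must handle the nullptr case
+// because the profiler can be stopped at any time.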
+ProfileBufferControlledChunkManager* profiler_get_controlled_chunk_manager() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + PSAutoLock lock; + if (NS_WARN_IF(!ActivePS::Exists(lock))) { + return nullptr; + } + return &ActivePS::ControlledChunkManager(lock); +} + +namespace mozilla { + +void GetProfilerEnvVarsForChildProcess( + std::function&& aSetEnv) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + aSetEnv("MOZ_PROFILER_STARTUP", ""); + return; + } + + aSetEnv("MOZ_PROFILER_STARTUP", "1"); + + // If MOZ_PROFILER_SHUTDOWN is defined, make sure it's empty in children, so + // that they don't attempt to write over that file. + if (getenv("MOZ_PROFILER_SHUTDOWN")) { + aSetEnv("MOZ_PROFILER_SHUTDOWN", ""); + } + + // Hidden option to stop Base Profiler, mostly due to Talos intermittents, + // see https://bugzilla.mozilla.org/show_bug.cgi?id=1638851#c3 + // TODO: Investigate root cause and remove this in bugs 1648324 and 1648325. + if (getenv("MOZ_PROFILER_STARTUP_NO_BASE")) { + aSetEnv("MOZ_PROFILER_STARTUP_NO_BASE", "1"); + } + + auto capacityString = + Smprintf("%u", unsigned(ActivePS::Capacity(lock).Value())); + aSetEnv("MOZ_PROFILER_STARTUP_ENTRIES", capacityString.get()); + + // Use AppendFloat instead of Smprintf with %f because the decimal + // separator used by %f is locale-dependent. But the string we produce needs + // to be parseable by strtod, which only accepts the period character as a + // decimal separator. AppendFloat always uses the period character. + nsCString intervalString; + intervalString.AppendFloat(ActivePS::Interval(lock)); + aSetEnv("MOZ_PROFILER_STARTUP_INTERVAL", intervalString.get()); + + auto featuresString = Smprintf("%d", ActivePS::Features(lock)); + aSetEnv("MOZ_PROFILER_STARTUP_FEATURES_BITFIELD", featuresString.get()); + + std::string filtersString; + const Vector& filters = ActivePS::Filters(lock); + for (uint32_t i = 0; i < filters.length(); ++i) { + if (i != 0) { + filtersString += ","; + } + filtersString += filters[i]; + } + aSetEnv("MOZ_PROFILER_STARTUP_FILTERS", filtersString.c_str()); + + auto activeTabIDString = Smprintf("%" PRIu64, ActivePS::ActiveTabID(lock)); + aSetEnv("MOZ_PROFILER_STARTUP_ACTIVE_TAB_ID", activeTabIDString.get()); +} + +} // namespace mozilla + +void profiler_received_exit_profile(const nsACString& aExitProfile) { + MOZ_RELEASE_ASSERT(NS_IsMainThread()); + MOZ_RELEASE_ASSERT(CorePS::Exists()); + PSAutoLock lock; + if (!ActivePS::Exists(lock)) { + return; + } + ActivePS::AddExitProfile(lock, aExitProfile); +} + +Vector profiler_move_exit_profiles() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + PSAutoLock lock; + Vector profiles; + if (ActivePS::Exists(lock)) { + profiles = ActivePS::MoveExitProfiles(lock); + } + return profiles; +} + +static void locked_profiler_save_profile_to_file( + PSLockRef aLock, const char* aFilename, + const PreRecordedMetaInformation& aPreRecordedMetaInformation, + bool aIsShuttingDown = false) { + nsAutoCString processedFilename(aFilename); + const auto processInsertionIndex = processedFilename.Find("%p"); + if (processInsertionIndex != kNotFound) { + // Replace "%p" with the process id. 
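+ // For example, a hypothetical filename like "profile_%p.json" (e.g. from
+ // MOZ_PROFILER_SHUTDOWN) would be saved as "profile_1234.json" in a process
+ // whose id is 1234.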
+ nsAutoCString process; + process.AppendInt(profiler_current_process_id().ToNumber()); + processedFilename.Replace(processInsertionIndex, 2, process); + LOG("locked_profiler_save_profile_to_file(\"%s\" -> \"%s\")", aFilename, + processedFilename.get()); + } else { + LOG("locked_profiler_save_profile_to_file(\"%s\")", aFilename); + } + + MOZ_RELEASE_ASSERT(CorePS::Exists() && ActivePS::Exists(aLock)); + + std::ofstream stream; + stream.open(processedFilename.get()); + if (stream.is_open()) { + OStreamJSONWriteFunc sw(stream); + SpliceableJSONWriter w(sw, FailureLatchInfallibleSource::Singleton()); + w.Start(); + { + Unused << locked_profiler_stream_json_for_this_process( + aLock, w, /* sinceTime */ 0, aPreRecordedMetaInformation, + aIsShuttingDown, nullptr, ProgressLogger{}); + + w.StartArrayProperty("processes"); + Vector exitProfiles = ActivePS::MoveExitProfiles(aLock); + for (auto& exitProfile : exitProfiles) { + if (!exitProfile.IsEmpty() && exitProfile[0] != '*') { + w.Splice(exitProfile); + } + } + w.EndArray(); + } + w.End(); + + stream.close(); + } +} + +void profiler_save_profile_to_file(const char* aFilename) { + LOG("profiler_save_profile_to_file(%s)", aFilename); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + const auto preRecordedMetaInformation = PreRecordMetaInformation(); + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return; + } + + locked_profiler_save_profile_to_file(lock, aFilename, + preRecordedMetaInformation); +} + +uint32_t profiler_get_available_features() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + return AvailableFeatures(); +} + +Maybe profiler_get_buffer_info() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return Nothing(); + } + + return Some(ActivePS::Buffer(lock).GetProfilerBufferInfo()); +} + +static void PollJSSamplingForCurrentThread() { + ThreadRegistration::WithOnThreadRef( + [](ThreadRegistration::OnThreadRef aOnThreadRef) { + aOnThreadRef.WithLockedRWOnThread( + [](ThreadRegistration::LockedRWOnThread& aThreadData) { + aThreadData.PollJSSampling(); + }); + }); +} + +// When the profiler is started on a background thread, we can't synchronously +// call PollJSSampling on the main thread's ThreadInfo. And the next regular +// call to PollJSSampling on the main thread would only happen once the main +// thread triggers a JS interrupt callback. +// This means that all the JS execution between profiler_start() and the first +// JS interrupt would happen with JS sampling disabled, and we wouldn't get any +// JS function information for that period of time. +// So in order to start JS sampling as soon as possible, we dispatch a runnable +// to the main thread which manually calls PollJSSamplingForCurrentThread(). +// In some cases this runnable will lose the race with the next JS interrupt. +// That's fine; PollJSSamplingForCurrentThread() is immune to redundant calls. 
+static void TriggerPollJSSamplingOnMainThread() { + nsCOMPtr mainThread; + nsresult rv = NS_GetMainThread(getter_AddRefs(mainThread)); + if (NS_SUCCEEDED(rv) && mainThread) { + nsCOMPtr task = + NS_NewRunnableFunction("TriggerPollJSSamplingOnMainThread", + []() { PollJSSamplingForCurrentThread(); }); + SchedulerGroup::Dispatch(TaskCategory::Other, task.forget()); + } +} + +static void locked_profiler_start(PSLockRef aLock, PowerOfTwo32 aCapacity, + double aInterval, uint32_t aFeatures, + const char** aFilters, uint32_t aFilterCount, + uint64_t aActiveTabID, + const Maybe& aDuration) { + TimeStamp profilingStartTime = TimeStamp::Now(); + + if (LOG_TEST) { + LOG("locked_profiler_start"); + LOG("- capacity = %u", unsigned(aCapacity.Value())); + LOG("- duration = %.2f", aDuration ? *aDuration : -1); + LOG("- interval = %.2f", aInterval); + LOG("- tab ID = %" PRIu64, aActiveTabID); + +#define LOG_FEATURE(n_, str_, Name_, desc_) \ + if (ProfilerFeature::Has##Name_(aFeatures)) { \ + LOG("- feature = %s", str_); \ + } + + PROFILER_FOR_EACH_FEATURE(LOG_FEATURE) + +#undef LOG_FEATURE + + for (uint32_t i = 0; i < aFilterCount; i++) { + LOG("- threads = %s", aFilters[i]); + } + } + + MOZ_RELEASE_ASSERT(CorePS::Exists() && !ActivePS::Exists(aLock)); + + // Do this before the Base Profiler is stopped, to keep the existing buffer + // (if any) alive for our use. + if (NS_IsMainThread()) { + mozilla::base_profiler_markers_detail::EnsureBufferForMainThreadAddMarker(); + } else { + NS_DispatchToMainThread( + NS_NewRunnableFunction("EnsureBufferForMainThreadAddMarker", + &mozilla::base_profiler_markers_detail:: + EnsureBufferForMainThreadAddMarker)); + } + + UniquePtr baseChunkManager; + bool profilersHandOver = false; + if (baseprofiler::profiler_is_active()) { + // Note that we still hold the lock, so the sampler cannot run yet and + // interact negatively with the still-active BaseProfiler sampler. + // Assume that Base Profiler is active because of MOZ_PROFILER_STARTUP. + + // Take ownership of the chunk manager from the Base Profiler, to extend its + // lifetime during the new Gecko Profiler session. Since we're using the + // same core buffer, all the base profiler data remains. + baseChunkManager = baseprofiler::detail::ExtractBaseProfilerChunkManager(); + + if (baseChunkManager) { + profilersHandOver = true; + if (const TimeStamp baseProfilingStartTime = + baseprofiler::detail::GetProfilingStartTime(); + !baseProfilingStartTime.IsNull()) { + profilingStartTime = baseProfilingStartTime; + } + + BASE_PROFILER_MARKER_TEXT( + "Profilers handover", PROFILER, MarkerTiming::IntervalStart(), + "Transition from Base to Gecko Profiler, some data may be missing"); + } + + // Now stop Base Profiler (BP), as further recording will be ignored anyway, + // and so that it won't clash with Gecko Profiler (GP) sampling starting + // after the lock is dropped. + // On Linux this is especially important to do before creating the GP + // sampler, because the BP sampler may send a signal (to stop threads to be + // sampled), which the GP would intercept before its own initialization is + // complete and ready to handle such signals. + // Note that even though `profiler_stop()` doesn't immediately destroy and + // join the sampler thread, it safely deactivates it in such a way that the + // thread will soon exit without doing any actual work. + // TODO: Allow non-sampling profiling to continue. + // TODO: Re-start BP after GP shutdown, to capture post-XPCOM shutdown. 
+ baseprofiler::profiler_stop(); + } + +#if defined(GP_PLAT_amd64_windows) + InitializeWin64ProfilerHooks(); +#endif + + // Fall back to the default values if the passed-in values are unreasonable. + // We want to be able to store at least one full stack. + PowerOfTwo32 capacity = + (aCapacity.Value() >= + ProfileBufferChunkManager::scExpectedMaximumStackSize / scBytesPerEntry) + ? aCapacity + : PROFILER_DEFAULT_ENTRIES; + Maybe duration = aDuration; + + if (aDuration && *aDuration <= 0) { + duration = Nothing(); + } + + double interval = aInterval > 0 ? aInterval : PROFILER_DEFAULT_INTERVAL; + + ActivePS::Create(aLock, profilingStartTime, capacity, interval, aFeatures, + aFilters, aFilterCount, aActiveTabID, duration, + std::move(baseChunkManager)); + + // ActivePS::Create can only succeed or crash. + MOZ_ASSERT(ActivePS::Exists(aLock)); + + // Set up profiling for each registered thread, if appropriate. +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + bool isMainThreadBeingProfiled = false; +#endif + ThreadRegistry::LockedRegistry lockedRegistry; + for (ThreadRegistry::OffThreadRef offThreadRef : lockedRegistry) { + const ThreadRegistrationInfo& info = + offThreadRef.UnlockedConstReaderCRef().Info(); + + ThreadProfilingFeatures threadProfilingFeatures = + ActivePS::ProfilingFeaturesForThread(aLock, info); + if (threadProfilingFeatures != ThreadProfilingFeatures::NotProfiled) { + ThreadRegistry::OffThreadRef::RWFromAnyThreadWithLock lockedThreadData = + offThreadRef.GetLockedRWFromAnyThread(); + ProfiledThreadData* profiledThreadData = ActivePS::AddLiveProfiledThread( + aLock, MakeUnique(info)); + lockedThreadData->SetProfilingFeaturesAndData(threadProfilingFeatures, + profiledThreadData, aLock); + lockedThreadData->GetNewCpuTimeInNs(); + if (ActivePS::FeatureJS(aLock)) { + lockedThreadData->StartJSSampling(ActivePS::JSFlags(aLock)); + if (ThreadRegistration::LockedRWOnThread* lockedRWOnThread = + lockedThreadData.GetLockedRWOnThread(); + lockedRWOnThread) { + // We can manually poll the current thread so it starts sampling + // immediately. + lockedRWOnThread->PollJSSampling(); + } else if (info.IsMainThread()) { + // Dispatch a runnable to the main thread to call + // PollJSSampling(), so that we don't have wait for the next JS + // interrupt callback in order to start profiling JS. + TriggerPollJSSamplingOnMainThread(); + } + } +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + if (info.IsMainThread()) { + isMainThreadBeingProfiled = true; + } +#endif + lockedThreadData->ReinitializeOnResume(); + if (ActivePS::FeatureJS(aLock) && lockedThreadData->GetJSContext()) { + profiledThreadData->NotifyReceivedJSContext(0); + } + } + } + + // Setup support for pushing/popping labels in mozglue. + RegisterProfilerLabelEnterExit(MozGlueLabelEnter, MozGlueLabelExit); + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(aLock)) { + int javaInterval = interval; + // Java sampling doesn't accurately keep up with the sampling rate that is + // lower than 1ms. + if (javaInterval < 1) { + javaInterval = 1; + } + + JNIEnv* env = jni::GetEnvForThread(); + const auto& filters = ActivePS::Filters(aLock); + jni::ObjectArray::LocalRef javaFilters = + jni::ObjectArray::New(filters.length()); + for (size_t i = 0; i < filters.length(); i++) { + javaFilters->SetElement(i, jni::StringParam(filters[i].data(), env)); + } + + // Send the interval-relative entry count, but we have 100000 hard cap in + // the java code, it can't be more than that. 
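+ // For instance, with a hypothetical 131072-entry capacity, a 1 ms sampling
+ // interval and the 1 ms Java minimum, this passes
+ // round(131072 * 1.0 / 1) = 131072, which the Java side then clamps to its
+ // 100000-entry cap.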
+ java::GeckoJavaSampler::Start( + javaFilters, javaInterval, + std::round((double)(capacity.Value()) * interval / + (double)(javaInterval))); + } +#endif + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + if (ActivePS::FeatureNativeAllocations(aLock)) { + if (isMainThreadBeingProfiled) { + mozilla::profiler::enable_native_allocations(); + } else { + NS_WARNING( + "The nativeallocations feature is turned on, but the main thread is " + "not being profiled. The allocations are only stored on the main " + "thread."); + } + } +#endif + + if (ProfilerFeature::HasAudioCallbackTracing(aFeatures)) { + StartAudioCallbackTracing(); + } + + // At the very end, set up RacyFeatures. + RacyFeatures::SetActive(ActivePS::Features(aLock)); + + if (profilersHandOver) { + PROFILER_MARKER_UNTYPED("Profilers handover", PROFILER, + MarkerTiming::IntervalEnd()); + } +} + +RefPtr profiler_start(PowerOfTwo32 aCapacity, double aInterval, + uint32_t aFeatures, const char** aFilters, + uint32_t aFilterCount, + uint64_t aActiveTabID, + const Maybe& aDuration) { + LOG("profiler_start"); + + ProfilerParent::ProfilerWillStopIfStarted(); + + SamplerThread* samplerThread = nullptr; + { + PSAutoLock lock; + + // Initialize if necessary. + if (!CorePS::Exists()) { + profiler_init(nullptr); + } + + // Reset the current state if the profiler is running. + if (ActivePS::Exists(lock)) { + // Note: Not invoking callbacks with ProfilingState::Stopping, because + // we're under lock, and also it would not be useful: Any profiling data + // will be discarded, and we're immediately restarting the profiler below + // and then notifying ProfilingState::Started. + samplerThread = locked_profiler_stop(lock); + } + + locked_profiler_start(lock, aCapacity, aInterval, aFeatures, aFilters, + aFilterCount, aActiveTabID, aDuration); + } + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + // Start counting memory allocations (outside of lock because this may call + // profiler_add_sampled_counter which would attempt to take the lock.) + ActivePS::SetMemoryCounter(mozilla::profiler::install_memory_hooks()); +#endif + + invoke_profiler_state_change_callbacks(ProfilingState::Started); + + // We do these operations with gPSMutex unlocked. The comments in + // profiler_stop() explain why. + if (samplerThread) { + Unused << ProfilerParent::ProfilerStopped(); + NotifyObservers("profiler-stopped"); + delete samplerThread; + } + return NotifyProfilerStarted(aCapacity, aDuration, aInterval, aFeatures, + aFilters, aFilterCount, aActiveTabID); +} + +void profiler_ensure_started(PowerOfTwo32 aCapacity, double aInterval, + uint32_t aFeatures, const char** aFilters, + uint32_t aFilterCount, uint64_t aActiveTabID, + const Maybe& aDuration) { + LOG("profiler_ensure_started"); + + ProfilerParent::ProfilerWillStopIfStarted(); + + bool startedProfiler = false; + SamplerThread* samplerThread = nullptr; + { + PSAutoLock lock; + + // Initialize if necessary. + if (!CorePS::Exists()) { + profiler_init(nullptr); + } + + if (ActivePS::Exists(lock)) { + // The profiler is active. + if (!ActivePS::Equals(lock, aCapacity, aDuration, aInterval, aFeatures, + aFilters, aFilterCount, aActiveTabID)) { + // Stop and restart with different settings. + // Note: Not invoking callbacks with ProfilingState::Stopping, because + // we're under lock, and also it would not be useful: Any profiling data + // will be discarded, and we're immediately restarting the profiler + // below and then notifying ProfilingState::Started. 
+ samplerThread = locked_profiler_stop(lock); + locked_profiler_start(lock, aCapacity, aInterval, aFeatures, aFilters, + aFilterCount, aActiveTabID, aDuration); + startedProfiler = true; + } + } else { + // The profiler is stopped. + locked_profiler_start(lock, aCapacity, aInterval, aFeatures, aFilters, + aFilterCount, aActiveTabID, aDuration); + startedProfiler = true; + } + } + + // We do these operations with gPSMutex unlocked. The comments in + // profiler_stop() explain why. + if (samplerThread) { + Unused << ProfilerParent::ProfilerStopped(); + NotifyObservers("profiler-stopped"); + delete samplerThread; + } + + if (startedProfiler) { + invoke_profiler_state_change_callbacks(ProfilingState::Started); + + Unused << NotifyProfilerStarted(aCapacity, aDuration, aInterval, aFeatures, + aFilters, aFilterCount, aActiveTabID); + } +} + +[[nodiscard]] static SamplerThread* locked_profiler_stop(PSLockRef aLock) { + LOG("locked_profiler_stop"); + + MOZ_RELEASE_ASSERT(CorePS::Exists() && ActivePS::Exists(aLock)); + + // At the very start, clear RacyFeatures. + RacyFeatures::SetInactive(); + + if (ActivePS::FeatureAudioCallbackTracing(aLock)) { + StopAudioCallbackTracing(); + } + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(aLock)) { + java::GeckoJavaSampler::Stop(); + } +#endif + + // Remove support for pushing/popping labels in mozglue. + RegisterProfilerLabelEnterExit(nullptr, nullptr); + + // Stop sampling live threads. + ThreadRegistry::LockedRegistry lockedRegistry; + for (ThreadRegistry::OffThreadRef offThreadRef : lockedRegistry) { + if (offThreadRef.UnlockedRWForLockedProfilerRef().ProfilingFeatures() == + ThreadProfilingFeatures::NotProfiled) { + continue; + } + + ThreadRegistry::OffThreadRef::RWFromAnyThreadWithLock lockedThreadData = + offThreadRef.GetLockedRWFromAnyThread(); + + lockedThreadData->ClearProfilingFeaturesAndData(aLock); + + if (ActivePS::FeatureJS(aLock)) { + lockedThreadData->StopJSSampling(); + if (ThreadRegistration::LockedRWOnThread* lockedRWOnThread = + lockedThreadData.GetLockedRWOnThread(); + lockedRWOnThread) { + // We are on the thread, we can manually poll the current thread so it + // stops profiling immediately. + lockedRWOnThread->PollJSSampling(); + } else if (lockedThreadData->Info().IsMainThread()) { + // Dispatch a runnable to the main thread to call PollJSSampling(), + // so that we don't have wait for the next JS interrupt callback in + // order to start profiling JS. + TriggerPollJSSamplingOnMainThread(); + } + } + } + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + if (ActivePS::FeatureNativeAllocations(aLock)) { + mozilla::profiler::disable_native_allocations(); + } +#endif + + // The Stop() call doesn't actually stop Run(); that happens in this + // function's caller when the sampler thread is destroyed. Stop() just gives + // the SamplerThread a chance to do some cleanup with gPSMutex locked. 
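+ // ActivePS::Destroy() hands the SamplerThread back to us so that the caller
+ // can delete (and thereby join) it after gPSMutex has been released.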
+ SamplerThread* samplerThread = ActivePS::Destroy(aLock); + samplerThread->Stop(aLock); + + if (NS_IsMainThread()) { + mozilla::base_profiler_markers_detail:: + ReleaseBufferForMainThreadAddMarker(); + } else { + NS_DispatchToMainThread( + NS_NewRunnableFunction("ReleaseBufferForMainThreadAddMarker", + &mozilla::base_profiler_markers_detail:: + ReleaseBufferForMainThreadAddMarker)); + } + + return samplerThread; +} + +RefPtr profiler_stop() { + LOG("profiler_stop"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + if (profiler_is_active()) { + invoke_profiler_state_change_callbacks(ProfilingState::Stopping); + } + + ProfilerParent::ProfilerWillStopIfStarted(); + +#if defined(MOZ_REPLACE_MALLOC) && defined(MOZ_PROFILER_MEMORY) + // Remove the hooks early, as native allocations (if they are on) can be + // quite expensive. + mozilla::profiler::remove_memory_hooks(); +#endif + + SamplerThread* samplerThread; + { + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return GenericPromise::CreateAndResolve(/* unused */ true, __func__); + } + + samplerThread = locked_profiler_stop(lock); + } + + // We notify observers with gPSMutex unlocked. Otherwise we might get a + // deadlock, if code run by these functions calls a profiler function that + // locks gPSMutex, for example when it wants to insert a marker. + // (This has been seen in practise in bug 1346356, when we were still firing + // these notifications synchronously.) + RefPtr promise = ProfilerParent::ProfilerStopped(); + NotifyObservers("profiler-stopped"); + + // We delete with gPSMutex unlocked. Otherwise we would get a deadlock: we + // would be waiting here with gPSMutex locked for SamplerThread::Run() to + // return so the join operation within the destructor can complete, but Run() + // needs to lock gPSMutex to return. + // + // Because this call occurs with gPSMutex unlocked, it -- including the final + // iteration of Run()'s loop -- must be able detect deactivation and return + // in a way that's safe with respect to other gPSMutex-locking operations + // that may have occurred in the meantime. + delete samplerThread; + + return promise; +} + +bool profiler_is_paused() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return false; + } + + return ActivePS::IsPaused(lock); +} + +/* [[nodiscard]] */ bool profiler_callback_after_sampling( + PostSamplingCallback&& aCallback) { + LOG("profiler_callback_after_sampling"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + return ActivePS::AppendPostSamplingCallback(lock, std::move(aCallback)); +} + +RefPtr profiler_pause() { + LOG("profiler_pause"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + invoke_profiler_state_change_callbacks(ProfilingState::Pausing); + + { + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return GenericPromise::CreateAndResolve(/* unused */ true, __func__); + } + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(lock) && !ActivePS::IsSamplingPaused(lock)) { + // Not paused yet, so this is the first pause, let Java know. + // TODO: Distinguish Pause and PauseSampling in Java. + java::GeckoJavaSampler::PauseSampling(); + } +#endif + + RacyFeatures::SetPaused(); + ActivePS::SetIsPaused(lock, true); + ActivePS::Buffer(lock).AddEntry(ProfileBufferEntry::Pause(profiler_time())); + } + + // gPSMutex must be unlocked when we notify, to avoid potential deadlocks. 
+ RefPtr promise = ProfilerParent::ProfilerPaused(); + NotifyObservers("profiler-paused"); + return promise; +} + +RefPtr profiler_resume() { + LOG("profiler_resume"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + { + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return GenericPromise::CreateAndResolve(/* unused */ true, __func__); + } + + ActivePS::Buffer(lock).AddEntry( + ProfileBufferEntry::Resume(profiler_time())); + ActivePS::SetIsPaused(lock, false); + RacyFeatures::SetUnpaused(); + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(lock) && !ActivePS::IsSamplingPaused(lock)) { + // Not paused anymore, so this is the last unpause, let Java know. + // TODO: Distinguish Unpause and UnpauseSampling in Java. + java::GeckoJavaSampler::UnpauseSampling(); + } +#endif + } + + // gPSMutex must be unlocked when we notify, to avoid potential deadlocks. + RefPtr promise = ProfilerParent::ProfilerResumed(); + NotifyObservers("profiler-resumed"); + + invoke_profiler_state_change_callbacks(ProfilingState::Resumed); + + return promise; +} + +bool profiler_is_sampling_paused() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return false; + } + + return ActivePS::IsSamplingPaused(lock); +} + +RefPtr profiler_pause_sampling() { + LOG("profiler_pause_sampling"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + { + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return GenericPromise::CreateAndResolve(/* unused */ true, __func__); + } + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(lock) && !ActivePS::IsSamplingPaused(lock)) { + // Not paused yet, so this is the first pause, let Java know. + // TODO: Distinguish Pause and PauseSampling in Java. + java::GeckoJavaSampler::PauseSampling(); + } +#endif + + RacyFeatures::SetSamplingPaused(); + ActivePS::SetIsSamplingPaused(lock, true); + ActivePS::Buffer(lock).AddEntry( + ProfileBufferEntry::PauseSampling(profiler_time())); + } + + // gPSMutex must be unlocked when we notify, to avoid potential deadlocks. + RefPtr promise = ProfilerParent::ProfilerPausedSampling(); + NotifyObservers("profiler-paused-sampling"); + return promise; +} + +RefPtr profiler_resume_sampling() { + LOG("profiler_resume_sampling"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + { + PSAutoLock lock; + + if (!ActivePS::Exists(lock)) { + return GenericPromise::CreateAndResolve(/* unused */ true, __func__); + } + + ActivePS::Buffer(lock).AddEntry( + ProfileBufferEntry::ResumeSampling(profiler_time())); + ActivePS::SetIsSamplingPaused(lock, false); + RacyFeatures::SetSamplingUnpaused(); + +#if defined(GP_OS_android) + if (ActivePS::FeatureJava(lock) && !ActivePS::IsSamplingPaused(lock)) { + // Not paused anymore, so this is the last unpause, let Java know. + // TODO: Distinguish Unpause and UnpauseSampling in Java. + java::GeckoJavaSampler::UnpauseSampling(); + } +#endif + } + + // gPSMutex must be unlocked when we notify, to avoid potential deadlocks. + RefPtr promise = ProfilerParent::ProfilerResumedSampling(); + NotifyObservers("profiler-resumed-sampling"); + return promise; +} + +bool profiler_feature_active(uint32_t aFeature) { + // This function runs both on and off the main thread. + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + // This function is hot enough that we use RacyFeatures, not ActivePS. + return RacyFeatures::IsActiveWithFeature(aFeature); +} + +bool profiler_active_without_feature(uint32_t aFeature) { + // This function runs both on and off the main thread. 
+ + // This function is hot enough that we use RacyFeatures, not ActivePS. + return RacyFeatures::IsActiveWithoutFeature(aFeature); +} + +void profiler_write_active_configuration(JSONWriter& aWriter) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + PSAutoLock lock; + ActivePS::WriteActiveConfiguration(lock, aWriter); +} + +void profiler_add_sampled_counter(BaseProfilerCount* aCounter) { + DEBUG_LOG("profiler_add_sampled_counter(%s)", aCounter->mLabel); + PSAutoLock lock; + locked_profiler_add_sampled_counter(lock, aCounter); +} + +void profiler_remove_sampled_counter(BaseProfilerCount* aCounter) { + DEBUG_LOG("profiler_remove_sampled_counter(%s)", aCounter->mLabel); + PSAutoLock lock; + locked_profiler_remove_sampled_counter(lock, aCounter); +} + +ProfilingStack* profiler_register_thread(const char* aName, + void* aGuessStackTop) { + DEBUG_LOG("profiler_register_thread(%s)", aName); + + // This will call `ThreadRegistry::Register()` (see below). + return ThreadRegistration::RegisterThread(aName, aGuessStackTop); +} + +/* static */ +void ThreadRegistry::Register(ThreadRegistration::OnThreadRef aOnThreadRef) { + // Set the thread name (except for the main thread, which is controlled + // elsewhere, and influences the process name on some systems like Linux). + if (!aOnThreadRef.UnlockedConstReaderCRef().Info().IsMainThread()) { + // Make sure we have a nsThread wrapper for the current thread, and that + // NSPR knows its name. + (void)NS_GetCurrentThread(); + NS_SetCurrentThreadName( + aOnThreadRef.UnlockedConstReaderCRef().Info().Name()); + } + + PSAutoLock lock; + + { + RegistryLockExclusive lock{sRegistryMutex}; + MOZ_RELEASE_ASSERT(sRegistryContainer.append(OffThreadRef{aOnThreadRef})); + } + + if (!CorePS::Exists()) { + // CorePS has not been created yet. + // If&when that happens, it will handle already-registered threads then. + return; + } + + (void)locked_register_thread(lock, OffThreadRef{aOnThreadRef}); +} + +void profiler_unregister_thread() { + // This will call `ThreadRegistry::Unregister()` (see below). + ThreadRegistration::UnregisterThread(); +} + +static void locked_unregister_thread( + PSLockRef lock, ThreadRegistration::OnThreadRef aOnThreadRef) { + if (!CorePS::Exists()) { + // This function can be called after the main thread has already shut + // down. + return; + } + + // We don't call StopJSSampling() here; there's no point doing that for a JS + // thread that is in the process of disappearing. 
+ + ThreadRegistration::OnThreadRef::RWOnThreadWithLock lockedThreadData = + aOnThreadRef.GetLockedRWOnThread(); + + ProfiledThreadData* profiledThreadData = + lockedThreadData->GetProfiledThreadData(lock); + lockedThreadData->ClearProfilingFeaturesAndData(lock); + + MOZ_RELEASE_ASSERT( + lockedThreadData->Info().ThreadId() == profiler_current_thread_id(), + "Thread being unregistered has changed its TID"); + + DEBUG_LOG("profiler_unregister_thread: %s", lockedThreadData->Info().Name()); + + if (profiledThreadData && ActivePS::Exists(lock)) { + ActivePS::UnregisterThread(lock, profiledThreadData); + } +} + +/* static */ +void ThreadRegistry::Unregister(ThreadRegistration::OnThreadRef aOnThreadRef) { + PSAutoLock psLock; + locked_unregister_thread(psLock, aOnThreadRef); + + RegistryLockExclusive lock{sRegistryMutex}; + for (OffThreadRef& thread : sRegistryContainer) { + if (thread.IsPointingAt(*aOnThreadRef.mThreadRegistration)) { + sRegistryContainer.erase(&thread); + break; + } + } +} + +void profiler_register_page(uint64_t aTabID, uint64_t aInnerWindowID, + const nsCString& aUrl, + uint64_t aEmbedderInnerWindowID, + bool aIsPrivateBrowsing) { + DEBUG_LOG("profiler_register_page(%" PRIu64 ", %" PRIu64 ", %s, %" PRIu64 + ", %s)", + aTabID, aInnerWindowID, aUrl.get(), aEmbedderInnerWindowID, + aIsPrivateBrowsing ? "true" : "false"); + + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + PSAutoLock lock; + + // When a Browsing context is first loaded, the first url loaded in it will be + // about:blank. Because of that, this call keeps the first non-about:blank + // registration of window and discards the previous one. + RefPtr pageInfo = new PageInformation( + aTabID, aInnerWindowID, aUrl, aEmbedderInnerWindowID, aIsPrivateBrowsing); + CorePS::AppendRegisteredPage(lock, std::move(pageInfo)); + + // After appending the given page to CorePS, look for the expired + // pages and remove them if there are any. + if (ActivePS::Exists(lock)) { + ActivePS::DiscardExpiredPages(lock); + } +} + +void profiler_unregister_page(uint64_t aRegisteredInnerWindowID) { + PSAutoLock lock; + + if (!CorePS::Exists()) { + // This function can be called after the main thread has already shut down. + return; + } + + // During unregistration, if the profiler is active, we have to keep the + // page information since there may be some markers associated with the given + // page. But if profiler is not active. we have no reason to keep the + // page information here because there can't be any marker associated with it. + if (ActivePS::Exists(lock)) { + ActivePS::UnregisterPage(lock, aRegisteredInnerWindowID); + } else { + CorePS::RemoveRegisteredPage(lock, aRegisteredInnerWindowID); + } +} + +void profiler_clear_all_pages() { + { + PSAutoLock lock; + + if (!CorePS::Exists()) { + // This function can be called after the main thread has already shut + // down. + return; + } + + CorePS::ClearRegisteredPages(lock); + if (ActivePS::Exists(lock)) { + ActivePS::ClearUnregisteredPages(lock); + } + } + + // gPSMutex must be unlocked when we notify, to avoid potential deadlocks. 
+ ProfilerParent::ClearAllPages(); +} + +namespace geckoprofiler::markers::detail { + +Maybe profiler_get_inner_window_id_from_docshell( + nsIDocShell* aDocshell) { + Maybe innerWindowID = Nothing(); + if (aDocshell) { + auto outerWindow = aDocshell->GetWindow(); + if (outerWindow) { + auto innerWindow = outerWindow->GetCurrentInnerWindow(); + if (innerWindow) { + innerWindowID = Some(innerWindow->WindowID()); + } + } + } + return innerWindowID; +} + +} // namespace geckoprofiler::markers::detail + +namespace geckoprofiler::markers { + +struct CPUAwakeMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("Awake"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + int64_t aCPUId +#ifdef GP_OS_darwin + , + uint32_t aQoS +#endif +#ifdef GP_OS_windows + , + int32_t aAbsolutePriority, + int32_t aRelativePriority, + int32_t aCurrentPriority +#endif + ) { +#ifndef GP_PLAT_arm64_darwin + aWriter.IntProperty("CPU Id", aCPUId); +#endif +#ifdef GP_OS_windows + if (aAbsolutePriority) { + aWriter.IntProperty("absPriority", aAbsolutePriority); + } + if (aCurrentPriority) { + aWriter.IntProperty("curPriority", aCurrentPriority); + } + aWriter.IntProperty("priority", aRelativePriority); +#endif +#ifdef GP_OS_darwin + const char* QoS = ""; + switch (aQoS) { + case QOS_CLASS_USER_INTERACTIVE: + QoS = "User Interactive"; + break; + case QOS_CLASS_USER_INITIATED: + QoS = "User Initiated"; + break; + case QOS_CLASS_DEFAULT: + QoS = "Default"; + break; + case QOS_CLASS_UTILITY: + QoS = "Utility"; + break; + case QOS_CLASS_BACKGROUND: + QoS = "Background"; + break; + default: + QoS = "Unspecified"; + } + + aWriter.StringProperty("QoS", + ProfilerString8View::WrapNullTerminatedString(QoS)); +#endif + } + + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.AddKeyFormat("CPU Time", MS::Format::Duration); +#ifndef GP_PLAT_arm64_darwin + schema.AddKeyFormat("CPU Id", MS::Format::Integer); + schema.SetTableLabel("Awake - CPU Id = {marker.data.CPU Id}"); +#endif +#ifdef GP_OS_windows + schema.AddKeyLabelFormat("priority", "Relative Thread Priority", + MS::Format::Integer); + schema.AddKeyLabelFormat("absPriority", "Base Thread Priority", + MS::Format::Integer); + schema.AddKeyLabelFormat("curPriority", "Current Thread Priority", + MS::Format::Integer); +#endif +#ifdef GP_OS_darwin + schema.AddKeyLabelFormat("QoS", "Quality of Service", MS::Format::String); +#endif + return schema; + } +}; + +struct CPUAwakeMarkerEnd : public CPUAwakeMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("AwakeEnd"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + int64_t aCPUTimeNs) { + if (aCPUTimeNs) { + constexpr double NS_PER_MS = 1'000'000; + aWriter.DoubleProperty("CPU Time", double(aCPUTimeNs) / NS_PER_MS); + } + } +}; + +} // namespace geckoprofiler::markers + +void profiler_mark_thread_asleep() { + if (!profiler_thread_is_being_profiled_for_markers()) { + return; + } + + uint64_t cpuTimeNs = ThreadRegistration::WithOnThreadRefOr( + [](ThreadRegistration::OnThreadRef aOnThreadRef) { + return aOnThreadRef.UnlockedConstReaderAndAtomicRWRef() + .GetNewCpuTimeInNs(); + }, + 0); + PROFILER_MARKER("Awake", OTHER, MarkerTiming::IntervalEnd(), + CPUAwakeMarkerEnd, cpuTimeNs); +} + +void profiler_thread_sleep() { + profiler_mark_thread_asleep(); + ThreadRegistration::WithOnThreadRef( + [](ThreadRegistration::OnThreadRef 
aOnThreadRef) { + aOnThreadRef.UnlockedConstReaderAndAtomicRWRef().SetSleeping(); + }); +} + +#if defined(GP_OS_windows) +# if !defined(__MINGW32__) +enum { + ThreadBasicInformation, +}; +# endif + +struct THREAD_BASIC_INFORMATION { + NTSTATUS ExitStatus; + PVOID TebBaseAddress; + CLIENT_ID ClientId; + KAFFINITY AffMask; + DWORD Priority; + DWORD BasePriority; +}; +#endif + +static mozilla::Atomic gWakeCount( + 0); + +namespace geckoprofiler::markers { +struct WakeUpCountMarker { + static constexpr Span MarkerTypeName() { + return MakeStringSpan("WakeUpCount"); + } + static void StreamJSONMarkerData(baseprofiler::SpliceableJSONWriter& aWriter, + int32_t aCount, + const ProfilerString8View& aType) { + aWriter.IntProperty("Count", aCount); + aWriter.StringProperty("label", aType); + } + static MarkerSchema MarkerTypeDisplay() { + using MS = MarkerSchema; + MS schema{MS::Location::MarkerChart, MS::Location::MarkerTable}; + schema.AddKeyFormat("Count", MS::Format::Integer); + schema.SetTooltipLabel("{marker.name} - {marker.data.label}"); + schema.SetTableLabel( + "{marker.name} - {marker.data.label}: {marker.data.count}"); + return schema; + } +}; +} // namespace geckoprofiler::markers + +void profiler_record_wakeup_count(const nsACString& aProcessType) { + static uint64_t previousThreadWakeCount = 0; + + uint64_t newWakeups = gWakeCount - previousThreadWakeCount; + if (newWakeups > 0) { + if (newWakeups < std::numeric_limits::max()) { + int32_t newWakeups32 = int32_t(newWakeups); + mozilla::glean::power::total_thread_wakeups.Add(newWakeups32); + mozilla::glean::power::wakeups_per_process_type.Get(aProcessType) + .Add(newWakeups32); + PROFILER_MARKER("Thread Wake-ups", OTHER, {}, WakeUpCountMarker, + newWakeups32, aProcessType); + } + + previousThreadWakeCount += newWakeups; + } + +#ifdef NIGHTLY_BUILD + ThreadRegistry::LockedRegistry lockedRegistry; + for (ThreadRegistry::OffThreadRef offThreadRef : lockedRegistry) { + const ThreadRegistry::UnlockedConstReaderAndAtomicRW& threadData = + offThreadRef.UnlockedConstReaderAndAtomicRWRef(); + threadData.RecordWakeCount(); + } +#endif +} + +void profiler_mark_thread_awake() { + ++gWakeCount; + if (!profiler_thread_is_being_profiled_for_markers()) { + return; + } + + int64_t cpuId = 0; +#if defined(GP_OS_windows) + cpuId = GetCurrentProcessorNumber(); +#elif defined(GP_OS_darwin) +# ifdef GP_PLAT_amd64_darwin + unsigned int eax, ebx, ecx, edx; + __cpuid_count(1, 0, eax, ebx, ecx, edx); + // Check if we have an APIC. 
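+ // (CPUID leaf 1 reports an on-chip local APIC in EDX bit 9; when present,
+ // the initial APIC ID in EBX bits 31:24 identifies the executing CPU.)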
+ if ((edx & (1 << 9))) { + // APIC ID is bits 24-31 of EBX + cpuId = ebx >> 24; + } +# endif +#else + cpuId = sched_getcpu(); +#endif + +#if defined(GP_OS_windows) + LONG priority; + static const auto get_thread_information_fn = + reinterpret_cast(::GetProcAddress( + ::GetModuleHandle(L"Kernel32.dll"), "GetThreadInformation")); + + if (!get_thread_information_fn || + !get_thread_information_fn(GetCurrentThread(), ThreadAbsoluteCpuPriority, + &priority, sizeof(priority))) { + priority = 0; + } + + static const auto nt_query_information_thread_fn = + reinterpret_cast(::GetProcAddress( + ::GetModuleHandle(L"ntdll.dll"), "NtQueryInformationThread")); + + LONG currentPriority = 0; + if (nt_query_information_thread_fn) { + THREAD_BASIC_INFORMATION threadInfo; + auto status = (*nt_query_information_thread_fn)( + GetCurrentThread(), (THREADINFOCLASS)ThreadBasicInformation, + &threadInfo, sizeof(threadInfo), NULL); + if (NT_SUCCESS(status)) { + currentPriority = threadInfo.Priority; + } + } +#endif + PROFILER_MARKER( + "Awake", OTHER, MarkerTiming::IntervalStart(), CPUAwakeMarker, cpuId +#if defined(GP_OS_darwin) + , + qos_class_self() +#endif +#if defined(GP_OS_windows) + , + priority, GetThreadPriority(GetCurrentThread()), currentPriority +#endif + ); +} + +void profiler_thread_wake() { + profiler_mark_thread_awake(); + ThreadRegistration::WithOnThreadRef( + [](ThreadRegistration::OnThreadRef aOnThreadRef) { + aOnThreadRef.UnlockedConstReaderAndAtomicRWRef().SetAwake(); + }); +} + +void profiler_js_interrupt_callback() { + // This function runs on JS threads being sampled. + PollJSSamplingForCurrentThread(); +} + +double profiler_time() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + TimeDuration delta = TimeStamp::Now() - CorePS::ProcessStartTime(); + return delta.ToMilliseconds(); +} + +bool profiler_capture_backtrace_into(ProfileChunkedBuffer& aChunkedBuffer, + StackCaptureOptions aCaptureOptions) { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + if (!profiler_is_active() || + aCaptureOptions == StackCaptureOptions::NoStack) { + return false; + } + + return ThreadRegistration::WithOnThreadRefOr( + [&](ThreadRegistration::OnThreadRef aOnThreadRef) { + mozilla::Maybe maybeFeatures = + RacyFeatures::FeaturesIfActiveAndUnpaused(); + if (!maybeFeatures) { + return false; + } + + ProfileBuffer profileBuffer(aChunkedBuffer); + + Registers regs; +#if defined(HAVE_NATIVE_UNWIND) + REGISTERS_SYNC_POPULATE(regs); +#else + regs.Clear(); +#endif + + DoSyncSample(*maybeFeatures, + aOnThreadRef.UnlockedReaderAndAtomicRWOnThreadCRef(), + TimeStamp::Now(), regs, profileBuffer, aCaptureOptions); + + return true; + }, + // If this was called from a non-registered thread, return false and do no + // more work. This can happen from a memory hook. + false); +} + +UniquePtr profiler_capture_backtrace() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + AUTO_PROFILER_LABEL("profiler_capture_backtrace", PROFILER); + + // Quick is-active and feature check before allocating a buffer. + // If NoMarkerStacks is set, we don't want to capture a backtrace. 
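+ // I.e. bail out (return nullptr) unless the profiler is active and the
+ // NoMarkerStacks feature is not set.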
+ if (!profiler_active_without_feature(ProfilerFeature::NoMarkerStacks)) { + return nullptr; + } + + auto buffer = MakeUnique( + ProfileChunkedBuffer::ThreadSafety::WithoutMutex, + MakeUnique( + ProfileBufferChunkManager::scExpectedMaximumStackSize)); + + if (!profiler_capture_backtrace_into(*buffer, StackCaptureOptions::Full)) { + return nullptr; + } + + return buffer; +} + +UniqueProfilerBacktrace profiler_get_backtrace() { + UniquePtr buffer = profiler_capture_backtrace(); + + if (!buffer) { + return nullptr; + } + + return UniqueProfilerBacktrace( + new ProfilerBacktrace("SyncProfile", std::move(buffer))); +} + +void ProfilerBacktraceDestructor::operator()(ProfilerBacktrace* aBacktrace) { + delete aBacktrace; +} + +bool profiler_is_locked_on_current_thread() { + // This function is used to help users avoid calling `profiler_...` functions + // when the profiler may already have a lock in place, which would prevent a + // 2nd recursive lock (resulting in a crash or a never-ending wait), or a + // deadlock between any two mutexes. So we must return `true` for any of: + // - The main profiler mutex, used by most functions, and/or + // - The buffer mutex, used directly in some functions without locking the + // main mutex, e.g., marker-related functions. + // - The ProfilerParent or ProfilerChild mutex, used to store and process + // buffer chunk updates. + return PSAutoLock::IsLockedOnCurrentThread() || + ThreadRegistry::IsRegistryMutexLockedOnCurrentThread() || + ThreadRegistration::IsDataMutexLockedOnCurrentThread() || + profiler_get_core_buffer().IsThreadSafeAndLockedOnCurrentThread() || + ProfilerParent::IsLockedOnCurrentThread() || + ProfilerChild::IsLockedOnCurrentThread(); +} + +void profiler_set_js_context(JSContext* aCx) { + MOZ_ASSERT(aCx); + ThreadRegistration::WithOnThreadRef( + [&](ThreadRegistration::OnThreadRef aOnThreadRef) { + // The profiler mutex must be locked before the ThreadRegistration's. + PSAutoLock lock; + aOnThreadRef.WithLockedRWOnThread( + [&](ThreadRegistration::LockedRWOnThread& aThreadData) { + aThreadData.SetJSContext(aCx); + + if (!ActivePS::Exists(lock) || !ActivePS::FeatureJS(lock)) { + return; + } + + // This call is on-thread, so we can call PollJSSampling() to + // start JS sampling immediately. + aThreadData.PollJSSampling(); + + if (ProfiledThreadData* profiledThreadData = + aThreadData.GetProfiledThreadData(lock); + profiledThreadData) { + profiledThreadData->NotifyReceivedJSContext( + ActivePS::Buffer(lock).BufferRangeEnd()); + } + }); + }); +} + +void profiler_clear_js_context() { + MOZ_RELEASE_ASSERT(CorePS::Exists()); + + ThreadRegistration::WithOnThreadRef( + [](ThreadRegistration::OnThreadRef aOnThreadRef) { + JSContext* cx = + aOnThreadRef.UnlockedReaderAndAtomicRWOnThreadCRef().GetJSContext(); + if (!cx) { + return; + } + + // The profiler mutex must be locked before the ThreadRegistration's. + PSAutoLock lock; + ThreadRegistration::OnThreadRef::RWOnThreadWithLock lockedThreadData = + aOnThreadRef.GetLockedRWOnThread(); + + if (ProfiledThreadData* profiledThreadData = + lockedThreadData->GetProfiledThreadData(lock); + profiledThreadData && ActivePS::Exists(lock) && + ActivePS::FeatureJS(lock)) { + profiledThreadData->NotifyAboutToLoseJSContext( + cx, CorePS::ProcessStartTime(), ActivePS::Buffer(lock)); + + // Notify the JS context that profiling for this context has + // stopped. Do this by calling StopJSSampling and PollJSSampling + // before nulling out the JSContext. 
+ lockedThreadData->StopJSSampling(); + lockedThreadData->PollJSSampling(); + + lockedThreadData->ClearJSContext(); + + // Tell the thread that we'd like to have JS sampling on this + // thread again, once it gets a new JSContext (if ever). + lockedThreadData->StartJSSampling(ActivePS::JSFlags(lock)); + } else { + // This thread is not being profiled or JS profiling is off, we only + // need to clear the context pointer. + lockedThreadData->ClearJSContext(); + } + }); +} + +static void profiler_suspend_and_sample_thread( + const PSAutoLock* aLockIfAsynchronousSampling, + const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& aThreadData, + JsFrame* aJsFrames, uint32_t aFeatures, ProfilerStackCollector& aCollector, + bool aSampleNative) { + const ThreadRegistrationInfo& info = aThreadData.Info(); + + if (info.IsMainThread()) { + aCollector.SetIsMainThread(); + } + + // Allocate the space for the native stack + NativeStack nativeStack; + + auto collectStack = [&](const Registers& aRegs, const TimeStamp& aNow) { + // The target thread is now suspended. Collect a native backtrace, + // and call the callback. + StackWalkControl* stackWalkControlIfSupported = nullptr; +#if defined(HAVE_FASTINIT_NATIVE_UNWIND) + StackWalkControl stackWalkControl; + if constexpr (StackWalkControl::scIsSupported) { + if (aSampleNative) { + stackWalkControlIfSupported = &stackWalkControl; + } + } +#endif + const uint32_t jsFramesCount = + aJsFrames ? ExtractJsFrames(!aLockIfAsynchronousSampling, aThreadData, + aRegs, aCollector, aJsFrames, + stackWalkControlIfSupported) + : 0; + +#if defined(HAVE_FASTINIT_NATIVE_UNWIND) + if (aSampleNative) { + // We can only use FramePointerStackWalk or MozStackWalk from + // suspend_and_sample_thread as other stackwalking methods may not be + // initialized. +# if defined(USE_FRAME_POINTER_STACK_WALK) + DoFramePointerBacktrace(aThreadData, aRegs, nativeStack, + stackWalkControlIfSupported); +# elif defined(USE_MOZ_STACK_WALK) + DoMozStackWalkBacktrace(aThreadData, aRegs, nativeStack, + stackWalkControlIfSupported); +# else +# error "Invalid configuration" +# endif + + MergeStacks(aFeatures, !aLockIfAsynchronousSampling, aThreadData, aRegs, + nativeStack, aCollector, aJsFrames, jsFramesCount); + } else +#endif + { + MergeStacks(aFeatures, !aLockIfAsynchronousSampling, aThreadData, aRegs, + nativeStack, aCollector, aJsFrames, jsFramesCount); + + aCollector.CollectNativeLeafAddr((void*)aRegs.mPC); + } + }; + + if (!aLockIfAsynchronousSampling) { + // Sampling the current thread, do NOT suspend it! + Registers regs; +#if defined(HAVE_NATIVE_UNWIND) + REGISTERS_SYNC_POPULATE(regs); +#else + regs.Clear(); +#endif + collectStack(regs, TimeStamp::Now()); + } else { + // Suspend, sample, and then resume the target thread. + Sampler sampler(*aLockIfAsynchronousSampling); + TimeStamp now = TimeStamp::Now(); + sampler.SuspendAndSampleAndResumeThread(*aLockIfAsynchronousSampling, + aThreadData, now, collectStack); + + // NOTE: Make sure to disable the sampler before it is destroyed, in + // case the profiler is running at the same time. + sampler.Disable(*aLockIfAsynchronousSampling); + } +} + +// NOTE: aCollector's methods will be called while the target thread is paused. +// Doing things in those methods like allocating -- which may try to claim +// locks -- is a surefire way to deadlock. 
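+// This overload looks up the target thread by id. Passing an unspecified id,
+// or the current thread's id, samples the calling thread directly without
+// suspending it; any other id requires suspending the target thread.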
+void profiler_suspend_and_sample_thread(ProfilerThreadId aThreadId, + uint32_t aFeatures, + ProfilerStackCollector& aCollector, + bool aSampleNative /* = true */) { + if (!aThreadId.IsSpecified() || aThreadId == profiler_current_thread_id()) { + // Sampling the current thread. Get its information from the TLS (no locking + // required.) + ThreadRegistration::WithOnThreadRef( + [&](ThreadRegistration::OnThreadRef aOnThreadRef) { + aOnThreadRef.WithUnlockedReaderAndAtomicRWOnThread( + [&](const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& + aThreadData) { + if (!aThreadData.GetJSContext()) { + // No JSContext, there is no JS frame buffer (and no need for + // it). + profiler_suspend_and_sample_thread( + /* aLockIfAsynchronousSampling = */ nullptr, aThreadData, + /* aJsFrames = */ nullptr, aFeatures, aCollector, + aSampleNative); + } else { + // JSContext is present, we need to lock the thread data to + // access the JS frame buffer. + aOnThreadRef.WithConstLockedRWOnThread( + [&](const ThreadRegistration::LockedRWOnThread& + aLockedThreadData) { + profiler_suspend_and_sample_thread( + /* aLockIfAsynchronousSampling = */ nullptr, + aThreadData, aLockedThreadData.GetJsFrameBuffer(), + aFeatures, aCollector, aSampleNative); + }); + } + }); + }); + } else { + // Lock the profiler before accessing the ThreadRegistry. + PSAutoLock lock; + ThreadRegistry::WithOffThreadRef( + aThreadId, [&](ThreadRegistry::OffThreadRef aOffThreadRef) { + aOffThreadRef.WithLockedRWFromAnyThread( + [&](const ThreadRegistration::UnlockedReaderAndAtomicRWOnThread& + aThreadData) { + JsFrameBuffer& jsFrames = CorePS::JsFrames(lock); + profiler_suspend_and_sample_thread(&lock, aThreadData, jsFrames, + aFeatures, aCollector, + aSampleNative); + }); + }); + } +} + +// END externally visible functions +//////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/core/platform.h b/tools/profiler/core/platform.h new file mode 100644 index 0000000000..59d2c7ff42 --- /dev/null +++ b/tools/profiler/core/platform.h @@ -0,0 +1,381 @@ +// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google, Inc. nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +#ifndef TOOLS_PLATFORM_H_ +#define TOOLS_PLATFORM_H_ + +#include "PlatformMacros.h" + +#include "json/json.h" +#include "mozilla/Atomics.h" +#include "mozilla/BaseProfilerDetail.h" +#include "mozilla/Logging.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/ProfileBufferEntrySerialization.h" +#include "mozilla/ProfileJSONWriter.h" +#include "mozilla/ProfilerUtils.h" +#include "mozilla/ProgressLogger.h" +#include "mozilla/TimeStamp.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Vector.h" +#include "nsString.h" +#include "shared-libraries.h" + +#include +#include +#include + +class ProfilerCodeAddressService; + +namespace mozilla { +struct SymbolTable; +} + +extern mozilla::LazyLogModule gProfilerLog; + +// These are for MOZ_LOG="prof:3" or higher. It's the default logging level for +// the profiler, and should be used sparingly. +#define LOG_TEST MOZ_LOG_TEST(gProfilerLog, mozilla::LogLevel::Info) +#define LOG(arg, ...) \ + MOZ_LOG(gProfilerLog, mozilla::LogLevel::Info, \ + ("[%" PRIu64 "] " arg, \ + uint64_t(profiler_current_process_id().ToNumber()), ##__VA_ARGS__)) + +// These are for MOZ_LOG="prof:4" or higher. It should be used for logging that +// is somewhat more verbose than LOG. +#define DEBUG_LOG_TEST MOZ_LOG_TEST(gProfilerLog, mozilla::LogLevel::Debug) +#define DEBUG_LOG(arg, ...) \ + MOZ_LOG(gProfilerLog, mozilla::LogLevel::Debug, \ + ("[%" PRIu64 "] " arg, \ + uint64_t(profiler_current_process_id().ToNumber()), ##__VA_ARGS__)) + +typedef uint8_t* Address; + +// Stringify the given JSON value, in the most compact format. +// Note: Numbers are limited to a precision of 6 decimal digits, so that +// timestamps in ms have a precision in ns. +Json::String ToCompactString(const Json::Value& aJsonValue); + +// Profiling log stored in a Json::Value. The actual log only exists while the +// profiler is running, and will be inserted at the end of the JSON profile. +class ProfilingLog { + public: + // These will be called by ActivePS when the profiler starts/stops. + static void Init(); + static void Destroy(); + + // Access the profiling log JSON object, in order to modify it. + // Only calls the given function if the profiler is active. + // Thread-safe. But `aF` must not call other locking profiler functions. + // This is intended to capture some internal logging that doesn't belong in + // other places like markers. The log is accessible through the JS console on + // profiler.firefox.com, in the `profile.profilingLog` object; the data format + // is intentionally not defined, and not intended to be shown in the + // front-end. + // Please use caution not to output too much data. + template + static void Access(F&& aF) { + mozilla::baseprofiler::detail::BaseProfilerAutoLock lock{gMutex}; + if (gLog) { + std::forward(aF)(*gLog); + } + } + +#define DURATION_JSON_SUFFIX "_ms" + + // Convert a TimeDuration to the value to be stored in the log. + // Use DURATION_JSON_SUFFIX as suffix in the property name. 
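A small usage sketch for the Access()/Duration() pair described above. The property name is invented, and the lambda parameter type assumes the log object is a Json::Value (its template argument is not shown in this excerpt).

// Hypothetical call site, for illustration only.
void RecordBufferFlushDuration(const mozilla::TimeDuration& aElapsed) {
  ProfilingLog::Access([&](Json::Value& aLog) {
    // DURATION_JSON_SUFFIX ("_ms") marks the property as a duration in ms.
    aLog["bufferFlush" DURATION_JSON_SUFFIX] = ProfilingLog::Duration(aElapsed);
  });
  // When the profiler is inactive this is a no-op: Access() only runs the
  // lambda while the log exists.
}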
+ static Json::Value Duration(const mozilla::TimeDuration& aDuration) { + return Json::Value{aDuration.ToMilliseconds()}; + } + +#define TIMESTAMP_JSON_SUFFIX "_TSms" + + // Convert a TimeStamp to the value to be stored in the log. + // Use TIMESTAMP_JSON_SUFFIX as suffix in the property name. + static Json::Value Timestamp( + const mozilla::TimeStamp& aTimestamp = mozilla::TimeStamp::Now()) { + if (aTimestamp.IsNull()) { + return Json::Value{0.0}; + } + return Duration(aTimestamp - mozilla::TimeStamp::ProcessCreation()); + } + + static bool IsLockedOnCurrentThread(); + + private: + static mozilla::baseprofiler::detail::BaseProfilerMutex gMutex; + static mozilla::UniquePtr gLog; +}; + +// ---------------------------------------------------------------------------- +// Miscellaneous + +// If positive, skip stack-sampling in the sampler thread loop. +// Users should increment it atomically when samplings should be avoided, and +// later decrement it back. Multiple uses can overlap. +// There could be a sampling in progress when this is first incremented, so if +// it is critical to prevent any sampling, lock the profiler mutex instead. +// Relaxed ordering, because it's used to request that the profiler pause +// future sampling; this is not time critical, nor dependent on anything else. +extern mozilla::Atomic gSkipSampling; + +void AppendSharedLibraries(mozilla::JSONWriter& aWriter, + const SharedLibraryInfo& aInfo); + +// Convert the array of strings to a bitfield. +uint32_t ParseFeaturesFromStringArray(const char** aFeatures, + uint32_t aFeatureCount, + bool aIsStartup = false); + +// Add the begin/end 'Awake' markers for the thread. +void profiler_mark_thread_awake(); + +void profiler_mark_thread_asleep(); + +[[nodiscard]] bool profiler_get_profile_json( + SpliceableChunkedJSONWriter& aSpliceableChunkedJSONWriter, + double aSinceTime, bool aIsShuttingDown, + mozilla::ProgressLogger aProgressLogger); + +// Flags to conveniently track various JS instrumentations. +enum class JSInstrumentationFlags { + StackSampling = 0x1, + Allocations = 0x2, +}; + +// Write out the information of the active profiling configuration. +void profiler_write_active_configuration(mozilla::JSONWriter& aWriter); + +// Extract all received exit profiles that have not yet expired (i.e., they +// still intersect with this process' buffer range). +mozilla::Vector profiler_move_exit_profiles(); + +// If the "MOZ_PROFILER_SYMBOLICATE" env-var is set, we return a new +// ProfilerCodeAddressService object to use for local symbolication of profiles. +// This is off by default, and mainly intended for local development. +mozilla::UniquePtr +profiler_code_address_service_for_presymbolication(); + +extern "C" { +// This function is defined in the profiler rust module at +// tools/profiler/rust-helper. mozilla::SymbolTable and CompactSymbolTable +// have identical memory layout. +bool profiler_get_symbol_table(const char* debug_path, const char* breakpad_id, + mozilla::SymbolTable* symbol_table); + +bool profiler_demangle_rust(const char* mangled, char* buffer, size_t len); +} + +// For each running times value, call MACRO(index, name, unit, jsonProperty) +#define PROFILER_FOR_EACH_RUNNING_TIME(MACRO) \ + MACRO(0, ThreadCPU, Delta, threadCPUDelta) + +// This class contains all "running times" such as CPU usage measurements. +// All measurements are listed in `PROFILER_FOR_EACH_RUNNING_TIME` above. +// Each measurement is optional and only takes a value when explicitly set. 
+// Two RunningTimes object may be subtracted, to get the difference between +// known values. +class RunningTimes { + public: + constexpr RunningTimes() = default; + + // Constructor with only a timestamp, useful when no measurements will be + // taken. + constexpr explicit RunningTimes(const mozilla::TimeStamp& aTimeStamp) + : mPostMeasurementTimeStamp(aTimeStamp) {} + + constexpr void Clear() { *this = RunningTimes{}; } + + constexpr bool IsEmpty() const { return mKnownBits == 0; } + + // This should be called right after CPU measurements have been taken. + void SetPostMeasurementTimeStamp(const mozilla::TimeStamp& aTimeStamp) { + mPostMeasurementTimeStamp = aTimeStamp; + } + + const mozilla::TimeStamp& PostMeasurementTimeStamp() const { + return mPostMeasurementTimeStamp; + } + + // Should be filled for any registered thread. + +#define RUNNING_TIME_MEMBER(index, name, unit, jsonProperty) \ + constexpr bool Is##name##unit##Known() const { \ + return (mKnownBits & mGot##name##unit) != 0; \ + } \ + \ + constexpr void Clear##name##unit() { \ + m##name##unit = 0; \ + mKnownBits &= ~mGot##name##unit; \ + } \ + \ + constexpr void Reset##name##unit(uint64_t a##name##unit) { \ + m##name##unit = a##name##unit; \ + mKnownBits |= mGot##name##unit; \ + } \ + \ + constexpr void Set##name##unit(uint64_t a##name##unit) { \ + MOZ_ASSERT(!Is##name##unit##Known(), #name #unit " already set"); \ + Reset##name##unit(a##name##unit); \ + } \ + \ + constexpr mozilla::Maybe Get##name##unit() const { \ + if (Is##name##unit##Known()) { \ + return mozilla::Some(m##name##unit); \ + } \ + return mozilla::Nothing{}; \ + } \ + \ + constexpr mozilla::Maybe GetJson##name##unit() const { \ + if (Is##name##unit##Known()) { \ + return mozilla::Some(ConvertRawToJson(m##name##unit)); \ + } \ + return mozilla::Nothing{}; \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_MEMBER) + +#undef RUNNING_TIME_MEMBER + + // Take values from another RunningTimes. + RunningTimes& TakeFrom(RunningTimes& aOther) { + if (!aOther.IsEmpty()) { +#define RUNNING_TIME_TAKE(index, name, unit, jsonProperty) \ + if (aOther.Is##name##unit##Known()) { \ + Set##name##unit(std::exchange(aOther.m##name##unit, 0)); \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_TAKE) + +#undef RUNNING_TIME_TAKE + + aOther.mKnownBits = 0; + } + return *this; + } + + // Difference from `aBefore` to `this`. Any unknown makes the result unknown. + // PostMeasurementTimeStamp set to `this` PostMeasurementTimeStamp, to keep + // the most recent timestamp associated with the end of the interval over + // which the difference applies. + RunningTimes operator-(const RunningTimes& aBefore) const { + RunningTimes diff; + diff.mPostMeasurementTimeStamp = mPostMeasurementTimeStamp; +#define RUNNING_TIME_SUB(index, name, unit, jsonProperty) \ + if (Is##name##unit##Known() && aBefore.Is##name##unit##Known()) { \ + diff.Set##name##unit(m##name##unit - aBefore.m##name##unit); \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_SUB) + +#undef RUNNING_TIME_SUB + return diff; + } + + private: + friend mozilla::ProfileBufferEntryWriter::Serializer; + friend mozilla::ProfileBufferEntryReader::Deserializer; + + // Platform-dependent. 
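A short sketch of the accessors generated for the single ThreadCPU/Delta entry of PROFILER_FOR_EACH_RUNNING_TIME, and of the subtraction rule described above. It assumes the Get accessor returns mozilla::Maybe<uint64_t> (the template argument is not visible in this excerpt); the numbers are made up.

void ExampleRunningTimesDiff() {
  RunningTimes before;
  before.SetThreadCPUDelta(1000);  // from RUNNING_TIME_MEMBER(0, ThreadCPU, Delta, ...)

  RunningTimes after;
  after.SetThreadCPUDelta(1800);
  after.SetPostMeasurementTimeStamp(mozilla::TimeStamp::Now());

  // Only values known on both sides survive the subtraction; the result keeps
  // the more recent post-measurement timestamp from `after`.
  RunningTimes diff = after - before;
  mozilla::Maybe<uint64_t> cpu = diff.GetThreadCPUDelta();
  MOZ_ASSERT(cpu.isSome() && *cpu == 800);
}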
+ static uint64_t ConvertRawToJson(uint64_t aRawValue); + + mozilla::TimeStamp mPostMeasurementTimeStamp; + + uint32_t mKnownBits = 0u; + +#define RUNNING_TIME_MEMBER(index, name, unit, jsonProperty) \ + static constexpr uint32_t mGot##name##unit = 1u << index; \ + uint64_t m##name##unit = 0; + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_MEMBER) + +#undef RUNNING_TIME_MEMBER +}; + +template <> +struct mozilla::ProfileBufferEntryWriter::Serializer { + static Length Bytes(const RunningTimes& aRunningTimes) { + Length bytes = 0; + +#define RUNNING_TIME_SERIALIZATION_BYTES(index, name, unit, jsonProperty) \ + if (aRunningTimes.Is##name##unit##Known()) { \ + bytes += ULEB128Size(aRunningTimes.m##name##unit); \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_SERIALIZATION_BYTES) + +#undef RUNNING_TIME_SERIALIZATION_BYTES + return ULEB128Size(aRunningTimes.mKnownBits) + bytes; + } + + static void Write(ProfileBufferEntryWriter& aEW, + const RunningTimes& aRunningTimes) { + aEW.WriteULEB128(aRunningTimes.mKnownBits); + +#define RUNNING_TIME_SERIALIZE(index, name, unit, jsonProperty) \ + if (aRunningTimes.Is##name##unit##Known()) { \ + aEW.WriteULEB128(aRunningTimes.m##name##unit); \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_SERIALIZE) + +#undef RUNNING_TIME_SERIALIZE + } +}; + +template <> +struct mozilla::ProfileBufferEntryReader::Deserializer { + static void ReadInto(ProfileBufferEntryReader& aER, + RunningTimes& aRunningTimes) { + aRunningTimes = Read(aER); + } + + static RunningTimes Read(ProfileBufferEntryReader& aER) { + // Start with empty running times, everything is cleared. + RunningTimes times; + + // This sets all the bits into mKnownBits, we don't need to modify it + // further. + times.mKnownBits = aER.ReadULEB128(); + + // For each member that should be known, read its value. +#define RUNNING_TIME_DESERIALIZE(index, name, unit, jsonProperty) \ + if (times.Is##name##unit##Known()) { \ + times.m##name##unit = aER.ReadULEB128(); \ + } + + PROFILER_FOR_EACH_RUNNING_TIME(RUNNING_TIME_DESERIALIZE) + +#undef RUNNING_TIME_DESERIALIZE + + return times; + } +}; + +#endif /* ndef TOOLS_PLATFORM_H_ */ diff --git a/tools/profiler/core/shared-libraries-linux.cc b/tools/profiler/core/shared-libraries-linux.cc new file mode 100644 index 0000000000..2991e64909 --- /dev/null +++ b/tools/profiler/core/shared-libraries-linux.cc @@ -0,0 +1,280 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "shared-libraries.h" + +#define PATH_MAX_TOSTRING(x) #x +#define PATH_MAX_STRING(x) PATH_MAX_TOSTRING(x) +#include +#include +#include +#include +#include +#include +#include "platform.h" +#include "shared-libraries.h" +#include "GeckoProfiler.h" +#include "mozilla/Sprintf.h" +#include "mozilla/Unused.h" +#include "nsDebug.h" +#include "nsNativeCharsetUtils.h" +#include + +#include "common/linux/file_id.h" +#include +#include +#if defined(GP_OS_linux) || defined(GP_OS_android) +# include +#endif +#include + +#if defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd) +# include // dl_phdr_info +#else +# error "Unexpected configuration" +#endif + +#if defined(GP_OS_android) +extern "C" MOZ_EXPORT __attribute__((weak)) int dl_iterate_phdr( + int (*callback)(struct dl_phdr_info* info, size_t size, void* data), + void* data); +#endif + +struct LoadedLibraryInfo { + LoadedLibraryInfo(const char* aName, unsigned long aBaseAddress, + unsigned long aFirstMappingStart, + unsigned long aLastMappingEnd) + : mName(aName), + mBaseAddress(aBaseAddress), + mFirstMappingStart(aFirstMappingStart), + mLastMappingEnd(aLastMappingEnd) {} + + nsCString mName; + unsigned long mBaseAddress; + unsigned long mFirstMappingStart; + unsigned long mLastMappingEnd; +}; + +static nsCString IDtoUUIDString( + const google_breakpad::wasteful_vector& aIdentifier) { + using namespace google_breakpad; + + nsCString uuid; + const std::string str = FileID::ConvertIdentifierToUUIDString(aIdentifier); + uuid.Append(str.c_str(), str.size()); + // This is '0', not '\0', since it represents the breakpad id age. + uuid.Append('0'); + return uuid; +} + +// Return raw Build ID in hex. +static nsCString IDtoString( + const google_breakpad::wasteful_vector& aIdentifier) { + using namespace google_breakpad; + + nsCString uuid; + const std::string str = FileID::ConvertIdentifierToString(aIdentifier); + uuid.Append(str.c_str(), str.size()); + return uuid; +} + +// Get the breakpad Id for the binary file pointed by bin_name +static nsCString getBreakpadId(const char* bin_name) { + using namespace google_breakpad; + + PageAllocator allocator; + auto_wasteful_vector identifier(&allocator); + + FileID file_id(bin_name); + if (file_id.ElfFileIdentifier(identifier)) { + return IDtoUUIDString(identifier); + } + + return ""_ns; +} + +// Get the code Id for the binary file pointed by bin_name +static nsCString getCodeId(const char* bin_name) { + using namespace google_breakpad; + + PageAllocator allocator; + auto_wasteful_vector identifier(&allocator); + + FileID file_id(bin_name); + if (file_id.ElfFileIdentifier(identifier)) { + return IDtoString(identifier); + } + + return ""_ns; +} + +static SharedLibrary SharedLibraryAtPath(const char* path, + unsigned long libStart, + unsigned long libEnd, + unsigned long offset = 0) { + nsAutoString pathStr; + mozilla::Unused << NS_WARN_IF( + NS_FAILED(NS_CopyNativeToUnicode(nsDependentCString(path), pathStr))); + + nsAutoString nameStr = pathStr; + int32_t pos = nameStr.RFindChar('/'); + if (pos != kNotFound) { + nameStr.Cut(0, pos + 1); + } + + return SharedLibrary(libStart, libEnd, offset, getBreakpadId(path), + getCodeId(path), nameStr, pathStr, nameStr, pathStr, + ""_ns, ""); +} + +static int dl_iterate_callback(struct dl_phdr_info* dl_info, size_t size, + void* data) { + auto libInfoList = reinterpret_cast*>(data); + + if (dl_info->dlpi_phnum <= 0) return 0; + + unsigned long baseAddress = dl_info->dlpi_addr; + unsigned long firstMappingStart = -1; + unsigned long 
lastMappingEnd = 0; + + for (size_t i = 0; i < dl_info->dlpi_phnum; i++) { + if (dl_info->dlpi_phdr[i].p_type != PT_LOAD) { + continue; + } + unsigned long start = dl_info->dlpi_addr + dl_info->dlpi_phdr[i].p_vaddr; + unsigned long end = start + dl_info->dlpi_phdr[i].p_memsz; + if (start < firstMappingStart) { + firstMappingStart = start; + } + if (end > lastMappingEnd) { + lastMappingEnd = end; + } + } + + libInfoList->AppendElement(LoadedLibraryInfo( + dl_info->dlpi_name, baseAddress, firstMappingStart, lastMappingEnd)); + + return 0; +} + +SharedLibraryInfo SharedLibraryInfo::GetInfoForSelf() { + SharedLibraryInfo info; + +#if defined(GP_OS_linux) + // We need to find the name of the executable (exeName, exeNameLen) and the + // address of its executable section (exeExeAddr) in the running image. + char exeName[PATH_MAX]; + memset(exeName, 0, sizeof(exeName)); + + ssize_t exeNameLen = readlink("/proc/self/exe", exeName, sizeof(exeName) - 1); + if (exeNameLen == -1) { + // readlink failed for whatever reason. Note this, but keep going. + exeName[0] = '\0'; + exeNameLen = 0; + LOG("SharedLibraryInfo::GetInfoForSelf(): readlink failed"); + } else { + // Assert no buffer overflow. + MOZ_RELEASE_ASSERT(exeNameLen >= 0 && + exeNameLen < static_cast(sizeof(exeName))); + } + + unsigned long exeExeAddr = 0; +#endif + +#if defined(GP_OS_android) + // If dl_iterate_phdr doesn't exist, we give up immediately. + if (!dl_iterate_phdr) { + // On ARM Android, dl_iterate_phdr is provided by the custom linker. + // So if libxul was loaded by the system linker (e.g. as part of + // xpcshell when running tests), it won't be available and we should + // not call it. + return info; + } +#endif + +#if defined(GP_OS_linux) || defined(GP_OS_android) + // Read info from /proc/self/maps. We ignore most of it. + pid_t pid = profiler_current_process_id().ToNumber(); + char path[PATH_MAX]; + SprintfLiteral(path, "/proc/%d/maps", pid); + std::ifstream maps(path); + std::string line; + while (std::getline(maps, line)) { + int ret; + unsigned long start; + unsigned long end; + char perm[6 + 1] = ""; + unsigned long offset; + char modulePath[PATH_MAX + 1] = ""; + ret = sscanf(line.c_str(), + "%lx-%lx %6s %lx %*s %*x %" PATH_MAX_STRING(PATH_MAX) "s\n", + &start, &end, perm, &offset, modulePath); + if (!strchr(perm, 'x')) { + // Ignore non executable entries + continue; + } + if (ret != 5 && ret != 4) { + LOG("SharedLibraryInfo::GetInfoForSelf(): " + "reading /proc/self/maps failed"); + continue; + } + +# if defined(GP_OS_linux) + // Try to establish the main executable's load address. + if (exeNameLen > 0 && strcmp(modulePath, exeName) == 0) { + exeExeAddr = start; + } +# elif defined(GP_OS_android) + // Use /proc/pid/maps to get the dalvik-jit section since it has no + // associated phdrs. + if (0 == strcmp(modulePath, "/dev/ashmem/dalvik-jit-code-cache")) { + info.AddSharedLibrary( + SharedLibraryAtPath(modulePath, start, end, offset)); + if (info.GetSize() > 10000) { + LOG("SharedLibraryInfo::GetInfoForSelf(): " + "implausibly large number of mappings acquired"); + break; + } + } +# endif + } +#endif + + nsTArray libInfoList; + + // We collect the bulk of the library info using dl_iterate_phdr. 
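A standalone sketch (not part of this file) of the dl_iterate_phdr callback protocol relied on just below: the callback runs once per loaded ELF object and returns 0 to keep iterating. Note that the main executable usually reports an empty dlpi_name, which is why the executable's name is re-attached later using the address learned from /proc/self/maps.

#include <link.h>   // dl_iterate_phdr, dl_phdr_info (GNU/BSD extension)
#include <cstdio>

static int CountPhdrs(struct dl_phdr_info* aInfo, size_t /* aSize */, void* aData) {
  int* total = static_cast<int*>(aData);
  *total += aInfo->dlpi_phnum;
  std::printf("'%s' @ %#lx, %d program headers\n", aInfo->dlpi_name,
              static_cast<unsigned long>(aInfo->dlpi_addr),
              int(aInfo->dlpi_phnum));
  return 0;  // 0 = continue; a non-zero return stops the iteration
}

int main() {
  int total = 0;
  dl_iterate_phdr(CountPhdrs, &total);
  std::printf("total program headers: %d\n", total);
  return 0;
}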
+ dl_iterate_phdr(dl_iterate_callback, &libInfoList); + + for (const auto& libInfo : libInfoList) { + info.AddSharedLibrary( + SharedLibraryAtPath(libInfo.mName.get(), libInfo.mFirstMappingStart, + libInfo.mLastMappingEnd, + libInfo.mFirstMappingStart - libInfo.mBaseAddress)); + } + +#if defined(GP_OS_linux) + // Make another pass over the information we just harvested from + // dl_iterate_phdr. If we see a nameless object mapped at what we earlier + // established to be the main executable's load address, attach the + // executable's name to that entry. + for (size_t i = 0; i < info.GetSize(); i++) { + SharedLibrary& lib = info.GetMutableEntry(i); + if (lib.GetStart() <= exeExeAddr && exeExeAddr <= lib.GetEnd() && + lib.GetNativeDebugPath().empty()) { + lib = SharedLibraryAtPath(exeName, lib.GetStart(), lib.GetEnd(), + lib.GetOffset()); + + // We only expect to see one such entry. + break; + } + } +#endif + + return info; +} + +void SharedLibraryInfo::Initialize() { /* do nothing */ +} diff --git a/tools/profiler/core/shared-libraries-macos.cc b/tools/profiler/core/shared-libraries-macos.cc new file mode 100644 index 0000000000..415fda3633 --- /dev/null +++ b/tools/profiler/core/shared-libraries-macos.cc @@ -0,0 +1,211 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "shared-libraries.h" + +#include "ClearOnShutdown.h" +#include "mozilla/StaticMutex.h" +#include "mozilla/Unused.h" +#include "nsNativeCharsetUtils.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Architecture specific abstraction. +#if defined(GP_ARCH_x86) +typedef mach_header platform_mach_header; +typedef segment_command mach_segment_command_type; +# define MACHO_MAGIC_NUMBER MH_MAGIC +# define CMD_SEGMENT LC_SEGMENT +# define seg_size uint32_t +#else +typedef mach_header_64 platform_mach_header; +typedef segment_command_64 mach_segment_command_type; +# define MACHO_MAGIC_NUMBER MH_MAGIC_64 +# define CMD_SEGMENT LC_SEGMENT_64 +# define seg_size uint64_t +#endif + +struct NativeSharedLibrary { + const platform_mach_header* header; + std::string path; +}; +static std::vector* sSharedLibrariesList = nullptr; +static mozilla::StaticMutex sSharedLibrariesMutex MOZ_UNANNOTATED; + +static void SharedLibraryAddImage(const struct mach_header* mh, + intptr_t vmaddr_slide) { + // NOTE: Presumably for backwards-compatibility reasons, this function accepts + // a mach_header even on 64-bit where it ought to be a mach_header_64. We cast + // it to the right type here. + auto header = reinterpret_cast(mh); + + Dl_info info; + if (!dladdr(header, &info)) { + return; + } + + mozilla::StaticMutexAutoLock lock(sSharedLibrariesMutex); + if (!sSharedLibrariesList) { + return; + } + + NativeSharedLibrary lib = {header, info.dli_fname}; + sSharedLibrariesList->push_back(lib); +} + +static void SharedLibraryRemoveImage(const struct mach_header* mh, + intptr_t vmaddr_slide) { + // NOTE: Presumably for backwards-compatibility reasons, this function accepts + // a mach_header even on 64-bit where it ought to be a mach_header_64. We cast + // it to the right type here. 
+ auto header = reinterpret_cast(mh); + + mozilla::StaticMutexAutoLock lock(sSharedLibrariesMutex); + if (!sSharedLibrariesList) { + return; + } + + uint32_t count = sSharedLibrariesList->size(); + for (uint32_t i = 0; i < count; ++i) { + if ((*sSharedLibrariesList)[i].header == header) { + sSharedLibrariesList->erase(sSharedLibrariesList->begin() + i); + return; + } + } +} + +void SharedLibraryInfo::Initialize() { + // NOTE: We intentionally leak this memory here. We're allocating dynamically + // in order to avoid static initializers. + sSharedLibrariesList = new std::vector(); + + _dyld_register_func_for_add_image(SharedLibraryAddImage); + _dyld_register_func_for_remove_image(SharedLibraryRemoveImage); +} + +static void addSharedLibrary(const platform_mach_header* header, + const char* path, SharedLibraryInfo& info) { + const struct load_command* cmd = + reinterpret_cast(header + 1); + + seg_size size = 0; + unsigned long long start = reinterpret_cast(header); + // Find the cmd segment in the macho image. It will contain the offset we care + // about. + const uint8_t* uuid_bytes = nullptr; + for (unsigned int i = 0; + cmd && (i < header->ncmds) && (uuid_bytes == nullptr || size == 0); + ++i) { + if (cmd->cmd == CMD_SEGMENT) { + const mach_segment_command_type* seg = + reinterpret_cast(cmd); + + if (!strcmp(seg->segname, "__TEXT")) { + size = seg->vmsize; + } + } else if (cmd->cmd == LC_UUID) { + const uuid_command* ucmd = reinterpret_cast(cmd); + uuid_bytes = ucmd->uuid; + } + + cmd = reinterpret_cast( + reinterpret_cast(cmd) + cmd->cmdsize); + } + + nsAutoCString uuid; + nsAutoCString breakpadId; + if (uuid_bytes != nullptr) { + uuid.AppendPrintf( + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X" + "%02X", + uuid_bytes[0], uuid_bytes[1], uuid_bytes[2], uuid_bytes[3], + uuid_bytes[4], uuid_bytes[5], uuid_bytes[6], uuid_bytes[7], + uuid_bytes[8], uuid_bytes[9], uuid_bytes[10], uuid_bytes[11], + uuid_bytes[12], uuid_bytes[13], uuid_bytes[14], uuid_bytes[15]); + + // Breakpad id is the same as the uuid but with the additional trailing 0 + // for the breakpad id age. + breakpadId.AppendPrintf( + "%s" + "0" /* breakpad id age */, + uuid.get()); + } + + nsAutoString pathStr; + mozilla::Unused << NS_WARN_IF( + NS_FAILED(NS_CopyNativeToUnicode(nsDependentCString(path), pathStr))); + + nsAutoString nameStr = pathStr; + int32_t pos = nameStr.RFindChar('/'); + if (pos != kNotFound) { + nameStr.Cut(0, pos + 1); + } + + const NXArchInfo* archInfo = + NXGetArchInfoFromCpuType(header->cputype, header->cpusubtype); + + info.AddSharedLibrary(SharedLibrary(start, start + size, 0, breakpadId, uuid, + nameStr, pathStr, nameStr, pathStr, ""_ns, + archInfo ? archInfo->name : "")); +} + +// Translate the statically stored sSharedLibrariesList information into a +// SharedLibraryInfo object. +SharedLibraryInfo SharedLibraryInfo::GetInfoForSelf() { + mozilla::StaticMutexAutoLock lock(sSharedLibrariesMutex); + SharedLibraryInfo sharedLibraryInfo; + + for (auto& info : *sSharedLibrariesList) { + addSharedLibrary(info.header, info.path.c_str(), sharedLibraryInfo); + } + + // Add the entry for dyld itself. + // We only support macOS 10.12+, which corresponds to dyld version 15+. + // dyld version 15 added the dyldPath property. 
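A standalone sketch (not part of this file) of the dyld image-notification hook registered in Initialize() above: an add-image callback fires synchronously for every image that is already loaded at registration time, and again for each later load, which is why sSharedLibrariesList is already populated when GetInfoForSelf() runs.

#include <dlfcn.h>        // dladdr, Dl_info
#include <mach-o/dyld.h>  // _dyld_register_func_for_add_image
#include <cstdio>

static void OnImageAdded(const struct mach_header* aHeader, intptr_t aSlide) {
  Dl_info info;
  if (dladdr(aHeader, &info) && info.dli_fname) {
    std::printf("image %s (slide %#lx)\n", info.dli_fname,
                static_cast<unsigned long>(aSlide));
  }
}

int main() {
  _dyld_register_func_for_add_image(OnImageAdded);
  return 0;
}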
+ task_dyld_info_data_t task_dyld_info; + mach_msg_type_number_t count = TASK_DYLD_INFO_COUNT; + if (task_info(mach_task_self(), TASK_DYLD_INFO, (task_info_t)&task_dyld_info, + &count) != KERN_SUCCESS) { + return sharedLibraryInfo; + } + + struct dyld_all_image_infos* aii = + (struct dyld_all_image_infos*)task_dyld_info.all_image_info_addr; + if (aii->version >= 15) { + const platform_mach_header* header = + reinterpret_cast( + aii->dyldImageLoadAddress); + addSharedLibrary(header, aii->dyldPath, sharedLibraryInfo); + } + + return sharedLibraryInfo; +} diff --git a/tools/profiler/core/shared-libraries-win32.cc b/tools/profiler/core/shared-libraries-win32.cc new file mode 100644 index 0000000000..cb0bcd1f41 --- /dev/null +++ b/tools/profiler/core/shared-libraries-win32.cc @@ -0,0 +1,167 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include + +#include "shared-libraries.h" +#include "nsWindowsHelpers.h" +#include "mozilla/NativeNt.h" +#include "mozilla/WindowsEnumProcessModules.h" +#include "mozilla/WindowsProcessMitigations.h" +#include "mozilla/WindowsVersion.h" +#include "nsPrintfCString.h" + +static bool IsModuleUnsafeToLoad(const nsAString& aModuleName) { +#if defined(_M_AMD64) || defined(_M_IX86) + // Hackaround for Bug 1607574. Nvidia's shim driver nvd3d9wrap[x].dll detours + // LoadLibraryExW and it causes AV when the following conditions are met. + // 1. LoadLibraryExW was called for "detoured.dll" + // 2. nvinit[x].dll was unloaded + // 3. OS version is older than 6.2 +# if defined(_M_AMD64) + LPCWSTR kNvidiaShimDriver = L"nvd3d9wrapx.dll"; + LPCWSTR kNvidiaInitDriver = L"nvinitx.dll"; +# elif defined(_M_IX86) + LPCWSTR kNvidiaShimDriver = L"nvd3d9wrap.dll"; + LPCWSTR kNvidiaInitDriver = L"nvinit.dll"; +# endif + if (aModuleName.LowerCaseEqualsLiteral("detoured.dll") && + !mozilla::IsWin8OrLater() && ::GetModuleHandleW(kNvidiaShimDriver) && + !::GetModuleHandleW(kNvidiaInitDriver)) { + return true; + } +#endif // defined(_M_AMD64) || defined(_M_IX86) + + // Hackaround for Bug 1723868. There is no safe way to prevent the module + // Microsoft's VP9 Video Decoder from being unloaded because mfplat.dll may + // have posted more than one task to unload the module in the work queue + // without calling LoadLibrary. + if (aModuleName.LowerCaseEqualsLiteral("msvp9dec_store.dll")) { + return true; + } + + return false; +} + +void AddSharedLibraryFromModuleInfo(SharedLibraryInfo& sharedLibraryInfo, + const wchar_t* aModulePath, + mozilla::Maybe aModule) { + nsDependentSubstring moduleNameStr( + mozilla::nt::GetLeafName(nsDependentString(aModulePath))); + + // If the module is unsafe to call LoadLibraryEx for, we skip. + if (IsModuleUnsafeToLoad(moduleNameStr)) { + return; + } + + // If EAF+ is enabled, parsing ntdll's PE header causes a crash. + if (mozilla::IsEafPlusEnabled() && + moduleNameStr.LowerCaseEqualsLiteral("ntdll.dll")) { + return; + } + + // Load the module again - to make sure that its handle will remain valid as + // we attempt to read the PDB information from it - or for the first time if + // we only have a path. We want to load the DLL without running the newly + // loaded module's DllMain function, but not as a data file because we want + // to be able to do RVA computations easily. 
Hence, we use the flag + // LOAD_LIBRARY_AS_IMAGE_RESOURCE which ensures that the sections (not PE + // headers) will be relocated by the loader. Otherwise GetPdbInfo() and/or + // GetVersionInfo() can cause a crash. If the original handle |aModule| is + // valid, LoadLibraryEx just increments its refcount. + nsModuleHandle handleLock( + ::LoadLibraryExW(aModulePath, NULL, LOAD_LIBRARY_AS_IMAGE_RESOURCE)); + if (!handleLock) { + return; + } + + mozilla::nt::PEHeaders headers(handleLock.get()); + if (!headers) { + return; + } + + mozilla::Maybe> bounds = headers.GetBounds(); + if (!bounds) { + return; + } + + // Put the original |aModule| into SharedLibrary, but we get debug info + // from |handleLock| as |aModule| might be inaccessible. + const uintptr_t modStart = + aModule.isSome() ? reinterpret_cast(*aModule) + : reinterpret_cast(handleLock.get()); + const uintptr_t modEnd = modStart + bounds->length(); + + nsAutoCString breakpadId; + nsAutoString pdbPathStr; + if (const auto* debugInfo = headers.GetPdbInfo()) { + MOZ_ASSERT(breakpadId.IsEmpty()); + const GUID& pdbSig = debugInfo->pdbSignature; + breakpadId.AppendPrintf( + "%08lX" // m0 + "%04X%04X" // m1,m2 + "%02X%02X%02X%02X%02X%02X%02X%02X" // m3 + "%X", // pdbAge + pdbSig.Data1, pdbSig.Data2, pdbSig.Data3, pdbSig.Data4[0], + pdbSig.Data4[1], pdbSig.Data4[2], pdbSig.Data4[3], pdbSig.Data4[4], + pdbSig.Data4[5], pdbSig.Data4[6], pdbSig.Data4[7], debugInfo->pdbAge); + + // The PDB file name could be different from module filename, + // so report both + // e.g. The PDB for C:\Windows\SysWOW64\ntdll.dll is wntdll.pdb + pdbPathStr = NS_ConvertUTF8toUTF16(debugInfo->pdbFileName); + } + + nsAutoCString codeId; + DWORD timestamp; + DWORD imageSize; + if (headers.GetTimeStamp(timestamp) && headers.GetImageSize(imageSize)) { + codeId.AppendPrintf( + "%08lX" // Uppercase 8 digits of hex timestamp with leading zeroes. 
+ "%lx", // Lowercase hex image size + timestamp, imageSize); + } + + nsAutoCString versionStr; + uint64_t version; + if (headers.GetVersionInfo(version)) { + versionStr.AppendPrintf("%u.%u.%u.%u", + static_cast((version >> 48) & 0xFFFFu), + static_cast((version >> 32) & 0xFFFFu), + static_cast((version >> 16) & 0xFFFFu), + static_cast(version & 0xFFFFu)); + } + + const nsString& pdbNameStr = + PromiseFlatString(mozilla::nt::GetLeafName(pdbPathStr)); + SharedLibrary shlib(modStart, modEnd, + 0, // DLLs are always mapped at offset 0 on Windows + breakpadId, codeId, PromiseFlatString(moduleNameStr), + nsDependentString(aModulePath), pdbNameStr, pdbPathStr, + versionStr, ""); + sharedLibraryInfo.AddSharedLibrary(shlib); +} + +SharedLibraryInfo SharedLibraryInfo::GetInfoForSelf() { + SharedLibraryInfo sharedLibraryInfo; + + auto addSharedLibraryFromModuleInfo = + [&sharedLibraryInfo](const wchar_t* aModulePath, HMODULE aModule) { + AddSharedLibraryFromModuleInfo(sharedLibraryInfo, aModulePath, + mozilla::Some(aModule)); + }; + + mozilla::EnumerateProcessModules(addSharedLibraryFromModuleInfo); + return sharedLibraryInfo; +} + +SharedLibraryInfo SharedLibraryInfo::GetInfoFromPath(const wchar_t* aPath) { + SharedLibraryInfo sharedLibraryInfo; + AddSharedLibraryFromModuleInfo(sharedLibraryInfo, aPath, mozilla::Nothing()); + return sharedLibraryInfo; +} + +void SharedLibraryInfo::Initialize() { /* do nothing */ +} diff --git a/tools/profiler/core/vtune/ittnotify.h b/tools/profiler/core/vtune/ittnotify.h new file mode 100644 index 0000000000..f1d65b3328 --- /dev/null +++ b/tools/profiler/core/vtune/ittnotify.h @@ -0,0 +1,4123 @@ +/* + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + The full GNU General Public License is included in this distribution + in the file called LICENSE.GPL. + + Contact Information: + http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/ + + BSD LICENSE + + Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _ITTNOTIFY_H_ +#define _ITTNOTIFY_H_ + +/** +@file +@brief Public User API functions and types +@mainpage + +The ITT API is used to annotate a user's program with additional information +that can be used by correctness and performance tools. The user inserts +calls in their program. Those calls generate information that is collected +at runtime, and used by Intel(R) Threading Tools. + +@section API Concepts +The following general concepts are used throughout the API. + +@subsection Unicode Support +Many API functions take character string arguments. On Windows, there +are two versions of each such function. The function name is suffixed +by W if Unicode support is enabled, and by A otherwise. Any API function +that takes a character string argument adheres to this convention. + +@subsection Conditional Compilation +Many users prefer having an option to modify ITT API code when linking it +inside their runtimes. ITT API header file provides a mechanism to replace +ITT API function names inside your code with empty strings. To do this, +define the macros INTEL_NO_ITTNOTIFY_API during compilation and remove the +static library from the linker script. + +@subsection Domains +[see domains] +Domains provide a way to separate notification for different modules or +libraries in a program. Domains are specified by dotted character strings, +e.g. TBB.Internal.Control. + +A mechanism (to be specified) is provided to enable and disable +domains. By default, all domains are enabled. +@subsection Named Entities and Instances +Named entities (frames, regions, tasks, and markers) communicate +information about the program to the analysis tools. A named entity often +refers to a section of program code, or to some set of logical concepts +that the programmer wants to group together. + +Named entities relate to the programmer's static view of the program. When +the program actually executes, many instances of a given named entity +may be created. + +The API annotations denote instances of named entities. The actual +named entities are displayed using the analysis tools. In other words, +the named entities come into existence when instances are created. + +Instances of named entities may have instance identifiers (IDs). Some +API calls use instance identifiers to create relationships between +different instances of named entities. Other API calls associate data +with instances of named entities. + +Some named entities must always have instance IDs. In particular, regions +and frames always have IDs. Task and markers need IDs only if the ID is +needed in another API call (such as adding a relation or metadata). + +The lifetime of instance IDs is distinct from the lifetime of +instances. 
This allows various relationships to be specified separate +from the actual execution of instances. This flexibility comes at the +expense of extra API calls. + +The same ID may not be reused for different instances, unless a previous +[ref] __itt_id_destroy call for that ID has been issued. +*/ + +/** @cond exclude_from_documentation */ +#ifndef ITT_OS_WIN +# define ITT_OS_WIN 1 +#endif /* ITT_OS_WIN */ + +#ifndef ITT_OS_LINUX +# define ITT_OS_LINUX 2 +#endif /* ITT_OS_LINUX */ + +#ifndef ITT_OS_MAC +# define ITT_OS_MAC 3 +#endif /* ITT_OS_MAC */ + +#ifndef ITT_OS_FREEBSD +# define ITT_OS_FREEBSD 4 +#endif /* ITT_OS_FREEBSD */ + +#ifndef ITT_OS +# if defined WIN32 || defined _WIN32 +# define ITT_OS ITT_OS_WIN +# elif defined( __APPLE__ ) && defined( __MACH__ ) +# define ITT_OS ITT_OS_MAC +# elif defined( __FreeBSD__ ) +# define ITT_OS ITT_OS_FREEBSD +# else +# define ITT_OS ITT_OS_LINUX +# endif +#endif /* ITT_OS */ + +#ifndef ITT_PLATFORM_WIN +# define ITT_PLATFORM_WIN 1 +#endif /* ITT_PLATFORM_WIN */ + +#ifndef ITT_PLATFORM_POSIX +# define ITT_PLATFORM_POSIX 2 +#endif /* ITT_PLATFORM_POSIX */ + +#ifndef ITT_PLATFORM_MAC +# define ITT_PLATFORM_MAC 3 +#endif /* ITT_PLATFORM_MAC */ + +#ifndef ITT_PLATFORM_FREEBSD +# define ITT_PLATFORM_FREEBSD 4 +#endif /* ITT_PLATFORM_FREEBSD */ + +#ifndef ITT_PLATFORM +# if ITT_OS==ITT_OS_WIN +# define ITT_PLATFORM ITT_PLATFORM_WIN +# elif ITT_OS==ITT_OS_MAC +# define ITT_PLATFORM ITT_PLATFORM_MAC +# elif ITT_OS==ITT_OS_FREEBSD +# define ITT_PLATFORM ITT_PLATFORM_FREEBSD +# else +# define ITT_PLATFORM ITT_PLATFORM_POSIX +# endif +#endif /* ITT_PLATFORM */ + +#if defined(_UNICODE) && !defined(UNICODE) +#define UNICODE +#endif + +#include +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#include +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#include +#if defined(UNICODE) || defined(_UNICODE) +#include +#endif /* UNICODE || _UNICODE */ +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +#ifndef ITTAPI_CDECL +# if ITT_PLATFORM==ITT_PLATFORM_WIN +# define ITTAPI_CDECL __cdecl +# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +# if defined _M_IX86 || defined __i386__ +# define ITTAPI_CDECL __attribute__ ((cdecl)) +# else /* _M_IX86 || __i386__ */ +# define ITTAPI_CDECL /* actual only on x86 platform */ +# endif /* _M_IX86 || __i386__ */ +# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* ITTAPI_CDECL */ + +#ifndef STDCALL +# if ITT_PLATFORM==ITT_PLATFORM_WIN +# define STDCALL __stdcall +# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +# if defined _M_IX86 || defined __i386__ +# define STDCALL __attribute__ ((stdcall)) +# else /* _M_IX86 || __i386__ */ +# define STDCALL /* supported only on x86 platform */ +# endif /* _M_IX86 || __i386__ */ +# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* STDCALL */ + +#define ITTAPI ITTAPI_CDECL +#define LIBITTAPI ITTAPI_CDECL + +/* TODO: Temporary for compatibility! */ +#define ITTAPI_CALL ITTAPI_CDECL +#define LIBITTAPI_CALL ITTAPI_CDECL + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +/* use __forceinline (VC++ specific) */ +#define ITT_INLINE __forceinline +#define ITT_INLINE_ATTRIBUTE /* nothing */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +/* + * Generally, functions are not inlined unless optimization is specified. + * For functions declared inline, this attribute inlines the function even + * if no optimization level was specified. 
+ */ +#ifdef __STRICT_ANSI__ +#define ITT_INLINE static +#define ITT_INLINE_ATTRIBUTE __attribute__((unused)) +#else /* __STRICT_ANSI__ */ +#define ITT_INLINE static inline +#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) +#endif /* __STRICT_ANSI__ */ +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +/** @endcond */ + +#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY +# if ITT_PLATFORM==ITT_PLATFORM_WIN +# pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro") +# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +# warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro" +# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +# include "vtune/legacy/ittnotify.h" +#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */ + +/** @cond exclude_from_documentation */ +/* Helper macro for joining tokens */ +#define ITT_JOIN_AUX(p,n) p##n +#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n) + +#ifdef ITT_MAJOR +#undef ITT_MAJOR +#endif +#ifdef ITT_MINOR +#undef ITT_MINOR +#endif +#define ITT_MAJOR 3 +#define ITT_MINOR 0 + +/* Standard versioning of a token with major and minor version numbers */ +#define ITT_VERSIONIZE(x) \ + ITT_JOIN(x, \ + ITT_JOIN(_, \ + ITT_JOIN(ITT_MAJOR, \ + ITT_JOIN(_, ITT_MINOR)))) + +#ifndef INTEL_ITTNOTIFY_PREFIX +# define INTEL_ITTNOTIFY_PREFIX __itt_ +#endif /* INTEL_ITTNOTIFY_PREFIX */ +#ifndef INTEL_ITTNOTIFY_POSTFIX +# define INTEL_ITTNOTIFY_POSTFIX _ptr_ +#endif /* INTEL_ITTNOTIFY_POSTFIX */ + +#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) +#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX))) + +#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n) +#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n) + +#define ITTNOTIFY_VOID_D0(n,d) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d) +#define ITTNOTIFY_VOID_D1(n,d,x) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x) +#define ITTNOTIFY_VOID_D2(n,d,x,y) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y) +#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z) +#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) +#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) +#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) +#define ITTNOTIFY_DATA_D0(n,d) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d) +#define ITTNOTIFY_DATA_D1(n,d,x) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x) +#define ITTNOTIFY_DATA_D2(n,d,x,y) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y) +#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z) +#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) +#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) +#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 
0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) + +#ifdef ITT_STUB +#undef ITT_STUB +#endif +#ifdef ITT_STUBV +#undef ITT_STUBV +#endif +#define ITT_STUBV(api,type,name,args) \ + typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \ + extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name); +#define ITT_STUB ITT_STUBV +/** @endcond */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** @cond exclude_from_gpa_documentation */ +/** + * @defgroup public Public API + * @{ + * @} + */ + +/** + * @defgroup control Collection Control + * @ingroup public + * General behavior: application continues to run, but no profiling information is being collected + * + * Pausing occurs not only for the current thread but for all process as well as spawned processes + * - Intel(R) Parallel Inspector and Intel(R) Inspector XE: + * - Does not analyze or report errors that involve memory access. + * - Other errors are reported as usual. Pausing data collection in + * Intel(R) Parallel Inspector and Intel(R) Inspector XE + * only pauses tracing and analyzing memory access. + * It does not pause tracing or analyzing threading APIs. + * . + * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE: + * - Does continue to record when new threads are started. + * . + * - Other effects: + * - Possible reduction of runtime overhead. + * . + * @{ + */ +/** @brief Pause collection */ +void ITTAPI __itt_pause(void); +/** @brief Resume collection */ +void ITTAPI __itt_resume(void); +/** @brief Detach collection */ +void ITTAPI __itt_detach(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, pause, (void)) +ITT_STUBV(ITTAPI, void, resume, (void)) +ITT_STUBV(ITTAPI, void, detach, (void)) +#define __itt_pause ITTNOTIFY_VOID(pause) +#define __itt_pause_ptr ITTNOTIFY_NAME(pause) +#define __itt_resume ITTNOTIFY_VOID(resume) +#define __itt_resume_ptr ITTNOTIFY_NAME(resume) +#define __itt_detach ITTNOTIFY_VOID(detach) +#define __itt_detach_ptr ITTNOTIFY_NAME(detach) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_pause() +#define __itt_pause_ptr 0 +#define __itt_resume() +#define __itt_resume_ptr 0 +#define __itt_detach() +#define __itt_detach_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_pause_ptr 0 +#define __itt_resume_ptr 0 +#define __itt_detach_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} control group */ +/** @endcond */ + +/** + * @defgroup threads Threads + * @ingroup public + * Give names to threads + * @{ + */ +/** + * @brief Sets thread name of calling thread + * @param[in] name - name of thread + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_thread_set_nameA(const char *name); +void ITTAPI __itt_thread_set_nameW(const wchar_t *name); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_thread_set_name __itt_thread_set_nameW +# define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr +#else /* UNICODE */ +# define __itt_thread_set_name __itt_thread_set_nameA +# define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_thread_set_name(const char *name); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name)) +ITT_STUBV(ITTAPI, void, 
thread_set_nameW, (const wchar_t *name)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA) +#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA) +#define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW) +#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name) +#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_thread_set_nameA(name) +#define __itt_thread_set_nameA_ptr 0 +#define __itt_thread_set_nameW(name) +#define __itt_thread_set_nameW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_thread_set_name(name) +#define __itt_thread_set_name_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_thread_set_nameA_ptr 0 +#define __itt_thread_set_nameW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_thread_set_name_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @cond exclude_from_gpa_documentation */ + +/** + * @brief Mark current thread as ignored from this point on, for the duration of its existence. + */ +void ITTAPI __itt_thread_ignore(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, thread_ignore, (void)) +#define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore) +#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_thread_ignore() +#define __itt_thread_ignore_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_thread_ignore_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} threads group */ + +/** + * @defgroup suppress Error suppression + * @ingroup public + * General behavior: application continues to run, but errors are suppressed + * + * @{ + */ + +/*****************************************************************//** + * @name group of functions used for error suppression in correctness tools + *********************************************************************/ +/** @{ */ +/** + * @hideinitializer + * @brief possible value for suppression mask + */ +#define __itt_suppress_all_errors 0x7fffffff + +/** + * @hideinitializer + * @brief possible value for suppression mask (suppresses errors from threading analysis) + */ +#define __itt_suppress_threading_errors 0x000000ff + +/** + * @hideinitializer + * @brief possible value for suppression mask (suppresses errors from memory analysis) + */ +#define __itt_suppress_memory_errors 0x0000ff00 + +/** + * @brief Start suppressing errors identified in mask on this thread + */ +void ITTAPI __itt_suppress_push(unsigned int mask); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask)) +#define __itt_suppress_push ITTNOTIFY_VOID(suppress_push) +#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define 
__itt_suppress_push(mask) +#define __itt_suppress_push_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_suppress_push_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Undo the effects of the matching call to __itt_suppress_push + */ +void ITTAPI __itt_suppress_pop(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, suppress_pop, (void)) +#define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop) +#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_suppress_pop() +#define __itt_suppress_pop_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_suppress_pop_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @enum __itt_model_disable + * @brief Enumerator for the disable methods + */ +typedef enum __itt_suppress_mode { + __itt_unsuppress_range, + __itt_suppress_range +} __itt_suppress_mode_t; + +/** + * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask + */ +void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) +#define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range) +#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_suppress_mark_range(mask) +#define __itt_suppress_mark_range_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_suppress_mark_range_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If not matching + * call is found, nothing is changed. + */ +void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) +#define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range) +#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_suppress_clear_range(mask) +#define __itt_suppress_clear_range_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_suppress_clear_range_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} */ +/** @} suppress group */ + +/** + * @defgroup sync Synchronization + * @ingroup public + * Indicate user-written synchronization code + * @{ + */ +/** + * @hideinitializer + * @brief possible value of attribute argument for sync object type + */ +#define __itt_attr_barrier 1 + +/** + * @hideinitializer + * @brief possible value of attribute argument for sync object type + */ +#define __itt_attr_mutex 2 + +/** +@brief Name a synchronization object +@param[in] addr Handle for the synchronization object. You should +use a real address to uniquely identify the synchronization object. +@param[in] objtype null-terminated object type string. 
If NULL is +passed, the name will be "User Synchronization". +@param[in] objname null-terminated object name string. If NULL, +no name will be assigned to the object. +@param[in] attribute one of [#__itt_attr_barrier, #__itt_attr_mutex] + */ + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_sync_createA(void *addr, const char *objtype, const char *objname, int attribute); +void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_sync_create __itt_sync_createW +# define __itt_sync_create_ptr __itt_sync_createW_ptr +#else /* UNICODE */ +# define __itt_sync_create __itt_sync_createA +# define __itt_sync_create_ptr __itt_sync_createA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute)) +ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const char* objname, int attribute)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_createA ITTNOTIFY_VOID(sync_createA) +#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA) +#define __itt_sync_createW ITTNOTIFY_VOID(sync_createW) +#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_create ITTNOTIFY_VOID(sync_create) +#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_createA(addr, objtype, objname, attribute) +#define __itt_sync_createA_ptr 0 +#define __itt_sync_createW(addr, objtype, objname, attribute) +#define __itt_sync_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_create(addr, objtype, objname, attribute) +#define __itt_sync_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_createA_ptr 0 +#define __itt_sync_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** +@brief Rename a synchronization object + +You can use the rename call to assign or reassign a name to a given +synchronization object. +@param[in] addr handle for the synchronization object. +@param[in] name null-terminated object name string. 
+*/ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_sync_renameA(void *addr, const char *name); +void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_sync_rename __itt_sync_renameW +# define __itt_sync_rename_ptr __itt_sync_renameW_ptr +#else /* UNICODE */ +# define __itt_sync_rename __itt_sync_renameA +# define __itt_sync_rename_ptr __itt_sync_renameA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_sync_rename(void *addr, const char *name); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name)) +ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA) +#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA) +#define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW) +#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_rename ITTNOTIFY_VOID(sync_rename) +#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_renameA(addr, name) +#define __itt_sync_renameA_ptr 0 +#define __itt_sync_renameW(addr, name) +#define __itt_sync_renameW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_rename(addr, name) +#define __itt_sync_rename_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_sync_renameA_ptr 0 +#define __itt_sync_renameW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_sync_rename_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + @brief Destroy a synchronization object. + @param addr Handle for the synchronization object. 
+ */ +void ITTAPI __itt_sync_destroy(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr)) +#define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy) +#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_sync_destroy(addr) +#define __itt_sync_destroy_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_sync_destroy_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/*****************************************************************//** + * @name group of functions is used for performance measurement tools + *********************************************************************/ +/** @{ */ +/** + * @brief Enter spin loop on user-defined sync object + */ +void ITTAPI __itt_sync_prepare(void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr)) +#define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare) +#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_sync_prepare(addr) +#define __itt_sync_prepare_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_sync_prepare_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Quit spin loop without acquiring spin object + */ +void ITTAPI __itt_sync_cancel(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr)) +#define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel) +#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_sync_cancel(addr) +#define __itt_sync_cancel_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_sync_cancel_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Successful spin loop completion (sync object acquired) + */ +void ITTAPI __itt_sync_acquired(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr)) +#define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired) +#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_sync_acquired(addr) +#define __itt_sync_acquired_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_sync_acquired_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Start sync object releasing code. Is called before the lock release call. 
+ */ +void ITTAPI __itt_sync_releasing(void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr)) +#define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing) +#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_sync_releasing(addr) +#define __itt_sync_releasing_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_sync_releasing_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} */ + +/** @} sync group */ + +/**************************************************************//** + * @name group of functions is used for correctness checking tools + ******************************************************************/ +/** @{ */ +/** + * @ingroup legacy + * @deprecated Legacy API + * @brief Fast synchronization which does no require spinning. + * - This special function is to be used by TBB and OpenMP libraries only when they know + * there is no spin but they need to suppress TC warnings about shared variable modifications. + * - It only has corresponding pointers in static library and does not have corresponding function + * in dynamic library. + * @see void __itt_sync_prepare(void* addr); + */ +void ITTAPI __itt_fsync_prepare(void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr)) +#define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare) +#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_fsync_prepare(addr) +#define __itt_fsync_prepare_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_fsync_prepare_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup legacy + * @deprecated Legacy API + * @brief Fast synchronization which does no require spinning. + * - This special function is to be used by TBB and OpenMP libraries only when they know + * there is no spin but they need to suppress TC warnings about shared variable modifications. + * - It only has corresponding pointers in static library and does not have corresponding function + * in dynamic library. + * @see void __itt_sync_cancel(void *addr); + */ +void ITTAPI __itt_fsync_cancel(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr)) +#define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel) +#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_fsync_cancel(addr) +#define __itt_fsync_cancel_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_fsync_cancel_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup legacy + * @deprecated Legacy API + * @brief Fast synchronization which does no require spinning. + * - This special function is to be used by TBB and OpenMP libraries only when they know + * there is no spin but they need to suppress TC warnings about shared variable modifications. + * - It only has corresponding pointers in static library and does not have corresponding function + * in dynamic library. 
+ * @see void __itt_sync_acquired(void *addr); + */ +void ITTAPI __itt_fsync_acquired(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr)) +#define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired) +#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_fsync_acquired(addr) +#define __itt_fsync_acquired_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_fsync_acquired_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup legacy + * @deprecated Legacy API + * @brief Fast synchronization which does no require spinning. + * - This special function is to be used by TBB and OpenMP libraries only when they know + * there is no spin but they need to suppress TC warnings about shared variable modifications. + * - It only has corresponding pointers in static library and does not have corresponding function + * in dynamic library. + * @see void __itt_sync_releasing(void* addr); + */ +void ITTAPI __itt_fsync_releasing(void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr)) +#define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing) +#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_fsync_releasing(addr) +#define __itt_fsync_releasing_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_fsync_releasing_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} */ + +/** + * @defgroup model Modeling by Intel(R) Parallel Advisor + * @ingroup public + * This is the subset of itt used for modeling by Intel(R) Parallel Advisor. + * This API is called ONLY using annotate.h, by "Annotation" macros + * the user places in their sources during the parallelism modeling steps. + * + * site_begin/end and task_begin/end take the address of handle variables, + * which are writeable by the API. Handles must be 0 initialized prior + * to the first call to begin, or may cause a run-time failure. + * The handles are initialized in a multi-thread safe way by the API if + * the handle is 0. The commonly expected idiom is one static handle to + * identify a site or task. If a site or task of the same name has already + * been started during this collection, the same handle MAY be returned, + * but is not required to be - it is unspecified if data merging is done + * based on name. These routines also take an instance variable. Like + * the lexical instance, these must be 0 initialized. Unlike the lexical + * instance, this is used to track a single dynamic instance. + * + * API used by the Intel(R) Parallel Advisor to describe potential concurrency + * and related activities. User-added source annotations expand to calls + * to these procedures to enable modeling of a hypothetical concurrent + * execution serially. 
+ * @{ + */ +#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL) + +typedef void* __itt_model_site; /*!< @brief handle for lexical site */ +typedef void* __itt_model_site_instance; /*!< @brief handle for dynamic instance */ +typedef void* __itt_model_task; /*!< @brief handle for lexical site */ +typedef void* __itt_model_task_instance; /*!< @brief handle for dynamic instance */ + +/** + * @enum __itt_model_disable + * @brief Enumerator for the disable methods + */ +typedef enum { + __itt_model_disable_observation, + __itt_model_disable_collection +} __itt_model_disable; + +#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */ + +/** + * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support. + * + * site_begin/end model a potential concurrency site. + * site instances may be recursively nested with themselves. + * site_end exits the most recently started but unended site for the current + * thread. The handle passed to end may be used to validate structure. + * Instances of a site encountered on different threads concurrently + * are considered completely distinct. If the site name for two different + * lexical sites match, it is unspecified whether they are treated as the + * same or different for data presentation. + */ +void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name); +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_model_site_beginW(const wchar_t *name); +#endif +void ITTAPI __itt_model_site_beginA(const char *name); +void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen); +void ITTAPI __itt_model_site_end (__itt_model_site *site, __itt_model_site_instance *instance); +void ITTAPI __itt_model_site_end_2(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name)) +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name)) +#endif +ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name)) +ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t siteNameLen)) +ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance)) +ITT_STUBV(ITTAPI, void, model_site_end_2, (void)) +#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin) +#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin) +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW) +#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW) +#endif +#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA) +#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA) +#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL) +#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL) +#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end) +#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end) +#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2) +#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_site_begin(site, instance, name) +#define __itt_model_site_begin_ptr 0 +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_site_beginW(name) +#define __itt_model_site_beginW_ptr 0 +#endif +#define __itt_model_site_beginA(name) +#define 
__itt_model_site_beginA_ptr 0 +#define __itt_model_site_beginAL(name, siteNameLen) +#define __itt_model_site_beginAL_ptr 0 +#define __itt_model_site_end(site, instance) +#define __itt_model_site_end_ptr 0 +#define __itt_model_site_end_2() +#define __itt_model_site_end_2_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_site_begin_ptr 0 +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_site_beginW_ptr 0 +#endif +#define __itt_model_site_beginA_ptr 0 +#define __itt_model_site_beginAL_ptr 0 +#define __itt_model_site_end_ptr 0 +#define __itt_model_site_end_2_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support + * + * task_begin/end model a potential task, which is contained within the most + * closely enclosing dynamic site. task_end exits the most recently started + * but unended task. The handle passed to end may be used to validate + * structure. It is unspecified if bad dynamic nesting is detected. If it + * is, it should be encoded in the resulting data collection. The collector + * should not fail due to construct nesting issues, nor attempt to directly + * indicate the problem. + */ +void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name); +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_model_task_beginW(const wchar_t *name); +void ITTAPI __itt_model_iteration_taskW(const wchar_t *name); +#endif +void ITTAPI __itt_model_task_beginA(const char *name); +void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen); +void ITTAPI __itt_model_iteration_taskA(const char *name); +void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen); +void ITTAPI __itt_model_task_end (__itt_model_task *task, __itt_model_task_instance *instance); +void ITTAPI __itt_model_task_end_2(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name)) +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name)) +ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name)) +#endif +ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name)) +ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t taskNameLen)) +ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name)) +ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t taskNameLen)) +ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance)) +ITT_STUBV(ITTAPI, void, model_task_end_2, (void)) +#define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin) +#define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin) +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW) +#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW) +#define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW) +#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW) +#endif +#define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA) +#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA) +#define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL) +#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL) +#define 
__itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA) +#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA) +#define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL) +#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL) +#define __itt_model_task_end ITTNOTIFY_VOID(model_task_end) +#define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end) +#define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2) +#define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_task_begin(task, instance, name) +#define __itt_model_task_begin_ptr 0 +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_task_beginW(name) +#define __itt_model_task_beginW_ptr 0 +#endif +#define __itt_model_task_beginA(name) +#define __itt_model_task_beginA_ptr 0 +#define __itt_model_task_beginAL(name, siteNameLen) +#define __itt_model_task_beginAL_ptr 0 +#define __itt_model_iteration_taskA(name) +#define __itt_model_iteration_taskA_ptr 0 +#define __itt_model_iteration_taskAL(name, siteNameLen) +#define __itt_model_iteration_taskAL_ptr 0 +#define __itt_model_task_end(task, instance) +#define __itt_model_task_end_ptr 0 +#define __itt_model_task_end_2() +#define __itt_model_task_end_2_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_task_begin_ptr 0 +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_model_task_beginW_ptr 0 +#endif +#define __itt_model_task_beginA_ptr 0 +#define __itt_model_task_beginAL_ptr 0 +#define __itt_model_iteration_taskA_ptr 0 +#define __itt_model_iteration_taskAL_ptr 0 +#define __itt_model_task_end_ptr 0 +#define __itt_model_task_end_2_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support + * + * lock_acquire/release model a potential lock for both lockset and + * performance modeling. Each unique address is modeled as a separate + * lock, with invalid addresses being valid lock IDs. Specifically: + * no storage is accessed by the API at the specified address - it is only + * used for lock identification. Lock acquires may be self-nested and are + * unlocked by a corresponding number of releases. + * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing, + * but may not have identical semantics.) 
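 *
 * Editorial aside, not part of the upstream header: a hedged sketch, assuming
 * a hypothetical user mutex m whose address serves purely as the lock ID:
 *
 *   my_mutex_lock(&m);                  // hypothetical user lock
 *   __itt_model_lock_acquire(&m);       // model: lock &m is now held
 *   // ... protected work ...
 *   __itt_model_lock_release(&m);       // model: lock &m released
 *   my_mutex_unlock(&m);
 *
 * No storage at &m is touched by these calls; the address only names the lock.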
+ */ +void ITTAPI __itt_model_lock_acquire(void *lock); +void ITTAPI __itt_model_lock_acquire_2(void *lock); +void ITTAPI __itt_model_lock_release(void *lock); +void ITTAPI __itt_model_lock_release_2(void *lock); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock)) +ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock)) +ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock)) +ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock)) +#define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire) +#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire) +#define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2) +#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2) +#define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release) +#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release) +#define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2) +#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_lock_acquire(lock) +#define __itt_model_lock_acquire_ptr 0 +#define __itt_model_lock_acquire_2(lock) +#define __itt_model_lock_acquire_2_ptr 0 +#define __itt_model_lock_release(lock) +#define __itt_model_lock_release_ptr 0 +#define __itt_model_lock_release_2(lock) +#define __itt_model_lock_release_2_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_lock_acquire_ptr 0 +#define __itt_model_lock_acquire_2_ptr 0 +#define __itt_model_lock_release_ptr 0 +#define __itt_model_lock_release_2_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support + * + * record_allocation/deallocation describe user-defined memory allocator + * behavior, which may be required for correctness modeling to understand + * when storage is not expected to be actually reused across threads. 
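 *
 * Editorial aside, not part of the upstream header: a hedged sketch for a
 * hypothetical pool allocator, so the tool can tell when pool storage is
 * legitimately reused rather than racily shared:
 *
 *   void* p = my_pool_alloc(pool, size);     // hypothetical allocator
 *   __itt_model_record_allocation(p, size);
 *   // ... use p ...
 *   __itt_model_record_deallocation(p);
 *   my_pool_free(pool, p);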
+ */ +void ITTAPI __itt_model_record_allocation (void *addr, size_t size); +void ITTAPI __itt_model_record_deallocation(void *addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size)) +ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr)) +#define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation) +#define __itt_model_record_allocation_ptr ITTNOTIFY_NAME(model_record_allocation) +#define __itt_model_record_deallocation ITTNOTIFY_VOID(model_record_deallocation) +#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_record_allocation(addr, size) +#define __itt_model_record_allocation_ptr 0 +#define __itt_model_record_deallocation(addr) +#define __itt_model_record_deallocation_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_record_allocation_ptr 0 +#define __itt_model_record_deallocation_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_INDUCTION_USES support + * + * Note particular storage is inductive through the end of the current site + */ +void ITTAPI __itt_model_induction_uses(void* addr, size_t size); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size)) +#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses) +#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_induction_uses(addr, size) +#define __itt_model_induction_uses_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_induction_uses_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_REDUCTION_USES support + * + * Note particular storage is used for reduction through the end + * of the current site + */ +void ITTAPI __itt_model_reduction_uses(void* addr, size_t size); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size)) +#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses) +#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_reduction_uses(addr, size) +#define __itt_model_reduction_uses_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_reduction_uses_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_OBSERVE_USES support + * + * Have correctness modeling record observations about uses of storage + * through the end of the current site + */ +void ITTAPI __itt_model_observe_uses(void* addr, size_t size); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size)) +#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses) +#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_observe_uses(addr, size) +#define __itt_model_observe_uses_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define 
__itt_model_observe_uses_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_CLEAR_USES support + * + * Clear the special handling of a piece of storage related to induction, + * reduction or observe_uses + */ +void ITTAPI __itt_model_clear_uses(void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr)) +#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses) +#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_clear_uses(addr) +#define __itt_model_clear_uses_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_clear_uses_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support + * + * disable_push/disable_pop push and pop disabling based on a parameter. + * Disabling observations stops processing of memory references during + * correctness modeling, and all annotations that occur in the disabled + * region. This allows description of code that is expected to be handled + * specially during conversion to parallelism or that is not recognized + * by tools (e.g. some kinds of synchronization operations.) + * This mechanism causes all annotations in the disabled region, other + * than disable_push and disable_pop, to be ignored. (For example, this + * might validly be used to disable an entire parallel site and the contained + * tasks and locking in it for data collection purposes.) + * The disable for collection is a more expensive operation, but reduces + * collector overhead significantly. This applies to BOTH correctness data + * collection and performance data collection. For example, a site + * containing a task might only enable data collection for the first 10 + * iterations. Both performance and correctness data should reflect this, + * and the program should run as close to full speed as possible when + * collection is disabled. 
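 *
 * Editorial aside, not part of the upstream header: a hedged sketch of the
 * "first 10 iterations" pattern mentioned above, with process() standing in
 * for hypothetical per-iteration work:
 *
 *   for (int i = 0; i < n; i++) {
 *     if (i == 10)
 *       __itt_model_disable_push(__itt_model_disable_collection);
 *     process(i);
 *   }
 *   if (n > 10)
 *     __itt_model_disable_pop();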
+ */ +void ITTAPI __itt_model_disable_push(__itt_model_disable x); +void ITTAPI __itt_model_disable_pop(void); +void ITTAPI __itt_model_aggregate_task(size_t x); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x)) +ITT_STUBV(ITTAPI, void, model_disable_pop, (void)) +ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x)) +#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push) +#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push) +#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop) +#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop) +#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task) +#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_model_disable_push(x) +#define __itt_model_disable_push_ptr 0 +#define __itt_model_disable_pop() +#define __itt_model_disable_pop_ptr 0 +#define __itt_model_aggregate_task(x) +#define __itt_model_aggregate_task_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_model_disable_push_ptr 0 +#define __itt_model_disable_pop_ptr 0 +#define __itt_model_aggregate_task_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} model group */ + +/** + * @defgroup heap Heap + * @ingroup public + * Heap group + * @{ + */ + +typedef void* __itt_heap_function; + +/** + * @brief Create an identification for heap function + * @return non-zero identifier or NULL + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_heap_function ITTAPI __itt_heap_function_createA(const char* name, const char* domain); +__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_heap_function_create __itt_heap_function_createW +# define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr +#else +# define __itt_heap_function_create __itt_heap_function_createA +# define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char* name, const char* domain)) +ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* name, const char* domain)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA) +#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA) +#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW) +#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create) +#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ 
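/* Editorial aside, not part of the upstream header: a hedged sketch of how a
 * custom allocator might use the heap group, assuming a hypothetical wrapper
 * my_malloc backed by a hypothetical real_malloc; the allocate begin/end
 * entry points are declared just below, and my_malloc_fn would be created
 * once at startup with __itt_heap_function_create("my_malloc", "my_allocator").
 *
 *   static __itt_heap_function my_malloc_fn;
 *
 *   void* my_malloc(size_t size) {
 *     void* p;
 *     __itt_heap_allocate_begin(my_malloc_fn, size, 0);   // 0: uninitialized
 *     p = real_malloc(size);
 *     __itt_heap_allocate_end(my_malloc_fn, &p, size, 0);
 *     return p;
 *   }
 */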
+#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0 +#define __itt_heap_function_createA_ptr 0 +#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0 +#define __itt_heap_function_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_heap_function_create(name, domain) (__itt_heap_function)0 +#define __itt_heap_function_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_heap_function_createA_ptr 0 +#define __itt_heap_function_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_heap_function_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an allocation begin occurrence. + */ +void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized)) +#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin) +#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_allocate_begin(h, size, initialized) +#define __itt_heap_allocate_begin_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_allocate_begin_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an allocation end occurrence. + */ +void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized)) +#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end) +#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_allocate_end(h, addr, size, initialized) +#define __itt_heap_allocate_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_allocate_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an free begin occurrence. + */ +void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr)) +#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin) +#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_free_begin(h, addr) +#define __itt_heap_free_begin_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_free_begin_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an free end occurrence. 
+ */ +void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr)) +#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end) +#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_free_end(h, addr) +#define __itt_heap_free_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_free_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an reallocation begin occurrence. + */ +void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized)) +#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin) +#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_reallocate_begin(h, addr, new_size, initialized) +#define __itt_heap_reallocate_begin_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_reallocate_begin_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an reallocation end occurrence. + */ +void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized)) +#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end) +#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized) +#define __itt_heap_reallocate_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_reallocate_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @brief internal access begin */ +void ITTAPI __itt_heap_internal_access_begin(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void)) +#define __itt_heap_internal_access_begin ITTNOTIFY_VOID(heap_internal_access_begin) +#define __itt_heap_internal_access_begin_ptr ITTNOTIFY_NAME(heap_internal_access_begin) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_internal_access_begin() +#define __itt_heap_internal_access_begin_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_internal_access_begin_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @brief internal access end */ +void ITTAPI __itt_heap_internal_access_end(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void)) +#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end) +#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end) +#else /* 
INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_internal_access_end() +#define __itt_heap_internal_access_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_internal_access_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @brief record memory growth begin */ +void ITTAPI __itt_heap_record_memory_growth_begin(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void)) +#define __itt_heap_record_memory_growth_begin ITTNOTIFY_VOID(heap_record_memory_growth_begin) +#define __itt_heap_record_memory_growth_begin_ptr ITTNOTIFY_NAME(heap_record_memory_growth_begin) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_record_memory_growth_begin() +#define __itt_heap_record_memory_growth_begin_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_record_memory_growth_begin_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @brief record memory growth end */ +void ITTAPI __itt_heap_record_memory_growth_end(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void)) +#define __itt_heap_record_memory_growth_end ITTNOTIFY_VOID(heap_record_memory_growth_end) +#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_record_memory_growth_end() +#define __itt_heap_record_memory_growth_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_record_memory_growth_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Specify the type of heap detection/reporting to modify. + */ +/** + * @hideinitializer + * @brief Report on memory leaks. + */ +#define __itt_heap_leaks 0x00000001 + +/** + * @hideinitializer + * @brief Report on memory growth. 
+ */ +#define __itt_heap_growth 0x00000002 + + +/** @brief heap reset detection */ +void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask)) +#define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection) +#define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_reset_detection() +#define __itt_heap_reset_detection_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_reset_detection_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @brief report */ +void ITTAPI __itt_heap_record(unsigned int record_mask); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask)) +#define __itt_heap_record ITTNOTIFY_VOID(heap_record) +#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_heap_record() +#define __itt_heap_record_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_heap_record_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @} heap group */ +/** @endcond */ +/* ========================================================================== */ + +/** + * @defgroup domains Domains + * @ingroup public + * Domains group + * @{ + */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_domain +{ + volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */ + const char* nameA; /*!< Copy of original name in ASCII. */ +#if defined(UNICODE) || defined(_UNICODE) + const wchar_t* nameW; /*!< Copy of original name in UNICODE. */ +#else /* UNICODE || _UNICODE */ + void* nameW; +#endif /* UNICODE || _UNICODE */ + int extra1; /*!< Reserved to the runtime */ + void* extra2; /*!< Reserved to the runtime */ + struct ___itt_domain* next; +} __itt_domain; + +#pragma pack(pop) +/** @endcond */ + +/** + * @ingroup domains + * @brief Create a domain. + * Create domain using some domain name: the URI naming style is recommended. + * Because the set of domains is expected to be static over the application's + * execution time, there is no mechanism to destroy a domain. + * Any domain can be accessed by any thread in the process, regardless of + * which thread created the domain. This call is thread-safe. 
+ * @param[in] name name of domain + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_domain* ITTAPI __itt_domain_createA(const char *name); +__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_domain_create __itt_domain_createW +# define __itt_domain_create_ptr __itt_domain_createW_ptr +#else /* UNICODE */ +# define __itt_domain_create __itt_domain_createA +# define __itt_domain_create_ptr __itt_domain_createA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_domain* ITTAPI __itt_domain_create(const char *name); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name)) +ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA) +#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA) +#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW) +#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_domain_create ITTNOTIFY_DATA(domain_create) +#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_domain_createA(name) (__itt_domain*)0 +#define __itt_domain_createA_ptr 0 +#define __itt_domain_createW(name) (__itt_domain*)0 +#define __itt_domain_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_domain_create(name) (__itt_domain*)0 +#define __itt_domain_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_domain_createA_ptr 0 +#define __itt_domain_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_domain_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} domains group */ + +/** + * @defgroup ids IDs + * @ingroup public + * IDs group + * @{ + */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_id +{ + unsigned long long d1, d2, d3; +} __itt_id; + +#pragma pack(pop) +/** @endcond */ + +const __itt_id __itt_null = { 0, 0, 0 }; + +/** + * @ingroup ids + * @brief A convenience function is provided to create an ID without domain control. + * @brief This is a convenience function to initialize an __itt_id structure. This function + * does not affect the collector runtime in any way. After you make the ID with this + * function, you still must create it with the __itt_id_create function before using the ID + * to identify a named entity. + * @param[in] addr The address of object; high QWORD of the ID value. + * @param[in] extra The extra data to unique identify object; low QWORD of the ID value. 
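 *
 * Editorial aside, not part of the upstream header: a hedged sketch of the
 * intended lifetime, assuming obj is an object being traced and domain came
 * from __itt_domain_create:
 *
 *   __itt_id id = __itt_id_make(obj, 0);
 *   __itt_id_create(domain, id);     // instance lifetime begins
 *   // ... use id in __itt_task_begin, relations, etc. ...
 *   __itt_id_destroy(domain, id);    // must happen before this value is reused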
+ */ + +ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE; +ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) +{ + __itt_id id = __itt_null; + id.d1 = (unsigned long long)((uintptr_t)addr); + id.d2 = (unsigned long long)extra; + id.d3 = (unsigned long long)0; /* Reserved. Must be zero */ + return id; +} + +/** + * @ingroup ids + * @brief Create an instance of identifier. + * This establishes the beginning of the lifetime of an instance of + * the given ID in the trace. Once this lifetime starts, the ID + * can be used to tag named entity instances in calls such as + * __itt_task_begin, and to specify relationships among + * identified named entity instances, using the \ref relations APIs. + * Instance IDs are not domain specific! + * @param[in] domain The domain controlling the execution of this call. + * @param[in] id The ID to create. + */ +void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id)) +#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x) +#define __itt_id_create_ptr ITTNOTIFY_NAME(id_create) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_id_create(domain,id) +#define __itt_id_create_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_id_create_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup ids + * @brief Destroy an instance of identifier. + * This ends the lifetime of the current instance of the given ID value in the trace. + * Any relationships that are established after this lifetime ends are invalid. + * This call must be performed before the given ID value can be reused for a different + * named entity instance. + * @param[in] domain The domain controlling the execution of this call. + * @param[in] id The ID to destroy. + */ +void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id)) +#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x) +#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_id_destroy(domain,id) +#define __itt_id_destroy_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_id_destroy_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} ids group */ + +/** + * @defgroup handless String Handles + * @ingroup public + * String Handles group + * @{ + */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_string_handle +{ + const char* strA; /*!< Copy of original string in ASCII. */ +#if defined(UNICODE) || defined(_UNICODE) + const wchar_t* strW; /*!< Copy of original string in UNICODE. */ +#else /* UNICODE || _UNICODE */ + void* strW; +#endif /* UNICODE || _UNICODE */ + int extra1; /*!< Reserved. Must be zero */ + void* extra2; /*!< Reserved. Must be zero */ + struct ___itt_string_handle* next; +} __itt_string_handle; + +#pragma pack(pop) +/** @endcond */ + +/** + * @ingroup handles + * @brief Create a string handle. + * Create and return handle value that can be associated with a string. 
+ * Consecutive calls to __itt_string_handle_create with the same name + * return the same value. Because the set of string handles is expected to remain + * static during the application's execution time, there is no mechanism to destroy a string handle. + * Any string handle can be accessed by any thread in the process, regardless of which thread created + * the string handle. This call is thread-safe. + * @param[in] name The input string + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_string_handle* ITTAPI __itt_string_handle_createA(const char *name); +__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_string_handle_create __itt_string_handle_createW +# define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr +#else /* UNICODE */ +# define __itt_string_handle_create __itt_string_handle_createA +# define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name)) +ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA) +#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA) +#define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW) +#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create) +#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_string_handle_createA(name) (__itt_string_handle*)0 +#define __itt_string_handle_createA_ptr 0 +#define __itt_string_handle_createW(name) (__itt_string_handle*)0 +#define __itt_string_handle_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_string_handle_create(name) (__itt_string_handle*)0 +#define __itt_string_handle_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_string_handle_createA_ptr 0 +#define __itt_string_handle_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_string_handle_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} handles group */ + +/** @cond exclude_from_documentation */ +typedef unsigned long long __itt_timestamp; +/** @endcond */ + +#define __itt_timestamp_none ((__itt_timestamp)-1LL) + +/** @cond exclude_from_gpa_documentation */ + +/** + * @ingroup timestamps + * @brief Return timestamp corresponding to the current moment. 
+ * This returns the timestamp in the format that is the most relevant for the current + * host or platform (RDTSC, QPC, and others). You can use the "<" operator to + * compare __itt_timestamp values. + */ +__itt_timestamp ITTAPI __itt_get_timestamp(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void)) +#define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp) +#define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_get_timestamp() +#define __itt_get_timestamp_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_get_timestamp_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} timestamps */ +/** @endcond */ + +/** @cond exclude_from_gpa_documentation */ + +/** + * @defgroup regions Regions + * @ingroup public + * Regions group + * @{ + */ +/** + * @ingroup regions + * @brief Begin of region instance. + * Successive calls to __itt_region_begin with the same ID are ignored + * until a call to __itt_region_end with the same ID + * @param[in] domain The domain for this region instance + * @param[in] id The instance ID for this region instance. Must not be __itt_null + * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null + * @param[in] name The name of this region + */ +void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name); + +/** + * @ingroup regions + * @brief End of region instance. + * The first call to __itt_region_end with a given ID ends the + * region. Successive calls with the same ID are ignored, as are + * calls that do not have a matching __itt_region_begin call. + * @param[in] domain The domain for this region instance + * @param[in] id The instance ID for this region instance + */ +void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name)) +ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id)) +#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z) +#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin) +#define __itt_region_end(d,x) ITTNOTIFY_VOID_D1(region_end,d,x) +#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_region_begin(d,x,y,z) +#define __itt_region_begin_ptr 0 +#define __itt_region_end(d,x) +#define __itt_region_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_region_begin_ptr 0 +#define __itt_region_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} regions group */ + +/** + * @defgroup frames Frames + * @ingroup public + * Frames are similar to regions, but are intended to be easier to use and to implement. + * In particular: + * - Frames always represent periods of elapsed time + * - By default, frames have no nesting relationships + * @{ + */ + +/** + * @ingroup frames + * @brief Begin a frame instance. + * Successive calls to __itt_frame_begin with the + * same ID are ignored until a call to __itt_frame_end with the same ID. 
+ * @param[in] domain The domain for this frame instance + * @param[in] id The instance ID for this frame instance or NULL + */ +void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id); + +/** + * @ingroup frames + * @brief End a frame instance. + * The first call to __itt_frame_end with a given ID + * ends the frame. Successive calls with the same ID are ignored, as are + * calls that do not have a matching __itt_frame_begin call. + * @param[in] domain The domain for this frame instance + * @param[in] id The instance ID for this frame instance or NULL for current + */ +void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id); + +/** + * @ingroup frames + * @brief Submits a frame instance. + * Successive calls to __itt_frame_begin or __itt_frame_submit with the + * same ID are ignored until a call to __itt_frame_end or __itt_frame_submit + * with the same ID. + * Passing special __itt_timestamp_none value as "end" argument means + * take the current timestamp as the end timestamp. + * @param[in] domain The domain for this frame instance + * @param[in] id The instance ID for this frame instance or NULL + * @param[in] begin Timestamp of the beginning of the frame + * @param[in] end Timestamp of the end of the frame + */ +void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id, + __itt_timestamp begin, __itt_timestamp end); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id)) +ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id)) +ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end)) +#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x) +#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3) +#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x) +#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3) +#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e) +#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_frame_begin_v3(domain,id) +#define __itt_frame_begin_v3_ptr 0 +#define __itt_frame_end_v3(domain,id) +#define __itt_frame_end_v3_ptr 0 +#define __itt_frame_submit_v3(domain,id,begin,end) +#define __itt_frame_submit_v3_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_frame_begin_v3_ptr 0 +#define __itt_frame_end_v3_ptr 0 +#define __itt_frame_submit_v3_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} frames group */ +/** @endcond */ + +/** + * @defgroup taskgroup Task Group + * @ingroup public + * Task Group + * @{ + */ +/** + * @ingroup task_groups + * @brief Denotes a task_group instance. + * Successive calls to __itt_task_group with the same ID are ignored. + * @param[in] domain The domain for this task_group instance + * @param[in] id The instance ID for this task_group instance. Must not be __itt_null. + * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null. 
+ * @param[in] name The name of this task_group + */ +void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name)) +#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z) +#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_task_group(d,x,y,z) +#define __itt_task_group_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_task_group_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} taskgroup group */ + +/** + * @defgroup tasks Tasks + * @ingroup public + * A task instance represents a piece of work performed by a particular + * thread for a period of time. A call to __itt_task_begin creates a + * task instance. This becomes the current instance for that task on that + * thread. A following call to __itt_task_end on the same thread ends the + * instance. There may be multiple simultaneous instances of tasks with the + * same name on different threads. If an ID is specified, the task instance + * receives that ID. Nested tasks are allowed. + * + * Note: The task is defined by the bracketing of __itt_task_begin and + * __itt_task_end on the same thread. If some scheduling mechanism causes + * task switching (the thread executes a different user task) or task + * switching (the user task switches to a different thread) then this breaks + * the notion of current instance. Additional API calls are required to + * deal with that possibility. + * @{ + */ + +/** + * @ingroup tasks + * @brief Begin a task instance. + * @param[in] domain The domain for this task + * @param[in] taskid The instance ID for this task instance, or __itt_null + * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null + * @param[in] name The name of this task + */ +void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name); + +/** + * @ingroup tasks + * @brief Begin a task instance. + * @param[in] domain The domain for this task + * @param[in] taskid The identifier for this task instance (may be 0) + * @param[in] parentid The parent of this task (may be 0) + * @param[in] fn The pointer to the function you are tracing + */ +void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn); + +/** + * @ingroup tasks + * @brief End the current task instance. + * @param[in] domain The domain for this task + */ +void ITTAPI __itt_task_end(const __itt_domain *domain); + +/** + * @ingroup tasks + * @brief Begin an overlapped task instance. + * @param[in] domain The domain for this task. + * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null. + * @param[in] parentid The parent of this task, or __itt_null. + * @param[in] name The name of this task. + */ +void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name); + +/** + * @ingroup tasks + * @brief End an overlapped task instance. 
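+ * Overlapped instances are matched purely by their explicit IDs, so a sketch of
+ * the expected pairing looks like this (illustrative; "d", "h_name" and the use
+ * of __itt_id_make on a request object are assumptions):
+ * @code
+ *     __itt_id id = __itt_id_make(&request, 0);
+ *     __itt_task_begin_overlapped(d, id, __itt_null, h_name);
+ *     // ... other overlapped tasks may begin and end here ...
+ *     __itt_task_end_overlapped(d, id);
+ * @endcode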
+ * @param[in] domain The domain for this task
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
+ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id taskid))
+#define __itt_task_begin(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
+#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
+#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
+#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
+#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end,d)
+#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
+#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
+#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
+#define __itt_task_end_overlapped(d,x) ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
+#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin(domain,id,parentid,name)
+#define __itt_task_begin_ptr 0
+#define __itt_task_begin_fn(domain,id,parentid,fn)
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end(domain)
+#define __itt_task_end_ptr 0
+#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped(domain,taskid)
+#define __itt_task_end_overlapped_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ptr 0
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end_ptr 0
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} tasks group */
+
+
+/**
+ * @defgroup markers Markers
+ * Markers represent a single discrete event in time. Markers have a scope,
+ * described by an enumerated type __itt_scope. Markers are created by
+ * the API call __itt_marker. A marker instance can be given an ID for use in
+ * adding metadata.
+ * @{
+ */
+
+/**
+ * @brief Describes the scope of an event object in the trace.
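+ * For illustration, a marker with task scope might be emitted as follows (a
+ * sketch; "d" and "h_checkpoint" are hypothetical domain and string handles):
+ * @code
+ *     __itt_marker(d, __itt_null, h_checkpoint, __itt_scope_task);
+ * @endcode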
+ */
+typedef enum
+{
+    __itt_scope_unknown = 0,
+    __itt_scope_global,
+    __itt_scope_track_group,
+    __itt_scope_track,
+    __itt_scope_task,
+    __itt_scope_marker
+} __itt_scope;
+
+/** @cond exclude_from_documentation */
+#define __itt_marker_scope_unknown __itt_scope_unknown
+#define __itt_marker_scope_global __itt_scope_global
+#define __itt_marker_scope_process __itt_scope_track_group
+#define __itt_marker_scope_thread __itt_scope_track
+#define __itt_marker_scope_task __itt_scope_task
+/** @endcond */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance
+ * @param[in] domain The domain for this marker
+ * @param[in] id The instance ID for this marker or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
+#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker(domain,id,name,scope)
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} markers group */
+
+/**
+ * @defgroup metadata Metadata
+ * The metadata API is used to attach extra information to named
+ * entities. Metadata can be attached to an identified named entity by ID,
+ * or to the current entity (which is always a task).
+ *
+ * Conceptually metadata has a type (what kind of metadata), a key (the
+ * name of the metadata), and a value (the actual data). The encoding of
+ * the value depends on the type of the metadata.
+ *
+ * The type of metadata is specified by an enumerated type __itt_metadata_type.
+ * @{
+ */
+
+/**
+ * @ingroup parameters
+ * @brief Describes the type of metadata
+ */
+typedef enum {
+    __itt_metadata_unknown = 0,
+    __itt_metadata_u64,     /**< Unsigned 64-bit integer */
+    __itt_metadata_s64,     /**< Signed 64-bit integer */
+    __itt_metadata_u32,     /**< Unsigned 32-bit integer */
+    __itt_metadata_s32,     /**< Signed 32-bit integer */
+    __itt_metadata_u16,     /**< Unsigned 16-bit integer */
+    __itt_metadata_s16,     /**< Signed 16-bit integer */
+    __itt_metadata_float,   /**< Signed 32-bit floating-point */
+    __itt_metadata_double   /**< Signed 64-bit floating-point */
+} __itt_metadata_type;
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself +*/ +void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data)) +#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b) +#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_metadata_add(d,x,y,z,a,b) +#define __itt_metadata_add_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_metadata_add_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup parameters + * @brief Add string metadata to an instance of a named entity. + * @param[in] domain The domain controlling the call + * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task + * @param[in] key The name of the metadata + * @param[in] data The metadata itself + * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated +*/ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length); +void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_metadata_str_add __itt_metadata_str_addW +# define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr +#else /* UNICODE */ +# define __itt_metadata_str_add __itt_metadata_str_addA +# define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length); +#endif + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length)) +ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a) +#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA) +#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a) +#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a) +#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN 
+#define __itt_metadata_str_addA(d,x,y,z,a) +#define __itt_metadata_str_addA_ptr 0 +#define __itt_metadata_str_addW(d,x,y,z,a) +#define __itt_metadata_str_addW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add(d,x,y,z,a) +#define __itt_metadata_str_add_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_metadata_str_addA_ptr 0 +#define __itt_metadata_str_addW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup parameters + * @brief Add metadata to an instance of a named entity. + * @param[in] domain The domain controlling the call + * @param[in] scope The scope of the instance to which the metadata is to be added + + * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task + + * @param[in] key The name of the metadata + * @param[in] type The type of the metadata + * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added. + * @param[in] data The metadata itself +*/ +void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data)) +#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b) +#define __itt_metadata_add_with_scope_ptr ITTNOTIFY_NAME(metadata_add_with_scope) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_metadata_add_with_scope(d,x,y,z,a,b) +#define __itt_metadata_add_with_scope_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_metadata_add_with_scope_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup parameters + * @brief Add string metadata to an instance of a named entity. 
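+ * A sketch of attaching metadata to the current task and to a scope
+ * (illustrative only; the key handle "h_key" and the values are hypothetical):
+ * @code
+ *     unsigned long long bytes = 4096;
+ *     __itt_metadata_add(d, __itt_null, h_key, __itt_metadata_u64, 1, &bytes);
+ *     __itt_metadata_str_add_with_scope(d, __itt_scope_task, h_key, "cache-miss", 10);
+ * @endcode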
+ * @param[in] domain The domain controlling the call + * @param[in] scope The scope of the instance to which the metadata is to be added + + * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task + + * @param[in] key The name of the metadata + * @param[in] data The metadata itself + * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated +*/ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length); +void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW +# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr +#else /* UNICODE */ +# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA +# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length); +#endif + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length)) +ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeA_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeA) +#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeW_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add_with_scope(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a) +#define __itt_metadata_str_add_with_scope_ptr ITTNOTIFY_NAME(metadata_str_add_with_scope) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeA_ptr 0 +#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add_with_scope(d,x,y,z,a) +#define __itt_metadata_str_add_with_scope_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_metadata_str_add_with_scopeA_ptr 0 
+#define __itt_metadata_str_add_with_scopeW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_metadata_str_add_with_scope_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @} metadata group */ + +/** + * @defgroup relations Relations + * Instances of named entities can be explicitly associated with other + * instances using instance IDs and the relationship API calls. + * + * @{ + */ + +/** + * @ingroup relations + * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation. + * Relations between instances can be added with an API call. The relation + * API uses instance IDs. Relations can be added before or after the actual + * instances are created and persist independently of the instances. This + * is the motivation for having different lifetimes for instance IDs and + * the actual instances. + */ +typedef enum +{ + __itt_relation_is_unknown = 0, + __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */ + __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */ + __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */ + __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */ + __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */ + __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */ + __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */ +} __itt_relation; + +/** + * @ingroup relations + * @brief Add a relation to the current task instance. + * The current task instance is the head of the relation. + * @param[in] domain The domain controlling this call + * @param[in] relation The kind of relation + * @param[in] tail The ID for the tail of the relation + */ +void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail); + +/** + * @ingroup relations + * @brief Add a relation between two instance identifiers. 
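+ * A sketch of recording a dependency between two previously created IDs
+ * (illustrative; id_consumer and id_producer are hypothetical):
+ * @code
+ *     __itt_relation_add(d, id_consumer, __itt_relation_is_dependent_on, id_producer);
+ * @endcode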
+ * @param[in] domain The domain controlling this call + * @param[in] head The ID for the head of the relation + * @param[in] relation The kind of relation + * @param[in] tail The ID for the tail of the relation + */ +void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail)) +ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail)) +#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y) +#define __itt_relation_add_to_current_ptr ITTNOTIFY_NAME(relation_add_to_current) +#define __itt_relation_add(d,x,y,z) ITTNOTIFY_VOID_D3(relation_add,d,x,y,z) +#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_relation_add_to_current(d,x,y) +#define __itt_relation_add_to_current_ptr 0 +#define __itt_relation_add(d,x,y,z) +#define __itt_relation_add_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_relation_add_to_current_ptr 0 +#define __itt_relation_add_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} relations group */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_clock_info +{ + unsigned long long clock_freq; /*!< Clock domain frequency */ + unsigned long long clock_base; /*!< Clock domain base timestamp */ +} __itt_clock_info; + +#pragma pack(pop) +/** @endcond */ + +/** @cond exclude_from_documentation */ +typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data); +/** @endcond */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_clock_domain +{ + __itt_clock_info info; /*!< Most recent clock domain info */ + __itt_get_clock_info_fn fn; /*!< Callback function pointer */ + void* fn_data; /*!< Input argument for the callback function */ + int extra1; /*!< Reserved. Must be zero */ + void* extra2; /*!< Reserved. Must be zero */ + struct ___itt_clock_domain* next; +} __itt_clock_domain; + +#pragma pack(pop) +/** @endcond */ + +/** + * @ingroup clockdomains + * @brief Create a clock domain. + * Certain applications require the capability to trace their application using + * a clock domain different than the CPU, for instance the instrumentation of events + * that occur on a GPU. + * Because the set of domains is expected to be static over the application's execution time, + * there is no mechanism to destroy a domain. + * Any domain can be accessed by any thread in the process, regardless of which thread created + * the domain. This call is thread-safe. 
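+ * A sketch of a conversion callback and domain creation (illustrative only;
+ * read_device_clock() and the 1 GHz frequency are assumptions):
+ * @code
+ *     static void ITTAPI get_device_clock(__itt_clock_info* info, void* data)
+ *     {
+ *         (void)data;
+ *         info->clock_freq = 1000000000ULL; // ticks per second of the device clock
+ *         info->clock_base = read_device_clock();
+ *     }
+ *     __itt_clock_domain* cd = __itt_clock_domain_create(get_device_clock, NULL);
+ * @endcode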
+ * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
+ * @param[in] fn_data Argument for a callback function; may be NULL
+ */
+__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
+#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
+#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Recalculate clock domain frequencies and clock base timestamps.
+ */
+void ITTAPI __itt_clock_domain_reset(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
+#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
+#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_reset()
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Create an instance of an identifier. This establishes the beginning of the lifetime of
+ * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
+ * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/**
+ * @ingroup clockdomain
+ * @brief Destroy an instance of an identifier. This ends the lifetime of the current instance of the
+ * given ID value in the trace. Any relationships that are established after this lifetime ends are
+ * invalid. This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to destroy.
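+ *
+ * Put together, the lifetime of an ID on a custom clock might be sketched as
+ * follows (illustrative; "cd" and the raw device timestamps are hypothetical):
+ * @code
+ *     __itt_id id = __itt_id_make(&job, 0);
+ *     __itt_id_create_ex(d, cd, device_ts_begin, id);
+ *     // ... tag tasks, add relations, etc. ...
+ *     __itt_id_destroy_ex(d, cd, device_ts_end, id);
+ * @endcode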
+ */ +void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id)) +ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id)) +#define __itt_id_create_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z) +#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex) +#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z) +#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_id_create_ex(domain,clock_domain,timestamp,id) +#define __itt_id_create_ex_ptr 0 +#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id) +#define __itt_id_destroy_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_id_create_ex_ptr 0 +#define __itt_id_destroy_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup clockdomain + * @brief Begin a task instance. + * @param[in] domain The domain for this task + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. + * @param[in] taskid The instance ID for this task instance, or __itt_null + * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null + * @param[in] name The name of this task + */ +void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name); + +/** + * @ingroup clockdomain + * @brief Begin a task instance. + * @param[in] domain The domain for this task + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. + * @param[in] taskid The identifier for this task instance, or __itt_null + * @param[in] parentid The parent of this task, or __itt_null + * @param[in] fn The pointer to the function you are tracing + */ +void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn); + +/** + * @ingroup clockdomain + * @brief End the current task instance. + * @param[in] domain The domain for this task + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. 
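+ *
+ * A sketch of bracketing a task on a custom clock domain (illustrative; the
+ * timestamps are assumed to come from the domain's own time source):
+ * @code
+ *     __itt_task_begin_ex(d, cd, ts_begin, __itt_null, __itt_null, h_kernel);
+ *     // ... the traced work ...
+ *     __itt_task_end_ex(d, cd, ts_end);
+ * @endcode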
+ */ +void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name)) +ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn)) +ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp)) +#define __itt_task_begin_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b) +#define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex) +#define __itt_task_begin_fn_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b) +#define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex) +#define __itt_task_end_ex(d,x,y) ITTNOTIFY_VOID_D2(task_end_ex,d,x,y) +#define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name) +#define __itt_task_begin_ex_ptr 0 +#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn) +#define __itt_task_begin_fn_ex_ptr 0 +#define __itt_task_end_ex(domain,clock_domain,timestamp) +#define __itt_task_end_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_task_begin_ex_ptr 0 +#define __itt_task_begin_fn_ex_ptr 0 +#define __itt_task_end_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @defgroup counters Counters + * @ingroup public + * Counters are user-defined objects with a monotonically increasing + * value. Counter values are 64-bit unsigned integers. + * Counters have names that can be displayed in + * the tools. 
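+ *
+ * A sketch of the handle-based style declared below (illustrative; the counter
+ * and domain names are hypothetical):
+ * @code
+ *     __itt_counter c = __itt_counter_create("allocations", "com.example.app");
+ *     __itt_counter_inc(c);
+ *     __itt_counter_inc_delta(c, 16);
+ *     __itt_counter_destroy(c);
+ * @endcode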
+ * @{ + */ + +/** + * @brief opaque structure for counter identification + */ +/** @cond exclude_from_documentation */ + +typedef struct ___itt_counter* __itt_counter; + +/** + * @brief Create an unsigned 64 bits integer counter with given name/domain + * + * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta), + * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) + * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64 bits integer + * + * The call is equal to __itt_counter_create_typed(name, domain, __itt_metadata_u64) + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_counter ITTAPI __itt_counter_createA(const char *name, const char *domain); +__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_counter_create __itt_counter_createW +# define __itt_counter_create_ptr __itt_counter_createW_ptr +#else /* UNICODE */ +# define __itt_counter_create __itt_counter_createA +# define __itt_counter_create_ptr __itt_counter_createA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain)) +ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA ITTNOTIFY_DATA(counter_createA) +#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA) +#define __itt_counter_createW ITTNOTIFY_DATA(counter_createW) +#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create ITTNOTIFY_DATA(counter_create) +#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA(name, domain) +#define __itt_counter_createA_ptr 0 +#define __itt_counter_createW(name, domain) +#define __itt_counter_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create(name, domain) +#define __itt_counter_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA_ptr 0 +#define __itt_counter_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Increment the unsigned 64 bits integer counter value + * + * Calling this function to non-unsigned 64 bits integer counters has no effect + */ +void ITTAPI __itt_counter_inc(__itt_counter id); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id)) +#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc) +#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc) 
+#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_inc(id) +#define __itt_counter_inc_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_inc_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** + * @brief Increment the unsigned 64 bits integer counter value with x + * + * Calling this function to non-unsigned 64 bits integer counters has no effect + */ +void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value)) +#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta) +#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_inc_delta(id, value) +#define __itt_counter_inc_delta_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_inc_delta_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Decrement the unsigned 64 bits integer counter value + * + * Calling this function to non-unsigned 64 bits integer counters has no effect + */ +void ITTAPI __itt_counter_dec(__itt_counter id); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id)) +#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec) +#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_dec(id) +#define __itt_counter_dec_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_dec_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** + * @brief Decrement the unsigned 64 bits integer counter value with x + * + * Calling this function to non-unsigned 64 bits integer counters has no effect + */ +void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value)) +#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta) +#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_dec_delta(id, value) +#define __itt_counter_dec_delta_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_dec_delta_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup counters + * @brief Increment a counter by one. + * The first call with a given name creates a counter by that name and sets its + * value to zero. Successive calls increment the counter value. + * @param[in] domain The domain controlling the call. Counter names are not domain specific. + * The domain argument is used only to enable or disable the API calls. + * @param[in] name The name of the counter + */ +void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name); + +/** + * @ingroup counters + * @brief Increment a counter by the value specified in delta. + * @param[in] domain The domain controlling the call. Counter names are not domain specific. + * The domain argument is used only to enable or disable the API calls. 
+ * @param[in] name The name of the counter + * @param[in] delta The amount by which to increment the counter + */ +void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name)) +ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta)) +#define __itt_counter_inc_v3(d,x) ITTNOTIFY_VOID_D1(counter_inc_v3,d,x) +#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3) +#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y) +#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_inc_v3(domain,name) +#define __itt_counter_inc_v3_ptr 0 +#define __itt_counter_inc_delta_v3(domain,name,delta) +#define __itt_counter_inc_delta_v3_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_inc_v3_ptr 0 +#define __itt_counter_inc_delta_v3_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + + +/** + * @ingroup counters + * @brief Decrement a counter by one. + * The first call with a given name creates a counter by that name and sets its + * value to zero. Successive calls decrement the counter value. + * @param[in] domain The domain controlling the call. Counter names are not domain specific. + * The domain argument is used only to enable or disable the API calls. + * @param[in] name The name of the counter + */ +void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name); + +/** + * @ingroup counters + * @brief Decrement a counter by the value specified in delta. + * @param[in] domain The domain controlling the call. Counter names are not domain specific. + * The domain argument is used only to enable or disable the API calls. 
+ * @param[in] name The name of the counter + * @param[in] delta The amount by which to decrement the counter + */ +void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name)) +ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta)) +#define __itt_counter_dec_v3(d,x) ITTNOTIFY_VOID_D1(counter_dec_v3,d,x) +#define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3) +#define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y) +#define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_dec_v3(domain,name) +#define __itt_counter_dec_v3_ptr 0 +#define __itt_counter_dec_delta_v3(domain,name,delta) +#define __itt_counter_dec_delta_v3_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_dec_v3_ptr 0 +#define __itt_counter_dec_delta_v3_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @} counters group */ + + +/** + * @brief Set the counter value + */ +void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr)) +#define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value) +#define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_set_value(id, value_ptr) +#define __itt_counter_set_value_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_set_value_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Set the counter value + */ +void ITTAPI __itt_counter_set_value_ex(__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr)) +#define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex) +#define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) +#define __itt_counter_set_value_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_set_value_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Create a typed counter with given name/domain + * + * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta), + * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) + * can be used to change the value of the counter + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_counter ITTAPI __itt_counter_create_typedA(const char *name, const char *domain, __itt_metadata_type type); +__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type); +#if defined(UNICODE) || defined(_UNICODE) +# define 
__itt_counter_create_typed __itt_counter_create_typedW +# define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr +#else /* UNICODE */ +# define __itt_counter_create_typed __itt_counter_create_typedA +# define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type)) +ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA) +#define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA) +#define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW) +#define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed) +#define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_create_typedA(name, domain, type) +#define __itt_counter_create_typedA_ptr 0 +#define __itt_counter_create_typedW(name, domain, type) +#define __itt_counter_create_typedW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_typed(name, domain, type) +#define __itt_counter_create_typed_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_create_typedA_ptr 0 +#define __itt_counter_create_typedW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_typed_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or + * __itt_counter_create_typed() + */ +void ITTAPI __itt_counter_destroy(__itt_counter id); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id)) +#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy) +#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_destroy(id) +#define __itt_counter_destroy_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_destroy_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} counters group */ + +/** + * @ingroup markers + * @brief Create a marker instance. + * @param[in] domain The domain for this marker + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. 
+ * @param[in] id The instance ID for this marker, or __itt_null + * @param[in] name The name for this marker + * @param[in] scope The scope for this marker + */ +void ITTAPI __itt_marker_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope)) +#define __itt_marker_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b) +#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope) +#define __itt_marker_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_marker_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @ingroup clockdomain + * @brief Add a relation to the current task instance. + * The current task instance is the head of the relation. + * @param[in] domain The domain controlling this call + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. + * @param[in] relation The kind of relation + * @param[in] tail The ID for the tail of the relation + */ +void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail); + +/** + * @ingroup clockdomain + * @brief Add a relation between two instance identifiers. + * @param[in] domain The domain controlling this call + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. 
+ * @param[in] head The ID for the head of the relation + * @param[in] relation The kind of relation + * @param[in] tail The ID for the tail of the relation + */ +void ITTAPI __itt_relation_add_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail)) +ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail)) +#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a) +#define __itt_relation_add_to_current_ex_ptr ITTNOTIFY_NAME(relation_add_to_current_ex) +#define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) +#define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) +#define __itt_relation_add_to_current_ex_ptr 0 +#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) +#define __itt_relation_add_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_relation_add_to_current_ex_ptr 0 +#define __itt_relation_add_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @cond exclude_from_documentation */ +typedef enum ___itt_track_group_type +{ + __itt_track_group_type_normal = 0 +} __itt_track_group_type; +/** @endcond */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_track_group +{ + __itt_string_handle* name; /*!< Name of the track group */ + struct ___itt_track* track; /*!< List of child tracks */ + __itt_track_group_type tgtype; /*!< Type of the track group */ + int extra1; /*!< Reserved. Must be zero */ + void* extra2; /*!< Reserved. Must be zero */ + struct ___itt_track_group* next; +} __itt_track_group; + +#pragma pack(pop) +/** @endcond */ + +/** + * @brief Placeholder for custom track types. Currently, "normal" custom track + * is the only available track type. + */ +typedef enum ___itt_track_type +{ + __itt_track_type_normal = 0 +#ifdef INTEL_ITTNOTIFY_API_PRIVATE + , __itt_track_type_queue +#endif /* INTEL_ITTNOTIFY_API_PRIVATE */ +} __itt_track_type; + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_track +{ + __itt_string_handle* name; /*!< Name of the track group */ + __itt_track_group* group; /*!< Parent group to a track */ + __itt_track_type ttype; /*!< Type of the track */ + int extra1; /*!< Reserved. Must be zero */ + void* extra2; /*!< Reserved. Must be zero */ + struct ___itt_track* next; +} __itt_track; + +#pragma pack(pop) +/** @endcond */ + +/** + * @brief Create logical track group. 
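+ * A sketch of building a custom track and routing events to it (illustrative;
+ * h_group and h_track are hypothetical string handles created earlier):
+ * @code
+ *     __itt_track_group* g = __itt_track_group_create(h_group, __itt_track_group_type_normal);
+ *     __itt_track* t = __itt_track_create(g, h_track, __itt_track_type_normal);
+ *     __itt_set_track(t);
+ * @endcode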
+ */
+__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type))
+#define __itt_track_group_create ITTNOTIFY_DATA(track_group_create)
+#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_group_create(name,track_group_type) (__itt_track_group*)0
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create logical track.
+ */
+__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type))
+#define __itt_track_create ITTNOTIFY_DATA(track_create)
+#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_create(track_group,name,track_type) (__itt_track*)0
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the logical track.
+ */
+void ITTAPI __itt_set_track(__itt_track* track);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
+#define __itt_set_track ITTNOTIFY_VOID(set_track)
+#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_set_track(track)
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/* ========================================================================== */
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup events Events
+ * @ingroup public
+ * Events group
+ * @{
+ */
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Creation fails if name or namelen is null/zero, if name and namelen do not match, or if the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_event_create __itt_event_createW
+# define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+# define __itt_event_create __itt_event_createA
+# define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event,
event_createA, (const char *name, int namelen)) +ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_event_createA ITTNOTIFY_DATA(event_createA) +#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA) +#define __itt_event_createW ITTNOTIFY_DATA(event_createW) +#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_event_create ITTNOTIFY_DATA(event_create) +#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_event_createA(name, namelen) (__itt_event)0 +#define __itt_event_createA_ptr 0 +#define __itt_event_createW(name, namelen) (__itt_event)0 +#define __itt_event_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_event_create(name, namelen) (__itt_event)0 +#define __itt_event_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_event_createA_ptr 0 +#define __itt_event_createW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_event_create_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an event occurrence. + * @return __itt_err upon failure (invalid event id/user event feature not enabled) + */ +int LIBITTAPI __itt_event_start(__itt_event event); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event)) +#define __itt_event_start ITTNOTIFY_DATA(event_start) +#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_event_start(event) (int)0 +#define __itt_event_start_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_event_start_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Record an event end occurrence. + * @note It is optional if events do not have durations. 
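+ *
+ * A minimal usage sketch of the user-event workflow (illustrative only;
+ * "frame" and its length 5 are arbitrary, and render() stands in for the
+ * work being measured):
+ * @code
+ *     __itt_event frame = __itt_event_create("frame", 5);
+ *     __itt_event_start(frame);
+ *     render();
+ *     __itt_event_end(frame);
+ * @endcode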
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled) + */ +int LIBITTAPI __itt_event_end(__itt_event event); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) +#define __itt_event_end ITTNOTIFY_DATA(event_end) +#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_event_end(event) (int)0 +#define __itt_event_end_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_event_end_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ +/** @} events group */ + + +/** + * @defgroup arrays Arrays Visualizer + * @ingroup public + * Visualize arrays + * @{ + */ + +/** + * @enum __itt_av_data_type + * @brief Defines types of arrays data (for C/C++ intrinsic types) + */ +typedef enum +{ + __itt_e_first = 0, + __itt_e_char = 0, /* 1-byte integer */ + __itt_e_uchar, /* 1-byte unsigned integer */ + __itt_e_int16, /* 2-byte integer */ + __itt_e_uint16, /* 2-byte unsigned integer */ + __itt_e_int32, /* 4-byte integer */ + __itt_e_uint32, /* 4-byte unsigned integer */ + __itt_e_int64, /* 8-byte integer */ + __itt_e_uint64, /* 8-byte unsigned integer */ + __itt_e_float, /* 4-byte floating */ + __itt_e_double, /* 8-byte floating */ + __itt_e_last = __itt_e_double +} __itt_av_data_type; + +/** + * @brief Save an array data to a file. + * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only). + * @param[in] data - pointer to the array data + * @param[in] rank - the rank of the array + * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. + * The size of dimensions must be equal to the rank + * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types) + * @param[in] filePath - the file path; the output format is defined by the file extension + * @param[in] columnOrder - defines how the array is stored in the linear memory. + * It should be 1 for column-major order (e.g. in FORTRAN) or 0 - for row-major order (e.g. in C). 
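+ *
+ * A minimal usage sketch (illustrative only; the array, its dimensions and
+ * the output file name are arbitrary). A row-major C matrix is written to a
+ * .csv file, so columnOrder is 0:
+ * @code
+ *     float matrix[4][3];
+ *     int dims[2] = { 4, 3 };
+ *     __itt_av_save(matrix, 2, dims, __itt_e_float, "matrix.csv", 0);
+ * @endcode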
+ */ + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder); +int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_av_save __itt_av_saveW +# define __itt_av_save_ptr __itt_av_saveW_ptr +#else /* UNICODE */ +# define __itt_av_save __itt_av_saveA +# define __itt_av_save_ptr __itt_av_saveA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder)) +ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_av_saveA ITTNOTIFY_DATA(av_saveA) +#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA) +#define __itt_av_saveW ITTNOTIFY_DATA(av_saveW) +#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_av_save ITTNOTIFY_DATA(av_save) +#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_av_saveA(name) +#define __itt_av_saveA_ptr 0 +#define __itt_av_saveW(name) +#define __itt_av_saveW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_av_save(name) +#define __itt_av_save_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_av_saveA_ptr 0 +#define __itt_av_saveW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_av_save_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +void ITTAPI __itt_enable_attach(void); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, enable_attach, (void)) +#define __itt_enable_attach ITTNOTIFY_VOID(enable_attach) +#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_enable_attach() +#define __itt_enable_attach_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_enable_attach_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @cond exclude_from_gpa_documentation */ + +/** @} arrays group */ + +/** @endcond */ + +/** + * @brief Module load info + * This API is used to report necessary information in case of module relocation + * @param[in] start_addr - relocated module start address + * @param[in] end_addr - relocated module end address + * @param[in] path - file system path to the module + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path); 
+void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_module_load __itt_module_loadW +# define __itt_module_load_ptr __itt_module_loadW_ptr +#else /* UNICODE */ +# define __itt_module_load __itt_module_loadA +# define __itt_module_load_ptr __itt_module_loadA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path)) +ITT_STUB(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA) +#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA) +#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW) +#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_module_load ITTNOTIFY_VOID(module_load) +#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_module_loadA(start_addr, end_addr, path) +#define __itt_module_loadA_ptr 0 +#define __itt_module_loadW(start_addr, end_addr, path) +#define __itt_module_loadW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_module_load(start_addr, end_addr, path) +#define __itt_module_load_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_module_loadA_ptr 0 +#define __itt_module_loadW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_module_load_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* _ITTNOTIFY_H_ */ + +#ifdef INTEL_ITTNOTIFY_API_PRIVATE + +#ifndef _ITTNOTIFY_PRIVATE_ +#define _ITTNOTIFY_PRIVATE_ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @ingroup clockdomain + * @brief Begin an overlapped task instance. + * @param[in] domain The domain for this task + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. + * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null. + * @param[in] parentid The parent of this task, or __itt_null. + * @param[in] name The name of this task. + */ +void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name); + +/** + * @ingroup clockdomain + * @brief End an overlapped task instance. + * @param[in] domain The domain for this task + * @param[in] clock_domain The clock domain controlling the execution of this call. + * @param[in] timestamp The user defined timestamp. 
+ * @param[in] taskid Explicit ID of finished task + */ +void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name)) +ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid)) +#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b) +#define __itt_task_begin_overlapped_ex_ptr ITTNOTIFY_NAME(task_begin_overlapped_ex) +#define __itt_task_end_overlapped_ex(d,x,y,z) ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z) +#define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name) +#define __itt_task_begin_overlapped_ex_ptr 0 +#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid) +#define __itt_task_end_overlapped_ex_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_task_begin_overlapped_ex_ptr 0 +#define __itt_task_end_overlapped_ptr 0 +#define __itt_task_end_overlapped_ex_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @defgroup makrs_internal Marks + * @ingroup internal + * Marks group + * @warning Internal API: + * - It is not shipped to outside of Intel + * - It is delivered to internal Intel teams using e-mail or SVN access only + * @{ + */ +/** @brief user mark type */ +typedef int __itt_mark_type; + +/** + * @brief Creates a user mark type with the specified name using char or Unicode string. 
+ * @param[in] name - name of mark to create
+ * @return Returns a handle to the mark type
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_mark_type ITTAPI __itt_mark_createA(const char *name);
+__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+# define __itt_mark_create __itt_mark_createW
+# define __itt_mark_create_ptr __itt_mark_createW_ptr
+#else /* UNICODE */
+# define __itt_mark_create __itt_mark_createA
+# define __itt_mark_create_ptr __itt_mark_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_mark_type ITTAPI __itt_mark_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA ITTNOTIFY_DATA(mark_createA)
+#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
+#define __itt_mark_createW ITTNOTIFY_DATA(mark_createW)
+#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create ITTNOTIFY_DATA(mark_create)
+#define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA(name) (__itt_mark_type)0
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW(name) (__itt_mark_type)0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create(name) (__itt_mark_type)0
+#define __itt_mark_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates a "discrete" user mark type of the specified type and an optional parameter using char or Unicode string.
+ *
+ * - The mark of "discrete" type is placed to collection results in case of success. It appears in overtime view(s) as a special tick sign.
+ * - The call is "synchronous" - the function returns after the mark is actually added to the results.
+ * - This function is useful, for example, to mark different phases of an application
+ * (the beginning of the next mark automatically means the end of the current region).
+ * - Can be used together with "continuous" marks (see below) in the same collection session
+ * @param[in] mt - mark, created by __itt_mark_create(const char* name) function
+ * @param[in] parameter - string parameter of mark
+ * @return Returns zero value in case of success, non-zero value otherwise.
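+ *
+ * A minimal usage sketch (illustrative only; the mark name and parameters
+ * are arbitrary, and initialize()/compute() stand in for the application
+ * phases being marked):
+ * @code
+ *     __itt_mark_type phase = __itt_mark_create("phase");
+ *     __itt_mark(phase, "initialization");
+ *     initialize();
+ *     __itt_mark(phase, "computation");
+ *     compute();
+ * @endcode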
+ */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter); +int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_mark __itt_markW +# define __itt_mark_ptr __itt_markW_ptr +#else /* UNICODE */ +# define __itt_mark __itt_markA +# define __itt_mark_ptr __itt_markA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter)) +ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_markA ITTNOTIFY_DATA(markA) +#define __itt_markA_ptr ITTNOTIFY_NAME(markA) +#define __itt_markW ITTNOTIFY_DATA(markW) +#define __itt_markW_ptr ITTNOTIFY_NAME(markW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark ITTNOTIFY_DATA(mark) +#define __itt_mark_ptr ITTNOTIFY_NAME(mark) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_markA(mt, parameter) (int)0 +#define __itt_markA_ptr 0 +#define __itt_markW(mt, parameter) (int)0 +#define __itt_markW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark(mt, parameter) (int)0 +#define __itt_mark_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_markA_ptr 0 +#define __itt_markW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Use this if necessary to create a "discrete" user event type (mark) for process + * rather then for one thread + * @see int __itt_mark(__itt_mark_type mt, const char* parameter); + */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter); +int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_mark_global __itt_mark_globalW +# define __itt_mark_global_ptr __itt_mark_globalW_ptr +#else /* UNICODE */ +# define __itt_mark_global __itt_mark_globalA +# define __itt_mark_global_ptr __itt_mark_globalA_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter)) +ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA) +#define 
__itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA) +#define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW) +#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark_global ITTNOTIFY_DATA(mark_global) +#define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_mark_globalA(mt, parameter) (int)0 +#define __itt_mark_globalA_ptr 0 +#define __itt_mark_globalW(mt, parameter) (int)0 +#define __itt_mark_globalW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark_global(mt, parameter) (int)0 +#define __itt_mark_global_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_mark_globalA_ptr 0 +#define __itt_mark_globalW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_mark_global_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Creates an "end" point for "continuous" mark with specified name. + * + * - Returns zero value in case of success, non-zero value otherwise. + * Also returns non-zero value when preceding "begin" point for the + * mark with the same name failed to be created or not created. + * - The mark of "continuous" type is placed to collection results in + * case of success. It appears in overtime view(s) as a special tick + * sign (different from "discrete" mark) together with line from + * corresponding "begin" mark to "end" mark. + * @note Continuous marks can overlap and be nested inside each other. + * Discrete mark can be nested inside marked region + * @param[in] mt - mark, created by __itt_mark_create(const char* name) function + * @return Returns zero value in case of success, non-zero value otherwise. 
+ */
+int ITTAPI __itt_mark_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt))
+#define __itt_mark_off ITTNOTIFY_DATA(mark_off)
+#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_off(mt) (int)0
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Use this if necessary to create an "end" point for a mark of the process
+ * @see int __itt_mark_off(__itt_mark_type mt);
+ */
+int ITTAPI __itt_mark_global_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
+#define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off)
+#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_global_off(mt) (int)0
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} marks group */
+
+/**
+ * @defgroup counters_internal Counters
+ * @ingroup internal
+ * Counters group
+ * @{
+ */
+
+
+/**
+ * @defgroup stitch Stack Stitching
+ * @ingroup internal
+ * Stack Stitching group
+ * @{
+ */
+/**
+ * @brief opaque structure for stitch point identification
+ */
+typedef struct ___itt_caller *__itt_caller;
+
+/**
+ * @brief Create the stitch point, i.e. a point in the call stack where other stacks should be stitched to.
+ * The function returns a unique identifier which is used to match the cut points with corresponding stitch points.
+ */
+__itt_caller ITTAPI __itt_stack_caller_create(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
+#define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create)
+#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_create() (__itt_caller)0
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the information about the stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ */
+void ITTAPI __itt_stack_caller_destroy(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id))
+#define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy)
+#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_destroy(id)
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Sets the cut point.
Stack from each event which occurs after this call will be cut + * at the same stack level the function was called and stitched to the corresponding stitch point. + */ +void ITTAPI __itt_stack_callee_enter(__itt_caller id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id)) +#define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter) +#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_stack_callee_enter(id) +#define __itt_stack_callee_enter_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_stack_callee_enter_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief This function eliminates the cut point which was set by latest __itt_stack_callee_enter(). + */ +void ITTAPI __itt_stack_callee_leave(__itt_caller id); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id)) +#define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave) +#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_stack_callee_leave(id) +#define __itt_stack_callee_leave_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_stack_callee_leave_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** @} stitch group */ + +/* ***************************************************************************************************************************** */ + +#include + +/** @cond exclude_from_documentation */ +typedef enum __itt_error_code +{ + __itt_error_success = 0, /*!< no error */ + __itt_error_no_module = 1, /*!< module can't be loaded */ + /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */ + __itt_error_no_symbol = 2, /*!< symbol not found */ + /* %1$s -- library name, %2$s -- symbol name. */ + __itt_error_unknown_group = 3, /*!< unknown group specified */ + /* %1$s -- env var name, %2$s -- group name. */ + __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */ + /* %1$s -- env var name, %2$d -- system error. */ + __itt_error_env_too_long = 5, /*!< variable value too long */ + /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */ + __itt_error_system = 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */ + /* %1$s -- function name, %2$d -- errno. 
*/ +} __itt_error_code; + +typedef void (__itt_error_handler_t)(__itt_error_code code, va_list); +__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*); + +const char* ITTAPI __itt_api_version(void); +/** @endcond */ + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +#define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler) +void __itt_error_handler(__itt_error_code code, va_list args); +extern const int ITTNOTIFY_NAME(err); +#define __itt_err ITTNOTIFY_NAME(err) +ITT_STUB(ITTAPI, const char*, api_version, (void)) +#define __itt_api_version ITTNOTIFY_DATA(api_version) +#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_api_version() (const char*)0 +#define __itt_api_version_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_api_version_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* _ITTNOTIFY_PRIVATE_ */ + +#endif /* INTEL_ITTNOTIFY_API_PRIVATE */ -- cgit v1.2.3