summaryrefslogtreecommitdiffstats
path: root/src/tools/coverage-dump
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:44 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:44 +0000
commitc23a457e72abe608715ac76f076f47dc42af07a5 (patch)
tree2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /src/tools/coverage-dump
parentReleasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz
rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/tools/coverage-dump')
-rw-r--r--src/tools/coverage-dump/Cargo.toml14
-rw-r--r--src/tools/coverage-dump/README.md8
-rw-r--r--src/tools/coverage-dump/src/covfun.rs296
-rw-r--r--src/tools/coverage-dump/src/main.rs17
-rw-r--r--src/tools/coverage-dump/src/parser.rs80
-rw-r--r--src/tools/coverage-dump/src/parser/tests.rs38
-rw-r--r--src/tools/coverage-dump/src/prf_names.rs87
7 files changed, 540 insertions, 0 deletions
diff --git a/src/tools/coverage-dump/Cargo.toml b/src/tools/coverage-dump/Cargo.toml
new file mode 100644
index 000000000..7f14286b5
--- /dev/null
+++ b/src/tools/coverage-dump/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "coverage-dump"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0.71"
+leb128 = "0.2.5"
+md5 = { package = "md-5" , version = "0.10.5" }
+miniz_oxide = "0.7.1"
+regex = "1.8.4"
+rustc-demangle = "0.1.23"
diff --git a/src/tools/coverage-dump/README.md b/src/tools/coverage-dump/README.md
new file mode 100644
index 000000000..e2625d5ad
--- /dev/null
+++ b/src/tools/coverage-dump/README.md
@@ -0,0 +1,8 @@
+This tool extracts coverage mapping information from an LLVM IR assembly file
+(`.ll`), and prints it in a more human-readable form that can be used for
+snapshot tests.
+
+The output format is mostly arbitrary, so it's OK to change the output as long
+as any affected tests are also re-blessed. However, the output should be
+consistent across different executions on different platforms, so avoid
+printing any information that is platform-specific or non-deterministic.
diff --git a/src/tools/coverage-dump/src/covfun.rs b/src/tools/coverage-dump/src/covfun.rs
new file mode 100644
index 000000000..3a5866dea
--- /dev/null
+++ b/src/tools/coverage-dump/src/covfun.rs
@@ -0,0 +1,296 @@
+use crate::parser::{unescape_llvm_string_contents, Parser};
+use anyhow::{anyhow, Context};
+use regex::Regex;
+use std::collections::HashMap;
+use std::fmt::{self, Debug, Write as _};
+use std::sync::OnceLock;
+
+/// Prints a human-readable dump of every `__llvm_covfun` record found in
+/// `llvm_ir`.
+///
+/// `function_names` maps symbol-name hashes to display names; records whose
+/// hash is missing from the table are printed as `(unknown)`.
+///
+/// Returns an error as soon as any record's payload fails to parse, or if
+/// bytes are left over after a record has been fully decoded.
+pub(crate) fn dump_covfun_mappings(
+    llvm_ir: &str,
+    function_names: &HashMap<u64, String>,
+) -> anyhow::Result<()> {
+    // Pair each coverage record with its display name (if the hash is known).
+    let mut entries: Vec<(Option<&str>, CovfunLineData)> = Vec::new();
+    for record in llvm_ir.lines().filter_map(covfun_line_data) {
+        let display_name = function_names.get(&record.name_hash).map(String::as_str);
+        entries.push((display_name, record));
+    }
+
+    // Order by name first, then by used-ness, then by raw payload, so that the
+    // output order is stable across platforms and insensitive to small changes.
+    entries.sort_by(|(lhs_name, lhs), (rhs_name, rhs)| {
+        lhs_name
+            .cmp(rhs_name)
+            .then_with(|| lhs.is_used.cmp(&rhs.is_used))
+            .then_with(|| lhs.payload.cmp(&rhs.payload))
+    });
+
+    for (name, record) in &entries {
+        let shown_name = name.unwrap_or("(unknown)");
+        let unused_marker = match record.is_used {
+            true => "",
+            false => " (unused)",
+        };
+        println!("Function name: {name}{unused}", name = shown_name, unused = unused_marker);
+
+        let payload: &[u8] = record.payload.as_slice();
+        println!("Raw bytes ({len}): 0x{payload:02x?}", len = payload.len());
+
+        let mut parser = Parser::new(payload);
+
+        // File table: local file index => global file ID.
+        let num_files = parser.read_uleb128_u32()?;
+        println!("Number of files: {num_files}");
+        for i in 0..num_files {
+            let global_file_id = parser.read_uleb128_u32()?;
+            println!("- file {i} => global file {global_file_id}");
+        }
+
+        // Expression table: each entry is an (lhs, rhs) operand pair.
+        let num_expressions = parser.read_uleb128_u32()?;
+        println!("Number of expressions: {num_expressions}");
+        let mut expression_resolver = ExpressionResolver::new();
+        (0..num_expressions).try_for_each(|i| -> anyhow::Result<()> {
+            let lhs = parser.read_simple_term()?;
+            let rhs = parser.read_simple_term()?;
+            println!("- expression {i} operands: lhs = {lhs:?}, rhs = {rhs:?}");
+            expression_resolver.push_operands(lhs, rhs);
+            Ok(())
+        })?;
+
+        // One group of mappings per file, in file-table order.
+        for i in 0..num_files {
+            let num_mappings = parser.read_uleb128_u32()?;
+            println!("Number of file {i} mappings: {num_mappings}");
+
+            for _ in 0..num_mappings {
+                let (kind, region) = parser.read_mapping_kind_and_region()?;
+                println!("- {kind:?} at {region:?}");
+
+                match kind {
+                    // Expression-valued code/gap mappings are also shown in
+                    // resolved form; zero/counter values are already readable.
+                    MappingKind::Code(term) | MappingKind::Gap(term) => {
+                        if matches!(term, CovTerm::Expression(..)) {
+                            println!(" = {}", expression_resolver.format_term(term));
+                        }
+                    }
+                    // Branch mappings always show both arms in resolved form,
+                    // even when the arms are not expressions.
+                    MappingKind::Branch { r#true, r#false } => {
+                        println!(" true = {}", expression_resolver.format_term(r#true));
+                        println!(" false = {}", expression_resolver.format_term(r#false));
+                    }
+                    MappingKind::Expansion(_) | MappingKind::Skip => {}
+                }
+            }
+        }
+
+        // Every byte of the record must have been accounted for.
+        parser.ensure_empty()?;
+        println!();
+    }
+
+    Ok(())
+}
+
+/// Data extracted by `covfun_line_data` from a single `@__covrec_*` line of
+/// LLVM IR assembly.
+struct CovfunLineData {
+ /// Symbol name hash, parsed as hexadecimal from the variable name.
+ name_hash: u64,
+ /// True if the variable name carries the trailing `u` after the hash.
+ is_used: bool,
+ /// Bytes of the record's string literal, with LLVM escapes already decoded.
+ payload: Vec<u8>,
+}
+
+/// Inspects one line of LLVM IR assembly and, if it defines a `@__covrec_*`
+/// variable (an `__llvm_covfun` entry), returns its data as a `CovfunLineData`.
+///
+/// Returns `None` for lines that don't match.
+fn covfun_line_data(line: &str) -> Option<CovfunLineData> {
+    // The match is on the variable name `@__covrec_[HASH]u` instead of the
+    // section name: the section name is harder to extract and differs across
+    // Linux/Windows/macOS. The name hash is likewise taken from the variable
+    // name rather than from the data, since that is easier and both should
+    // agree.
+    static COVREC_RE: OnceLock<Regex> = OnceLock::new();
+    let covrec_re = COVREC_RE.get_or_init(|| {
+        Regex::new(
+            r#"^@__covrec_(?<name_hash>[0-9A-Z]+)(?<is_used>u)? = .*\[[0-9]+ x i8\] c"(?<payload>[^"]*)".*$"#,
+        )
+        .unwrap()
+    });
+
+    let caps = covrec_re.captures(line)?;
+
+    Some(CovfunLineData {
+        name_hash: u64::from_str_radix(&caps["name_hash"], 16).unwrap(),
+        is_used: caps.name("is_used").is_some(),
+        payload: unescape_llvm_string_contents(&caps["payload"]),
+    })
+}
+
+// Parser extensions that are only useful for decoding `covfun` payloads.
+impl<'a> Parser<'a> {
+    /// Reads one ULEB128 value and decodes it as a `CovTerm`.
+    fn read_simple_term(&mut self) -> anyhow::Result<CovTerm> {
+        let encoded = self.read_uleb128_u32()?;
+        CovTerm::decode(encoded).context("decoding term")
+    }
+
+    /// Reads a mapping kind followed by its region, turning a code mapping
+    /// into a gap mapping when the region's end column has its high bit set.
+    fn read_mapping_kind_and_region(&mut self) -> anyhow::Result<(MappingKind, MappingRegion)> {
+        const HIGH_BIT: u32 = 1u32 << 31;
+
+        let kind = self.read_raw_mapping_kind()?;
+        let mut region = self.read_raw_mapping_region()?;
+
+        // Common case: no gap flag, so the raw kind is the final kind.
+        if region.end_column & HIGH_BIT == 0 {
+            return Ok((kind, region));
+        }
+
+        // The flag is not part of the column number, so strip it.
+        region.end_column &= !HIGH_BIT;
+        match kind {
+            MappingKind::Code(term) => Ok((MappingKind::Gap(term), region)),
+            // LLVM's coverage mapping reader will actually handle this case
+            // without complaint, but the result is almost certainly a
+            // meaningless implementation artifact.
+            _ => Err(anyhow!("unexpected base kind for gap region: {kind:?}")),
+        }
+    }
+
+    /// Reads a mapping kind (plus the two branch arms, for branch mappings),
+    /// without applying any gap-region adjustment.
+    fn read_raw_mapping_kind(&mut self) -> anyhow::Result<MappingKind> {
+        let raw_mapping_kind = self.read_uleb128_u32()?;
+
+        // Anything that decodes as a term is a plain code mapping.
+        match CovTerm::decode(raw_mapping_kind) {
+            Some(term) => return Ok(MappingKind::Code(term)),
+            None => {}
+        }
+
+        // Decoding only fails for a zero tag with non-zero high bits.
+        assert_eq!(raw_mapping_kind & 0b11, 0);
+        assert_ne!(raw_mapping_kind, 0);
+
+        let is_expansion = raw_mapping_kind & 0b100 != 0;
+        let high = raw_mapping_kind >> 3;
+        if is_expansion {
+            return Ok(MappingKind::Expansion(high));
+        }
+
+        match high {
+            0 => unreachable!("zero kind should have already been handled as a code mapping"),
+            2 => Ok(MappingKind::Skip),
+            4 => {
+                let r#true = self.read_simple_term()?;
+                let r#false = self.read_simple_term()?;
+                Ok(MappingKind::Branch { r#true, r#false })
+            }
+            _ => Err(anyhow!("unknown mapping kind: {raw_mapping_kind:#x}")),
+        }
+    }
+
+    /// Reads the four ULEB128 values that make up a region, in encoded order.
+    fn read_raw_mapping_region(&mut self) -> anyhow::Result<MappingRegion> {
+        // Struct-literal fields are evaluated in the order written, which is
+        // the order the values appear in the payload.
+        Ok(MappingRegion {
+            start_line_offset: self.read_uleb128_u32()?,
+            start_column: self.read_uleb128_u32()?,
+            end_line_offset: self.read_uleb128_u32()?,
+            end_column: self.read_uleb128_u32()?,
+        })
+    }
+}
+
+/// Enum that can hold a constant zero value, the ID of a physical coverage
+/// counter, or the ID (and operation) of a coverage-counter expression.
+///
+/// Terms are used as the operands of coverage-counter expressions, as the arms
+/// of branch mappings, and as the value of code/gap mappings.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum CovTerm {
+ /// Constant zero; decoded from a raw value whose bits are all zero.
+ Zero,
+ /// Counter ID, taken from the bits above the 2-bit tag (tag `0b01`).
+ Counter(u32),
+ /// Expression ID (the bits above the 2-bit tag) together with the operator
+ /// selected by the tag: `0b10` for `Op::Sub`, `0b11` for `Op::Add`.
+ Expression(u32, Op),
+}
+
+/// Operator (addition or subtraction) used by an expression.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum Op {
+ /// Subtraction; decoded from tag `0b10` and printed as `-` in resolved form.
+ Sub,
+ /// Addition; decoded from tag `0b11` and printed as `+` in resolved form.
+ Add,
+}
+
+impl CovTerm {
+    /// Decodes a raw term value: the low two bits are a tag, and the
+    /// remaining bits are a counter/expression ID.
+    ///
+    /// Returns `None` when the tag is `0b00` but the remaining bits are
+    /// non-zero.
+    pub(crate) fn decode(input: u32) -> Option<Self> {
+        let tag = input & 0b11;
+        let high = input >> 2;
+
+        match (tag, high) {
+            (0b00, 0) => Some(Self::Zero),
+            (0b01, id) => Some(Self::Counter(id)),
+            (0b10, id) => Some(Self::Expression(id, Op::Sub)),
+            (0b11, id) => Some(Self::Expression(id, Op::Add)),
+            // When reading expression operands or branch arms, the LLVM
+            // coverage mapping reader always interprets a `0b00` tag as a zero
+            // term, even if the high bits are non-zero. Here that case is a
+            // failure instead, so that the full mapping-kind reader can share
+            // this code.
+            _ => None,
+        }
+    }
+}
+
+/// Kind of a single coverage mapping, as produced by the mapping-kind reader
+/// in this file. The derived `Debug` output is what appears in the dump.
+#[derive(Debug)]
+enum MappingKind {
+ /// The raw kind value decoded as a term.
+ Code(CovTerm),
+ /// A `Code` mapping whose region had the high bit of its end column set.
+ Gap(CovTerm),
+ /// The raw kind had bit `0b100` set; holds the raw value shifted right by 3.
+ Expansion(u32),
+ /// Non-expansion kind whose value above the low three bits is 2.
+ Skip,
+ // Using raw identifiers here makes the dump output a little bit nicer
+ // (via the derived Debug), at the expense of making this tool's source
+ // code a little bit uglier.
+ //
+ // Non-expansion kind whose value above the low three bits is 4; the two
+ // arms are read as terms immediately after the kind.
+ Branch { r#true: CovTerm, r#false: CovTerm },
+}
+
+/// Source region attached to a mapping, held as the four raw values read
+/// from the payload (apart from the gap flag being cleared from `end_column`).
+struct MappingRegion {
+ /// Offset of this region's start line, relative to the *start line* of
+ /// the *previous mapping* (or 0). Line numbers are 1-based.
+ start_line_offset: u32,
+ /// This region's start column, absolute and 1-based.
+ start_column: u32,
+ /// Offset of this region's end line, relative to *this mapping's*
+ /// start line. Line numbers are 1-based.
+ end_line_offset: u32,
+ /// This region's end column, absolute, 1-based, and exclusive.
+ ///
+ /// If the highest bit is set, that bit is cleared and the associated
+ /// mapping becomes a gap region mapping.
+ end_column: u32,
+}
+
+/// Formats the region using its relative line offsets, e.g.
+/// `(prev + 1, 5) to (start + 0, 10)`.
+impl Debug for MappingRegion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { start_line_offset, start_column, end_line_offset, end_column } = self;
+        write!(
+            f,
+            "(prev + {}, {}) to (start + {}, {})",
+            start_line_offset, start_column, end_line_offset, end_column
+        )
+    }
+}
+
+/// Helper type that prints expressions in a "resolved" form, so that
+/// developers reading the dump don't need to resolve expressions by hand.
+struct ExpressionResolver {
+ /// `(lhs, rhs)` operand pairs in the order they were pushed; an expression
+ /// ID is used as an index into this list.
+ operands: Vec<(CovTerm, CovTerm)>,
+}
+
+impl ExpressionResolver {
+    /// Creates a resolver with an empty expression table.
+    fn new() -> Self {
+        Self { operands: vec![] }
+    }
+
+    /// Appends the operands of the next expression; its ID is its position
+    /// in push order.
+    fn push_operands(&mut self, lhs: CovTerm, rhs: CovTerm) {
+        let pair = (lhs, rhs);
+        self.operands.push(pair);
+    }
+
+    /// Returns `term` rendered in resolved form as a fresh string.
+    fn format_term(&self, term: CovTerm) -> String {
+        let mut rendered = String::new();
+        self.write_term(&mut rendered, term);
+        rendered
+    }
+
+    /// Appends the resolved form of `term` to `output`, recursively expanding
+    /// expression operands.
+    ///
+    /// Panics if an expression ID has no entry in the operand table.
+    fn write_term(&self, output: &mut String, term: CovTerm) {
+        // Leaf terms are written directly; only expressions need expanding.
+        let (id, op) = match term {
+            CovTerm::Zero => {
+                output.push_str("Zero");
+                return;
+            }
+            CovTerm::Counter(id) => {
+                write!(output, "c{id}").unwrap();
+                return;
+            }
+            CovTerm::Expression(id, op) => (id, op),
+        };
+
+        let (lhs, rhs) = self.operands[id as usize];
+        let symbol = match op {
+            Op::Add => "+",
+            Op::Sub => "-",
+        };
+
+        output.push('(');
+        self.write_term(output, lhs);
+        write!(output, " {op} ", op = symbol).unwrap();
+        self.write_term(output, rhs);
+        output.push(')');
+    }
+}
diff --git a/src/tools/coverage-dump/src/main.rs b/src/tools/coverage-dump/src/main.rs
new file mode 100644
index 000000000..93fed1799
--- /dev/null
+++ b/src/tools/coverage-dump/src/main.rs
@@ -0,0 +1,17 @@
+mod covfun;
+mod parser;
+mod prf_names;
+
+/// Entry point: reads the LLVM IR file named by the first command-line
+/// argument, builds the function-name table from it, and dumps its coverage
+/// mappings to stdout.
+fn main() -> anyhow::Result<()> {
+    use anyhow::Context as _;
+
+    let cli_args: Vec<String> = std::env::args().collect();
+    let llvm_ir_path = cli_args.get(1).context("LLVM IR file not specified")?;
+
+    let llvm_ir = std::fs::read_to_string(llvm_ir_path).context("couldn't read LLVM IR file")?;
+    let function_names = crate::prf_names::make_function_names_table(&llvm_ir)?;
+
+    crate::covfun::dump_covfun_mappings(&llvm_ir, &function_names)
+}
diff --git a/src/tools/coverage-dump/src/parser.rs b/src/tools/coverage-dump/src/parser.rs
new file mode 100644
index 000000000..eefac1a4f
--- /dev/null
+++ b/src/tools/coverage-dump/src/parser.rs
@@ -0,0 +1,80 @@
+#[cfg(test)]
+mod tests;
+
+use anyhow::ensure;
+use regex::bytes;
+use std::sync::OnceLock;
+
+/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
+/// backslash escapes and returns a vector containing the resulting byte string.
+///
+/// Two escapes are recognised: `\\` (a literal backslash) and `\xx`, where
+/// `xx` is exactly two hexadecimal digits of either case. A backslash that
+/// starts neither form is copied through unchanged, along with whatever
+/// follows it.
+pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
+    let escape_re = {
+        static RE: OnceLock<bytes::Regex> = OnceLock::new();
+        // LLVM IR supports two string escapes: `\\` and `\xx`.
+        //
+        // The character class must admit hex digits only. A wider class such
+        // as `[0-9A-Za-z]` would also match sequences like `\zz`, and the hex
+        // conversion below would then panic on them.
+        RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").unwrap())
+    };
+
+    fn u8_from_hex_digits(digits: &[u8]) -> u8 {
+        // The regex guarantees exactly 2 ASCII hex digits here, so neither
+        // conversion can fail.
+        assert_eq!(digits.len(), 2);
+        let digits = std::str::from_utf8(digits).unwrap();
+        u8::from_str_radix(digits, 16).unwrap()
+    }
+
+    escape_re
+        .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
+            let byte = match captures.get(1) {
+                // No hex group captured, so the `\\` alternative matched.
+                None => b'\\',
+                Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
+            };
+            [byte]
+        })
+        .into_owned()
+}
+
+/// Cursor over a borrowed byte slice; each successful read consumes bytes
+/// from the front.
+pub(crate) struct Parser<'a> {
+ /// Bytes that have not been consumed yet.
+ rest: &'a [u8],
+}
+
+impl<'a> Parser<'a> {
+    /// Creates a parser positioned at the start of `input`.
+    pub(crate) fn new(input: &'a [u8]) -> Self {
+        Parser { rest: input }
+    }
+
+    /// Consumes the parser, returning an error if any bytes remain unread.
+    pub(crate) fn ensure_empty(self) -> anyhow::Result<()> {
+        let leftover = self.rest;
+        ensure!(leftover.is_empty(), "unparsed bytes: 0x{:02x?}", leftover);
+        Ok(())
+    }
+
+    /// Consumes and returns the next `n` bytes, or returns an error (leaving
+    /// the position untouched) if fewer than `n` bytes remain.
+    pub(crate) fn read_n_bytes(&mut self, n: usize) -> anyhow::Result<&'a [u8]> {
+        ensure!(n <= self.rest.len());
+
+        let remaining: &'a [u8] = self.rest;
+        let (taken, tail) = (&remaining[..n], &remaining[n..]);
+        self.rest = tail;
+        Ok(taken)
+    }
+
+    /// Reads a ULEB128 value, failing if it doesn't fit in a `u32`.
+    pub(crate) fn read_uleb128_u32(&mut self) -> anyhow::Result<u32> {
+        self.read_uleb128_u64_and_convert::<u32>()
+    }
+
+    /// Reads a ULEB128 value, failing if it doesn't fit in a `usize`.
+    pub(crate) fn read_uleb128_usize(&mut self) -> anyhow::Result<usize> {
+        self.read_uleb128_u64_and_convert::<usize>()
+    }
+
+    /// Reads a ULEB128 value as a `u64` and converts it to `T`.
+    ///
+    /// The position only advances when both the read and the conversion
+    /// succeed.
+    fn read_uleb128_u64_and_convert<T>(&mut self) -> anyhow::Result<T>
+    where
+        T: TryFrom<u64> + 'static,
+        T::Error: std::error::Error + Send + Sync,
+    {
+        // Decode from a scratch copy of the cursor, so that a failure leaves
+        // `self.rest` where it was (useful for error reporting).
+        let mut cursor = self.rest;
+        let raw_value: u64 = leb128::read::unsigned(&mut cursor)?;
+        let converted = T::try_from(raw_value)?;
+
+        self.rest = cursor;
+        Ok(converted)
+    }
+}
diff --git a/src/tools/coverage-dump/src/parser/tests.rs b/src/tools/coverage-dump/src/parser/tests.rs
new file mode 100644
index 000000000..a673606b9
--- /dev/null
+++ b/src/tools/coverage-dump/src/parser/tests.rs
@@ -0,0 +1,38 @@
+use super::unescape_llvm_string_contents;
+
+// WARNING: These tests don't necessarily run in CI, and were mainly used to
+// help track down problems when originally developing this tool.
+// (The tool is still tested indirectly by snapshot tests that rely on it.)
+
+// Tests for `unescape_llvm_string_contents`:
+
+#[test]
+fn unescape_empty() {
+    // Empty input must decode to an empty byte string.
+    let decoded = unescape_llvm_string_contents("");
+    assert!(decoded.is_empty());
+}
+
+#[test]
+fn unescape_noop() {
+    // Text without any backslashes is returned byte-for-byte.
+    let plain = "The quick brown fox jumps over the lazy dog.";
+    let decoded = unescape_llvm_string_contents(plain);
+    assert_eq!(decoded.as_slice(), plain.as_bytes());
+}
+
+#[test]
+fn unescape_backslash() {
+    // Each `\\` pair collapses into a single backslash.
+    let escaped = r"\\Hello\\world\\";
+    let expected: &[u8] = br"\Hello\world\";
+    assert_eq!(unescape_llvm_string_contents(escaped), expected);
+}
+
+#[test]
+fn unescape_hex() {
+    // Two-digit hex escapes decode to a single byte, in either letter case.
+    let escaped = r"\01\02\03\04\0a\0b\0C\0D\fd\fE\FF";
+    let expected = vec![0x01u8, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, 0xfd, 0xfe, 0xff];
+    assert_eq!(unescape_llvm_string_contents(escaped), expected);
+}
+
+#[test]
+fn unescape_mixed() {
+    // A leading `\\` is consumed as a backslash escape, so the `01` after it
+    // stays literal; `\5c` is the hex escape for a backslash.
+    let escaped = r"\\01.\5c\5c";
+    let expected = r"\01.\\".as_bytes();
+    assert_eq!(unescape_llvm_string_contents(escaped), expected);
+}
diff --git a/src/tools/coverage-dump/src/prf_names.rs b/src/tools/coverage-dump/src/prf_names.rs
new file mode 100644
index 000000000..d3f7b819e
--- /dev/null
+++ b/src/tools/coverage-dump/src/prf_names.rs
@@ -0,0 +1,87 @@
+use crate::parser::{unescape_llvm_string_contents, Parser};
+use anyhow::{anyhow, ensure};
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+/// Builds a table from name-hash values to (demangled) function names, by
+/// finding and decoding every `__llvm_prf_names` entry in the given LLVM IR
+/// assembly text.
+///
+/// Returns an error if an entry's payload is malformed: a bad length prefix,
+/// a decompression failure or size mismatch, a non-UTF-8 symbol name, or
+/// trailing bytes.
+pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap<u64, String>> {
+    /// Returns the still-escaped string literal contents of an
+    /// `@__llvm_prf_nm` line, or `None` for any other line.
+    fn prf_names_payload(line: &str) -> Option<&str> {
+        // The match is on the variable name `@__llvm_prf_nm` instead of the
+        // section name: the section name is harder to extract and differs
+        // across Linux/Windows/macOS.
+        static RE: OnceLock<Regex> = OnceLock::new();
+        let re = RE.get_or_init(|| {
+            Regex::new(r#"^@__llvm_prf_nm =.*\[[0-9]+ x i8\] c"([^"]*)".*$"#).unwrap()
+        });
+
+        re.captures(line).map(|caps| caps.get(1).unwrap().as_str())
+    }
+
+    /// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
+    /// 64 bits as a way to associate data stored in different tables/sections.
+    fn truncated_md5(bytes: &[u8]) -> u64 {
+        use md5::{Digest, Md5};
+
+        let mut hasher = Md5::new();
+        hasher.update(bytes);
+        let digest = hasher.finalize();
+
+        let mut low_bytes = [0u8; 8];
+        low_bytes.copy_from_slice(&digest.as_slice()[..8]);
+        // The truncated hash is explicitly little-endian, regardless of host
+        // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
+        u64::from_le_bytes(low_bytes)
+    }
+
+    /// Demangles a raw symbol name, falling back to a marked copy of the raw
+    /// name when it isn't a mangled Rust symbol. Fails only on invalid UTF-8.
+    fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> {
+        // In practice, raw symbol names should always be ASCII.
+        let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;
+
+        let shown = if let Ok(d) = rustc_demangle::try_demangle(symbol_name_str) {
+            format!("{d:#}")
+        } else {
+            // A demangling failure is not an error, so that the dump tool can
+            // also be run against non-Rust coverage maps produced by `clang`,
+            // for testing purposes.
+            format!("(couldn't demangle) {symbol_name_str}")
+        };
+        Ok(shown)
+    }
+
+    let mut map = HashMap::new();
+
+    for line in llvm_ir.lines() {
+        let Some(escaped) = prf_names_payload(line) else {
+            continue;
+        };
+        let payload = unescape_llvm_string_contents(escaped);
+
+        // The payload starts with two lengths, followed by the name bytes.
+        let mut parser = Parser::new(&payload);
+        let uncompressed_len = parser.read_uleb128_usize()?;
+        let compressed_len = parser.read_uleb128_usize()?;
+
+        let uncompressed_bytes: std::borrow::Cow<'_, [u8]> = if compressed_len != 0 {
+            // The symbol name bytes are compressed, so read and inflate them.
+            let compressed_bytes = parser.read_n_bytes(compressed_len)?;
+
+            let uncompressed_bytes_vec =
+                miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
+                    compressed_bytes,
+                    uncompressed_len,
+                )
+                .map_err(|e| anyhow!("{e:?}"))?;
+            ensure!(uncompressed_bytes_vec.len() == uncompressed_len);
+
+            std::borrow::Cow::Owned(uncompressed_bytes_vec)
+        } else {
+            // A zero compressed length means the name bytes are stored as-is.
+            std::borrow::Cow::Borrowed(parser.read_n_bytes(uncompressed_len)?)
+        };
+
+        // Symbol names in the payload are separated by `0x01` bytes.
+        for raw_name in uncompressed_bytes.split(|byte| *byte == 0x01) {
+            let hash = truncated_md5(raw_name);
+            let demangled = demangle_if_able(raw_name)?;
+            map.insert(hash, demangled);
+        }
+
+        parser.ensure_empty()?;
+    }
+
+    Ok(map)
+}