From 4547b622d8d29df964fa2914213088b148c498fc Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:32 +0200 Subject: Merging upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- src/tools/collect-license-metadata/Cargo.toml | 10 + src/tools/collect-license-metadata/src/licenses.rs | 65 +++++ src/tools/collect-license-metadata/src/main.rs | 30 +++ .../collect-license-metadata/src/path_tree.rs | 294 +++++++++++++++++++++ src/tools/collect-license-metadata/src/reuse.rs | 49 ++++ 5 files changed, 448 insertions(+) create mode 100644 src/tools/collect-license-metadata/Cargo.toml create mode 100644 src/tools/collect-license-metadata/src/licenses.rs create mode 100644 src/tools/collect-license-metadata/src/main.rs create mode 100644 src/tools/collect-license-metadata/src/path_tree.rs create mode 100644 src/tools/collect-license-metadata/src/reuse.rs (limited to 'src/tools/collect-license-metadata') diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml new file mode 100644 index 000000000..d0820cfc2 --- /dev/null +++ b/src/tools/collect-license-metadata/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "collect-license-metadata" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.65" +serde = { version = "1.0.147", features = ["derive"] } +serde_json = "1.0.85" +spdx-rs = "0.5.1" diff --git a/src/tools/collect-license-metadata/src/licenses.rs b/src/tools/collect-license-metadata/src/licenses.rs new file mode 100644 index 000000000..1c95b1bc8 --- /dev/null +++ b/src/tools/collect-license-metadata/src/licenses.rs @@ -0,0 +1,65 @@ +use std::collections::HashMap; + +const COPYRIGHT_PREFIXES: &[&str] = &["SPDX-FileCopyrightText:", "Copyright", "(c)", "(C)", "©"]; + +pub(crate) struct LicensesInterner { + by_id: Vec, + by_struct: HashMap, +} + +impl LicensesInterner { + pub(crate) fn new() -> Self { + LicensesInterner { by_id: Vec::new(), by_struct: HashMap::new() } + } + + pub(crate) fn intern(&mut self, mut license: License) -> LicenseId { + license.simplify(); + if let Some(id) = self.by_struct.get(&license) { + LicenseId(*id) + } else { + let id = self.by_id.len(); + self.by_id.push(license.clone()); + self.by_struct.insert(license, id); + LicenseId(id) + } + } + + pub(crate) fn resolve(&self, id: LicenseId) -> &License { + &self.by_id[id.0] + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)] +#[serde(transparent)] +pub(crate) struct LicenseId(usize); + +#[derive(Clone, Hash, PartialEq, Eq, serde::Serialize)] +pub(crate) struct License { + pub(crate) spdx: String, + pub(crate) copyright: Vec, +} + +impl License { + fn simplify(&mut self) { + self.remove_copyright_prefixes(); + self.copyright.sort(); + self.copyright.dedup(); + } + + fn remove_copyright_prefixes(&mut self) { + for copyright in &mut self.copyright { + let mut stripped = copyright.trim(); + let mut previous_stripped; + loop { + previous_stripped = stripped; + for pattern in COPYRIGHT_PREFIXES { + stripped = stripped.trim_start_matches(pattern).trim_start(); + } + if stripped == previous_stripped { + break; + } + } + *copyright = stripped.into(); + } + } +} diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs new file mode 100644 index 000000000..ca2a6f4b8 --- /dev/null +++ b/src/tools/collect-license-metadata/src/main.rs @@ -0,0 +1,30 @@ +mod licenses; +mod path_tree; +mod reuse; + +use crate::licenses::LicensesInterner; +use anyhow::Error; +use std::path::PathBuf; + +fn main() -> Result<(), Error> { + let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into(); + let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into(); + + let mut interner = LicensesInterner::new(); + let paths = crate::reuse::collect(&reuse_exe, &mut interner)?; + + let mut tree = crate::path_tree::build(paths); + tree.simplify(); + + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write( + &dest, + &serde_json::to_vec_pretty(&serde_json::json!({ + "files": crate::path_tree::expand_interned_licenses(tree, &interner), + }))?, + )?; + + Ok(()) +} diff --git a/src/tools/collect-license-metadata/src/path_tree.rs b/src/tools/collect-license-metadata/src/path_tree.rs new file mode 100644 index 000000000..133ff6837 --- /dev/null +++ b/src/tools/collect-license-metadata/src/path_tree.rs @@ -0,0 +1,294 @@ +//! Tools like REUSE output per-file licensing information, but we need to condense it in the +//! minimum amount of data that still represents the same licensing metadata. This module is +//! responsible for that, by turning the list of paths into a tree and executing simplification +//! passes over the tree to remove redundant information. + +use crate::licenses::{License, LicenseId, LicensesInterner}; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; + +#[derive(serde::Serialize)] +#[serde(rename_all = "kebab-case", tag = "type")] +pub(crate) enum Node { + Root { childs: Vec> }, + Directory { name: PathBuf, childs: Vec>, license: Option }, + File { name: PathBuf, license: L }, + FileGroup { names: Vec, license: L }, + Empty, +} + +impl Node { + pub(crate) fn simplify(&mut self) { + self.merge_directories(); + self.collapse_in_licensed_directories(); + self.merge_directory_licenses(); + self.merge_file_groups(); + self.remove_empty(); + } + + /// Initially, the build() function constructs a list of separate paths from the file + /// system root down to each file, like so: + /// + /// ```text + /// ┌─► ./ ──► compiler/ ──► rustc/ ──► src/ ──► main.rs + /// │ + /// ─┼─► ./ ──► compiler/ ──► rustc/ ──► Cargo.toml + /// │ + /// └─► ./ ──► library/ ───► std/ ──► Cargo.toml + /// ``` + /// + /// This pass is responsible for turning that into a proper directory tree: + /// + /// ```text + /// ┌─► compiler/ ──► rustc/ ──┬─► src/ ──► main.rs + /// │ │ + /// ──► ./ ──┤ └─► Cargo.toml + /// │ + /// └─► library/ ───► std/ ──► Cargo.toml + /// ``` + fn merge_directories(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, license: None, .. } => { + let mut directories = BTreeMap::new(); + let mut files = Vec::new(); + + for child in childs.drain(..) { + match child { + Node::Directory { name, mut childs, license: None } => { + directories.entry(name).or_insert_with(Vec::new).append(&mut childs); + } + file @ Node::File { .. } => { + files.push(file); + } + Node::Empty => {} + Node::Root { .. } => { + panic!("can't have a root inside another element"); + } + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { license: Some(_), .. } => { + panic!("license should not be set at this stage"); + } + } + } + + childs.extend(directories.into_iter().map(|(name, childs)| Node::Directory { + name, + childs, + license: None, + })); + childs.append(&mut files); + + for child in &mut *childs { + child.merge_directories(); + } + } + Node::Empty => {} + Node::File { .. } => {} + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { license: Some(_), .. } => { + panic!("license should not be set at this stage"); + } + } + } + + /// In our codebase, most files in a directory have the same license as the other files in that + /// same directory, so it's redundant to store licensing metadata for all the files. Instead, + /// we can add a license for a whole directory, and only record the exceptions to a directory + /// licensing metadata. + /// + /// We cannot instead record only the difference to Rust's standard licensing, as the majority + /// of the files in our repository are *not* licensed under Rust's standard licensing due to + /// our inclusion of LLVM. + fn collapse_in_licensed_directories(&mut self) { + match self { + Node::Directory { childs, license, .. } => { + for child in &mut *childs { + child.collapse_in_licensed_directories(); + } + + let mut licenses_count = BTreeMap::new(); + for child in &*childs { + let Some(license) = child.license() else { continue }; + *licenses_count.entry(license).or_insert(0) += 1; + } + + let most_popular_license = licenses_count + .into_iter() + .max_by_key(|(_, count)| *count) + .map(|(license, _)| license); + + if let Some(most_popular_license) = most_popular_license { + childs.retain(|child| child.license() != Some(most_popular_license)); + *license = Some(most_popular_license); + } + } + Node::Root { childs } => { + for child in &mut *childs { + child.collapse_in_licensed_directories(); + } + } + Node::File { .. } => {} + Node::FileGroup { .. } => {} + Node::Empty => {} + } + } + + /// Reduce the depth of the tree by merging subdirectories with the same license as their + /// parent directory into their parent, and adjusting the paths of the childs accordingly. + fn merge_directory_licenses(&mut self) { + match self { + Node::Root { childs } => { + for child in &mut *childs { + child.merge_directory_licenses(); + } + } + Node::Directory { childs, license, .. } => { + let mut to_add = Vec::new(); + for child in &mut *childs { + child.merge_directory_licenses(); + + let Node::Directory { + name: child_name, + childs: child_childs, + license: child_license, + } = child else { continue }; + + if child_license != license { + continue; + } + for mut child_child in child_childs.drain(..) { + match &mut child_child { + Node::Root { .. } => { + panic!("can't have a root inside another element"); + } + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { name: child_child_name, .. } => { + *child_child_name = child_name.join(&child_child_name); + } + Node::File { name: child_child_name, .. } => { + *child_child_name = child_name.join(&child_child_name); + } + Node::Empty => {} + } + to_add.push(child_child); + } + + *child = Node::Empty; + } + childs.append(&mut to_add); + } + Node::Empty => {} + Node::File { .. } => {} + Node::FileGroup { .. } => {} + } + } + + /// This pass groups multiple files in a directory with the same license into a single + /// "FileGroup", so that the license of all those files can be reported as a group. + /// + /// Crucially this pass runs after collapse_in_licensed_directories, so the most common license + /// will already be marked as the directory's license and won't be turned into a group. + fn merge_file_groups(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, .. } => { + let mut grouped = BTreeMap::new(); + + for child in &mut *childs { + child.merge_file_groups(); + if let Node::File { name, license } = child { + grouped.entry(*license).or_insert_with(Vec::new).push(name.clone()); + *child = Node::Empty; + } + } + + for (license, mut names) in grouped.into_iter() { + if names.len() == 1 { + childs.push(Node::File { license, name: names.pop().unwrap() }); + } else { + childs.push(Node::FileGroup { license, names }); + } + } + } + Node::File { .. } => {} + Node::FileGroup { .. } => panic!("FileGroup should not be present at this stage"), + Node::Empty => {} + } + } + + /// Some nodes were replaced with Node::Empty to mark them for deletion. As the last step, make + /// sure to remove them from the tree. + fn remove_empty(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, .. } => { + for child in &mut *childs { + child.remove_empty(); + } + childs.retain(|child| !matches!(child, Node::Empty)); + } + Node::FileGroup { .. } => {} + Node::File { .. } => {} + Node::Empty => {} + } + } + + fn license(&self) -> Option { + match self { + Node::Directory { childs, license: Some(license), .. } if childs.is_empty() => { + Some(*license) + } + Node::File { license, .. } => Some(*license), + _ => None, + } + } +} + +pub(crate) fn build(mut input: Vec<(PathBuf, LicenseId)>) -> Node { + let mut childs = Vec::new(); + + // Ensure reproducibility of all future steps. + input.sort(); + + for (path, license) in input { + let mut node = Node::File { name: path.file_name().unwrap().into(), license }; + for component in path.parent().unwrap_or_else(|| Path::new(".")).components().rev() { + node = Node::Directory { + name: component.as_os_str().into(), + childs: vec![node], + license: None, + }; + } + + childs.push(node); + } + + Node::Root { childs } +} + +/// Convert a `Node` into a `Node<&License>`, expanding all interned license IDs with a +/// reference to the actual license metadata. +pub(crate) fn expand_interned_licenses( + node: Node, + interner: &LicensesInterner, +) -> Node<&License> { + match node { + Node::Root { childs } => Node::Root { + childs: childs.into_iter().map(|child| strip_interning(child, interner)).collect(), + }, + Node::Directory { name, childs, license } => Node::Directory { + childs: childs.into_iter().map(|child| strip_interning(child, interner)).collect(), + license: license.map(|license| interner.resolve(license)), + name, + }, + Node::File { name, license } => Node::File { name, license: interner.resolve(license) }, + Node::FileGroup { names, license } => { + Node::FileGroup { names, license: interner.resolve(license) } + } + Node::Empty => Node::Empty, + } +} diff --git a/src/tools/collect-license-metadata/src/reuse.rs b/src/tools/collect-license-metadata/src/reuse.rs new file mode 100644 index 000000000..d6b3772ba --- /dev/null +++ b/src/tools/collect-license-metadata/src/reuse.rs @@ -0,0 +1,49 @@ +use crate::licenses::{License, LicenseId, LicensesInterner}; +use anyhow::Error; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Instant; + +pub(crate) fn collect( + reuse_exe: &Path, + interner: &mut LicensesInterner, +) -> Result, Error> { + eprintln!("gathering license information from REUSE"); + let start = Instant::now(); + let raw = &obtain_spdx_document(reuse_exe)?; + eprintln!("finished gathering the license information from REUSE in {:.2?}", start.elapsed()); + + let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?; + + let mut result = Vec::new(); + for file in document.file_information { + let license = interner.intern(License { + spdx: file.concluded_license.to_string(), + copyright: file.copyright_text.split('\n').map(|s| s.into()).collect(), + }); + + result.push((file.file_name.into(), license)); + } + + Ok(result) +} + +fn obtain_spdx_document(reuse_exe: &Path) -> Result { + let output = Command::new(reuse_exe) + .args(&["spdx", "--add-license-concluded", "--creator-person=bors"]) + .stdout(Stdio::piped()) + .spawn()? + .wait_with_output()?; + + if !output.status.success() { + eprintln!(); + eprintln!("Note that Rust requires some REUSE features that might not be present in the"); + eprintln!("release you're using. Make sure your REUSE release includes these PRs:"); + eprintln!(); + eprintln!(" - https://github.com/fsfe/reuse-tool/pull/623"); + eprintln!(); + anyhow::bail!("collecting licensing information with REUSE failed"); + } + + Ok(String::from_utf8(output.stdout)?) +} -- cgit v1.2.3