summaryrefslogtreecommitdiffstats
path: root/src/tools/collect-license-metadata
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
commit4547b622d8d29df964fa2914213088b148c498fc (patch)
tree9fc6b25f3c3add6b745be9a2400a6e96140046e9 /src/tools/collect-license-metadata
parentReleasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz
rustc-4547b622d8d29df964fa2914213088b148c498fc.zip
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/tools/collect-license-metadata')
-rw-r--r--src/tools/collect-license-metadata/Cargo.toml10
-rw-r--r--src/tools/collect-license-metadata/src/licenses.rs65
-rw-r--r--src/tools/collect-license-metadata/src/main.rs30
-rw-r--r--src/tools/collect-license-metadata/src/path_tree.rs294
-rw-r--r--src/tools/collect-license-metadata/src/reuse.rs49
5 files changed, 448 insertions, 0 deletions
diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml
new file mode 100644
index 000000000..d0820cfc2
--- /dev/null
+++ b/src/tools/collect-license-metadata/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "collect-license-metadata"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0.65"
+serde = { version = "1.0.147", features = ["derive"] }
+serde_json = "1.0.85"
+spdx-rs = "0.5.1"
diff --git a/src/tools/collect-license-metadata/src/licenses.rs b/src/tools/collect-license-metadata/src/licenses.rs
new file mode 100644
index 000000000..1c95b1bc8
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/licenses.rs
@@ -0,0 +1,65 @@
+use std::collections::HashMap;
+
+/// Markers that can introduce a copyright statement. `License::remove_copyright_prefixes` strips
+/// them, repeatedly, from the start of every copyright line.
+const COPYRIGHT_PREFIXES: &[&str] = &["SPDX-FileCopyrightText:", "Copyright", "(c)", "(C)", "©"];
+
+/// Deduplicating store for `License` values, handing out a `LicenseId` for each distinct one.
+pub(crate) struct LicensesInterner {
+ /// Interned licenses, indexed by the number wrapped in their `LicenseId`.
+ by_id: Vec<License>,
+ /// Reverse lookup from an already interned license to its index in `by_id`.
+ by_struct: HashMap<License, usize>,
+}
+
+impl LicensesInterner {
+    /// Creates an interner that does not contain any license yet.
+    pub(crate) fn new() -> Self {
+        Self { by_id: Vec::new(), by_struct: HashMap::new() }
+    }
+
+    /// Returns the ID of `license`, registering it first if it was never seen before.
+    ///
+    /// The license is simplified before the lookup, so licenses that only differ in the prefixes,
+    /// order or duplication of their copyright lines share the same ID.
+    pub(crate) fn intern(&mut self, mut license: License) -> LicenseId {
+        license.simplify();
+
+        if let Some(&known) = self.by_struct.get(&license) {
+            return LicenseId(known);
+        }
+
+        // IDs are indexes into `by_id`, so the next free ID is the current length.
+        let fresh = self.by_id.len();
+        self.by_id.push(license.clone());
+        self.by_struct.insert(license, fresh);
+        LicenseId(fresh)
+    }
+
+    /// Returns the license stored under `id`.
+    ///
+    /// `by_id` is indexed directly, so an ID that is out of range for this interner panics.
+    pub(crate) fn resolve(&self, id: LicenseId) -> &License {
+        let LicenseId(index) = id;
+        &self.by_id[index]
+    }
+}
+
+/// Handle to a license stored in a `LicensesInterner`. It wraps the index of the license inside
+/// the interner, and `#[serde(transparent)]` makes it serialize as that bare number.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
+#[serde(transparent)]
+pub(crate) struct LicenseId(usize);
+
+/// Licensing metadata attached to a file: a license expression plus the copyright statements.
+///
+/// The derived `Hash` and `Eq` are what `LicensesInterner` uses to detect duplicates.
+#[derive(Clone, Hash, PartialEq, Eq, serde::Serialize)]
+pub(crate) struct License {
+ /// License expression, stored as a string.
+ pub(crate) spdx: String,
+ /// Copyright statements, one per entry; `simplify` strips prefixes, sorts and deduplicates them.
+ pub(crate) copyright: Vec<String>,
+}
+
+impl License {
+    /// Normalizes the license so that equivalent licenses compare (and hash) equal: copyright
+    /// prefixes are stripped, then the copyright lines are sorted and deduplicated.
+    fn simplify(&mut self) {
+        self.remove_copyright_prefixes();
+        // `dedup` only removes adjacent duplicates, hence the sort right before it.
+        self.copyright.sort();
+        self.copyright.dedup();
+    }
+
+    /// Trims every copyright line and strips all the leading `COPYRIGHT_PREFIXES` from it.
+    ///
+    /// Prefixes can be stacked in any order (for example `Copyright (c) ...`), so the list of
+    /// prefixes is applied over and over until a full pass leaves the line untouched.
+    fn remove_copyright_prefixes(&mut self) {
+        for line in &mut self.copyright {
+            let mut remaining = line.trim();
+            loop {
+                // One full pass: try every known prefix once, in declaration order.
+                let reduced = COPYRIGHT_PREFIXES
+                    .iter()
+                    .fold(remaining, |text, prefix| text.trim_start_matches(*prefix).trim_start());
+                if reduced == remaining {
+                    break;
+                }
+                remaining = reduced;
+            }
+            *line = remaining.into();
+        }
+    }
+}
diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs
new file mode 100644
index 000000000..ca2a6f4b8
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/main.rs
@@ -0,0 +1,30 @@
+mod licenses;
+mod path_tree;
+mod reuse;
+
+use crate::licenses::LicensesInterner;
+use anyhow::Error;
+use std::path::PathBuf;
+
+/// Collects the licensing metadata with REUSE, condenses it into a tree and writes the tree as
+/// pretty-printed JSON.
+///
+/// Configuration comes from two environment variables, and the program panics if either one is
+/// missing: `REUSE_EXE` (the REUSE executable to run) and `DEST` (the file to write).
+fn main() -> Result<(), Error> {
+    let reuse_exe = PathBuf::from(std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE"));
+    let dest = PathBuf::from(std::env::var_os("DEST").expect("Missing DEST"));
+
+    let mut interner = LicensesInterner::new();
+    let collected = crate::reuse::collect(&reuse_exe, &mut interner)?;
+
+    let mut tree = crate::path_tree::build(collected);
+    tree.simplify();
+
+    // Make sure the directory that will contain the output exists before writing to it.
+    if let Some(parent) = dest.parent() {
+        std::fs::create_dir_all(parent)?;
+    }
+
+    let document = serde_json::json!({
+        "files": crate::path_tree::expand_interned_licenses(tree, &interner),
+    });
+    let rendered = serde_json::to_vec_pretty(&document)?;
+    std::fs::write(&dest, &rendered)?;
+
+    Ok(())
+}
diff --git a/src/tools/collect-license-metadata/src/path_tree.rs b/src/tools/collect-license-metadata/src/path_tree.rs
new file mode 100644
index 000000000..133ff6837
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/path_tree.rs
@@ -0,0 +1,294 @@
+//! Tools like REUSE output per-file licensing information, but we need to condense it into the
+//! minimum amount of data that still represents the same licensing metadata. This module is
+//! responsible for that, by turning the list of paths into a tree and executing simplification
+//! passes over the tree to remove redundant information.
+
+use crate::licenses::{License, LicenseId, LicensesInterner};
+use std::collections::BTreeMap;
+use std::path::{Path, PathBuf};
+
+/// A node of the license tree. `L` is the license representation stored in the tree: `LicenseId`
+/// while the tree is being simplified, `&License` once it has been expanded for serialization.
+#[derive(serde::Serialize)]
+#[serde(rename_all = "kebab-case", tag = "type")]
+pub(crate) enum Node<L> {
+ /// The top-level node. `merge_directories` and `merge_directory_licenses` panic if they find
+ /// one nested inside another node.
+ Root { childs: Vec<Node<L>> },
+ /// A directory. `license` is `None` until `collapse_in_licensed_directories` assigns one.
+ Directory { name: PathBuf, childs: Vec<Node<L>>, license: Option<L> },
+ /// A single file and its license.
+ File { name: PathBuf, license: L },
+ /// Several files sharing one license, produced by `merge_file_groups`.
+ FileGroup { names: Vec<PathBuf>, license: L },
+ /// Placeholder for a node scheduled for deletion; removed by `remove_empty`.
+ Empty,
+}
+
+impl Node<LicenseId> {
+ /// Runs all the simplification passes over the tree, in the order they depend on each other.
+ ///
+ /// `merge_directories` has to go first, as it panics on nodes the later passes introduce
+ /// (directory licenses and `FileGroup`s), and `merge_file_groups` has to run after
+ /// `collapse_in_licensed_directories`. `remove_empty` goes last to clean up the `Node::Empty`
+ /// placeholders left behind by the other passes.
+ pub(crate) fn simplify(&mut self) {
+ self.merge_directories();
+ self.collapse_in_licensed_directories();
+ self.merge_directory_licenses();
+ self.merge_file_groups();
+ self.remove_empty();
+ }
+
+ /// Initially, the build() function constructs a list of separate paths from the file
+ /// system root down to each file, like so:
+ ///
+ /// ```text
+ ///         ┌─► ./ ──► compiler/ ──► rustc/ ──► src/ ──► main.rs
+ ///         │
+ /// <root> ─┼─► ./ ──► compiler/ ──► rustc/ ──► Cargo.toml
+ ///         │
+ ///         └─► ./ ──► library/ ───► std/ ──► Cargo.toml
+ /// ```
+ ///
+ /// This pass is responsible for turning that into a proper directory tree:
+ ///
+ /// ```text
+ ///                 ┌─► compiler/ ──► rustc/ ──┬─► src/ ──► main.rs
+ ///                 │                          │
+ /// <root> ──► ./ ──┤                          └─► Cargo.toml
+ ///                 │
+ ///                 └─► library/ ───► std/ ──► Cargo.toml
+ /// ```
+ ///
+ /// This pass must run before any other one: it panics if it finds a `FileGroup`, a directory
+ /// that already has a license, or a `Root` nested inside another node.
+ fn merge_directories(&mut self) {
+ match self {
+ Node::Root { childs } | Node::Directory { childs, license: None, .. } => {
+ // Directories sharing a name are merged into a single entry. The BTreeMap also
+ // yields the merged directories sorted by name.
+ let mut directories = BTreeMap::new();
+ let mut files = Vec::new();
+
+ for child in childs.drain(..) {
+ match child {
+ Node::Directory { name, mut childs, license: None } => {
+ directories.entry(name).or_insert_with(Vec::new).append(&mut childs);
+ }
+ file @ Node::File { .. } => {
+ files.push(file);
+ }
+ Node::Empty => {}
+ Node::Root { .. } => {
+ panic!("can't have a root inside another element");
+ }
+ Node::FileGroup { .. } => {
+ panic!("FileGroup should not be present at this stage");
+ }
+ Node::Directory { license: Some(_), .. } => {
+ panic!("license should not be set at this stage");
+ }
+ }
+ }
+
+ // Rebuild the list of childs: the merged directories first, then the files.
+ childs.extend(directories.into_iter().map(|(name, childs)| Node::Directory {
+ name,
+ childs,
+ license: None,
+ }));
+ childs.append(&mut files);
+
+ // Recurse after merging, when each subdirectory holds all of its childs.
+ for child in &mut *childs {
+ child.merge_directories();
+ }
+ }
+ Node::Empty => {}
+ Node::File { .. } => {}
+ Node::FileGroup { .. } => {
+ panic!("FileGroup should not be present at this stage");
+ }
+ Node::Directory { license: Some(_), .. } => {
+ panic!("license should not be set at this stage");
+ }
+ }
+ }
+
+ /// In our codebase, most files in a directory have the same license as the other files in that
+ /// same directory, so it's redundant to store licensing metadata for all the files. Instead,
+ /// we can add a license for a whole directory, and only record the exceptions to a directory
+ /// licensing metadata.
+ ///
+ /// We cannot instead record only the difference to Rust's standard licensing, as the majority
+ /// of the files in our repository are *not* licensed under Rust's standard licensing due to
+ /// our inclusion of LLVM.
+ ///
+ /// The license picked for a directory is the most common one among its childs that report a
+ /// license (see `license()`): files, and subdirectories that have a license and no childs
+ /// left. Those childs are then removed, as the directory license already covers them. A
+ /// directory with no such childs keeps its license unset.
+ fn collapse_in_licensed_directories(&mut self) {
+ match self {
+ Node::Directory { childs, license, .. } => {
+ // Process the subdirectories first, so the ones that end up fully covered by their
+ // own license can be counted (and removed) below.
+ for child in &mut *childs {
+ child.collapse_in_licensed_directories();
+ }
+
+ let mut licenses_count = BTreeMap::new();
+ for child in &*childs {
+ let Some(license) = child.license() else { continue };
+ *licenses_count.entry(license).or_insert(0) += 1;
+ }
+
+ // The BTreeMap is iterated in ascending `LicenseId` order and `max_by_key` returns
+ // the last maximum, so a tie is resolved in favor of the highest ID.
+ let most_popular_license = licenses_count
+ .into_iter()
+ .max_by_key(|(_, count)| *count)
+ .map(|(license, _)| license);
+
+ if let Some(most_popular_license) = most_popular_license {
+ childs.retain(|child| child.license() != Some(most_popular_license));
+ *license = Some(most_popular_license);
+ }
+ }
+ // The root never gets a license of its own; only its childs are processed.
+ Node::Root { childs } => {
+ for child in &mut *childs {
+ child.collapse_in_licensed_directories();
+ }
+ }
+ Node::File { .. } => {}
+ Node::FileGroup { .. } => {}
+ Node::Empty => {}
+ }
+ }
+
+ /// Reduce the depth of the tree by merging subdirectories with the same license as their
+ /// parent directory into their parent, and adjusting the paths of the childs accordingly.
+ ///
+ /// The childs of a merged subdirectory are moved into the parent, with the name of the
+ /// subdirectory prepended to theirs, and the subdirectory itself is replaced by `Node::Empty`
+ /// (cleaned up later by `remove_empty`). Panics if a `Root` or a `FileGroup` is found among
+ /// the childs being moved.
+ fn merge_directory_licenses(&mut self) {
+ match self {
+ Node::Root { childs } => {
+ for child in &mut *childs {
+ child.merge_directory_licenses();
+ }
+ }
+ Node::Directory { childs, license, .. } => {
+ // Childs lifted out of merged subdirectories. They are added only after the loop,
+ // as `childs` is borrowed while iterating over it.
+ let mut to_add = Vec::new();
+ for child in &mut *childs {
+ // Simplify the subtree first, so what gets lifted is already merged.
+ child.merge_directory_licenses();
+
+ let Node::Directory {
+ name: child_name,
+ childs: child_childs,
+ license: child_license,
+ } = child else { continue };
+
+ // Both sides are `Option`s, so two unset licenses also count as equal.
+ if child_license != license {
+ continue;
+ }
+ for mut child_child in child_childs.drain(..) {
+ match &mut child_child {
+ Node::Root { .. } => {
+ panic!("can't have a root inside another element");
+ }
+ Node::FileGroup { .. } => {
+ panic!("FileGroup should not be present at this stage");
+ }
+ Node::Directory { name: child_child_name, .. } => {
+ *child_child_name = child_name.join(&child_child_name);
+ }
+ Node::File { name: child_child_name, .. } => {
+ *child_child_name = child_name.join(&child_child_name);
+ }
+ Node::Empty => {}
+ }
+ to_add.push(child_child);
+ }
+
+ // Mark the now-empty subdirectory for deletion.
+ *child = Node::Empty;
+ }
+ childs.append(&mut to_add);
+ }
+ Node::Empty => {}
+ Node::File { .. } => {}
+ Node::FileGroup { .. } => {}
+ }
+ }
+
+ /// This pass groups multiple files in a directory with the same license into a single
+ /// "FileGroup", so that the license of all those files can be reported as a group.
+ ///
+ /// Crucially this pass runs after collapse_in_licensed_directories, so the most common license
+ /// will already be marked as the directory's license and won't be turned into a group.
+ ///
+ /// Panics when called on a `FileGroup`, as none should exist before this pass creates them.
+ fn merge_file_groups(&mut self) {
+ match self {
+ Node::Root { childs } | Node::Directory { childs, .. } => {
+ // File names grouped by license. The BTreeMap keeps the output order deterministic.
+ let mut grouped = BTreeMap::new();
+
+ for child in &mut *childs {
+ child.merge_file_groups();
+ if let Node::File { name, license } = child {
+ grouped.entry(*license).or_insert_with(Vec::new).push(name.clone());
+ // The file is added back below, either on its own or as part of a group.
+ *child = Node::Empty;
+ }
+ }
+
+ for (license, mut names) in grouped.into_iter() {
+ // A license used by a single file keeps a plain `File` node.
+ if names.len() == 1 {
+ childs.push(Node::File { license, name: names.pop().unwrap() });
+ } else {
+ childs.push(Node::FileGroup { license, names });
+ }
+ }
+ }
+ Node::File { .. } => {}
+ Node::FileGroup { .. } => panic!("FileGroup should not be present at this stage"),
+ Node::Empty => {}
+ }
+ }
+
+ /// Drops every `Node::Empty` placeholder from the tree.
+ ///
+ /// The other passes overwrite the nodes they consume with `Node::Empty` rather than removing
+ /// them while iterating; this final pass walks the whole tree and deletes those markers.
+ fn remove_empty(&mut self) {
+     match self {
+         Node::Root { childs } | Node::Directory { childs, .. } => {
+             // Clean up the subtrees first, then prune the markers found at this level.
+             childs.iter_mut().for_each(|child| child.remove_empty());
+             childs.retain(|child| !matches!(child, Node::Empty));
+         }
+         Node::File { .. } | Node::FileGroup { .. } | Node::Empty => {}
+     }
+ }
+
+ /// Returns the license that fully describes this node, if there is one.
+ ///
+ /// That is the case for a file, and for a directory with no childs left (its license, when
+ /// set, covers everything it contained). Every other node yields `None`.
+ fn license(&self) -> Option<LicenseId> {
+     match self {
+         Node::File { license, .. } => Some(*license),
+         // A directory without childs reports its own license, which may still be unset.
+         Node::Directory { childs, license, .. } if childs.is_empty() => *license,
+         _ => None,
+     }
+ }
+}
+
+/// Builds the initial, unmerged tree out of a list of file paths and their licenses.
+///
+/// Every input path becomes its own chain of single-child `Directory` nodes, one per component of
+/// the parent path, ending with a `File` node; all the chains hang off a single `Root`. Panics if
+/// a path has no file name.
+pub(crate) fn build(mut input: Vec<(PathBuf, LicenseId)>) -> Node<LicenseId> {
+    // Ensure reproducibility of all future steps.
+    input.sort();
+
+    let childs = input
+        .into_iter()
+        .map(|(path, license)| {
+            let leaf = Node::File { name: path.file_name().unwrap().into(), license };
+            let parent = path.parent().unwrap_or_else(|| Path::new("."));
+
+            // Wrap the file from the innermost directory outwards, hence the reversed iteration.
+            let chain = parent.components().rev().fold(leaf, |inner, component| Node::Directory {
+                name: component.as_os_str().into(),
+                childs: vec![inner],
+                license: None,
+            });
+            chain
+        })
+        .collect();
+
+    Node::Root { childs }
+}
+
+/// Convert a `Node<LicenseId>` into a `Node<&License>`, expanding all interned license IDs with a
+/// reference to the actual license metadata.
+///
+/// The whole tree is converted: `Root` and `Directory` nodes recurse into each of their childs,
+/// while the other variants are converted directly. IDs are looked up with
+/// `LicensesInterner::resolve`, so every ID in the tree must have been handed out by `interner`.
+pub(crate) fn expand_interned_licenses(
+    node: Node<LicenseId>,
+    interner: &LicensesInterner,
+) -> Node<&License> {
+    match node {
+        Node::Root { childs } => Node::Root {
+            childs: childs
+                .into_iter()
+                .map(|child| expand_interned_licenses(child, interner))
+                .collect(),
+        },
+        Node::Directory { name, childs, license } => Node::Directory {
+            childs: childs
+                .into_iter()
+                .map(|child| expand_interned_licenses(child, interner))
+                .collect(),
+            license: license.map(|license| interner.resolve(license)),
+            name,
+        },
+        Node::File { name, license } => Node::File { name, license: interner.resolve(license) },
+        Node::FileGroup { names, license } => {
+            Node::FileGroup { names, license: interner.resolve(license) }
+        }
+        Node::Empty => Node::Empty,
+    }
+}
diff --git a/src/tools/collect-license-metadata/src/reuse.rs b/src/tools/collect-license-metadata/src/reuse.rs
new file mode 100644
index 000000000..d6b3772ba
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/reuse.rs
@@ -0,0 +1,49 @@
+use crate::licenses::{License, LicenseId, LicensesInterner};
+use anyhow::Error;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+use std::time::Instant;
+
+/// Runs REUSE and returns the path of every file in its SPDX document, paired with the interned
+/// license of that file.
+///
+/// The license of a file is made of its concluded license plus its copyright text, split into one
+/// entry per line. Progress and timing information is printed to stderr.
+///
+/// Returns an error if REUSE fails or its output cannot be parsed as a tag-value SPDX document.
+pub(crate) fn collect(
+    reuse_exe: &Path,
+    interner: &mut LicensesInterner,
+) -> Result<Vec<(PathBuf, LicenseId)>, Error> {
+    eprintln!("gathering license information from REUSE");
+    let start = Instant::now();
+    let raw = obtain_spdx_document(reuse_exe)?;
+    eprintln!("finished gathering the license information from REUSE in {:.2?}", start.elapsed());
+
+    let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?;
+
+    let result = document
+        .file_information
+        .into_iter()
+        .map(|file| {
+            let license = interner.intern(License {
+                spdx: file.concluded_license.to_string(),
+                copyright: file.copyright_text.split('\n').map(String::from).collect(),
+            });
+            (PathBuf::from(file.file_name), license)
+        })
+        .collect();
+
+    Ok(result)
+}
+
+/// Runs `reuse spdx` with the given executable and returns what it printed to stdout.
+///
+/// Only stdout is captured. Returns an error if the process cannot be started, exits with a
+/// failure status (after printing a hint about the required REUSE features to stderr), or
+/// outputs invalid UTF-8.
+fn obtain_spdx_document(reuse_exe: &Path) -> Result<String, Error> {
+    let mut command = Command::new(reuse_exe);
+    command
+        .args(&["spdx", "--add-license-concluded", "--creator-person=bors"])
+        .stdout(Stdio::piped());
+
+    let child = command.spawn()?;
+    let output = child.wait_with_output()?;
+
+    if output.status.success() {
+        return Ok(String::from_utf8(output.stdout)?);
+    }
+
+    eprintln!();
+    eprintln!("Note that Rust requires some REUSE features that might not be present in the");
+    eprintln!("release you're using. Make sure your REUSE release includes these PRs:");
+    eprintln!();
+    eprintln!(" - https://github.com/fsfe/reuse-tool/pull/623");
+    eprintln!();
+    anyhow::bail!("collecting licensing information with REUSE failed");
+}