author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:41:35 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:41:35 +0000
commit     7e5d7eea9c580ef4b41a765bde624af431942b96 (patch)
tree       2c0d9ca12878fc4525650aa4e54d77a81a07cc09 /vendor/imara-diff/src/intern.rs
parent     Adding debian version 1.70.0+dfsg1-9. (diff)
Merging upstream version 1.70.0+dfsg2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/imara-diff/src/intern.rs')
-rw-r--r--  vendor/imara-diff/src/intern.rs  172
1 file changed, 172 insertions(+), 0 deletions(-)
diff --git a/vendor/imara-diff/src/intern.rs b/vendor/imara-diff/src/intern.rs
new file mode 100644
index 000000000..7d06522c8
--- /dev/null
+++ b/vendor/imara-diff/src/intern.rs
@@ -0,0 +1,172 @@
+use std::hash::Hash;
+use std::ops::Index;
+
+use ahash::RandomState;
+use hashbrown::raw::RawTable;
+
+/// A token represented as an interned integer.
+///
+/// A token represents the smallest possible unit of change during a diff.
+/// For text this is usually a line, a word or a single character.
+/// All [algorithms](crate::Algorithm) operate on interned tokens instead
+/// of using the token data directly.
+/// This allows for much better performance by amortizing the cost of hashing and equality checks.
+///
+/// While you can intern tokens yourself, it is strongly recommended to use [`InternedInput`](crate::intern::InternedInput) instead.
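+///
+/// A minimal usage sketch:
+/// ```
+/// use imara_diff::intern::Token;
+///
+/// // `Token` converts losslessly to and from its underlying `u32`
+/// let token = Token::from(42u32);
+/// assert_eq!(u32::from(token), 42);
+/// ```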
+#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
+#[repr(transparent)]
+pub struct Token(pub u32);
+
+impl From<u32> for Token {
+    fn from(token: u32) -> Self {
+        Token(token)
+    }
+}
+
+impl From<Token> for u32 {
+    fn from(token: Token) -> Self {
+        token.0
+    }
+}
+
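+/// A source of tokens for a diff, for example a text buffer split into lines or words.
+///
+/// A minimal sketch of a line-based implementation (the `Lines` wrapper below is
+/// illustrative only, not part of this crate):
+/// ```
+/// use imara_diff::intern::TokenSource;
+///
+/// struct Lines<'a>(&'a str);
+///
+/// impl<'a> TokenSource for Lines<'a> {
+///     type Token = &'a str;
+///     type Tokenizer = std::str::Lines<'a>;
+///
+///     fn tokenize(&self) -> Self::Tokenizer {
+///         self.0.lines()
+///     }
+///
+///     fn estimate_tokens(&self) -> u32 {
+///         // a rough guess: assume an average line length of 20 bytes
+///         (self.0.len() / 20) as u32
+///     }
+/// }
+/// ```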
+pub trait TokenSource {
+    type Token: Hash + Eq;
+    type Tokenizer: Iterator<Item = Self::Token>;
+    fn tokenize(&self) -> Self::Tokenizer;
+    fn estimate_tokens(&self) -> u32;
+}
+
+/// Two lists of interned [tokens](crate::intern::Token) that can be compared with the [`diff`](crate::diff) function.
+///
+/// A token represents the smallest possible unit of change during a diff.
+/// For text this is usually a line, a word or a single character.
+/// All [algorithms](crate::Algorithm) operate on interned tokens instead
+/// of using the token data directly.
+/// This allows for much better performance by amortizing the cost of hashing and equality checks.
+///
+/// `InternedInput` performs the interning for you and is the recommended way to
+/// prepare input for the [`diff`](crate::diff) function.
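+///
+/// A minimal usage sketch, assuming this crate's `TokenSource` implementation
+/// for `&str` (line tokens):
+/// ```
+/// use imara_diff::intern::InternedInput;
+///
+/// let input = InternedInput::new("a\nb\n", "a\nc\n");
+/// assert_eq!(input.before.len(), 2);
+/// assert_eq!(input.after.len(), 2);
+/// // equal lines intern to the same token
+/// assert_eq!(input.before[0], input.after[0]);
+/// ```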
+#[derive(Default)]
+pub struct InternedInput<T: Eq + Hash> {
+    pub before: Vec<Token>,
+    pub after: Vec<Token>,
+    pub interner: Interner<T>,
+}
+
+impl<T: Eq + Hash> InternedInput<T> {
+    pub fn new<I: TokenSource<Token = T>>(before: I, after: I) -> Self {
+        let token_estimate_before = before.estimate_tokens() as usize;
+        let token_estimate_after = after.estimate_tokens() as usize;
+        let mut res = Self {
+            before: Vec::with_capacity(token_estimate_before),
+            after: Vec::with_capacity(token_estimate_after),
+            interner: Interner::new(token_estimate_before + token_estimate_after),
+        };
+        res.update_before(before.tokenize());
+        res.update_after(after.tokenize());
+        res
+    }
+
+    /// Replaces `self.before` with the interned tokens yielded by `input`.
+    /// Note that this does not erase any tokens from the interner and might therefore be considered
+    /// a memory leak. If this function is called often over a long-running process,
+    /// consider clearing the interner with [`clear`](crate::intern::Interner::clear).
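+    ///
+    /// ```
+    /// // a minimal sketch, assuming the crate's `&str` line-token source
+    /// use imara_diff::intern::InternedInput;
+    ///
+    /// let mut input = InternedInput::new("a\n", "b\n");
+    /// input.update_before("c\nd\n".lines());
+    /// assert_eq!(input.before.len(), 2);
+    /// ```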
+    pub fn update_before(&mut self, input: impl Iterator<Item = T>) {
+        self.before.clear();
+        self.before.extend(input.map(|token| self.interner.intern(token)));
+    }
+
+    /// Replaces `self.after` with the interned tokens yielded by `input`.
+    /// Note that this does not erase any tokens from the interner and might therefore be considered
+    /// a memory leak. If this function is called often over a long-running process,
+    /// consider clearing the interner with [`clear`](crate::intern::Interner::clear) or
+    /// [`erase_tokens_after`](crate::intern::Interner::erase_tokens_after).
+    pub fn update_after(&mut self, input: impl Iterator<Item = T>) {
+        self.after.clear();
+        self.after.extend(input.map(|token| self.interner.intern(token)));
+    }
+
+    pub fn clear(&mut self) {
+        self.before.clear();
+        self.after.clear();
+        self.interner.clear();
+    }
+}
+
+/// A hashtable-based interner that maps each distinct token of type `T` to a unique
+/// integer [`Token`](crate::intern::Token) and allows resolving interned tokens back via indexing.
+#[derive(Default)]
+pub struct Interner<T: Hash + Eq> {
+    tokens: Vec<T>,
+    table: RawTable<Token>,
+    hasher: RandomState,
+}
+
+impl<T: Hash + Eq> Interner<T> {
+    /// Create an `Interner` with an initial capacity obtained by summing the results of
+    /// the [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`.
+    pub fn new_for_token_source<S: TokenSource<Token = T>>(before: &S, after: &S) -> Self {
+        Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
+    }
+
+    /// Create an `Interner` with initial capacity `capacity`.
+    pub fn new(capacity: usize) -> Interner<T> {
+        Interner {
+            tokens: Vec::with_capacity(capacity),
+            table: RawTable::with_capacity(capacity),
+            hasher: RandomState::new(),
+        }
+    }
+
+    /// Remove all interned tokens.
+    pub fn clear(&mut self) {
+        self.table.clear_no_drop();
+        self.tokens.clear();
+    }
+
+    /// Intern `token` and return the interned integer.
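+    ///
+    /// ```
+    /// // a minimal sketch: equal tokens intern to the same `Token`
+    /// use imara_diff::intern::Interner;
+    ///
+    /// let mut interner = Interner::new(8);
+    /// let a = interner.intern("foo");
+    /// let b = interner.intern("foo");
+    /// let c = interner.intern("bar");
+    /// assert_eq!(a, b);
+    /// assert_ne!(a, c);
+    /// assert_eq!(interner[a], "foo");
+    /// ```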
+    pub fn intern(&mut self, token: T) -> Token {
+        let hash = self.hasher.hash_one(&token);
+        if let Some(&token) = self.table.get(hash, |&it| self.tokens[it.0 as usize] == token) {
+            // the token was interned before: reuse its index
+            token
+        } else {
+            // first occurrence: assign the next index and store the token
+            let interned = Token(self.tokens.len() as u32);
+            self.table.insert(hash, interned, |&token| {
+                self.hasher.hash_one(&self.tokens[token.0 as usize])
+            });
+            self.tokens.push(token);
+            interned
+        }
+    }
+
+    /// Returns the total number of **distinct** tokens currently interned.
+    pub fn num_tokens(&self) -> u32 {
+        self.tokens.len() as u32
+    }
+
+    /// Erases `first_erased_token` and any tokens interned afterwards from the interner.
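+    ///
+    /// ```
+    /// // a minimal sketch: "bar" and "baz" are erased, "foo" survives
+    /// use imara_diff::intern::Interner;
+    ///
+    /// let mut interner = Interner::new(8);
+    /// interner.intern("foo");
+    /// let cutoff = interner.intern("bar");
+    /// interner.intern("baz");
+    /// interner.erase_tokens_after(cutoff);
+    /// assert_eq!(interner.num_tokens(), 1);
+    /// ```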
+    pub fn erase_tokens_after(&mut self, first_erased_token: Token) {
+        assert!(first_erased_token.0 <= self.tokens.len() as u32);
+        let retained = first_erased_token.0 as usize;
+        let erased = self.tokens.len() - retained;
+        if retained <= erased {
+            // fewer tokens survive than are erased: rebuilding the table
+            // from scratch is cheaper than erasing the entries one by one
+            self.table.clear_no_drop();
+            // SAFETY: the table was sized for all previously interned tokens and
+            // `retained` is no larger than that, so the table never has to grow
+            unsafe {
+                for (i, token) in self.tokens[0..retained].iter().enumerate() {
+                    self.table.insert_no_grow(self.hasher.hash_one(token), Token(i as u32));
+                }
+            }
+        } else {
+            for (i, token) in self.tokens[retained..].iter().enumerate() {
+                self.table.erase_entry(self.hasher.hash_one(token), |token| token.0 == (retained + i) as u32);
+            }
+        }
+        self.tokens.truncate(first_erased_token.0 as usize);
+    }
+}
+
+impl<T: Hash + Eq> Index<Token> for Interner<T> {
+    type Output = T;
+    fn index(&self, index: Token) -> &Self::Output {
+        &self.tokens[index.0 as usize]
+    }
+}