From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/unicode-normalization/src/decompose.rs | 161 ++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 vendor/unicode-normalization/src/decompose.rs (limited to 'vendor/unicode-normalization/src/decompose.rs') diff --git a/vendor/unicode-normalization/src/decompose.rs b/vendor/unicode-normalization/src/decompose.rs new file mode 100644 index 000000000..23cdb1a4e --- /dev/null +++ b/vendor/unicode-normalization/src/decompose.rs @@ -0,0 +1,161 @@ +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +use core::fmt::{self, Write}; +use core::iter::Fuse; +use core::ops::Range; +use tinyvec::TinyVec; + +#[derive(Clone)] +enum DecompositionType { + Canonical, + Compatible, +} + +/// External iterator for a string decomposition's characters. +#[derive(Clone)] +pub struct Decompositions { + kind: DecompositionType, + iter: Fuse, + + // This buffer stores pairs of (canonical combining class, character), + // pushed onto the end in text order. + // + // It's divided into up to three sections: + // 1) A prefix that is free space; + // 2) "Ready" characters which are sorted and ready to emit on demand; + // 3) A "pending" block which stills needs more characters for us to be able + // to sort in canonical order and is not safe to emit. + buffer: TinyVec<[(u8, char); 4]>, + ready: Range, +} + +#[inline] +pub fn new_canonical>(iter: I) -> Decompositions { + Decompositions { + kind: self::DecompositionType::Canonical, + iter: iter.fuse(), + buffer: TinyVec::new(), + ready: 0..0, + } +} + +#[inline] +pub fn new_compatible>(iter: I) -> Decompositions { + Decompositions { + kind: self::DecompositionType::Compatible, + iter: iter.fuse(), + buffer: TinyVec::new(), + ready: 0..0, + } +} + +impl Decompositions { + #[inline] + fn push_back(&mut self, ch: char) { + let class = super::char::canonical_combining_class(ch); + + if class == 0 { + self.sort_pending(); + self.buffer.push((class, ch)); + self.ready.end = self.buffer.len(); + } else { + self.buffer.push((class, ch)); + } + } + + #[inline] + fn sort_pending(&mut self) { + // NB: `sort_by_key` is stable, so it will preserve the original text's + // order within a combining class. + self.buffer[self.ready.end..].sort_by_key(|k| k.0); + } + + #[inline] + fn reset_buffer(&mut self) { + // Equivalent to `self.buffer.drain(0..self.ready.end)` + // but faster than drain() if the buffer is a SmallVec or TinyVec + let pending = self.buffer.len() - self.ready.end; + for i in 0..pending { + self.buffer[i] = self.buffer[i + self.ready.end]; + } + self.buffer.truncate(pending); + self.ready = 0..0; + } + + #[inline] + fn increment_next_ready(&mut self) { + let next = self.ready.start + 1; + if next == self.ready.end { + self.reset_buffer(); + } else { + self.ready.start = next; + } + } +} + +impl> Iterator for Decompositions { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + while self.ready.end == 0 { + match (self.iter.next(), &self.kind) { + (Some(ch), &DecompositionType::Canonical) => { + super::char::decompose_canonical(ch, |d| self.push_back(d)); + } + (Some(ch), &DecompositionType::Compatible) => { + super::char::decompose_compatible(ch, |d| self.push_back(d)); + } + (None, _) => { + if self.buffer.is_empty() { + return None; + } else { + self.sort_pending(); + self.ready.end = self.buffer.len(); + + // This implementation means that we can call `next` + // on an exhausted iterator; the last outer `next` call + // will result in an inner `next` call. To make this + // safe, we use `fuse`. + break; + } + } + } + } + + // We can assume here that, if `self.ready.end` is greater than zero, + // it's also greater than `self.ready.start`. That's because we only + // increment `self.ready.start` inside `increment_next_ready`, and + // whenever it reaches equality with `self.ready.end`, we reset both + // to zero, maintaining the invariant that: + // self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0 + // + // This less-than-obviously-safe implementation is chosen for performance, + // minimizing the number & complexity of branches in `next` in the common + // case of buffering then unbuffering a single character with each call. + let (_, ch) = self.buffer[self.ready.start]; + self.increment_next_ready(); + Some(ch) + } + + fn size_hint(&self) -> (usize, Option) { + let (lower, _) = self.iter.size_hint(); + (lower, None) + } +} + +impl + Clone> fmt::Display for Decompositions { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.clone() { + f.write_char(c)?; + } + Ok(()) + } +} -- cgit v1.2.3