/*! Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. This is sub-module is useful for constructing byte based automatons that need to embed UTF-8 decoding. The most common use of this module is in conjunction with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type. See the documentation on the `Utf8Sequences` iterator for more details and an example. # Wait, what is this? This is simplest to explain with an example. Let's say you wanted to test whether a particular byte sequence was a Cyrillic character. One possible scalar value range is `[0400-04FF]`. The set of allowed bytes for this range can be expressed as a sequence of byte ranges: ```text [D0-D3][80-BF] ``` This is simple enough: simply encode the boundaries, `0400` encodes to `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. However, what if you wanted to add the Cyrillic Supplementary characters to your range? Your range might then become `[0400-052F]`. The same procedure as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, this isn't quite correct because this range doesn't capture many characters, for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). Instead, you need multiple sequences of byte ranges: ```text [D0-D3][80-BF] # matches codepoints 0400-04FF [D4][80-AF] # matches codepoints 0500-052F ``` This gets even more complicated if you want bigger ranges, particularly if they naively contain surrogate codepoints. For example, the sequence of byte ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: ```text [0-7F] [C2-DF][80-BF] [E0][A0-BF][80-BF] [E1-EC][80-BF][80-BF] [ED][80-9F][80-BF] [EE-EF][80-BF][80-BF] ``` Note that the byte ranges above will *not* match any erroneous encoding of UTF-8, including encodings of surrogate codepoints. And, of course, for all of Unicode (`[000000-10FFFF]`): ```text [0-7F] [C2-DF][80-BF] [E0][A0-BF][80-BF] [E1-EC][80-BF][80-BF] [ED][80-9F][80-BF] [EE-EF][80-BF][80-BF] [F0][90-BF][80-BF][80-BF] [F1-F3][80-BF][80-BF][80-BF] [F4][80-8F][80-BF][80-BF] ``` This module automates the process of creating these byte ranges from ranges of Unicode scalar values. # Lineage I got the idea and general implementation strategy from Russ Cox in his [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). I also got the idea from [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), which uses it for executing automata on their term index. */ #![deny(missing_docs)] use std::char; use std::fmt; use std::iter::FusedIterator; use std::slice; const MAX_UTF8_BYTES: usize = 4; /// Utf8Sequence represents a sequence of byte ranges. /// /// To match a Utf8Sequence, a candidate byte sequence must match each /// successive range. /// /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte /// sequence `\xDD\x61` would not match because `0x61 < 0x80`. #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] pub enum Utf8Sequence { /// One byte range. One(Utf8Range), /// Two successive byte ranges. Two([Utf8Range; 2]), /// Three successive byte ranges. Three([Utf8Range; 3]), /// Four successive byte ranges. Four([Utf8Range; 4]), } impl Utf8Sequence { /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value /// range. /// /// This assumes that `start` and `end` have the same length. fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { assert_eq!(start.len(), end.len()); match start.len() { 2 => Utf8Sequence::Two([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), ]), 3 => Utf8Sequence::Three([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), Utf8Range::new(start[2], end[2]), ]), 4 => Utf8Sequence::Four([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), Utf8Range::new(start[2], end[2]), Utf8Range::new(start[3], end[3]), ]), n => unreachable!("invalid encoded length: {}", n), } } /// Returns the underlying sequence of byte ranges as a slice. pub fn as_slice(&self) -> &[Utf8Range] { use self::Utf8Sequence::*; match *self { One(ref r) => slice::from_ref(r), Two(ref r) => &r[..], Three(ref r) => &r[..], Four(ref r) => &r[..], } } /// Returns the number of byte ranges in this sequence. /// /// The length is guaranteed to be in the closed interval `[1, 4]`. pub fn len(&self) -> usize { self.as_slice().len() } /// Reverses the ranges in this sequence. /// /// For example, if this corresponds to the following sequence: /// /// ```text /// [D0-D3][80-BF] /// ``` /// /// Then after reversal, it will be /// /// ```text /// [80-BF][D0-D3] /// ``` /// /// This is useful when one is constructing a UTF-8 automaton to match /// character classes in reverse. pub fn reverse(&mut self) { match *self { Utf8Sequence::One(_) => {} Utf8Sequence::Two(ref mut x) => x.reverse(), Utf8Sequence::Three(ref mut x) => x.reverse(), Utf8Sequence::Four(ref mut x) => x.reverse(), } } /// Returns true if and only if a prefix of `bytes` matches this sequence /// of byte ranges. pub fn matches(&self, bytes: &[u8]) -> bool { if bytes.len() < self.len() { return false; } for (&b, r) in bytes.iter().zip(self) { if !r.matches(b) { return false; } } true } } impl<'a> IntoIterator for &'a Utf8Sequence { type IntoIter = slice::Iter<'a, Utf8Range>; type Item = &'a Utf8Range; fn into_iter(self) -> Self::IntoIter { self.as_slice().into_iter() } } impl fmt::Debug for Utf8Sequence { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Utf8Sequence::*; match *self { One(ref r) => write!(f, "{:?}", r), Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), Four(ref r) => { write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3]) } } } } /// A single inclusive range of UTF-8 bytes. #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] pub struct Utf8Range { /// Start of byte range (inclusive). pub start: u8, /// End of byte range (inclusive). pub end: u8, } impl Utf8Range { fn new(start: u8, end: u8) -> Self { Utf8Range { start, end } } /// Returns true if and only if the given byte is in this range. pub fn matches(&self, b: u8) -> bool { self.start <= b && b <= self.end } } impl fmt::Debug for Utf8Range { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.start == self.end { write!(f, "[{:X}]", self.start) } else { write!(f, "[{:X}-{:X}]", self.start, self.end) } } } /// An iterator over ranges of matching UTF-8 byte sequences. /// /// The iteration represents an alternation of comprehensive byte sequences /// that match precisely the set of UTF-8 encoded scalar values. /// /// A byte sequence corresponds to one of the scalar values in the range given /// if and only if it completely matches exactly one of the sequences of byte /// ranges produced by this iterator. /// /// Each sequence of byte ranges matches a unique set of bytes. That is, no two /// sequences will match the same bytes. /// /// # Example /// /// This shows how to match an arbitrary byte sequence against a range of /// scalar values. /// /// ```rust /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; /// /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { /// for range in seqs { /// if range.matches(bytes) { /// return true; /// } /// } /// false /// } /// /// // Test the basic multilingual plane. /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); /// /// // UTF-8 encoding of 'a'. /// assert!(matches(&seqs, &[0x61])); /// // UTF-8 encoding of '☃' (`\u{2603}`). /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); /// // UTF-8 encoding of `\u{10348}` (outside the BMP). /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); /// // Tries to match against a UTF-8 encoding of a surrogate codepoint, /// // which is invalid UTF-8, and therefore fails, despite the fact that /// // the corresponding codepoint (0xD800) falls in the range given. /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); /// // And fails against plain old invalid UTF-8. /// assert!(!matches(&seqs, &[0xFF, 0xFF])); /// ``` /// /// If this example seems circuitous, that's because it is! It's meant to be /// illustrative. In practice, you could just try to decode your byte sequence /// and compare it with the scalar value range directly. However, this is not /// always possible (for example, in a byte based automaton). #[derive(Debug)] pub struct Utf8Sequences { range_stack: Vec, } impl Utf8Sequences { /// Create a new iterator over UTF-8 byte ranges for the scalar value range /// given. pub fn new(start: char, end: char) -> Self { let mut it = Utf8Sequences { range_stack: vec![] }; it.push(start as u32, end as u32); it } /// reset resets the scalar value range. /// Any existing state is cleared, but resources may be reused. /// /// N.B. Benchmarks say that this method is dubious. #[doc(hidden)] pub fn reset(&mut self, start: char, end: char) { self.range_stack.clear(); self.push(start as u32, end as u32); } fn push(&mut self, start: u32, end: u32) { self.range_stack.push(ScalarRange { start, end }); } } struct ScalarRange { start: u32, end: u32, } impl fmt::Debug for ScalarRange { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) } } impl Iterator for Utf8Sequences { type Item = Utf8Sequence; fn next(&mut self) -> Option { 'TOP: while let Some(mut r) = self.range_stack.pop() { 'INNER: loop { if let Some((r1, r2)) = r.split() { self.push(r2.start, r2.end); r.start = r1.start; r.end = r1.end; continue 'INNER; } if !r.is_valid() { continue 'TOP; } for i in 1..MAX_UTF8_BYTES { let max = max_scalar_value(i); if r.start <= max && max < r.end { self.push(max + 1, r.end); r.end = max; continue 'INNER; } } if let Some(ascii_range) = r.as_ascii() { return Some(Utf8Sequence::One(ascii_range)); } for i in 1..MAX_UTF8_BYTES { let m = (1 << (6 * i)) - 1; if (r.start & !m) != (r.end & !m) { if (r.start & m) != 0 { self.push((r.start | m) + 1, r.end); r.end = r.start | m; continue 'INNER; } if (r.end & m) != m { self.push(r.end & !m, r.end); r.end = (r.end & !m) - 1; continue 'INNER; } } } let mut start = [0; MAX_UTF8_BYTES]; let mut end = [0; MAX_UTF8_BYTES]; let n = r.encode(&mut start, &mut end); return Some(Utf8Sequence::from_encoded_range( &start[0..n], &end[0..n], )); } } None } } impl FusedIterator for Utf8Sequences {} impl ScalarRange { /// split splits this range if it overlaps with a surrogate codepoint. /// /// Either or both ranges may be invalid. fn split(&self) -> Option<(ScalarRange, ScalarRange)> { if self.start < 0xE000 && self.end > 0xD7FF { Some(( ScalarRange { start: self.start, end: 0xD7FF }, ScalarRange { start: 0xE000, end: self.end }, )) } else { None } } /// is_valid returns true if and only if start <= end. fn is_valid(&self) -> bool { self.start <= self.end } /// as_ascii returns this range as a Utf8Range if and only if all scalar /// values in this range can be encoded as a single byte. fn as_ascii(&self) -> Option { if self.is_ascii() { Some(Utf8Range::new(self.start as u8, self.end as u8)) } else { None } } /// is_ascii returns true if the range is ASCII only (i.e., takes a single /// byte to encode any scalar value). fn is_ascii(&self) -> bool { self.is_valid() && self.end <= 0x7f } /// encode writes the UTF-8 encoding of the start and end of this range /// to the corresponding destination slices, and returns the number of /// bytes written. /// /// The slices should have room for at least `MAX_UTF8_BYTES`. fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { let cs = char::from_u32(self.start).unwrap(); let ce = char::from_u32(self.end).unwrap(); let ss = cs.encode_utf8(start); let se = ce.encode_utf8(end); assert_eq!(ss.len(), se.len()); ss.len() } } fn max_scalar_value(nbytes: usize) -> u32 { match nbytes { 1 => 0x007F, 2 => 0x07FF, 3 => 0xFFFF, 4 => 0x10FFFF, _ => unreachable!("invalid UTF-8 byte sequence size"), } } #[cfg(test)] mod tests { use std::char; use crate::utf8::{Utf8Range, Utf8Sequences}; fn rutf8(s: u8, e: u8) -> Utf8Range { Utf8Range::new(s, e) } fn never_accepts_surrogate_codepoints(start: char, end: char) { for cp in 0xD800..0xE000 { let buf = encode_surrogate(cp); for r in Utf8Sequences::new(start, end) { if r.matches(&buf) { panic!( "Sequence ({:X}, {:X}) contains range {:?}, \ which matches surrogate code point {:X} \ with encoded bytes {:?}", start as u32, end as u32, r, cp, buf, ); } } } } #[test] fn codepoints_no_surrogates() { never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}'); never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}'); never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}'); never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}'); never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}'); } #[test] fn single_codepoint_one_sequence() { // Tests that every range of scalar values that contains a single // scalar value is recognized by one sequence of byte ranges. for i in 0x0..(0x10FFFF + 1) { let c = match char::from_u32(i) { None => continue, Some(c) => c, }; let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); assert_eq!(seqs.len(), 1); } } #[test] fn bmp() { use crate::utf8::Utf8Sequence::*; let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::>(); assert_eq!( seqs, vec![ One(rutf8(0x0, 0x7F)), Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), Three([ rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF) ]), Three([ rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF) ]), Three([ rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF) ]), Three([ rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF) ]), ] ); } #[test] fn reverse() { use crate::utf8::Utf8Sequence::*; let mut s = One(rutf8(0xA, 0xB)); s.reverse(); assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]); let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); s.reverse(); assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]); let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); s.reverse(); assert_eq!( s.as_slice(), &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)] ); let mut s = Four([ rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD), rutf8(0xD, 0xE), ]); s.reverse(); assert_eq!( s.as_slice(), &[ rutf8(0xD, 0xE), rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB) ] ); } fn encode_surrogate(cp: u32) -> [u8; 3] { const TAG_CONT: u8 = 0b1000_0000; const TAG_THREE_B: u8 = 0b1110_0000; assert!(0xD800 <= cp && cp < 0xE000); let mut dst = [0; 3]; dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B; dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT; dst[2] = (cp & 0x3F) as u8 | TAG_CONT; dst } }