diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/fluent-syntax/src/unicode.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/fluent-syntax/src/unicode.rs')
-rw-r--r-- | vendor/fluent-syntax/src/unicode.rs | 159 |
1 files changed, 159 insertions, 0 deletions
//! A set of helper functions for unescaping Fluent unicode escape sequences.
//!
//! # Unicode
//!
//! Fluent supports UTF-8 in all FTL resources, but it also allows
//! unicode sequences to be escaped in [`String
//! Literals`](super::ast::InlineExpression::StringLiteral).
//!
//! Code points written with four hexadecimal digits are encoded with `\u`
//! and code points written with six hexadecimal digits using `\U`.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\u5bd2 Bar"),
//!     "Foo 寒 Bar"
//! );
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\U01F68A Bar"),
//!     "Foo 🚊 Bar"
//! );
//! ```
//!
//! # Other unescapes
//!
//! This also allows for a char `"` to be present inside an FTL string literal,
//! and for `\` itself to be escaped.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\" Bar"),
//!     "Foo \" Bar"
//! );
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\\ Bar"),
//!     "Foo \\ Bar"
//! );
//! ```
//!
//! # Malformed escapes
//!
//! Any escape that cannot be decoded (unknown escape character, truncated or
//! non-hexadecimal digit sequence, or a value that is not a valid `char`)
//! is replaced by U+FFFD REPLACEMENT CHARACTER.
use std::borrow::Cow;
use std::char;
use std::fmt;

/// U+FFFD REPLACEMENT CHARACTER, emitted in place of any escape sequence
/// that cannot be decoded.
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a `\u` / `\U` escape into a `char`.
///
/// Returns [`UNKNOWN_CHAR`] when `s` is `None`, when it contains anything
/// other than ASCII hexadecimal digits, or when the value is not a valid
/// Unicode scalar value.
fn encode_unicode(s: Option<&str>) -> char {
    // `from_str_radix` tolerates a leading `+`, which is not part of an
    // escape sequence, so the digits are validated explicitly first.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}

/// Unescapes to a writer without allocating.
///
/// Unescaped runs of `input` are forwarded to `w` as borrowed slices and each
/// escape sequence is written as a single `char`. Malformed escapes are
/// written as U+FFFD.
///
/// # Errors
///
/// Returns an error only if writing to `w` fails.
///
/// ## Example
///
/// ```
/// use fluent_syntax::unicode::unescape_unicode;
///
/// let mut s = String::new();
/// unescape_unicode(&mut s, "Foo \\U01F60A Bar").unwrap();
/// assert_eq!(s, "Foo 😊 Bar");
/// ```
pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result
where
    W: fmt::Write,
{
    let bytes = input.as_bytes();

    // `start` is the beginning of the pending unescaped run, `ptr` the scan
    // position. Both are byte offsets into `input`.
    let mut start = 0;
    let mut ptr = 0;

    while let Some(b) = bytes.get(ptr) {
        if b != &b'\\' {
            ptr += 1;
            continue;
        }
        // A backslash is ASCII, so `ptr` is a char boundary here.
        if start != ptr {
            w.write_str(&input[start..ptr])?;
        }

        // Step over the backslash onto the escape character.
        ptr += 1;

        let new_char = match bytes.get(ptr) {
            Some(b'\\') => '\\',
            Some(b'"') => '"',
            Some(marker @ b'u') | Some(marker @ b'U') => {
                let seq_start = ptr + 1;
                let len = if marker == &b'u' { 4 } else { 6 };
                ptr += len;
                // `get` yields `None` when the window is truncated or ends
                // in the middle of a multi-byte character.
                encode_unicode(input.get(seq_start..seq_start + len))
            }
            _ => UNKNOWN_CHAR,
        };
        ptr += 1;

        // A malformed escape can leave `ptr` inside a multi-byte character
        // (e.g. a backslash followed by a non-ASCII char). Skip the rest of
        // that character so later slicing never splits a code point.
        while ptr < input.len() && !input.is_char_boundary(ptr) {
            ptr += 1;
        }

        w.write_char(new_char)?;
        start = ptr;
    }
    if start != ptr {
        w.write_str(&input[start..ptr])?;
    }
    Ok(())
}

/// Unescapes to a `Cow<str>` optionally allocating.
///
/// When `input` contains no backslash it is returned borrowed and nothing is
/// allocated. Otherwise an owned string is produced with exactly the output
/// of [`unescape_unicode`].
///
/// ## Example
///
/// ```
/// use fluent_syntax::unicode::unescape_unicode_to_string;
///
/// assert_eq!(
///     unescape_unicode_to_string("Foo \\U01F60A Bar"),
///     "Foo 😊 Bar"
/// );
/// ```
pub fn unescape_unicode_to_string(input: &str) -> Cow<str> {
    if !input.as_bytes().contains(&b'\\') {
        return Cow::Borrowed(input);
    }

    // Unescaping never grows the text: every escape is at least as long as
    // the UTF-8 encoding of the character it produces.
    let mut result = String::with_capacity(input.len());
    unescape_unicode(&mut result, input).expect("writing to a String never fails");
    Cow::Owned(result)
}