diff options
Diffstat (limited to 'third_party/rust/fluent-syntax/src/unicode.rs')
-rw-r--r-- | third_party/rust/fluent-syntax/src/unicode.rs | 159 |
1 files changed, 159 insertions, 0 deletions
// diff --git a/third_party/rust/fluent-syntax/src/unicode.rs b/third_party/rust/fluent-syntax/src/unicode.rs new file mode 100644 index 0000000000..ab95a86884 --- /dev/null +++ b/third_party/rust/fluent-syntax/src/unicode.rs @@ -0,0 +1,159 @@
//! A set of helper functions for unescaping Fluent unicode escape sequences.
//!
//! # Unicode
//!
//! Fluent supports UTF-8 in all FTL resources, but it also allows
//! unicode sequences to be escaped in [`String
//! Literals`](super::ast::InlineExpression::StringLiteral).
//!
//! Code points written with four hexadecimal digits are introduced by `\u`
//! and code points written with six hexadecimal digits by `\U`.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\u5bd2 Bar"),
//!     "Foo 寒 Bar"
//! );
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\U01F68A Bar"),
//!     "Foo 🚊 Bar"
//! );
//! ```
//!
//! # Other unescapes
//!
//! This also allows for a char `"` to be present inside an FTL string literal,
//! and for `\` itself to be escaped.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\" Bar"),
//!     "Foo \" Bar"
//! );
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\\ Bar"),
//!     "Foo \\ Bar"
//! );
//! ```
//!
//! # Malformed escapes
//!
//! Unescaping never fails because of the input: an unknown escape, a
//! truncated sequence, non-hexadecimal digits, or a value that is not a
//! Unicode scalar value is replaced by U+FFFD (the replacement character).
use std::borrow::Cow;
use std::char;
use std::fmt;

/// Character emitted in place of any escape that cannot be decoded (U+FFFD).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Converts the hexadecimal digits of an escape into the `char` they denote.
///
/// Returns [`UNKNOWN_CHAR`] when `s` is `None`, contains anything other than
/// ASCII hexadecimal digits, or denotes a value that is not a Unicode scalar
/// value (for example a surrogate).
fn encode_unicode(s: Option<&str>) -> char {
    // `u32::from_str_radix` tolerates a leading `+`, which is not a valid
    // part of an escape, so the digits are validated up front.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}

/// Decodes the escape at the very start of `escape`.
///
/// `escape` must begin with the `\` that introduces the escape. Returns the
/// decoded character together with the number of bytes of `escape` that the
/// escape occupies; that count never exceeds `escape.len()` and always falls
/// on a char boundary, so the caller can slice with it safely.
fn decode_escape(escape: &str) -> (char, usize) {
    let (unescaped, nominal_len) = match escape.as_bytes().get(1) {
        Some(b'\\') => ('\\', 2),
        Some(b'"') => ('"', 2),
        // `\uHHHH`: backslash + `u` + four hex digits.
        Some(b'u') => (encode_unicode(escape.get(2..6)), 6),
        // `\UHHHHHH`: backslash + `U` + six hex digits.
        Some(b'U') => (encode_unicode(escape.get(2..8)), 8),
        // Unknown introducer, or a lone trailing backslash.
        _ => (UNKNOWN_CHAR, 2),
    };

    // A malformed escape may be cut short by the end of the input or may
    // overlap a multi-byte character. Clamp to the input and then swallow the
    // rest of a partially covered character, so that slicing cannot panic.
    let mut consumed = nominal_len.min(escape.len());
    while !escape.is_char_boundary(consumed) {
        consumed += 1;
    }
    (unescaped, consumed)
}

/// Unescapes to a writer without allocating.
///
/// Runs of text between escapes are forwarded to `w` unchanged; each escape
/// is replaced by the single character it denotes, or by U+FFFD when it is
/// malformed.
///
/// # Errors
///
/// Returns an error only if writing to `w` fails.
///
/// ## Example
///
/// ```
/// use fluent_syntax::unicode::unescape_unicode;
///
/// let mut s = String::new();
/// unescape_unicode(&mut s, "Foo \\U01F60A Bar").unwrap();
/// assert_eq!(s, "Foo 😊 Bar");
/// ```
pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result
where
    W: fmt::Write,
{
    let mut rest = input;

    while let Some(backslash) = rest.find('\\') {
        let (literal, escape) = rest.split_at(backslash);
        if !literal.is_empty() {
            w.write_str(literal)?;
        }

        let (unescaped, consumed) = decode_escape(escape);
        w.write_char(unescaped)?;
        rest = &escape[consumed..];
    }

    if !rest.is_empty() {
        w.write_str(rest)?;
    }
    Ok(())
}

/// Unescapes to a `Cow<str>` optionally allocating.
///
/// The input is returned borrowed when it contains no backslash; otherwise a
/// new `String` holding the unescaped text is returned. Malformed escapes are
/// replaced by U+FFFD, exactly as in [`unescape_unicode`].
///
/// ## Example
///
/// ```
/// use fluent_syntax::unicode::unescape_unicode_to_string;
///
/// assert_eq!(
///     unescape_unicode_to_string("Foo \\U01F60A Bar"),
///     "Foo 😊 Bar"
/// );
/// ```
pub fn unescape_unicode_to_string(input: &str) -> Cow<'_, str> {
    if !input.contains('\\') {
        return Cow::Borrowed(input);
    }

    // Only a capacity hint: the output is usually no longer than the input.
    let mut unescaped = String::with_capacity(input.len());
    unescape_unicode(&mut unescaped, input)
        .expect("fmt::Write for String never returns an error");
    Cow::Owned(unescaped)
}