diff options
Diffstat (limited to 'third_party/rust/idna/src/punycode.rs')
-rw-r--r-- | third_party/rust/idna/src/punycode.rs | 315 |
1 files changed, 315 insertions, 0 deletions
// Copyright 2013 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
//!
//! Since Punycode fundamentally works on unicode code points,
//! `encode` and `decode` take and return slices and vectors of `char`.
//! `encode_str` and `decode_to_string` provide convenience wrappers
//! that convert from and to Rust’s UTF-8 based `str` and `String` types.

use std::char;
use std::u32;

// Bootstring parameters for Punycode
static BASE: u32 = 36;
static T_MIN: u32 = 1;
static T_MAX: u32 = 26;
static SKEW: u32 = 38;
static DAMP: u32 = 700;
static INITIAL_BIAS: u32 = 72;
static INITIAL_N: u32 = 0x80;
static DELIMITER: char = '-';

/// Bias adaptation function of the Bootstring algorithm.
///
/// `delta` is the delta that was just encoded or decoded, `num_points` is the
/// number of code points handled so far including the current one (both call
/// sites in this file pass a value of at least 1, so the division below cannot
/// divide by zero), and `first_time` tells whether this is the first delta.
/// Returns the bias to use for the next delta.
#[inline]
fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
    delta /= if first_time { DAMP } else { 2 };
    delta += delta / num_points;
    let mut k = 0;
    while delta > ((BASE - T_MIN) * T_MAX) / 2 {
        delta /= BASE - T_MIN;
        k += BASE;
    }
    k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
}

/// Convert Punycode to an Unicode `String`.
///
/// This is a convenience wrapper around `decode`.
/// Returns `None` whenever `decode` does.
#[inline]
pub fn decode_to_string(input: &str) -> Option<String> {
    decode(input).map(|chars| chars.into_iter().collect())
}

/// Convert Punycode to Unicode.
///
/// Return None on malformed input or overflow.
/// Overflow can only happen on inputs that take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
pub fn decode(input: &str) -> Option<Vec<char>> {
    Some(Decoder::default().decode(input).ok()?.collect())
}

/// Reusable Punycode decoder.
///
/// The `insertions` buffer is cleared at the start of every `decode` call,
/// so one `Decoder` can be used for several inputs and keep its allocation.
#[derive(Default)]
pub(crate) struct Decoder {
    /// `(position in the decoded output, code point)` for every non-basic
    /// code point of the most recently decoded input.
    insertions: Vec<(usize, char)>,
}

impl Decoder {
    /// Decode `input` and return an iterator over the decoded code points.
    ///
    /// The basic (ASCII) code points are borrowed from `input`; the decoded
    /// non-basic code points are stored in `self.insertions` together with
    /// their final positions, and the returned iterator merges the two.
    ///
    /// # Errors
    ///
    /// Returns `Err(())` if the part before the last delimiter is not ASCII,
    /// if a byte of the encoded part is not an ASCII letter or digit, if the
    /// input ends in the middle of a delta, if an intermediate value does not
    /// fit in a `u32`, or if a decoded value is not a valid `char`.
    pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
        self.insertions.clear();
        // Handle "basic" (ASCII) code points.
        // They are encoded as-is before the last delimiter, if any.
        let (base, input) = match input.rfind(DELIMITER) {
            None => ("", input),
            Some(position) => (
                &input[..position],
                if position > 0 {
                    &input[position + 1..]
                } else {
                    // A delimiter at index 0 is not skipped: it reaches the
                    // digit decoder below, which rejects it.
                    input
                },
            ),
        };

        if !base.is_ascii() {
            return Err(());
        }

        let base_len = base.len();
        let mut length = base_len as u32;
        let mut code_point = INITIAL_N;
        let mut bias = INITIAL_BIAS;
        let mut i: u32 = 0;
        let mut iter = input.bytes();
        loop {
            let previous_i = i;
            let mut weight: u32 = 1;
            let mut k = BASE;
            let mut byte = match iter.next() {
                None => break,
                Some(byte) => byte,
            };

            // Decode a generalized variable-length integer into delta,
            // which gets added to i.
            loop {
                let digit = match byte {
                    byte @ b'0'..=b'9' => byte - b'0' + 26,
                    byte @ b'A'..=b'Z' => byte - b'A',
                    byte @ b'a'..=b'z' => byte - b'a',
                    _ => return Err(()),
                } as u32;
                if digit > (u32::MAX - i) / weight {
                    return Err(()); // Overflow
                }
                i += digit * weight;
                let t = if k <= bias {
                    T_MIN
                } else if k >= bias + T_MAX {
                    T_MAX
                } else {
                    k - bias
                };
                if digit < t {
                    break;
                }
                if weight > u32::MAX / (BASE - t) {
                    return Err(()); // Overflow
                }
                weight *= BASE - t;
                k += BASE;
                byte = match iter.next() {
                    None => return Err(()), // End of input before the end of this delta
                    Some(byte) => byte,
                };
            }

            bias = adapt(i - previous_i, length + 1, previous_i == 0);
            if i / (length + 1) > u32::MAX - code_point {
                return Err(()); // Overflow
            }

            // i was supposed to wrap around from length+1 to 0,
            // incrementing code_point each time.
            code_point += i / (length + 1);
            i %= length + 1;
            let c = char::from_u32(code_point).ok_or(())?;

            // Move earlier insertions farther out in the string
            for (idx, _) in &mut self.insertions {
                if *idx >= i as usize {
                    *idx += 1;
                }
            }
            self.insertions.push((i as usize, c));
            length += 1;
            i += 1;
        }

        // `Decode::next` consumes the insertions in increasing position order.
        self.insertions.sort_by_key(|(i, _)| *i);
        Ok(Decode {
            base: base.chars(),
            insertions: &self.insertions,
            inserted: 0,
            position: 0,
            len: base_len + self.insertions.len(),
        })
    }
}

/// Iterator over the code points of a decoded Punycode string,
/// as returned by `Decoder::decode`.
pub(crate) struct Decode<'a> {
    /// Remaining basic code points, in their original order.
    base: std::str::Chars<'a>,
    /// Non-basic code points with their output positions, sorted by position.
    pub(crate) insertions: &'a [(usize, char)],
    /// Number of entries of `insertions` already yielded.
    inserted: usize,
    /// Number of code points yielded so far.
    position: usize,
    /// Total number of code points in the decoded output.
    len: usize,
}

impl<'a> Iterator for Decode<'a> {
    type Item = char;

    /// Yield the next decoded code point: the pending insertion if its
    /// position is the current output position, otherwise the next basic
    /// code point.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.insertions.get(self.inserted) {
                Some((pos, c)) if *pos == self.position => {
                    self.inserted += 1;
                    self.position += 1;
                    return Some(*c);
                }
                _ => {}
            }
            if let Some(c) = self.base.next() {
                self.position += 1;
                return Some(c);
            } else if self.inserted >= self.insertions.len() {
                return None;
            }
        }
    }

    /// Exact number of code points left to yield.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.len - self.position;
        (len, Some(len))
    }
}

impl<'a> ExactSizeIterator for Decode<'a> {
    /// Number of code points left to yield.
    fn len(&self) -> usize {
        self.len - self.position
    }
}

/// Convert an Unicode `str` to Punycode.
///
/// This is a convenience wrapper around `encode`.
/// Returns `None` on overflow.
#[inline]
pub fn encode_str(input: &str) -> Option<String> {
    let mut buf = String::with_capacity(input.len());
    encode_into(input.chars(), &mut buf).ok().map(|()| buf)
}

/// Convert Unicode to Punycode.
///
/// Return None on overflow, which can only happen on inputs that would take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
pub fn encode(input: &[char]) -> Option<String> {
    let mut buf = String::with_capacity(input.len());
    encode_into(input.iter().copied(), &mut buf)
        .ok()
        .map(|()| buf)
}

/// Append the Punycode encoding of `input` to `output`.
///
/// `input` is iterated several times, hence the `Clone` bound.
///
/// # Errors
///
/// Returns `Err(())` if a delta does not fit in a `u32`. `output` may already
/// contain a partial encoding when that happens.
pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
where
    I: Iterator<Item = char> + Clone,
{
    // Handle "basic" (ASCII) code points. They are encoded as-is.
    let (mut input_length, mut basic_length) = (0, 0);
    for c in input.clone() {
        input_length += 1;
        if c.is_ascii() {
            output.push(c);
            basic_length += 1;
        }
    }

    if basic_length > 0 {
        output.push('-')
    }
    let mut code_point = INITIAL_N;
    let mut delta: u32 = 0;
    let mut bias = INITIAL_BIAS;
    let mut processed = basic_length;
    while processed < input_length {
        // All code points < code_point have been handled already.
        // Find the next larger one. Since some code points are still
        // unprocessed, the filtered iterator is expected to be non-empty.
        let min_code_point = input
            .clone()
            .map(|c| c as u32)
            .filter(|&c| c >= code_point)
            .min()
            .unwrap();
        if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
            return Err(()); // Overflow
        }
        // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
        delta += (min_code_point - code_point) * (processed + 1);
        code_point = min_code_point;
        for c in input.clone() {
            let c = c as u32;
            if c < code_point {
                // Checked addition: a plain `+= 1` followed by a comparison
                // with zero would panic on overflow in builds that enable
                // overflow checks instead of reporting the error.
                delta = delta.checked_add(1).ok_or(())?;
            }
            if c == code_point {
                // Represent delta as a generalized variable-length integer:
                let mut q = delta;
                let mut k = BASE;
                loop {
                    let t = if k <= bias {
                        T_MIN
                    } else if k >= bias + T_MAX {
                        T_MAX
                    } else {
                        k - bias
                    };
                    if q < t {
                        break;
                    }
                    let value = t + ((q - t) % (BASE - t));
                    output.push(value_to_digit(value));
                    q = (q - t) / (BASE - t);
                    k += BASE;
                }
                output.push(value_to_digit(q));
                bias = adapt(delta, processed + 1, processed == basic_length);
                delta = 0;
                processed += 1;
            }
        }
        delta += 1;
        code_point += 1;
    }
    Ok(())
}

/// Map a digit value in `0..=35` to its ASCII character
/// (`a`..`z` for 0..=25, `0`..`9` for 26..=35).
///
/// # Panics
///
/// Panics if `value` is greater than 35.
#[inline]
fn value_to_digit(value: u32) -> char {
    match value {
        0..=25 => (value as u8 + b'a') as char,       // a..z
        26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
        _ => panic!(),
    }
}