summaryrefslogtreecommitdiffstats
path: root/third_party/rust/idna/src/punycode.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/idna/src/punycode.rs')
-rw-r--r--third_party/rust/idna/src/punycode.rs315
1 files changed, 315 insertions, 0 deletions
diff --git a/third_party/rust/idna/src/punycode.rs b/third_party/rust/idna/src/punycode.rs
new file mode 100644
index 0000000000..21955f3596
--- /dev/null
+++ b/third_party/rust/idna/src/punycode.rs
@@ -0,0 +1,315 @@
+// Copyright 2013 The rust-url developers.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
+//!
+//! Since Punycode fundamentally works on unicode code points,
+//! `encode` and `decode` take and return slices and vectors of `char`.
+//! `encode_str` and `decode_to_string` provide convenience wrappers
+//! that convert from and to Rust’s UTF-8 based `str` and `String` types.
+
+use std::char;
+use std::u32;
+
+// Bootstring parameters for Punycode
+static BASE: u32 = 36;
+static T_MIN: u32 = 1;
+static T_MAX: u32 = 26;
+static SKEW: u32 = 38;
+static DAMP: u32 = 700;
+static INITIAL_BIAS: u32 = 72;
+static INITIAL_N: u32 = 0x80;
+static DELIMITER: char = '-';
+
+#[inline]
+fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
+ delta /= if first_time { DAMP } else { 2 };
+ delta += delta / num_points;
+ let mut k = 0;
+ while delta > ((BASE - T_MIN) * T_MAX) / 2 {
+ delta /= BASE - T_MIN;
+ k += BASE;
+ }
+ k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
+}
+
+/// Convert Punycode to an Unicode `String`.
+///
+/// This is a convenience wrapper around `decode`.
+#[inline]
+pub fn decode_to_string(input: &str) -> Option<String> {
+ decode(input).map(|chars| chars.into_iter().collect())
+}
+
+/// Convert Punycode to Unicode.
+///
+/// Return None on malformed input or overflow.
+/// Overflow can only happen on inputs that take more than
+/// 63 encoded bytes, the DNS limit on domain name labels.
+pub fn decode(input: &str) -> Option<Vec<char>> {
+ Some(Decoder::default().decode(input).ok()?.collect())
+}
+
+#[derive(Default)]
+pub(crate) struct Decoder {
+ insertions: Vec<(usize, char)>,
+}
+
+impl Decoder {
+ /// Split the input iterator and return a Vec with insertions of encoded characters
+ pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
+ self.insertions.clear();
+ // Handle "basic" (ASCII) code points.
+ // They are encoded as-is before the last delimiter, if any.
+ let (base, input) = match input.rfind(DELIMITER) {
+ None => ("", input),
+ Some(position) => (
+ &input[..position],
+ if position > 0 {
+ &input[position + 1..]
+ } else {
+ input
+ },
+ ),
+ };
+
+ if !base.is_ascii() {
+ return Err(());
+ }
+
+ let base_len = base.len();
+ let mut length = base_len as u32;
+ let mut code_point = INITIAL_N;
+ let mut bias = INITIAL_BIAS;
+ let mut i = 0;
+ let mut iter = input.bytes();
+ loop {
+ let previous_i = i;
+ let mut weight = 1;
+ let mut k = BASE;
+ let mut byte = match iter.next() {
+ None => break,
+ Some(byte) => byte,
+ };
+
+ // Decode a generalized variable-length integer into delta,
+ // which gets added to i.
+ loop {
+ let digit = match byte {
+ byte @ b'0'..=b'9' => byte - b'0' + 26,
+ byte @ b'A'..=b'Z' => byte - b'A',
+ byte @ b'a'..=b'z' => byte - b'a',
+ _ => return Err(()),
+ } as u32;
+ if digit > (u32::MAX - i) / weight {
+ return Err(()); // Overflow
+ }
+ i += digit * weight;
+ let t = if k <= bias {
+ T_MIN
+ } else if k >= bias + T_MAX {
+ T_MAX
+ } else {
+ k - bias
+ };
+ if digit < t {
+ break;
+ }
+ if weight > u32::MAX / (BASE - t) {
+ return Err(()); // Overflow
+ }
+ weight *= BASE - t;
+ k += BASE;
+ byte = match iter.next() {
+ None => return Err(()), // End of input before the end of this delta
+ Some(byte) => byte,
+ };
+ }
+
+ bias = adapt(i - previous_i, length + 1, previous_i == 0);
+ if i / (length + 1) > u32::MAX - code_point {
+ return Err(()); // Overflow
+ }
+
+ // i was supposed to wrap around from length+1 to 0,
+ // incrementing code_point each time.
+ code_point += i / (length + 1);
+ i %= length + 1;
+ let c = match char::from_u32(code_point) {
+ Some(c) => c,
+ None => return Err(()),
+ };
+
+ // Move earlier insertions farther out in the string
+ for (idx, _) in &mut self.insertions {
+ if *idx >= i as usize {
+ *idx += 1;
+ }
+ }
+ self.insertions.push((i as usize, c));
+ length += 1;
+ i += 1;
+ }
+
+ self.insertions.sort_by_key(|(i, _)| *i);
+ Ok(Decode {
+ base: base.chars(),
+ insertions: &self.insertions,
+ inserted: 0,
+ position: 0,
+ len: base_len + self.insertions.len(),
+ })
+ }
+}
+
+pub(crate) struct Decode<'a> {
+ base: std::str::Chars<'a>,
+ pub(crate) insertions: &'a [(usize, char)],
+ inserted: usize,
+ position: usize,
+ len: usize,
+}
+
+impl<'a> Iterator for Decode<'a> {
+ type Item = char;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ match self.insertions.get(self.inserted) {
+ Some((pos, c)) if *pos == self.position => {
+ self.inserted += 1;
+ self.position += 1;
+ return Some(*c);
+ }
+ _ => {}
+ }
+ if let Some(c) = self.base.next() {
+ self.position += 1;
+ return Some(c);
+ } else if self.inserted >= self.insertions.len() {
+ return None;
+ }
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ let len = self.len - self.position;
+ (len, Some(len))
+ }
+}
+
+impl<'a> ExactSizeIterator for Decode<'a> {
+ fn len(&self) -> usize {
+ self.len - self.position
+ }
+}
+
+/// Convert an Unicode `str` to Punycode.
+///
+/// This is a convenience wrapper around `encode`.
+#[inline]
+pub fn encode_str(input: &str) -> Option<String> {
+ let mut buf = String::with_capacity(input.len());
+ encode_into(input.chars(), &mut buf).ok().map(|()| buf)
+}
+
+/// Convert Unicode to Punycode.
+///
+/// Return None on overflow, which can only happen on inputs that would take more than
+/// 63 encoded bytes, the DNS limit on domain name labels.
+pub fn encode(input: &[char]) -> Option<String> {
+ let mut buf = String::with_capacity(input.len());
+ encode_into(input.iter().copied(), &mut buf)
+ .ok()
+ .map(|()| buf)
+}
+
+pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
+where
+ I: Iterator<Item = char> + Clone,
+{
+ // Handle "basic" (ASCII) code points. They are encoded as-is.
+ let (mut input_length, mut basic_length) = (0, 0);
+ for c in input.clone() {
+ input_length += 1;
+ if c.is_ascii() {
+ output.push(c);
+ basic_length += 1;
+ }
+ }
+
+ if basic_length > 0 {
+ output.push('-')
+ }
+ let mut code_point = INITIAL_N;
+ let mut delta = 0;
+ let mut bias = INITIAL_BIAS;
+ let mut processed = basic_length;
+ while processed < input_length {
+ // All code points < code_point have been handled already.
+ // Find the next larger one.
+ let min_code_point = input
+ .clone()
+ .map(|c| c as u32)
+ .filter(|&c| c >= code_point)
+ .min()
+ .unwrap();
+ if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
+ return Err(()); // Overflow
+ }
+ // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
+ delta += (min_code_point - code_point) * (processed + 1);
+ code_point = min_code_point;
+ for c in input.clone() {
+ let c = c as u32;
+ if c < code_point {
+ delta += 1;
+ if delta == 0 {
+ return Err(()); // Overflow
+ }
+ }
+ if c == code_point {
+ // Represent delta as a generalized variable-length integer:
+ let mut q = delta;
+ let mut k = BASE;
+ loop {
+ let t = if k <= bias {
+ T_MIN
+ } else if k >= bias + T_MAX {
+ T_MAX
+ } else {
+ k - bias
+ };
+ if q < t {
+ break;
+ }
+ let value = t + ((q - t) % (BASE - t));
+ output.push(value_to_digit(value));
+ q = (q - t) / (BASE - t);
+ k += BASE;
+ }
+ output.push(value_to_digit(q));
+ bias = adapt(delta, processed + 1, processed == basic_length);
+ delta = 0;
+ processed += 1;
+ }
+ }
+ delta += 1;
+ code_point += 1;
+ }
+ Ok(())
+}
+
+#[inline]
+fn value_to_digit(value: u32) -> char {
+ match value {
+ 0..=25 => (value as u8 + b'a') as char, // a..z
+ 26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
+ _ => panic!(),
+ }
+}