// Copyright 2013 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation. //! //! Since Punycode fundamentally works on unicode code points, //! `encode` and `decode` take and return slices and vectors of `char`. //! `encode_str` and `decode_to_string` provide convenience wrappers //! that convert from and to Rust’s UTF-8 based `str` and `String` types. use alloc::{string::String, vec::Vec}; use core::char; use core::u32; // Bootstring parameters for Punycode static BASE: u32 = 36; static T_MIN: u32 = 1; static T_MAX: u32 = 26; static SKEW: u32 = 38; static DAMP: u32 = 700; static INITIAL_BIAS: u32 = 72; static INITIAL_N: u32 = 0x80; static DELIMITER: char = '-'; #[inline] fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 { delta /= if first_time { DAMP } else { 2 }; delta += delta / num_points; let mut k = 0; while delta > ((BASE - T_MIN) * T_MAX) / 2 { delta /= BASE - T_MIN; k += BASE; } k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW)) } /// Convert Punycode to an Unicode `String`. /// /// This is a convenience wrapper around `decode`. #[inline] pub fn decode_to_string(input: &str) -> Option { decode(input).map(|chars| chars.into_iter().collect()) } /// Convert Punycode to Unicode. /// /// Return None on malformed input or overflow. /// Overflow can only happen on inputs that take more than /// 63 encoded bytes, the DNS limit on domain name labels. 
pub fn decode(input: &str) -> Option> { Some(Decoder::default().decode(input).ok()?.collect()) } #[derive(Default)] pub(crate) struct Decoder { insertions: Vec<(usize, char)>, } impl Decoder { /// Split the input iterator and return a Vec with insertions of encoded characters pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result, ()> { self.insertions.clear(); // Handle "basic" (ASCII) code points. // They are encoded as-is before the last delimiter, if any. let (base, input) = match input.rfind(DELIMITER) { None => ("", input), Some(position) => ( &input[..position], if position > 0 { &input[position + 1..] } else { input }, ), }; if !base.is_ascii() { return Err(()); } let base_len = base.len(); let mut length = base_len as u32; let mut code_point = INITIAL_N; let mut bias = INITIAL_BIAS; let mut i = 0; let mut iter = input.bytes(); loop { let previous_i = i; let mut weight = 1; let mut k = BASE; let mut byte = match iter.next() { None => break, Some(byte) => byte, }; // Decode a generalized variable-length integer into delta, // which gets added to i. loop { let digit = match byte { byte @ b'0'..=b'9' => byte - b'0' + 26, byte @ b'A'..=b'Z' => byte - b'A', byte @ b'a'..=b'z' => byte - b'a', _ => return Err(()), } as u32; if digit > (u32::MAX - i) / weight { return Err(()); // Overflow } i += digit * weight; let t = if k <= bias { T_MIN } else if k >= bias + T_MAX { T_MAX } else { k - bias }; if digit < t { break; } if weight > u32::MAX / (BASE - t) { return Err(()); // Overflow } weight *= BASE - t; k += BASE; byte = match iter.next() { None => return Err(()), // End of input before the end of this delta Some(byte) => byte, }; } bias = adapt(i - previous_i, length + 1, previous_i == 0); if i / (length + 1) > u32::MAX - code_point { return Err(()); // Overflow } // i was supposed to wrap around from length+1 to 0, // incrementing code_point each time. 
code_point += i / (length + 1); i %= length + 1; let c = match char::from_u32(code_point) { Some(c) => c, None => return Err(()), }; // Move earlier insertions farther out in the string for (idx, _) in &mut self.insertions { if *idx >= i as usize { *idx += 1; } } self.insertions.push((i as usize, c)); length += 1; i += 1; } self.insertions.sort_by_key(|(i, _)| *i); Ok(Decode { base: base.chars(), insertions: &self.insertions, inserted: 0, position: 0, len: base_len + self.insertions.len(), }) } } pub(crate) struct Decode<'a> { base: core::str::Chars<'a>, pub(crate) insertions: &'a [(usize, char)], inserted: usize, position: usize, len: usize, } impl<'a> Iterator for Decode<'a> { type Item = char; fn next(&mut self) -> Option { loop { match self.insertions.get(self.inserted) { Some((pos, c)) if *pos == self.position => { self.inserted += 1; self.position += 1; return Some(*c); } _ => {} } if let Some(c) = self.base.next() { self.position += 1; return Some(c); } else if self.inserted >= self.insertions.len() { return None; } } } fn size_hint(&self) -> (usize, Option) { let len = self.len - self.position; (len, Some(len)) } } impl<'a> ExactSizeIterator for Decode<'a> { fn len(&self) -> usize { self.len - self.position } } /// Convert an Unicode `str` to Punycode. /// /// This is a convenience wrapper around `encode`. #[inline] pub fn encode_str(input: &str) -> Option { if input.len() > u32::MAX as usize { return None; } let mut buf = String::with_capacity(input.len()); encode_into(input.chars(), &mut buf).ok().map(|()| buf) } /// Convert Unicode to Punycode. /// /// Return None on overflow, which can only happen on inputs that would take more than /// 63 encoded bytes, the DNS limit on domain name labels. 
pub fn encode(input: &[char]) -> Option { if input.len() > u32::MAX as usize { return None; } let mut buf = String::with_capacity(input.len()); encode_into(input.iter().copied(), &mut buf) .ok() .map(|()| buf) } pub(crate) fn encode_into(input: I, output: &mut String) -> Result<(), ()> where I: Iterator + Clone, { // Handle "basic" (ASCII) code points. They are encoded as-is. let (mut input_length, mut basic_length) = (0u32, 0); for c in input.clone() { input_length = input_length.checked_add(1).ok_or(())?; if c.is_ascii() { output.push(c); basic_length += 1; } } if basic_length > 0 { output.push('-') } let mut code_point = INITIAL_N; let mut delta = 0; let mut bias = INITIAL_BIAS; let mut processed = basic_length; while processed < input_length { // All code points < code_point have been handled already. // Find the next larger one. let min_code_point = input .clone() .map(|c| c as u32) .filter(|&c| c >= code_point) .min() .unwrap(); if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) { return Err(()); // Overflow } // Increase delta to advance the decoder’s state to delta += (min_code_point - code_point) * (processed + 1); code_point = min_code_point; for c in input.clone() { let c = c as u32; if c < code_point { delta = delta.checked_add(1).ok_or(())?; } if c == code_point { // Represent delta as a generalized variable-length integer: let mut q = delta; let mut k = BASE; loop { let t = if k <= bias { T_MIN } else if k >= bias + T_MAX { T_MAX } else { k - bias }; if q < t { break; } let value = t + ((q - t) % (BASE - t)); output.push(value_to_digit(value)); q = (q - t) / (BASE - t); k += BASE; } output.push(value_to_digit(q)); bias = adapt(delta, processed + 1, processed == basic_length); delta = 0; processed += 1; } } delta += 1; code_point += 1; } Ok(()) } #[inline] fn value_to_digit(value: u32) -> char { match value { 0..=25 => (value as u8 + b'a') as char, // a..z 26..=35 => (value as u8 - 26 + b'0') as char, // 0..9 _ => panic!(), } } 
/// Feeding `encode_into` one more code point than fits in a `u32` must be
/// reported as an error, and since every code point is non-ASCII nothing may
/// have been written to the output by then.
#[test]
#[ignore = "slow"]
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
    let too_many = u32::MAX as usize + 1;
    let input = std::iter::repeat('ß').take(too_many);
    let mut buf = String::new();
    let outcome = encode_into(input, &mut buf);
    assert!(outcome.is_err());
    assert_eq!(buf.len(), 0);
}