summary refs log tree commit diff stats
path: root/third_party/rust/idna/src/uts46.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/idna/src/uts46.rs')
-rw-r--r-- third_party/rust/idna/src/uts46.rs 727
1 files changed, 727 insertions, 0 deletions
diff --git a/third_party/rust/idna/src/uts46.rs b/third_party/rust/idna/src/uts46.rs
new file mode 100644
index 0000000000..ad798055fc
--- /dev/null
+++ b/third_party/rust/idna/src/uts46.rs
@@ -0,0 +1,727 @@
+// Copyright 2013-2014 The rust-url developers.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! [*Unicode IDNA Compatibility Processing*
+//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
+
+use self::Mapping::*;
+use crate::punycode;
+use std::{error::Error as StdError, fmt};
+use unicode_bidi::{bidi_class, BidiClass};
+use unicode_normalization::char::is_combining_mark;
+use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+include!("uts46_mapping_table.rs");
+
+const PUNYCODE_PREFIX: &str = "xn--";
+
+/// Compact reference to a substring of `STRING_TABLE`.
+///
+/// `decode_slice` turns one of these back into a `&'static str`.
+#[derive(Debug)]
+struct StringTableSlice {
+ // Store these as separate fields so the structure will have an
+ // alignment of 1 and thus pack better into the Mapping enum, below.
+ // Low byte of the 16-bit start offset into `STRING_TABLE`.
+ byte_start_lo: u8,
+ // High byte of the 16-bit start offset into `STRING_TABLE`.
+ byte_start_hi: u8,
+ // Length of the referenced substring, in bytes.
+ byte_len: u8,
+}
+
+/// Resolves a `StringTableSlice` into the string it denotes inside `STRING_TABLE`.
+///
+/// The start offset is stored as two separate bytes; it is reassembled here
+/// as `(hi << 8) | lo`, and `byte_len` bytes are taken from that position.
+fn decode_slice(slice: &StringTableSlice) -> &'static str {
+    let start = (usize::from(slice.byte_start_hi) << 8) | usize::from(slice.byte_start_lo);
+    let end = start + usize::from(slice.byte_len);
+    &STRING_TABLE[start..end]
+}
+
+/// Status of a code point in the UTS #46 mapping table.
+///
+/// The per-variant notes describe how `Mapper::next` treats each status.
+#[repr(u8)]
+#[derive(Debug)]
+enum Mapping {
+ // Emitted unchanged.
+ Valid,
+ // Dropped from the output.
+ Ignored,
+ // Replaced by the referenced string.
+ Mapped(StringTableSlice),
+ // Replaced by the referenced string only under transitional processing;
+ // otherwise emitted unchanged.
+ Deviation(StringTableSlice),
+ // Emitted unchanged, and `disallowed_character` is recorded.
+ Disallowed,
+ // Emitted unchanged; an error is recorded only when STD3 ASCII rules are on.
+ DisallowedStd3Valid,
+ // Replaced by the referenced string; an error is recorded only when STD3
+ // ASCII rules are on.
+ DisallowedStd3Mapped(StringTableSlice),
+ // Emitted unchanged; an error is recorded only when IDNA 2008 rules are on.
+ DisallowedIdna2008,
+}
+
+/// Looks up the UTS #46 mapping entry for `codepoint`.
+///
+/// `TABLE` is searched by the first element of each entry (a starting code
+/// point); the entry used is the exact match or, failing that, the closest
+/// preceding one. The second element packs a 15-bit index into
+/// `MAPPING_TABLE` with a top "single" flag bit:
+///
+/// * flag set: every code point of the range shares the one mapping entry;
+/// * flag clear: consecutive code points of the range use consecutive entries,
+///   so the distance from the range start is added to the index.
+fn find_char(codepoint: char) -> &'static Mapping {
+    let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
+        Ok(idx) => idx,
+        // NOTE(review): assumes the first `TABLE` entry starts at U+0000, so a
+        // miss never reports insertion point 0 -- confirm against the generated table.
+        Err(idx) => idx - 1,
+    };
+
+    const SINGLE_MARKER: u16 = 1 << 15;
+
+    let (base, x) = TABLE[idx];
+    let single = (x & SINGLE_MARKER) != 0;
+    let offset = usize::from(!SINGLE_MARKER & x);
+
+    if single {
+        &MAPPING_TABLE[offset]
+    } else {
+        // Compute the distance at full width: truncating both code points to
+        // `u16` before subtracting could underflow for a range that straddles
+        // a 64K boundary. `base <= codepoint` holds by construction of `idx`.
+        &MAPPING_TABLE[offset + (codepoint as usize - base as usize)]
+    }
+}
+
+/// Iterator adapter that applies the UTS #46 mapping table to a stream of
+/// characters, recording any problems in `errors` as it goes.
+struct Mapper<'a> {
+ // Remaining input characters.
+ chars: std::str::Chars<'a>,
+ // Flags consulted while mapping (transitional processing, STD3, IDNA 2008).
+ config: Config,
+ // Error flags set as disallowed characters are encountered.
+ errors: &'a mut Errors,
+ // Replacement string currently being emitted, if any.
+ slice: Option<std::str::Chars<'static>>,
+}
+
+impl<'a> Iterator for Mapper<'a> {
+    type Item = char;
+
+    /// Yields the next mapped character.
+    ///
+    /// A pending replacement string is drained first. Otherwise the next input
+    /// character is read and is either passed through, dropped, or replaced by
+    /// its mapping. Error flags are set on `self.errors` as a side effect.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            // Finish emitting the current replacement string, if there is one.
+            if let Some(pending) = self.slice.as_mut() {
+                if let Some(mapped) = pending.next() {
+                    return Some(mapped);
+                }
+                self.slice = None;
+            }
+
+            let codepoint = self.chars.next()?;
+
+            // Fast path: these characters are passed through without a table lookup.
+            match codepoint {
+                '.' | '-' | 'a'..='z' | '0'..='9' => return Some(codepoint),
+                _ => {}
+            }
+
+            // Every arm either returns the character itself, skips it, or
+            // produces the replacement to emit on the next loop iteration.
+            let replacement = match find_char(codepoint) {
+                Mapping::Valid => return Some(codepoint),
+                Mapping::Ignored => continue,
+                Mapping::Mapped(slice) => slice,
+                Mapping::Deviation(slice) => {
+                    if !self.config.transitional_processing {
+                        return Some(codepoint);
+                    }
+                    slice
+                }
+                Mapping::Disallowed => {
+                    self.errors.disallowed_character = true;
+                    return Some(codepoint);
+                }
+                Mapping::DisallowedStd3Valid => {
+                    if self.config.use_std3_ascii_rules {
+                        self.errors.disallowed_by_std3_ascii_rules = true;
+                    }
+                    return Some(codepoint);
+                }
+                Mapping::DisallowedStd3Mapped(slice) => {
+                    if self.config.use_std3_ascii_rules {
+                        self.errors.disallowed_mapped_in_std3 = true;
+                    }
+                    slice
+                }
+                Mapping::DisallowedIdna2008 => {
+                    if self.config.use_idna_2008_rules {
+                        self.errors.disallowed_in_idna_2008 = true;
+                    }
+                    return Some(codepoint);
+                }
+            };
+
+            self.slice = Some(decode_slice(replacement).chars());
+        }
+    }
+}
+
+/// Checks a single label against the Bidi Rule.
+///
+/// Returns `true` when the label is acceptable. When `is_bidi_domain` is
+/// `false` the rules do not apply and every label passes; an empty label
+/// always passes.
+// http://tools.ietf.org/html/rfc5893#section-2
+fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
+    // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
+    // is RTL if it contains at least one character of bidi class R, AL or AN.
+    if !is_bidi_domain {
+        return true;
+    }
+
+    let mut chars = label.chars();
+    let first_char_class = match chars.next() {
+        Some(c) => bidi_class(c),
+        None => return true, // empty string
+    };
+
+    // Bidi class of the last character once trailing NSMs are skipped
+    // (`None` if the label consists solely of NSMs).
+    let last_non_nsm_class = || {
+        label
+            .chars()
+            .rev()
+            .map(bidi_class)
+            .find(|class| *class != BidiClass::NSM)
+    };
+
+    match first_char_class {
+        // LTR label
+        BidiClass::L => {
+            // Rule 5: only these classes may follow the first character.
+            let rule5_ok = chars.all(|c| {
+                matches!(
+                    bidi_class(c),
+                    BidiClass::L
+                        | BidiClass::EN
+                        | BidiClass::ES
+                        | BidiClass::CS
+                        | BidiClass::ET
+                        | BidiClass::ON
+                        | BidiClass::BN
+                        | BidiClass::NSM
+                )
+            });
+            if !rule5_ok {
+                return false;
+            }
+
+            // Rule 6: must end in L or EN followed by 0 or more NSM.
+            match last_non_nsm_class() {
+                Some(BidiClass::L) | Some(BidiClass::EN) | None => true,
+                Some(_) => false,
+            }
+        }
+
+        // RTL label
+        BidiClass::R | BidiClass::AL => {
+            let mut found_en = false;
+            let mut found_an = false;
+
+            // Rule 2: only these classes may follow the first character.
+            for c in chars {
+                match bidi_class(c) {
+                    BidiClass::EN => found_en = true,
+                    BidiClass::AN => found_an = true,
+                    BidiClass::R
+                    | BidiClass::AL
+                    | BidiClass::ES
+                    | BidiClass::CS
+                    | BidiClass::ET
+                    | BidiClass::ON
+                    | BidiClass::BN
+                    | BidiClass::NSM => {}
+                    _ => return false,
+                }
+            }
+
+            // Rule 3: must end in R, AL, EN or AN followed by 0 or more NSM.
+            let rule3_ok = matches!(
+                last_non_nsm_class(),
+                Some(BidiClass::R)
+                    | Some(BidiClass::AL)
+                    | Some(BidiClass::EN)
+                    | Some(BidiClass::AN)
+            );
+            if !rule3_ok {
+                return false;
+            }
+
+            // Rule 4: EN and AN must not both be present.
+            !(found_an && found_en)
+        }
+
+        // Rule 1: Should start with L or R/AL
+        _ => false,
+    }
+}
+
+/// Applies the per-label validity criteria and records failures in `errors`.
+///
+/// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
+/// An empty label passes trivially. The V3 and V5 checks return as soon as
+/// they record an error.
+///
+/// http://www.unicode.org/reports/tr46/#Validity_Criteria
+fn check_validity(label: &str, config: Config, errors: &mut Errors) {
+    let first_char = match label.chars().next() {
+        Some(c) => c,
+        // Empty string, pass
+        None => return,
+    };
+
+    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
+    //
+    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
+    // third and fourth positions. But nobody follows this criteria. See the spec issue below:
+    // https://github.com/whatwg/url/issues/53
+
+    // V3: neither begin nor end with a U+002D HYPHEN-MINUS
+    if config.check_hyphens {
+        let hyphen_at_edge = label.starts_with('-') || label.ends_with('-');
+        if hyphen_at_edge {
+            errors.check_hyphens = true;
+            return;
+        }
+    }
+
+    // V4: not contain a U+002E FULL STOP
+    //
+    // Here, label can't contain '.' since the input is from .split('.')
+
+    // V5: not begin with a GC=Mark
+    if is_combining_mark(first_char) {
+        errors.start_combining_mark = true;
+        return;
+    }
+
+    // V6: Check against Mapping Table
+    let has_invalid_char = label.chars().any(|c| match find_char(c) {
+        Mapping::Valid | Mapping::DisallowedIdna2008 => false,
+        Mapping::Deviation(_) => config.transitional_processing,
+        Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
+        Mapping::Ignored
+        | Mapping::Mapped(_)
+        | Mapping::Disallowed
+        | Mapping::DisallowedStd3Mapped(_) => true,
+    });
+    if has_invalid_char {
+        errors.invalid_mapping = true;
+    }
+
+    // V7: ContextJ rules
+    //
+    // TODO: Implement rules and add *CheckJoiners* flag.
+
+    // V8: Bidi rules are checked inside `processing()`
+}
+
+/// Runs the UTS #46 processing steps on `domain`, appending the result to `output`.
+///
+/// `normalized` is a scratch buffer: it is cleared and then filled with the
+/// mapped, NFC-normalized form of `domain`. `output` is only appended to,
+/// never cleared. The returned `Errors` has a flag set for each kind of
+/// problem that was encountered.
+///
+/// http://www.unicode.org/reports/tr46/#Processing
+#[allow(clippy::manual_strip)] // introduced in 1.45, MSRV is 1.36
+fn processing(
+ domain: &str,
+ config: Config,
+ normalized: &mut String,
+ output: &mut String,
+) -> Errors {
+ // Weed out the simple cases: only allow all lowercase ASCII characters and digits where none
+ // of the labels start with PUNYCODE_PREFIX and labels don't start or end with hyphen.
+ //
+ // `puny_prefix` counts how many leading characters of the current label match "xn--";
+ // it is set to 5 once the label can no longer match and reset to 0 at every '.'.
+ let (mut prev, mut simple, mut puny_prefix) = ('?', !domain.is_empty(), 0);
+ for c in domain.chars() {
+ if c == '.' {
+ if prev == '-' {
+ simple = false;
+ break;
+ }
+ puny_prefix = 0;
+ continue;
+ } else if puny_prefix == 0 && c == '-' {
+ simple = false;
+ break;
+ } else if puny_prefix < 5 {
+ if c == ['x', 'n', '-', '-'][puny_prefix] {
+ puny_prefix += 1;
+ if puny_prefix == 4 {
+ simple = false;
+ break;
+ }
+ } else {
+ puny_prefix = 5;
+ }
+ }
+ if !c.is_ascii_lowercase() && !c.is_ascii_digit() {
+ simple = false;
+ break;
+ }
+ prev = c;
+ }
+
+ // Simple domains are copied through verbatim with no errors.
+ if simple {
+ output.push_str(domain);
+ return Errors::default();
+ }
+
+ normalized.clear();
+ let mut errors = Errors::default();
+ // Remember where this call's output starts; the bidi pass below only
+ // looks at what is appended from here on.
+ let offset = output.len();
+
+ let iter = Mapper {
+ chars: domain.chars(),
+ config,
+ errors: &mut errors,
+ slice: None,
+ };
+
+ // Map every character, then normalize the result to NFC.
+ normalized.extend(iter.nfc());
+
+ let mut decoder = punycode::Decoder::default();
+ // Decoded punycode labels are validated with transitional processing turned off.
+ let non_transitional = config.transitional_processing(false);
+ let (mut first, mut has_bidi_labels) = (true, false);
+ for label in normalized.split('.') {
+ if !first {
+ output.push('.');
+ }
+ first = false;
+ if label.starts_with(PUNYCODE_PREFIX) {
+ match decoder.decode(&label[PUNYCODE_PREFIX.len()..]) {
+ Ok(decode) => {
+ let start = output.len();
+ output.extend(decode);
+ let decoded_label = &output[start..];
+
+ if !has_bidi_labels {
+ has_bidi_labels |= is_bidi_domain(decoded_label);
+ }
+
+ // The NFC and validity checks on the decoded label are skipped
+ // once any error has already been recorded.
+ if !errors.is_err() {
+ if !is_nfc(&decoded_label) {
+ errors.nfc = true;
+ } else {
+ check_validity(decoded_label, non_transitional, &mut errors);
+ }
+ }
+ }
+ Err(()) => {
+ // A label that fails to decode also switches the bidi checks on.
+ has_bidi_labels = true;
+ errors.punycode = true;
+ }
+ }
+ } else {
+ if !has_bidi_labels {
+ has_bidi_labels |= is_bidi_domain(label);
+ }
+
+ // `normalized` is already `NFC` so we can skip that check
+ check_validity(label, config, &mut errors);
+ output.push_str(label)
+ }
+ }
+
+ // Second pass over the labels appended by this call; stops at the first
+ // label that fails the bidi check.
+ for label in output[offset..].split('.') {
+ // V8: Bidi rules
+ //
+ // TODO: Add *CheckBidi* flag
+ if !passes_bidi(label, has_bidi_labels) {
+ errors.check_bidi = true;
+ break;
+ }
+ }
+
+ errors
+}
+
+/// Reusable UTS #46 converter that keeps its scratch buffers between calls.
+#[derive(Default)]
+pub struct Idna {
+ // Flags applied to every conversion made through this instance.
+ config: Config,
+ // Scratch buffer that `processing` fills with the mapped, NFC-normalized input.
+ normalized: String,
+ // Scratch buffer that `to_ascii` passes to `processing` as its output.
+ output: String,
+}
+
+impl Idna {
+    /// Creates a converter that uses `config`, with empty scratch buffers.
+    pub fn new(config: Config) -> Self {
+        Self {
+            config,
+            normalized: String::new(),
+            output: String::new(),
+        }
+    }
+
+    /// Converts `domain` to its ASCII form, appending the result to `out`.
+    ///
+    /// Labels that are not pure ASCII after processing are punycode-encoded
+    /// with the `xn--` prefix. When `verify_dns_length` is enabled, the length
+    /// limits are checked on the text appended by this call (a single trailing
+    /// `.` is ignored for that check).
+    ///
+    /// # Errors
+    ///
+    /// Returns the accumulated `Errors` if any problem was recorded. `out` may
+    /// still have been appended to in that case.
+    ///
+    /// http://www.unicode.org/reports/tr46/#ToASCII
+    #[allow(clippy::wrong_self_convention)]
+    pub fn to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
+        // `processing` only appends to its output buffer, so the scratch buffer
+        // must be reset first; otherwise a reused `Idna` would emit the labels
+        // of earlier calls again.
+        self.output.clear();
+        let mut errors = processing(domain, self.config, &mut self.normalized, &mut self.output);
+
+        // Only the text appended by this call is subject to the DNS length checks.
+        let start = out.len();
+
+        let mut first = true;
+        for label in self.output.split('.') {
+            if !first {
+                out.push('.');
+            }
+            first = false;
+
+            if label.is_ascii() {
+                out.push_str(label);
+            } else {
+                let offset = out.len();
+                out.push_str(PUNYCODE_PREFIX);
+                if let Err(()) = punycode::encode_into(label.chars(), out) {
+                    errors.punycode = true;
+                    // Drop the partially written label, prefix included.
+                    out.truncate(offset);
+                }
+            }
+        }
+
+        if self.config.verify_dns_length {
+            let ascii = &out[start..];
+            let domain = if ascii.ends_with('.') {
+                &ascii[..ascii.len() - 1]
+            } else {
+                ascii
+            };
+            if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
+                errors.too_short_for_dns = true;
+            }
+            if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
+                errors.too_long_for_dns = true;
+            }
+        }
+
+        errors.into()
+    }
+
+    /// Converts `domain` to its Unicode form, appending the result to `out`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the accumulated `Errors` if any problem was recorded. `out` is
+    /// appended to either way.
+    ///
+    /// http://www.unicode.org/reports/tr46/#ToUnicode
+    #[allow(clippy::wrong_self_convention)]
+    pub fn to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
+        processing(domain, self.config, &mut self.normalized, out).into()
+    }
+}
+
+/// Flags controlling UTS #46 processing; built through the chainable setters on `Config`.
+#[derive(Clone, Copy)]
+pub struct Config {
+ // Record an error for characters the table marks as disallowed under STD3 ASCII rules.
+ use_std3_ascii_rules: bool,
+ // Apply the mappings of "deviation" characters instead of keeping them.
+ transitional_processing: bool,
+ // Make `to_ascii` check the DNS length limits on its result.
+ verify_dns_length: bool,
+ // Record an error for labels that begin or end with '-'.
+ check_hyphens: bool,
+ // Record an error for characters the table marks as disallowed in IDNA 2008.
+ use_idna_2008_rules: bool,
+}
+
+/// The defaults are that of https://url.spec.whatwg.org/#idna
+impl Default for Config {
+    /// Returns a configuration with every flag switched off.
+    fn default() -> Self {
+        // There are no `check_bidi` / `check_joiners` fields yet; they are
+        // noted here as `true` for when those flags are added.
+        Config {
+            use_std3_ascii_rules: false,
+            transitional_processing: false,
+            // Only use for to_ascii, not to_unicode
+            verify_dns_length: false,
+            check_hyphens: false,
+            use_idna_2008_rules: false,
+        }
+    }
+}
+
+impl Config {
+    /// Returns a copy of this configuration with `use_std3_ascii_rules` set to `value`.
+    #[inline]
+    pub fn use_std3_ascii_rules(self, value: bool) -> Self {
+        Config {
+            use_std3_ascii_rules: value,
+            ..self
+        }
+    }
+
+    /// Returns a copy of this configuration with `transitional_processing` set to `value`.
+    #[inline]
+    pub fn transitional_processing(self, value: bool) -> Self {
+        Config {
+            transitional_processing: value,
+            ..self
+        }
+    }
+
+    /// Returns a copy of this configuration with `verify_dns_length` set to `value`.
+    #[inline]
+    pub fn verify_dns_length(self, value: bool) -> Self {
+        Config {
+            verify_dns_length: value,
+            ..self
+        }
+    }
+
+    /// Returns a copy of this configuration with `check_hyphens` set to `value`.
+    #[inline]
+    pub fn check_hyphens(self, value: bool) -> Self {
+        Config {
+            check_hyphens: value,
+            ..self
+        }
+    }
+
+    /// Returns a copy of this configuration with `use_idna_2008_rules` set to `value`.
+    #[inline]
+    pub fn use_idna_2008_rules(self, value: bool) -> Self {
+        Config {
+            use_idna_2008_rules: value,
+            ..self
+        }
+    }
+
+    /// Converts `domain` to ASCII using a fresh `Idna` built from this configuration.
+    ///
+    /// http://www.unicode.org/reports/tr46/#ToASCII
+    pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
+        let mut ascii = String::new();
+        Idna::new(self).to_ascii(domain, &mut ascii)?;
+        Ok(ascii)
+    }
+
+    /// Converts `domain` to Unicode using a fresh `Idna` built from this configuration.
+    ///
+    /// The converted string is returned even when errors were recorded.
+    ///
+    /// http://www.unicode.org/reports/tr46/#ToUnicode
+    pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
+        let mut unicode = String::with_capacity(domain.len());
+        let outcome = Idna::new(self).to_unicode(domain, &mut unicode);
+        (unicode, outcome)
+    }
+}
+
+/// Reports whether `s` contains at least one character of bidi class R, AL or AN.
+///
+/// Graphic ASCII characters are skipped without consulting `bidi_class`.
+fn is_bidi_domain(s: &str) -> bool {
+    s.chars()
+        .filter(|c| !c.is_ascii_graphic())
+        .any(|c| {
+            matches!(
+                bidi_class(c),
+                BidiClass::R | BidiClass::AL | BidiClass::AN
+            )
+        })
+}
+
+/// Errors recorded during UTS #46 processing.
+///
+/// This is opaque for now, indicating what types of errors have been encountered at least once.
+/// More details may be exposed in the future.
+#[derive(Default)]
+pub struct Errors {
+ // A punycode label failed to decode, or a label failed to encode.
+ punycode: bool,
+ // A label begins or ends with '-' while `check_hyphens` is enabled.
+ check_hyphens: bool,
+ // A label failed the bidi check.
+ check_bidi: bool,
+ // A label begins with a combining mark.
+ start_combining_mark: bool,
+ // A label contains a character whose mapping status is not acceptable.
+ invalid_mapping: bool,
+ // A decoded punycode label is not in NFC.
+ nfc: bool,
+ // A STD3-disallowed (otherwise valid) character was seen with STD3 rules on.
+ disallowed_by_std3_ascii_rules: bool,
+ // A STD3-disallowed mapped character was seen with STD3 rules on.
+ disallowed_mapped_in_std3: bool,
+ // A disallowed character was seen.
+ disallowed_character: bool,
+ // The ASCII result exceeds 253 bytes, or one of its labels exceeds 63 bytes.
+ too_long_for_dns: bool,
+ // The ASCII result, or one of its labels, is empty.
+ too_short_for_dns: bool,
+ // An IDNA 2008-disallowed character was seen with IDNA 2008 rules on.
+ disallowed_in_idna_2008: bool,
+}
+
+impl Errors {
+    /// Returns `true` if at least one error flag is set.
+    fn is_err(&self) -> bool {
+        // Destructure exhaustively so that adding a field without handling it
+        // here is a compile error.
+        let Errors {
+            punycode,
+            check_hyphens,
+            check_bidi,
+            start_combining_mark,
+            invalid_mapping,
+            nfc,
+            disallowed_by_std3_ascii_rules,
+            disallowed_mapped_in_std3,
+            disallowed_character,
+            too_long_for_dns,
+            too_short_for_dns,
+            disallowed_in_idna_2008,
+        } = *self;
+
+        let flags = [
+            punycode,
+            check_hyphens,
+            check_bidi,
+            start_combining_mark,
+            invalid_mapping,
+            nfc,
+            disallowed_by_std3_ascii_rules,
+            disallowed_mapped_in_std3,
+            disallowed_character,
+            too_long_for_dns,
+            too_short_for_dns,
+            disallowed_in_idna_2008,
+        ];
+        flags.iter().any(|&flag| flag)
+    }
+}
+
+impl fmt::Debug for Errors {
+    /// Writes the names of the flags that are set, as `Errors { a, b }`.
+    ///
+    /// With no flag set the output is `Errors { }`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Destructure exhaustively so that adding a field without listing it
+        // here is a compile error.
+        let Errors {
+            punycode,
+            check_hyphens,
+            check_bidi,
+            start_combining_mark,
+            invalid_mapping,
+            nfc,
+            disallowed_by_std3_ascii_rules,
+            disallowed_mapped_in_std3,
+            disallowed_character,
+            too_long_for_dns,
+            too_short_for_dns,
+            disallowed_in_idna_2008,
+        } = *self;
+
+        let fields = [
+            ("punycode", punycode),
+            ("check_hyphens", check_hyphens),
+            ("check_bidi", check_bidi),
+            ("start_combining_mark", start_combining_mark),
+            ("invalid_mapping", invalid_mapping),
+            ("nfc", nfc),
+            (
+                "disallowed_by_std3_ascii_rules",
+                disallowed_by_std3_ascii_rules,
+            ),
+            ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
+            ("disallowed_character", disallowed_character),
+            ("too_long_for_dns", too_long_for_dns),
+            ("too_short_for_dns", too_short_for_dns),
+            ("disallowed_in_idna_2008", disallowed_in_idna_2008),
+        ];
+
+        f.write_str("Errors { ")?;
+
+        let mut wrote_any = false;
+        for entry in fields.iter().filter(|entry| entry.1) {
+            if wrote_any {
+                f.write_str(", ")?;
+            }
+            f.write_str(entry.0)?;
+            wrote_any = true;
+        }
+
+        // The opening text already ends with a space, so the empty case only
+        // needs the closing brace.
+        f.write_str(if wrote_any { " }" } else { "}" })
+    }
+}
+
+impl From<Errors> for Result<(), Errors> {
+    /// Turns an `Errors` value into `Err` if any flag is set, and `Ok(())` otherwise.
+    fn from(e: Errors) -> Result<(), Errors> {
+        if e.is_err() {
+            return Err(e);
+        }
+        Ok(())
+    }
+}
+
+// No methods are overridden: `Errors` relies on its `Debug` and `Display`
+// impls and on the trait's default methods.
+impl StdError for Errors {}
+
+impl fmt::Display for Errors {
+    /// Formats exactly like the `Debug` implementation.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        <Self as fmt::Debug>::fmt(self, f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{find_char, Mapping};
+
+    /// Every character that `Mapper::next` passes through without a table
+    /// lookup must be `Valid` in the table itself.
+    #[test]
+    fn mapping_fast_path() {
+        let fast_path_chars = "-.0123456789abcdefghijklmnopqrstuvwxyz";
+        for c in fast_path_chars.chars() {
+            assert_matches!(find_char(c), &Mapping::Valid);
+        }
+    }
+}