// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license,
// at your option. This file may not be copied, modified, or distributed
// except according to those terms.

//! `chardetng` is a character encoding detector for legacy Web content.
//!
//! It is optimized for binary size in applications that already depend
//! on `encoding_rs` for other reasons.

use encoding_rs::Decoder;
use encoding_rs::DecoderResult;
use encoding_rs::Encoding;
use encoding_rs::BIG5;
use encoding_rs::EUC_JP;
use encoding_rs::EUC_KR;
use encoding_rs::GBK;
use encoding_rs::ISO_2022_JP;
use encoding_rs::ISO_8859_8;
use encoding_rs::SHIFT_JIS;
use encoding_rs::UTF_8;
use encoding_rs::WINDOWS_1255;

mod data;
mod tld;

use data::*;
use tld::classify_tld;
use tld::Tld;

const LATIN_ADJACENCY_PENALTY: i64 = -50;

const IMPLAUSIBILITY_PENALTY: i64 = -220;

const ORDINAL_BONUS: i64 = 300;

/// Must match the ISO-8859-2 score for " Š ". Note: There
/// are four Slovenian Wikipedia list page titles where the
/// list is split by letter so that Š stands alone for the
/// list part for Š. Let's assume that's a special case not
/// worth detecting even though the copyright sign detection
/// makes Slovenian title detection round to one percentage
/// point worse.
const COPYRIGHT_BONUS: i64 = 222;

const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;

// Manually calibrated relative to windows-1256 Arabic
const CJK_BASE_SCORE: i64 = 41;

const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20

const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;
const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;
// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;
const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;
// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?
const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;
const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5
const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;
const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;
const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;
const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;
const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);
const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR!

const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;
const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;
const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR!
// (25 as the multiplier is too little)
const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;
const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;
const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;
const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);
const EUC_KR_LONG_WORD_PENALTY: i64 = -6;
const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1; // Break tie in favor of GBK
const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;
const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;
const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;
const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;
const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Factor should be at least 2, but should it be larger?
const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE; // smaller penalty than LATIN_ADJACENCY_PENALTY

const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Latin letter caseless class
const LATIN_LETTER: u8 = 1;

fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    for &b in label.into_iter() {
        if b >= 0x80 {
            return true;
        }
        if b == b'.' {
            return true;
        }
        if b >= b'A' && b <= b'Z' {
            return true;
        }
    }
    false
}

// For Latin, we only penalize pairwise bad transitions
// if one participant is non-ASCII. This avoids violating
// the principle that ASCII pairs never contribute to the
// score. (Maybe that's a bad principle, though!)
#[derive(PartialEq)]
enum LatinCaseState {
    Space,
    Upper,
    Lower,
    AllCaps,
}

// For non-Latin, we calculate case-related penalty
// or bonus on a per-non-Latin-word basis.
#[derive(PartialEq)]
enum NonLatinCaseState {
    Space,
    Upper,
    Lower,
    UpperLower,
    AllCaps,
    Mix,
}

struct NonLatinCasedCandidate {
    data: &'static SingleByteData,
    prev: u8,
    case_state: NonLatinCaseState,
    prev_ascii: bool,
    current_word_len: u64,
    longest_word: u64,
    ibm866: bool,
    prev_was_a0: bool, // Only used with IBM866
}

impl NonLatinCasedCandidate {
    fn new(data: &'static SingleByteData) -> Self {
        NonLatinCasedCandidate {
            data: data,
            prev: 0,
            case_state: NonLatinCaseState::Space,
            prev_ascii: true,
            current_word_len: 0,
            longest_word: 0,
            ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
            prev_was_a0: false,
        }
    }

    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
        let mut score = 0i64;
        for &b in buffer {
            let class = self.data.classify(b);
            if class == 255 {
                return None;
            }
            let caseless_class = class & 0x7F;

            let ascii = b < 0x80;
            let ascii_pair = self.prev_ascii && ascii;

            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);

            // The purpose of this state machine is to avoid misdetecting Greek as
            // Cyrillic by:
            //
            // * Giving a small bonus to words that start with an upper-case letter
            //   and are lower-case for the rest.
            // * Giving a large penalty to words that start with one lower-case letter
            //   followed by all upper-case (obviously upper and lower case inverted,
            //   which unfortunately is possible due to KOI8-U).
            // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor
            //   all-lowercase Greek over all-caps KOI8-U).
            // * Giving large penalties for mixed-case other than initial upper-case.
            //   This also helps relative to non-cased encodings.
            // ASCII doesn't participate in non-Latin casing.
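            // Roughly how this plays out below: a word with one upper-case letter
            // followed by lower-case letters ends the word in the UpperLower state
            // and receives NON_LATIN_CAPITALIZATION_BONUS once; an all-caps word
            // ends in AllCaps and is penalized once, and only for KOI8-U; any other
            // mixture ends in Mix and is penalized once per letter of the word.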
if caseless_class == LATIN_LETTER { // Latin // Mark this word as a mess. If there end up being non-Latin // letters in this word, the ASCII-adjacency penalty gets // applied to Latin/non-Latin pairs and the mix penalty // to non-Latin/non-Latin pairs. // XXX Apply penalty here self.case_state = NonLatinCaseState::Mix; } else if !non_ascii_alphabetic { // Space match self.case_state { NonLatinCaseState::Space | NonLatinCaseState::Upper | NonLatinCaseState::Lower => {} NonLatinCaseState::UpperLower => { // Intentionally applied only once per word. score += NON_LATIN_CAPITALIZATION_BONUS; } NonLatinCaseState::AllCaps => { // Intentionally applied only once per word. if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] { // Apply only to KOI8-U. score += NON_LATIN_ALL_CAPS_PENALTY; } } NonLatinCaseState::Mix => { // Per letter score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64); } } self.case_state = NonLatinCaseState::Space; } else if (class >> 7) == 0 { // Lower case match self.case_state { NonLatinCaseState::Space => { self.case_state = NonLatinCaseState::Lower; } NonLatinCaseState::Upper => { self.case_state = NonLatinCaseState::UpperLower; } NonLatinCaseState::Lower | NonLatinCaseState::UpperLower | NonLatinCaseState::Mix => {} NonLatinCaseState::AllCaps => { self.case_state = NonLatinCaseState::Mix; } } } else { // Upper case match self.case_state { NonLatinCaseState::Space => { self.case_state = NonLatinCaseState::Upper; } NonLatinCaseState::Upper => { self.case_state = NonLatinCaseState::AllCaps; } NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => { self.case_state = NonLatinCaseState::Mix; } NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {} } } // XXX Apply penalty if > 16 if non_ascii_alphabetic { self.current_word_len += 1; } else { if self.current_word_len > self.longest_word { self.longest_word = self.current_word_len; } self.current_word_len = 0; } let is_a0 = b == 0xA0; if !ascii_pair { // 0xA0 is no-break space in many other encodings, so avoid // assigning score to IBM866 when 0xA0 occurs next to itself // or a space-like byte. 
if !(self.ibm866 && ((is_a0 && (self.prev_was_a0 || self.prev == 0)) || caseless_class == 0 && self.prev_was_a0)) { score += self.data.score(caseless_class, self.prev, false); } if self.prev == LATIN_LETTER && non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } else if caseless_class == LATIN_LETTER && self.data.is_non_latin_alphabetic(self.prev, false) { score += LATIN_ADJACENCY_PENALTY; } } self.prev_ascii = ascii; self.prev = caseless_class; self.prev_was_a0 = is_a0; } Some(score) } } enum OrdinalState { Other, Space, PeriodAfterN, OrdinalExpectingSpace, OrdinalExpectingSpaceUndoImplausibility, OrdinalExpectingSpaceOrDigit, OrdinalExpectingSpaceOrDigitUndoImplausibily, UpperN, LowerN, FeminineAbbreviationStartLetter, Digit, Roman, Copyright, } struct LatinCandidate { data: &'static SingleByteData, prev: u8, case_state: LatinCaseState, prev_non_ascii: u32, ordinal_state: OrdinalState, // Used only when `windows1252 == true` windows1252: bool, } impl LatinCandidate { fn new(data: &'static SingleByteData) -> Self { LatinCandidate { data: data, prev: 0, case_state: LatinCaseState::Space, prev_non_ascii: 0, ordinal_state: OrdinalState::Space, windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX], } } fn feed(&mut self, buffer: &[u8]) -> Option { let mut score = 0i64; for &b in buffer { let class = self.data.classify(b); if class == 255 { return None; } let caseless_class = class & 0x7F; let ascii = b < 0x80; let ascii_pair = self.prev_non_ascii == 0 && ascii; let non_ascii_penalty = match self.prev_non_ascii { 0 | 1 | 2 => 0, 3 => -5, 4 => -20, _ => -200, }; score += non_ascii_penalty; // XXX if has Vietnamese-only characters and word length > 7, // apply penalty if !self.data.is_latin_alphabetic(caseless_class) { self.case_state = LatinCaseState::Space; } else if (class >> 7) == 0 { // Penalizing lower case after two upper case // is important for avoiding misdetecting // windows-1250 as windows-1252 (byte 0x9F). if self.case_state == LatinCaseState::AllCaps && !ascii_pair { score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; } self.case_state = LatinCaseState::Lower; } else { match self.case_state { LatinCaseState::Space => { self.case_state = LatinCaseState::Upper; } LatinCaseState::Upper | LatinCaseState::AllCaps => { self.case_state = LatinCaseState::AllCaps; } LatinCaseState::Lower => { if !ascii_pair { // XXX How bad is this for Irish Gaelic? score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; } self.case_state = LatinCaseState::Upper; } } } // Treat pairing space-like, which can be non-ASCII, with ASCII as // ASCIIish enough not to get a score in order to avoid giving // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote // a score. This avoids detecting English I’ as Turkish. 
let ascii_ish_pair = ascii_pair || (ascii && self.prev == 0) || (caseless_class == 0 && self.prev_non_ascii == 0); if !ascii_ish_pair { score += self.data.score(caseless_class, self.prev, false); } if self.windows1252 { // This state machine assigns score to the sequences // * " º " (Spanish) // * " ª " (Spanish) // * ".ª " (Spanish) // * ".º " (Spanish) // * "n.º1" (Spanish) // * " Mª " (Spanish) // * " Dª " (Spanish) // * " Nª " (Spanish) // * " Sª " (Spanish) // * " 3º " (Italian, where 3 is an ASCII digit) // * " 3ª " (Italian, where 3 is an ASCII digit) // * " Xº " (Italian, where X is a small Roman numeral) // * " Xª " (Italian, where X is a small Roman numeral) // * " Nº1" (Italian, where 1 is an ASCII digit) // * " Nº " (Italian) // * " © " (otherwise ASCII-only) // which are problematic to deal with by pairwise scoring // without messing up Romanian detection. // Initial sc match self.ordinal_state { OrdinalState::Other => { if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } } OrdinalState::Space => { if caseless_class == 0 { // pass } else if b == 0xAA || b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpace; } else if b == b'M' || b == b'D' || b == b'S' { self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter; } else if b == b'N' { // numero or Nuestra self.ordinal_state = OrdinalState::UpperN; } else if b == b'n' { // numero self.ordinal_state = OrdinalState::LowerN; } else if caseless_class == (ASCII_DIGIT as u8) { self.ordinal_state = OrdinalState::Digit; } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 /* X */ { self.ordinal_state = OrdinalState::Roman; } else if b == 0xA9 { self.ordinal_state = OrdinalState::Copyright; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::OrdinalExpectingSpace => { if caseless_class == 0 { score += ORDINAL_BONUS; self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::OrdinalExpectingSpaceUndoImplausibility => { if caseless_class == 0 { score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::OrdinalExpectingSpaceOrDigit => { if caseless_class == 0 { score += ORDINAL_BONUS; self.ordinal_state = OrdinalState::Space; } else if caseless_class == (ASCII_DIGIT as u8) { score += ORDINAL_BONUS; // Deliberately set to `Other` self.ordinal_state = OrdinalState::Other; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => { if caseless_class == 0 { score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; self.ordinal_state = OrdinalState::Space; } else if caseless_class == (ASCII_DIGIT as u8) { score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY; // Deliberately set to `Other` self.ordinal_state = OrdinalState::Other; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::UpperN => { if b == 0xAA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility; } else if b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; } else if b == b'.' { self.ordinal_state = OrdinalState::PeriodAfterN; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::LowerN => { if b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily; } else if b == b'.' 
{ self.ordinal_state = OrdinalState::PeriodAfterN; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::FeminineAbbreviationStartLetter => { if b == 0xAA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::Digit => { if b == 0xAA || b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpace; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else if caseless_class == (ASCII_DIGIT as u8) { // pass } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::Roman => { if b == 0xAA || b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceUndoImplausibility; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24 /* X */ { // pass } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::PeriodAfterN => { if b == 0xBA { self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit; } else if caseless_class == 0 { self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } OrdinalState::Copyright => { if caseless_class == 0 { score += COPYRIGHT_BONUS; self.ordinal_state = OrdinalState::Space; } else { self.ordinal_state = OrdinalState::Other; } } } } if ascii { self.prev_non_ascii = 0; } else { self.prev_non_ascii += 1; } self.prev = caseless_class; } Some(score) } } struct ArabicFrenchCandidate { data: &'static SingleByteData, prev: u8, case_state: LatinCaseState, prev_ascii: bool, current_word_len: u64, longest_word: u64, } impl ArabicFrenchCandidate { fn new(data: &'static SingleByteData) -> Self { ArabicFrenchCandidate { data: data, prev: 0, case_state: LatinCaseState::Space, prev_ascii: true, current_word_len: 0, longest_word: 0, } } fn feed(&mut self, buffer: &[u8]) -> Option { let mut score = 0i64; for &b in buffer { let class = self.data.classify(b); if class == 255 { return None; } let caseless_class = class & 0x7F; let ascii = b < 0x80; let ascii_pair = self.prev_ascii && ascii; if caseless_class != LATIN_LETTER { // We compute case penalties for French only self.case_state = LatinCaseState::Space; } else if (class >> 7) == 0 { if self.case_state == LatinCaseState::AllCaps && !ascii_pair { score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; } self.case_state = LatinCaseState::Lower; } else { match self.case_state { LatinCaseState::Space => { self.case_state = LatinCaseState::Upper; } LatinCaseState::Upper | LatinCaseState::AllCaps => { self.case_state = LatinCaseState::AllCaps; } LatinCaseState::Lower => { if !ascii_pair { score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY; } self.case_state = LatinCaseState::Upper; } } } // Count only Arabic word length and ignore French let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true); // XXX apply penalty if > 23 if non_ascii_alphabetic { self.current_word_len += 1; } else { if self.current_word_len > self.longest_word { self.longest_word = self.current_word_len; } self.current_word_len = 0; } if !ascii_pair { score += self.data.score(caseless_class, self.prev, true); if self.prev == LATIN_LETTER && non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } else if caseless_class == LATIN_LETTER && self.data.is_non_latin_alphabetic(self.prev, 
true) { score += LATIN_ADJACENCY_PENALTY; } } self.prev_ascii = ascii; self.prev = caseless_class; } Some(score) } } struct CaselessCandidate { data: &'static SingleByteData, prev: u8, prev_ascii: bool, current_word_len: u64, longest_word: u64, } impl CaselessCandidate { fn new(data: &'static SingleByteData) -> Self { CaselessCandidate { data: data, prev: 0, prev_ascii: true, current_word_len: 0, longest_word: 0, } } fn feed(&mut self, buffer: &[u8]) -> Option { let mut score = 0i64; for &b in buffer { let class = self.data.classify(b); if class == 255 { return None; } let caseless_class = class & 0x7F; let ascii = b < 0x80; let ascii_pair = self.prev_ascii && ascii; let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); // Apply penalty if > 23 and not Thai if non_ascii_alphabetic { self.current_word_len += 1; } else { if self.current_word_len > self.longest_word { self.longest_word = self.current_word_len; } self.current_word_len = 0; } if !ascii_pair { score += self.data.score(caseless_class, self.prev, false); if self.prev == LATIN_LETTER && non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } else if caseless_class == LATIN_LETTER && self.data.is_non_latin_alphabetic(self.prev, false) { score += LATIN_ADJACENCY_PENALTY; } } self.prev_ascii = ascii; self.prev = caseless_class; } Some(score) } } fn is_ascii_punctuation(byte: u8) -> bool { match byte { b'.' | b',' | b':' | b';' | b'?' | b'!' => true, _ => false, } } struct LogicalCandidate { data: &'static SingleByteData, prev: u8, prev_ascii: bool, plausible_punctuation: u64, current_word_len: u64, longest_word: u64, } impl LogicalCandidate { fn new(data: &'static SingleByteData) -> Self { LogicalCandidate { data: data, prev: 0, prev_ascii: true, plausible_punctuation: 0, current_word_len: 0, longest_word: 0, } } fn feed(&mut self, buffer: &[u8]) -> Option { let mut score = 0i64; for &b in buffer { let class = self.data.classify(b); if class == 255 { return None; } let caseless_class = class & 0x7F; let ascii = b < 0x80; let ascii_pair = self.prev_ascii && ascii; let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); // XXX apply penalty if > 22 if non_ascii_alphabetic { self.current_word_len += 1; } else { if self.current_word_len > self.longest_word { self.longest_word = self.current_word_len; } self.current_word_len = 0; } if !ascii_pair { score += self.data.score(caseless_class, self.prev, false); let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false); if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) { self.plausible_punctuation += 1; } if self.prev == LATIN_LETTER && non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } } self.prev_ascii = ascii; self.prev = caseless_class; } Some(score) } } struct VisualCandidate { data: &'static SingleByteData, prev: u8, prev_ascii: bool, prev_punctuation: bool, plausible_punctuation: u64, current_word_len: u64, longest_word: u64, } impl VisualCandidate { fn new(data: &'static SingleByteData) -> Self { VisualCandidate { data: data, prev: 0, prev_ascii: true, prev_punctuation: false, plausible_punctuation: 0, current_word_len: 0, longest_word: 0, } } fn feed(&mut self, buffer: &[u8]) -> Option { let mut score = 0i64; for &b in buffer { let class = self.data.classify(b); if class == 255 { return None; } let caseless_class = class & 0x7F; let ascii = b < 0x80; let 
ascii_pair = self.prev_ascii && ascii; let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false); // XXX apply penalty if > 22 if non_ascii_alphabetic { self.current_word_len += 1; } else { if self.current_word_len > self.longest_word { self.longest_word = self.current_word_len; } self.current_word_len = 0; } if !ascii_pair { score += self.data.score(caseless_class, self.prev, false); if non_ascii_alphabetic && self.prev_punctuation { self.plausible_punctuation += 1; } if self.prev == LATIN_LETTER && non_ascii_alphabetic { score += LATIN_ADJACENCY_PENALTY; } else if caseless_class == LATIN_LETTER && self.data.is_non_latin_alphabetic(self.prev, false) { score += LATIN_ADJACENCY_PENALTY; } } self.prev_ascii = ascii; self.prev = caseless_class; self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b); } Some(score) } } struct Utf8Candidate { decoder: Decoder, } impl Utf8Candidate { fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut dst = [0u8; 1024]; let mut total_read = 0; loop { let (result, read, _) = self.decoder.decode_to_utf8_without_replacement( &buffer[total_read..], &mut dst, last, ); total_read += read; match result { DecoderResult::InputEmpty => { return Some(0); } DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { continue; } } } } } struct Iso2022Candidate { decoder: Decoder, } impl Iso2022Candidate { fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut dst = [0u16; 1024]; let mut total_read = 0; loop { let (result, read, _) = self.decoder.decode_to_utf16_without_replacement( &buffer[total_read..], &mut dst, last, ); total_read += read; match result { DecoderResult::InputEmpty => { return Some(0); } DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { continue; } } } } } #[derive(PartialEq)] enum LatinCj { AsciiLetter, Cj, Other, } #[derive(PartialEq, Copy, Clone)] enum HalfWidthKatakana { DakutenForbidden, DakutenAllowed, DakutenOrHandakutenAllowed, } #[derive(PartialEq)] enum LatinKorean { AsciiLetter, Hangul, Hanja, Other, } fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 { if let Some(pos) = table.iter().position(|&x| x == u) { ((128 - pos) / 16) as i64 } else { 0 } } struct GbkCandidate { decoder: Decoder, prev_byte: u8, prev: LatinCj, pending_score: Option, } impl GbkCandidate { fn maybe_set_as_pending(&mut self, s: i64) -> i64 { assert!(self.pending_score.is_none()); if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) { s } else { self.pending_score = Some(s); 0 } } fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut score = 0i64; let mut src = [0u8]; let mut dst = [0u16; 2]; for &b in buffer { src[0] = b; let (result, read, written) = self .decoder .decode_to_utf16_without_replacement(&src, &mut dst, false); if written == 1 { let u = dst[0]; if (u >= u16::from(b'a') && u <= u16::from(b'z')) || (u >= u16::from(b'A') && u <= u16::from(b'Z')) { self.pending_score = None; // Discard pending score if self.prev == LatinCj::Cj { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::AsciiLetter; } else if u == 0x20AC { // euro sign self.pending_score = None; // Discard pending score // Should there even be a penalty? 
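// The euro sign can come from the single byte 0x80 here, the same position as
// the windows-1252 euro, so it carries no useful CJK signal; any pending score
// has been discarded above and no score is assigned for it.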
self.prev = LatinCj::Other; } else if u >= 0x4E00 && u <= 0x9FA5 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } if b >= 0xA1 && b <= 0xFE { match self.prev_byte { 0xA1..=0xD7 => { score += GBK_SCORE_PER_LEVEL_1; score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified); } 0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2, _ => { score += GBK_SCORE_PER_NON_EUC; } } } else { score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC); } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // XXX score? if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if u >= 0xE000 && u < 0xF900 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // Treat the GB18030-required PUA mappings as non-EUC ideographs. match u { 0xE78D..=0xE796 | 0xE816..=0xE818 | 0xE81E | 0xE826 | 0xE82B | 0xE82C | 0xE831 | 0xE832 | 0xE83B | 0xE843 | 0xE854 | 0xE855 | 0xE864 => { score += GBK_SCORE_PER_NON_EUC; if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } _ => { score += GBK_PUA_PENALTY; self.prev = LatinCj::Other; } } } else { match u { 0x3000 // Distinct from Korean, space | 0x3001 // Distinct from Korean, enumeration comma | 0x3002 // Distinct from Korean, full stop | 0xFF08 // Distinct from Korean, parenthesis | 0xFF09 // Distinct from Korean, parenthesis | 0xFF01 // Distinct from Japanese, exclamation | 0xFF0C // Distinct from Japanese, comma | 0xFF1B // Distinct from Japanese, semicolon | 0xFF1F // Distinct from Japanese, question => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += CJ_PUNCTUATION; } 0..=0x7F => { self.pending_score = None; // Discard pending score } _ => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += CJK_OTHER; } } self.prev = LatinCj::Other; } } else if written == 2 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } let u = dst[0]; if u >= 0xDB80 && u <= 0xDBFF { score += GBK_PUA_PENALTY; self.prev = LatinCj::Other; } else if u >= 0xD480 && u < 0xD880 { score += GBK_SCORE_PER_NON_EUC; if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else { score += CJK_OTHER; self.prev = LatinCj::Other; } } match result { DecoderResult::InputEmpty => { assert_eq!(read, 1); } DecoderResult::Malformed(malformed_len, _) => { if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD) && (b < 0x80 || b == 0xFF) { // Mac OS Chinese Simplified single-byte that conflicts with code page GBK lead byte // followed by ASCII or a non-conflicting single-byte extension. self.pending_score = None; // Just in case score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { self.prev = LatinCj::AsciiLetter; } else if b == 0xFF { score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinCj::Other; } else { self.prev = LatinCj::Other; } // The GBK decoder has the pending ASCII concept, which is // a problem with this trickery, so let's reset the state. 
self.decoder = GBK.new_decoder_without_bom_handling(); } else if malformed_len == 1 && b == 0xFF { // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes self.pending_score = None; // Just in case score += GBK_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinCj::Other; // The GBK decoder has the pending ASCII concept, which is // a problem with this trickery, so let's reset the state. self.decoder = GBK.new_decoder_without_bom_handling(); } else { return None; } } DecoderResult::OutputFull => { unreachable!(); } } self.prev_byte = b; } if last { let (result, _, _) = self .decoder .decode_to_utf16_without_replacement(b"", &mut dst, true); match result { DecoderResult::InputEmpty => {} DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { unreachable!(); } } } Some(score) } } // Shift_JIS and Big5 fn problematic_lead(b: u8) -> bool { match b { 0x91..=0x97 | 0x9A | 0x8A | 0x9B | 0x8B | 0x9E | 0x8E | 0xB0 => true, _ => false, } } // GBK and EUC-KR fn more_problematic_lead(b: u8) -> bool { problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0 } struct ShiftJisCandidate { decoder: Decoder, half_width_katakana_seen: bool, half_width_katakana_state: HalfWidthKatakana, prev: LatinCj, prev_byte: u8, pending_score: Option, } impl ShiftJisCandidate { fn maybe_set_as_pending(&mut self, s: i64) -> i64 { assert!(self.pending_score.is_none()); if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) { s } else { self.pending_score = Some(s); 0 } } fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut score = 0i64; let mut src = [0u8]; let mut dst = [0u16; 2]; for &b in buffer { src[0] = b; let (result, read, written) = self .decoder .decode_to_utf16_without_replacement(&src, &mut dst, false); if written > 0 { let half_width_katakana_state = self.half_width_katakana_state; self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden; let u = dst[0]; if (u >= u16::from(b'a') && u <= u16::from(b'z')) || (u >= u16::from(b'A') && u <= u16::from(b'Z')) { self.pending_score = None; // Discard pending score if self.prev == LatinCj::Cj { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::AsciiLetter; } else if u >= 0xFF61 && u <= 0xFF9F { if !self.half_width_katakana_seen { self.half_width_katakana_seen = true; // To avoid misdetecting title-length inputs score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY; } self.pending_score = None; // Discard pending score score += HALF_WIDTH_KATAKANA_SCORE; if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 { self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed; } else if u >= 0xFF8A && u <= 0xFF8E { self.half_width_katakana_state = HalfWidthKatakana::DakutenOrHandakutenAllowed; } else if u == 0xFF9E { if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden { score += IMPLAUSIBILITY_PENALTY; } else { score += HALF_WIDTH_KATAKANA_VOICING_SCORE; } } else if u == 0xFF9F { if half_width_katakana_state != HalfWidthKatakana::DakutenOrHandakutenAllowed { score += IMPLAUSIBILITY_PENALTY; } else { score += HALF_WIDTH_KATAKANA_VOICING_SCORE; } } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if u >= 0x3040 && u < 0x3100 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += SHIFT_JIS_SCORE_PER_KANA; if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if (u >= 0x3400 && u < 
0xA000) || (u >= 0xF900 && u < 0xFB00) { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) { score += self.maybe_set_as_pending( SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji), ); } else { score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI); } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if u >= 0xE000 && u < 0xF900 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += SHIFT_JIS_PUA_PENALTY; self.prev = LatinCj::Other; } else { match u { 0x3000 // Distinct from Korean, space | 0x3001 // Distinct from Korean, enumeration comma | 0x3002 // Distinct from Korean, full stop | 0xFF08 // Distinct from Korean, parenthesis | 0xFF09 // Distinct from Korean, parenthesis => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // Not really needed for CJK distinction // but let's give non-zero score for these // common byte pairs anyway. score += CJ_PUNCTUATION; } 0..=0x7F => { self.pending_score = None; // Discard pending score } 0x80 => { // This is a control character that overlaps euro // in windows-1252 and happens to be a non-error // is Shift_JIS. self.pending_score = None; // Discard pending score score += IMPLAUSIBILITY_PENALTY; } _ => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += CJK_OTHER; } } self.prev = LatinCj::Other; } } match result { DecoderResult::InputEmpty => { assert_eq!(read, 1); } DecoderResult::Malformed(malformed_len, _) => { if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F) || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC)) && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC))) && !((self.prev_byte == 0x82 && b >= 0xFA) || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB)) || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA) || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D) || (self.prev_byte == 0xFC && b >= 0xF5)) { // Shift_JIS2004 or MacJapanese if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += SHIFT_JIS_EXTENSION_PENALTY; // Approximate boundary if self.prev_byte < 0x87 { self.prev = LatinCj::Other; } else { if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) { self.pending_score = None; // Just in case score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinCj::Other; } else { return None; } } DecoderResult::OutputFull => { unreachable!(); } } self.prev_byte = b; } if last { let (result, _, _) = self .decoder .decode_to_utf16_without_replacement(b"", &mut dst, true); match result { DecoderResult::InputEmpty => {} DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { unreachable!(); } } } Some(score) } } struct EucJpCandidate { decoder: Decoder, non_ascii_seen: bool, half_width_katakana_state: HalfWidthKatakana, prev: LatinCj, prev_byte: u8, prev_prev_byte: u8, } impl EucJpCandidate { fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut score = 0i64; let mut src = [0u8]; let mut dst = [0u16; 2]; for &b in buffer { src[0] = b; let (result, read, written) = self .decoder .decode_to_utf16_without_replacement(&src, &mut dst, false); if written > 0 { let 
half_width_katakana_state = self.half_width_katakana_state; self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden; let u = dst[0]; if !self.non_ascii_seen && u >= 0x80 { self.non_ascii_seen = true; if u >= 0x3040 && u < 0x3100 { // Remove the kana advantage over initial Big5 // hanzi. score += EUC_JP_INITIAL_KANA_PENALTY; } } if (u >= u16::from(b'a') && u <= u16::from(b'z')) || (u >= u16::from(b'A') && u <= u16::from(b'Z')) { if self.prev == LatinCj::Cj { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::AsciiLetter; } else if u >= 0xFF61 && u <= 0xFF9F { score += HALF_WIDTH_KATAKANA_SCORE; if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 { self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed; } else if u >= 0xFF8A && u <= 0xFF8E { self.half_width_katakana_state = HalfWidthKatakana::DakutenOrHandakutenAllowed; } else if u == 0xFF9E { if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden { score += IMPLAUSIBILITY_PENALTY; } else { score += HALF_WIDTH_KATAKANA_VOICING_SCORE; } } else if u == 0xFF9F { if half_width_katakana_state != HalfWidthKatakana::DakutenOrHandakutenAllowed { score += IMPLAUSIBILITY_PENALTY; } else { score += HALF_WIDTH_KATAKANA_VOICING_SCORE; } } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Other; } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) { match u { 0x3090 // hiragana wi | 0x3091 // hiragana we | 0x30F0 // katakana wi | 0x30F1 // katakana we => { // Remove advantage over Big5 Hanzi score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA; } _ => { score += EUC_JP_SCORE_PER_KANA; } } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { if self.prev_prev_byte == 0x8F { score += EUC_JP_SCORE_PER_OTHER_KANJI; } else if self.prev_byte < 0xD0 { score += EUC_JP_SCORE_PER_LEVEL_1_KANJI; score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji); } else { score += EUC_JP_SCORE_PER_LEVEL_2_KANJI; } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else { match u { 0x3000 // Distinct from Korean, space | 0x3001 // Distinct from Korean, enumeration comma | 0x3002 // Distinct from Korean, full stop | 0xFF08 // Distinct from Korean, parenthesis | 0xFF09 // Distinct from Korean, parenthesis => { score += CJ_PUNCTUATION; } 0..=0x7F => {} _ => { score += CJK_OTHER; } } self.prev = LatinCj::Other; } } match result { DecoderResult::InputEmpty => { assert_eq!(read, 1); } DecoderResult::Malformed(_, _) => { if b >= 0xA1 && b <= 0xFE && self.prev_byte >= 0xA1 && self.prev_byte <= 0xFE && ((self.prev_prev_byte != 0x8F && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6) && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC) && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE)) || (self.prev_prev_byte == 0x8F && self.prev_byte != 0xA2 && self.prev_byte != 0xA6 && self.prev_byte != 0xA7 && self.prev_byte != 0xA9 && self.prev_byte != 0xAA && self.prev_byte != 0xAB && self.prev_byte != 0xED && !(self.prev_byte == 0xFE && b >= 0xF7))) { score += EUC_JP_EXTENSION_PENALTY; if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else { return None; } } DecoderResult::OutputFull => { unreachable!(); } } self.prev_prev_byte = self.prev_byte; self.prev_byte = b; } if last { let (result, _, _) = self .decoder 
.decode_to_utf16_without_replacement(b"", &mut dst, true); match result { DecoderResult::InputEmpty => {} DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { unreachable!(); } } } Some(score) } } struct Big5Candidate { decoder: Decoder, prev: LatinCj, prev_byte: u8, pending_score: Option, } impl Big5Candidate { fn maybe_set_as_pending(&mut self, s: i64) -> i64 { assert!(self.pending_score.is_none()); if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) { s } else { self.pending_score = Some(s); 0 } } fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut score = 0i64; let mut src = [0u8]; let mut dst = [0u16; 2]; for &b in buffer { src[0] = b; let (result, read, written) = self .decoder .decode_to_utf16_without_replacement(&src, &mut dst, false); if written == 1 { let u = dst[0]; if (u >= u16::from(b'a') && u <= u16::from(b'z')) || (u >= u16::from(b'A') && u <= u16::from(b'Z')) { self.pending_score = None; // Discard pending score if self.prev == LatinCj::Cj { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::AsciiLetter; } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } match self.prev_byte { 0xA4..=0xC6 => { score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI); // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional); } _ => { score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI); } } if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else { match u { 0x3000 // Distinct from Korean, space | 0x3001 // Distinct from Korean, enumeration comma | 0x3002 // Distinct from Korean, full stop | 0xFF08 // Distinct from Korean, parenthesis | 0xFF09 // Distinct from Korean, parenthesis | 0xFF01 // Distinct from Japanese, exclamation | 0xFF0C // Distinct from Japanese, comma | 0xFF1B // Distinct from Japanese, semicolon | 0xFF1F // Distinct from Japanese, question => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // Not really needed for CJK distinction // but let's give non-zero score for these // common byte pairs anyway. score += CJ_PUNCTUATION; } 0..=0x7F => { self.pending_score = None; // Discard pending score } _ => { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += CJK_OTHER; } } self.prev = LatinCj::Other; } } else if written == 2 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } if dst[0] == 0xCA || dst[0] == 0xEA { score += CJK_OTHER; self.prev = LatinCj::Other; } else { debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880); score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI); if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } } match result { DecoderResult::InputEmpty => { assert_eq!(read, 1); } DecoderResult::Malformed(malformed_len, _) => { if self.prev_byte >= 0x81 && self.prev_byte <= 0xFE && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE)) { // The byte pair is in the Big5 range but unmapped. // Treat as PUA to avoid rejecting Big5-UAO, etc. // We don't reprocess `b` even if ASCII, since it's // logically part of the pair. 
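// For example, a pair whose lead and trail are in the ranges checked above but
// that has no mapping in the encoding (such as a Big5-UAO-only pair) ends up
// here: it takes BIG5_PUA_PENALTY below and is otherwise treated like a Hanzi.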
if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += BIG5_PUA_PENALTY; // Assume Hanzi semantics if self.prev == LatinCj::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinCj::Cj; } else if (self.prev_byte == 0xA0 || self.prev_byte == 0xFD || self.prev_byte == 0xFE) && (b < 0x80 || b == 0xFF) { // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte // followed by ASCII or a non-conflicting single-byte extension. self.pending_score = None; // Just in case score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY; if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { self.prev = LatinCj::AsciiLetter; } else if b == 0xFF { score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinCj::Other; } else { self.prev = LatinCj::Other; } } else if malformed_len == 1 && b == 0xFF { // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes self.pending_score = None; // Just in case score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinCj::Other; } else { return None; } } DecoderResult::OutputFull => { unreachable!(); } } self.prev_byte = b; } if last { let (result, _, _) = self .decoder .decode_to_utf16_without_replacement(b"", &mut dst, true); match result { DecoderResult::InputEmpty => {} DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { unreachable!(); } } } Some(score) } } struct EucKrCandidate { decoder: Decoder, prev_byte: u8, prev_was_euc_range: bool, prev: LatinKorean, current_word_len: u64, pending_score: Option, } impl EucKrCandidate { fn maybe_set_as_pending(&mut self, s: i64) -> i64 { assert!(self.pending_score.is_none()); if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) { s } else { self.pending_score = Some(s); 0 } } fn feed(&mut self, buffer: &[u8], last: bool) -> Option { let mut score = 0i64; let mut src = [0u8]; let mut dst = [0u16; 2]; for &b in buffer { let in_euc_range = b >= 0xA1 && b <= 0xFE; src[0] = b; let (result, read, written) = self .decoder .decode_to_utf16_without_replacement(&src, &mut dst, false); if written > 0 { let u = dst[0]; if (u >= u16::from(b'a') && u <= u16::from(b'z')) || (u >= u16::from(b'A') && u <= u16::from(b'Z')) { self.pending_score = None; // Discard pending score match self.prev { LatinKorean::Hangul | LatinKorean::Hanja => { score += CJK_LATIN_ADJACENCY_PENALTY; } _ => {} } self.prev = LatinKorean::AsciiLetter; self.current_word_len = 0; } else if u >= 0xAC00 && u <= 0xD7A3 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } if self.prev_was_euc_range && in_euc_range { score += EUC_KR_SCORE_PER_EUC_HANGUL; score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul); } else { score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL); } if self.prev == LatinKorean::AsciiLetter { score += CJK_LATIN_ADJACENCY_PENALTY; } self.prev = LatinKorean::Hangul; self.current_word_len += 1; if self.current_word_len > 5 { score += EUC_KR_LONG_WORD_PENALTY; } } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += EUC_KR_SCORE_PER_HANJA; match self.prev { LatinKorean::AsciiLetter => { score += CJK_LATIN_ADJACENCY_PENALTY; } LatinKorean::Hangul => { score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; } _ => {} } self.prev = LatinKorean::Hanja; self.current_word_len += 1; if self.current_word_len > 5 { score += 
EUC_KR_LONG_WORD_PENALTY; } } else { if u >= 0x80 { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } score += CJK_OTHER; } else { self.pending_score = None; // Discard pending score } self.prev = LatinKorean::Other; self.current_word_len = 0; } } match result { DecoderResult::InputEmpty => { assert_eq!(read, 1); } DecoderResult::Malformed(malformed_len, _) => { if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // The byte pair is in code page 949 EUDC range score += EUC_KR_PUA_PENALTY; // Assume Hanja semantics match self.prev { LatinKorean::AsciiLetter => { score += CJK_LATIN_ADJACENCY_PENALTY; } LatinKorean::Hangul => { score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY; } _ => {} } self.prev = LatinKorean::Hanja; self.current_word_len += 1; if self.current_word_len > 5 { score += EUC_KR_LONG_WORD_PENALTY; } } else if (self.prev_byte == 0xA1 || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8) || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD)) && (b >= 0x7B && b <= 0x7D) { if let Some(pending) = self.pending_score { score += pending; self.pending_score = None; } // MacKorean symbols in range not part of code page 949 score += EUC_KR_MAC_KOREAN_PENALTY; self.prev = LatinKorean::Other; self.current_word_len = 0; } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84) && (b <= 0x80 || b == 0xFF) { // MacKorean single-byte that conflicts with code page 949 lead byte // followed by ASCII or a non-conflicting single-byte extension. self.pending_score = None; // Just in case score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') { self.prev = LatinKorean::AsciiLetter; } else if b == 0x80 || b == 0xFF { score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinKorean::Other; } else { self.prev = LatinKorean::Other; } self.current_word_len = 0; } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) { // MacKorean single-byte extensions that don't conflict with lead bytes self.pending_score = None; // Just in case score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY; self.prev = LatinKorean::Other; self.current_word_len = 0; } else { return None; } } DecoderResult::OutputFull => { unreachable!(); } } self.prev_was_euc_range = in_euc_range; self.prev_byte = b; } if last { let (result, _, _) = self .decoder .decode_to_utf16_without_replacement(b"", &mut dst, true); match result { DecoderResult::InputEmpty => {} DecoderResult::Malformed(_, _) => { return None; } DecoderResult::OutputFull => { unreachable!(); } } } Some(score) } } enum InnerCandidate { Latin(LatinCandidate), NonLatinCased(NonLatinCasedCandidate), Caseless(CaselessCandidate), ArabicFrench(ArabicFrenchCandidate), Logical(LogicalCandidate), Visual(VisualCandidate), Utf8(Utf8Candidate), Iso2022(Iso2022Candidate), Shift(ShiftJisCandidate), EucJp(EucJpCandidate), EucKr(EucKrCandidate), Big5(Big5Candidate), Gbk(GbkCandidate), } impl InnerCandidate { fn feed(&mut self, buffer: &[u8], last: bool) -> Option { match self { InnerCandidate::Latin(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::NonLatinCased(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { 
Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::Caseless(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::ArabicFrench(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::Logical(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::Visual(c) => { if let Some(new_score) = c.feed(buffer) { if last { // Treat EOF as space-like if let Some(additional_score) = c.feed(b" ") { Some(new_score + additional_score) } else { None } } else { Some(new_score) } } else { None } } InnerCandidate::Utf8(c) => c.feed(buffer, last), InnerCandidate::Iso2022(c) => c.feed(buffer, last), InnerCandidate::Shift(c) => c.feed(buffer, last), InnerCandidate::EucJp(c) => c.feed(buffer, last), InnerCandidate::EucKr(c) => c.feed(buffer, last), InnerCandidate::Big5(c) => c.feed(buffer, last), InnerCandidate::Gbk(c) => c.feed(buffer, last), } } } fn encoding_for_tld(tld: Tld) -> usize { match tld { Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX, Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX, Tld::Generic | Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => { EncodingDetector::WESTERN_INDEX } Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX, Tld::Greek => EncodingDetector::GREEK_ISO_INDEX, Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX, Tld::Hebrew => EncodingDetector::LOGICAL_INDEX, Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX, Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX, Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX, Tld::Thai => EncodingDetector::THAI_INDEX, Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX, Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX, Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX, Tld::Korean => EncodingDetector::EUC_KR_INDEX, Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX, } } fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool { match tld { Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX, Tld::Cyrillic => { encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX || encoding == EncodingDetector::CYRILLIC_KOI_INDEX || encoding == EncodingDetector::CYRILLIC_IBM_INDEX || encoding == EncodingDetector::CYRILLIC_ISO_INDEX } Tld::Western => encoding == EncodingDetector::WESTERN_INDEX, Tld::Greek => { encoding == EncodingDetector::GREEK_WINDOWS_INDEX || encoding == EncodingDetector::GREEK_ISO_INDEX } Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX, Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX, Tld::Arabic => { encoding == EncodingDetector::ARABIC_WINDOWS_INDEX || encoding == EncodingDetector::ARABIC_ISO_INDEX } Tld::Baltic => { encoding == EncodingDetector::BALTIC_WINDOWS_INDEX || encoding == EncodingDetector::BALTIC_ISO13_INDEX || encoding == EncodingDetector::BALTIC_ISO4_INDEX } Tld::Vietnamese => encoding == 
EncodingDetector::VIETNAMESE_INDEX, Tld::Thai => encoding == EncodingDetector::THAI_INDEX, Tld::Simplified => encoding == EncodingDetector::GBK_INDEX, Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX, Tld::Japanese => { encoding == EncodingDetector::SHIFT_JIS_INDEX || encoding == EncodingDetector::EUC_JP_INDEX } Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX, Tld::SimplifiedTraditional | Tld::TraditionalSimplified => { encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX } Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX, Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX, Tld::WesternCyrillic => { encoding == EncodingDetector::WESTERN_INDEX || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX || encoding == EncodingDetector::CYRILLIC_KOI_INDEX || encoding == EncodingDetector::CYRILLIC_IBM_INDEX || encoding == EncodingDetector::CYRILLIC_ISO_INDEX } Tld::CentralCyrillic => { encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX || encoding == EncodingDetector::CENTRAL_ISO_INDEX || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX || encoding == EncodingDetector::CYRILLIC_KOI_INDEX || encoding == EncodingDetector::CYRILLIC_IBM_INDEX || encoding == EncodingDetector::CYRILLIC_ISO_INDEX } Tld::WesternArabic => { encoding == EncodingDetector::WESTERN_INDEX || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX || encoding == EncodingDetector::ARABIC_ISO_INDEX } Tld::Eu => { encoding == EncodingDetector::WESTERN_INDEX || encoding == EncodingDetector::ICELANDIC_INDEX || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX || encoding == EncodingDetector::CENTRAL_ISO_INDEX || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX || encoding == EncodingDetector::CYRILLIC_KOI_INDEX || encoding == EncodingDetector::CYRILLIC_IBM_INDEX || encoding == EncodingDetector::CYRILLIC_ISO_INDEX || encoding == EncodingDetector::GREEK_WINDOWS_INDEX || encoding == EncodingDetector::GREEK_ISO_INDEX || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX || encoding == EncodingDetector::BALTIC_ISO13_INDEX || encoding == EncodingDetector::BALTIC_ISO4_INDEX } Tld::Generic => false, } } fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 { if score < 1 { return 0; } // This is the most ad hoc part of this library. let (divisor, constant) = match tld { Tld::Generic => { unreachable!(); } Tld::CentralWindows | Tld::CentralIso => { match encoding { EncodingDetector::WESTERN_INDEX | EncodingDetector::ICELANDIC_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::VIETNAMESE_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Cyrillic => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::GREEK_WINDOWS_INDEX | EncodingDetector::GREEK_ISO_INDEX | EncodingDetector::VISUAL_INDEX | EncodingDetector::LOGICAL_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. 
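// Returning `score` here makes the adjustment equal to the candidate's whole
// score, which (per the "everything else scores zero" comment further down)
// effectively rules these encodings out for this TLD; the (divisor, constant)
// pairs below instead produce a milder, score-dependent reduction.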
return score; } _ => (50, 60), } } Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX | EncodingDetector::VIETNAMESE_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Greek => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::CYRILLIC_WINDOWS_INDEX | EncodingDetector::CYRILLIC_ISO_INDEX | EncodingDetector::CYRILLIC_KOI_INDEX | EncodingDetector::CYRILLIC_IBM_INDEX | EncodingDetector::VISUAL_INDEX | EncodingDetector::LOGICAL_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::TurkishAzeri => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::VIETNAMESE_INDEX | EncodingDetector::ICELANDIC_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Hebrew => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::CYRILLIC_WINDOWS_INDEX | EncodingDetector::CYRILLIC_ISO_INDEX | EncodingDetector::CYRILLIC_KOI_INDEX | EncodingDetector::CYRILLIC_IBM_INDEX | EncodingDetector::GREEK_WINDOWS_INDEX | EncodingDetector::GREEK_ISO_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::VIETNAMESE_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Arabic => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::EUC_KR_INDEX | EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::CYRILLIC_WINDOWS_INDEX | EncodingDetector::CYRILLIC_ISO_INDEX | EncodingDetector::CYRILLIC_KOI_INDEX | EncodingDetector::CYRILLIC_IBM_INDEX | EncodingDetector::GREEK_WINDOWS_INDEX | EncodingDetector::GREEK_ISO_INDEX | EncodingDetector::VISUAL_INDEX | EncodingDetector::LOGICAL_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::VIETNAMESE_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Baltic => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::ICELANDIC_INDEX | EncodingDetector::TURKISH_INDEX | EncodingDetector::VIETNAMESE_INDEX => { // XXX Tune this better instead of this kind of absolute. 
return score; } _ => (50, 60), } } Tld::Vietnamese => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX | EncodingDetector::ICELANDIC_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Thai => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::EUC_KR_INDEX | EncodingDetector::SHIFT_JIS_INDEX | EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::CYRILLIC_WINDOWS_INDEX | EncodingDetector::CYRILLIC_ISO_INDEX | EncodingDetector::CYRILLIC_KOI_INDEX | EncodingDetector::CYRILLIC_IBM_INDEX | EncodingDetector::GREEK_WINDOWS_INDEX | EncodingDetector::GREEK_ISO_INDEX | EncodingDetector::ARABIC_WINDOWS_INDEX | EncodingDetector::ARABIC_ISO_INDEX | EncodingDetector::VISUAL_INDEX | EncodingDetector::LOGICAL_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Simplified | Tld::Traditional | Tld::TraditionalSimplified | Tld::SimplifiedTraditional | Tld::Japanese | Tld::Korean => { // If TLD default is valid, everything else scores zero return score; } Tld::IcelandicFaroese => { match encoding { EncodingDetector::CENTRAL_WINDOWS_INDEX | EncodingDetector::CENTRAL_ISO_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX | EncodingDetector::VIETNAMESE_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::CentralCyrillic => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::GREEK_WINDOWS_INDEX | EncodingDetector::GREEK_ISO_INDEX | EncodingDetector::VISUAL_INDEX | EncodingDetector::LOGICAL_INDEX | EncodingDetector::BALTIC_WINDOWS_INDEX | EncodingDetector::BALTIC_ISO4_INDEX | EncodingDetector::BALTIC_ISO13_INDEX | EncodingDetector::TURKISH_INDEX => { // XXX Tune this better instead of this kind of absolute. return score; } _ => (50, 60), } } Tld::Eu => { match encoding { EncodingDetector::BIG5_INDEX | EncodingDetector::GBK_INDEX | EncodingDetector::EUC_JP_INDEX | EncodingDetector::TURKISH_INDEX | EncodingDetector::VIETNAMESE_INDEX => { // XXX Tune this better instead of this kind of absolute. 
return score; } _ => (50, 60), } } }; (score / divisor) + constant } struct Candidate { inner: InnerCandidate, score: Option<i64>, } impl Candidate { fn feed(&mut self, buffer: &[u8], last: bool) { if let Some(old_score) = self.score { if let Some(new_score) = self.inner.feed(buffer, last) { self.score = Some(old_score + new_score); } else { self.score = None; } } } fn new_latin(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::Latin(LatinCandidate::new(data)), score: Some(0), } } fn new_non_latin_cased(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)), score: Some(0), } } fn new_caseless(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::Caseless(CaselessCandidate::new(data)), score: Some(0), } } fn new_arabic_french(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)), score: Some(0), } } fn new_logical(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::Logical(LogicalCandidate::new(data)), score: Some(0), } } fn new_visual(data: &'static SingleByteData) -> Self { Candidate { inner: InnerCandidate::Visual(VisualCandidate::new(data)), score: Some(0), } } fn new_utf_8() -> Self { Candidate { inner: InnerCandidate::Utf8(Utf8Candidate { decoder: UTF_8.new_decoder_without_bom_handling(), }), score: Some(0), } } fn new_iso_2022_jp() -> Self { Candidate { inner: InnerCandidate::Iso2022(Iso2022Candidate { decoder: ISO_2022_JP.new_decoder_without_bom_handling(), }), score: Some(0), } } fn new_shift_jis() -> Self { Candidate { inner: InnerCandidate::Shift(ShiftJisCandidate { decoder: SHIFT_JIS.new_decoder_without_bom_handling(), half_width_katakana_seen: false, half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, prev: LatinCj::Other, prev_byte: 0, pending_score: None, }), score: Some(0), } } fn new_euc_jp() -> Self { Candidate { inner: InnerCandidate::EucJp(EucJpCandidate { decoder: EUC_JP.new_decoder_without_bom_handling(), non_ascii_seen: false, half_width_katakana_state: HalfWidthKatakana::DakutenForbidden, prev: LatinCj::Other, prev_byte: 0, prev_prev_byte: 0, }), score: Some(0), } } fn new_euc_kr() -> Self { Candidate { inner: InnerCandidate::EucKr(EucKrCandidate { decoder: EUC_KR.new_decoder_without_bom_handling(), prev_byte: 0, prev_was_euc_range: false, prev: LatinKorean::Other, current_word_len: 0, pending_score: None, }), score: Some(0), } } fn new_big5() -> Self { Candidate { inner: InnerCandidate::Big5(Big5Candidate { decoder: BIG5.new_decoder_without_bom_handling(), prev: LatinCj::Other, prev_byte: 0, pending_score: None, }), score: Some(0), } } fn new_gbk() -> Self { Candidate { inner: InnerCandidate::Gbk(GbkCandidate { decoder: GBK.new_decoder_without_bom_handling(), prev: LatinCj::Other, prev_byte: 0, pending_score: None, }), score: Some(0), } } fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> { match &self.inner { InnerCandidate::NonLatinCased(c) => { if c.longest_word < 2 { return None; } } InnerCandidate::Caseless(c) => { if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { return None; } } InnerCandidate::ArabicFrench(c) => { if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { return None; } } InnerCandidate::Logical(c) => { if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) { return None; } } InnerCandidate::Visual(c) => { if c.longest_word < 2 &&
!encoding_is_native_to_tld(tld, encoding) { return None; } } _ => {} } if tld == Tld::Generic { return self.score; } if let Some(score) = self.score { if encoding == encoding_for_tld(tld) { return Some(score + 1); } if encoding_is_native_to_tld(tld, encoding) { return Some(score); } if expectation_is_valid { return Some(score - score_adjustment(score, encoding, tld)); } // If expectation is no longer valid, fall back to // generic behavior. // XXX Flipped Chinese and Central return Some(score); } None } fn plausible_punctuation(&self) -> u64 { match &self.inner { InnerCandidate::Logical(c) => { return c.plausible_punctuation; } InnerCandidate::Visual(c) => { return c.plausible_punctuation; } _ => { unreachable!(); } } } fn encoding(&self) -> &'static Encoding { match &self.inner { InnerCandidate::Latin(c) => { return c.data.encoding; } InnerCandidate::NonLatinCased(c) => { return c.data.encoding; } InnerCandidate::Caseless(c) => { return c.data.encoding; } InnerCandidate::ArabicFrench(c) => { return c.data.encoding; } InnerCandidate::Logical(c) => { return c.data.encoding; } InnerCandidate::Visual(c) => { return c.data.encoding; } InnerCandidate::Shift(_) => { return SHIFT_JIS; } InnerCandidate::EucJp(_) => { return EUC_JP; } InnerCandidate::Big5(_) => { return BIG5; } InnerCandidate::EucKr(_) => { return EUC_KR; } InnerCandidate::Gbk(_) => { return GBK; } InnerCandidate::Utf8(_) => { return UTF_8; } InnerCandidate::Iso2022(_) => { return ISO_2022_JP; } } } } fn count_non_ascii(buffer: &[u8]) -> u64 { let mut count = 0; for &b in buffer { if b >= 0x80 { count += 1; } } count } #[derive(Clone, Copy)] enum BeforeNonAscii { None, One([u8; 1]), Two([u8; 2]), } impl BeforeNonAscii { fn as_slice(&self) -> &[u8] { match self { BeforeNonAscii::None => b"", BeforeNonAscii::One(arr) => &arr[..], BeforeNonAscii::Two(arr) => &arr[..], } } fn push(&mut self, buffer: &[u8]) { let len = buffer.len(); if len >= 2 { let arr = [buffer[len - 2], buffer[len - 1]]; *self = BeforeNonAscii::Two(arr); } else if len == 1 { match self { BeforeNonAscii::None => { let arr = [buffer[0]]; *self = BeforeNonAscii::One(arr); } BeforeNonAscii::One(first) => { let arr = [first[0], buffer[0]]; *self = BeforeNonAscii::Two(arr); } BeforeNonAscii::Two(first) => { let arr = [first[1], buffer[0]]; *self = BeforeNonAscii::Two(arr); } } } } } /// A Web browser-oriented detector for guessing what character /// encoding a stream of bytes is encoded in. /// /// The bytes are fed to the detector incrementally using the `feed` /// method. The current guess of the detector can be queried using /// the `guess` method. The guessing parameters are arguments to the /// `guess` method rather than arguments to the constructor in order /// to enable the application to check if the arguments affect the /// guessing outcome. (The specific use case is to disable UI for /// re-running the detector with UTF-8 allowed and the top-level /// domain name ignored if those arguments don't change the guess.) pub struct EncodingDetector { candidates: [Candidate; 27], non_ascii_seen: u64, // We need to feed up to two bytes of context before non-ASCII // thanks to Spanish n.º. last_before_non_ascii: BeforeNonAscii, esc_seen: bool, closed: bool, } impl EncodingDetector { fn feed_impl(&mut self, buffer: &[u8], last: bool) { for candidate in self.candidates.iter_mut() { candidate.feed(buffer, last); } self.non_ascii_seen += count_non_ascii(buffer); } /// Inform the detector of a chunk of input. 
/// /// The byte stream is represented as a sequence of calls to this /// method such that the concatenation of the arguments to this /// method forms the byte stream. It does not matter how the application /// chooses to chunk the stream. It is OK to call this method with /// a zero-length byte slice. /// /// The end of the stream is indicated by calling this method with /// `last` set to `true`. In that case, the end of the stream is /// considered to occur after the last byte of the `buffer` (which /// may be zero-length) passed in the same call. Once this method /// has been called with `last` set to `true`, this method must not /// be called again. /// /// If you want to perform detection on just the prefix of a longer /// stream, do not pass `last=true` after the prefix if the stream /// actually still continues. /// /// Returns `true` if after processing `buffer` the stream has /// contained at least one non-ASCII byte and `false` if only /// ASCII has been seen so far. /// /// # Panics /// /// If this method has previously been called with `last` set to `true`. pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool { assert!( !self.closed, "Must not feed again after feeding with last equaling true." ); if last { self.closed = true; } let start = if self.non_ascii_seen == 0 && !self.esc_seen { let up_to = Encoding::ascii_valid_up_to(buffer); let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) { self.esc_seen = true; escape } else { up_to }; if start == buffer.len() { self.last_before_non_ascii.push(buffer); return self.non_ascii_seen != 0; } if start == 0 || start == 1 { let last_before = self.last_before_non_ascii; self.last_before_non_ascii = BeforeNonAscii::None; self.feed_impl(last_before.as_slice(), false); 0 } else { start - 2 } } else { 0 }; self.feed_impl(&buffer[start..], last); self.non_ascii_seen != 0 } /// Guess the encoding given the bytes pushed to the detector so far /// (via `feed()`), the top-level domain name from which the bytes were /// loaded, and an indication of whether to consider UTF-8 as a permissible /// guess. /// /// The `tld` argument takes the rightmost DNS label of the hostname of the /// host the stream was loaded from in lower-case ASCII form. That is, if /// the label is an internationalized top-level domain name, it must be /// provided in its Punycode form. If the TLD that the stream was loaded /// from is unavailable, `None` may be passed instead, which is equivalent /// to passing `Some(b"com")`. /// /// If the `allow_utf8` argument is set to `false`, the return value of /// this method won't be `encoding_rs::UTF_8`. When performing detection /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`, /// unless the user has taken a specific contextual action to request an /// override. This way, Web developers cannot start depending on UTF-8 /// detection. Such reliance would make the Web Platform more brittle. /// /// Returns the guessed encoding. /// /// # Panics /// /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic /// condition is intentionally limited to signs of failing to extract the /// label correctly, failing to provide it in its Punycode form, and failing /// to lower-case it. Full DNS label validation is intentionally not performed /// to avoid panics when the reality doesn't match the specs.)
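///
/// # Example
///
/// A minimal sketch of typical use: feed the stream in arbitrary chunks
/// (only the final call passes `last = true`), then ask for a guess with
/// no TLD hint and UTF-8 disallowed. The sample bytes mirror the crate's
/// own `test_de` test data, so windows-1252 is the expected guess; treat
/// the assertion as illustrative rather than as a tuning guarantee.
///
/// ```
/// use chardetng::EncodingDetector;
///
/// let mut det = EncodingDetector::new();
/// // "Straße" encoded as windows-1252, fed in two chunks.
/// det.feed(b"Stra", false);
/// det.feed(b"\xDFe", true);
/// let enc = det.guess(None, false);
/// assert_eq!(enc, encoding_rs::WINDOWS_1252);
/// ```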
pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding { let mut tld_type = tld.map_or(Tld::Generic, |tld| { assert!(!contains_upper_case_period_or_non_ascii(tld)); classify_tld(tld) }); if self.non_ascii_seen == 0 && self.esc_seen && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some() { return ISO_2022_JP; } if self.candidates[Self::UTF_8_INDEX].score.is_some() { if allow_utf8 { return UTF_8; } // Various test cases that prohibit UTF-8 detection want to // see windows-1252 specifically. These tests run on generic // domains. However, if we returned windows-1252 on // some non-generic domains, we'd cause reloads. return self.candidates[encoding_for_tld(tld_type)].encoding(); } let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding(); let mut max = 0i64; let mut expectation_is_valid = false; if tld_type != Tld::Generic { for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() { expectation_is_valid = true; break; } } } if !expectation_is_valid { // Flip Chinese and Central around match tld_type { Tld::Simplified => { if self.candidates[Self::BIG5_INDEX].score.is_some() { tld_type = Tld::Traditional; expectation_is_valid = true; } } Tld::Traditional => { if self.candidates[Self::GBK_INDEX].score.is_some() { tld_type = Tld::Simplified; expectation_is_valid = true; } } Tld::CentralWindows => { if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() { tld_type = Tld::CentralIso; expectation_is_valid = true; } } Tld::CentralIso => { if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() { tld_type = Tld::CentralWindows; expectation_is_valid = true; } } _ => {} } } for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) { if score > max { max = score; encoding = candidate.encoding(); } } } let visual = &self.candidates[Self::VISUAL_INDEX]; if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid) { if (visual_score > max || encoding == WINDOWS_1255) && visual.plausible_punctuation() > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation() { // max = visual_score; encoding = ISO_8859_8; } } encoding } // XXX Test-only API #[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")] pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> { let mut tld_type = Tld::Generic; let mut expectation_is_valid = false; if tld_type != Tld::Generic { for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) { if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() { expectation_is_valid = true; break; } } } if !expectation_is_valid { // Flip Chinese and Central around match tld_type { Tld::Simplified => { if self.candidates[Self::BIG5_INDEX].score.is_some() { tld_type = Tld::Traditional; expectation_is_valid = true; } } Tld::Traditional => { if self.candidates[Self::GBK_INDEX].score.is_some() { tld_type = Tld::Simplified; expectation_is_valid = true; } } Tld::CentralWindows => { if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() { tld_type = Tld::CentralIso; expectation_is_valid = true; } } Tld::CentralIso => { if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() { tld_type = Tld::CentralWindows; expectation_is_valid = true; } } _ => {} } } for (i, candidate) in self.candidates.iter().enumerate() { if encoding == candidate.encoding() { return
candidate.score(i, tld_type, expectation_is_valid); } } Some(0) } const FIRST_NORMAL: usize = 3; const UTF_8_INDEX: usize = 0; const ISO_2022_JP_INDEX: usize = 1; const VISUAL_INDEX: usize = 2; const GBK_INDEX: usize = 3; const EUC_JP_INDEX: usize = 4; const EUC_KR_INDEX: usize = 5; const SHIFT_JIS_INDEX: usize = 6; const BIG5_INDEX: usize = 7; const WESTERN_INDEX: usize = 8; const CYRILLIC_WINDOWS_INDEX: usize = 9; const CENTRAL_WINDOWS_INDEX: usize = 10; const CENTRAL_ISO_INDEX: usize = 11; const ARABIC_WINDOWS_INDEX: usize = 12; const ICELANDIC_INDEX: usize = 13; const TURKISH_INDEX: usize = 14; const THAI_INDEX: usize = 15; const LOGICAL_INDEX: usize = 16; const GREEK_WINDOWS_INDEX: usize = 17; const GREEK_ISO_INDEX: usize = 18; const BALTIC_WINDOWS_INDEX: usize = 19; const BALTIC_ISO13_INDEX: usize = 20; const CYRILLIC_KOI_INDEX: usize = 21; const CYRILLIC_IBM_INDEX: usize = 22; const ARABIC_ISO_INDEX: usize = 23; const VIETNAMESE_INDEX: usize = 24; const BALTIC_ISO4_INDEX: usize = 25; const CYRILLIC_ISO_INDEX: usize = 26; /// Creates a new instance of the detector. pub fn new() -> Self { EncodingDetector { candidates: [ Candidate::new_utf_8(), // 0 Candidate::new_iso_2022_jp(), // 1 Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]), // 2 Candidate::new_gbk(), // 3 Candidate::new_euc_jp(), // 4 Candidate::new_euc_kr(), // 5 Candidate::new_shift_jis(), // 6 Candidate::new_big5(), // 7 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]), // 8 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]), // 10 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]), // 11 Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]), // 12 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]), // 14 Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]), // 15 Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]), // 16 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]), // 18 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]), // 19 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]), // 20 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]), // 21 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]), // 22 Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]), // 23 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]), // 24 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]), // 25 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]), // 26 ], non_ascii_seen: 0, last_before_non_ascii: BeforeNonAscii::None, esc_seen: false, closed: false, } } /// Queries whether the TLD is considered non-generic and could affect the guess. 
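///
/// A brief sketch of the intended check, e.g. for deciding whether a
/// "re-run detection ignoring the domain" UI action could possibly change
/// anything. `b"com"` is the generic baseline mentioned in the `guess`
/// documentation; treating `b"ru"` as a language-affiliated TLD is an
/// assumption about the internal TLD table.
///
/// ```
/// use chardetng::EncodingDetector;
///
/// // The generic baseline TLD cannot affect the guess.
/// assert!(!EncodingDetector::tld_may_affect_guess(Some(b"com")));
/// // Not knowing the TLD at all is treated the same way.
/// assert!(!EncodingDetector::tld_may_affect_guess(None));
/// // A language-affiliated TLD (assumed here: .ru) may affect the guess.
/// assert!(EncodingDetector::tld_may_affect_guess(Some(b"ru")));
/// ```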
pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool { if let Some(tld) = tld { classify_tld(tld) != Tld::Generic } else { false } } } #[cfg(test)] mod tests { use super::*; use detone::IterDecomposeVietnamese; use encoding_rs::IBM866; use encoding_rs::ISO_8859_2; use encoding_rs::ISO_8859_4; use encoding_rs::ISO_8859_5; use encoding_rs::ISO_8859_6; use encoding_rs::ISO_8859_7; use encoding_rs::KOI8_U; use encoding_rs::WINDOWS_1250; use encoding_rs::WINDOWS_1251; use encoding_rs::WINDOWS_1252; use encoding_rs::WINDOWS_1253; use encoding_rs::WINDOWS_1254; use encoding_rs::WINDOWS_1256; use encoding_rs::WINDOWS_1257; use encoding_rs::WINDOWS_1258; use encoding_rs::WINDOWS_874; fn check_bytes(bytes: &[u8], encoding: &'static Encoding) { let mut det = EncodingDetector::new(); det.feed(bytes, true); let enc = det.guess(None, false); let (decoded, _) = enc.decode_without_bom_handling(bytes); println!("{:?}", decoded); assert_eq!(enc, encoding); } fn check(input: &str, encoding: &'static Encoding) { let orthographic; let (bytes, _, _) = if encoding == WINDOWS_1258 { orthographic = input .chars() .decompose_vietnamese_tones(true) .collect::<String>(); encoding.encode(&orthographic) } else { encoding.encode(input) }; check_bytes(&bytes, encoding); } #[test] fn test_i_apostrophe() { let mut det = EncodingDetector::new(); det.feed(b"I\x92", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_streaming_numero_one_by_one() { let mut det = EncodingDetector::new(); det.feed(b"n", false); det.feed(b".", false); det.feed(b"\xBA", false); det.feed(b"1", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_streaming_numero_two_together() { let mut det = EncodingDetector::new(); det.feed(b"n.", false); det.feed(b"\xBA", false); det.feed(b"1", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_streaming_numero_one_by_one_extra_before() { let mut det = EncodingDetector::new(); det.feed(b" n", false); det.feed(b".", false); det.feed(b"\xBA", false); det.feed(b"1", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_streaming_numero_one_before() { let mut det = EncodingDetector::new(); det.feed(b"n", false); det.feed(b".\xBA", false); det.feed(b"1", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_streaming_numero_longer_first_buffer() { let mut det = EncodingDetector::new(); det.feed(b"rrn.", false); det.feed(b"\xBA", false); det.feed(b"1", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); } #[test] fn test_empty() { let mut det = EncodingDetector::new(); let seen_non_ascii = det.feed(b"", true); let enc = det.guess(None, false); assert_eq!(enc, WINDOWS_1252); assert!(!seen_non_ascii); } #[test] fn test_fi() { check("Ääni", WINDOWS_1252); } #[test] fn test_fi_bis() { check("Tämä", WINDOWS_1252); } #[test] fn test_pt() { check( "Este é um teste de codificação de caracteres.", WINDOWS_1252, ); } #[test] fn test_is() { check("Þetta er kóðunarpróf á staf. 
Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252); } #[test] fn test_ru_short() { check("Русский", WINDOWS_1251); } #[test] fn test_ru() { check("Это тест кодировки символов.", WINDOWS_1251); } #[test] fn test_ru_iso() { check("Это тест кодировки символов.", ISO_8859_5); } #[test] fn test_ru_ibm() { check("Это тест кодировки символов.", IBM866); } #[test] fn test_ru_koi() { check("Это тест кодировки символов.", KOI8_U); } #[test] fn test_uk() { check("Це тест на кодування символів.", WINDOWS_1251); } #[test] fn test_uk_koi() { check("Це тест на кодування символів.", KOI8_U); } #[test] fn test_el_short() { check("Ελληνικά", WINDOWS_1253); } #[test] fn test_el() { check( "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", WINDOWS_1253, ); } #[test] fn test_el_iso() { check( "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης", ISO_8859_7, ); } #[test] fn test_de() { check("Straße", WINDOWS_1252); } #[test] fn test_he() { check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255); } #[test] fn test_2022() { check("日本語", ISO_2022_JP); } #[test] fn test_th() { check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874); } #[test] fn test_vi() { check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258); } #[test] fn test_tr() { check("Bu bir karakter kodlama testidir. Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254); } #[test] fn test_simplified() { check("这是一个字符编码测试。", GBK); } #[test] fn test_traditional() { check("這是一個字符編碼測試。", BIG5); } #[test] fn test_ko() { check("이것은 문자 인코딩 테스트입니다.", EUC_KR); } #[test] fn test_shift() { check("これは文字実験です。", SHIFT_JIS); } #[test] fn test_euc() { check("これは文字実験です。", EUC_JP); } #[test] fn test_ar() { check("هذا هو اختبار ترميز الأحرف.", WINDOWS_1256); } #[test] fn test_ar_iso() { check("هذا هو اختبار ترميز الأحرف.", ISO_8859_6); } #[test] fn test_fa() { check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256); } #[test] fn test_visual() { check(".םיוות דודיק ןחבמ והז", ISO_8859_8); } #[test] fn test_yi() { check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255); } #[test] fn test_it() { check("è", WINDOWS_1252); } #[test] fn test_en() { check("isn’t", WINDOWS_1252); } #[test] fn test_en_bis() { check("Rock ’n Roll", WINDOWS_1252); } #[test] fn test_ca() { check("Codificació de caràcters", WINDOWS_1252); } #[test] fn test_et() { check("või", WINDOWS_1252); } #[test] fn test_pl_iso() { check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", ISO_8859_2); } #[test] fn test_pl() { check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", WINDOWS_1250); } #[test] fn test_lt() { check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", WINDOWS_1257); } // TODO: Detected as ISO-8859-2. // #[test] // fn test_lt_windows_iso_8859_4() { // check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", ISO_8859_4); // } #[test] fn test_lv() { check("Šis ir rakstzīmju kodēšanas tests. 
Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", WINDOWS_1257); } #[test] fn test_lv_iso_8859_4() { check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", ISO_8859_4); } #[test] fn test_a0() { // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0? check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252); } #[test] fn test_a0a0() { // Test that this isn't GBK or EUC-KR. check("\u{A0}\u{A0}", WINDOWS_1252); } #[test] fn test_space_copyright_space() { check(" © ", WINDOWS_1252); } #[test] fn test_space_masculine_space() { check(" º ", WINDOWS_1252); } #[test] fn test_space_feminine_space() { check(" ª ", WINDOWS_1252); } #[test] fn test_period_masculine_space() { check(".º ", WINDOWS_1252); } #[test] fn test_period_feminine_space() { check(".ª ", WINDOWS_1252); } #[test] fn test_maria() { check(" Mª ", WINDOWS_1252); } #[test] fn test_dona() { check(" Dª ", WINDOWS_1252); } #[test] fn test_nuestra() { check(" Nª ", WINDOWS_1252); } #[test] fn test_senora() { check(" Sª ", WINDOWS_1252); } #[test] fn test_digit_feminine() { check(" 42ª ", WINDOWS_1252); } #[test] fn test_digit_masculine() { check(" 42º ", WINDOWS_1252); } #[test] fn test_roman_feminine() { check(" XIVª ", WINDOWS_1252); } #[test] fn test_roman_masculine() { check(" XIVº ", WINDOWS_1252); } #[test] fn test_numero_uno() { check("Nº1", WINDOWS_1252); } #[test] fn test_numero() { check("Nº", WINDOWS_1252); } #[test] fn test_euro() { check(" €9", WINDOWS_1252); } #[test] fn test_shift_jis_half_width_katakana() { check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS); } #[test] fn test_big5_pua() { let mut v = Vec::new(); for _ in 0..40 { v.extend_from_slice(b"\xA4\x40"); } v.extend_from_slice(b"\x81\x40\xA4\x40"); check_bytes(&v, BIG5); } #[test] fn test_big5_single_byte_a0() { let mut v = Vec::new(); for _ in 0..80 { v.extend_from_slice(b"\xA4\x40"); } v.extend_from_slice(b"\x81\x40\xA0 "); check_bytes(&v, BIG5); } #[test] fn test_big5_single_byte_ff() { let mut v = Vec::new(); for _ in 0..80 { v.extend_from_slice(b"\xA4\x40"); } v.extend_from_slice(b"\x81\x40\xFF "); check_bytes(&v, BIG5); } #[test] fn test_not_big5() { let mut v = Vec::new(); for _ in 0..40 { v.extend_from_slice(b"\xA4\x40"); } v.extend_from_slice(b"\x81\x40\xA0\xA0"); check_bytes(&v, IBM866); } #[test] fn test_euc_kr_pua() { let mut v = Vec::new(); v.extend_from_slice(b"\xC9\xA1\xB0\xA1 "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); } check_bytes(&v, EUC_KR); } #[test] fn test_euc_kr_pua_bis() { let mut v = Vec::new(); v.extend_from_slice(b"\xFE\xA1\xB0\xA1 "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); } check_bytes(&v, EUC_KR); } #[test] fn test_euc_kr_single_byte_ff() { let mut v = Vec::new(); v.extend_from_slice(b"\xFF "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); } check_bytes(&v, EUC_KR); } #[test] fn test_euc_kr_single_byte_81() { let mut v = Vec::new(); v.extend_from_slice(b"\x81 "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); } check_bytes(&v, EUC_KR); } #[test] fn test_euc_kr_single_byte_84() { let mut v = Vec::new(); v.extend_from_slice(b"\x84 "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. 
"); } check_bytes(&v, EUC_KR); } #[test] fn test_not_euc_kr() { let mut v = Vec::new(); v.extend_from_slice(b"\xC9\xA0\xB0\xA1 "); for _ in 0..40 { v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. "); } check_bytes(&v, GBK); } #[test] fn test_shift_jis_x0213() { let mut v = Vec::new(); v.extend_from_slice(b"\x87\xE5"); for _ in 0..40 { v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); } check_bytes(&v, SHIFT_JIS); } #[test] fn test_shift_jis_single_byte_fd() { let mut v = Vec::new(); v.extend_from_slice(b"\xFD"); for _ in 0..40 { v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); } check_bytes(&v, SHIFT_JIS); } #[test] fn test_not_shift_jis() { let mut v = Vec::new(); v.extend_from_slice(b"\x84\xE0"); for _ in 0..40 { v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); } check_bytes(&v, GBK); } #[test] fn test_not_shift_jis_bis() { let mut v = Vec::new(); v.extend_from_slice(b"\x87\x7D"); for _ in 0..40 { v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2"); } check_bytes(&v, GBK); } #[test] fn test_euc_jp_x0213() { let mut v = Vec::new(); v.extend_from_slice(b"\xAD\xBF"); for _ in 0..80 { v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); } check_bytes(&v, EUC_JP); } #[test] fn test_euc_jp_x0213_other_plane() { let mut v = Vec::new(); v.extend_from_slice(b"\x8F\xFE\xF6"); for _ in 0..80 { v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); } check_bytes(&v, EUC_JP); } #[test] fn test_not_euc_jp() { let mut v = Vec::new(); v.extend_from_slice(b"\x8F\xFE\xF7"); for _ in 0..80 { v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); } check_bytes(&v, WINDOWS_1252); } #[test] fn test_not_euc_jp_bis() { let mut v = Vec::new(); v.extend_from_slice(b"\xA8\xDF"); for _ in 0..80 { v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4"); } check_bytes(&v, BIG5); } #[test] fn test_gbk_single_byte_ff() { let mut v = Vec::new(); v.extend_from_slice(b"\xFF"); for _ in 0..80 { v.extend_from_slice(b"\xB5\xC4"); } check_bytes(&v, GBK); } #[test] fn test_gbk_single_byte_a0() { let mut v = Vec::new(); v.extend_from_slice(b"\xA0 "); for _ in 0..80 { v.extend_from_slice(b"\xB5\xC4"); } check_bytes(&v, GBK); } #[test] fn test_gbk_single_byte_fe() { let mut v = Vec::new(); v.extend_from_slice(b"\xFE "); for _ in 0..80 { v.extend_from_slice(b"\xB5\xC4"); } check_bytes(&v, GBK); } #[test] fn test_not_gbk_single_byte_fc() { let mut v = Vec::new(); v.extend_from_slice(b"\xFC "); for _ in 0..80 { v.extend_from_slice(b"\xB5\xC4"); } check_bytes(&v, ISO_8859_5); } }