diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/shift_or_euc/src | |
parent | Initial commit. (diff) | |
download | firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip |
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/shift_or_euc/src')
-rw-r--r-- | third_party/rust/shift_or_euc/src/lib.rs | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/third_party/rust/shift_or_euc/src/lib.rs b/third_party/rust/shift_or_euc/src/lib.rs new file mode 100644 index 0000000000..978fc7f27e --- /dev/null +++ b/third_party/rust/shift_or_euc/src/lib.rs @@ -0,0 +1,278 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![doc(html_root_url = "https://docs.rs/shift_or_euc/0.1.0")] + +//! A Japanese legacy encoding detector for detecting between Shift_JIS, +//! EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the +//! encoding is one of those. +//! +//! This detector is generally more accurate (but see below about the failure +//! mode on half-width katakana) and decides much sooner than machine +//! learning-based detectors. To decide EUC-JP, machine learning-based +//! detectors try to gain confidence that the input looks like EUC-JP. To +//! decide EUC-JP, this detector instead looks for two simple rule-based +//! signs of the input not being Shift_JIS. +//! +//! As a consequence of not containing machine learning tables, the binary +//! size footprint that this crate adds on top of +//! [`encoding_rs`](https://docs.rs/crate/encoding_rs) is tiny. +//! +//! # Licensing +//! +//! See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT). +//! +//! # Principle of Operation +//! +//! The detector is based on two observations: +//! +//! 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or +//! EUC-JP, so encountering such an escape sequence (before non-ASCII has been +//! encountered) can be taken as indication of ISO-2022-JP. +//! 2. When normal (full-with) kana or common kanji encoded as Shift_JIS is +//! decoded as EUC-JP, or vice versa, the result is either an error or +//! half-width katakana, and it's very uncommon for Japanese HTML to have +//! half-width katakana character before a normal kana or common kanji +//! character. Therefore, if decoding as Shift_JIS results in error or +//! have-width katakana, the detector decides that the content is EUC-JP, and +//! vice versa. +//! +//! # Failure Modes +//! +//! The detector gives the wrong answer if the text has a half-width katakana +//! character before normal kana or common kanji. Some uncommon kanji are +//! undecidable. (All JIS X 0208 Level 1 kanji are decidable.) +//! +//! The half-width katakana issue is mainly relevant for old 8-bit JIS X +//! 0201-only text files that would decode correctly as Shift_JIS but that the +//! detector detects as EUC-JP. +//! +//! The undecidable kanji issue does not realistically show up when a full +//! document is fed to the detector, because, realistically, in a full +//! document, there is at least one kana or common kanji. It can occur, +//! though, if the detector is only run on a prefix of a document and the +//! prefix only contains the title of the document. It is possible for +//! document title to consist entirely of undecidable kanji. (Indeed, +//! Japanese Wikipedia has articles with such titles.) If the detector is +//! undecided, falling back to Shift_JIS is typically the Web oriented better +//! guess. + +use encoding_rs::Decoder; +use encoding_rs::DecoderResult; +use encoding_rs::Encoding; +use encoding_rs::EUC_JP; +use encoding_rs::ISO_2022_JP; +use encoding_rs::SHIFT_JIS; + +/// Returns the index of the first non-ASCII byte or the first +/// 0x1B, whichever comes first, or the length of the buffer +/// if neither is found. +fn find_non_ascii_or_escape(buffer: &[u8]) -> usize { + let ascii_up_to = Encoding::ascii_valid_up_to(buffer); + if let Some(escape) = memchr::memchr(0x1B, &buffer[..ascii_up_to]) { + escape + } else { + ascii_up_to + } +} + +/// Feed decoder with one byte (if `last` is `false`) or EOF (if `last` is +/// `true`). `byte` is ignored if `last` is `true`. +/// Returns `true` if there was no rejection or `false` upon rejecting the +/// encoding hypothesis represented by this decoder. +#[inline(always)] +fn feed_decoder(decoder: &mut Decoder, byte: u8, last: bool) -> bool { + let mut output = [0u16; 1]; + let input = [byte]; + let (result, _read, written) = decoder.decode_to_utf16_without_replacement( + if last { b"" } else { &input }, + &mut output, + last, + ); + match result { + DecoderResult::InputEmpty => { + if written == 1 { + match output[0] { + 0xFF61...0xFF9F => { + return false; + } + _ => {} + } + } + } + DecoderResult::Malformed(_, _) => { + return false; + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + true +} + +/// A detector for detecting the character encoding of input on the +/// precondition that the encoding is a Japanese legacy encoding. +pub struct Detector { + shift_jis_decoder: Decoder, + euc_jp_decoder: Decoder, + second_byte_in_escape: u8, + iso_2022_jp_disqualified: bool, + escape_seen: bool, + finished: bool, +} + +impl Detector { + /// Instantiates the detector. If `allow_2022` is `true` the possible + /// guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If + /// `allow_2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, + /// and undecided. + pub fn new(allow_2022: bool) -> Self { + Detector { + shift_jis_decoder: SHIFT_JIS.new_decoder_without_bom_handling(), + euc_jp_decoder: EUC_JP.new_decoder_without_bom_handling(), + second_byte_in_escape: 0, + iso_2022_jp_disqualified: !allow_2022, + escape_seen: false, + finished: false, + } + } + + /// Feeds bytes to the detector. If `last` is `true` the end of the stream + /// is considered to occur immediately after the end of `buffer`. + /// Otherwise, the stream is expected to continue. `buffer` may be empty. + /// + /// If you're running the detector only on a prefix of a complete + /// document, _do not_ pass `last` as `true` after the prefix if the + /// stream as a whole still contains more content. + /// + /// Returns `Some(encoding_rs::SHIFT_JIS)` if the detector guessed + /// Shift_JIS. Returns `Some(encoding_rs::EUC_JP)` if the detector + /// guessed EUC-JP. Returns `Some(encoding_rs::ISO_2022_JP)` if the + /// detector guessed ISO-2022-JP (only possible if `true` was passed as + /// `allow_2022` when instantiating the detector). Returns `None` if the + /// detector is undecided. If `None` is returned even when passing `true` + /// as `last`, falling back to Shift_JIS is the best guess for Web + /// purposes. + /// + /// Do not call again after the method has returned `Some(_)` or after + /// the method has been called with `true` as `last`. + /// + /// # Panics + /// + /// If called after the method has returned `Some(_)` or after the method + /// has been called with `true` as `last`. + pub fn feed(&mut self, buffer: &[u8], last: bool) -> Option<&'static Encoding> { + assert!( + !self.finished, + "Tried to used a detector that has finished." + ); + self.finished = true; // Will change back to false unless we return early + let mut i = 0; + if !self.iso_2022_jp_disqualified { + if !self.escape_seen { + i = find_non_ascii_or_escape(buffer); + } + while i < buffer.len() { + let byte = buffer[i]; + if byte > 0x7F { + self.iso_2022_jp_disqualified = true; + break; + } + if !self.escape_seen && byte == 0x1B { + self.escape_seen = true; + i += 1; + continue; + } + if self.escape_seen && self.second_byte_in_escape == 0 { + self.second_byte_in_escape = byte; + i += 1; + continue; + } + match (self.second_byte_in_escape, byte) { + (0x28, 0x42) | (0x28, 0x4A) | (0x28, 0x49) | (0x24, 0x40) | (0x24, 0x42) => { + return Some(ISO_2022_JP); + } + _ => {} + } + if self.escape_seen { + self.iso_2022_jp_disqualified = true; + break; + } + i += 1; + } + } + for &byte in &buffer[i..] { + if !feed_decoder(&mut self.euc_jp_decoder, byte, false) { + return Some(SHIFT_JIS); + } + if !feed_decoder(&mut self.shift_jis_decoder, byte, false) { + return Some(EUC_JP); + } + } + if last { + if !feed_decoder(&mut self.euc_jp_decoder, 0, true) { + return Some(SHIFT_JIS); + } + if !feed_decoder(&mut self.shift_jis_decoder, 0, true) { + return Some(EUC_JP); + } + return None; + } + self.finished = false; + None + } +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_iso_2022_jp() { + let mut detector = Detector::new(true); + assert_eq!( + detector.feed(b"abc\x1B\x28\x42\xFF", true), + Some(ISO_2022_JP) + ); + } + + #[test] + fn test_error_precedence() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xFF", true), Some(SHIFT_JIS)); + } + + #[test] + fn test_invalid_euc_jp() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\x81\x40", true), Some(SHIFT_JIS)); + } + + #[test] + fn test_invalid_shift_jis() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xEB\xA8", true), Some(EUC_JP)); + } + + #[test] + fn test_invalid_shift_jis_before_invalid_euc_jp() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xEB\xA8\x81\x40", true), Some(EUC_JP)); + } + + #[test] + fn test_undecided() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc", false), None); + assert_eq!(detector.feed(b"abc", false), None); + } + +} |