summaryrefslogtreecommitdiffstats
path: root/intl/JapaneseDetector.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
commit2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
treeb80bf8bf13c3766139fbacc530efd0dd9d54394c /intl/JapaneseDetector.h
parentInitial commit. (diff)
downloadfirefox-upstream.tar.xz
firefox-upstream.zip
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/JapaneseDetector.h')
-rw-r--r--intl/JapaneseDetector.h124
1 files changed, 124 insertions, 0 deletions
diff --git a/intl/JapaneseDetector.h b/intl/JapaneseDetector.h
new file mode 100644
index 0000000000..4407016fac
--- /dev/null
+++ b/intl/JapaneseDetector.h
@@ -0,0 +1,124 @@
+// Copyright 2018 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Mostly copied and pasted from
+// third_party/rust/shift_or_euc/src/lib.rs , so
+// "top-level directory of this distribution" above refers to
+// third_party/rust/shift_or_euc/
+
+#ifndef mozilla_JapaneseDetector_h
+#define mozilla_JapaneseDetector_h
+
+#include "mozilla/Encoding.h"
+
+namespace mozilla {
+class JapaneseDetector;  // Forward-declared so the macro below can name it.
+}; // namespace mozilla
+
+#define SHIFT_OR_EUC_DETECTOR mozilla::JapaneseDetector
+
+#include "shift_or_euc.h"  // NOTE(review): presumably consumes SHIFT_OR_EUC_DETECTOR; confirm.
+
+namespace mozilla {
+
+/**
+ * A Japanese legacy encoding detector for detecting between Shift_JIS,
+ * EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the
+ * encoding is one of those.
+ *
+ * # Principle of Operation
+ *
+ * The detector is based on two observations:
+ *
+ * 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or
+ * EUC-JP, so encountering such an escape sequence (before non-ASCII has been
+ * encountered) can be taken as indication of ISO-2022-JP.
+ * 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is
+ * decoded as EUC-JP, or vice versa, the result is either an error or
+ * half-width katakana, and it's very uncommon for Japanese HTML to have a
+ * half-width katakana character before a normal kana or common kanji
+ * character. Therefore, if decoding as Shift_JIS results in an error or
+ * half-width katakana, the detector decides that the content is EUC-JP, and
+ * vice versa.
+ *
+ * # Failure Modes
+ *
+ * The detector gives the wrong answer if the text has a half-width katakana
+ * character before normal kana or common kanji. Some uncommon kanji are
+ * undecidable. (All JIS X 0208 Level 1 kanji are decidable.)
+ *
+ * The half-width katakana issue is mainly relevant for old 8-bit JIS X
+ * 0201-only text files that would decode correctly as Shift_JIS but that the
+ * detector detects as EUC-JP.
+ *
+ * The undecidable kanji issue does not realistically show up when a full
+ * document is fed to the detector, because, realistically, in a full
+ * document, there is at least one kana or common kanji. It can occur,
+ * though, if the detector is only run on a prefix of a document and the
+ * prefix only contains the title of the document. It is possible for a
+ * document title to consist entirely of undecidable kanji. (Indeed,
+ * Japanese Wikipedia has articles with such titles.) If the detector is
+ * undecided, a fallback to Shift_JIS should be used.
+ */
+class JapaneseDetector final {
+ public:
+ ~JapaneseDetector() {}  // Empty body; the release call is in operator delete.
+
+ static void operator delete(void* aDetector) {  // Delegates to the free function.
+ shift_or_euc_detector_free(reinterpret_cast<JapaneseDetector*>(aDetector));
+ }
+
+ /**
+ * Instantiates the detector by wrapping the result of
+ * `shift_or_euc_detector_new` in a `UniquePtr`. If `aAllow2022` is `true`,
+ * the possible guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided.
+ * If it is `false`, the possible guesses are Shift_JIS, EUC-JP, and undecided.
+ */
+ static inline UniquePtr<JapaneseDetector> Create(bool aAllow2022) {
+ UniquePtr<JapaneseDetector> detector(shift_or_euc_detector_new(aAllow2022));
+ return detector;
+ }
+
+ /**
+ * Feeds bytes to the detector. If `aLast` is `true`, the end of the stream
+ * is considered to occur immediately after the end of `aBuffer`.
+ * Otherwise, the stream is expected to continue. `aBuffer` may be empty.
+ *
+ * If you're running the detector only on a prefix of a complete
+ * document, _do not_ pass `aLast` as `true` after the prefix if the
+ * stream as a whole still contains more content.
+ *
+ * Returns `SHIFT_JIS_ENCODING` if the detector guessed
+ * Shift_JIS. Returns `EUC_JP_ENCODING` if the detector
+ * guessed EUC-JP. Returns `ISO_2022_JP_ENCODING` if the
+ * detector guessed ISO-2022-JP (only possible if `true` was passed as
+ * `aAllow2022` when instantiating the detector). Returns `nullptr` if the
+ * detector is undecided. If `nullptr` is returned even when passing `true`
+ * as `aLast`, falling back to Shift_JIS is the best guess for Web
+ * purposes.
+ *
+ * Do not call again after the method has returned non-`nullptr` or after
+ * the method has been called with `true` as `aLast`. (Asserts if the
+ * previous sentence isn't adhered to.)
+ */
+ inline const mozilla::Encoding* Feed(Span<const uint8_t> aBuffer,
+ bool aLast) {
+ return shift_or_euc_detector_feed(this, aBuffer.Elements(),
+ aBuffer.Length(), aLast);
+ }
+
+ private:
+ JapaneseDetector() = delete;  // No direct construction; use Create().
+ JapaneseDetector(const JapaneseDetector&) = delete;  // Not copy-constructible.
+ JapaneseDetector& operator=(const JapaneseDetector&) = delete;  // Not copy-assignable.
+};
+
+}; // namespace mozilla
+
+#endif // mozilla_JapaneseDetector_h