// path: root/intl/JapaneseDetector.h
// blob: 4407016facc8d9d7f53963067f12c839f7b3c3b1
// Copyright 2018 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Mostly copied and pasted from
// third_party/rust/shift_or_euc/src/lib.rs , so
// "top-level directory of this distribution" above refers to
// third_party/rust/shift_or_euc/

#ifndef mozilla_JapaneseDetector_h
#define mozilla_JapaneseDetector_h

#include "mozilla/Encoding.h"

// Forward declaration only; the class itself is defined later in this
// header. Declaring the name here makes it known before the
// SHIFT_OR_EUC_DETECTOR macro below, which expands to
// mozilla::JapaneseDetector, is put to use.
namespace mozilla {
class JapaneseDetector;
}  // namespace mozilla

// NOTE(review): presumably shift_or_euc.h names its detector type through
// SHIFT_OR_EUC_DETECTOR, which is why the macro is defined immediately before
// the include so that the shift_or_euc_* functions are typed in terms of
// mozilla::JapaneseDetector -- confirm against the header.
#define SHIFT_OR_EUC_DETECTOR mozilla::JapaneseDetector

#include "shift_or_euc.h"

namespace mozilla {

/**
 * A Japanese legacy encoding detector for detecting between Shift_JIS,
 * EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the
 * encoding is one of those.
 *
 * # Principle of Operation
 *
 * The detector is based on two observations:
 *
 * 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or
 * EUC-JP, so encountering such an escape sequence (before non-ASCII has been
 * encountered) can be taken as indication of ISO-2022-JP.
 * 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is
 * decoded as EUC-JP, or vice versa, the result is either an error or
 * half-width katakana, and it's very uncommon for Japanese HTML to have a
 * half-width katakana character before a normal kana or common kanji
 * character. Therefore, if decoding as Shift_JIS results in an error or
 * half-width katakana, the detector decides that the content is EUC-JP, and
 * vice versa.
 *
 * # Failure Modes
 *
 * The detector gives the wrong answer if the text has a half-width katakana
 * character before normal kana or common kanji. Some uncommon kanji are
 * undecidable. (All JIS X 0208 Level 1 kanji are decidable.)
 *
 * The half-width katakana issue is mainly relevant for old 8-bit JIS X
 * 0201-only text files that would decode correctly as Shift_JIS but that the
 * detector detects as EUC-JP.
 *
 * The undecidable kanji issue does not realistically show up when a full
 * document is fed to the detector, because, realistically, in a full
 * document, there is at least one kana or common kanji. It can occur,
 * though, if the detector is only run on a prefix of a document and the
 * prefix only contains the title of the document. It is possible for a
 * document title to consist entirely of undecidable kanji. (Indeed,
 * Japanese Wikipedia has articles with such titles.) If the detector is
 * undecided, a fallback to Shift_JIS should be used.
 */
class JapaneseDetector final {
 public:
  // Deliberately empty: instances are obtained from
  // shift_or_euc_detector_new() (see Create()) and their storage is handed
  // back through the class-specific `operator delete` below.
  ~JapaneseDetector() {}

  /**
   * Class-specific deallocation function. Rather than using the global
   * deallocator, the detector is passed to shift_or_euc_detector_free().
   */
  static void operator delete(void* aDetector) {
    auto* self = static_cast<JapaneseDetector*>(aDetector);
    shift_or_euc_detector_free(self);
  }

  /**
   * Creates a new detector and returns it as an owning pointer.
   *
   * With `aAllow2022` set to `true`, the detector may guess Shift_JIS,
   * EUC-JP or ISO-2022-JP, or stay undecided. With `aAllow2022` set to
   * `false`, ISO-2022-JP is excluded, leaving Shift_JIS, EUC-JP and
   * undecided as the possible outcomes.
   */
  static UniquePtr<JapaneseDetector> Create(bool aAllow2022) {
    return UniquePtr<JapaneseDetector>(shift_or_euc_detector_new(aAllow2022));
  }

  /**
   * Hands the bytes in `aBuffer` (which may be empty) to the detector.
   *
   * `aLast` tells the detector whether the stream ends right after
   * `aBuffer` (`true`) or is expected to continue (`false`). When the
   * detector is run only on a prefix of a complete document, _do not_ pass
   * `true` for `aLast` at the end of the prefix if the stream as a whole
   * still has more content.
   *
   * Return value:
   *  - `SHIFT_JIS_ENCODING` if the guess is Shift_JIS;
   *  - `EUC_JP_ENCODING` if the guess is EUC-JP;
   *  - `ISO_2022_JP_ENCODING` if the guess is ISO-2022-JP (only possible if
   *    the detector was instantiated with `aAllow2022` set to `true`);
   *  - `nullptr` if the detector is undecided. If `nullptr` is returned
   *    even though `aLast` was `true`, falling back to Shift_JIS is the
   *    best guess for Web purposes.
   *
   * This method must not be called again once it has returned a
   * non-`nullptr` value or once it has been called with `aLast` set to
   * `true`. (Asserts if this rule is violated.)
   */
  const mozilla::Encoding* Feed(Span<const uint8_t> aBuffer, bool aLast) {
    const uint8_t* const bytes = aBuffer.Elements();
    const auto length = aBuffer.Length();
    return shift_or_euc_detector_feed(this, bytes, length, aLast);
  }

 private:
  // Neither default-constructible nor copyable: Create() is the only way to
  // obtain an instance.
  JapaneseDetector() = delete;
  JapaneseDetector(const JapaneseDetector&) = delete;
  JapaneseDetector& operator=(const JapaneseDetector&) = delete;
};

};  // namespace mozilla

#endif  // mozilla_JapaneseDetector_h