summaryrefslogtreecommitdiffstats
path: root/intl/EncodingDetector.h
blob: 269f4c01f2e610c890e153b211339f1055057db9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Mostly copied and pasted from
// third_party/rust/chardetng/src/lib.rs , so
// "top-level directory of this distribution" above refers to
// third_party/rust/chardetng/

#ifndef mozilla_EncodingDetector_h
#define mozilla_EncodingDetector_h

#include "mozilla/Encoding.h"

namespace mozilla {
class EncodingDetector;
};  // namespace mozilla

#define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector

#include "chardetng.h"

namespace mozilla {

/**
 * A Web browser-oriented detector for guessing what character
 * encoding a stream of bytes is encoded in.
 *
 * The bytes are fed to the detector incrementally using the `feed`
 * method. The current guess of the detector can be queried using
 * the `guess` method. The guessing parameters are arguments to the
 * `guess` method rather than arguments to the constructor in order
 * to enable the application to check if the arguments affect the
 * guessing outcome. (The specific use case is to disable UI for
 * re-running the detector with UTF-8 allowed and the top-level
 * domain name ignored if those arguments don't change the guess.)
 */
class EncodingDetector final {
 public:
  ~EncodingDetector() = default;

  static void operator delete(void* aDetector) {
    chardetng_encoding_detector_free(
        reinterpret_cast<EncodingDetector*>(aDetector));
  }

  /**
   * Creates a new instance of the detector.
   */
  static inline UniquePtr<EncodingDetector> Create() {
    UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
    return detector;
  }

  /**
   * Queries whether the TLD is considered non-generic and could affect the
   * guess.
   */
  static inline bool TldMayAffectGuess(Span<const char> aTLD) {
    return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(),
                                                            aTLD.Length());
  }

  /**
   * Inform the detector of a chunk of input.
   *
   * The byte stream is represented as a sequence of calls to this
   * method such that the concatenation of the arguments to this
   * method form the byte stream. It does not matter how the application
   * chooses to chunk the stream. It is OK to call this method with
   * a zero-length byte slice.
   *
   * The end of the stream is indicated by calling this method with
   * `aLast` set to `true`. In that case, the end of the stream is
   * considered to occur after the last byte of the `aBuffer` (which
   * may be zero-length) passed in the same call. Once this method
   * has been called with `last` set to `true` this method must not
   * be called again.
   *
   * If you want to perform detection on just the prefix of a longer
   * stream, do not pass `aLast=true` after the prefix if the stream
   * actually still continues.
   *
   * Returns `true` if after processing `aBuffer` the stream has
   * contained at least one non-ASCII byte and `false` if only
   * ASCII has been seen so far.
   *
   * # Panics
   *
   * If this method has previously been called with `aLast` set to `true`.
   */
  inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
    return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
                                            aBuffer.Length(), aLast);
  }

  /**
   * Guess the encoding given the bytes pushed to the detector so far
   * (via `Feed()`), the top-level domain name from which the bytes were
   * loaded, and an indication of whether to consider UTF-8 as a permissible
   * guess.
   *
   * The `aTld` argument takes the rightmost DNS label of the hostname of the
   * host the stream was loaded from in lower-case ASCII form. That is, if
   * the label is an internationalized top-level domain name, it must be
   * provided in its Punycode form. If the TLD that the stream was loaded
   * from is unavalable, an empty `Spane` may be passed instead, which is
   * equivalent to passing a `Span` for "com".
   *
   * If the `aAllowUTF8` argument is set to `false`, the return value of
   * this method won't be `UTF_8_ENCODING`. When performing detection
   * on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
   * unless the user has taken a specific contextual action to request an
   * override. This way, Web developers cannot start depending on UTF-8
   * detection. Such reliance would make the Web Platform more brittle.
   *
   * Returns the guessed encoding.
   *
   * # Panics
   *
   * If `aTld` contains non-ASCII, period, or upper-case letters. (The panic
   * condition is intentionally limited to signs of failing to extract the
   * label correctly, failing to provide it in its Punycode form, and failure
   * to lower-case it. Full DNS label validation is intentionally not performed
   * to avoid panics when the reality doesn't match the specs.)
   */
  inline mozilla::NotNull<const mozilla::Encoding*> Guess(
      Span<const char> aTLD, bool aAllowUTF8) const {
    return WrapNotNull(chardetng_encoding_detector_guess(
        this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
  }

 private:
  EncodingDetector() = delete;
  EncodingDetector(const EncodingDetector&) = delete;
  EncodingDetector& operator=(const EncodingDetector&) = delete;
};

};  // namespace mozilla

#endif  // mozilla_EncodingDetector_h