// Copyright 2019 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. // Mostly copied and pasted from // third_party/rust/chardetng/src/lib.rs , so // "top-level directory of this distribution" above refers to // third_party/rust/chardetng/ #ifndef mozilla_EncodingDetector_h #define mozilla_EncodingDetector_h #include "mozilla/Encoding.h" namespace mozilla { class EncodingDetector; }; // namespace mozilla #define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector #include "chardetng.h" namespace mozilla { /** * A Web browser-oriented detector for guessing what character * encoding a stream of bytes is encoded in. * * The bytes are fed to the detector incrementally using the `feed` * method. The current guess of the detector can be queried using * the `guess` method. The guessing parameters are arguments to the * `guess` method rather than arguments to the constructor in order * to enable the application to check if the arguments affect the * guessing outcome. (The specific use case is to disable UI for * re-running the detector with UTF-8 allowed and the top-level * domain name ignored if those arguments don't change the guess.) */ class EncodingDetector final { public: ~EncodingDetector() = default; static void operator delete(void* aDetector) { chardetng_encoding_detector_free( reinterpret_cast(aDetector)); } /** * Creates a new instance of the detector. */ static inline UniquePtr Create() { UniquePtr detector(chardetng_encoding_detector_new()); return detector; } /** * Queries whether the TLD is considered non-generic and could affect the * guess. */ static inline bool TldMayAffectGuess(Span aTLD) { return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(), aTLD.Length()); } /** * Inform the detector of a chunk of input. * * The byte stream is represented as a sequence of calls to this * method such that the concatenation of the arguments to this * method form the byte stream. It does not matter how the application * chooses to chunk the stream. It is OK to call this method with * a zero-length byte slice. * * The end of the stream is indicated by calling this method with * `aLast` set to `true`. In that case, the end of the stream is * considered to occur after the last byte of the `aBuffer` (which * may be zero-length) passed in the same call. Once this method * has been called with `last` set to `true` this method must not * be called again. * * If you want to perform detection on just the prefix of a longer * stream, do not pass `aLast=true` after the prefix if the stream * actually still continues. * * Returns `true` if after processing `aBuffer` the stream has * contained at least one non-ASCII byte and `false` if only * ASCII has been seen so far. * * # Panics * * If this method has previously been called with `aLast` set to `true`. */ inline bool Feed(Span aBuffer, bool aLast) { return chardetng_encoding_detector_feed(this, aBuffer.Elements(), aBuffer.Length(), aLast); } /** * Guess the encoding given the bytes pushed to the detector so far * (via `Feed()`), the top-level domain name from which the bytes were * loaded, and an indication of whether to consider UTF-8 as a permissible * guess. * * The `aTld` argument takes the rightmost DNS label of the hostname of the * host the stream was loaded from in lower-case ASCII form. That is, if * the label is an internationalized top-level domain name, it must be * provided in its Punycode form. If the TLD that the stream was loaded * from is unavalable, an empty `Spane` may be passed instead, which is * equivalent to passing a `Span` for "com". * * If the `aAllowUTF8` argument is set to `false`, the return value of * this method won't be `UTF_8_ENCODING`. When performing detection * on `text/html` on non-`file:` URLs, Web browsers must pass `false`, * unless the user has taken a specific contextual action to request an * override. This way, Web developers cannot start depending on UTF-8 * detection. Such reliance would make the Web Platform more brittle. * * Returns the guessed encoding. * * # Panics * * If `aTld` contains non-ASCII, period, or upper-case letters. (The panic * condition is intentionally limited to signs of failing to extract the * label correctly, failing to provide it in its Punycode form, and failure * to lower-case it. Full DNS label validation is intentionally not performed * to avoid panics when the reality doesn't match the specs.) */ inline mozilla::NotNull Guess( Span aTLD, bool aAllowUTF8) const { return WrapNotNull(chardetng_encoding_detector_guess( this, aTLD.Elements(), aTLD.Length(), aAllowUTF8)); } private: EncodingDetector() = delete; EncodingDetector(const EncodingDetector&) = delete; EncodingDetector& operator=(const EncodingDetector&) = delete; }; }; // namespace mozilla #endif // mozilla_EncodingDetector_h