// Copyright Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #ifndef chardetng_h #define chardetng_h #ifdef __cplusplus extern "C" { #endif #include #include #include "encoding_rs.h" #ifndef CHARDETNG_ENCODING_DETECTOR #define CHARDETNG_ENCODING_DETECTOR EncodingDetector #ifndef __cplusplus typedef struct Detector_ EncodingDetector; #endif #endif /// Instantiates a Web browser-oriented detector for guessing what /// character encoding a stream of bytes is encoded in. /// /// The bytes are fed to the detector incrementally using the /// `chardetng_encoding_detector_free` function. The current guess of the /// detector can be queried using the `chardetng_encoding_detector_guess` /// function. The guessing parameters are arguments to the /// `chardetng_encoding_detector_guess` function rather than arguments to the /// constructor in order to enable the application to check if the arguments /// affect the guessing outcome. (The specific use case is to disable UI for /// re-running the detector with UTF-8 allowed and the top-level domain name /// ignored if those arguments don't change the guess.) /// /// The instantiated detector must be freed after use using /// `chardetng_detectordetector_free`. CHARDETNG_ENCODING_DETECTOR* chardetng_encoding_detector_new(); /// Deallocates a detector obtained from `chardetng_encodingdetector_new`. void chardetng_encoding_detector_free(CHARDETNG_ENCODING_DETECTOR* detector); /// Queries whether the TLD is considered non-generic and could affect the guess. /// /// # Undefined Behavior /// /// UB ensues if /// /// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len` /// don't designate a range of memory valid for reading. bool chardetng_encoding_detector_tld_may_affect_guess(char const* tld, size_t tld_len); /// Inform the detector of a chunk of input. /// /// The byte stream is represented as a sequence of calls to this /// function such that the concatenation of the arguments to this /// function form the byte stream. It does not matter how the application /// chooses to chunk the stream. It is OK to call this function with /// a zero-length byte slice. /// /// The end of the stream is indicated by calling this function with /// `last` set to `true`. In that case, the end of the stream is /// considered to occur after the last byte of the `buffer` (which /// may be zero-length) passed in the same call. Once this function /// has been called with `last` set to `true` this function must not /// be called again. /// /// If you want to perform detection on just the prefix of a longer /// stream, do not pass `last=true` after the prefix if the stream /// actually still continues. /// /// Returns `true` if after processing `buffer` the stream has /// contained at least one non-ASCII byte and `false` if only /// ASCII has been seen so far. /// /// # Panics /// /// If this function has previously been called with `last` set to `true`. /// /// # Undefined Behavior /// /// UB ensues if /// /// * `detector` does not point to a detector obtained from /// `chardetng_detector_new` but not yet freed with /// `chardetng_detector_free`. /// * `buffer` is `NULL`. (It can be a bogus pointer when `buffer_len` is 0.) /// * ,buffer_len` is non-zero and `buffer` and `buffer_len` don't designate /// a range of memory valid for reading. bool chardetng_encoding_detector_feed( CHARDETNG_ENCODING_DETECTOR* detector, uint8_t const* buffer, size_t buffer_len, bool last ); /// Guess the encoding given the bytes pushed to the detector so far /// (via `chardetng_encoding_detector_feed()`), the top-level domain name /// from which the bytes were loaded, and an indication of whether to /// consider UTF-8 as a permissible guess. /// /// The `tld` argument takes the rightmost DNS label of the hostname of the /// host the stream was loaded from in lower-case ASCII form. That is, if /// the label is an internationalized top-level domain name, it must be /// provided in its Punycode form. If the TLD that the stream was loaded /// from is unavalable, `NULL` may be passed instead (and 0 as `tld_len`), /// which is equivalent to passing pointer to "com" as `tld` and 3 as /// `tld_len`. /// /// If the `allow_utf8` argument is set to `false`, the return value of /// this function won't be `UTF_8_ENCODING`. When performing detection /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`, /// unless the user has taken a specific contextual action to request an /// override. This way, Web developers cannot start depending on UTF-8 /// detection. Such reliance would make the Web Platform more brittle. /// /// Returns the guessed encoding (never `NULL`). /// /// # Panics /// /// If `tld` is `NULL` but `tld_len` is not zero. /// /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic /// condition is intentionally limited to signs of failing to extract the /// label correctly, failing to provide it in its Punycode form, and failure /// to lower-case it. Full DNS label validation is intentionally not performed /// to avoid panics when the reality doesn't match the specs.) /// /// # Undefined Behavior /// /// UB ensues if /// /// * `detector` does not point to a detector obtained from /// `chardetng_detector_new` but not yet freed with /// `chardetng_detector_free`. /// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len` /// don't designate a range of memory valid for reading. ENCODING_RS_ENCODING const* chardetng_encoding_detector_guess( CHARDETNG_ENCODING_DETECTOR const* detector, char const* tld, size_t tld_len, bool allow_utf8 ); #ifdef __cplusplus } #endif #endif // chardetng_h