diff options
Diffstat (limited to 'third_party/rust/chardetng_c/src')
-rw-r--r-- | third_party/rust/chardetng_c/src/lib.rs | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/third_party/rust/chardetng_c/src/lib.rs b/third_party/rust/chardetng_c/src/lib.rs new file mode 100644 index 0000000000..0ca3383b9e --- /dev/null +++ b/third_party/rust/chardetng_c/src/lib.rs @@ -0,0 +1,176 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![doc(html_root_url = "https://docs.rs/chardetng_c/0.1.0")] + +//! C API for [`chardetng`](https://docs.rs/chardetng/) +//! +//! # Panics +//! +//! This crate is designed to be used only in a `panic=abort` scenario. +//! Panic propagation across FFI is not handled! +//! +//! # Licensing +//! +//! See the file named [COPYRIGHT](https://github.com/hsivonen/chardetng_c/blob/master/COPYRIGHT). + +use encoding_rs::Encoding; +use chardetng::EncodingDetector; + +/// Instantiates a Web browser-oriented detector for guessing what +/// character encoding a stream of bytes is encoded in. +/// +/// The bytes are fed to the detector incrementally using the +/// `chardetng_encoding_detector_free` function. The current guess of the +/// detector can be queried using the `chardetng_encoding_detector_guess` +/// function. The guessing parameters are arguments to the +/// `chardetng_encoding_detector_guess` function rather than arguments to the +/// constructor in order to enable the application to check if the arguments +/// affect the guessing outcome. (The specific use case is to disable UI for +/// re-running the detector with UTF-8 allowed and the top-level domain name +/// ignored if those arguments don't change the guess.) +/// +/// The instantiated detector must be freed after use using +/// `chardetng_detectordetector_free`. +#[no_mangle] +pub unsafe extern "C" fn chardetng_encoding_detector_new() -> *mut EncodingDetector { + Box::into_raw(Box::new(EncodingDetector::new())) +} + +/// Deallocates a detector obtained from `chardetng_encodingdetector_new`. +#[no_mangle] +pub unsafe extern "C" fn chardetng_encoding_detector_free(detector: *mut EncodingDetector) { + let _ = Box::from_raw(detector); +} + +/// Queries whether the TLD is considered non-generic and could affect the guess. +/// +/// # Undefined Behavior +/// +/// UB ensues if +/// +/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len` +/// don't designate a range of memory valid for reading. +#[no_mangle] +pub unsafe extern "C" fn chardetng_encoding_detector_tld_may_affect_guess( + tld: *const u8, + tld_len: usize, +) -> bool { + let tld_opt = if tld.is_null() { + assert_eq!(tld_len, 0); + None + } else { + Some(::std::slice::from_raw_parts(tld, tld_len)) + }; + EncodingDetector::tld_may_affect_guess(tld_opt) +} + +/// Inform the detector of a chunk of input. +/// +/// The byte stream is represented as a sequence of calls to this +/// function such that the concatenation of the arguments to this +/// function form the byte stream. It does not matter how the application +/// chooses to chunk the stream. It is OK to call this function with +/// a zero-length byte slice. +/// +/// The end of the stream is indicated by calling this function with +/// `last` set to `true`. In that case, the end of the stream is +/// considered to occur after the last byte of the `buffer` (which +/// may be zero-length) passed in the same call. Once this function +/// has been called with `last` set to `true` this function must not +/// be called again. +/// +/// If you want to perform detection on just the prefix of a longer +/// stream, do not pass `last=true` after the prefix if the stream +/// actually still continues. +/// +/// Returns `true` if after processing `buffer` the stream has +/// contained at least one non-ASCII byte and `false` if only +/// ASCII has been seen so far. +/// +/// # Panics +/// +/// If this function has previously been called with `last` set to `true`. +/// +/// # Undefined Behavior +/// +/// UB ensues if +/// +/// * `detector` does not point to a detector obtained from +/// `chardetng_detector_new` but not yet freed with +/// `chardetng_detector_free`. +/// * `buffer` is `NULL`. (It can be a bogus pointer when `buffer_len` is 0.) +/// * ,buffer_len` is non-zero and `buffer` and `buffer_len` don't designate +/// a range of memory valid for reading. +#[no_mangle] +pub unsafe extern "C" fn chardetng_encoding_detector_feed( + detector: *mut EncodingDetector, + buffer: *const u8, + buffer_len: usize, + last: bool, +) -> bool { + (*detector).feed(::std::slice::from_raw_parts(buffer, buffer_len), last) +} + +/// Guess the encoding given the bytes pushed to the detector so far +/// (via `chardetng_encoding_detector_feed()`), the top-level domain name +/// from which the bytes were loaded, and an indication of whether to +/// consider UTF-8 as a permissible guess. +/// +/// The `tld` argument takes the rightmost DNS label of the hostname of the +/// host the stream was loaded from in lower-case ASCII form. That is, if +/// the label is an internationalized top-level domain name, it must be +/// provided in its Punycode form. If the TLD that the stream was loaded +/// from is unavalable, `NULL` may be passed instead (and 0 as `tld_len`), +/// which is equivalent to passing pointer to "com" as `tld` and 3 as +/// `tld_len`. +/// +/// If the `allow_utf8` argument is set to `false`, the return value of +/// this function won't be `UTF_8_ENCODING`. When performing detection +/// on `text/html` on non-`file:` URLs, Web browsers must pass `false`, +/// unless the user has taken a specific contextual action to request an +/// override. This way, Web developers cannot start depending on UTF-8 +/// detection. Such reliance would make the Web Platform more brittle. +/// +/// Returns the guessed encoding (never `NULL`). +/// +/// # Panics +/// +/// If `tld` is `NULL` but `tld_len` is not zero. +/// +/// If `tld` contains non-ASCII, period, or upper-case letters. (The panic +/// condition is intentionally limited to signs of failing to extract the +/// label correctly, failing to provide it in its Punycode form, and failure +/// to lower-case it. Full DNS label validation is intentionally not performed +/// to avoid panics when the reality doesn't match the specs.) +/// +/// # Undefined Behavior +/// +/// UB ensues if +/// +/// * `detector` does not point to a detector obtained from +/// `chardetng_detector_new` but not yet freed with +/// `chardetng_detector_free`. +/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len` +/// don't designate a range of memory valid for reading. +#[no_mangle] +pub unsafe extern "C" fn chardetng_encoding_detector_guess( + detector: *const EncodingDetector, + tld: *const u8, + tld_len: usize, + allow_utf8: bool, +) -> *const Encoding { + let tld_opt = if tld.is_null() { + assert_eq!(tld_len, 0); + None + } else { + Some(::std::slice::from_raw_parts(tld, tld_len)) + }; + (*detector).guess(tld_opt, allow_utf8) +} |