1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#ifndef chardetng_h
#define chardetng_h
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdbool.h>
#include "encoding_rs.h"
// Name of the detector type used in the declarations of this header.
// If the including file has already defined CHARDETNG_ENCODING_DETECTOR,
// that definition is kept; otherwise it defaults to `EncodingDetector`.
#ifndef CHARDETNG_ENCODING_DETECTOR
#define CHARDETNG_ENCODING_DETECTOR EncodingDetector
#ifndef __cplusplus
// C only: declare `EncodingDetector` as an incomplete struct type. The struct
// is never defined in this header, and the functions declared here only take
// or return pointers to it.
// NOTE(review): no declaration is emitted for C++ here -- presumably the C++
// includer declares `EncodingDetector` itself; confirm against the wrapper.
typedef struct Detector_ EncodingDetector;
#endif
#endif
/// Instantiates a Web browser-oriented detector for guessing what
/// character encoding a stream of bytes is encoded in.
///
/// The bytes are fed to the detector incrementally using the
/// `chardetng_encoding_detector_feed` function. The current guess of the
/// detector can be queried using the `chardetng_encoding_detector_guess`
/// function. The guessing parameters are arguments to the
/// `chardetng_encoding_detector_guess` function rather than arguments to the
/// constructor in order to enable the application to check if the arguments
/// affect the guessing outcome. (The specific use case is to disable UI for
/// re-running the detector with UTF-8 allowed and the top-level domain name
/// ignored if those arguments don't change the guess.)
///
/// Takes no arguments and returns a pointer to the newly instantiated
/// detector.
///
/// The instantiated detector must be freed after use using
/// `chardetng_encoding_detector_free`.
// Declared with `(void)` rather than `()`: in C (before C23) an empty
// parameter list declares a function without a prototype, which disables
// argument checking. In C++ the two spellings are equivalent.
CHARDETNG_ENCODING_DETECTOR* chardetng_encoding_detector_new(void);
/// Deallocates a detector obtained from `chardetng_encoding_detector_new`.
///
/// Once freed, the detector must not be passed to
/// `chardetng_encoding_detector_feed` or `chardetng_encoding_detector_guess`;
/// both document undefined behavior for a detector that has already been
/// freed.
// NOTE(review): whether `detector` may be NULL is not documented here --
// confirm against the implementation before relying on it.
void chardetng_encoding_detector_free(CHARDETNG_ENCODING_DETECTOR* detector);
// NOTE(review): `size_t` is used below, but this header does not include
// <stddef.h> directly; it relies on one of the headers included above to
// provide the type -- confirm, or include <stddef.h> explicitly.
/// Queries whether the TLD is considered non-generic and could affect the guess.
///
/// `tld` and `tld_len` designate the bytes of the top-level domain label.
/// (Presumably in the same form that `chardetng_encoding_detector_guess`
/// expects for its `tld` argument -- TODO confirm.)
///
/// Returns `true` if the TLD is considered non-generic and could affect the
/// guess, and `false` otherwise.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
/// don't designate a range of memory valid for reading.
bool chardetng_encoding_detector_tld_may_affect_guess(char const* tld, size_t tld_len);
/// Inform the detector of a chunk of input.
///
/// The byte stream is represented as a sequence of calls to this
/// function such that the concatenation of the arguments to this
/// function form the byte stream. It does not matter how the application
/// chooses to chunk the stream. It is OK to call this function with
/// a zero-length byte slice.
///
/// The end of the stream is indicated by calling this function with
/// `last` set to `true`. In that case, the end of the stream is
/// considered to occur after the last byte of the `buffer` (which
/// may be zero-length) passed in the same call. Once this function
/// has been called with `last` set to `true` this function must not
/// be called again.
///
/// If you want to perform detection on just the prefix of a longer
/// stream, do not pass `last=true` after the prefix if the stream
/// actually still continues.
///
/// Returns `true` if after processing `buffer` the stream has
/// contained at least one non-ASCII byte and `false` if only
/// ASCII has been seen so far.
///
/// # Panics
///
/// If this function has previously been called with `last` set to `true`.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `detector` does not point to a detector obtained from
/// `chardetng_encoding_detector_new` but not yet freed with
/// `chardetng_encoding_detector_free`.
/// * `buffer` is `NULL`. (It can be a bogus pointer when `buffer_len` is 0.)
/// * `buffer_len` is non-zero and `buffer` and `buffer_len` don't designate
/// a range of memory valid for reading.
bool chardetng_encoding_detector_feed(
CHARDETNG_ENCODING_DETECTOR* detector,
uint8_t const* buffer,
size_t buffer_len,
bool last
);
/// Guess the encoding given the bytes pushed to the detector so far
/// (via `chardetng_encoding_detector_feed()`), the top-level domain name
/// from which the bytes were loaded, and an indication of whether to
/// consider UTF-8 as a permissible guess.
///
/// The `tld` argument takes the rightmost DNS label of the hostname of the
/// host the stream was loaded from in lower-case ASCII form. That is, if
/// the label is an internationalized top-level domain name, it must be
/// provided in its Punycode form. If the TLD that the stream was loaded
/// from is unavailable, `NULL` may be passed instead (and 0 as `tld_len`),
/// which is equivalent to passing a pointer to "com" as `tld` and 3 as
/// `tld_len`.
///
/// If the `allow_utf8` argument is set to `false`, the return value of
/// this function won't be `UTF_8_ENCODING`. When performing detection
/// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
/// unless the user has taken a specific contextual action to request an
/// override. This way, Web developers cannot start depending on UTF-8
/// detection. Such reliance would make the Web Platform more brittle.
///
/// Returns the guessed encoding (never `NULL`).
///
/// # Panics
///
/// If `tld` is `NULL` but `tld_len` is not zero.
///
/// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
/// condition is intentionally limited to signs of failing to extract the
/// label correctly, failing to provide it in its Punycode form, and failure
/// to lower-case it. Full DNS label validation is intentionally not performed
/// to avoid panics when the reality doesn't match the specs.)
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `detector` does not point to a detector obtained from
/// `chardetng_encoding_detector_new` but not yet freed with
/// `chardetng_encoding_detector_free`.
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
/// don't designate a range of memory valid for reading.
ENCODING_RS_ENCODING const* chardetng_encoding_detector_guess(
CHARDETNG_ENCODING_DETECTOR const* detector,
char const* tld,
size_t tld_len,
bool allow_utf8
);
#ifdef __cplusplus
}
#endif
#endif // chardetng_h
|