diff options
Diffstat (limited to 'third_party/rust/encoding_c_mem/include')
-rw-r--r-- | third_party/rust/encoding_c_mem/include/encoding_rs_mem.h | 704 | ||||
-rw-r--r-- | third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h | 578 |
2 files changed, 1282 insertions, 0 deletions
diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h new file mode 100644 index 0000000000..2327a9dd0b --- /dev/null +++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h @@ -0,0 +1,704 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#ifndef encoding_rs_mem_h_ +#define encoding_rs_mem_h_ + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +/* + * _Note:_ "Latin1" in this header refers to the Unicode range from U+0000 to + * U+00FF, inclusive, and does not refer to the windows-1252 range. This + * in-memory encoding is sometimes used as a storage optimization of text + * when UTF-16 indexing and length semantics are exposed. + */ + +/** + * Classification of text as Latin1 (all code points are below U+0100), + * left-to-right with some non-Latin1 characters or as containing at least + * some right-to-left characters. + */ +typedef enum { + /** + * Every character is below U+0100. + */ + Latin1 = 0, + /** + * There is at least one character that's U+0100 or higher, but there + * are no right-to-left characters. + */ + LeftToRight = 1, + /** + * There is at least one right-to-left character. + */ + Bidi = 2, +} Latin1Bidi; + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * Checks whether a valid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. 
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `len` does not contain valid UTF-8. (If `len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) + */ +Latin1Bidi encoding_mem_check_str_for_latin1_and_bidi(const char* buffer, + size_t len); + +/** + * Checks whether a potentially invalid UTF-16 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +Latin1Bidi encoding_mem_check_utf16_for_latin1_and_bidi(const char16_t* buffer, + size_t len); + +/** + * Checks whether a potentially invalid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. + * + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) 
+ */ +Latin1Bidi encoding_mem_check_utf8_for_latin1_and_bidi(const char* buffer, + size_t len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of `char16_t`s written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8. + * + * The length of the destination buffer must be at least the length of the + * source buffer times two. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Safety + * + * Note that this function may write garbage beyond the number of bytes + * indicated by the return value. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) 
+ */ +size_t encoding_mem_convert_latin1_to_utf8(const char* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient + * output space. + * + * On entry, `*src_len` and `*dst_len` must hold the lengths of the source + * and destination buffers. + * + * Writes the number of code units read into `*src_len` and the number of + * bytes written into `*dst_len`. + * + * If the output isn't large enough, not all input is consumed. + * + * # Undefined behavior + * + * UB ensues if `src` and `*src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `*dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `*src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `*dst_len`.) + */ +void encoding_mem_convert_latin1_to_utf8_partial(const char* src, + size_t* src_len, char* dst, + size_t* dst_len); + +/** + * Converts valid UTF-8 to valid UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL`, if the two memory blocks overlap, or if the + * buffer designated by `src` and `src_len` does not contain valid UTF-8. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) 
+ */ +size_t encoding_mem_convert_str_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * If the input is valid UTF-16 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, does something + * that is memory-safe without any promises about any properties of the + * output and will probably assert in debug builds in future versions. + * In particular, callers shouldn't assume the output to be the same across + * crate versions or CPU architectures and should not assume that non-ASCII + * input can't map to ASCII output. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of bytes written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * (Probably in future versions if debug assertions are enabled (and not + * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer times three. + * + * Returns the number of bytes written. 
+ * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER with potentially insufficient output + * space. + * + * On entry, `*src_len` and `*dst_len` must hold the lengths of the source + * and destination buffers. + * + * Writes the number of code units read into `*src_len` and the number of + * bytes written into `*dst_len`. + * + * Guarantees that the bytes in the destination beyond the number of + * bytes reported as written via `*dst_len` are left unmodified. + * + * Not all code units are read if there isn't enough output space. + * Note that this function isn't designed for general streamability but for + * not allocating memory for the worst case up front. Specifically, + * if the input starts with or ends with an unpaired surrogate, those are + * replaced with the REPLACEMENT CHARACTER. + * + * Matches the semantics of `TextEncoder.encodeInto()` from the + * Encoding Standard. + * + * # Undefined behavior + * + * UB ensues if `src` and `*src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `*dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `*src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `*dst_len`.) 
+ */ +void encoding_mem_convert_utf16_to_utf8_partial(const char16_t* src, + size_t* src_len, char* dst, + size_t* dst_len); + +/** + * If the input is valid UTF-8 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, this function + * panics if debug assertions are enabled (and fuzzing isn't) and otherwise + * does something that is memory-safe without any promises about any + * properties of the output. In particular, callers shouldn't assume the + * output to be the same across crate versions or CPU architectures and + * should not assume that non-ASCII input can't map to ASCII output. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * Also panics if debug assertions are enabled (and not fuzzing) and the + * input is not in the range U+0000 to U+00FF, inclusive. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer _plus one_. + * + * Returns the number of `char16_t`s written. 
+ * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written or `SIZE_MAX` if the input was + * invalid. + * + * When the input was invalid, some output may have been written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf8_to_utf16_without_replacement(const char* src, + size_t src_len, + char16_t* dst, + size_t dst_len); + +/** + * Copies ASCII from source to destination up to the first non-ASCII byte + * (or the end of the input if it is ASCII in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_copy_ascii_to_ascii(const char* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Copies ASCII from source to destination zero-extending it to UTF-16 up to + * the first non-ASCII byte (or the end of the input if it is ASCII in its + * entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_copy_ascii_to_basic_latin(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Copies Basic Latin from source to destination narrowing it to ASCII up to + * the first non-Basic Latin code unit (or the end of the input if it is + * Basic Latin in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_copy_basic_latin_to_ascii(const char16_t* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t len); + +/** + * Checks whether the buffer is all-ASCII. + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_ascii(const char* buffer, size_t len); + +/** + * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing + * only ASCII characters). + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t len); + +/** + * Checks whether a scalar value triggers right-to-left processing. 
+ * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. + */ +bool encoding_mem_is_char_bidi(char32_t c); + +/** + * Checks whether a valid UTF-8 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `len` does not contain valid UTF-8. (If `len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) 
+ */ +bool encoding_mem_is_str_bidi(const char* buffer, size_t len); + +/** + * Checks whether the buffer represents only code points less than or equal + * to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if code + * points above U+00FF are discovered.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `len` does not contain valid UTF-8. (If `len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) + */ +bool encoding_mem_is_str_latin1(const char* buffer, size_t len); + +/** + * Checks whether a UTF-16 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * Returns `true` if the input contains an RTL character or an unpaired + * high surrogate that could be the high half of an RTL character. + * Returns `false` if the input contains neither RTL characters nor + * unpaired high surrogates that could be higher halves of RTL characters. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) 
+ */ +bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len); + +/** + * Checks whether a UTF-16 code unit triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * Since supplementary-plane right-to-left blocks are identifiable from the + * high surrogate without examining the low surrogate, this function returns + * `true` for such high surrogates making the function suitable for handling + * supplementary-plane text without decoding surrogate pairs to scalar + * values. Obviously, such high surrogates are then reported as right-to-left + * even if actually unpaired. + */ +bool encoding_mem_is_utf16_code_unit_bidi(char16_t u); + +/** + * Checks whether the buffer represents only code points less than or equal + * to U+00FF. + * + * May read the entire buffer even if it isn't all-Latin1. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t len); + +/** + * Checks whether a potentially-invalid UTF-8 buffer contains code points + * that trigger right-to-left processing. 
+ * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * Returns `true` if the input is invalid UTF-8 or the input contains an + * RTL character. Returns `false` if the input is valid UTF-8 and contains + * no RTL characters. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_utf8_bidi(const char* buffer, size_t len); + +/** + * Checks whether the buffer is valid UTF-8 representing only code points + * less than or equal to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if UTF-8 + * invalidity or code points above U+00FF are discovered.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_utf8_latin1(const char* buffer, size_t len); + +/** + * Returns the index of the first unpaired surrogate or, if the input is + * valid UTF-16 in its entirety, the length of the input. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. 
(If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer, size_t len); + +/** + * Returns the index of the first byte that starts an invalid byte + * sequence or a non-Latin1 byte sequence, or the length of the + * string if there is neither. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +size_t encoding_mem_utf8_latin1_up_to(const char* buffer, size_t len); + +/** + * Returns the index of the first byte that starts a non-Latin1 byte + * sequence, or the length of the string if there are none. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8. + * (If `len` is `0`, `buffer` may be bogus but still has to be non-`NULL` + * and aligned.) + */ +size_t encoding_mem_str_latin1_up_to(const char* buffer, size_t len); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // encoding_rs_mem_h_ diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h new file mode 100644 index 0000000000..b6173d7ef4 --- /dev/null +++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h @@ -0,0 +1,578 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#pragma once + +#ifndef encoding_rs_mem_cpp_h_ +#define encoding_rs_mem_cpp_h_ + +#include <optional> +#include <string_view> +#include <tuple> +#include "gsl/gsl" + +#include "encoding_rs_mem.h" + +namespace encoding_rs { +namespace mem { + +namespace detail { +/** + * Replaces `nullptr` with a bogus pointer suitable for use as part of a + * zero-length Rust slice. + * + * The bogus pointer is `alignof(T)` reinterpreted as a `T*`, which makes it + * non-null and suitably aligned for `T`. It is only a placeholder and must + * never be dereferenced. Non-null pointers are returned unchanged. + */ +template <class T> +inline T* null_to_bogus(T* ptr) { + return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); +} +} // namespace detail + +/** + * Checks whether a potentially invalid UTF-16 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * An empty view whose `data()` is `nullptr` is accepted: the pointer is + * replaced with a bogus non-null one before calling into the C API. + */ +inline Latin1Bidi check_for_latin1_and_bidi(std::u16string_view buffer) { + return encoding_mem_check_utf16_for_latin1_and_bidi( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a potentially invalid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. + * + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * An empty view whose `data()` is `nullptr` is accepted: the pointer is + * replaced with a bogus non-null one before calling into the C API. + */ +inline Latin1Bidi check_for_latin1_and_bidi(std::string_view buffer) { + return encoding_mem_check_utf8_for_latin1_and_bidi( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-16. 
+ * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of `char16_t`s written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline void convert_latin1_to_utf16(gsl::span<const char> src, + gsl::span<char16_t> dst) { + encoding_mem_convert_latin1_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8. + * + * The length of the destination buffer must be at least the length of the + * source buffer times two. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Safety + * + * Note that this function may write garbage beyond the number of bytes + * indicated by the return value. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline size_t convert_latin1_to_utf8(gsl::span<const char> src, + gsl::span<char> dst) { + return encoding_mem_convert_latin1_to_utf8( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient + * output space. + * + * Returns the number of bytes read and the number of bytes written. + * + * If the output isn't large enough, not all input is consumed. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. 
+ */ +inline std::tuple<size_t, size_t> convert_latin1_to_utf8_partial( + gsl::span<const char> src, gsl::span<char> dst) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + encoding_mem_convert_latin1_to_utf8_partial( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + &dst_written); + return {src_read, dst_written}; +} + +/** + * Converts valid UTF-8 to valid UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t convert_str_to_utf16(std::string_view src, + gsl::span<char16_t> dst) { + return encoding_mem_convert_str_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * If the input is valid UTF-16 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, does something + * that is memory-safe without any promises about any properties of the + * output and will probably assert in debug builds in future versions. + * In particular, callers shouldn't assume the output to be the same across + * crate versions or CPU architectures and should not assume that non-ASCII + * input can't map to ASCII output. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of bytes written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ * (Probably in future versions if debug assertions are enabled (and not + * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) + */ +inline void convert_utf16_to_latin1_lossy(std::u16string_view src, + gsl::span<char> dst) { + encoding_mem_convert_utf16_to_latin1_lossy( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer times three. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t convert_utf16_to_utf8(std::u16string_view src, + gsl::span<char> dst) { + return encoding_mem_convert_utf16_to_utf8( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER with potentially insufficient output + * space. + * + * Returns the number of code units read and the number of bytes written. + * + * Guarantees that the bytes in the destination beyond the number of + * bytes claimed as written by the second item of the return tuple + * are left unmodified. + * + * Not all code units are read if there isn't enough output space. + * Note that this method isn't designed for general streamability but for + * not allocating memory for the worst case up front. Specifically, + * if the input starts with or ends with an unpaired surrogate, those are + * replaced with the REPLACEMENT CHARACTER. + * + * Matches the semantics of `TextEncoder.encodeInto()` from the + * Encoding Standard. 
+ */ +inline std::tuple<size_t, size_t> convert_utf16_to_utf8_partial( + std::u16string_view src, gsl::span<char> dst) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + encoding_mem_convert_utf16_to_utf8_partial( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + &dst_written); + return {src_read, dst_written}; +} + +/** + * If the input is valid UTF-8 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, this function + * panics if debug assertions are enabled (and fuzzing isn't) and otherwise + * does something that is memory-safe without any promises about any + * properties of the output. In particular, callers shouldn't assume the + * output to be the same across crate versions or CPU architectures and + * should not assume that non-ASCII input can't map to ASCII output. + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * If debug assertions are enabled (and not fuzzing) and the input is + * not in the range U+0000 to U+00FF, inclusive. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. 
+ */ +inline size_t convert_utf8_to_latin1_lossy(std::string_view src, + gsl::span<char> dst) { + return encoding_mem_convert_utf8_to_latin1_lossy( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer _plus one_. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t convert_utf8_to_utf16(std::string_view src, + gsl::span<char16_t> dst) { + return encoding_mem_convert_utf8_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written or `std::nullopt` if the input was + * invalid. + * + * When the input was invalid, some output may have been written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ */ +inline std::optional<size_t> convert_utf8_to_utf16_without_replacement( + std::string_view src, gsl::span<char16_t> dst) { + size_t val = encoding_mem_convert_utf8_to_utf16_without_replacement( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; +} + +/** + * Copies ASCII from source to destination up to the first non-ASCII byte + * (or the end of the input if it is ASCII in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline size_t copy_ascii_to_ascii(gsl::span<const char> src, + gsl::span<char> dst) { + return encoding_mem_copy_ascii_to_ascii( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Copies ASCII from source to destination zero-extending it to UTF-16 up to + * the first non-ASCII byte (or the end of the input if it is ASCII in its + * entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ */ +inline size_t copy_ascii_to_basic_latin(gsl::span<const char> src, + gsl::span<char16_t> dst) { + return encoding_mem_copy_ascii_to_basic_latin( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Copies Basic Latin from source to destination narrowing it to ASCII up to + * the first non-Basic Latin code unit (or the end of the input if it is + * Basic Latin in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t copy_basic_latin_to_ascii(gsl::span<const char16_t> src, + gsl::span<char> dst) { + return encoding_mem_copy_basic_latin_to_ascii( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. + */ +inline void ensure_utf16_validity(gsl::span<char16_t> buffer) { + encoding_mem_ensure_utf16_validity( + encoding_rs::mem::detail::null_to_bogus<char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is all-ASCII. + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + */ +inline bool is_ascii(std::string_view buffer) { + return encoding_mem_is_ascii( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing + * only ASCII characters). + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) 
+ */ +inline bool is_ascii(std::u16string_view buffer) { + return encoding_mem_is_basic_latin( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a scalar value triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. + */ +inline bool is_scalar_value_bidi(char32_t c) { + return encoding_mem_is_char_bidi(c); +} + +/** + * Checks whether a UTF-16 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. 
+ * Returns `true` if the input contains an RTL character or an unpaired + * high surrogate that could be the high half of an RTL character. + * Returns `false` if the input contains neither RTL characters nor + * unpaired high surrogates that could be higher halves of RTL characters. + */ +inline bool is_bidi(std::u16string_view buffer) { + return encoding_mem_is_utf16_bidi( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a UTF-16 code unit triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Since supplementary-plane right-to-left blocks are identifiable from the + * high surrogate without examining the low surrogate, this function returns + * `true` for such high surrogates making the function suitable for handling + * supplementary-plane text without decoding surrogate pairs to scalar + * values. Obviously, such high surrogates are then reported as right-to-left + * even if actually unpaired. + */ +inline bool is_utf16_code_unit_bidi(char16_t u) { + return encoding_mem_is_utf16_code_unit_bidi(u); +} + +/** + * Checks whether the buffer represents only code point less than or equal + * to U+00FF. + * + * May read the entire buffer even if it isn't all-Latin1. (I.e. the function + * is not guaranteed to fail fast.) 
+ */ +inline bool is_utf16_latin1(std::u16string_view buffer) { + return encoding_mem_is_utf16_latin1( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a potentially-invalid UTF-8 buffer contains code points + * that trigger right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Returns `true` if the input is invalid UTF-8 or the input contains an + * RTL character. Returns `false` if the input is valid UTF-8 and contains + * no RTL characters. + */ +inline bool is_bidi(std::string_view buffer) { + return encoding_mem_is_utf8_bidi( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is valid UTF-8 representing only code points + * less than or equal to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if UTF-8 + * invalidity or code points above U+00FF are discovered. + */ +inline bool is_utf8_latin1(std::string_view buffer) { + return encoding_mem_is_utf8_latin1( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Returns the index of the first unpaired surrogate or, if the input is + * valid UTF-16 in its entirety, the length of the input. 
+ */ +inline size_t utf16_valid_up_to(std::u16string_view buffer) { + return encoding_mem_utf16_valid_up_to( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Returns the index of first byte that starts a non-Latin1 byte + * sequence, or the length of the string if there are none. + */ +inline size_t utf8_latin1_up_to(std::string_view buffer) { + return encoding_mem_utf8_latin1_up_to( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +}; // namespace mem +}; // namespace encoding_rs + +#endif // encoding_rs_mem_cpp_h_ |