diff options
Diffstat (limited to '')
24 files changed, 6391 insertions, 0 deletions
diff --git a/third_party/rust/encoding_c/.cargo-checksum.json b/third_party/rust/encoding_c/.cargo-checksum.json new file mode 100644 index 0000000000..eabf0ea825 --- /dev/null +++ b/third_party/rust/encoding_c/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"CONTRIBUTING.md":"8cd9262df951c4b42078aa55064ca3b8ef2676c06b8fc7c281c02ee3f1ae04a8","COPYRIGHT":"6c7cd6277ece1edbc9f653eb1812bb98dc7ada4137525f0612938490f7819d9a","Cargo.toml":"c78bebead132f39eb39f477f28a226c873320d681247f05b4e9745f67e5468c4","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"8e9d1ef3b3f19831c622ab7dd455dd405c3c1a25b459e556d55ec198bdd59a68","build-disabled.rs":"2cc34f4c96a235c1ec256a5b6981b08b45d420463a6c997d6ce819462626b74b","build.rs":"013c85c18b035473d3a0900b833906304a8431882e5c22053684a69588adde98","include/encoding_rs.h":"328efc1a6ee9f0fb81b1db5286f24c0cdbcabcaa123d8b209c0000ba2d618c7f","include/encoding_rs_cpp.h":"d4dcae03cc5d8127b5e944f80691cb95990a116bca9a5044ecdfd30ed569c659","include/encoding_rs_statics.h":"96a2595ad7e209a5f393e61d46899ec484329693ac164455074e041482625c9d","src/lib.rs":"98bed946e18cdb1993d46aeb644435ec0d850738fbceaba84742a83934d6c454"},"package":"9af727805f3b0d79956bde5b35732669fb5c5d45a94893798e7b7e70cfbf9cc1"}
\ No newline at end of file diff --git a/third_party/rust/encoding_c/CONTRIBUTING.md b/third_party/rust/encoding_c/CONTRIBUTING.md new file mode 100644 index 0000000000..62597bf029 --- /dev/null +++ b/third_party/rust/encoding_c/CONTRIBUTING.md @@ -0,0 +1,38 @@ +If you send a pull request / patch, please observe the following. + +## Licensing + +Since this crate is dual-licensed, +[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) +is considered to apply in the sense of Contributions being automatically +under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). +That is, by the act of offering a Contribution, you place your Contribution +under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` +file. Please do not contribute if you aren't willing or allowed to license your +contributions in this manner. + +You are encouraged to dedicate test code that you contribute to the Public +Domain using the CC0 dedication. If you contribute test code that is not +dedicated to the Public Domain, please be sure not to put it in a part of +source code that the comments designate as being dedicated to the Public +Domain. + +## Copyright Notices + +If you require the addition of your copyright notice, it's up to you to edit in +your notice as part of your Contribution. Not adding a copyright notice is +taken as a waiver of copyright notice. + +## Compatibility with Stable Rust + +Please ensure that your Contribution compiles with the latest stable-channel +rustc. + +## rustfmt + +Please install [`rustfmt`](https://github.com/rust-lang-nursery/rustfmt) 0.4.1 +(the latest version has +[a bug](https://github.com/rust-lang-nursery/rustfmt/issues/1149) that renders +it unsuited for encoding_rs) and run `cargo fmt` before creating a pull +request. (It's OK for `cargo fmt` to exit with an error due to too long lines.) 
+ diff --git a/third_party/rust/encoding_c/COPYRIGHT b/third_party/rust/encoding_c/COPYRIGHT new file mode 100644 index 0000000000..2cb666fddd --- /dev/null +++ b/third_party/rust/encoding_c/COPYRIGHT @@ -0,0 +1,9 @@ +encoding_c is copyright Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 +<LICENSE-APACHE or +https://www.apache.org/licenses/LICENSE-2.0> or the MIT +license <LICENSE-MIT or https://opensource.org/licenses/MIT>, +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. diff --git a/third_party/rust/encoding_c/Cargo.toml b/third_party/rust/encoding_c/Cargo.toml new file mode 100644 index 0000000000..ee7ecb6acb --- /dev/null +++ b/third_party/rust/encoding_c/Cargo.toml @@ -0,0 +1,39 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. 
If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "encoding_c" +version = "0.9.8" +authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"] +build = "build.rs" +links = "encoding_c" +description = "C API for encoding_rs" +homepage = "https://docs.rs/encoding_c/" +documentation = "https://docs.rs/encoding_c/" +readme = "README.md" +keywords = ["ffi", "capi", "encoding", "unicode", "charset"] +license = "Apache-2.0 OR MIT" +repository = "https://github.com/hsivonen/encoding_c" +[dependencies.encoding_rs] +version = "0.8.20" + +[features] +fast-big5-hanzi-encode = ["encoding_rs/fast-big5-hanzi-encode"] +fast-gb-hanzi-encode = ["encoding_rs/fast-gb-hanzi-encode"] +fast-hangul-encode = ["encoding_rs/fast-hangul-encode"] +fast-hanja-encode = ["encoding_rs/fast-hanja-encode"] +fast-kanji-encode = ["encoding_rs/fast-kanji-encode"] +fast-legacy-encode = ["encoding_rs/fast-legacy-encode"] +less-slow-big5-hanzi-encode = ["encoding_rs/less-slow-big5-hanzi-encode"] +less-slow-gb-hanzi-encode = ["encoding_rs/less-slow-gb-hanzi-encode"] +less-slow-kanji-encode = ["encoding_rs/less-slow-kanji-encode"] +simd-accel = ["encoding_rs/simd-accel"] diff --git a/third_party/rust/encoding_c/LICENSE-APACHE b/third_party/rust/encoding_c/LICENSE-APACHE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/third_party/rust/encoding_c/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/third_party/rust/encoding_c/LICENSE-MIT b/third_party/rust/encoding_c/LICENSE-MIT new file mode 100644 index 0000000000..3317c82e2f --- /dev/null +++ b/third_party/rust/encoding_c/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/third_party/rust/encoding_c/README.md b/third_party/rust/encoding_c/README.md new file mode 100644 index 0000000000..d4c80865ff --- /dev/null +++ b/third_party/rust/encoding_c/README.md @@ -0,0 +1,131 @@ +# encoding_c + +[![crates.io](https://meritbadge.herokuapp.com/encoding_c)](https://crates.io/crates/encoding_c) +[![docs.rs](https://docs.rs/encoding_c/badge.svg)](https://docs.rs/encoding_c/) +[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT) + +encoding_c is an FFI wrapper for [encoding_rs](https://github.com/hsivonen/encoding_rs). 
+ +## Bindings for `encoding_rs::mem` + +See the [`encoding_c_mem` crate](https://crates.io/crates/encoding_c_mem) +for bindings for `encoding_rs::mem`. + +## Licensing + +Please see the file named +[COPYRIGHT](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT). + +## No Unwinding Support! + +This crate is meant for use in binaries compiled with `panic = 'abort'`, which +is _required_ for correctness! Unwinding across FFI is Undefined Behavior, and +this crate does nothing to try to prevent unwinding across the FFI if +compiled with unwinding enabled. + +## C/C++ Headers + +`include/encoding_rs.h` and `include/encoding_rs_statics.h` are needed for C +usage. + +`include/encoding_rs_cpp.h` is a sample C++ API built on top of the C API using +GSL and the C++ standard library. Since C++ projects typically roll their own +string classes, etc., it's probably necessary for C++ projects to manually +adapt the header to their replacements of standard-library types. + +There's a [write-up](https://hsivonen.fi/modern-cpp-in-rust/) about the C++ +wrappers. + +## Release Notes + +### 0.9.8 + +* Remove year from copyright notices. + +### 0.9.7 + +* Specify a `links` value in the Cargo manifest. +* Emit an `include_dir` variable from build script so that other build scripts + depending on this crate can rely on it. + +### 0.9.6 + +* Fix a bug in the C++ header. + +### 0.9.5 + +* Adjust documentation for encoding_rs 0.8.20. + +### 0.9.4 + +* Fix bogus C header. + +### 0.9.3 + +* Fix bogus C++ header. + +### 0.9.2 + +* Wrap `Decoder::latin1_byte_compatible_up_to`. + +### 0.9.1 + +* Wrap `Encoding::is_single_byte()`. +* Pass through new feature flags introduced in encoding_rs 0.8.11. + +### 0.9.0 + +* Update to encoding_rs 0.8.0. + +### 0.8.0 + +* Update to encoding_rs 0.7.0. +* Drop `encoding_for_name()`. +* Deal correctly with the `data()` method of `gsl::span` returning `nullptr`. 
+ +### 0.7.6 + +* Rename `ENCODING_RS_NON_NULL_CONST_ENCODING_PTR` to + `ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR`. (Not a breaking change, + because defining that macro broke the build previously, so the + macro couldn't have been used.) +* Use the macro only for statics and not for return values. + +### 0.7.5 + +* Annotate the encoding pointers that should be wrapped with a + same-representation not-null type in C++ as + `ENCODING_RS_NON_NULL_CONST_ENCODING_PTR`. + +### 0.7.4 + +* Wrap `has_pending_state()`. + +### 0.7.3 + +* Use C preprocessor definitions for encoding constant declarations. + +### 0.7.2 + +* Parametrize the struct type names behind C preprocessor definitions. +* Leave it to the user to provide `char16_t`. Avoid including a header for it. + +### 0.7.1 + +* Fix documentation for pointers that get used in + `std::slice::from_raw_parts()`. + +### 0.7.0 + +* Map `None` to `SIZE_MAX` in the max length calculation functions. + +### 0.6.0 + +* Check in the `cheddar`-generated header and comment out the `cheddar`-using + `build.rs`. + +### 0.5.0 + +* Initial release of encoding_c. (I.e. first release with FFI in a distinct + crate.) + diff --git a/third_party/rust/encoding_c/build-disabled.rs b/third_party/rust/encoding_c/build-disabled.rs new file mode 100644 index 0000000000..ebd7493626 --- /dev/null +++ b/third_party/rust/encoding_c/build-disabled.rs @@ -0,0 +1,60 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +extern crate cheddar; + +use std::io::prelude::*; +use std::fs::File; + +fn replace(path: &str) -> std::io::Result<()> { + let mut f = try!(File::open(path)); + let mut s = String::new(); + try!(f.read_to_string(&mut s)); + s = s.replace("#ifndef cheddar_generated_encoding_rs_h", "// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. +// Instead, please regenerate using encoding_c/build.rs. + +#ifndef cheddar_generated_encoding_rs_h"); + s = s.replace("uint16_t", "char16_t"); + s = s.replace("uintptr_t", "size_t"); + s = s.replace("Encoding", "ENCODING_RS_ENCODING"); + s = s.replace("Encoder", "ENCODING_RS_ENCODER"); + s = s.replace("Decoder", "ENCODING_RS_DECODER"); + s = s.replace("ENCODING_RS_ENCODING.html", "Encoding.html"); + s = s.replace("ENCODING_RS_ENCODER.html", "Encoder.html"); + s = s.replace("ENCODING_RS_DECODER.html", "Decoder.html"); + s = s.replace("#include <stdbool.h>", + "#include <stdbool.h>\n#include \"encoding_rs_statics.h\""); + let mut f = try!(File::create(path)); + try!(f.write_all(s.as_bytes())); + Ok(()) +} + +fn main() { + println!("cargo:rerun-if-changed=src/lib.rs"); + + let path = "include/encoding_rs.h"; + + cheddar::Cheddar::new() + .expect("could not read manifest") + .run_build(path); + + match replace(path) { + Ok(_) => {} + Err(e) => println!("Performing replacements failed {}.", e), + } +} diff --git a/third_party/rust/encoding_c/build.rs b/third_party/rust/encoding_c/build.rs new file mode 100644 index 0000000000..962b7ae12b --- /dev/null +++ b/third_party/rust/encoding_c/build.rs @@ -0,0 +1,7 @@ +fn main() { + 
+ println!("cargo:rerun-if-changed="); + + let cargo_manifest_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap(); + let include_dir = std::path::PathBuf::from(cargo_manifest_dir).join("include"); + println!("cargo:include-dir={}", include_dir.display()); +} diff --git a/third_party/rust/encoding_c/include/encoding_rs.h b/third_party/rust/encoding_c/include/encoding_rs.h new file mode 100644 index 0000000000..39231b7a0f --- /dev/null +++ b/third_party/rust/encoding_c/include/encoding_rs.h @@ -0,0 +1,692 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. +// Instead, please regenerate using encoding_c/build.rs. + +#ifndef cheddar_generated_encoding_rs_h +#define cheddar_generated_encoding_rs_h + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> +#include <stdint.h> +#include "encoding_rs_statics.h" + +/// Implements the +/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get) +/// algorithm. +/// +/// If, after ASCII-lowercasing and removing leading and trailing +/// whitespace, the argument matches a label defined in the Encoding +/// Standard, `const ENCODING_RS_ENCODING*` representing the corresponding +/// encoding is returned. If there is no match, `NULL` is returned. +/// +/// This is the right function to use if the action upon the method returning +/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) instead. 
+/// When the action upon the method returning `NULL` is not to proceed with +/// a fallback but to refuse processing, `encoding_for_label_no_replacement()` +/// is more appropriate. +/// +/// The argument buffer can be in any ASCII-compatible encoding. It is not +/// required to be UTF-8. +/// +/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len` +/// is zero, it is OK for `label` to be something non-dereferencable, +/// such as `0x1`. This is required due to Rust's optimization for slices +/// within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `label` and `label_len` don't designate a valid memory block +/// or if `label` is `NULL`. +ENCODING_RS_ENCODING const* encoding_for_label(uint8_t const* label, + size_t label_len); + +/// This function behaves the same as `encoding_for_label()`, except when +/// `encoding_for_label()` would return `REPLACEMENT_ENCODING`, this method +/// returns `NULL` instead. +/// +/// This method is useful in scenarios where a fatal error is required +/// upon invalid label, because in those cases the caller typically wishes +/// to treat the labels that map to the replacement encoding as fatal +/// errors, too. +/// +/// It is not OK to use this function when the action upon the method returning +/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In +/// such a case, the `encoding_for_label()` function should be used instead +/// in order to avoid unsafe fallback for labels that `encoding_for_label()` +/// maps to `REPLACEMENT_ENCODING`. +/// +/// The argument buffer can be in any ASCII-compatible encoding. It is not +/// required to be UTF-8. +/// +/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len` +/// is zero, it is OK for `label` to be something non-dereferencable, +/// such as `0x1`. This is required due to Rust's optimization for slices +/// within `Option`. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `label` and `label_len` don't designate a valid memory block +/// or if `label` is `NULL`. +ENCODING_RS_ENCODING const* encoding_for_label_no_replacement( + uint8_t const* label, size_t label_len); + +/// Performs non-incremental BOM sniffing. +/// +/// The argument must either be a buffer representing the entire input +/// stream (non-streaming case) or a buffer representing at least the first +/// three bytes of the input stream (streaming case). +/// +/// Returns `UTF_8_ENCODING`, `UTF_16LE_ENCODING` or `UTF_16BE_ENCODING` if the +/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `NULL` +/// otherwise. Upon return, `*buffer_len` is the length of the BOM (zero if +/// there is no BOM). +/// +/// `buffer` must be non-`NULL` even if `*buffer_len` is zero. When +/// `*buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `*buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +ENCODING_RS_ENCODING const* encoding_for_bom(uint8_t const* buffer, + size_t* buffer_len); + +/// Writes the name of the given `ENCODING_RS_ENCODING` to a caller-supplied +/// buffer as ASCII and returns the number of bytes / ASCII characters written. +/// +/// The output is not null-terminated. +/// +/// The caller _MUST_ ensure that `name_out` points to a buffer whose length +/// is at least `ENCODING_NAME_MAX_LENGTH` bytes. +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL` or if `name_out` doesn't point to +/// a valid block of memory whose length is at least +/// `ENCODING_NAME_MAX_LENGTH` bytes. +size_t encoding_name(ENCODING_RS_ENCODING const* encoding, uint8_t* name_out); + +/// Checks whether the _output encoding_ of this encoding can encode every +/// Unicode scalar. 
(Only true if the output encoding is UTF-8.) +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +bool encoding_can_encode_everything(ENCODING_RS_ENCODING const* encoding); + +/// Checks whether the bytes 0x00...0x7F map exclusively to the characters +/// U+0000...U+007F and vice versa. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +bool encoding_is_ascii_compatible(ENCODING_RS_ENCODING const* encoding); + +/// Checks whether this encoding maps one byte to one Basic Multilingual +/// Plane code point (i.e. byte length equals decoded UTF-16 length) and +/// vice versa (for mappable characters). +/// +/// `true` iff this encoding is on the list of [Legacy single-byte +/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) +/// in the spec or x-user-defined. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +bool encoding_is_single_byte(ENCODING_RS_ENCODING const* encoding); + +/// Returns the _output encoding_ of this encoding. This is UTF-8 for +/// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +ENCODING_RS_ENCODING const* encoding_output_encoding( + ENCODING_RS_ENCODING const* encoding); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// on the heap with BOM sniffing enabled and returns a pointer to the +/// newly-allocated `ENCODING_RS_DECODER`. +/// +/// BOM sniffing may cause the returned decoder to morph into a decoder +/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. +/// +/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller +/// _MUST_ deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. 
+ENCODING_RS_DECODER* encoding_new_decoder(ENCODING_RS_ENCODING const* encoding); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// on the heap with BOM removal and returns a pointer to the newly-allocated +/// `ENCODING_RS_DECODER`. +/// +/// If the input starts with bytes that are the BOM for this encoding, +/// those bytes are removed. However, the decoder never morphs into a +/// decoder for another encoding: A BOM for another encoding is treated as +/// (potentially malformed) input to the decoding algorithm for this +/// encoding. +/// +/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller +/// _MUST_ deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +ENCODING_RS_DECODER* encoding_new_decoder_with_bom_removal( + ENCODING_RS_ENCODING const* encoding); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// on the heap with BOM handling disabled and returns a pointer to the +/// newly-allocated `ENCODING_RS_DECODER`. +/// +/// If the input starts with bytes that look like a BOM, those bytes are +/// not treated as a BOM. (Hence, the decoder never morphs into a decoder +/// for another encoding.) +/// +/// _Note:_ If the caller has performed BOM sniffing on its own but has not +/// removed the BOM, the caller should use +/// `encoding_new_decoder_with_bom_removal()` instead of this function to cause +/// the BOM to be removed. +/// +/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller +/// _MUST_ deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. 
+ENCODING_RS_DECODER* encoding_new_decoder_without_bom_handling( + ENCODING_RS_ENCODING const* encoding); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// into memory provided by the caller with BOM sniffing enabled. (In practice, +/// the target should likely be a pointer previously returned by +/// `encoding_new_decoder()`.) +/// +/// Note: If the caller has already performed BOM sniffing but has +/// not removed the BOM, the caller should still use this function in +/// order to cause the BOM to be ignored. +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +void encoding_new_decoder_into(ENCODING_RS_ENCODING const* encoding, + ENCODING_RS_DECODER* decoder); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// into memory provided by the caller with BOM removal. +/// +/// If the input starts with bytes that are the BOM for this encoding, +/// those bytes are removed. However, the decoder never morphs into a +/// decoder for another encoding: A BOM for another encoding is treated as +/// (potentially malformed) input to the decoding algorithm for this +/// encoding. +/// +/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller +/// _MUST_ deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +void encoding_new_decoder_with_bom_removal_into( + ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder); + +/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` +/// into memory provided by the caller with BOM handling disabled. +/// +/// If the input starts with bytes that look like a BOM, those bytes are +/// not treated as a BOM. (Hence, the decoder never morphs into a decoder +/// for another encoding.) 
+/// +/// _Note:_ If the caller has performed BOM sniffing on its own but has not +/// removed the BOM, the caller should use +/// `encoding_new_decoder_with_bom_removal_into()` instead of this function to +/// cause the BOM to be removed. +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +void encoding_new_decoder_without_bom_handling_into( + ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder); + +/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING` +/// on the heap and returns a pointer to the newly-allocated +/// `ENCODING_RS_ENCODER`. (Exception: if the `ENCODING_RS_ENCODING` is +/// `replacement`, a new `ENCODING_RS_ENCODER` for UTF-8 is instantiated, and +/// that `ENCODING_RS_ENCODER` reports `UTF_8` as its `ENCODING_RS_ENCODING`.) +/// +/// Once the allocated `ENCODING_RS_ENCODER` is no longer needed, the caller +/// _MUST_ deallocate it by passing the pointer returned by this function to +/// `encoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +ENCODING_RS_ENCODER* encoding_new_encoder(ENCODING_RS_ENCODING const* encoding); + +/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING` +/// into memory provided by the caller. (In practice, the target should likely +/// be a pointer previously returned by `encoding_new_encoder()`.) +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +void encoding_new_encoder_into(ENCODING_RS_ENCODING const* encoding, + ENCODING_RS_ENCODER* encoder); + +/// Validates UTF-8. +/// +/// Returns the index of the first byte that makes the input malformed as +/// UTF-8 or `buffer_len` if `buffer` is entirely valid. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); + +/// Validates ASCII. +/// +/// Returns the index of the first byte that makes the input malformed as +/// ASCII or `buffer_len` if `buffer` is entirely valid. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len); + +/// Validates ISO-2022-JP ASCII-state data. +/// +/// Returns the index of the first byte that makes the input not representable +/// in the ASCII state of ISO-2022-JP or `buffer_len` if `buffer` is entirely +/// representable in the ASCII state of ISO-2022-JP. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +size_t encoding_iso_2022_jp_ascii_valid_up_to(uint8_t const* buffer, + size_t buffer_len); + +/// Deallocates an `ENCODING_RS_DECODER` previously allocated by +/// `encoding_new_decoder()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +void decoder_free(ENCODING_RS_DECODER* decoder); + +/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_DECODER` is for. 
+/// +/// BOM sniffing can change the return value of this method during the life +/// of the decoder. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +ENCODING_RS_ENCODING const* decoder_encoding( + ENCODING_RS_DECODER const* decoder); + +/// Query the worst-case UTF-8 output size _with replacement_. +/// +/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`) +/// that will not overflow given the current state of the decoder and +/// `byte_length` number of additional input bytes when decoding with +/// errors handled by outputting a REPLACEMENT CHARACTER for each malformed +/// sequence or `SIZE_MAX` if `size_t` would overflow. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +size_t decoder_max_utf8_buffer_length(ENCODING_RS_DECODER const* decoder, + size_t byte_length); + +/// Query the worst-case UTF-8 output size _without replacement_. +/// +/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`) +/// that will not overflow given the current state of the decoder and +/// `byte_length` number of additional input bytes when decoding without +/// replacement error handling or `SIZE_MAX` if `size_t` would overflow. +/// +/// Note that this value may be too small for the `_with_replacement` case. +/// Use `decoder_max_utf8_buffer_length()` for that case. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +size_t decoder_max_utf8_buffer_length_without_replacement( + ENCODING_RS_DECODER const* decoder, size_t byte_length); + +/// Incrementally decode a byte stream into UTF-8 with malformed sequences +/// replaced with the REPLACEMENT CHARACTER. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `src_len` is zero. 
When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +uint32_t decoder_decode_to_utf8(ENCODING_RS_DECODER* decoder, + uint8_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last, + bool* had_replacements); + +/// Incrementally decode a byte stream into UTF-8 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +uint32_t decoder_decode_to_utf8_without_replacement( + ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last); + +/// Query the worst-case UTF-16 output size (with or without replacement). 
+/// +/// Returns the size of the output buffer in UTF-16 code units (`char16_t`) +/// that will not overflow given the current state of the decoder and +/// `byte_length` number of additional input bytes or `SIZE_MAX` if `size_t` +/// would overflow. +/// +/// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the +/// return value of this method applies also in the +/// `_without_replacement` case. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +size_t decoder_max_utf16_buffer_length(ENCODING_RS_DECODER const* decoder, + size_t u16_length); + +/// Incrementally decode a byte stream into UTF-16 with malformed sequences +/// replaced with the REPLACEMENT CHARACTER. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `src_len` is zero. When`src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +uint32_t decoder_decode_to_utf16(ENCODING_RS_DECODER* decoder, + uint8_t const* src, size_t* src_len, + char16_t* dst, size_t* dst_len, bool last, + bool* had_replacements); + +/// Incrementally decode a byte stream into UTF-16 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics. 
+/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +uint32_t decoder_decode_to_utf16_without_replacement( + ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, + char16_t* dst, size_t* dst_len, bool last); + +/// Checks for compatibility with storing Unicode scalar values as unsigned +/// bytes taking into account the state of the decoder. +/// +/// Returns `SIZE_MAX` if the decoder is not in a neutral state, including waiting +/// for the BOM, or if the encoding is never Latin1-byte-compatible. +/// +/// Otherwise returns the index of the first byte whose unsigned value doesn't +/// directly correspond to the decoded Unicode scalar value, or the length +/// of the input if all bytes in the input decode directly to scalar values +/// corresponding to the unsigned byte values. +/// +/// Does not change the state of the decoder. +/// +/// Do not use this unless you are supporting SpiderMonkey/V8-style string +/// storage optimizations. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +size_t decoder_latin1_byte_compatible_up_to(ENCODING_RS_DECODER const* decoder, + uint8_t const* buffer, + size_t buffer_len); + +/// Deallocates an `ENCODING_RS_ENCODER` previously allocated by +/// `encoding_new_encoder()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. 
+void encoder_free(ENCODING_RS_ENCODER* encoder); + +/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_ENCODER` is for. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +ENCODING_RS_ENCODING const* encoder_encoding( + ENCODING_RS_ENCODER const* encoder); + +/// Returns `true` if this is an ISO-2022-JP encoder that's not in the +/// ASCII state and `false` otherwise. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +bool encoder_has_pending_state(ENCODING_RS_ENCODER const* encoder); + +/// Query the worst-case output size when encoding from UTF-8 with +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `byte_length` number of +/// additional input code units if there are no unmappable characters in +/// the input or `SIZE_MAX` if `size_t` would overflow. +size_t encoder_max_buffer_length_from_utf8_if_no_unmappables( + ENCODING_RS_ENCODER const* encoder, size_t byte_length); + +/// Query the worst-case output size when encoding from UTF-8 without +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `byte_length` number of +/// additional input code units or `SIZE_MAX` if `size_t` would overflow. +size_t encoder_max_buffer_length_from_utf8_without_replacement( + ENCODING_RS_ENCODER const* encoder, size_t byte_length); + +/// Incrementally encode into byte stream from UTF-8 with unmappable +/// characters replaced with HTML (decimal) numeric character references. +/// +/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe! +/// If in doubt, check the validity of input before using! +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics. 
+/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +uint32_t encoder_encode_from_utf8(ENCODING_RS_ENCODER* encoder, + uint8_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last, + bool* had_replacements); + +/// Incrementally encode into byte stream from UTF-8 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics. +/// +/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe! +/// If in doubt, check the validity of input before using! +/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. 
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +uint32_t encoder_encode_from_utf8_without_replacement( + ENCODING_RS_ENCODER* encoder, uint8_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last); + +/// Query the worst-case output size when encoding from UTF-16 with +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `u16_length` number of +/// additional input code units if there are no unmappable characters in +/// the input or `SIZE_MAX` if `size_t` would overflow. +size_t encoder_max_buffer_length_from_utf16_if_no_unmappables( + ENCODING_RS_ENCODER const* encoder, size_t u16_length); + +/// Query the worst-case output size when encoding from UTF-16 without +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `u16_length` number of +/// additional input code units or `SIZE_MAX` if `size_t` would overflow. +size_t encoder_max_buffer_length_from_utf16_without_replacement( + ENCODING_RS_ENCODER const* encoder, size_t u16_length); + +/// Incrementally encode into byte stream from UTF-16 with unmappable +/// characters replaced with HTML (decimal) numeric character references. +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. 
+/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +uint32_t encoder_encode_from_utf16(ENCODING_RS_ENCODER* encoder, + char16_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last, + bool* had_replacements); + +/// Incrementally encode into byte stream from UTF-16 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len` +/// don't designate a valid block of memory or `dst` and `dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +uint32_t encoder_encode_from_utf16_without_replacement( + ENCODING_RS_ENCODER* encoder, char16_t const* src, size_t* src_len, + uint8_t* dst, size_t* dst_len, bool last); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third_party/rust/encoding_c/include/encoding_rs_cpp.h b/third_party/rust/encoding_c/include/encoding_rs_cpp.h new file mode 100644 index 0000000000..4ec5181ee9 --- /dev/null +++ b/third_party/rust/encoding_c/include/encoding_rs_cpp.h @@ -0,0 +1,1351 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#pragma once + +#ifndef encoding_rs_cpp_h_ +#define encoding_rs_cpp_h_ + +#include <memory> +#include <optional> +#include <string> +#include <string_view> +#include <tuple> +#include <vector> +#include "gsl/gsl" + +namespace encoding_rs { +class Encoding; +class Decoder; +class Encoder; +}; // namespace encoding_rs + +#define ENCODING_RS_ENCODING encoding_rs::Encoding +#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \ + gsl::not_null<const encoding_rs::Encoding*> +#define ENCODING_RS_ENCODER encoding_rs::Encoder +#define ENCODING_RS_DECODER encoding_rs::Decoder + +#include "encoding_rs.h" + +namespace encoding_rs { + +/** + * A converter that decodes a byte stream into Unicode according to a + * character encoding in a streaming (incremental) manner. + * + * The various `decode_*` methods take an input buffer (`src`) and an output + * buffer `dst` both of which are caller-allocated. There are variants for + * both UTF-8 and UTF-16 output buffers. + * + * A `decode_*` method decodes bytes from `src` into Unicode characters stored + * into `dst` until one of the following three things happens: + * + * 1. A malformed byte sequence is encountered (`*_without_replacement` + * variants only). + * + * 2. The output buffer has been filled so near capacity that the decoder + * cannot be sure that processing an additional byte of input wouldn't + * cause so much output that the output buffer would overflow. + * + * 3. All the input bytes have been processed. 
+ * + * The `decode_*` method then returns a tuple of a status indicating which one + * of the three reasons to return happened, how many input bytes were read, + * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t` + * when decoding to UTF-16) were written, and in the case of the + * variants performing replacement, a boolean indicating whether an error was + * replaced with the REPLACEMENT CHARACTER during the call. + * + * The number of bytes "written" is what's logically written. Garbage may be + * written in the output buffer beyond the point logically written to. + * + * In the case of the `*_without_replacement` variants, the status is a + * `uint32_t` whose possible values are packed info about a malformed byte + * sequence, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding to the three cases + * listed above). + * + * Packed info about malformed sequences has the following format: + * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3, + * indicate the number of bytes that were consumed after the malformed + * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate + * the length of the malformed byte sequence (possible decimal values 1, 2, + * 3 or 4). The maximum possible sum of the two is 6. + * + * In the case of methods whose name does not end with + * `*_without_replacement`, malformed sequences are automatically replaced + * with the REPLACEMENT CHARACTER and errors do not cause the methods to + * return early. + * + * When decoding to UTF-8, the output buffer must have at least 4 bytes of + * space. When decoding to UTF-16, the output buffer must have at least two + * UTF-16 code units (`char16_t`) of space. + * + * When decoding to UTF-8 without replacement, the methods are guaranteed + * not to return indicating that more output space is needed if the length + * of the output buffer is at least the length returned by + * `max_utf8_buffer_length_without_replacement()`. 
When decoding to UTF-8 + * with replacement, the length of the output buffer that guarantees the + * methods not to return indicating that more output space is needed is given + * by `max_utf8_buffer_length()`. When decoding to UTF-16 with + * or without replacement, the length of the output buffer that guarantees + * the methods not to return indicating that more output space is needed is + * given by `max_utf16_buffer_length()`. + * + * The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16, + * and the output after each `decode_*` call is guaranteed to consist of + * complete characters. (I.e. the code unit sequence for the last character is + * guaranteed not to be split across output buffers.) + * + * The boolean argument `last` indicates that the end of the stream is reached + * when all the bytes in `src` have been consumed. + * + * A `Decoder` object can be used to incrementally decode a byte stream. + * + * During the processing of a single stream, the caller must call `decode_*` + * zero or more times with `last` set to `false` and then call `decode_*` at + * least once with `last` set to `true`. If `decode_*` returns `INPUT_EMPTY`, + * the processing of the stream has ended. Otherwise, the caller must call + * `decode_*` again with `last` set to `true` (or treat a malformed result, + * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error). + * + * Once the stream has ended, the `Decoder` object must not be used anymore. + * That is, you need to create another one to process another stream. + * + * When the decoder returns `OUTPUT_FULL` or the decoder returns a malformed + * result and the caller does not wish to treat it as a fatal error, the input + * buffer `src` may not have been completely consumed. In that case, the caller + * must pass the unconsumed contents of `src` to `decode_*` again upon the next + * call. 
+ * + * # Infinite loops + * + * When converting with a fixed-size output buffer whose size is too small to + * accommodate one character of output, an infinite loop ensues. When + * converting with a fixed-size output buffer, it generally makes sense to + * make the buffer fairly large (e.g. a couple of kilobytes). + */ +class Decoder final { + public: + ~Decoder() {} + /** + * Routes `delete` on a `Decoder*` to `decoder_free()`, the C API function + * that deallocates an `ENCODING_RS_DECODER`. + */ + static inline void operator delete(void* decoder) { + decoder_free(reinterpret_cast<Decoder*>(decoder)); + } + + /** + * The `Encoding` this `Decoder` is for. + * + * BOM sniffing can change the return value of this method during the life + * of the decoder. + */ + inline gsl::not_null<const Encoding*> encoding() const { + return gsl::not_null<const Encoding*>(decoder_encoding(this)); + } + + /** + * Query the worst-case UTF-8 output size _with replacement_. + * + * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) + * that will not overflow given the current state of the decoder and + * `byte_length` number of additional input bytes when decoding with + * errors handled by outputting a REPLACEMENT CHARACTER for each malformed + * sequence or `std::optional` without value if `size_t` would overflow. + */ + inline std::optional<size_t> max_utf8_buffer_length( + size_t byte_length) const { + size_t val = decoder_max_utf8_buffer_length(this, byte_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Query the worst-case UTF-8 output size _without replacement_. + * + * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) + * that will not overflow given the current state of the decoder and + * `byte_length` number of additional input bytes when decoding without + * replacement error handling or `std::optional` without value if `size_t` + * would overflow. + * + * Note that this value may be too small for the `_with_replacement` case. + * Use `max_utf8_buffer_length()` for that case. 
+ */ + inline std::optional<size_t> max_utf8_buffer_length_without_replacement( + size_t byte_length) const { + size_t val = + decoder_max_utf8_buffer_length_without_replacement(this, byte_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Incrementally decode a byte stream into UTF-8 with malformed sequences + * replaced with the REPLACEMENT CHARACTER. + * + * See the documentation of the class for documentation for `decode_*` + * methods collectively. + */ + inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf8( + gsl::span<const uint8_t> src, gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + bool had_replacements; + uint32_t result = + decoder_decode_to_utf8(this, null_to_bogus<const uint8_t>(src.data()), + &src_read, null_to_bogus<uint8_t>(dst.data()), + &dst_written, last, &had_replacements); + return {result, src_read, dst_written, had_replacements}; + } + + /** + * Incrementally decode a byte stream into UTF-8 _without replacement_. + * + * See the documentation of the class for documentation for `decode_*` + * methods collectively. + */ + inline std::tuple<uint32_t, size_t, size_t> + decode_to_utf8_without_replacement(gsl::span<const uint8_t> src, + gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + uint32_t result = decoder_decode_to_utf8_without_replacement( + this, null_to_bogus<const uint8_t>(src.data()), &src_read, + null_to_bogus<uint8_t>(dst.data()), &dst_written, last); + return {result, src_read, dst_written}; + } + + /** + * Query the worst-case UTF-16 output size (with or without replacement). + * + * Returns the size of the output buffer in UTF-16 code units (`char16_t`) + * that will not overflow given the current state of the decoder and + * `byte_length` number of additional input bytes or `std::optional` + * without value if `size_t` would overflow. 
+ * + * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the + * return value of this method applies also in the + * `_without_replacement` case. + */ + inline std::optional<size_t> max_utf16_buffer_length( + size_t byte_length) const { + size_t val = decoder_max_utf16_buffer_length(this, byte_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Incrementally decode a byte stream into UTF-16 with malformed sequences + * replaced with the REPLACEMENT CHARACTER. + * + * See the documentation of the class for documentation for `decode_*` + * methods collectively. + */ + inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf16( + gsl::span<const uint8_t> src, gsl::span<char16_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + bool had_replacements; + uint32_t result = + decoder_decode_to_utf16(this, null_to_bogus<const uint8_t>(src.data()), + &src_read, null_to_bogus<char16_t>(dst.data()), + &dst_written, last, &had_replacements); + return {result, src_read, dst_written, had_replacements}; + } + + /** + * Incrementally decode a byte stream into UTF-16 _without replacement_. + * + * See the documentation of the class for documentation for `decode_*` + * methods collectively. + */ + inline std::tuple<uint32_t, size_t, size_t> + decode_to_utf16_without_replacement(gsl::span<const uint8_t> src, + gsl::span<char16_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + uint32_t result = decoder_decode_to_utf16_without_replacement( + this, null_to_bogus<const uint8_t>(src.data()), &src_read, + null_to_bogus<char16_t>(dst.data()), &dst_written, last); + return {result, src_read, dst_written}; + } + + /** + * Checks for compatibility with storing Unicode scalar values as unsigned + * bytes taking into account the state of the decoder. 
+ * + * Returns `std::nullopt` if the decoder is not in a neutral state, including + * waiting for the BOM, or if the encoding is never Latin1-byte-compatible. + * + * Otherwise returns the index of the first byte whose unsigned value doesn't + * directly correspond to the decoded Unicode scalar value, or the length + * of the input if all bytes in the input decode directly to scalar values + * corresponding to the unsigned byte values. + * + * Does not change the state of the decoder. + * + * Do not use this unless you are supporting SpiderMonkey/V8-style string + * storage optimizations. + */ + inline std::optional<size_t> latin1_byte_compatible_up_to( + gsl::span<const uint8_t> buffer) const { + size_t val = decoder_latin1_byte_compatible_up_to( + this, null_to_bogus<const uint8_t>(buffer.data()), + static_cast<size_t>(buffer.size())); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + private: + /** + * Replaces `nullptr` with a bogus pointer suitable for use as part of a + * zero-length Rust slice. + */ + template <class T> + static inline T* null_to_bogus(T* ptr) { + return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); + } + + Decoder() = delete; + Decoder(const Decoder&) = delete; + Decoder& operator=(const Decoder&) = delete; +}; + +/** + * A converter that encodes a Unicode stream into bytes according to a + * character encoding in a streaming (incremental) manner. + * + * The various `encode_*` methods take an input buffer (`src`) and an output + * buffer `dst` both of which are caller-allocated. There are variants for + * both UTF-8 and UTF-16 input buffers. + * + * An `encode_*` method encodes characters from `src` into bytes + * stored into `dst` until one of the following three things happens: + * + * 1. An unmappable character is encountered (`*_without_replacement` variants + * only). + * + * 2. 
The output buffer has been filled so near capacity that the encoder + * cannot be sure that processing an additional character of input wouldn't + * cause so much output that the output buffer would overflow. + * + * 3. All the input characters have been processed. + * + * The `encode_*` method then returns a tuple of a status indicating which one + * of the three reasons to return happened, how many input code units (`uint8_t` + * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read, + * how many output bytes were written, and in the case of the variants that + * perform replacement, a boolean indicating whether an unmappable + * character was replaced with a numeric character reference during the call. + * + * The number of bytes "written" is what's logically written. Garbage may be + * written in the output buffer beyond the point logically written to. + * + * In the case of the methods whose name ends with + * `*_without_replacement`, the status is a `uint32_t` whose possible values + * are an unmappable code point, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding + * to the three cases listed above). + * + * In the case of methods whose name does not end with + * `*_without_replacement`, unmappable characters are automatically replaced + * with the corresponding numeric character references and unmappable + * characters do not cause the methods to return early. + * + * When encoding from UTF-8 without replacement, the methods are guaranteed + * not to return indicating that more output space is needed if the length + * of the output buffer is at least the length returned by + * `max_buffer_length_from_utf8_without_replacement()`. When encoding from + * UTF-8 with replacement, the length of the output buffer that guarantees the + * methods not to return indicating that more output space is needed in the + * absence of unmappable characters is given by + * `max_buffer_length_from_utf8_if_no_unmappables()`. 
When encoding from + * UTF-16 without replacement, the methods are guaranteed not to return + * indicating that more output space is needed if the length of the output + * buffer is at least the length returned by + * `max_buffer_length_from_utf16_without_replacement()`. When encoding + * from UTF-16 with replacement, the length of the output buffer that + * guarantees the methods not to return indicating that more output space is + * needed in the absence of unmappable characters is given by + * `max_buffer_length_from_utf16_if_no_unmappables()`. + * When encoding with replacement, applications are not expected to size the + * buffer for the worst case ahead of time but to resize the buffer if there + * are unmappable characters. This is why max length queries are only available + * for the case where there are no unmappable characters. + * + * When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. When + * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD + * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to + * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that + * surrogate pairs are not split across input buffer boundaries. + * + * After an `encode_*` call returns, the output produced so far, taken as a + * whole from the start of the stream, is guaranteed to consist of a valid + * byte sequence in the target encoding. (I.e. the code unit sequence for a + * character is guaranteed not to be split across output buffers. However, due + * to the stateful nature of ISO-2022-JP, the stream needs to be considered + * from the start for it to be valid. For other encodings, the validity holds + * on a per-output buffer basis.) + * + * The boolean argument `last` indicates that the end of the stream is reached + * when all the characters in `src` have been consumed. This argument is needed + * for ISO-2022-JP and is ignored for other encodings. 
+ * + * An `Encoder` object can be used to incrementally encode a byte stream. + * + * During the processing of a single stream, the caller must call `encode_*` + * zero or more times with `last` set to `false` and then call `encode_*` at + * least once with `last` set to `true`. If `encode_*` returns `INPUT_EMPTY`, + * the processing of the stream has ended. Otherwise, the caller must call + * `encode_*` again with `last` set to `true` (or treat an unmappable result, + * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error). + * + * Once the stream has ended, the `Encoder` object must not be used anymore. + * That is, you need to create another one to process another stream. + * + * When the encoder returns `OUTPUT_FULL` or the encoder returns an unmappable + * result and the caller does not wish to treat it as a fatal error, the input + * buffer `src` may not have been completely consumed. In that case, the caller + * must pass the unconsumed contents of `src` to `encode_*` again upon the next + * call. + * + * # Infinite loops + * + * When converting with a fixed-size output buffer whose size is too small to + * accommodate one character of output, an infinite loop ensues. When + * converting with a fixed-size output buffer, it generally makes sense to + * make the buffer fairly large (e.g. couple of kilobytes). + */ +class Encoder final { + public: + /** + * Trivial destructor. Deallocation is routed to `encoder_free()` by the + * class-specific `operator delete` declared right after it. + */ + ~Encoder() {} + + /** + * Class-specific deallocation function: passes the pointer to + * `encoder_free()` instead of using the global `operator delete`. + */ + static inline void operator delete(void* encoder) { + encoder_free(reinterpret_cast<Encoder*>(encoder)); + } + + /** + * The `Encoding` this `Encoder` is for. + */ + inline gsl::not_null<const Encoding*> encoding() const { + return gsl::not_null<const Encoding*>(encoder_encoding(this)); + } + + /** + * Returns `true` if this is an ISO-2022-JP encoder that's not in the + * ASCII state and `false` otherwise. + */ + inline bool has_pending_state() const { + return encoder_has_pending_state(this); + } + + /** + * Query the worst-case output size when encoding from UTF-8 with + * replacement. 
+ * + * Returns the size of the output buffer in bytes that will not overflow + * given the current state of the encoder and `byte_length` number of + * additional input code units if there are no unmappable characters in + * the input or `std::nullopt` if `size_t` would overflow. + */ + inline std::optional<size_t> max_buffer_length_from_utf8_if_no_unmappables( + size_t byte_length) const { + size_t val = encoder_max_buffer_length_from_utf8_if_no_unmappables( + this, byte_length); + /* `SIZE_MAX` from the underlying call is reported as an empty optional. */ + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Query the worst-case output size when encoding from UTF-8 without + * replacement. + * + * Returns the size of the output buffer in bytes that will not overflow + * given the current state of the encoder and `byte_length` number of + * additional input code units or `std::nullopt` if `size_t` would overflow. + */ + inline std::optional<size_t> max_buffer_length_from_utf8_without_replacement( + size_t byte_length) const { + size_t val = encoder_max_buffer_length_from_utf8_without_replacement( + this, byte_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Incrementally encode into a byte stream from UTF-8 with unmappable + * characters replaced with HTML (decimal) numeric character references. + * + * See the documentation of the class for documentation for `encode_*` + * methods collectively. + * + * The returned tuple is `{result, src_read, dst_written, had_replacements}`. + * `src_read` and `dst_written` are handed to `encoder_encode_from_utf8()` + * holding `src.size()` and `dst.size()` and are returned as that call left + * them; `result` is its return value and `had_replacements` is filled in by + * the call. `src.data()` is reinterpreted as `const uint8_t*` for the call. 
+ */ + inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf8( + std::string_view src, gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + bool had_replacements; + uint32_t result = encoder_encode_from_utf8( + this, + null_to_bogus<const uint8_t>( + reinterpret_cast<const uint8_t*>(src.data())), + &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last, + &had_replacements); + return {result, src_read, dst_written, had_replacements}; + } + + /** + * Incrementally encode into a byte stream from UTF-8 _without replacement_. + * + * See the documentation of the class for documentation for `encode_*` + * methods collectively. + * + * The returned tuple is `{result, src_read, dst_written}`. `src_read` and + * `dst_written` are handed to + * `encoder_encode_from_utf8_without_replacement()` holding `src.size()` and + * `dst.size()` and are returned as that call left them; `result` is its + * return value. + */ + inline std::tuple<uint32_t, size_t, size_t> + encode_from_utf8_without_replacement(std::string_view src, + gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + uint32_t result = encoder_encode_from_utf8_without_replacement( + this, + null_to_bogus<const uint8_t>( + reinterpret_cast<const uint8_t*>(src.data())), + &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last); + return {result, src_read, dst_written}; + } + + /** + * Query the worst-case output size when encoding from UTF-16 with + * replacement. + * + * Returns the size of the output buffer in bytes that will not overflow + * given the current state of the encoder and `u16_length` number of + * additional input code units if there are no unmappable characters in + * the input or `std::nullopt` if `size_t` would overflow. + */ + inline std::optional<size_t> max_buffer_length_from_utf16_if_no_unmappables( + size_t u16_length) const { + size_t val = encoder_max_buffer_length_from_utf16_if_no_unmappables( + this, u16_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Query the worst-case output size when encoding from UTF-16 without + * replacement. 
+ * + * Returns the size of the output buffer in bytes that will not overflow + * given the current state of the encoder and `u16_length` number of + * additional input code units or `std::nullopt` if `size_t` would overflow. + */ + inline std::optional<size_t> max_buffer_length_from_utf16_without_replacement( + size_t u16_length) const { + size_t val = encoder_max_buffer_length_from_utf16_without_replacement( + this, u16_length); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; + } + + /** + * Incrementally encode into a byte stream from UTF-16 with unmappable + * characters replaced with HTML (decimal) numeric character references. + * + * See the documentation of the class for documentation for `encode_*` + * methods collectively. + * + * The returned tuple is `{result, src_read, dst_written, had_replacements}`. + * `src_read` and `dst_written` are handed to `encoder_encode_from_utf16()` + * holding `src.size()` and `dst.size()` and are returned as that call left + * them; `result` is its return value and `had_replacements` is filled in by + * the call. + */ + inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf16( + std::u16string_view src, gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + bool had_replacements; + uint32_t result = encoder_encode_from_utf16( + this, null_to_bogus<const char16_t>(src.data()), &src_read, + null_to_bogus<uint8_t>(dst.data()), &dst_written, last, + &had_replacements); + return {result, src_read, dst_written, had_replacements}; + } + + /** + * Incrementally encode into a byte stream from UTF-16 _without replacement_. + * + * See the documentation of the class for documentation for `encode_*` + * methods collectively. + * + * The returned tuple is `{result, src_read, dst_written}`. `src_read` and + * `dst_written` are handed to + * `encoder_encode_from_utf16_without_replacement()` holding `src.size()` and + * `dst.size()` and are returned as that call left them; `result` is its + * return value. 
+ */ + inline std::tuple<uint32_t, size_t, size_t> + encode_from_utf16_without_replacement(std::u16string_view src, + gsl::span<uint8_t> dst, bool last) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + uint32_t result = encoder_encode_from_utf16_without_replacement( + this, null_to_bogus<const char16_t>(src.data()), &src_read, + null_to_bogus<uint8_t>(dst.data()), &dst_written, last); + return {result, src_read, dst_written}; + } + + private: + /** + * Replaces `nullptr` with a bogus pointer suitable for use as part of a + * zero-length Rust slice. + * + * The substitute is the address `alignof(T)` reinterpreted as `T*`; + * non-null pointers are returned unchanged. + */ + template <class T> + static inline T* null_to_bogus(T* ptr) { + return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); + } + + /* Default construction, copy construction and copy assignment are deleted. */ + Encoder() = delete; + Encoder(const Encoder&) = delete; + Encoder& operator=(const Encoder&) = delete; +}; + +/** + * An encoding as defined in the Encoding Standard + * (https://encoding.spec.whatwg.org/). + * + * An _encoding_ defines a mapping from a byte sequence to a Unicode code point + * sequence and, in most cases, vice versa. Each encoding has a name, an output + * encoding, and one or more labels. + * + * _Labels_ are ASCII-case-insensitive strings that are used to identify an + * encoding in formats and protocols. The _name_ of the encoding is the + * preferred label in the case appropriate for returning from the + * `characterSet` property of the `Document` DOM interface, except for + * the replacement encoding whose name is not one of its labels. + * + * The _output encoding_ is the encoding used for form submission and URL + * parsing on Web pages in the encoding. This is UTF-8 for the replacement, + * UTF-16LE and UTF-16BE encodings and the encoding itself for other + * encodings. + * + * # Streaming vs. 
Non-Streaming + * + * When you have the entire input in a single buffer, you can use the + * methods `decode()`, `decode_with_bom_removal()`, + * `decode_without_bom_handling()`, + * `decode_without_bom_handling_and_without_replacement()` and + * `encode()`. Unlike the rest of the API, these methods perform heap + * allocations. You should use the `Decoder` and `Encoder` objects when your input + * is split into multiple buffers or when you want to control the allocation of + * the output buffers. + * + * # Instances + * + * All instances of `Encoding` are statically allocated and have the process's + * lifetime. There is precisely one unique `Encoding` instance for each + * encoding defined in the Encoding Standard. + * + * To obtain a reference to a particular encoding whose identity you know at + * compile time, use a `static` that refers to the encoding. There is a `static` + * for each encoding. The `static`s are named in all caps with hyphens + * replaced with underscores and with `_ENCODING` appended to the + * name. For example, if you know at compile time that you will want to + * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`. + * + * If you don't know what encoding you need at compile time and need to + * dynamically get an encoding by label, use `Encoding::for_label()`. + * + * Instances of `Encoding` can be compared with `==`. + */ +class Encoding final { + public: + /** + * Implements the _get an encoding_ algorithm + * (https://encoding.spec.whatwg.org/#concept-encoding-get). + * + * If, after ASCII-lowercasing and removing leading and trailing + * whitespace, the argument matches a label defined in the Encoding + * Standard, `const Encoding*` representing the corresponding + * encoding is returned. If there is no match, `nullptr` is returned. + * + * This is the right method to use if the action upon the method returning + * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) + * instead. 
When the action upon the method returning `nullptr` is not to + * proceed with a fallback but to refuse processing, + * `for_label_no_replacement()` is more appropriate. + */ + static inline const Encoding* for_label(gsl::cstring_span<> label) { + return encoding_for_label( + null_to_bogus<const uint8_t>( + reinterpret_cast<const uint8_t*>(label.data())), + label.length()); + } + + /** + * This method behaves the same as `for_label()`, except when `for_label()` + * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead. + * + * This method is useful in scenarios where a fatal error is required + * upon invalid label, because in those cases the caller typically wishes + * to treat the labels that map to the replacement encoding as fatal + * errors, too. + * + * It is not OK to use this method when the action upon the method returning + * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In + * such a case, the `for_label()` method should be used instead in order to + * avoid + * unsafe fallback for labels that `for_label()` maps to + * `REPLACEMENT_ENCODING`. + */ + static inline const Encoding* for_label_no_replacement( + gsl::cstring_span<> label) { + return encoding_for_label_no_replacement( + null_to_bogus<const uint8_t>( + reinterpret_cast<const uint8_t*>(label.data())), + label.length()); + } + + /** + * Performs non-incremental BOM sniffing. + * + * The argument must either be a buffer representing the entire input + * stream (non-streaming case) or a buffer representing at least the first + * three bytes of the input stream (streaming case). + * + * Returns a `std::optional` wrapping `make_tuple(UTF_8_ENCODING, 3)`, + * `make_tuple(UTF_16LE_ENCODING, 2)` or `make_tuple(UTF_16BE_ENCODING, 2)` + * if the argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or + * `std::nullopt` otherwise. 
+ */ + static inline std::optional< + std::tuple<gsl::not_null<const Encoding*>, size_t>> + for_bom(gsl::span<const uint8_t> buffer) { + size_t len = buffer.size(); + const Encoding* encoding = + encoding_for_bom(null_to_bogus(buffer.data()), &len); + if (encoding) { + return std::make_tuple(gsl::not_null<const Encoding*>(encoding), len); + } + return std::nullopt; + } + + /** + * Returns the name of this encoding. + * + * This name is appropriate to return as-is from the DOM + * `document.characterSet` property. + */ + inline std::string name() const { + std::string name(ENCODING_NAME_MAX_LENGTH, '\0'); + // http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483 + size_t length = encoding_name(this, reinterpret_cast<uint8_t*>(&name[0])); + name.resize(length); + return name; + } + + /** + * Checks whether the _output encoding_ of this encoding can encode every + * Unicode code point. (Only true if the output encoding is UTF-8.) + */ + inline bool can_encode_everything() const { + return encoding_can_encode_everything(this); + } + + /** + * Checks whether the bytes 0x00...0x7F map exclusively to the characters + * U+0000...U+007F and vice versa. + */ + inline bool is_ascii_compatible() const { + return encoding_is_ascii_compatible(this); + } + + /** + * Checks whether this encoding maps one byte to one Basic Multilingual + * Plane code point (i.e. byte length equals decoded UTF-16 length) and + * vice versa (for mappable characters). + * + * `true` iff this encoding is on the list of Legacy single-byte + * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) + * in the spec or x-user-defined. + */ + inline bool is_single_byte() const { return encoding_is_single_byte(this); } + + /** + * Returns the _output encoding_ of this encoding. This is UTF-8 for + * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. 
+ */ + inline gsl::not_null<const Encoding*> output_encoding() const { + return gsl::not_null<const Encoding*>(encoding_output_encoding(this)); + } + + /** + * Decode complete input to `std::string` _with BOM sniffing_ and with + * malformed sequences replaced with the REPLACEMENT CHARACTER when the + * entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * This method implements the (non-streaming version of) the + * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. + * + * The second item in the returned tuple is the encoding that was actually + * used (which may differ from this encoding thanks to BOM sniffing). + * + * The third item in the returned tuple indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use `new_decoder()` + * when decoding segmented input. + */ + inline std::tuple<std::string, gsl::not_null<const Encoding*>, bool> decode( + gsl::span<const uint8_t> bytes) const { + auto opt = Encoding::for_bom(bytes); + const Encoding* encoding; + if (opt) { + size_t bom_length; + std::tie(encoding, bom_length) = *opt; + bytes = bytes.subspan(bom_length); + } else { + encoding = this; + } + auto [str, had_errors] = encoding->decode_without_bom_handling(bytes); + return {str, gsl::not_null<const Encoding*>(encoding), had_errors}; + } + + /** + * Decode complete input to `std::string` _with BOM removal_ and with + * malformed sequences replaced with the REPLACEMENT CHARACTER when the + * entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode_ + * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 
+ * + * The second item in the returned pair indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_with_bom_removal()` when decoding segmented input. + */ + inline std::tuple<std::string, bool> decode_with_bom_removal( + gsl::span<const uint8_t> bytes) const { + if (this == UTF_8_ENCODING && bytes.size() >= 3 && + (gsl::as_bytes(bytes.first<3>()) == + gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) { + bytes = bytes.subspan(3, bytes.size() - 3); + } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 && + (gsl::as_bytes(bytes.first<2>()) == + gsl::as_bytes(gsl::make_span("\xFF\xFE")))) { + bytes = bytes.subspan(2, bytes.size() - 2); + } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 && + (gsl::as_bytes(bytes.first<2>()) == + gsl::as_bytes(gsl::make_span("\xFE\xFF")))) { + bytes = bytes.subspan(2, bytes.size() - 2); + } + return decode_without_bom_handling(bytes); + } + + /** + * Decode complete input to `std::string` _without BOM handling_ and + * with malformed sequences replaced with the REPLACEMENT CHARACTER when + * the entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode without BOM_ + * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. + * + * The second item in the returned pair indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_without_bom_handling()` when decoding segmented input. 
+ */ + inline std::tuple<std::string, bool> decode_without_bom_handling( + gsl::span<const uint8_t> bytes) const { + auto decoder = new_decoder_without_bom_handling(); + auto needed = decoder->max_utf8_buffer_length(bytes.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::string string(needed.value(), '\0'); + const auto [result, read, written, had_errors] = decoder->decode_to_utf8( + bytes, + gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), string.size()), + true); + assert(read == static_cast<size_t>(bytes.size())); + assert(written <= static_cast<size_t>(string.size())); + assert(result == INPUT_EMPTY); + string.resize(written); + return {string, had_errors}; + } + + /** + * Decode complete input to `std::string` _without BOM handling_ and + * _with malformed sequences treated as fatal_ when the entire input is + * available as a single buffer (i.e. the end of the buffer marks the end + * of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode without BOM or fail_ + * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) + * spec concept. + * + * Returns `std::nullopt` if a malformed sequence was encountered and the result + * of the decode as `std::optional<std::string>` otherwise. + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_without_bom_handling()` when decoding segmented input. 
+ */ + inline std::optional<std::string> + decode_without_bom_handling_and_without_replacement( + gsl::span<const uint8_t> bytes) const { + auto decoder = new_decoder_without_bom_handling(); + auto needed = + decoder->max_utf8_buffer_length_without_replacement(bytes.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::string string(needed.value(), '\0'); + const auto [result, read, written] = + decoder->decode_to_utf8_without_replacement( + bytes, + gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), + string.size()), + true); + assert(result != OUTPUT_FULL); + if (result == INPUT_EMPTY) { + assert(read == static_cast<size_t>(bytes.size())); + assert(written <= static_cast<size_t>(string.size())); + string.resize(written); + return string; + } + return std::nullopt; + } + + /** + * Decode complete input to `std::u16string` _with BOM sniffing_ and with + * malformed sequences replaced with the REPLACEMENT CHARACTER when the + * entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * This method implements the (non-streaming version of) the + * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. + * + * The second item in the returned tuple is the encoding that was actually + * used (which may differ from this encoding thanks to BOM sniffing). + * + * The third item in the returned tuple indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use `new_decoder()` + * when decoding segmented input. 
+ */ + inline std::tuple<std::u16string, gsl::not_null<const Encoding*>, bool> + decode16(gsl::span<const uint8_t> bytes) const { + auto opt = Encoding::for_bom(bytes); + const Encoding* encoding; + if (opt) { + size_t bom_length; + std::tie(encoding, bom_length) = *opt; + bytes = bytes.subspan(bom_length); + } else { + encoding = this; + } + auto [str, had_errors] = encoding->decode16_without_bom_handling(bytes); + return {str, gsl::not_null<const Encoding*>(encoding), had_errors}; + } + + /** + * Decode complete input to `std::u16string` _with BOM removal_ and with + * malformed sequences replaced with the REPLACEMENT CHARACTER when the + * entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode_ + * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. + * + * The second item in the returned pair indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_with_bom_removal()` when decoding segmented input. 
+ */ + inline std::tuple<std::u16string, bool> decode16_with_bom_removal( + gsl::span<const uint8_t> bytes) const { + if (this == UTF_8_ENCODING && bytes.size() >= 3 && + (gsl::as_bytes(bytes.first<3>()) == + gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) { + bytes = bytes.subspan(3, bytes.size() - 3); + } else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 && + (gsl::as_bytes(bytes.first<2>()) == + gsl::as_bytes(gsl::make_span("\xFF\xFE")))) { + bytes = bytes.subspan(2, bytes.size() - 2); + } else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 && + (gsl::as_bytes(bytes.first<2>()) == + gsl::as_bytes(gsl::make_span("\xFE\xFF")))) { + bytes = bytes.subspan(2, bytes.size() - 2); + } + return decode16_without_bom_handling(bytes); + } + + /** + * Decode complete input to `std::u16string` _without BOM handling_ and + * with malformed sequences replaced with the REPLACEMENT CHARACTER when + * the entire input is available as a single buffer (i.e. the end of the + * buffer marks the end of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode without BOM_ + * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. + * + * The second item in the returned pair indicates whether there were + * malformed sequences (that were replaced with the REPLACEMENT CHARACTER). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_without_bom_handling()` when decoding segmented input. 
+ */ + inline std::tuple<std::u16string, bool> decode16_without_bom_handling( + gsl::span<const uint8_t> bytes) const { + auto decoder = new_decoder_without_bom_handling(); + auto needed = decoder->max_utf16_buffer_length(bytes.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::u16string string(needed.value(), '\0'); + const auto [result, read, written, had_errors] = decoder->decode_to_utf16( + bytes, gsl::make_span(&string[0], string.size()), true); + assert(read == static_cast<size_t>(bytes.size())); + assert(written <= static_cast<size_t>(string.size())); + assert(result == INPUT_EMPTY); + string.resize(written); + return {string, had_errors}; + } + + /** + * Decode complete input to `std::u16string` _without BOM handling_ and + * _with malformed sequences treated as fatal_ when the entire input is + * available as a single buffer (i.e. the end of the buffer marks the end + * of the stream). + * + * When invoked on `UTF_8`, this method implements the (non-streaming + * version of) the _UTF-8 decode without BOM or fail_ + * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) + * spec concept. + * + * Returns `std::nullopt` if a malformed sequence was encountered and the result + * of the decode as `std::optional<std::u16string>` otherwise. + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use + * `new_decoder_without_bom_handling()` when decoding segmented input. 
+ */ + inline std::optional<std::u16string> + decode16_without_bom_handling_and_without_replacement( + gsl::span<const uint8_t> bytes) const { + auto decoder = new_decoder_without_bom_handling(); + auto needed = decoder->max_utf16_buffer_length(bytes.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::u16string string(needed.value(), '\0'); + const auto [result, read, written] = + decoder->decode_to_utf16_without_replacement( + bytes, gsl::make_span(&string[0], string.size()), true); + assert(result != OUTPUT_FULL); + if (result == INPUT_EMPTY) { + assert(read == static_cast<size_t>(bytes.size())); + assert(written <= static_cast<size_t>(string.size())); + string.resize(written); + return string; + } + return std::nullopt; + } + + /** + * Encode complete input to `std::vector<uint8_t>` with unmappable characters + * replaced with decimal numeric character references when the entire input + * is available as a single buffer (i.e. the end of the buffer marks the + * end of the stream). + * + * This method implements the (non-streaming version of) the + * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. + * + * The second item in the returned tuple is the encoding that was actually + * used (which may differ from this encoding thanks to some encodings + * having UTF-8 as their output encoding). + * + * The third item in the returned tuple indicates whether there were + * unmappable characters (that were replaced with HTML numeric character + * references). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use `new_encoder()` + * when encoding segmented output. 
+ */ + inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool> + encode(std::string_view string) const { + auto output_enc = output_encoding(); + if (output_enc == UTF_8_ENCODING) { + std::vector<uint8_t> vec(string.size()); + std::memcpy(&vec[0], string.data(), string.size()); + } + auto encoder = output_enc->new_encoder(); + auto needed = + encoder->max_buffer_length_from_utf8_if_no_unmappables(string.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::vector<uint8_t> vec(needed.value()); + bool total_had_errors = false; + size_t total_read = 0; + size_t total_written = 0; + for (;;) { + const auto [result, read, written, had_errors] = + encoder->encode_from_utf8(string.substr(total_read), + gsl::make_span(vec).subspan(total_written), + true); + total_read += read; + total_written += written; + total_had_errors |= had_errors; + if (result == INPUT_EMPTY) { + assert(total_read == static_cast<size_t>(string.size())); + assert(total_written <= static_cast<size_t>(vec.size())); + vec.resize(total_written); + return {vec, gsl::not_null<const Encoding*>(output_enc), + total_had_errors}; + } + auto needed = encoder->max_buffer_length_from_utf8_if_no_unmappables( + string.size() - total_read); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + vec.resize(total_written + needed.value()); + } + } + + /** + * Encode complete input to `std::vector<uint8_t>` with unmappable characters + * replaced with decimal numeric character references when the entire input + * is available as a single buffer (i.e. the end of the buffer marks the + * end of the stream). + * + * This method implements the (non-streaming version of) the + * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 
+ * + * The second item in the returned tuple is the encoding that was actually + * used (which may differ from this encoding thanks to some encodings + * having UTF-8 as their output encoding). + * + * The third item in the returned tuple indicates whether there were + * unmappable characters (that were replaced with HTML numeric character + * references). + * + * _Note:_ It is wrong to use this when the input buffer represents only + * a segment of the input instead of the whole input. Use `new_encoder()` + * when encoding segmented output. + */ + inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool> + encode(std::u16string_view string) const { + auto output_enc = output_encoding(); + auto encoder = output_enc->new_encoder(); + auto needed = + encoder->max_buffer_length_from_utf16_if_no_unmappables(string.size()); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + std::vector<uint8_t> vec(needed.value()); + bool total_had_errors = false; + size_t total_read = 0; + size_t total_written = 0; + for (;;) { + const auto [result, read, written, had_errors] = + encoder->encode_from_utf16(string.substr(total_read), + gsl::make_span(vec).subspan(total_written), + true); + total_read += read; + total_written += written; + total_had_errors |= had_errors; + if (result == INPUT_EMPTY) { + assert(total_read == static_cast<size_t>(string.size())); + assert(total_written <= static_cast<size_t>(vec.size())); + vec.resize(total_written); + return {vec, gsl::not_null<const Encoding*>(output_enc), + total_had_errors}; + } + auto needed = encoder->max_buffer_length_from_utf16_if_no_unmappables( + string.size() - total_read); + if (!needed) { + throw std::overflow_error("Overflow in buffer size computation."); + } + vec.resize(total_written + needed.value()); + } + } + + /** + * Instantiates a new decoder for this encoding with BOM sniffing enabled. 
*
   * BOM sniffing may cause the returned decoder to morph into a decoder
   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
   *
   * The returned `std::unique_ptr` takes ownership of the pointer returned
   * by `encoding_new_decoder()`.
   */
  inline std::unique_ptr<Decoder> new_decoder() const {
    return std::unique_ptr<Decoder>(encoding_new_decoder(this));
  }

  /**
   * Instantiates a new decoder for this encoding with BOM sniffing enabled
   * into memory occupied by a previously-instantiated decoder.
   *
   * BOM sniffing may cause the returned decoder to morph into a decoder
   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
   *
   * `decoder` is handed to `encoding_new_decoder_into()` by address; nothing
   * is returned.
   */
  inline void new_decoder_into(Decoder& decoder) const {
    encoding_new_decoder_into(this, &decoder);
  }

  /**
   * Instantiates a new decoder for this encoding with BOM removal.
   *
   * If the input starts with bytes that are the BOM for this encoding,
   * those bytes are removed. However, the decoder never morphs into a
   * decoder for another encoding: A BOM for another encoding is treated as
   * (potentially malformed) input to the decoding algorithm for this
   * encoding.
   *
   * The returned `std::unique_ptr` takes ownership of the pointer returned
   * by `encoding_new_decoder_with_bom_removal()`.
   */
  inline std::unique_ptr<Decoder> new_decoder_with_bom_removal() const {
    return std::unique_ptr<Decoder>(
        encoding_new_decoder_with_bom_removal(this));
  }

  /**
   * Instantiates a new decoder for this encoding with BOM removal
   * into memory occupied by a previously-instantiated decoder.
   *
   * If the input starts with bytes that are the BOM for this encoding,
   * those bytes are removed. However, the decoder never morphs into a
   * decoder for another encoding: A BOM for another encoding is treated as
   * (potentially malformed) input to the decoding algorithm for this
   * encoding.
   *
   * `decoder` is handed to `encoding_new_decoder_with_bom_removal_into()` by
   * address; nothing is returned.
   */
  inline void new_decoder_with_bom_removal_into(Decoder& decoder) const {
    encoding_new_decoder_with_bom_removal_into(this, &decoder);
  }

  /**
   * Instantiates a new decoder for this encoding with BOM handling disabled.
   *
   * If the input starts with bytes that look like a BOM, those bytes are
   * not treated as a BOM.
(Hence, the decoder never morphs into a decoder
   * for another encoding.)
   *
   * _Note:_ If the caller has performed BOM sniffing on its own but has not
   * removed the BOM, the caller should use `new_decoder_with_bom_removal()`
   * instead of this method to cause the BOM to be removed.
   *
   * The returned `std::unique_ptr` takes ownership of the pointer returned
   * by `encoding_new_decoder_without_bom_handling()`.
   */
  inline std::unique_ptr<Decoder> new_decoder_without_bom_handling() const {
    return std::unique_ptr<Decoder>(
        encoding_new_decoder_without_bom_handling(this));
  }

  /**
   * Instantiates a new decoder for this encoding with BOM handling disabled
   * into memory occupied by a previously-instantiated decoder.
   *
   * If the input starts with bytes that look like a BOM, those bytes are
   * not treated as a BOM. (Hence, the decoder never morphs into a decoder
   * for another encoding.)
   *
   * _Note:_ If the caller has performed BOM sniffing on its own but has not
   * removed the BOM, the caller should use
   * `new_decoder_with_bom_removal_into()`
   * instead of this method to cause the BOM to be removed.
   *
   * `decoder` is handed to `encoding_new_decoder_without_bom_handling_into()`
   * by address; nothing is returned.
   */
  inline void new_decoder_without_bom_handling_into(Decoder& decoder) const {
    encoding_new_decoder_without_bom_handling_into(this, &decoder);
  }

  /**
   * Instantiates a new encoder for the output encoding of this encoding.
   *
   * The returned `std::unique_ptr` takes ownership of the pointer returned
   * by `encoding_new_encoder()`.
   */
  inline std::unique_ptr<Encoder> new_encoder() const {
    return std::unique_ptr<Encoder>(encoding_new_encoder(this));
  }

  /**
   * Instantiates a new encoder for the output encoding of this encoding
   * into memory occupied by a previously-instantiated encoder.
   *
   * `encoder` is handed to `encoding_new_encoder_into()` by address; nothing
   * is returned.
   */
  inline void new_encoder_into(Encoder& encoder) const {
    encoding_new_encoder_into(this, &encoder);
  }

  /**
   * Validates UTF-8.
   *
   * Returns the index of the first byte that makes the input malformed as
   * UTF-8 or the length of the input if the input is entirely valid.
*/
  static inline size_t utf8_valid_up_to(gsl::span<const uint8_t> buffer) {
    // A null data pointer is swapped for a non-null placeholder before the
    // (pointer, length) pair is passed on.
    return encoding_utf8_valid_up_to(
        null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
  }

  /**
   * Validates ASCII.
   *
   * Returns the index of the first byte that makes the input malformed as
   * ASCII or the length of the input if the input is entirely valid.
   */
  static inline size_t ascii_valid_up_to(gsl::span<const uint8_t> buffer) {
    // A null data pointer is swapped for a non-null placeholder before the
    // (pointer, length) pair is passed on.
    return encoding_ascii_valid_up_to(
        null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
  }

  /**
   * Validates ISO-2022-JP ASCII-state data.
   *
   * Returns the index of the first byte that makes the input not
   * representable in the ASCII state of ISO-2022-JP or the length of the
   * input if the input is entirely representable in the ASCII state of
   * ISO-2022-JP.
   */
  static inline size_t iso_2022_jp_ascii_valid_up_to(
      gsl::span<const uint8_t> buffer) {
    // A null data pointer is swapped for a non-null placeholder before the
    // (pointer, length) pair is passed on.
    return encoding_iso_2022_jp_ascii_valid_up_to(
        null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
  }

 private:
  /**
   * Replaces `nullptr` with a bogus pointer suitable for use as part of a
   * zero-length Rust slice.
   *
   * Non-null pointers are returned unchanged. The placeholder is the value
   * `alignof(T)` reinterpreted as a `T*`, which makes it non-null and
   * aligned for `T`; it must never be dereferenced.
   */
  template <class T>
  static inline T* null_to_bogus(T* ptr) {
    return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
  }

  // Construction, copying and destruction are all deleted, so C++ code can
  // only refer to `Encoding` objects through pointers or references.
  Encoding() = delete;
  Encoding(const Encoding&) = delete;
  Encoding& operator=(const Encoding&) = delete;
  ~Encoding() = delete;
};

};  // namespace encoding_rs

#endif  // encoding_rs_cpp_h_
diff --git a/third_party/rust/encoding_c/include/encoding_rs_statics.h b/third_party/rust/encoding_c/include/encoding_rs_statics.h new file mode 100644 index 0000000000..c3e84d586e --- /dev/null +++ b/third_party/rust/encoding_c/include/encoding_rs_statics.h @@ -0,0 +1,171 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. +// Instead, please regenerate using generate-encoding-data.py + +// This file is not meant to be included directly. Instead, encoding_rs.h +// includes this file. + +#ifndef encoding_rs_statics_h_ +#define encoding_rs_statics_h_ + +#ifndef ENCODING_RS_ENCODING +#define ENCODING_RS_ENCODING Encoding +#ifndef __cplusplus +typedef struct Encoding_ Encoding; +#endif +#endif + +#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR +#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING* +#endif + +#ifndef ENCODING_RS_ENCODER +#define ENCODING_RS_ENCODER Encoder +#ifndef __cplusplus +typedef struct Encoder_ Encoder; +#endif +#endif + +#ifndef ENCODING_RS_DECODER +#define ENCODING_RS_DECODER Decoder +#ifndef __cplusplus +typedef struct Decoder_ Decoder; +#endif +#endif + +#define INPUT_EMPTY 0 + +#define OUTPUT_FULL 0xFFFFFFFF + +// x-mac-cyrillic +#define ENCODING_NAME_MAX_LENGTH 14 + +/// The Big5 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const BIG5_ENCODING; + +/// The EUC-JP encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const EUC_JP_ENCODING; + +/// The EUC-KR encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const EUC_KR_ENCODING; + +/// The GBK encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const GBK_ENCODING; + +/// The IBM866 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const IBM866_ENCODING; + +/// The ISO-2022-JP encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_2022_JP_ENCODING; + +/// The ISO-8859-10 encoding. 
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_10_ENCODING; + +/// The ISO-8859-13 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_13_ENCODING; + +/// The ISO-8859-14 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_14_ENCODING; + +/// The ISO-8859-15 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_15_ENCODING; + +/// The ISO-8859-16 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_16_ENCODING; + +/// The ISO-8859-2 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_2_ENCODING; + +/// The ISO-8859-3 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_3_ENCODING; + +/// The ISO-8859-4 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_4_ENCODING; + +/// The ISO-8859-5 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_5_ENCODING; + +/// The ISO-8859-6 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_6_ENCODING; + +/// The ISO-8859-7 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_7_ENCODING; + +/// The ISO-8859-8 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_8_ENCODING; + +/// The ISO-8859-8-I encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_8_I_ENCODING; + +/// The KOI8-R encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const KOI8_R_ENCODING; + +/// The KOI8-U encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const KOI8_U_ENCODING; + +/// The Shift_JIS encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const SHIFT_JIS_ENCODING; + +/// The UTF-16BE encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_16BE_ENCODING; + +/// The UTF-16LE encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_16LE_ENCODING; + +/// The UTF-8 encoding. 
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_8_ENCODING; + +/// The gb18030 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const GB18030_ENCODING; + +/// The macintosh encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const MACINTOSH_ENCODING; + +/// The replacement encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const REPLACEMENT_ENCODING; + +/// The windows-1250 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1250_ENCODING; + +/// The windows-1251 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1251_ENCODING; + +/// The windows-1252 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1252_ENCODING; + +/// The windows-1253 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1253_ENCODING; + +/// The windows-1254 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1254_ENCODING; + +/// The windows-1255 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1255_ENCODING; + +/// The windows-1256 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1256_ENCODING; + +/// The windows-1257 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1257_ENCODING; + +/// The windows-1258 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1258_ENCODING; + +/// The windows-874 encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_874_ENCODING; + +/// The x-mac-cyrillic encoding. +extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const X_MAC_CYRILLIC_ENCODING; + +/// The x-user-defined encoding. 
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const X_USER_DEFINED_ENCODING; + +#endif // encoding_rs_statics_h_ diff --git a/third_party/rust/encoding_c/src/lib.rs b/third_party/rust/encoding_c/src/lib.rs new file mode 100644 index 0000000000..699e6427c8 --- /dev/null +++ b/third_party/rust/encoding_c/src/lib.rs @@ -0,0 +1,1194 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![doc(html_root_url = "https://docs.rs/encoding_c/0.9.7")] + +//! The C API for encoding_rs. +//! +//! # Mapping from Rust +//! +//! ## Naming convention +//! +//! The wrapper function for each method has a name that starts with the name +//! of the struct lower-cased, followed by an underscore and ends with the +//! name of the method. +//! +//! For example, `Encoding::for_label()` is wrapped as `encoding_for_label()`. +//! +//! ## Arguments +//! +//! Functions that wrap non-static methods take the `self` object as their +//! first argument. +//! +//! Slice argument `foo` is decomposed into a pointer `foo` and a length +//! `foo_len`. +//! +//! ## Return values +//! +//! Multiple return values become out-params. When an out-param is +//! length-related, `foo_len` for a slice becomes a pointer in order to become +//! an in/out-param. +//! +//! `DecoderResult`, `EncoderResult` and `CoderResult` become `uint32_t`. +//! `InputEmpty` becomes `INPUT_EMPTY`. `OutputFull` becomes `OUTPUT_FULL`. +//! `Unmappable` becomes the scalar value of the unmappable character. +//! `Malformed` becomes a number whose lowest 8 bits, which can have the decimal +//! 
value 0, 1, 2 or 3, indicate the number of bytes that were consumed after +//! the malformed sequence and whose next-lowest 8 bits, when shifted right by +//! 8 indicate the length of the malformed byte sequence (possible decimal +//! values 1, 2, 3 or 4). The maximum possible sum of the two is 6. + +extern crate encoding_rs; + +use encoding_rs::*; + +/// Return value for `*_decode_*` and `*_encode_*` functions that indicates that +/// the input has been exhausted. +/// +/// (This is zero as a micro optimization. U+0000 is never unmappable and +/// malformed sequences always have a positive length.) +pub const INPUT_EMPTY: u32 = 0; + +/// Return value for `*_decode_*` and `*_encode_*` functions that indicates that +/// the output space has been exhausted. +pub const OUTPUT_FULL: u32 = 0xFFFFFFFF; + +/// Newtype for `*const Encoding` in order to be able to implement `Sync` for +/// it. +pub struct ConstEncoding(*const Encoding); + +/// Required for `static` fields. +unsafe impl Sync for ConstEncoding {} + +// BEGIN GENERATED CODE. PLEASE DO NOT EDIT. +// Instead, please regenerate using generate-encoding-data.py + +/// The minimum length of buffers that may be passed to `encoding_name()`. +pub const ENCODING_NAME_MAX_LENGTH: usize = 14; // x-mac-cyrillic + +/// The Big5 encoding. +#[no_mangle] +pub static BIG5_ENCODING: ConstEncoding = ConstEncoding(&BIG5_INIT); + +/// The EUC-JP encoding. +#[no_mangle] +pub static EUC_JP_ENCODING: ConstEncoding = ConstEncoding(&EUC_JP_INIT); + +/// The EUC-KR encoding. +#[no_mangle] +pub static EUC_KR_ENCODING: ConstEncoding = ConstEncoding(&EUC_KR_INIT); + +/// The GBK encoding. +#[no_mangle] +pub static GBK_ENCODING: ConstEncoding = ConstEncoding(&GBK_INIT); + +/// The IBM866 encoding. +#[no_mangle] +pub static IBM866_ENCODING: ConstEncoding = ConstEncoding(&IBM866_INIT); + +/// The ISO-2022-JP encoding. 
+#[no_mangle] +pub static ISO_2022_JP_ENCODING: ConstEncoding = ConstEncoding(&ISO_2022_JP_INIT); + +/// The ISO-8859-10 encoding. +#[no_mangle] +pub static ISO_8859_10_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_10_INIT); + +/// The ISO-8859-13 encoding. +#[no_mangle] +pub static ISO_8859_13_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_13_INIT); + +/// The ISO-8859-14 encoding. +#[no_mangle] +pub static ISO_8859_14_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_14_INIT); + +/// The ISO-8859-15 encoding. +#[no_mangle] +pub static ISO_8859_15_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_15_INIT); + +/// The ISO-8859-16 encoding. +#[no_mangle] +pub static ISO_8859_16_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_16_INIT); + +/// The ISO-8859-2 encoding. +#[no_mangle] +pub static ISO_8859_2_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_2_INIT); + +/// The ISO-8859-3 encoding. +#[no_mangle] +pub static ISO_8859_3_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_3_INIT); + +/// The ISO-8859-4 encoding. +#[no_mangle] +pub static ISO_8859_4_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_4_INIT); + +/// The ISO-8859-5 encoding. +#[no_mangle] +pub static ISO_8859_5_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_5_INIT); + +/// The ISO-8859-6 encoding. +#[no_mangle] +pub static ISO_8859_6_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_6_INIT); + +/// The ISO-8859-7 encoding. +#[no_mangle] +pub static ISO_8859_7_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_7_INIT); + +/// The ISO-8859-8 encoding. +#[no_mangle] +pub static ISO_8859_8_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_8_INIT); + +/// The ISO-8859-8-I encoding. +#[no_mangle] +pub static ISO_8859_8_I_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_8_I_INIT); + +/// The KOI8-R encoding. +#[no_mangle] +pub static KOI8_R_ENCODING: ConstEncoding = ConstEncoding(&KOI8_R_INIT); + +/// The KOI8-U encoding. 
+#[no_mangle] +pub static KOI8_U_ENCODING: ConstEncoding = ConstEncoding(&KOI8_U_INIT); + +/// The Shift_JIS encoding. +#[no_mangle] +pub static SHIFT_JIS_ENCODING: ConstEncoding = ConstEncoding(&SHIFT_JIS_INIT); + +/// The UTF-16BE encoding. +#[no_mangle] +pub static UTF_16BE_ENCODING: ConstEncoding = ConstEncoding(&UTF_16BE_INIT); + +/// The UTF-16LE encoding. +#[no_mangle] +pub static UTF_16LE_ENCODING: ConstEncoding = ConstEncoding(&UTF_16LE_INIT); + +/// The UTF-8 encoding. +#[no_mangle] +pub static UTF_8_ENCODING: ConstEncoding = ConstEncoding(&UTF_8_INIT); + +/// The gb18030 encoding. +#[no_mangle] +pub static GB18030_ENCODING: ConstEncoding = ConstEncoding(&GB18030_INIT); + +/// The macintosh encoding. +#[no_mangle] +pub static MACINTOSH_ENCODING: ConstEncoding = ConstEncoding(&MACINTOSH_INIT); + +/// The replacement encoding. +#[no_mangle] +pub static REPLACEMENT_ENCODING: ConstEncoding = ConstEncoding(&REPLACEMENT_INIT); + +/// The windows-1250 encoding. +#[no_mangle] +pub static WINDOWS_1250_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1250_INIT); + +/// The windows-1251 encoding. +#[no_mangle] +pub static WINDOWS_1251_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1251_INIT); + +/// The windows-1252 encoding. +#[no_mangle] +pub static WINDOWS_1252_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1252_INIT); + +/// The windows-1253 encoding. +#[no_mangle] +pub static WINDOWS_1253_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1253_INIT); + +/// The windows-1254 encoding. +#[no_mangle] +pub static WINDOWS_1254_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1254_INIT); + +/// The windows-1255 encoding. +#[no_mangle] +pub static WINDOWS_1255_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1255_INIT); + +/// The windows-1256 encoding. +#[no_mangle] +pub static WINDOWS_1256_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1256_INIT); + +/// The windows-1257 encoding. 
#[no_mangle]
pub static WINDOWS_1257_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1257_INIT);

/// The windows-1258 encoding.
#[no_mangle]
pub static WINDOWS_1258_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1258_INIT);

/// The windows-874 encoding.
#[no_mangle]
pub static WINDOWS_874_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_874_INIT);

/// The x-mac-cyrillic encoding.
#[no_mangle]
pub static X_MAC_CYRILLIC_ENCODING: ConstEncoding = ConstEncoding(&X_MAC_CYRILLIC_INIT);

/// The x-user-defined encoding.
#[no_mangle]
pub static X_USER_DEFINED_ENCODING: ConstEncoding = ConstEncoding(&X_USER_DEFINED_INIT);

// END GENERATED CODE

/// Maps a `CoderResult` to its `u32` representation: `INPUT_EMPTY` or
/// `OUTPUT_FULL`.
#[inline(always)]
fn coder_result_to_u32(result: CoderResult) -> u32 {
    match result {
        CoderResult::InputEmpty => INPUT_EMPTY,
        CoderResult::OutputFull => OUTPUT_FULL,
    }
}

/// Maps a `DecoderResult` to its `u32` representation.
///
/// `InputEmpty` and `OutputFull` become `INPUT_EMPTY` and `OUTPUT_FULL`.
/// `Malformed(bad, good)` is packed as `(good << 8) | bad`, i.e. `bad` ends
/// up in bits 0..=7 and `good` in bits 8..=15.
///
/// NOTE(review): the crate-level documentation describes the lowest 8 bits
/// as the number of bytes consumed after the malformed sequence and the
/// next-lowest 8 bits as the length of the malformed sequence. Confirm
/// against the `DecoderResult::Malformed` documentation in encoding_rs that
/// the `bad`/`good` order used here matches that description.
#[inline(always)]
fn decoder_result_to_u32(result: DecoderResult) -> u32 {
    match result {
        DecoderResult::InputEmpty => INPUT_EMPTY,
        DecoderResult::OutputFull => OUTPUT_FULL,
        DecoderResult::Malformed(bad, good) => ((good as u32) << 8) | (bad as u32),
    }
}

/// Maps an `EncoderResult` to its `u32` representation.
///
/// `InputEmpty` and `OutputFull` become `INPUT_EMPTY` and `OUTPUT_FULL`;
/// `Unmappable(c)` becomes `c` cast to `u32`.
#[inline(always)]
fn encoder_result_to_u32(result: EncoderResult) -> u32 {
    match result {
        EncoderResult::InputEmpty => INPUT_EMPTY,
        EncoderResult::OutputFull => OUTPUT_FULL,
        EncoderResult::Unmappable(c) => c as u32,
    }
}

/// Converts an optional `&'static Encoding` into a raw pointer, using the
/// null pointer for `None`.
#[inline(always)]
fn option_to_ptr(opt: Option<&'static Encoding>) -> *const Encoding {
    match opt {
        None => ::std::ptr::null(),
        Some(e) => e,
    }
}

/// Implements the
/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
/// algorithm.
///
/// If, after ASCII-lowercasing and removing leading and trailing
/// whitespace, the argument matches a label defined in the Encoding
/// Standard, `const Encoding*` representing the corresponding
/// encoding is returned. If there is no match, `NULL` is returned.
///
/// This is the right function to use if the action upon the method returning
/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) instead.
/// When the action upon the method returning `NULL` is not to proceed with
/// a fallback but to refuse processing, `encoding_for_label_no_replacement()` is
/// more appropriate.
///
/// The argument buffer can be in any ASCII-compatible encoding. It is not
/// required to be UTF-8.
///
/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
/// is zero, it is OK for `label` to be something non-dereferenceable,
/// such as `0x1`. This is required due to Rust's optimization for slices
/// within `Option`.
///
/// # Undefined behavior
///
/// UB ensues if `label` and `label_len` don't designate a valid memory block
/// or if `label` is `NULL`.
#[no_mangle]
pub unsafe extern "C" fn encoding_for_label(label: *const u8, label_len: usize) -> *const Encoding {
    // Rebuild the caller's (pointer, length) pair as a byte slice.
    let label_slice = ::std::slice::from_raw_parts(label, label_len);
    // `None` (no matching label) is reported to the caller as `NULL`.
    option_to_ptr(Encoding::for_label(label_slice))
}

/// This function behaves the same as `encoding_for_label()`, except when
/// `encoding_for_label()` would return `REPLACEMENT_ENCODING`, this method
/// returns `NULL` instead.
///
/// This method is useful in scenarios where a fatal error is required
/// upon invalid label, because in those cases the caller typically wishes
/// to treat the labels that map to the replacement encoding as fatal
/// errors, too.
///
/// It is not OK to use this function when the action upon the method returning
/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
/// such a case, the `encoding_for_label()` function should be used instead
/// in order to avoid unsafe fallback for labels that `encoding_for_label()`
/// maps to `REPLACEMENT_ENCODING`.
///
/// The argument buffer can be in any ASCII-compatible encoding. It is not
/// required to be UTF-8.
///
/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
/// is zero, it is OK for `label` to be something non-dereferenceable,
/// such as `0x1`. This is required due to Rust's optimization for slices
/// within `Option`.
///
/// # Undefined behavior
///
/// UB ensues if `label` and `label_len` don't designate a valid memory block
/// or if `label` is `NULL`.
#[no_mangle]
pub unsafe extern "C" fn encoding_for_label_no_replacement(
    label: *const u8,
    label_len: usize,
) -> *const Encoding {
    // Rebuild the caller's (pointer, length) pair as a byte slice.
    let label_slice = ::std::slice::from_raw_parts(label, label_len);
    // `None` is reported to the caller as `NULL`.
    option_to_ptr(Encoding::for_label_no_replacement(label_slice))
}

/// Performs non-incremental BOM sniffing.
///
/// The argument must either be a buffer representing the entire input
/// stream (non-streaming case) or a buffer representing at least the first
/// three bytes of the input stream (streaming case).
///
/// Returns `UTF_8_ENCODING`, `UTF_16LE_ENCODING` or `UTF_16BE_ENCODING` if the
/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `NULL`
/// otherwise. Upon return, `*buffer_len` is the length of the BOM (zero if
/// there is no BOM).
///
/// `buffer` must be non-`NULL` even if `*buffer_len` is zero. When
/// `*buffer_len` is zero, it is OK for `buffer` to be something
/// non-dereferenceable, such as `0x1`. This is required due to Rust's
/// optimization for slices within `Option`.
///
/// `buffer_len` itself is read and written unconditionally and therefore
/// must point to a valid `size_t`.
///
/// # Undefined behavior
///
/// UB ensues if `buffer` and `*buffer_len` don't designate a valid memory
/// block or if `buffer` is `NULL`.
#[no_mangle]
pub unsafe extern "C" fn encoding_for_bom(
    buffer: *const u8,
    buffer_len: *mut usize,
) -> *const Encoding {
    // On entry, `*buffer_len` is the number of bytes readable at `buffer`.
    let buffer_slice = ::std::slice::from_raw_parts(buffer, *buffer_len);
    let (encoding, bom_length) = match Encoding::for_bom(buffer_slice) {
        Some((encoding, bom_length)) => (encoding as *const Encoding, bom_length),
        // No BOM: report `NULL` together with a BOM length of zero.
        None => (::std::ptr::null(), 0),
    };
    // On exit, `*buffer_len` is overwritten with the BOM length.
    *buffer_len = bom_length;
    encoding
}

/// Writes the name of the given `Encoding` to a caller-supplied buffer as
/// ASCII and returns the number of bytes / ASCII characters written.
///
/// The output is not null-terminated.
///
/// The caller _MUST_ ensure that `name_out` points to a buffer whose length
/// is at least `ENCODING_NAME_MAX_LENGTH` bytes.
///
/// # Undefined behavior
///
/// UB ensues if either argument is `NULL` or if `name_out` doesn't point to
/// a valid block of memory whose length is at least
/// `ENCODING_NAME_MAX_LENGTH` bytes.
#[no_mangle]
pub unsafe extern "C" fn encoding_name(encoding: *const Encoding, name_out: *mut u8) -> usize {
    let bytes = (*encoding).name().as_bytes();
    // Exactly `bytes.len()` bytes are copied; no terminator is appended and
    // the rest of the caller's buffer is left untouched.
    ::std::ptr::copy_nonoverlapping(bytes.as_ptr(), name_out, bytes.len());
    bytes.len()
}

/// Checks whether the _output encoding_ of this encoding can encode every
/// Unicode scalar. (Only true if the output encoding is UTF-8.)
///
/// # Undefined behavior
///
/// UB ensues if the argument is `NULL`.
#[no_mangle]
pub unsafe extern "C" fn encoding_can_encode_everything(encoding: *const Encoding) -> bool {
    // The pointer is dereferenced without a null check.
    (*encoding).can_encode_everything()
}

/// Checks whether the bytes 0x00...0x7F map exclusively to the characters
/// U+0000...U+007F and vice versa.
///
/// # Undefined behavior
///
/// UB ensues if the argument is `NULL`.
+#[no_mangle] +pub unsafe extern "C" fn encoding_is_ascii_compatible(encoding: *const Encoding) -> bool { + (*encoding).is_ascii_compatible() +} + +/// Checks whether this encoding maps one byte to one Basic Multilingual +/// Plane code point (i.e. byte length equals decoded UTF-16 length) and +/// vice versa (for mappable characters). +/// +/// `true` iff this encoding is on the list of [Legacy single-byte +/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) +/// in the spec or x-user-defined. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_is_single_byte(encoding: *const Encoding) -> bool { + (*encoding).is_single_byte() +} + +/// Returns the _output encoding_ of this encoding. This is UTF-8 for +/// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_output_encoding(encoding: *const Encoding) -> *const Encoding { + (*encoding).output_encoding() +} + +/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM +/// sniffing enabled and returns a pointer to the newly-allocated `Decoder`. +/// +/// BOM sniffing may cause the returned decoder to morph into a decoder +/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. +/// +/// Once the allocated `Decoder` is no longer needed, the caller _MUST_ +/// deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder(encoding: *const Encoding) -> *mut Decoder { + Box::into_raw(Box::new((*encoding).new_decoder())) +} + +/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM +/// removal and returns a pointer to the newly-allocated `Decoder`. 
+/// +/// If the input starts with bytes that are the BOM for this encoding, +/// those bytes are removed. However, the decoder never morphs into a +/// decoder for another encoding: A BOM for another encoding is treated as +/// (potentially malformed) input to the decoding algorithm for this +/// encoding. +/// +/// Once the allocated `Decoder` is no longer needed, the caller _MUST_ +/// deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder_with_bom_removal( + encoding: *const Encoding, +) -> *mut Decoder { + Box::into_raw(Box::new((*encoding).new_decoder_with_bom_removal())) +} + +/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM +/// handling disabled and returns a pointer to the newly-allocated `Decoder`. +/// +/// If the input starts with bytes that look like a BOM, those bytes are +/// not treated as a BOM. (Hence, the decoder never morphs into a decoder +/// for another encoding.) +/// +/// _Note:_ If the caller has performed BOM sniffing on its own but has not +/// removed the BOM, the caller should use +/// `encoding_new_decoder_with_bom_removal()` instead of this function to cause +/// the BOM to be removed. +/// +/// Once the allocated `Decoder` is no longer needed, the caller _MUST_ +/// deallocate it by passing the pointer returned by this function to +/// `decoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder_without_bom_handling( + encoding: *const Encoding, +) -> *mut Decoder { + Box::into_raw(Box::new((*encoding).new_decoder_without_bom_handling())) +} + +/// Allocates a new `Decoder` for the given `Encoding` into memory provided by +/// the caller with BOM sniffing enabled. 
(In practice, the target should +/// likely be a pointer previously returned by `encoding_new_decoder()`.) +/// +/// Note: If the caller has already performed BOM sniffing but has +/// not removed the BOM, the caller should still use this function in +/// order to cause the BOM to be ignored. +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder_into( + encoding: *const Encoding, + decoder: *mut Decoder, +) { + *decoder = (*encoding).new_decoder(); +} + +/// Allocates a new `Decoder` for the given `Encoding` into memory provided by +/// the caller with BOM removal. +/// +/// If the input starts with bytes that are the BOM for this encoding, +/// those bytes are removed. However, the decoder never morphs into a +/// decoder for another encoding: A BOM for another encoding is treated as +/// (potentially malformed) input to the decoding algorithm for this +/// encoding. +/// +/// This function returns nothing: the new `Decoder` is written into the +/// memory designated by `decoder`. (In practice, the target should likely be +/// a pointer previously returned by `encoding_new_decoder()`.) +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder_with_bom_removal_into( + encoding: *const Encoding, + decoder: *mut Decoder, +) { + *decoder = (*encoding).new_decoder_with_bom_removal(); +} + +/// Allocates a new `Decoder` for the given `Encoding` into memory provided by +/// the caller with BOM handling disabled. +/// +/// If the input starts with bytes that look like a BOM, those bytes are +/// not treated as a BOM. (Hence, the decoder never morphs into a decoder +/// for another encoding.) +/// +/// _Note:_ If the caller has performed BOM sniffing on its own but has not +/// removed the BOM, the caller should use +/// `encoding_new_decoder_with_bom_removal_into()` instead of this function to +/// cause the BOM to be removed.
+/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_decoder_without_bom_handling_into( + encoding: *const Encoding, + decoder: *mut Decoder, +) { + *decoder = (*encoding).new_decoder_without_bom_handling(); +} + +/// Allocates a new `Encoder` for the given `Encoding` on the heap and returns a +/// pointer to the newly-allocated `Encoder`. (Exception: if the `Encoding` is +/// `replacement`, a new `Encoder` for UTF-8 is instantiated (and that +/// `Encoder` reports `UTF_8` as its `Encoding`).) +/// +/// Once the allocated `Encoder` is no longer needed, the caller _MUST_ +/// deallocate it by passing the pointer returned by this function to +/// `encoder_free()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_encoder(encoding: *const Encoding) -> *mut Encoder { + Box::into_raw(Box::new((*encoding).new_encoder())) +} + +/// Allocates a new `Encoder` for the given `Encoding` into memory provided by +/// the caller. (In practice, the target should likely be a pointer previously +/// returned by `encoding_new_encoder()`.) +/// +/// # Undefined behavior +/// +/// UB ensues if either argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_new_encoder_into( + encoding: *const Encoding, + encoder: *mut Encoder, +) { + *encoder = (*encoding).new_encoder(); +} + +/// Validates UTF-8. +/// +/// Returns the index of the first byte that makes the input malformed as +/// UTF-8 or `buffer_len` if `buffer` is entirely valid. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`.
+/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_utf8_valid_up_to(buffer: *const u8, buffer_len: usize) -> usize { + let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len); + Encoding::utf8_valid_up_to(buffer_slice) +} + +/// Validates ASCII. +/// +/// Returns the index of the first byte that makes the input malformed as +/// ASCII or `buffer_len` if `buffer` is entirely valid. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoding_ascii_valid_up_to(buffer: *const u8, buffer_len: usize) -> usize { + let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len); + Encoding::ascii_valid_up_to(buffer_slice) +} + +/// Validates ISO-2022-JP ASCII-state data. +/// +/// Returns the index of the first byte that makes the input not representable +/// in the ASCII state of ISO-2022-JP or `buffer_len` if `buffer` is entirely +/// representable in the ASCII state of ISO-2022-JP. +/// +/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When +/// `buffer_len` is zero, it is OK for `buffer` to be something +/// non-dereferencable, such as `0x1`. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `buffer` is `NULL`.
+#[no_mangle] +pub unsafe extern "C" fn encoding_iso_2022_jp_ascii_valid_up_to( + buffer: *const u8, + buffer_len: usize, +) -> usize { + let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len); + Encoding::iso_2022_jp_ascii_valid_up_to(buffer_slice) +} + +/// Deallocates a `Decoder` previously allocated by `encoding_new_decoder()`, `encoding_new_decoder_with_bom_removal()` or `encoding_new_decoder_without_bom_handling()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_free(decoder: *mut Decoder) { + let _ = Box::from_raw(decoder); +} + +/// The `Encoding` this `Decoder` is for. +/// +/// BOM sniffing can change the return value of this method during the life +/// of the decoder. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_encoding(decoder: *const Decoder) -> *const Encoding { + (*decoder).encoding() +} + +/// Query the worst-case UTF-8 output size _with replacement_. +/// +/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`) +/// that will not overflow given the current state of the decoder and +/// `byte_length` number of additional input bytes when decoding with +/// errors handled by outputting a REPLACEMENT CHARACTER for each malformed +/// sequence or `SIZE_MAX` if `size_t` would overflow. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_max_utf8_buffer_length( + decoder: *const Decoder, + byte_length: usize, +) -> usize { + (*decoder) + .max_utf8_buffer_length(byte_length) + .unwrap_or(::std::usize::MAX) +} + +/// Query the worst-case UTF-8 output size _without replacement_. +/// +/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`) +/// that will not overflow given the current state of the decoder and +/// `byte_length` number of additional input bytes when decoding without +/// replacement error handling or `SIZE_MAX` if `size_t` would overflow.
+/// +/// Note that this value may be too small for the `_with_replacement` case. +/// Use `decoder_max_utf8_buffer_length()` for that case. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_max_utf8_buffer_length_without_replacement( + decoder: *const Decoder, + byte_length: usize, +) -> usize { + (*decoder) + .max_utf8_buffer_length_without_replacement(byte_length) + .unwrap_or(::std::usize::MAX) +} + +/// Incrementally decode a byte stream into UTF-8 with malformed sequences +/// replaced with the REPLACEMENT CHARACTER. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`Decoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory.
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +#[no_mangle] +pub unsafe extern "C" fn decoder_decode_to_utf8( + decoder: *mut Decoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, + had_replacements: *mut bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written, replaced) = (*decoder).decode_to_utf8(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + *had_replacements = replaced; + coder_result_to_u32(result) +} + +/// Incrementally decode a byte stream into UTF-8 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`Decoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory.
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +#[no_mangle] +pub unsafe extern "C" fn decoder_decode_to_utf8_without_replacement( + decoder: *mut Decoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written) = + (*decoder).decode_to_utf8_without_replacement(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + decoder_result_to_u32(result) +} + +/// Query the worst-case UTF-16 output size (with or without replacement). +/// +/// Returns the size of the output buffer in UTF-16 code units (`char16_t`) +/// that will not overflow given the current state of the decoder and +/// `u16_length` number of additional input bytes (the parameter counts bytes +/// despite its name) or `SIZE_MAX` if `size_t` would overflow. +/// +/// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the +/// return value of this method applies also in the +/// `_without_replacement` case. +/// +/// # Undefined behavior +/// +/// UB ensues if `decoder` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_max_utf16_buffer_length( + decoder: *const Decoder, + u16_length: usize, +) -> usize { + (*decoder) + .max_utf16_buffer_length(u16_length) + .unwrap_or(::std::usize::MAX) +} + +/// Incrementally decode a byte stream into UTF-16 with malformed sequences +/// replaced with the REPLACEMENT CHARACTER. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`Decoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero.
This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +#[no_mangle] +pub unsafe extern "C" fn decoder_decode_to_utf16( + decoder: *mut Decoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u16, + dst_len: *mut usize, + last: bool, + had_replacements: *mut bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written, replaced) = (*decoder).decode_to_utf16(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + *had_replacements = replaced; + coder_result_to_u32(result) +} + +/// Incrementally decode a byte stream into UTF-16 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `decoder_decode_*` functions are mapped from Rust and the documentation +/// for the [`Decoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory.
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html +#[no_mangle] +pub unsafe extern "C" fn decoder_decode_to_utf16_without_replacement( + decoder: *mut Decoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u16, + dst_len: *mut usize, + last: bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written) = + (*decoder).decode_to_utf16_without_replacement(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + decoder_result_to_u32(result) +} + +/// Checks for compatibility with storing Unicode scalar values as unsigned +/// bytes taking into account the state of the decoder. +/// +/// Returns `SIZE_MAX` if the decoder is not in a neutral state, including waiting +/// for the BOM, or if the encoding is never Latin1-byte-compatible. +/// +/// Otherwise returns the index of the first byte whose unsigned value doesn't +/// directly correspond to the decoded Unicode scalar value, or the length +/// of the input if all bytes in the input decode directly to scalar values +/// corresponding to the unsigned byte values. +/// +/// Does not change the state of the decoder. +/// +/// Do not use this unless you are supporting SpiderMonkey/V8-style string +/// storage optimizations. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory +/// block or if `decoder` or `buffer` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn decoder_latin1_byte_compatible_up_to( + decoder: *const Decoder, + buffer: *const u8, + buffer_len: usize, +) -> usize { + (*decoder) + .latin1_byte_compatible_up_to(::std::slice::from_raw_parts(buffer, buffer_len)) + .unwrap_or(::std::usize::MAX) +} + +/// Deallocates an `Encoder` previously allocated by `encoding_new_encoder()`. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`.
+#[no_mangle] +pub unsafe extern "C" fn encoder_free(encoder: *mut Encoder) { + let _ = Box::from_raw(encoder); +} + +/// The `Encoding` this `Encoder` is for. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoder_encoding(encoder: *const Encoder) -> *const Encoding { + (*encoder).encoding() +} + +/// Returns `true` if this is an ISO-2022-JP encoder that's not in the +/// ASCII state and `false` otherwise. +/// +/// # Undefined behavior +/// +/// UB ensues if the argument is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoder_has_pending_state(encoder: *const Encoder) -> bool { + (*encoder).has_pending_state() +} + +/// Query the worst-case output size when encoding from UTF-8 with +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `byte_length` number of +/// additional input code units if there are no unmappable characters in +/// the input or `SIZE_MAX` if `size_t` would overflow. UB ensues if `encoder` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoder_max_buffer_length_from_utf8_if_no_unmappables( + encoder: *const Encoder, + byte_length: usize, +) -> usize { + (*encoder) + .max_buffer_length_from_utf8_if_no_unmappables(byte_length) + .unwrap_or(::std::usize::MAX) +} + +/// Query the worst-case output size when encoding from UTF-8 without +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `byte_length` number of +/// additional input code units or `SIZE_MAX` if `size_t` would overflow. UB ensues if `encoder` is `NULL`.
+#[no_mangle] +pub unsafe extern "C" fn encoder_max_buffer_length_from_utf8_without_replacement( + encoder: *const Encoder, + byte_length: usize, +) -> usize { + (*encoder) + .max_buffer_length_from_utf8_without_replacement(byte_length) + .unwrap_or(::std::usize::MAX) +} + +/// Incrementally encode into byte stream from UTF-8 with unmappable +/// characters replaced with HTML (decimal) numeric character references. +/// +/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe! +/// If in doubt, check the validity of input before using! +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`Encoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory, `dst` and `*dst_len` don't +/// designate a valid block of memory or the input is not valid UTF-8.
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +#[no_mangle] +pub unsafe extern "C" fn encoder_encode_from_utf8( + encoder: *mut Encoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, + had_replacements: *mut bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let string = ::std::str::from_utf8_unchecked(src_slice); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written, replaced) = (*encoder).encode_from_utf8(string, dst_slice, last); + *src_len = read; + *dst_len = written; + *had_replacements = replaced; + coder_result_to_u32(result) +} + +/// Incrementally encode into byte stream from UTF-8 _without replacement_. +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`Encoder`][1] struct for the semantics. +/// +/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe! +/// If in doubt, check the validity of input before using! +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory, `dst` and `*dst_len` don't +/// designate a valid block of memory or the input is not valid UTF-8.
+/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +#[no_mangle] +pub unsafe extern "C" fn encoder_encode_from_utf8_without_replacement( + encoder: *mut Encoder, + src: *const u8, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let string = ::std::str::from_utf8_unchecked(src_slice); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written) = + (*encoder).encode_from_utf8_without_replacement(string, dst_slice, last); + *src_len = read; + *dst_len = written; + encoder_result_to_u32(result) +} + +/// Query the worst-case output size when encoding from UTF-16 with +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `u16_length` number of +/// additional input code units if there are no unmappable characters in +/// the input or `SIZE_MAX` if `size_t` would overflow. UB ensues if `encoder` is `NULL`. +#[no_mangle] +pub unsafe extern "C" fn encoder_max_buffer_length_from_utf16_if_no_unmappables( + encoder: *const Encoder, + u16_length: usize, +) -> usize { + (*encoder) + .max_buffer_length_from_utf16_if_no_unmappables(u16_length) + .unwrap_or(::std::usize::MAX) +} + +/// Query the worst-case output size when encoding from UTF-16 without +/// replacement. +/// +/// Returns the size of the output buffer in bytes that will not overflow +/// given the current state of the encoder and `u16_length` number of +/// additional input code units or `SIZE_MAX` if `size_t` would overflow. UB ensues if `encoder` is `NULL`.
+#[no_mangle] +pub unsafe extern "C" fn encoder_max_buffer_length_from_utf16_without_replacement( + encoder: *const Encoder, + u16_length: usize, +) -> usize { + (*encoder) + .max_buffer_length_from_utf16_without_replacement(u16_length) + .unwrap_or(::std::usize::MAX) +} + +/// Incrementally encode into byte stream from UTF-16 with unmappable +/// characters replaced with HTML (decimal) numeric character references. +/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`Encoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +#[no_mangle] +pub unsafe extern "C" fn encoder_encode_from_utf16( + encoder: *mut Encoder, + src: *const u16, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, + had_replacements: *mut bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written, replaced) = + (*encoder).encode_from_utf16(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + *had_replacements = replaced; + coder_result_to_u32(result) +} + +/// Incrementally encode into byte stream from UTF-16 _without replacement_.
+/// +/// See the top-level FFI documentation for documentation for how the +/// `encoder_encode_*` functions are mapped from Rust and the documentation +/// for the [`Encoder`][1] struct for the semantics. +/// +/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero, +/// it is OK for `src` to be something non-dereferencable, such as `0x1`. +/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's +/// optimization for slices within `Option`. +/// +/// # Undefined behavior +/// +/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len` +/// don't designate a valid block of memory or `dst` and `*dst_len` don't +/// designate a valid block of memory. +/// +/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html +#[no_mangle] +pub unsafe extern "C" fn encoder_encode_from_utf16_without_replacement( + encoder: *mut Encoder, + src: *const u16, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, + last: bool, +) -> u32 { + let src_slice = ::std::slice::from_raw_parts(src, *src_len); + let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len); + let (result, read, written) = + (*encoder).encode_from_utf16_without_replacement(src_slice, dst_slice, last); + *src_len = read; + *dst_len = written; + encoder_result_to_u32(result) +} diff --git a/third_party/rust/encoding_c_mem/.cargo-checksum.json b/third_party/rust/encoding_c_mem/.cargo-checksum.json new file mode 100644 index 0000000000..b2de1315ea --- /dev/null +++ b/third_party/rust/encoding_c_mem/.cargo-checksum.json @@ -0,0 +1 @@
+{"files":{"CONTRIBUTING.md":"d393951002340c3d98011f7b654e8133408f3f0e13b9f6470f4cb5d251e3afed","COPYRIGHT":"8667a5cdf817b0123721cc7d7ca73e97f05ac926203a13646a9e8a30c70c0989","Cargo.toml":"bc7ca08a7395d4839be804fff569d96a5cf0250be792e074af2f57c1ab1fd8d4","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"333b861c160f5328e9fb4bf506e8aaaf1a1eab8e93af3ce03998c3f6b57a2da2","build.rs":"013c85c18b035473d3a0900b833906304a8431882e5c22053684a69588adde98","include/encoding_rs_mem.h":"99f2c8d900bdb66ffd74772419a0e50d482d25f20db54a187c78d079fe483be0","include/encoding_rs_mem_cpp.h":"5a546590508d8e1cc78493d6e0a04cdb80a499d23ef192603a31aaf2e518ca3a","src/lib.rs":"7d5940a215cd93b231aafa61cc9cff474e808893a35d5236b99e7317697ac308"},"package":"3a80a16821fe8c7cab96e0c67b57cd7090e021e9615e6ce6ab0cf866c44ed1f0"}
\ No newline at end of file diff --git a/third_party/rust/encoding_c_mem/CONTRIBUTING.md b/third_party/rust/encoding_c_mem/CONTRIBUTING.md new file mode 100644 index 0000000000..88322776f6 --- /dev/null +++ b/third_party/rust/encoding_c_mem/CONTRIBUTING.md @@ -0,0 +1,33 @@ +If you send a pull request / patch, please observe the following. + +## Licensing + +Since this crate is dual-licensed, +[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) +is considered to apply in the sense of Contributions being automatically +under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). +That is, by the act of offering a Contribution, you place your Contribution +under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` +file. Please do not contribute if you aren't willing or allowed to license your +contributions in this manner. + +You are encouraged to dedicate test code that you contribute to the Public +Domain using the CC0 dedication. If you contribute test code that is not +dedicated to the Public Domain, please be sure not to put it in a part of +source code that the comments designate as being dedicated to the Public +Domain. + +## Copyright Notices + +If you require the addition of your copyright notice, it's up to you to edit in +your notice as part of your Contribution. Not adding a copyright notice is +taken as a waiver of copyright notice. + +## Compatibility with Stable Rust + +Please ensure that your Contribution compiles with the latest stable-channel +rustc. + +## rustfmt + +Please run `cargo fmt` before creating a pull request.
\ No newline at end of file diff --git a/third_party/rust/encoding_c_mem/COPYRIGHT b/third_party/rust/encoding_c_mem/COPYRIGHT new file mode 100644 index 0000000000..b4569d6701 --- /dev/null +++ b/third_party/rust/encoding_c_mem/COPYRIGHT @@ -0,0 +1,9 @@ +encoding_c_mem is copyright Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 +<LICENSE-APACHE or +https://www.apache.org/licenses/LICENSE-2.0> or the MIT +license <LICENSE-MIT or https://opensource.org/licenses/MIT>, +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. diff --git a/third_party/rust/encoding_c_mem/Cargo.toml b/third_party/rust/encoding_c_mem/Cargo.toml new file mode 100644 index 0000000000..2284f40fd8 --- /dev/null +++ b/third_party/rust/encoding_c_mem/Cargo.toml @@ -0,0 +1,27 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. 
If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "encoding_c_mem" +version = "0.2.6" +authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"] +links = "encoding_c_mem" +description = "C API for encoding_rs::mem" +homepage = "https://docs.rs/encoding_c_mem/" +documentation = "https://docs.rs/encoding_c_mem/" +readme = "README.md" +keywords = ["ffi", "capi", "encoding", "unicode", "charset"] +license = "Apache-2.0 OR MIT" +repository = "https://github.com/hsivonen/encoding_c_mem" +[dependencies.encoding_rs] +version = "0.8.19" diff --git a/third_party/rust/encoding_c_mem/LICENSE-APACHE b/third_party/rust/encoding_c_mem/LICENSE-APACHE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/third_party/rust/encoding_c_mem/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/rust/encoding_c_mem/LICENSE-MIT b/third_party/rust/encoding_c_mem/LICENSE-MIT new file mode 100644 index 0000000000..3317c82e2f --- /dev/null +++ b/third_party/rust/encoding_c_mem/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/rust/encoding_c_mem/README.md b/third_party/rust/encoding_c_mem/README.md new file mode 100644 index 0000000000..59c7abe607 --- /dev/null +++ b/third_party/rust/encoding_c_mem/README.md @@ -0,0 +1,60 @@ +# encoding_c_mem + +[![crates.io](https://meritbadge.herokuapp.com/encoding_c_mem)](https://crates.io/crates/encoding_c_mem) +[![docs.rs](https://docs.rs/encoding_c_mem/badge.svg)](https://docs.rs/encoding_c_mem/) +[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_c_mem/blob/master/COPYRIGHT) + +encoding_c_mem is an FFI wrapper for the `mem` module of [encoding_rs](https://github.com/hsivonen/encoding_rs). + +## Licensing + +Please see the file named +[COPYRIGHT](https://github.com/hsivonen/encoding_c_mem/blob/master/COPYRIGHT). + +## No Unwinding Support! + +This crate is meant for use in binaries compiled with `panic = 'abort'`, which +is _required_ for correctness! Unwinding across FFI is Undefined Behavior, and +this crate does nothing to try to prevent unwinding across the FFI if +compiled with unwinding enabled. + +## Release Notes + +### 0.2.6 + +* Remove year from copyright notices. + +### 0.2.5 + +* Specify a `links` value in the Cargo manifest. +* Emit an `include_dir` variable from build script so that other build scripts + depending on this crate can rely on it. + +### 0.2.4 + +* Documentation-only fix. + +### 0.2.3 + +* Documentation-only fix. + +### 0.2.2 + +* Wrap `convert_utf8_to_utf16_without_replacement`, `utf8_latin1_up_to`, + and `str_latin1_up_to`. + +### 0.2.1 + +* Fix a typo in README. + +### 0.2.0 + +* Use `char` instead of `uint8_t` for 8-bit-unit text in C and C++. + +### 0.1.1 + +* Add include guard to the C header. + +### 0.1.0 + +* Initial release of encoding_c_mem. 
diff --git a/third_party/rust/encoding_c_mem/build.rs b/third_party/rust/encoding_c_mem/build.rs new file mode 100644 index 0000000000..962b7ae12b --- /dev/null +++ b/third_party/rust/encoding_c_mem/build.rs @@ -0,0 +1,7 @@ +fn main() { + println!("cargo:rerun-if-changed="); + + let cargo_manifest_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap(); + let include_dir = std::path::PathBuf::from(cargo_manifest_dir).join("include"); + println!("cargo:include-dir={}", include_dir.display()); +} diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h new file mode 100644 index 0000000000..2327a9dd0b --- /dev/null +++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h @@ -0,0 +1,704 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#ifndef encoding_rs_mem_h_ +#define encoding_rs_mem_h_ + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +/* + * _Note:_ "Latin1" in this header refers to the Unicode range from U+0000 to + * U+00FF, inclusive, and does not refer to the windows-1252 range. This + * in-memory encoding is sometimes used as a storage optimization of text + * when UTF-16 indexing and length semantics are exposed. + */ + +/** + * Classification of text as Latin1 (all code points are below U+0100), + * left-to-right with some non-Latin1 characters or as containing at least + * some right-to-left characters. + */ +typedef enum { + /** + * Every character is below U+0100. 
+ */ + Latin1 = 0, + /** + * There is at least one character that's U+0100 or higher, but there + * are no right-to-left characters. + */ + LeftToRight = 1, + /** + * There is at least one right-to-left character. + */ + Bidi = 2, +} Latin1Bidi; + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * Checks whether a valid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. + * Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) + */ +Latin1Bidi encoding_mem_check_str_for_latin1_and_bidi(const char* buffer, + size_t len); + +/** + * Checks whether a potentially invalid UTF-16 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) 
+ */ +Latin1Bidi encoding_mem_check_utf16_for_latin1_and_bidi(const char16_t* buffer, + size_t len); + +/** + * Checks whether a potentially invalid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. + * + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +Latin1Bidi encoding_mem_check_utf8_for_latin1_and_bidi(const char* buffer, + size_t len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of `char16_t`s written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8. + * + * The length of the destination buffer must be at least the length of the + * source buffer times two. 
+ * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Safety + * + * Note that this function may write garbage beyond the number of bytes + * indicated by the return value. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_latin1_to_utf8(const char* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient + * output space. + * + * Writes the number of code units read into `*src_len` and the number of + * bytes written into `*dst_len`. + * + * If the output isn't large enough, not all input is consumed. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_latin1_to_utf8_partial(const char* src, + size_t* src_len, char* dst, + size_t* dst_len); + +/** + * Converts valid UTF-8 to valid UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL`, if the two memory blocks overlap, or if the + * buffer designated by `src` and `src_len` does not contain valid UTF-8. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_str_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * If the input is valid UTF-16 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, does something + * that is memory-safe without any promises about any properties of the + * output and will probably assert in debug builds in future versions. + * In particular, callers shouldn't assume the output to be the same across + * crate versions or CPU architectures and should not assume that non-ASCII + * input can't map to ASCII output. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of bytes written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * (Probably in future versions if debug assertions are enabled (and not + * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap.
(If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer times three. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER with potentially insufficient output + * space. + * + * Writes the number of code units read into `*src_len` and the number of + * bytes written into `*dst_len`. + * + * Guarantees that the bytes in the destination beyond the number of + * bytes claimed as written via `*dst_len` + * are left unmodified. + * + * Not all code units are read if there isn't enough output space. + * Note that this method isn't designed for general streamability but for + * not allocating memory for the worst case up front. Specifically, + * if the input starts with or ends with an unpaired surrogate, those are + * replaced with the REPLACEMENT CHARACTER. + * + * Matches the semantics of `TextEncoder.encodeInto()` from the + * Encoding Standard.
+ * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +void encoding_mem_convert_utf16_to_utf8_partial(const char16_t* src, + size_t* src_len, char* dst, + size_t* dst_len); + +/** + * If the input is valid UTF-8 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, this function + * panics if debug assertions are enabled (and fuzzing isn't) and otherwise + * does something that is memory-safe without any promises about any + * properties of the output. In particular, callers shouldn't assume the + * output to be the same across crate versions or CPU architectures and + * should not assume that non-ASCII input can't map to ASCII output. + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * If debug assertions are enabled (and not fuzzing) and the input is + * not in the range U+0000 to U+00FF, inclusive. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) 
+ */ +size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer _plus one_. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written or `SIZE_MAX` if the input was + * invalid. + * + * When the input was invalid, some output may have been written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) 
+ */ +size_t encoding_mem_convert_utf8_to_utf16_without_replacement(const char* src, + size_t src_len, + char16_t* dst, + size_t dst_len); + +/** + * Copies ASCII from source to destination up to the first non-ASCII byte + * (or the end of the input if it is ASCII in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_copy_ascii_to_ascii(const char* src, size_t src_len, + char* dst, size_t dst_len); + +/** + * Copies ASCII from source to destination zero-extending it to UTF-16 up to + * the first non-ASCII byte (or the end of the input if it is ASCII in its + * entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) 
+ */ +size_t encoding_mem_copy_ascii_to_basic_latin(const char* src, size_t src_len, + char16_t* dst, size_t dst_len); + +/** + * Copies Basic Latin from source to destination narrowing it to ASCII up to + * the first non-Basic Latin code unit (or the end of the input if it is + * Basic Latin in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `src_len` don't designate a valid memory block, if + * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory + * block, if `dst` is `NULL` or if the two memory blocks overlap. (If + * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and + * aligned. Likewise for `dst` and `dst_len`.) + */ +size_t encoding_mem_copy_basic_latin_to_ascii(const char16_t* src, + size_t src_len, char* dst, + size_t dst_len); + +/** + * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t len); + +/** + * Checks whether the buffer is all-ASCII. + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_ascii(const char* buffer, size_t len); + +/** + * Checks whether the buffer is all-Basic Latin (i.e. 
UTF-16 representing + * only ASCII characters). + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t len); + +/** + * Checks whether a scalar value triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. + */ +bool encoding_mem_is_char_bidi(char32_t c); + +/** + * Checks whether a valid UTF-8 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. 
Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) + */ +bool encoding_mem_is_str_bidi(const char* buffer, size_t len); + +/** + * Checks whether the buffer represents only code points less than or equal + * to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if code + * points above U+00FF are discovered.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory designated by `buffer` and + * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` + * may be bogus but still has to be non-`NULL`.) + */ +bool encoding_mem_is_str_latin1(const char* buffer, size_t len); + +/** + * Checks whether a UTF-16 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. 
As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Returns `true` if the input contains an RTL character or an unpaired + * high surrogate that could be the high half of an RTL character. + * Returns `false` if the input contains neither RTL characters nor + * unpaired high surrogates that could be higher halves of RTL characters. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len); + +/** + * Checks whether a UTF-16 code unit triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Since supplementary-plane right-to-left blocks are identifiable from the + * high surrogate without examining the low surrogate, this function returns + * `true` for such high surrogates making the function suitable for handling + * supplementary-plane text without decoding surrogate pairs to scalar + * values. Obviously, such high surrogates are then reported as right-to-left + * even if actually unpaired. + */ +bool encoding_mem_is_utf16_code_unit_bidi(char16_t u); + +/** + * Checks whether the buffer represents only code points less than or equal + * to U+00FF. 
+ * + * May read the entire buffer even if it isn't all-Latin1. (I.e. the function + * is not guaranteed to fail fast.) + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t len); + +/** + * Checks whether a potentially-invalid UTF-8 buffer contains code points + * that trigger right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Returns `true` if the input is invalid UTF-8 or the input contains an + * RTL character. Returns `false` if the input is valid UTF-8 and contains + * no RTL characters. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_utf8_bidi(const char* buffer, size_t len); + +/** + * Checks whether the buffer is valid UTF-8 representing only code points + * less than or equal to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if UTF-8 + * invalidity or code points above U+00FF are discovered.) 
+ * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL`.) + */ +bool encoding_mem_is_utf8_latin1(const char* buffer, size_t len); + +/** + * Returns the index of the first unpaired surrogate or, if the input is + * valid UTF-16 in its entirety, the length of the input. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer, size_t len); + +/** + * Returns the index of first byte that starts an invalid byte + * sequence or a non-Latin1 byte sequence, or the length of the + * string if there are neither. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block + * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but + * still has to be non-`NULL` and aligned.) + */ +size_t encoding_mem_utf8_latin1_up_to(const char* buffer, size_t len); + +/** + * Returns the index of first byte that starts a non-Latin1 byte + * sequence, or the length of the string if there are none. + * + * # Undefined behavior + * + * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, + * if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8. + * (If `buffer_len` is `0`, `buffer` may be bogus but still has to be non-`NULL` + * and aligned.) 
+ */ +size_t encoding_mem_str_latin1_up_to(const char* buffer, size_t len); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // encoding_rs_mem_h_ diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h new file mode 100644 index 0000000000..b6173d7ef4 --- /dev/null +++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h @@ -0,0 +1,578 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#pragma once + +#ifndef encoding_rs_mem_cpp_h_ +#define encoding_rs_mem_cpp_h_ + +#include <optional> +#include <string_view> +#include <tuple> +#include "gsl/gsl" + +#include "encoding_rs_mem.h" + +namespace encoding_rs { +namespace mem { + +namespace detail { +/** + * Replaces `nullptr` with a bogus pointer suitable for use as part of a + * zero-length Rust slice. + */ +template <class T> +static inline T* null_to_bogus(T* ptr) { + return ptr ? ptr : reinterpret_cast<T*>(alignof(T)); +} +}; // namespace detail + +/** + * Checks whether a potentially invalid UTF-16 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. 
+ */ +inline Latin1Bidi check_for_latin1_and_bidi(std::u16string_view buffer) { + return encoding_mem_check_utf16_for_latin1_and_bidi( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a potentially invalid UTF-8 buffer contains code points + * that trigger right-to-left processing or is all-Latin1. + * + * Possibly more efficient than performing the checks separately. + * + * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. + * + * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return + * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. + */ +inline Latin1Bidi check_for_latin1_and_bidi(std::string_view buffer) { + return encoding_mem_check_utf8_for_latin1_and_bidi( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of `char16_t`s written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline void convert_latin1_to_utf16(gsl::span<const char> src, + gsl::span<char16_t> dst) { + encoding_mem_convert_latin1_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8. + * + * The length of the destination buffer must be at least the length of the + * source buffer times two. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ * + * # Safety + * + * Note that this function may write garbage beyond the number of bytes + * indicated by the return value. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline size_t convert_latin1_to_utf8(gsl::span<const char> src, + gsl::span<char> dst) { + return encoding_mem_convert_latin1_to_utf8( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts bytes whose unsigned value is interpreted as Unicode code point + * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient + * output space. + * + * Returns the number of bytes read and the number of bytes written. + * + * If the output isn't large enough, not all input is consumed. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline std::tuple<size_t, size_t> convert_latin1_to_utf8_partial( + gsl::span<const char> src, gsl::span<char> dst) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + encoding_mem_convert_latin1_to_utf8_partial( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + &dst_written); + return {src_read, dst_written}; +} + +/** + * Converts valid UTF-8 to valid UTF-16. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ */ +inline size_t convert_str_to_utf16(std::string_view src, + gsl::span<char16_t> dst) { + return encoding_mem_convert_str_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * If the input is valid UTF-16 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. + * + * If the input does not fulfill the condition stated above, does something + * that is memory-safe without any promises about any properties of the + * output and will probably assert in debug builds in future versions. + * In particular, callers shouldn't assume the output to be the same across + * crate versions or CPU architectures and should not assume that non-ASCII + * input can't map to ASCII output. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * The number of bytes written equals the length of the source buffer. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * (Probably in future versions if debug assertions are enabled (and not + * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) + */ +inline void convert_utf16_to_latin1_lossy(std::u16string_view src, + gsl::span<char> dst) { + encoding_mem_convert_utf16_to_latin1_lossy( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer times three. + * + * Returns the number of bytes written. 
+ * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t convert_utf16_to_utf8(std::u16string_view src, + gsl::span<char> dst) { + return encoding_mem_convert_utf16_to_utf8( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced + * with the REPLACEMENT CHARACTER with potentially insufficient output + * space. + * + * Returns the number of code units read and the number of bytes written. + * + * Guarantees that the bytes in the destination beyond the number of + * bytes claimed as written by the second item of the return tuple + * are left unmodified. + * + * Not all code units are read if there isn't enough output space. + * Note that this method isn't designed for general streamability but for + * not allocating memory for the worst case up front. Specifically, + * if the input starts with or ends with an unpaired surrogate, those are + * replaced with the REPLACEMENT CHARACTER. + * + * Matches the semantics of `TextEncoder.encodeInto()` from the + * Encoding Standard. + */ +inline std::tuple<size_t, size_t> convert_utf16_to_utf8_partial( + std::u16string_view src, gsl::span<char> dst) { + size_t src_read = src.size(); + size_t dst_written = dst.size(); + encoding_mem_convert_utf16_to_utf8_partial( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + &dst_written); + return {src_read, dst_written}; +} + +/** + * If the input is valid UTF-8 representing only Unicode code points from + * U+0000 to U+00FF, inclusive, converts the input into output that + * represents the value of each code point as the unsigned byte value of + * each output byte. 
+ * + * If the input does not fulfill the condition stated above, this function + * panics if debug assertions are enabled (and fuzzing isn't) and otherwise + * does something that is memory-safe without any promises about any + * properties of the output. In particular, callers shouldn't assume the + * output to be the same across crate versions or CPU architectures and + * should not assume that non-ASCII input can't map to ASCII output. + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * If debug assertions are enabled (and not fuzzing) and the input is + * not in the range U+0000 to U+00FF, inclusive. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline size_t convert_utf8_to_latin1_lossy(std::string_view src, + gsl::span<char> dst) { + return encoding_mem_convert_utf8_to_latin1_lossy( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced + * with the REPLACEMENT CHARACTER. + * + * The length of the destination buffer must be at least the length of the + * source buffer _plus one_. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t convert_utf8_to_utf16(std::string_view src, + gsl::span<char16_t> dst) { + return encoding_mem_convert_utf8_to_utf16( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. 
+ * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written or `std::nullopt` if the input was + * invalid. + * + * When the input was invalid, some output may have been written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline std::optional<size_t> convert_utf8_to_utf16_without_replacement( + std::string_view src, gsl::span<char16_t> dst) { + size_t val = encoding_mem_convert_utf8_to_utf16_without_replacement( + encoding_rs::mem::detail::null_to_bogus<const char>( + reinterpret_cast<const char*>(src.data())), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); + if (val == SIZE_MAX) { + return std::nullopt; + } + return val; +} + +/** + * Copies ASCII from source to destination up to the first non-ASCII byte + * (or the end of the input if it is ASCII in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + * + * # Undefined behavior + * + * UB ensues if `src` and `dst` overlap. + */ +inline size_t copy_ascii_to_ascii(gsl::span<const char> src, + gsl::span<char> dst) { + return encoding_mem_copy_ascii_to_ascii( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Copies ASCII from source to destination zero-extending it to UTF-16 up to + * the first non-ASCII byte (or the end of the input if it is ASCII in its + * entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of `char16_t`s written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. 
+ */ +inline size_t copy_ascii_to_basic_latin(gsl::span<const char> src, + gsl::span<char16_t> dst) { + return encoding_mem_copy_ascii_to_basic_latin( + encoding_rs::mem::detail::null_to_bogus<const char>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()), + dst.size()); +} + +/** + * Copies Basic Latin from source to destination narrowing it to ASCII up to + * the first non-Basic Latin code unit (or the end of the input if it is + * Basic Latin in its entirety). + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * Returns the number of bytes written. + * + * # Panics + * + * Panics if the destination buffer is shorter than stated above. + */ +inline size_t copy_basic_latin_to_ascii(gsl::span<const char16_t> src, + gsl::span<char> dst) { + return encoding_mem_copy_basic_latin_to_ascii( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()), + src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()), + dst.size()); +} + +/** + * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. + */ +inline void ensure_utf16_validity(gsl::span<char16_t> buffer) { + encoding_mem_ensure_utf16_validity( + encoding_rs::mem::detail::null_to_bogus<char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is all-ASCII. + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) + */ +inline bool is_ascii(std::string_view buffer) { + return encoding_mem_is_ascii( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing + * only ASCII characters). + * + * May read the entire buffer even if it isn't all-ASCII. (I.e. the function + * is not guaranteed to fail fast.) 
+ */ +inline bool is_ascii(std::u16string_view buffer) { + return encoding_mem_is_basic_latin( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a scalar value triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * + * # Undefined behavior + * + * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. + */ +inline bool is_scalar_value_bidi(char32_t c) { + return encoding_mem_is_char_bidi(c); +} + +/** + * Checks whether a UTF-16 buffer contains code points that trigger + * right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. 
+ * Returns `true` if the input contains an RTL character or an unpaired + * high surrogate that could be the high half of an RTL character. + * Returns `false` if the input contains neither RTL characters nor + * unpaired high surrogates that could be higher halves of RTL characters. + */ +inline bool is_bidi(std::u16string_view buffer) { + return encoding_mem_is_utf16_bidi( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a UTF-16 code unit triggers right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Since supplementary-plane right-to-left blocks are identifiable from the + * high surrogate without examining the low surrogate, this function returns + * `true` for such high surrogates making the function suitable for handling + * supplementary-plane text without decoding surrogate pairs to scalar + * values. Obviously, such high surrogates are then reported as right-to-left + * even if actually unpaired. + */ +inline bool is_utf16_code_unit_bidi(char16_t u) { + return encoding_mem_is_utf16_code_unit_bidi(u); +} + +/** + * Checks whether the buffer represents only code points less than or equal + * to U+00FF. + * + * May read the entire buffer even if it isn't all-Latin1. (I.e. the function + * is not guaranteed to fail fast.) 
+ */ +inline bool is_utf16_latin1(std::u16string_view buffer) { + return encoding_mem_is_utf16_latin1( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether a potentially-invalid UTF-8 buffer contains code points + * that trigger right-to-left processing. + * + * The check is done on a Unicode block basis without regard to assigned + * vs. unassigned code points in the block. Hebrew presentation forms in + * the Alphabetic Presentation Forms block are treated as if they formed + * a block on their own (i.e. it is treated as right-to-left). Additionally, + * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked + * for. Control characters that are technically bidi controls but do not + * cause right-to-left behavior without the presence of right-to-left + * characters or right-to-left controls are not checked for. As a special + * case, U+FEFF is excluded from Arabic Presentation Forms-B. + * Returns `true` if the input is invalid UTF-8 or the input contains an + * RTL character. Returns `false` if the input is valid UTF-8 and contains + * no RTL characters. + */ +inline bool is_bidi(std::string_view buffer) { + return encoding_mem_is_utf8_bidi( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Checks whether the buffer is valid UTF-8 representing only code points + * less than or equal to U+00FF. + * + * Fails fast. (I.e. returns before having read the whole buffer if UTF-8 + * invalidity or code points above U+00FF are discovered.) + */ +inline bool is_utf8_latin1(std::string_view buffer) { + return encoding_mem_is_utf8_latin1( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +/** + * Returns the index of the first unpaired surrogate or, if the input is + * valid UTF-16 in its entirety, the length of the input. 
+ */ +inline size_t utf16_valid_up_to(std::u16string_view buffer) { + return encoding_mem_utf16_valid_up_to( + encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()), + buffer.size()); +} + +/** + * Returns the index of first byte that starts a non-Latin1 byte + * sequence, or the length of the string if there are none. + */ +inline size_t utf8_latin1_up_to(std::string_view buffer) { + return encoding_mem_utf8_latin1_up_to( + encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()), + buffer.size()); +} + +}; // namespace mem +}; // namespace encoding_rs + +#endif // encoding_rs_mem_cpp_h_ diff --git a/third_party/rust/encoding_c_mem/src/lib.rs b/third_party/rust/encoding_c_mem/src/lib.rs new file mode 100644 index 0000000000..e5f31c1be0 --- /dev/null +++ b/third_party/rust/encoding_c_mem/src/lib.rs @@ -0,0 +1,825 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! FFI bindings for `encoding_rs::mem`. +//! +//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to +//! U+00FF, inclusive, and does not refer to the windows-1252 range. This +//! in-memory encoding is sometimes used as a storage optimization of text +//! when UTF-16 indexing and length semantics are exposed. + +use encoding_rs::mem::Latin1Bidi; + +/// Checks whether the buffer is all-ASCII. +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. 
(If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_ascii(buffer: *const u8, len: usize) -> bool { + encoding_rs::mem::is_ascii(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing +/// only ASCII characters). +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool { + encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether the buffer is valid UTF-8 representing only code points +/// less than or equal to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8 +/// invalidity or code points above U+00FF are discovered. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool { + encoding_rs::mem::is_utf8_latin1(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether the buffer represents only code points less than or equal +/// to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if code +/// points above U+00FF are discovered. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, +/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `buffer_len` +/// does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool { + encoding_rs::mem::is_str_latin1(::std::str::from_utf8_unchecked( + ::std::slice::from_raw_parts(buffer, len), + )) +} + +/// Checks whether the buffer represents only code point less than or equal +/// to U+00FF. +/// +/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function +/// is not guaranteed to fail fast.) +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool { + encoding_rs::mem::is_utf16_latin1(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether a potentially-invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. 
+/// +/// Returns `true` if the input is invalid UTF-8 or the input contains an +/// RTL character. Returns `false` if the input is valid UTF-8 and contains +/// no RTL characters. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_utf8_bidi(buffer: *const u8, len: usize) -> bool { + encoding_rs::mem::is_utf8_bidi(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether a valid UTF-8 buffer contains code points that trigger +/// right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, +/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `buffer_len` +/// does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_str_bidi(buffer: *const u8, len: usize) -> bool { + encoding_rs::mem::is_str_bidi(::std::str::from_utf8_unchecked( + ::std::slice::from_raw_parts(buffer, len), + )) +} + +/// Checks whether a UTF-16 buffer contains code points that trigger +/// right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// Returns `true` if the input contains an RTL character or an unpaired +/// high surrogate that could be the high half of an RTL character. +/// Returns `false` if the input contains neither RTL characters nor +/// unpaired high surrogates that could be higher halves of RTL characters. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_utf16_bidi(buffer: *const u16, len: usize) -> bool { + encoding_rs::mem::is_utf16_bidi(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether a scalar value triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. 
Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// # Undefined behavior +/// +/// Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_char_bidi(c: char) -> bool { + encoding_rs::mem::is_char_bidi(c) +} + +/// Checks whether a UTF-16 code unit triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// Since supplementary-plane right-to-left blocks are identifiable from the +/// high surrogate without examining the low surrogate, this function returns +/// `true` for such high surrogates making the function suitable for handling +/// supplementary-plane text without decoding surrogate pairs to scalar +/// values. Obviously, such high surrogates are then reported as right-to-left +/// even if actually unpaired. 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_is_utf16_code_unit_bidi(u: u16) -> bool { + encoding_rs::mem::is_utf16_code_unit_bidi(u) +} + +/// Checks whether a potentially invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_check_utf8_for_latin1_and_bidi( + buffer: *const u8, + len: usize, +) -> Latin1Bidi { + encoding_rs::mem::check_utf8_for_latin1_and_bidi(::std::slice::from_raw_parts(buffer, len)) +} + +/// Checks whether a valid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, +/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `buffer_len` +/// does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_check_str_for_latin1_and_bidi( + buffer: *const u8, + len: usize, +) -> Latin1Bidi { + encoding_rs::mem::check_str_for_latin1_and_bidi(::std::str::from_utf8_unchecked( + ::std::slice::from_raw_parts(buffer, len), + )) +} + +/// Checks whether a potentially invalid UTF-16 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_check_utf16_for_latin1_and_bidi( + buffer: *const u16, + len: usize, +) -> Latin1Bidi { + encoding_rs::mem::check_utf16_for_latin1_and_bidi(::std::slice::from_raw_parts(buffer, len)) +} + +/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced +/// with the REPLACEMENT CHARACTER. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer _plus one_. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16( + src: *const u8, + src_len: usize, + dst: *mut u16, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_utf8_to_utf16( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// Converts valid UTF-8 to valid UTF-16. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL`, if the two memory blocks overlap, of if the +/// buffer designated by `src` and `src_len` does not contain valid UTF-8. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_str_to_utf16( + src: *const u8, + src_len: usize, + dst: *mut u16, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_str_to_utf16( + ::std::str::from_utf8_unchecked(::std::slice::from_raw_parts(src, src_len)), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written or `SIZE_MAX` if the input was invalid. +/// +/// When the input was invalid, some output may have been written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16_without_replacement( + src: *const u8, + src_len: usize, + dst: *mut u16, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_utf8_to_utf16_without_replacement( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ).unwrap_or(::std::usize::MAX) +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER with potentially insufficient output +/// space. +/// +/// Writes the number of code units read into `*src_len` and the number of +/// bytes written into `*dst_len`. +/// +/// Guarantees that the bytes in the destination beyond the number of +/// bytes claimed as written by the second item of the return tuple +/// are left unmodified. +/// +/// Not all code units are read if there isn't enough output space. +/// +/// Note that this method isn't designed for general streamability but for +/// not allocating memory for the worst case up front. Specifically, +/// if the input starts with or ends with an unpaired surrogate, those are +/// replaced with the REPLACEMENT CHARACTER. +/// +/// Matches the semantics of `TextEncoder.encodeInto()` from the +/// Encoding Standard. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use +/// `convert_utf16_to_str_partial()` instead of using this function +/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8_partial( + src: *const u16, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, +) { + let (read, written) = encoding_rs::mem::convert_utf16_to_utf8_partial( + ::std::slice::from_raw_parts(src, *src_len), + ::std::slice::from_raw_parts_mut(dst, *dst_len), + ); + *src_len = read; + *dst_len = written; +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times three. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` +/// instead of using this function together with the `unsafe` method +/// `as_bytes_mut()` on `&mut str`. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8( + src: *const u16, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_utf16_to_utf8( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of `u16`s written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16( + src: *const u8, + src_len: usize, + dst: *mut u16, + dst_len: usize, +) { + encoding_rs::mem::convert_latin1_to_utf16( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ); +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient +/// output space. +/// +/// Writes the number of code units read into `*src_len` and the number of +/// bytes written into `*dst_len`. +/// +/// If the output isn't large enough, not all input is consumed. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use +/// `encoding_mem_convert_latin1_to_str_partial()` instead of using this function +/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf8_partial( + src: *const u8, + src_len: *mut usize, + dst: *mut u8, + dst_len: *mut usize, +) { + let (read, written) = encoding_rs::mem::convert_latin1_to_utf8_partial( + ::std::slice::from_raw_parts(src, *src_len), + ::std::slice::from_raw_parts_mut(dst, *dst_len), + ); + *src_len = read; + *dst_len = written; +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times two. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Safety +/// +/// Note that this function may write garbage beyond the number of bytes +/// indicated by the return value, so using a `&mut str` interpreted as +/// `&mut [u8]` as the destination is not safe. If you want to convert into +/// a `&mut str`, use `convert_utf16_to_str()` instead of this function. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf8( + src: *const u8, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_latin1_to_utf8( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// If the input is valid UTF-8 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, this function +/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise +/// does something that is memory-safe without any promises about any +/// properties of the output. In particular, callers shouldn't assume the +/// output to be the same across crate versions or CPU architectures and +/// should not assume that non-ASCII input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// If debug assertions are enabled (and not fuzzing) and the input is +/// not in the range U+0000 to U+00FF, inclusive. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy( + src: *const u8, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) -> usize { + encoding_rs::mem::convert_utf8_to_latin1_lossy( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// If the input is valid UTF-16 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, does something +/// that is memory-safe without any promises about any properties of the +/// output and will probably assert in debug builds in future versions. +/// In particular, callers shouldn't assume the output to be the same across +/// crate versions or CPU architectures and should not assume that non-ASCII +/// input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of bytes written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// (Probably in future versions if debug assertions are enabled (and not +/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy( + src: *const u16, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) { + encoding_rs::mem::convert_utf16_to_latin1_lossy( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ); +} + +/// Returns the index of the first unpaired surrogate or, if the input is +/// valid UTF-16 in its entirety, the length of the input. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_utf16_valid_up_to(buffer: *const u16, len: usize) -> usize { + encoding_rs::mem::utf16_valid_up_to(::std::slice::from_raw_parts(buffer, len)) +} + +/// Returns the index of first byte that starts an invalid byte +/// sequence or a non-Latin1 byte sequence, or the length of the +/// string if there are neither. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_utf8_latin1_up_to(buffer: *const u8, len: usize) -> usize { + encoding_rs::mem::utf8_latin1_up_to(::std::slice::from_raw_parts(buffer, len)) +} + +/// Returns the index of first byte that starts a non-Latin1 byte +/// sequence, or the length of the string if there are none. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, +/// if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8. +/// (If `buffer_len` is `0`, `buffer` may be bogus but still has to be non-`NULL` +/// and aligned.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_str_latin1_up_to(buffer: *const u8, len: usize) -> usize { + encoding_rs::mem::str_latin1_up_to(::std::str::from_utf8_unchecked( + ::std::slice::from_raw_parts(buffer, len), + )) +} + +/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. +/// +/// # Undefined behavior +/// +/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory block +/// or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but +/// still has to be non-`NULL` and aligned.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_ensure_utf16_validity(buffer: *mut u16, len: usize) { + encoding_rs::mem::ensure_utf16_validity(::std::slice::from_raw_parts_mut(buffer, len)); +} + +/// Copies ASCII from source to destination up to the first non-ASCII byte +/// (or the end of the input if it is ASCII in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) 
+#[no_mangle] +pub unsafe extern "C" fn encoding_mem_copy_ascii_to_ascii( + src: *const u8, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) -> usize { + encoding_rs::mem::copy_ascii_to_ascii( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// Copies ASCII from source to destination zero-extending it to UTF-16 up to +/// the first non-ASCII byte (or the end of the input if it is ASCII in its +/// entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_copy_ascii_to_basic_latin( + src: *const u8, + src_len: usize, + dst: *mut u16, + dst_len: usize, +) -> usize { + encoding_rs::mem::copy_ascii_to_basic_latin( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} + +/// Copies Basic Latin from source to destination narrowing it to ASCII up to +/// the first non-Basic Latin code unit (or the end of the input if it is +/// Basic Latin in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. 
+/// +/// # Undefined behavior +/// +/// UB ensues if `src` and `src_len` don't designate a valid memory block, if +/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory +/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If +/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and +/// aligned. Likewise for `dst` and `dst_len`.) +#[no_mangle] +pub unsafe extern "C" fn encoding_mem_copy_basic_latin_to_ascii( + src: *const u16, + src_len: usize, + dst: *mut u8, + dst_len: usize, +) -> usize { + encoding_rs::mem::copy_basic_latin_to_ascii( + ::std::slice::from_raw_parts(src, src_len), + ::std::slice::from_raw_parts_mut(dst, dst_len), + ) +} |