summary | refs | log | tree | commit | diff | stats
path: root/third_party/rust/encoding_c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- third_party/rust/encoding_c/.cargo-checksum.json 1
-rw-r--r-- third_party/rust/encoding_c/CONTRIBUTING.md 38
-rw-r--r-- third_party/rust/encoding_c/COPYRIGHT 9
-rw-r--r-- third_party/rust/encoding_c/Cargo.toml 39
-rw-r--r-- third_party/rust/encoding_c/LICENSE-APACHE 202
-rw-r--r-- third_party/rust/encoding_c/LICENSE-MIT 25
-rw-r--r-- third_party/rust/encoding_c/README.md 131
-rw-r--r-- third_party/rust/encoding_c/build-disabled.rs 60
-rw-r--r-- third_party/rust/encoding_c/build.rs 7
-rw-r--r-- third_party/rust/encoding_c/include/encoding_rs.h 692
-rw-r--r-- third_party/rust/encoding_c/include/encoding_rs_cpp.h 1351
-rw-r--r-- third_party/rust/encoding_c/include/encoding_rs_statics.h 171
-rw-r--r-- third_party/rust/encoding_c/src/lib.rs 1194
-rw-r--r-- third_party/rust/encoding_c_mem/.cargo-checksum.json 1
-rw-r--r-- third_party/rust/encoding_c_mem/CONTRIBUTING.md 33
-rw-r--r-- third_party/rust/encoding_c_mem/COPYRIGHT 9
-rw-r--r-- third_party/rust/encoding_c_mem/Cargo.toml 27
-rw-r--r-- third_party/rust/encoding_c_mem/LICENSE-APACHE 202
-rw-r--r-- third_party/rust/encoding_c_mem/LICENSE-MIT 25
-rw-r--r-- third_party/rust/encoding_c_mem/README.md 60
-rw-r--r-- third_party/rust/encoding_c_mem/build.rs 7
-rw-r--r-- third_party/rust/encoding_c_mem/include/encoding_rs_mem.h 704
-rw-r--r-- third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h 578
-rw-r--r-- third_party/rust/encoding_c_mem/src/lib.rs 825
24 files changed, 6391 insertions, 0 deletions
diff --git a/third_party/rust/encoding_c/.cargo-checksum.json b/third_party/rust/encoding_c/.cargo-checksum.json
new file mode 100644
index 0000000000..eabf0ea825
--- /dev/null
+++ b/third_party/rust/encoding_c/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"CONTRIBUTING.md":"8cd9262df951c4b42078aa55064ca3b8ef2676c06b8fc7c281c02ee3f1ae04a8","COPYRIGHT":"6c7cd6277ece1edbc9f653eb1812bb98dc7ada4137525f0612938490f7819d9a","Cargo.toml":"c78bebead132f39eb39f477f28a226c873320d681247f05b4e9745f67e5468c4","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"8e9d1ef3b3f19831c622ab7dd455dd405c3c1a25b459e556d55ec198bdd59a68","build-disabled.rs":"2cc34f4c96a235c1ec256a5b6981b08b45d420463a6c997d6ce819462626b74b","build.rs":"013c85c18b035473d3a0900b833906304a8431882e5c22053684a69588adde98","include/encoding_rs.h":"328efc1a6ee9f0fb81b1db5286f24c0cdbcabcaa123d8b209c0000ba2d618c7f","include/encoding_rs_cpp.h":"d4dcae03cc5d8127b5e944f80691cb95990a116bca9a5044ecdfd30ed569c659","include/encoding_rs_statics.h":"96a2595ad7e209a5f393e61d46899ec484329693ac164455074e041482625c9d","src/lib.rs":"98bed946e18cdb1993d46aeb644435ec0d850738fbceaba84742a83934d6c454"},"package":"9af727805f3b0d79956bde5b35732669fb5c5d45a94893798e7b7e70cfbf9cc1"} \ No newline at end of file
diff --git a/third_party/rust/encoding_c/CONTRIBUTING.md b/third_party/rust/encoding_c/CONTRIBUTING.md
new file mode 100644
index 0000000000..62597bf029
--- /dev/null
+++ b/third_party/rust/encoding_c/CONTRIBUTING.md
@@ -0,0 +1,38 @@
+If you send a pull request / patch, please observe the following.
+
+## Licensing
+
+Since this crate is dual-licensed,
+[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
+is considered to apply in the sense of Contributions being automatically
+under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
+That is, by the act of offering a Contribution, you place your Contribution
+under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
+file. Please do not contribute if you aren't willing or allowed to license your
+contributions in this manner.
+
+You are encouraged to dedicate test code that you contribute to the Public
+Domain using the CC0 dedication. If you contribute test code that is not
+dedicated to the Public Domain, please be sure not to put it in a part of
+source code that the comments designate as being dedicated to the Public
+Domain.
+
+## Copyright Notices
+
+If you require the addition of your copyright notice, it's up to you to edit in
+your notice as part of your Contribution. Not adding a copyright notice is
+taken as a waiver of copyright notice.
+
+## Compatibility with Stable Rust
+
+Please ensure that your Contribution compiles with the latest stable-channel
+rustc.
+
+## rustfmt
+
+Please install [`rustfmt`](https://github.com/rust-lang-nursery/rustfmt) 0.4.1
+(the latest version has
+[a bug](https://github.com/rust-lang-nursery/rustfmt/issues/1149) that renders
+it unsuited for encoding_rs) and run `cargo fmt` before creating a pull
+request. (It's OK for `cargo fmt` to exit with an error due to too long lines.)
+
diff --git a/third_party/rust/encoding_c/COPYRIGHT b/third_party/rust/encoding_c/COPYRIGHT
new file mode 100644
index 0000000000..2cb666fddd
--- /dev/null
+++ b/third_party/rust/encoding_c/COPYRIGHT
@@ -0,0 +1,9 @@
+encoding_c is copyright Mozilla Foundation.
+
+Licensed under the Apache License, Version 2.0
+<LICENSE-APACHE or
+https://www.apache.org/licenses/LICENSE-2.0> or the MIT
+license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
+at your option. All files in the project carrying such
+notice may not be copied, modified, or distributed except
+according to those terms.
diff --git a/third_party/rust/encoding_c/Cargo.toml b/third_party/rust/encoding_c/Cargo.toml
new file mode 100644
index 0000000000..ee7ecb6acb
--- /dev/null
+++ b/third_party/rust/encoding_c/Cargo.toml
@@ -0,0 +1,39 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "encoding_c"
+version = "0.9.8"
+authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
+build = "build.rs"
+links = "encoding_c"
+description = "C API for encoding_rs"
+homepage = "https://docs.rs/encoding_c/"
+documentation = "https://docs.rs/encoding_c/"
+readme = "README.md"
+keywords = ["ffi", "capi", "encoding", "unicode", "charset"]
+license = "Apache-2.0 OR MIT"
+repository = "https://github.com/hsivonen/encoding_c"
+[dependencies.encoding_rs]
+version = "0.8.20"
+
+[features]
+fast-big5-hanzi-encode = ["encoding_rs/fast-big5-hanzi-encode"]
+fast-gb-hanzi-encode = ["encoding_rs/fast-gb-hanzi-encode"]
+fast-hangul-encode = ["encoding_rs/fast-hangul-encode"]
+fast-hanja-encode = ["encoding_rs/fast-hanja-encode"]
+fast-kanji-encode = ["encoding_rs/fast-kanji-encode"]
+fast-legacy-encode = ["encoding_rs/fast-legacy-encode"]
+less-slow-big5-hanzi-encode = ["encoding_rs/less-slow-big5-hanzi-encode"]
+less-slow-gb-hanzi-encode = ["encoding_rs/less-slow-gb-hanzi-encode"]
+less-slow-kanji-encode = ["encoding_rs/less-slow-kanji-encode"]
+simd-accel = ["encoding_rs/simd-accel"]
diff --git a/third_party/rust/encoding_c/LICENSE-APACHE b/third_party/rust/encoding_c/LICENSE-APACHE
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/third_party/rust/encoding_c/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/third_party/rust/encoding_c/LICENSE-MIT b/third_party/rust/encoding_c/LICENSE-MIT
new file mode 100644
index 0000000000..3317c82e2f
--- /dev/null
+++ b/third_party/rust/encoding_c/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright Mozilla Foundation
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/third_party/rust/encoding_c/README.md b/third_party/rust/encoding_c/README.md
new file mode 100644
index 0000000000..d4c80865ff
--- /dev/null
+++ b/third_party/rust/encoding_c/README.md
@@ -0,0 +1,131 @@
+# encoding_c
+
+[![crates.io](https://meritbadge.herokuapp.com/encoding_c)](https://crates.io/crates/encoding_c)
+[![docs.rs](https://docs.rs/encoding_c/badge.svg)](https://docs.rs/encoding_c/)
+[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT)
+
+encoding_c is an FFI wrapper for [encoding_rs](https://github.com/hsivonen/encoding_rs).
+
+## Bindings for `encoding_rs::mem`
+
+See the [`encoding_c_mem` crate](https://crates.io/crates/encoding_c_mem)
+for bindings for `encoding_rs::mem`.
+
+## Licensing
+
+Please see the file named
+[COPYRIGHT](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT).
+
+## No Unwinding Support!
+
+This crate is meant for use in binaries compiled with `panic = 'abort'`, which
+is _required_ for correctness! Unwinding across FFI is Undefined Behavior, and
+this crate does nothing to try to prevent unwinding across the FFI if
+compiled with unwinding enabled.
+
+## C/C++ Headers
+
+`include/encoding_rs.h` and `include/encoding_rs_statics.h` are needed for C
+usage.
+
+`include/encoding_rs_cpp.h` is a sample C++ API built on top of the C API using
+GSL and the C++ standard library. Since C++ projects typically roll their own
+string classes, etc., it's probably necessary for C++ projects to manually
+adapt the header to their replacements of standard-library types.
+
+There's a [write-up](https://hsivonen.fi/modern-cpp-in-rust/) about the C++
+wrappers.
+
+## Release Notes
+
+### 0.9.8
+
+* Remove year from copyright notices.
+
+### 0.9.7
+
+* Specify a `links` value in the Cargo manifest.
+* Emit an `include_dir` variable from build script so that other build scripts
+ depending on this crate can rely on it.
+
+### 0.9.6
+
+* Fix a bug in the C++ header.
+
+### 0.9.5
+
+* Adjust documentation for encoding_rs 0.8.20.
+
+### 0.9.4
+
+* Fix bogus C header.
+
+### 0.9.3
+
+* Fix bogus C++ header.
+
+### 0.9.2
+
+* Wrap `Decoder::latin1_byte_compatible_up_to`.
+
+### 0.9.1
+
+* Wrap `Encoding::is_single_byte()`.
+* Pass through new feature flags introduced in encoding_rs 0.8.11.
+
+### 0.9.0
+
+* Update to encoding_rs 0.8.0.
+
+### 0.8.0
+
+* Update to encoding_rs 0.7.0.
+* Drop `encoding_for_name()`.
+* Deal correctly with the `data()` method of `gsl::span` returning `nullptr`.
+
+### 0.7.6
+
+* Rename `ENCODING_RS_NON_NULL_CONST_ENCODING_PTR` to
+ `ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR`. (Not a breaking change,
+ because defining that macro broke the build previously, so the
+ macro couldn't have been used.)
+* Use the macro only for statics and not for return values.
+
+### 0.7.5
+
+* Annotate the encoding pointers that should be wrapped with a
+ same-representation not-null type in C++ as
+ `ENCODING_RS_NON_NULL_CONST_ENCODING_PTR`.
+
+### 0.7.4
+
+* Wrap `has_pending_state()`.
+
+### 0.7.3
+
+* Use C preprocessor definitions for encoding constant declarations.
+
+### 0.7.2
+
+* Parametrize the struct type names behind C preprocessor definitions.
+* Leave it to the user to provide `char16_t`. Avoid including a header for it.
+
+### 0.7.1
+
+* Fix documentation for pointers that get used in
+ `std::slice::from_raw_parts()`.
+
+### 0.7.0
+
+* Map `None` to `SIZE_MAX` in the max length calculation functions.
+
+### 0.6.0
+
+* Check in the `cheddar`-generated header and comment out the `cheddar`-using
+ `build.rs`.
+
+### 0.5.0
+
+* Initial release of encoding_c. (I.e. first release with FFI in a distinct
+ crate.)
+
diff --git a/third_party/rust/encoding_c/build-disabled.rs b/third_party/rust/encoding_c/build-disabled.rs
new file mode 100644
index 0000000000..ebd7493626
--- /dev/null
+++ b/third_party/rust/encoding_c/build-disabled.rs
@@ -0,0 +1,60 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate cheddar;
+
+use std::io::prelude::*;
+use std::fs::File;
+
+fn replace(path: &str) -> std::io::Result<()> {
+ let mut f = try!(File::open(path));
+ let mut s = String::new();
+ try!(f.read_to_string(&mut s));
+ s = s.replace("#ifndef cheddar_generated_encoding_rs_h", "// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using encoding_c/build.rs.
+
+#ifndef cheddar_generated_encoding_rs_h");
+ s = s.replace("uint16_t", "char16_t");
+ s = s.replace("uintptr_t", "size_t");
+ s = s.replace("Encoding", "ENCODING_RS_ENCODING");
+ s = s.replace("Encoder", "ENCODING_RS_ENCODER");
+ s = s.replace("Decoder", "ENCODING_RS_DECODER");
+ s = s.replace("ENCODING_RS_ENCODING.html", "Encoding.html");
+ s = s.replace("ENCODING_RS_ENCODER.html", "Encoder.html");
+ s = s.replace("ENCODING_RS_DECODER.html", "Decoder.html");
+ s = s.replace("#include <stdbool.h>",
+ "#include <stdbool.h>\n#include \"encoding_rs_statics.h\"");
+ let mut f = try!(File::create(path));
+ try!(f.write_all(s.as_bytes()));
+ Ok(())
+}
+
+fn main() {
+ println!("cargo:rerun-if-changed=src/lib.rs");
+
+ let path = "include/encoding_rs.h";
+
+ cheddar::Cheddar::new()
+ .expect("could not read manifest")
+ .run_build(path);
+
+ match replace(path) {
+ Ok(_) => {}
+ Err(e) => println!("Performing replacements failed {}.", e),
+ }
+}
diff --git a/third_party/rust/encoding_c/build.rs b/third_party/rust/encoding_c/build.rs
new file mode 100644
index 0000000000..962b7ae12b
--- /dev/null
+++ b/third_party/rust/encoding_c/build.rs
@@ -0,0 +1,7 @@
+fn main() {
+ println!("cargo:rerun-if-changed=");
+
+ let cargo_manifest_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap();
+ let include_dir = std::path::PathBuf::from(cargo_manifest_dir).join("include");
+ println!("cargo:include-dir={}", include_dir.display());
+}
diff --git a/third_party/rust/encoding_c/include/encoding_rs.h b/third_party/rust/encoding_c/include/encoding_rs.h
new file mode 100644
index 0000000000..39231b7a0f
--- /dev/null
+++ b/third_party/rust/encoding_c/include/encoding_rs.h
@@ -0,0 +1,692 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using encoding_c/build.rs.
+
+#ifndef cheddar_generated_encoding_rs_h
+#define cheddar_generated_encoding_rs_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "encoding_rs_statics.h"
+
+/// Implements the
+/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
+/// algorithm.
+///
+/// If, after ASCII-lowercasing and removing leading and trailing
+/// whitespace, the argument matches a label defined in the Encoding
+/// Standard, `const ENCODING_RS_ENCODING*` representing the corresponding
+/// encoding is returned. If there is no match, `NULL` is returned.
+///
+/// This is the right function to use if the action upon the method returning
+/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) instead.
+/// When the action upon the method returning `NULL` is not to proceed with
+/// a fallback but to refuse processing, `encoding_for_label_no_replacement()`
+/// is more appropriate.
+///
+/// The argument buffer can be in any ASCII-compatible encoding. It is not
+/// required to be UTF-8.
+///
+/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
+/// is zero, it is OK for `label` to be something non-dereferencable,
+/// such as `0x1`. This is required due to Rust's optimization for slices
+/// within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `label` and `label_len` don't designate a valid memory block
+/// or if `label` is `NULL`.
+ENCODING_RS_ENCODING const* encoding_for_label(uint8_t const* label,
+ size_t label_len);
+
+/// This function behaves the same as `encoding_for_label()`, except when
+/// `encoding_for_label()` would return `REPLACEMENT_ENCODING`, this method
+/// returns `NULL` instead.
+///
+/// This method is useful in scenarios where a fatal error is required
+/// upon invalid label, because in those cases the caller typically wishes
+/// to treat the labels that map to the replacement encoding as fatal
+/// errors, too.
+///
+/// It is not OK to use this function when the action upon the method returning
+/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
+/// such a case, the `encoding_for_label()` function should be used instead
+/// in order to avoid unsafe fallback for labels that `encoding_for_label()`
+/// maps to `REPLACEMENT_ENCODING`.
+///
+/// The argument buffer can be in any ASCII-compatible encoding. It is not
+/// required to be UTF-8.
+///
+/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
+/// is zero, it is OK for `label` to be something non-dereferencable,
+/// such as `0x1`. This is required due to Rust's optimization for slices
+/// within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `label` and `label_len` don't designate a valid memory block
+/// or if `label` is `NULL`.
+ENCODING_RS_ENCODING const* encoding_for_label_no_replacement(
+ uint8_t const* label, size_t label_len);
+
+/// Performs non-incremental BOM sniffing.
+///
+/// The argument must either be a buffer representing the entire input
+/// stream (non-streaming case) or a buffer representing at least the first
+/// three bytes of the input stream (streaming case).
+///
+/// Returns `UTF_8_ENCODING`, `UTF_16LE_ENCODING` or `UTF_16BE_ENCODING` if the
+/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `NULL`
+/// otherwise. Upon return, `*buffer_len` is the length of the BOM (zero if
+/// there is no BOM).
+///
+/// `buffer` must be non-`NULL` even if `*buffer_len` is zero. When
+/// `*buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `*buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+ENCODING_RS_ENCODING const* encoding_for_bom(uint8_t const* buffer,
+ size_t* buffer_len);
+
+/// Writes the name of the given `ENCODING_RS_ENCODING` to a caller-supplied
+/// buffer as ASCII and returns the number of bytes / ASCII characters written.
+///
+/// The output is not null-terminated.
+///
+/// The caller _MUST_ ensure that `name_out` points to a buffer whose length
+/// is at least `ENCODING_NAME_MAX_LENGTH` bytes.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL` or if `name_out` doesn't point to
+/// a valid block of memory whose length is at least
+/// `ENCODING_NAME_MAX_LENGTH` bytes.
+size_t encoding_name(ENCODING_RS_ENCODING const* encoding, uint8_t* name_out);
+
+/// Checks whether the _output encoding_ of this encoding can encode every
+/// Unicode scalar. (Only true if the output encoding is UTF-8.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+bool encoding_can_encode_everything(ENCODING_RS_ENCODING const* encoding);
+
+/// Checks whether the bytes 0x00...0x7F map exclusively to the characters
+/// U+0000...U+007F and vice versa.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+bool encoding_is_ascii_compatible(ENCODING_RS_ENCODING const* encoding);
+
+/// Checks whether this encoding maps one byte to one Basic Multilingual
+/// Plane code point (i.e. byte length equals decoded UTF-16 length) and
+/// vice versa (for mappable characters).
+///
+/// `true` iff this encoding is on the list of [Legacy single-byte
+/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
+/// in the spec or x-user-defined.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+bool encoding_is_single_byte(ENCODING_RS_ENCODING const* encoding);
+
+/// Returns the _output encoding_ of this encoding. This is UTF-8 for
+/// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_ENCODING const* encoding_output_encoding(
+ ENCODING_RS_ENCODING const* encoding);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// on the heap with BOM sniffing enabled and returns a pointer to the
+/// newly-allocated `ENCODING_RS_DECODER`.
+///
+/// BOM sniffing may cause the returned decoder to morph into a decoder
+/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
+///
+/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller
+/// _MUST_ deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_DECODER* encoding_new_decoder(ENCODING_RS_ENCODING const* encoding);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// on the heap with BOM removal and returns a pointer to the newly-allocated
+/// `ENCODING_RS_DECODER`.
+///
+/// If the input starts with bytes that are the BOM for this encoding,
+/// those bytes are removed. However, the decoder never morphs into a
+/// decoder for another encoding: A BOM for another encoding is treated as
+/// (potentially malformed) input to the decoding algorithm for this
+/// encoding.
+///
+/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller
+/// _MUST_ deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_DECODER* encoding_new_decoder_with_bom_removal(
+ ENCODING_RS_ENCODING const* encoding);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// on the heap with BOM handling disabled and returns a pointer to the
+/// newly-allocated `ENCODING_RS_DECODER`.
+///
+/// If the input starts with bytes that look like a BOM, those bytes are
+/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
+/// for another encoding.)
+///
+/// _Note:_ If the caller has performed BOM sniffing on its own but has not
+/// removed the BOM, the caller should use
+/// `encoding_new_decoder_with_bom_removal()` instead of this function to cause
+/// the BOM to be removed.
+///
+/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller
+/// _MUST_ deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_DECODER* encoding_new_decoder_without_bom_handling(
+ ENCODING_RS_ENCODING const* encoding);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// into memory provided by the caller with BOM sniffing enabled. (In practice,
+/// the target should likely be a pointer previously returned by
+/// `encoding_new_decoder()`.)
+///
+/// Note: If the caller has already performed BOM sniffing but has
+/// not removed the BOM, the caller should still use this function in
+/// order to cause the BOM to be ignored.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+void encoding_new_decoder_into(ENCODING_RS_ENCODING const* encoding,
+ ENCODING_RS_DECODER* decoder);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// into memory provided by the caller with BOM removal.
+///
+/// If the input starts with bytes that are the BOM for this encoding,
+/// those bytes are removed. However, the decoder never morphs into a
+/// decoder for another encoding: A BOM for another encoding is treated as
+/// (potentially malformed) input to the decoding algorithm for this
+/// encoding.
+///
+/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller
+/// _MUST_ deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+void encoding_new_decoder_with_bom_removal_into(
+ ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder);
+
+/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING`
+/// into memory provided by the caller with BOM handling disabled.
+///
+/// If the input starts with bytes that look like a BOM, those bytes are
+/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
+/// for another encoding.)
+///
+/// _Note:_ If the caller has performed BOM sniffing on its own but has not
+/// removed the BOM, the caller should use
+/// `encoding_new_decoder_with_bom_removal_into()` instead of this function to
+/// cause the BOM to be removed.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+void encoding_new_decoder_without_bom_handling_into(
+ ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder);
+
+/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING`
+/// on the heap and returns a pointer to the newly-allocated
+/// `ENCODING_RS_ENCODER`. (Exception: if the `ENCODING_RS_ENCODING` is
+/// `replacement`, a new `ENCODING_RS_ENCODER` for UTF-8 is instantiated (and
+/// that `ENCODING_RS_ENCODER` reports `UTF_8` as its `ENCODING_RS_ENCODING`).)
+///
+/// Once the allocated `ENCODING_RS_ENCODER` is no longer needed, the caller
+/// _MUST_ deallocate it by passing the pointer returned by this function to
+/// `encoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_ENCODER* encoding_new_encoder(ENCODING_RS_ENCODING const* encoding);
+
+/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING`
+/// into memory provided by the caller. (In practice, the target should likely
+/// be a pointer previously returned by `encoding_new_encoder()`.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+void encoding_new_encoder_into(ENCODING_RS_ENCODING const* encoding,
+ ENCODING_RS_ENCODER* encoder);
+
+/// Validates UTF-8.
+///
+/// Returns the index of the first byte that makes the input malformed as
+/// UTF-8 or `buffer_len` if `buffer` is entirely valid.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+
+/// Validates ASCII.
+///
+/// Returns the index of the first byte that makes the input malformed as
+/// ASCII or `buffer_len` if `buffer` is entirely valid.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+
+/// Validates ISO-2022-JP ASCII-state data.
+///
+/// Returns the index of the first byte that makes the input not representable
+/// in the ASCII state of ISO-2022-JP or `buffer_len` if `buffer` is entirely
+/// representable in the ASCII state of ISO-2022-JP.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+size_t encoding_iso_2022_jp_ascii_valid_up_to(uint8_t const* buffer,
+ size_t buffer_len);
+
+/// Deallocates an `ENCODING_RS_DECODER` previously allocated by
+/// `encoding_new_decoder()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+void decoder_free(ENCODING_RS_DECODER* decoder);
+
+/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_DECODER` is for.
+///
+/// BOM sniffing can change the return value of this method during the life
+/// of the decoder.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_ENCODING const* decoder_encoding(
+ ENCODING_RS_DECODER const* decoder);
+
+/// Query the worst-case UTF-8 output size _with replacement_.
+///
+/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes when decoding with
+/// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
+/// sequence or `SIZE_MAX` if `size_t` would overflow.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+size_t decoder_max_utf8_buffer_length(ENCODING_RS_DECODER const* decoder,
+ size_t byte_length);
+
+/// Query the worst-case UTF-8 output size _without replacement_.
+///
+/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes when decoding without
+/// replacement error handling or `SIZE_MAX` if `size_t` would overflow.
+///
+/// Note that this value may be too small for the `_with_replacement` case.
+/// Use `decoder_max_utf8_buffer_length()` for that case.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+size_t decoder_max_utf8_buffer_length_without_replacement(
+ ENCODING_RS_DECODER const* decoder, size_t byte_length);
+
+/// Incrementally decode a byte stream into UTF-8 with malformed sequences
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+uint32_t decoder_decode_to_utf8(ENCODING_RS_DECODER* decoder,
+ uint8_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last,
+ bool* had_replacements);
+
+/// Incrementally decode a byte stream into UTF-8 _without replacement_.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+uint32_t decoder_decode_to_utf8_without_replacement(
+ ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last);
+
+/// Query the worst-case UTF-16 output size (with or without replacement).
+///
+/// Returns the size of the output buffer in UTF-16 code units (`char16_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes or `SIZE_MAX` if `size_t`
+/// would overflow.
+///
+/// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
+/// return value of this method applies also in the
+/// `_without_replacement` case.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+size_t decoder_max_utf16_buffer_length(ENCODING_RS_DECODER const* decoder,
+ size_t u16_length);
+
+/// Incrementally decode a byte stream into UTF-16 with malformed sequences
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+uint32_t decoder_decode_to_utf16(ENCODING_RS_DECODER* decoder,
+ uint8_t const* src, size_t* src_len,
+ char16_t* dst, size_t* dst_len, bool last,
+ bool* had_replacements);
+
+/// Incrementally decode a byte stream into UTF-16 _without replacement_.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+uint32_t decoder_decode_to_utf16_without_replacement(
+ ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len,
+ char16_t* dst, size_t* dst_len, bool last);
+
+/// Checks for compatibility with storing Unicode scalar values as unsigned
+/// bytes taking into account the state of the decoder.
+///
+/// Returns `SIZE_MAX` if the decoder is not in a neutral state, including waiting
+/// for the BOM, or if the encoding is never Latin1-byte-compatible.
+///
+/// Otherwise returns the index of the first byte whose unsigned value doesn't
+/// directly correspond to the decoded Unicode scalar value, or the length
+/// of the input if all bytes in the input decode directly to scalar values
+/// corresponding to the unsigned byte values.
+///
+/// Does not change the state of the decoder.
+///
+/// Do not use this unless you are supporting SpiderMonkey/V8-style string
+/// storage optimizations.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+size_t decoder_latin1_byte_compatible_up_to(ENCODING_RS_DECODER const* decoder,
+ uint8_t const* buffer,
+ size_t buffer_len);
+
+/// Deallocates an `ENCODING_RS_ENCODER` previously allocated by
+/// `encoding_new_encoder()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+void encoder_free(ENCODING_RS_ENCODER* encoder);
+
+/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_ENCODER` is for.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+ENCODING_RS_ENCODING const* encoder_encoding(
+ ENCODING_RS_ENCODER const* encoder);
+
+/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
+/// ASCII state and `false` otherwise.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+bool encoder_has_pending_state(ENCODING_RS_ENCODER const* encoder);
+
+/// Query the worst-case output size when encoding from UTF-8 with
+/// replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `byte_length` number of
+/// additional input code units if there are no unmappable characters in
+/// the input or `SIZE_MAX` if `size_t` would overflow.
+size_t encoder_max_buffer_length_from_utf8_if_no_unmappables(
+ ENCODING_RS_ENCODER const* encoder, size_t byte_length);
+
+/// Query the worst-case output size when encoding from UTF-8 without
+/// replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `byte_length` number of
+/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
+size_t encoder_max_buffer_length_from_utf8_without_replacement(
+ ENCODING_RS_ENCODER const* encoder, size_t byte_length);
+
+/// Incrementally encode into byte stream from UTF-8 with unmappable
+/// characters replaced with HTML (decimal) numeric character references.
+///
+/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
+/// If in doubt, check the validity of input before using!
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `encoder_encode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+uint32_t encoder_encode_from_utf8(ENCODING_RS_ENCODER* encoder,
+ uint8_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last,
+ bool* had_replacements);
+
+/// Incrementally encode into byte stream from UTF-8 _without replacement_.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `encoder_encode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
+///
+/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
+/// If in doubt, check the validity of input before using!
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+uint32_t encoder_encode_from_utf8_without_replacement(
+ ENCODING_RS_ENCODER* encoder, uint8_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last);
+
+/// Query the worst-case output size when encoding from UTF-16 with
+/// replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `u16_length` number of
+/// additional input code units if there are no unmappable characters in
+/// the input or `SIZE_MAX` if `size_t` would overflow.
+size_t encoder_max_buffer_length_from_utf16_if_no_unmappables(
+ ENCODING_RS_ENCODER const* encoder, size_t u16_length);
+
+/// Query the worst-case output size when encoding from UTF-16 without
+/// replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `u16_length` number of
+/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
+size_t encoder_max_buffer_length_from_utf16_without_replacement(
+ ENCODING_RS_ENCODER const* encoder, size_t u16_length);
+
+/// Incrementally encode into byte stream from UTF-16 with unmappable
+/// characters replaced with HTML (decimal) numeric character references.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `encoder_encode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+uint32_t encoder_encode_from_utf16(ENCODING_RS_ENCODER* encoder,
+ char16_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last,
+ bool* had_replacements);
+
+/// Incrementally encode into byte stream from UTF-16 _without replacement_.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `encoder_encode_*` functions are mapped from Rust and the documentation
+/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+uint32_t encoder_encode_from_utf16_without_replacement(
+ ENCODING_RS_ENCODER* encoder, char16_t const* src, size_t* src_len,
+ uint8_t* dst, size_t* dst_len, bool last);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/third_party/rust/encoding_c/include/encoding_rs_cpp.h b/third_party/rust/encoding_c/include/encoding_rs_cpp.h
new file mode 100644
index 0000000000..4ec5181ee9
--- /dev/null
+++ b/third_party/rust/encoding_c/include/encoding_rs_cpp.h
@@ -0,0 +1,1351 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#pragma once
+
+#ifndef encoding_rs_cpp_h_
+#define encoding_rs_cpp_h_
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+#include "gsl/gsl"
+
+namespace encoding_rs {  // Forward declarations referenced by the macros below.
+class Encoding;
+class Decoder;
+class Encoder;
+}; // namespace encoding_rs
+
+#define ENCODING_RS_ENCODING encoding_rs::Encoding
+#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
+ gsl::not_null<const encoding_rs::Encoding*>
+#define ENCODING_RS_ENCODER encoding_rs::Encoder
+#define ENCODING_RS_DECODER encoding_rs::Decoder
+
+#include "encoding_rs.h"  // Its declarations use the ENCODING_RS_* names above.
+
+namespace encoding_rs {
+
+/**
+ * A converter that decodes a byte stream into Unicode according to a
+ * character encoding in a streaming (incremental) manner.
+ *
+ * The various `decode_*` methods take an input buffer (`src`) and an output
+ * buffer `dst` both of which are caller-allocated. There are variants for
+ * both UTF-8 and UTF-16 output buffers.
+ *
+ * A `decode_*` method decodes bytes from `src` into Unicode characters stored
+ * into `dst` until one of the following three things happens:
+ *
+ * 1. A malformed byte sequence is encountered (`*_without_replacement`
+ * variants only).
+ *
+ * 2. The output buffer has been filled so near capacity that the decoder
+ * cannot be sure that processing an additional byte of input wouldn't
+ * cause so much output that the output buffer would overflow.
+ *
+ * 3. All the input bytes have been processed.
+ *
+ * The `decode_*` method then returns a tuple of a status indicating which one
+ * of the three reasons to return happened, how many input bytes were read,
+ * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
+ * when decoding to UTF-16) were written, and in the case of the
+ * variants performing replacement, a boolean indicating whether an error was
+ * replaced with the REPLACEMENT CHARACTER during the call.
+ *
+ * The number of bytes "written" is what's logically written. Garbage may be
+ * written in the output buffer beyond the point logically written to.
+ *
+ * In the case of the `*_without_replacement` variants, the status is a
+ * `uint32_t` whose possible values are packed info about a malformed byte
+ * sequence, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding to the three cases
+ * listed above).
+ *
+ * Packed info about malformed sequences has the following format:
+ * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
+ * indicate the number of bytes that were consumed after the malformed
+ * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
+ * the length of the malformed byte sequence (possible decimal values 1, 2,
+ * 3 or 4). The maximum possible sum of the two is 6.
+ *
+ * In the case of methods whose name does not end with
+ * `*_without_replacement`, malformed sequences are automatically replaced
+ * with the REPLACEMENT CHARACTER and errors do not cause the methods to
+ * return early.
+ *
+ * When decoding to UTF-8, the output buffer must have at least 4 bytes of
+ * space. When decoding to UTF-16, the output buffer must have at least two
+ * UTF-16 code units (`char16_t`) of space.
+ *
+ * When decoding to UTF-8 without replacement, the methods are guaranteed
+ * not to return indicating that more output space is needed if the length
+ * of the output buffer is at least the length returned by
+ * `max_utf8_buffer_length_without_replacement()`. When decoding to UTF-8
+ * with replacement, the length of the output buffer that guarantees the
+ * methods not to return indicating that more output space is needed is given
+ * by `max_utf8_buffer_length()`. When decoding to UTF-16 with
+ * or without replacement, the length of the output buffer that guarantees
+ * the methods not to return indicating that more output space is needed is
+ * given by `max_utf16_buffer_length()`.
+ *
+ * The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
+ * and the output after each `decode_*` call is guaranteed to consist of
+ * complete characters. (I.e. the code unit sequence for the last character is
+ * guaranteed not to be split across output buffers.)
+ *
+ * The boolean argument `last` indicates that the end of the stream is reached
+ * when all the bytes in `src` have been consumed.
+ *
+ * A `Decoder` object can be used to incrementally decode a byte stream.
+ *
+ * During the processing of a single stream, the caller must call `decode_*`
+ * zero or more times with `last` set to `false` and then call `decode_*` at
+ * least once with `last` set to `true`. If `decode_*` returns `INPUT_EMPTY`,
+ * the processing of the stream has ended. Otherwise, the caller must call
+ * `decode_*` again with `last` set to `true` (or treat a malformed result,
+ * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
+ *
+ * Once the stream has ended, the `Decoder` object must not be used anymore.
+ * That is, you need to create another one to process another stream.
+ *
+ * When the decoder returns `OUTPUT_FULL` or the decoder returns a malformed
+ * result and the caller does not wish to treat it as a fatal error, the input
+ * buffer `src` may not have been completely consumed. In that case, the caller
+ * must pass the unconsumed contents of `src` to `decode_*` again upon the next
+ * call.
+ *
+ * # Infinite loops
+ *
+ * When converting with a fixed-size output buffer whose size is too small to
+ * accommodate one character of output, an infinite loop ensues. When
+ * converting with a fixed-size output buffer, it generally makes sense to
+ * make the buffer fairly large (e.g. couple of kilobytes).
+ */
+class Decoder final {
+ public:
+ ~Decoder() {}  // Empty; operator delete below hands the object to decoder_free.
+ static inline void operator delete(void* decoder) {
+ decoder_free(reinterpret_cast<Decoder*>(decoder));
+ }
+
+ /**
+ * The `Encoding` this `Decoder` is for.
+ *
+ * BOM sniffing can change the return value of this method during the life
+ * of the decoder.
+ */
+ inline gsl::not_null<const Encoding*> encoding() const {
+ return gsl::not_null<const Encoding*>(decoder_encoding(this));
+ }
+
+ /**
+ * Query the worst-case UTF-8 output size _with replacement_.
+ *
+ * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+ * that will not overflow given the current state of the decoder and
+ * `byte_length` number of additional input bytes when decoding with
+ * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
+ * sequence or `std::optional` without value if `size_t` would overflow.
+ */
+ inline std::optional<size_t> max_utf8_buffer_length(
+ size_t byte_length) const {
+ size_t val = decoder_max_utf8_buffer_length(this, byte_length);
+ if (val == SIZE_MAX) {  // SIZE_MAX from the FFI call maps to "no value".
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Query the worst-case UTF-8 output size _without replacement_.
+ *
+ * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+ * that will not overflow given the current state of the decoder and
+ * `byte_length` number of additional input bytes when decoding without
+ * replacement error handling or `std::optional` without value if `size_t`
+ * would overflow.
+ *
+ * Note that this value may be too small for the with-replacement case.
+ * Use `max_utf8_buffer_length()` for that case.
+ */
+ inline std::optional<size_t> max_utf8_buffer_length_without_replacement(
+ size_t byte_length) const {
+ size_t val =
+ decoder_max_utf8_buffer_length_without_replacement(this, byte_length);
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Incrementally decode a byte stream into UTF-8 with malformed sequences
+ * replaced with the REPLACEMENT CHARACTER.
+ *
+ * See the documentation of the class for documentation for `decode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf8(
+ gsl::span<const uint8_t> src, gsl::span<uint8_t> dst, bool last) {
+ // The lengths are passed by pointer: they start out as the span sizes and
+ // are returned as the read/written counts. All `decode_*` methods do this.
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ bool had_replacements;  // Out-parameter of the call below.
+ uint32_t result =
+ decoder_decode_to_utf8(this, null_to_bogus<const uint8_t>(src.data()),
+ &src_read, null_to_bogus<uint8_t>(dst.data()),
+ &dst_written, last, &had_replacements);
+ return {result, src_read, dst_written, had_replacements};
+ }
+
+ /**
+ * Incrementally decode a byte stream into UTF-8 _without replacement_.
+ *
+ * See the documentation of the class for documentation for `decode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t>
+ decode_to_utf8_without_replacement(gsl::span<const uint8_t> src,
+ gsl::span<uint8_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ uint32_t result = decoder_decode_to_utf8_without_replacement(
+ this, null_to_bogus<const uint8_t>(src.data()), &src_read,
+ null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
+ return {result, src_read, dst_written};
+ }
+
+ /**
+ * Query the worst-case UTF-16 output size (with or without replacement).
+ *
+ * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
+ * that will not overflow given the current state of the decoder and
+ * `byte_length` number of additional input bytes or `std::optional`
+ * without value if `size_t` would overflow.
+ *
+ * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
+ * return value of this method applies also in the
+ * `_without_replacement` case.
+ */
+ inline std::optional<size_t> max_utf16_buffer_length(
+ size_t byte_length) const {
+ size_t val = decoder_max_utf16_buffer_length(this, byte_length);
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Incrementally decode a byte stream into UTF-16 with malformed sequences
+ * replaced with the REPLACEMENT CHARACTER.
+ *
+ * See the documentation of the class for documentation for `decode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf16(
+ gsl::span<const uint8_t> src, gsl::span<char16_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ bool had_replacements;  // Out-parameter of the call below.
+ uint32_t result =
+ decoder_decode_to_utf16(this, null_to_bogus<const uint8_t>(src.data()),
+ &src_read, null_to_bogus<char16_t>(dst.data()),
+ &dst_written, last, &had_replacements);
+ return {result, src_read, dst_written, had_replacements};
+ }
+
+ /**
+ * Incrementally decode a byte stream into UTF-16 _without replacement_.
+ *
+ * See the documentation of the class for documentation for `decode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t>
+ decode_to_utf16_without_replacement(gsl::span<const uint8_t> src,
+ gsl::span<char16_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ uint32_t result = decoder_decode_to_utf16_without_replacement(
+ this, null_to_bogus<const uint8_t>(src.data()), &src_read,
+ null_to_bogus<char16_t>(dst.data()), &dst_written, last);
+ return {result, src_read, dst_written};
+ }
+
+ /**
+ * Checks for compatibility with storing Unicode scalar values as unsigned
+ * bytes taking into account the state of the decoder.
+ *
+ * Returns `std::nullopt` if the decoder is not in a neutral state, including
+ * waiting for the BOM, or if the encoding is never Latin1-byte-compatible.
+ *
+ * Otherwise returns the index of the first byte whose unsigned value doesn't
+ * directly correspond to the decoded Unicode scalar value, or the length
+ * of the input if all bytes in the input decode directly to scalar values
+ * corresponding to the unsigned byte values.
+ *
+ * Does not change the state of the decoder.
+ *
+ * Do not use this unless you are supporting SpiderMonkey/V8-style string
+ * storage optimizations.
+ */
+ inline std::optional<size_t> latin1_byte_compatible_up_to(
+ gsl::span<const uint8_t> buffer) const {
+ size_t val = decoder_latin1_byte_compatible_up_to(
+ this, null_to_bogus<const uint8_t>(buffer.data()),
+ static_cast<size_t>(buffer.size()));
+ if (val == SIZE_MAX) {  // SIZE_MAX from the FFI call maps to "no value".
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ private:
+ /**
+ * Replaces `nullptr` with `alignof(T)` cast to `T*`, a non-null bogus
+ * pointer suitable for use as part of a zero-length Rust slice.
+ */
+ template <class T>
+ static inline T* null_to_bogus(T* ptr) {
+ return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
+ }
+
+ Decoder() = delete;  // Not default-constructible or copyable from C++.
+ Decoder(const Decoder&) = delete;
+ Decoder& operator=(const Decoder&) = delete;
+};
+
+/**
+ * A converter that encodes a Unicode stream into bytes according to a
+ * character encoding in a streaming (incremental) manner.
+ *
+ * The various `encode_*` methods take an input buffer (`src`) and an output
+ * buffer `dst` both of which are caller-allocated. There are variants for
+ * both UTF-8 and UTF-16 input buffers.
+ *
+ * An `encode_*` method encodes characters from `src` into bytes stored
+ * into `dst` until one of the following three things happens:
+ *
+ * 1. An unmappable character is encountered (`*_without_replacement` variants
+ * only).
+ *
+ * 2. The output buffer has been filled so near capacity that the encoder
+ * cannot be sure that processing an additional character of input wouldn't
+ * cause so much output that the output buffer would overflow.
+ *
+ * 3. All the input characters have been processed.
+ *
+ * The `encode_*` method then returns a tuple of a status indicating which one
+ * of the three reasons to return happened, how many input code units (`uint8_t`
+ * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
+ * how many output bytes were written, and in the case of the variants that
+ * perform replacement, a boolean indicating whether an unmappable
+ * character was replaced with a numeric character reference during the call.
+ *
+ * The number of bytes "written" is what's logically written. Garbage may be
+ * written in the output buffer beyond the point logically written to.
+ *
+ * In the case of the methods whose name ends with
+ * `*_without_replacement`, the status is a `uint32_t` whose possible values
+ * are an unmappable code point, `OUTPUT_FULL` and `INPUT_EMPTY` (corresponding
+ * to the three cases listed above).
+ *
+ * In the case of methods whose name does not end with
+ * `*_without_replacement`, unmappable characters are automatically replaced
+ * with the corresponding numeric character references and unmappable
+ * characters do not cause the methods to return early.
+ *
+ * When encoding from UTF-8 without replacement, the methods are guaranteed
+ * not to return indicating that more output space is needed if the length
+ * of the output buffer is at least the length returned by
+ * `max_buffer_length_from_utf8_without_replacement()`. When encoding from
+ * UTF-8 with replacement, the length of the output buffer that guarantees the
+ * methods not to return indicating that more output space is needed in the
+ * absence of unmappable characters is given by
+ * `max_buffer_length_from_utf8_if_no_unmappables()`. When encoding from
+ * UTF-16 without replacement, the methods are guaranteed not to return
+ * indicating that more output space is needed if the length of the output
+ * buffer is at least the length returned by
+ * `max_buffer_length_from_utf16_without_replacement()`. When encoding
+ * from UTF-16 with replacement, the length of the output buffer that
+ * guarantees the methods not to return indicating that more output space is
+ * needed in the absence of unmappable characters is given by
+ * `max_buffer_length_from_utf16_if_no_unmappables()`.
+ * When encoding with replacement, applications are not expected to size the
+ * buffer for the worst case ahead of time but to resize the buffer if there
+ * are unmappable characters. This is why max length queries are only available
+ * for the case where there are no unmappable characters.
+ *
+ * When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. When
+ * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
+ * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
+ * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
+ * surrogate pairs are not split across input buffer boundaries.
+ *
+ * After an `encode_*` call returns, the output produced so far, taken as a
+ * whole from the start of the stream, is guaranteed to consist of a valid
+ * byte sequence in the target encoding. (I.e. the code unit sequence for a
+ * character is guaranteed not to be split across output buffers. However, due
+ * to the stateful nature of ISO-2022-JP, the stream needs to be considered
+ * from the start for it to be valid. For other encodings, the validity holds
+ * on a per-output buffer basis.)
+ *
+ * The boolean argument `last` indicates that the end of the stream is reached
+ * when all the characters in `src` have been consumed. This argument is needed
+ * for ISO-2022-JP and is ignored for other encodings.
+ *
+ * An `Encoder` object can be used to incrementally encode a Unicode stream.
+ *
+ * During the processing of a single stream, the caller must call `encode_*`
+ * zero or more times with `last` set to `false` and then call `encode_*` at
+ * least once with `last` set to `true`. If `encode_*` returns `INPUT_EMPTY`,
+ * the processing of the stream has ended. Otherwise, the caller must call
+ * `encode_*` again with `last` set to `true` (or treat an unmappable result,
+ * i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
+ *
+ * Once the stream has ended, the `Encoder` object must not be used anymore.
+ * That is, you need to create another one to process another stream.
+ *
+ * When the encoder returns `OUTPUT_FULL` or the encoder returns an unmappable
+ * result and the caller does not wish to treat it as a fatal error, the input
+ * buffer `src` may not have been completely consumed. In that case, the caller
+ * must pass the unconsumed contents of `src` to `encode_*` again upon the next
+ * call.
+ *
+ * # Infinite loops
+ *
+ * When converting with a fixed-size output buffer whose size is too small to
+ * accommodate one character of output, an infinite loop ensues. When
+ * converting with a fixed-size output buffer, it generally makes sense to
+ * make the buffer fairly large (e.g. couple of kilobytes).
+ */
+class Encoder final {
+ public:
+ ~Encoder() {}  // Empty; operator delete below hands the object to encoder_free.
+
+ static inline void operator delete(void* encoder) {
+ encoder_free(reinterpret_cast<Encoder*>(encoder));
+ }
+
+ /**
+ * The `Encoding` this `Encoder` is for.
+ */
+ inline gsl::not_null<const Encoding*> encoding() const {
+ return gsl::not_null<const Encoding*>(encoder_encoding(this));
+ }
+
+ /**
+ * Returns `true` if this is an ISO-2022-JP encoder that's not in the
+ * ASCII state and `false` otherwise.
+ */
+ inline bool has_pending_state() const {
+ return encoder_has_pending_state(this);
+ }
+
+ /**
+ * Query the worst-case output size when encoding from UTF-8 with
+ * replacement.
+ *
+ * Returns the size of the output buffer in bytes that will not overflow
+ * given the current state of the encoder and `byte_length` number of
+ * additional input code units if there are no unmappable characters in
+ * the input or `std::nullopt` if `size_t` would overflow.
+ */
+ inline std::optional<size_t> max_buffer_length_from_utf8_if_no_unmappables(
+ size_t byte_length) const {
+ size_t val = encoder_max_buffer_length_from_utf8_if_no_unmappables(
+ this, byte_length);
+ if (val == SIZE_MAX) {  // SIZE_MAX from the FFI call maps to "no value".
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Query the worst-case output size when encoding from UTF-8 without
+ * replacement.
+ *
+ * Returns the size of the output buffer in bytes that will not overflow
+ * given the current state of the encoder and `byte_length` number of
+ * additional input code units or `std::nullopt` if `size_t` would overflow.
+ */
+ inline std::optional<size_t> max_buffer_length_from_utf8_without_replacement(
+ size_t byte_length) const {
+ size_t val = encoder_max_buffer_length_from_utf8_without_replacement(
+ this, byte_length);
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Incrementally encode into byte stream from UTF-8 with unmappable
+ * characters replaced with HTML (decimal) numeric character references.
+ *
+ * See the documentation of the class for documentation for `encode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf8(
+ std::string_view src, gsl::span<uint8_t> dst, bool last) {
+ // The lengths are passed by pointer: they start out as the buffer sizes and
+ // are returned as the read/written counts. All `encode_*` methods do this.
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ bool had_replacements;  // Out-parameter of the call below.
+ uint32_t result = encoder_encode_from_utf8(
+ this,
+ null_to_bogus<const uint8_t>(
+ reinterpret_cast<const uint8_t*>(src.data())),
+ &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
+ &had_replacements);
+ return {result, src_read, dst_written, had_replacements};
+ }
+
+ /**
+ * Incrementally encode into byte stream from UTF-8 _without replacement_.
+ *
+ * See the documentation of the class for documentation for `encode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t>
+ encode_from_utf8_without_replacement(std::string_view src,
+ gsl::span<uint8_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ uint32_t result = encoder_encode_from_utf8_without_replacement(
+ this,
+ null_to_bogus<const uint8_t>(
+ reinterpret_cast<const uint8_t*>(src.data())),
+ &src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
+ return {result, src_read, dst_written};
+ }
+
+ /**
+ * Query the worst-case output size when encoding from UTF-16 with
+ * replacement.
+ *
+ * Returns the size of the output buffer in bytes that will not overflow
+ * given the current state of the encoder and `u16_length` number of
+ * additional input code units if there are no unmappable characters in
+ * the input or `std::nullopt` if `size_t` would overflow.
+ */
+ inline std::optional<size_t> max_buffer_length_from_utf16_if_no_unmappables(
+ size_t u16_length) const {
+ size_t val = encoder_max_buffer_length_from_utf16_if_no_unmappables(
+ this, u16_length);
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Query the worst-case output size when encoding from UTF-16 without
+ * replacement.
+ *
+ * Returns the size of the output buffer in bytes that will not overflow
+ * given the current state of the encoder and `u16_length` number of
+ * additional input code units or `std::nullopt` if `size_t` would overflow.
+ */
+ inline std::optional<size_t> max_buffer_length_from_utf16_without_replacement(
+ size_t u16_length) const {
+ size_t val = encoder_max_buffer_length_from_utf16_without_replacement(
+ this, u16_length);
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+ }
+
+ /**
+ * Incrementally encode into byte stream from UTF-16 with unmappable
+ * characters replaced with HTML (decimal) numeric character references.
+ *
+ * See the documentation of the class for documentation for `encode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf16(
+ std::u16string_view src, gsl::span<uint8_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ bool had_replacements;  // Out-parameter of the call below.
+ uint32_t result = encoder_encode_from_utf16(
+ this, null_to_bogus<const char16_t>(src.data()), &src_read,
+ null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
+ &had_replacements);
+ return {result, src_read, dst_written, had_replacements};
+ }
+
+ /**
+ * Incrementally encode into byte stream from UTF-16 _without replacement_.
+ *
+ * See the documentation of the class for documentation for `encode_*`
+ * methods collectively.
+ */
+ inline std::tuple<uint32_t, size_t, size_t>
+ encode_from_utf16_without_replacement(std::u16string_view src,
+ gsl::span<uint8_t> dst, bool last) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ uint32_t result = encoder_encode_from_utf16_without_replacement(
+ this, null_to_bogus<const char16_t>(src.data()), &src_read,
+ null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
+ return {result, src_read, dst_written};
+ }
+
+ private:
+ /**
+ * Replaces `nullptr` with `alignof(T)` cast to `T*`, a non-null bogus
+ * pointer suitable for use as part of a zero-length Rust slice.
+ */
+ template <class T>
+ static inline T* null_to_bogus(T* ptr) {
+ return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
+ }
+
+ Encoder() = delete;  // Not default-constructible or copyable from C++.
+ Encoder(const Encoder&) = delete;
+ Encoder& operator=(const Encoder&) = delete;
+};
+
+/**
+ * An encoding as defined in the Encoding Standard
+ * (https://encoding.spec.whatwg.org/).
+ *
+ * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
+ * sequence and, in most cases, vice versa. Each encoding has a name, an output
+ * encoding, and one or more labels.
+ *
+ * _Labels_ are ASCII-case-insensitive strings that are used to identify an
+ * encoding in formats and protocols. The _name_ of the encoding is the
+ * preferred label in the case appropriate for returning from the
+ * `characterSet` property of the `Document` DOM interface, except for
+ * the replacement encoding whose name is not one of its labels.
+ *
+ * The _output encoding_ is the encoding used for form submission and URL
+ * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
+ * UTF-16LE and UTF-16BE encodings and the encoding itself for other
+ * encodings.
+ *
+ * # Streaming vs. Non-Streaming
+ *
+ * When you have the entire input in a single buffer, you can use the
+ * methods `decode()`, `decode_with_bom_removal()`,
+ * `decode_without_bom_handling()`,
+ * `decode_without_bom_handling_and_without_replacement()` and
+ * `encode()`. Unlike the rest of the API, these methods perform heap
+ * allocations. You should use the `Decoder` and `Encoder` objects when your
+ * input is split into multiple buffers or when you want to control the
+ * allocation of the output buffers.
+ *
+ * # Instances
+ *
+ * All instances of `Encoding` are statically allocated and have the process's
+ * lifetime. There is precisely one unique `Encoding` instance for each
+ * encoding defined in the Encoding Standard.
+ *
+ * To obtain a reference to a particular encoding whose identity you know at
+ * compile time, use a `static` that refers to encoding. There is a `static`
+ * for each encoding. The `static`s are named in all caps with hyphens
+ * replaced with underscores and with `_ENCODING` appended to the
+ * name. For example, if you know at compile time that you will want to
+ * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
+ *
+ * If you don't know what encoding you need at compile time and need to
+ * dynamically get an encoding by label, use `Encoding::for_label()`.
+ *
+ * Instances of `Encoding` can be compared with `==`.
+ */
+class Encoding final {
+ public:
+ /**
+ * Implements the _get an encoding_ algorithm
+ * (https://encoding.spec.whatwg.org/#concept-encoding-get).
+ *
+ * If, after ASCII-lowercasing and removing leading and trailing
+ * whitespace, the argument matches a label defined in the Encoding
+ * Standard, `const Encoding*` representing the corresponding
+ * encoding is returned. If there is no match, `nullptr` is returned.
+ *
+ * This is the right method to use if the action upon the method returning
+ * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
+ * instead. When the action upon the method returning `nullptr` is not to
+ * proceed with a fallback but to refuse processing,
+ * `for_label_no_replacement()` is more appropriate.
+ */
+ static inline const Encoding* for_label(gsl::cstring_span<> label) {
+ return encoding_for_label(
+ null_to_bogus<const uint8_t>(
+ reinterpret_cast<const uint8_t*>(label.data())),
+ label.length());
+ }
+
+ /**
+ * This method behaves the same as `for_label()`, except when `for_label()`
+ * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
+ *
+ * This method is useful in scenarios where a fatal error is required
+ * upon invalid label, because in those cases the caller typically wishes
+ * to treat the labels that map to the replacement encoding as fatal
+ * errors, too.
+ *
+ * It is not OK to use this method when the action upon the method returning
+ * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
+ * such a case, the `for_label()` method should be used instead in order to
+ * avoid
+ * unsafe fallback for labels that `for_label()` maps to
+ * `REPLACEMENT_ENCODING`.
+ */
+ static inline const Encoding* for_label_no_replacement(
+ gsl::cstring_span<> label) {
+ return encoding_for_label_no_replacement(
+ null_to_bogus<const uint8_t>(
+ reinterpret_cast<const uint8_t*>(label.data())),
+ label.length());
+ }
+
+ /**
+ * Performs non-incremental BOM sniffing.
+ *
+ * The argument must either be a buffer representing the entire input
+ * stream (non-streaming case) or a buffer representing at least the first
+ * three bytes of the input stream (streaming case).
+ *
+ * Returns a `std::optional` wrapping `make_tuple(UTF_8_ENCODING, 3)`,
+ * `make_tuple(UTF_16LE_ENCODING, 2)` or `make_tuple(UTF_16BE_ENCODING, 2)`
+ * if the argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or
+ * `std::nullopt` otherwise.
+ */
+ static inline std::optional<
+ std::tuple<gsl::not_null<const Encoding*>, size_t>>
+ for_bom(gsl::span<const uint8_t> buffer) {
+ size_t len = buffer.size();
+ const Encoding* encoding =
+ encoding_for_bom(null_to_bogus(buffer.data()), &len);
+ if (encoding) {
+ return std::make_tuple(gsl::not_null<const Encoding*>(encoding), len);
+ }
+ return std::nullopt;
+ }
+
+ /**
+ * Returns the name of this encoding.
+ *
+ * This name is appropriate to return as-is from the DOM
+ * `document.characterSet` property.
+ */
+ inline std::string name() const {
+ std::string name(ENCODING_NAME_MAX_LENGTH, '\0');
+ // http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483
+ size_t length = encoding_name(this, reinterpret_cast<uint8_t*>(&name[0]));
+ name.resize(length);
+ return name;
+ }
+
+ /**
+ * Checks whether the _output encoding_ of this encoding can encode every
+ * Unicode code point. (Only true if the output encoding is UTF-8.)
+ */
+ inline bool can_encode_everything() const {
+ return encoding_can_encode_everything(this);
+ }
+
+ /**
+ * Checks whether the bytes 0x00...0x7F map exclusively to the characters
+ * U+0000...U+007F and vice versa.
+ */
+ inline bool is_ascii_compatible() const {
+ return encoding_is_ascii_compatible(this);
+ }
+
+ /**
+ * Checks whether this encoding maps one byte to one Basic Multilingual
+ * Plane code point (i.e. byte length equals decoded UTF-16 length) and
+ * vice versa (for mappable characters).
+ *
+ * `true` iff this encoding is on the list of Legacy single-byte
+ * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
+ * in the spec or x-user-defined.
+ */
+ inline bool is_single_byte() const { return encoding_is_single_byte(this); }
+
+ /**
+ * Returns the _output encoding_ of this encoding. This is UTF-8 for
+ * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
+ */
+ inline gsl::not_null<const Encoding*> output_encoding() const {
+ return gsl::not_null<const Encoding*>(encoding_output_encoding(this));
+ }
+
+ /**
+ * Decode complete input to `std::string` _with BOM sniffing_ and with
+ * malformed sequences replaced with the REPLACEMENT CHARACTER when the
+ * entire input is available as a single buffer (i.e. the end of the
+ * buffer marks the end of the stream).
+ *
+ * This method implements (the non-streaming version of) the
+ * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
+ *
+ * The second item in the returned tuple is the encoding that was actually
+ * used (which may differ from this encoding thanks to BOM sniffing).
+ *
+ * The third item in the returned tuple indicates whether there were
+ * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use `new_decoder()`
+ * when decoding segmented input.
+ */
+  inline std::tuple<std::string, gsl::not_null<const Encoding*>, bool> decode(
+      gsl::span<const uint8_t> bytes) const {
+    // Start from this encoding; a recognized BOM overrides the choice and is
+    // cut off the front of the input before decoding.
+    const Encoding* actual = this;
+    if (auto sniffed = Encoding::for_bom(bytes)) {
+      size_t bom_length;
+      std::tie(actual, bom_length) = *sniffed;
+      bytes = bytes.subspan(bom_length);
+    }
+    auto [text, had_errors] = actual->decode_without_bom_handling(bytes);
+    return {text, gsl::not_null<const Encoding*>(actual), had_errors};
+  }
+
+  /**
+   * Decode complete input to `std::string` _with BOM removal_ and with
+   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
+   * entire input is available as a single buffer (i.e. the end of the
+   * buffer marks the end of the stream).
+   *
+   * Only the BOM of this encoding is removed, and only when this encoding
+   * is UTF-8, UTF-16LE or UTF-16BE. A BOM for any other encoding is handed
+   * to the decoder as ordinary input.
+   *
+   * When invoked on `UTF_8`, this method implements (the non-streaming
+   * version of) the _UTF-8 decode_
+   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
+   *
+   * The second item in the returned tuple indicates whether there were
+   * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+   *
+   * _Note:_ It is wrong to use this when the input buffer represents only
+   * a segment of the input instead of the whole input. Use
+   * `new_decoder_with_bom_removal()` when decoding segmented input.
+   */
+  inline std::tuple<std::string, bool> decode_with_bom_removal(
+      gsl::span<const uint8_t> bytes) const {
+    // The BOM bytes are compared one by one. Comparing against
+    // `gsl::make_span("\xEF\xBB\xBF")` does not work: a span made from a
+    // string literal also covers the terminating NUL, so its length never
+    // equals the length of the prefix under test and the BOM is never
+    // recognized.
+    if (this == UTF_8_ENCODING) {
+      if (bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB &&
+          bytes[2] == 0xBF) {
+        bytes = bytes.subspan(3);
+      }
+    } else if (this == UTF_16LE_ENCODING) {
+      if (bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) {
+        bytes = bytes.subspan(2);
+      }
+    } else if (this == UTF_16BE_ENCODING) {
+      if (bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bytes = bytes.subspan(2);
+      }
+    }
+    return decode_without_bom_handling(bytes);
+  }
+
+ /**
+ * Decode complete input to `std::string` _without BOM handling_ and
+ * with malformed sequences replaced with the REPLACEMENT CHARACTER when
+ * the entire input is available as a single buffer (i.e. the end of the
+ * buffer marks the end of the stream).
+ *
+ * When invoked on `UTF_8`, this method implements the (non-streaming
+ * version of) the _UTF-8 decode without BOM_
+ * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
+ *
+ * The second item in the returned tuple indicates whether there were
+ * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use
+ * `new_decoder_without_bom_handling()` when decoding segmented input.
+ */
+  inline std::tuple<std::string, bool> decode_without_bom_handling(
+      gsl::span<const uint8_t> bytes) const {
+    auto decoder = new_decoder_without_bom_handling();
+    // Size the output for the worst case up front so that a single decode
+    // call can consume the whole input.
+    const auto capacity = decoder->max_utf8_buffer_length(bytes.size());
+    if (!capacity) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::string output(*capacity, '\0');
+    auto* raw = reinterpret_cast<uint8_t*>(output.data());
+    const auto [result, read, written, had_errors] = decoder->decode_to_utf8(
+        bytes, gsl::make_span(raw, output.size()), true);
+    assert(read == static_cast<size_t>(bytes.size()));
+    assert(written <= static_cast<size_t>(output.size()));
+    assert(result == INPUT_EMPTY);
+    // Drop the unused tail of the worst-case allocation.
+    output.resize(written);
+    return {output, had_errors};
+  }
+
+ /**
+ * Decode complete input to `std::string` _without BOM handling_ and
+ * _with malformed sequences treated as fatal_ when the entire input is
+ * available as a single buffer (i.e. the end of the buffer marks the end
+ * of the stream).
+ *
+ * When invoked on `UTF_8`, this method implements the (non-streaming
+ * version of) the _UTF-8 decode without BOM or fail_
+ * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
+ * spec concept.
+ *
+ * Returns `std::nullopt` if a malformed sequence was encountered and the result
+ * of the decode as `std::optional<std::string>` otherwise.
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use
+ * `new_decoder_without_bom_handling()` when decoding segmented input.
+ */
+  inline std::optional<std::string>
+  decode_without_bom_handling_and_without_replacement(
+      gsl::span<const uint8_t> bytes) const {
+    auto decoder = new_decoder_without_bom_handling();
+    // Size the output for the worst case up front so that a single decode
+    // call can consume the whole input.
+    const auto capacity =
+        decoder->max_utf8_buffer_length_without_replacement(bytes.size());
+    if (!capacity) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::string output(*capacity, '\0');
+    auto* raw = reinterpret_cast<uint8_t*>(output.data());
+    const auto [result, read, written] =
+        decoder->decode_to_utf8_without_replacement(
+            bytes, gsl::make_span(raw, output.size()), true);
+    assert(result != OUTPUT_FULL);
+    if (result != INPUT_EMPTY) {
+      // Any result other than INPUT_EMPTY is reported as a failed decode.
+      return std::nullopt;
+    }
+    assert(read == static_cast<size_t>(bytes.size()));
+    assert(written <= static_cast<size_t>(output.size()));
+    output.resize(written);
+    return output;
+  }
+
+ /**
+ * Decode complete input to `std::u16string` _with BOM sniffing_ and with
+ * malformed sequences replaced with the REPLACEMENT CHARACTER when the
+ * entire input is available as a single buffer (i.e. the end of the
+ * buffer marks the end of the stream).
+ *
+ * This method implements (the non-streaming version of) the
+ * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
+ *
+ * The second item in the returned tuple is the encoding that was actually
+ * used (which may differ from this encoding thanks to BOM sniffing).
+ *
+ * The third item in the returned tuple indicates whether there were
+ * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use `new_decoder()`
+ * when decoding segmented input.
+ */
+  inline std::tuple<std::u16string, gsl::not_null<const Encoding*>, bool>
+  decode16(gsl::span<const uint8_t> bytes) const {
+    // Start from this encoding; a recognized BOM overrides the choice and is
+    // cut off the front of the input before decoding.
+    const Encoding* actual = this;
+    if (auto sniffed = Encoding::for_bom(bytes)) {
+      size_t bom_length;
+      std::tie(actual, bom_length) = *sniffed;
+      bytes = bytes.subspan(bom_length);
+    }
+    auto [text, had_errors] = actual->decode16_without_bom_handling(bytes);
+    return {text, gsl::not_null<const Encoding*>(actual), had_errors};
+  }
+
+  /**
+   * Decode complete input to `std::u16string` _with BOM removal_ and with
+   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
+   * entire input is available as a single buffer (i.e. the end of the
+   * buffer marks the end of the stream).
+   *
+   * Only the BOM of this encoding is removed, and only when this encoding
+   * is UTF-8, UTF-16LE or UTF-16BE. A BOM for any other encoding is handed
+   * to the decoder as ordinary input.
+   *
+   * When invoked on `UTF_8`, this method implements (the non-streaming
+   * version of) the _UTF-8 decode_
+   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
+   *
+   * The second item in the returned tuple indicates whether there were
+   * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+   *
+   * _Note:_ It is wrong to use this when the input buffer represents only
+   * a segment of the input instead of the whole input. Use
+   * `new_decoder_with_bom_removal()` when decoding segmented input.
+   */
+  inline std::tuple<std::u16string, bool> decode16_with_bom_removal(
+      gsl::span<const uint8_t> bytes) const {
+    // The BOM bytes are compared one by one. Comparing against
+    // `gsl::make_span("\xEF\xBB\xBF")` does not work: a span made from a
+    // string literal also covers the terminating NUL, so its length never
+    // equals the length of the prefix under test and the BOM is never
+    // recognized.
+    if (this == UTF_8_ENCODING) {
+      if (bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB &&
+          bytes[2] == 0xBF) {
+        bytes = bytes.subspan(3);
+      }
+    } else if (this == UTF_16LE_ENCODING) {
+      if (bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) {
+        bytes = bytes.subspan(2);
+      }
+    } else if (this == UTF_16BE_ENCODING) {
+      if (bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bytes = bytes.subspan(2);
+      }
+    }
+    return decode16_without_bom_handling(bytes);
+  }
+
+ /**
+ * Decode complete input to `std::u16string` _without BOM handling_ and
+ * with malformed sequences replaced with the REPLACEMENT CHARACTER when
+ * the entire input is available as a single buffer (i.e. the end of the
+ * buffer marks the end of the stream).
+ *
+ * When invoked on `UTF_8`, this method implements the (non-streaming
+ * version of) the _UTF-8 decode without BOM_
+ * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
+ *
+ * The second item in the returned tuple indicates whether there were
+ * malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use
+ * `new_decoder_without_bom_handling()` when decoding segmented input.
+ */
+  inline std::tuple<std::u16string, bool> decode16_without_bom_handling(
+      gsl::span<const uint8_t> bytes) const {
+    auto decoder = new_decoder_without_bom_handling();
+    // Size the output for the worst case up front so that a single decode
+    // call can consume the whole input.
+    const auto capacity = decoder->max_utf16_buffer_length(bytes.size());
+    if (!capacity) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::u16string output(*capacity, '\0');
+    const auto [result, read, written, had_errors] = decoder->decode_to_utf16(
+        bytes, gsl::make_span(output.data(), output.size()), true);
+    assert(read == static_cast<size_t>(bytes.size()));
+    assert(written <= static_cast<size_t>(output.size()));
+    assert(result == INPUT_EMPTY);
+    // Drop the unused tail of the worst-case allocation.
+    output.resize(written);
+    return {output, had_errors};
+  }
+
+ /**
+ * Decode complete input to `std::u16string` _without BOM handling_ and
+ * _with malformed sequences treated as fatal_ when the entire input is
+ * available as a single buffer (i.e. the end of the buffer marks the end
+ * of the stream).
+ *
+ * When invoked on `UTF_8`, this method implements the (non-streaming
+ * version of) the _UTF-8 decode without BOM or fail_
+ * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
+ * spec concept.
+ *
+ * Returns `std::nullopt` if a malformed sequence was encountered and the result
+ * of the decode as `std::optional<std::u16string>` otherwise.
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use
+ * `new_decoder_without_bom_handling()` when decoding segmented input.
+ */
+  inline std::optional<std::u16string>
+  decode16_without_bom_handling_and_without_replacement(
+      gsl::span<const uint8_t> bytes) const {
+    auto decoder = new_decoder_without_bom_handling();
+    // Size the output for the worst case up front so that a single decode
+    // call can consume the whole input.
+    const auto capacity = decoder->max_utf16_buffer_length(bytes.size());
+    if (!capacity) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::u16string output(*capacity, '\0');
+    const auto [result, read, written] =
+        decoder->decode_to_utf16_without_replacement(
+            bytes, gsl::make_span(output.data(), output.size()), true);
+    assert(result != OUTPUT_FULL);
+    if (result != INPUT_EMPTY) {
+      // Any result other than INPUT_EMPTY is reported as a failed decode.
+      return std::nullopt;
+    }
+    assert(read == static_cast<size_t>(bytes.size()));
+    assert(written <= static_cast<size_t>(output.size()));
+    output.resize(written);
+    return output;
+  }
+
+  /**
+   * Encode complete input to `std::vector<uint8_t>` with unmappable characters
+   * replaced with decimal numeric character references when the entire input
+   * is available as a single buffer (i.e. the end of the buffer marks the
+   * end of the stream).
+   *
+   * This method implements (the non-streaming version of) the
+   * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
+   *
+   * The second item in the returned tuple is the encoding that was actually
+   * used (which may differ from this encoding thanks to some encodings
+   * having UTF-8 as their output encoding).
+   *
+   * The third item in the returned tuple indicates whether there were
+   * unmappable characters (that were replaced with HTML numeric character
+   * references). It is always `false` when the output encoding is UTF-8.
+   *
+   * Throws `std::overflow_error` if the required output buffer size cannot
+   * be computed without overflow.
+   *
+   * _Note:_ It is wrong to use this when the input buffer represents only
+   * a segment of the input instead of the whole input. Use `new_encoder()`
+   * when encoding segmented output.
+   */
+  inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
+  encode(std::string_view string) const {
+    auto output_enc = output_encoding();
+    if (output_enc == UTF_8_ENCODING) {
+      // UTF-8 in, UTF-8 out: the result is a byte-for-byte copy and nothing
+      // can be unmappable, so return it without running the encoder. The
+      // range constructor is also valid for an empty input, where `&vec[0]`
+      // would not be.
+      std::vector<uint8_t> vec(string.begin(), string.end());
+      return {vec, output_enc, false};
+    }
+    auto encoder = output_enc->new_encoder();
+    auto needed =
+        encoder->max_buffer_length_from_utf8_if_no_unmappables(string.size());
+    if (!needed) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::vector<uint8_t> vec(needed.value());
+    bool total_had_errors = false;
+    size_t total_read = 0;
+    size_t total_written = 0;
+    for (;;) {
+      const auto [result, read, written, had_errors] =
+          encoder->encode_from_utf8(string.substr(total_read),
+                                    gsl::make_span(vec).subspan(total_written),
+                                    true);
+      total_read += read;
+      total_written += written;
+      total_had_errors |= had_errors;
+      if (result == INPUT_EMPTY) {
+        assert(total_read == static_cast<size_t>(string.size()));
+        assert(total_written <= static_cast<size_t>(vec.size()));
+        vec.resize(total_written);
+        return {vec, output_enc, total_had_errors};
+      }
+      // The buffer was sized on the assumption that nothing is unmappable.
+      // The encoder stopped before the end of the input, so grow the buffer
+      // by the estimate for the remaining input and continue.
+      auto more_needed =
+          encoder->max_buffer_length_from_utf8_if_no_unmappables(
+              string.size() - total_read);
+      if (!more_needed) {
+        throw std::overflow_error("Overflow in buffer size computation.");
+      }
+      vec.resize(total_written + more_needed.value());
+    }
+  }
+
+ /**
+ * Encode complete input to `std::vector<uint8_t>` with unmappable characters
+ * replaced with decimal numeric character references when the entire input
+ * is available as a single buffer (i.e. the end of the buffer marks the
+ * end of the stream).
+ *
+ * This method implements (the non-streaming version of) the
+ * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
+ *
+ * The second item in the returned tuple is the encoding that was actually
+ * used (which may differ from this encoding thanks to some encodings
+ * having UTF-8 as their output encoding).
+ *
+ * The third item in the returned tuple indicates whether there were
+ * unmappable characters (that were replaced with HTML numeric character
+ * references).
+ *
+ * _Note:_ It is wrong to use this when the input buffer represents only
+ * a segment of the input instead of the whole input. Use `new_encoder()`
+ * when encoding segmented output.
+ */
+  inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
+  encode(std::u16string_view string) const {
+    auto target = output_encoding();
+    auto encoder = target->new_encoder();
+    auto estimate =
+        encoder->max_buffer_length_from_utf16_if_no_unmappables(string.size());
+    if (!estimate) {
+      throw std::overflow_error("Overflow in buffer size computation.");
+    }
+    std::vector<uint8_t> out(estimate.value());
+    bool any_unmappable = false;
+    size_t consumed = 0;
+    size_t produced = 0;
+    while (true) {
+      const auto [result, read, written, had_errors] =
+          encoder->encode_from_utf16(string.substr(consumed),
+                                     gsl::make_span(out).subspan(produced),
+                                     true);
+      consumed += read;
+      produced += written;
+      if (had_errors) {
+        any_unmappable = true;
+      }
+      if (result != INPUT_EMPTY) {
+        // The buffer was sized on the assumption that nothing is unmappable.
+        // The encoder stopped before the end of the input, so grow the
+        // buffer by the estimate for the remaining input and go again.
+        auto extra = encoder->max_buffer_length_from_utf16_if_no_unmappables(
+            string.size() - consumed);
+        if (!extra) {
+          throw std::overflow_error("Overflow in buffer size computation.");
+        }
+        out.resize(produced + extra.value());
+        continue;
+      }
+      assert(consumed == static_cast<size_t>(string.size()));
+      assert(produced <= static_cast<size_t>(out.size()));
+      // Drop the unused tail of the allocation before handing it back.
+      out.resize(produced);
+      return {out, gsl::not_null<const Encoding*>(target), any_unmappable};
+    }
+  }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM sniffing enabled.
+ *
+ * BOM sniffing may cause the returned decoder to morph into a decoder
+ * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
+ */
+ inline std::unique_ptr<Decoder> new_decoder() const {
+ // The raw pointer returned by `encoding_new_decoder()` is handed straight
+ // to a `std::unique_ptr`, which owns it from here on.
+ return std::unique_ptr<Decoder>(encoding_new_decoder(this));
+ }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM sniffing enabled
+ * into memory occupied by a previously-instantiated decoder.
+ *
+ * BOM sniffing may cause the returned decoder to morph into a decoder
+ * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
+ */
+ inline void new_decoder_into(Decoder& decoder) const {
+ // The C API takes the destination by pointer, so pass the address of the
+ // caller's decoder.
+ encoding_new_decoder_into(this, &decoder);
+ }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM removal.
+ *
+ * If the input starts with bytes that are the BOM for this encoding,
+ * those bytes are removed. However, the decoder never morphs into a
+ * decoder for another encoding: A BOM for another encoding is treated as
+ * (potentially malformed) input to the decoding algorithm for this
+ * encoding.
+ */
+ inline std::unique_ptr<Decoder> new_decoder_with_bom_removal() const {
+ // The raw pointer returned by the C API is handed straight to a
+ // `std::unique_ptr`, which owns it from here on.
+ return std::unique_ptr<Decoder>(
+ encoding_new_decoder_with_bom_removal(this));
+ }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM removal
+ * into memory occupied by a previously-instantiated decoder.
+ *
+ * If the input starts with bytes that are the BOM for this encoding,
+ * those bytes are removed. However, the decoder never morphs into a
+ * decoder for another encoding: A BOM for another encoding is treated as
+ * (potentially malformed) input to the decoding algorithm for this
+ * encoding.
+ */
+ inline void new_decoder_with_bom_removal_into(Decoder& decoder) const {
+ // The C API takes the destination by pointer, so pass the address of the
+ // caller's decoder.
+ encoding_new_decoder_with_bom_removal_into(this, &decoder);
+ }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM handling disabled.
+ *
+ * If the input starts with bytes that look like a BOM, those bytes are
+ * not treated as a BOM. (Hence, the decoder never morphs into a decoder
+ * for another encoding.)
+ *
+ * _Note:_ If the caller has performed BOM sniffing on its own but has not
+ * removed the BOM, the caller should use `new_decoder_with_bom_removal()`
+ * instead of this method to cause the BOM to be removed.
+ */
+ inline std::unique_ptr<Decoder> new_decoder_without_bom_handling() const {
+ // The raw pointer returned by the C API is handed straight to a
+ // `std::unique_ptr`, which owns it from here on.
+ return std::unique_ptr<Decoder>(
+ encoding_new_decoder_without_bom_handling(this));
+ }
+
+ /**
+ * Instantiates a new decoder for this encoding with BOM handling disabled
+ * into memory occupied by a previously-instantiated decoder.
+ *
+ * If the input starts with bytes that look like a BOM, those bytes are
+ * not treated as a BOM. (Hence, the decoder never morphs into a decoder
+ * for another encoding.)
+ *
+ * _Note:_ If the caller has performed BOM sniffing on its own but has not
+ * removed the BOM, the caller should use
+ * `new_decoder_with_bom_removal_into()`
+ * instead of this method to cause the BOM to be removed.
+ */
+ inline void new_decoder_without_bom_handling_into(Decoder& decoder) const {
+ // The C API takes the destination by pointer, so pass the address of the
+ // caller's decoder.
+ encoding_new_decoder_without_bom_handling_into(this, &decoder);
+ }
+
+ /**
+ * Instantiates a new encoder for the output encoding of this encoding.
+ */
+ inline std::unique_ptr<Encoder> new_encoder() const {
+ // The raw pointer returned by `encoding_new_encoder()` is handed straight
+ // to a `std::unique_ptr`, which owns it from here on.
+ return std::unique_ptr<Encoder>(encoding_new_encoder(this));
+ }
+
+ /**
+ * Instantiates a new encoder for the output encoding of this encoding
+ * into memory occupied by a previously-instantiated encoder.
+ */
+ inline void new_encoder_into(Encoder& encoder) const {
+ // The C API takes the destination by pointer, so pass the address of the
+ // caller's encoder.
+ encoding_new_encoder_into(this, &encoder);
+ }
+
+ /**
+ * Validates UTF-8.
+ *
+ * Returns the index of the first byte that makes the input malformed as
+ * UTF-8 or the length of the input if the input is entirely valid.
+ */
+ static inline size_t utf8_valid_up_to(gsl::span<const uint8_t> buffer) {
+ // The span is split into pointer + length for the C API. A null data
+ // pointer is swapped for a non-null placeholder by `null_to_bogus()`.
+ return encoding_utf8_valid_up_to(
+ null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
+ }
+
+ /**
+ * Validates ASCII.
+ *
+ * Returns the index of the first byte that makes the input malformed as
+ * ASCII or the length of the input if the input is entirely valid.
+ */
+ static inline size_t ascii_valid_up_to(gsl::span<const uint8_t> buffer) {
+ // The span is split into pointer + length for the C API. A null data
+ // pointer is swapped for a non-null placeholder by `null_to_bogus()`.
+ return encoding_ascii_valid_up_to(
+ null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
+ }
+
+ /**
+ * Validates ISO-2022-JP ASCII-state data.
+ *
+ * Returns the index of the first byte that makes the input not
+ * representable in the ASCII state of ISO-2022-JP or the length of the
+ * input if the input is entirely representable in the ASCII state of
+ * ISO-2022-JP.
+ */
+ static inline size_t iso_2022_jp_ascii_valid_up_to(
+ gsl::span<const uint8_t> buffer) {
+ // The span is split into pointer + length for the C API. A null data
+ // pointer is swapped for a non-null placeholder by `null_to_bogus()`.
+ return encoding_iso_2022_jp_ascii_valid_up_to(
+ null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
+ }
+
+ private:
+ /**
+ * Replaces `nullptr` with a bogus pointer suitable for use as part of a
+ * zero-length Rust slice.
+ */
+  template <class T>
+  static inline T* null_to_bogus(T* ptr) {
+    if (ptr) {
+      return ptr;
+    }
+    // Use the alignment of T as the address: it is non-zero and correctly
+    // aligned for T. The result must never be dereferenced.
+    return reinterpret_cast<T*>(alignof(T));
+  }
+
+ // Every special member function is deleted: C++ code cannot construct,
+ // copy, assign or destroy an `Encoding`.
+ // NOTE(review): presumably instances are only ever reached through
+ // pointers handed out by the C API (e.g. the `*_ENCODING` statics) --
+ // confirm against the rest of the header.
+ Encoding() = delete;
+ Encoding(const Encoding&) = delete;
+ Encoding& operator=(const Encoding&) = delete;
+ ~Encoding() = delete;
+};
+
+}; // namespace encoding_rs
+
+#endif // encoding_rs_cpp_h_
diff --git a/third_party/rust/encoding_c/include/encoding_rs_statics.h b/third_party/rust/encoding_c/include/encoding_rs_statics.h
new file mode 100644
index 0000000000..c3e84d586e
--- /dev/null
+++ b/third_party/rust/encoding_c/include/encoding_rs_statics.h
@@ -0,0 +1,171 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using generate-encoding-data.py
+
+// This file is not meant to be included directly. Instead, encoding_rs.h
+// includes this file.
+
+#ifndef encoding_rs_statics_h_
+#define encoding_rs_statics_h_
+
+#ifndef ENCODING_RS_ENCODING
+#define ENCODING_RS_ENCODING Encoding
+#ifndef __cplusplus
+typedef struct Encoding_ Encoding;
+#endif
+#endif
+
+#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
+#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
+#endif
+
+#ifndef ENCODING_RS_ENCODER
+#define ENCODING_RS_ENCODER Encoder
+#ifndef __cplusplus
+typedef struct Encoder_ Encoder;
+#endif
+#endif
+
+#ifndef ENCODING_RS_DECODER
+#define ENCODING_RS_DECODER Decoder
+#ifndef __cplusplus
+typedef struct Decoder_ Decoder;
+#endif
+#endif
+
+// `uint32_t` result code that stands for the Rust-side `InputEmpty` result.
+#define INPUT_EMPTY 0
+
+// `uint32_t` result code that stands for the Rust-side `OutputFull` result.
+#define OUTPUT_FULL 0xFFFFFFFF
+
+// Length of "x-mac-cyrillic" (14 bytes). `Encoding::name()` in
+// encoding_rs_cpp.h uses this value as the size of the buffer it passes to
+// `encoding_name()`.
+#define ENCODING_NAME_MAX_LENGTH 14
+
+/// The Big5 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const BIG5_ENCODING;
+
+/// The EUC-JP encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const EUC_JP_ENCODING;
+
+/// The EUC-KR encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const EUC_KR_ENCODING;
+
+/// The GBK encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const GBK_ENCODING;
+
+/// The IBM866 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const IBM866_ENCODING;
+
+/// The ISO-2022-JP encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_2022_JP_ENCODING;
+
+/// The ISO-8859-10 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_10_ENCODING;
+
+/// The ISO-8859-13 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_13_ENCODING;
+
+/// The ISO-8859-14 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_14_ENCODING;
+
+/// The ISO-8859-15 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_15_ENCODING;
+
+/// The ISO-8859-16 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_16_ENCODING;
+
+/// The ISO-8859-2 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_2_ENCODING;
+
+/// The ISO-8859-3 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_3_ENCODING;
+
+/// The ISO-8859-4 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_4_ENCODING;
+
+/// The ISO-8859-5 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_5_ENCODING;
+
+/// The ISO-8859-6 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_6_ENCODING;
+
+/// The ISO-8859-7 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_7_ENCODING;
+
+/// The ISO-8859-8 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_8_ENCODING;
+
+/// The ISO-8859-8-I encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ISO_8859_8_I_ENCODING;
+
+/// The KOI8-R encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const KOI8_R_ENCODING;
+
+/// The KOI8-U encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const KOI8_U_ENCODING;
+
+/// The Shift_JIS encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const SHIFT_JIS_ENCODING;
+
+/// The UTF-16BE encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_16BE_ENCODING;
+
+/// The UTF-16LE encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_16LE_ENCODING;
+
+/// The UTF-8 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const UTF_8_ENCODING;
+
+/// The gb18030 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const GB18030_ENCODING;
+
+/// The macintosh encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const MACINTOSH_ENCODING;
+
+/// The replacement encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const REPLACEMENT_ENCODING;
+
+/// The windows-1250 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1250_ENCODING;
+
+/// The windows-1251 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1251_ENCODING;
+
+/// The windows-1252 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1252_ENCODING;
+
+/// The windows-1253 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1253_ENCODING;
+
+/// The windows-1254 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1254_ENCODING;
+
+/// The windows-1255 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1255_ENCODING;
+
+/// The windows-1256 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1256_ENCODING;
+
+/// The windows-1257 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1257_ENCODING;
+
+/// The windows-1258 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_1258_ENCODING;
+
+/// The windows-874 encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const WINDOWS_874_ENCODING;
+
+/// The x-mac-cyrillic encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const X_MAC_CYRILLIC_ENCODING;
+
+/// The x-user-defined encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const X_USER_DEFINED_ENCODING;
+
+#endif // encoding_rs_statics_h_
diff --git a/third_party/rust/encoding_c/src/lib.rs b/third_party/rust/encoding_c/src/lib.rs
new file mode 100644
index 0000000000..699e6427c8
--- /dev/null
+++ b/third_party/rust/encoding_c/src/lib.rs
@@ -0,0 +1,1194 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![doc(html_root_url = "https://docs.rs/encoding_c/0.9.7")]
+
+//! The C API for encoding_rs.
+//!
+//! # Mapping from Rust
+//!
+//! ## Naming convention
+//!
+//! The wrapper function for each method has a name that starts with the name
+//! of the struct lower-cased, followed by an underscore and ends with the
+//! name of the method.
+//!
+//! For example, `Encoding::for_label()` is wrapped as `encoding_for_label()`.
+//!
+//! ## Arguments
+//!
+//! Functions that wrap non-static methods take the `self` object as their
+//! first argument.
+//!
+//! Slice argument `foo` is decomposed into a pointer `foo` and a length
+//! `foo_len`.
+//!
+//! ## Return values
+//!
+//! Multiple return values become out-params. When an out-param is
+//! length-related, `foo_len` for a slice becomes a pointer in order to become
+//! an in/out-param.
+//!
+//! `DecoderResult`, `EncoderResult` and `CoderResult` become `uint32_t`.
+//! `InputEmpty` becomes `INPUT_EMPTY`. `OutputFull` becomes `OUTPUT_FULL`.
+//! `Unmappable` becomes the scalar value of the unmappable character.
+//! `Malformed` becomes a number whose lowest 8 bits, which can have the decimal
+//! value 1, 2, 3 or 4, indicate the length of the malformed byte sequence and
+//! whose next-lowest 8 bits, when shifted right by 8, indicate the number of
+//! bytes that were consumed after the malformed sequence (possible decimal
+//! values 0, 1, 2 or 3). The maximum possible sum of the two is 6.
+
+extern crate encoding_rs;
+
+use encoding_rs::*;
+
+/// Return value for `*_decode_*` and `*_encode_*` functions that indicates that
+/// the input has been exhausted.
+///
+/// (This is zero as a micro optimization. U+0000 is never unmappable and
+/// malformed sequences always have a positive length.)
+pub const INPUT_EMPTY: u32 = 0;
+
+/// Return value for `*_decode_*` and `*_encode_*` functions that indicates that
+/// the output space has been exhausted.
+pub const OUTPUT_FULL: u32 = 0xFFFFFFFF;
+
+/// Newtype for `*const Encoding` in order to be able to implement `Sync` for
+/// it.
+pub struct ConstEncoding(*const Encoding);
+
+/// Required for `static` fields.
+unsafe impl Sync for ConstEncoding {}
+
+// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using generate-encoding-data.py
+
+/// The minimum length of buffers that may be passed to `encoding_name()`.
+pub const ENCODING_NAME_MAX_LENGTH: usize = 14; // x-mac-cyrillic
+
+/// The Big5 encoding.
+#[no_mangle]
+pub static BIG5_ENCODING: ConstEncoding = ConstEncoding(&BIG5_INIT);
+
+/// The EUC-JP encoding.
+#[no_mangle]
+pub static EUC_JP_ENCODING: ConstEncoding = ConstEncoding(&EUC_JP_INIT);
+
+/// The EUC-KR encoding.
+#[no_mangle]
+pub static EUC_KR_ENCODING: ConstEncoding = ConstEncoding(&EUC_KR_INIT);
+
+/// The GBK encoding.
+#[no_mangle]
+pub static GBK_ENCODING: ConstEncoding = ConstEncoding(&GBK_INIT);
+
+/// The IBM866 encoding.
+#[no_mangle]
+pub static IBM866_ENCODING: ConstEncoding = ConstEncoding(&IBM866_INIT);
+
+/// The ISO-2022-JP encoding.
+#[no_mangle]
+pub static ISO_2022_JP_ENCODING: ConstEncoding = ConstEncoding(&ISO_2022_JP_INIT);
+
+/// The ISO-8859-10 encoding.
+#[no_mangle]
+pub static ISO_8859_10_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_10_INIT);
+
+/// The ISO-8859-13 encoding.
+#[no_mangle]
+pub static ISO_8859_13_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_13_INIT);
+
+/// The ISO-8859-14 encoding.
+#[no_mangle]
+pub static ISO_8859_14_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_14_INIT);
+
+/// The ISO-8859-15 encoding.
+#[no_mangle]
+pub static ISO_8859_15_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_15_INIT);
+
+/// The ISO-8859-16 encoding.
+#[no_mangle]
+pub static ISO_8859_16_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_16_INIT);
+
+/// The ISO-8859-2 encoding.
+#[no_mangle]
+pub static ISO_8859_2_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_2_INIT);
+
+/// The ISO-8859-3 encoding.
+#[no_mangle]
+pub static ISO_8859_3_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_3_INIT);
+
+/// The ISO-8859-4 encoding.
+#[no_mangle]
+pub static ISO_8859_4_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_4_INIT);
+
+/// The ISO-8859-5 encoding.
+#[no_mangle]
+pub static ISO_8859_5_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_5_INIT);
+
+/// The ISO-8859-6 encoding.
+#[no_mangle]
+pub static ISO_8859_6_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_6_INIT);
+
+/// The ISO-8859-7 encoding.
+#[no_mangle]
+pub static ISO_8859_7_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_7_INIT);
+
+/// The ISO-8859-8 encoding.
+#[no_mangle]
+pub static ISO_8859_8_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_8_INIT);
+
+/// The ISO-8859-8-I encoding.
+#[no_mangle]
+pub static ISO_8859_8_I_ENCODING: ConstEncoding = ConstEncoding(&ISO_8859_8_I_INIT);
+
+/// The KOI8-R encoding.
+#[no_mangle]
+pub static KOI8_R_ENCODING: ConstEncoding = ConstEncoding(&KOI8_R_INIT);
+
+/// The KOI8-U encoding.
+#[no_mangle]
+pub static KOI8_U_ENCODING: ConstEncoding = ConstEncoding(&KOI8_U_INIT);
+
+/// The Shift_JIS encoding.
+#[no_mangle]
+pub static SHIFT_JIS_ENCODING: ConstEncoding = ConstEncoding(&SHIFT_JIS_INIT);
+
+/// The UTF-16BE encoding.
+#[no_mangle]
+pub static UTF_16BE_ENCODING: ConstEncoding = ConstEncoding(&UTF_16BE_INIT);
+
+/// The UTF-16LE encoding.
+#[no_mangle]
+pub static UTF_16LE_ENCODING: ConstEncoding = ConstEncoding(&UTF_16LE_INIT);
+
+/// The UTF-8 encoding.
+#[no_mangle]
+pub static UTF_8_ENCODING: ConstEncoding = ConstEncoding(&UTF_8_INIT);
+
+/// The gb18030 encoding.
+#[no_mangle]
+pub static GB18030_ENCODING: ConstEncoding = ConstEncoding(&GB18030_INIT);
+
+/// The macintosh encoding.
+#[no_mangle]
+pub static MACINTOSH_ENCODING: ConstEncoding = ConstEncoding(&MACINTOSH_INIT);
+
+/// The replacement encoding.
+#[no_mangle]
+pub static REPLACEMENT_ENCODING: ConstEncoding = ConstEncoding(&REPLACEMENT_INIT);
+
+/// The windows-1250 encoding.
+#[no_mangle]
+pub static WINDOWS_1250_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1250_INIT);
+
+/// The windows-1251 encoding.
+#[no_mangle]
+pub static WINDOWS_1251_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1251_INIT);
+
+/// The windows-1252 encoding.
+#[no_mangle]
+pub static WINDOWS_1252_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1252_INIT);
+
+/// The windows-1253 encoding.
+#[no_mangle]
+pub static WINDOWS_1253_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1253_INIT);
+
+/// The windows-1254 encoding.
+#[no_mangle]
+pub static WINDOWS_1254_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1254_INIT);
+
+/// The windows-1255 encoding.
+#[no_mangle]
+pub static WINDOWS_1255_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1255_INIT);
+
+/// The windows-1256 encoding.
+#[no_mangle]
+pub static WINDOWS_1256_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1256_INIT);
+
+/// The windows-1257 encoding.
+#[no_mangle]
+pub static WINDOWS_1257_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1257_INIT);
+
+/// The windows-1258 encoding.
+#[no_mangle]
+pub static WINDOWS_1258_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_1258_INIT);
+
+/// The windows-874 encoding.
+#[no_mangle]
+pub static WINDOWS_874_ENCODING: ConstEncoding = ConstEncoding(&WINDOWS_874_INIT);
+
+/// The x-mac-cyrillic encoding.
+#[no_mangle]
+pub static X_MAC_CYRILLIC_ENCODING: ConstEncoding = ConstEncoding(&X_MAC_CYRILLIC_INIT);
+
+/// The x-user-defined encoding.
+#[no_mangle]
+pub static X_USER_DEFINED_ENCODING: ConstEncoding = ConstEncoding(&X_USER_DEFINED_INIT);
+
+// END GENERATED CODE
+
+#[inline(always)]
+fn coder_result_to_u32(result: CoderResult) -> u32 {
+ match result {
+ CoderResult::InputEmpty => INPUT_EMPTY,
+ CoderResult::OutputFull => OUTPUT_FULL,
+ }
+}
+
+#[inline(always)]
+fn decoder_result_to_u32(result: DecoderResult) -> u32 {
+ match result {
+ DecoderResult::InputEmpty => INPUT_EMPTY,
+ DecoderResult::OutputFull => OUTPUT_FULL,
+ DecoderResult::Malformed(bad, good) => ((good as u32) << 8) | (bad as u32),
+ }
+}
+
+#[inline(always)]
+fn encoder_result_to_u32(result: EncoderResult) -> u32 {
+ match result {
+ EncoderResult::InputEmpty => INPUT_EMPTY,
+ EncoderResult::OutputFull => OUTPUT_FULL,
+ EncoderResult::Unmappable(c) => c as u32,
+ }
+}
+
+#[inline(always)]
+fn option_to_ptr(opt: Option<&'static Encoding>) -> *const Encoding {
+ match opt {
+ None => ::std::ptr::null(),
+ Some(e) => e,
+ }
+}
+
+/// Implements the
+/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
+/// algorithm.
+///
+/// If, after ASCII-lowercasing and removing leading and trailing
+/// whitespace, the argument matches a label defined in the Encoding
+/// Standard, `const Encoding*` representing the corresponding
+/// encoding is returned. If there is no match, `NULL` is returned.
+///
+/// This is the right function to use if the action upon the method returning
+/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) instead.
+/// When the action upon the method returning `NULL` is not to proceed with
+/// a fallback but to refuse processing, `encoding_for_label_no_replacement()` is
+/// more appropriate.
+///
+/// The argument buffer can be in any ASCII-compatible encoding. It is not
+/// required to be UTF-8.
+///
+/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
+/// is zero, it is OK for `label` to be something non-dereferencable,
+/// such as `0x1`. This is required due to Rust's optimization for slices
+/// within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `label` and `label_len` don't designate a valid memory block
+/// or if `label` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_for_label(label: *const u8, label_len: usize) -> *const Encoding {
+ let label_slice = ::std::slice::from_raw_parts(label, label_len);
+ option_to_ptr(Encoding::for_label(label_slice))
+}
+
+/// This function behaves the same as `encoding_for_label()`, except when
+/// `encoding_for_label()` would return `REPLACEMENT_ENCODING`, this method
+/// returns `NULL` instead.
+///
+/// This method is useful in scenarios where a fatal error is required
+/// upon invalid label, because in those cases the caller typically wishes
+/// to treat the labels that map to the replacement encoding as fatal
+/// errors, too.
+///
+/// It is not OK to use this function when the action upon the method returning
+/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
+/// such a case, the `encoding_for_label()` function should be used instead
+/// in order to avoid unsafe fallback for labels that `encoding_for_label()`
+/// maps to `REPLACEMENT_ENCODING`.
+///
+/// The argument buffer can be in any ASCII-compatible encoding. It is not
+/// required to be UTF-8.
+///
+/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
+/// is zero, it is OK for `label` to be something non-dereferencable,
+/// such as `0x1`. This is required due to Rust's optimization for slices
+/// within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `label` and `label_len` don't designate a valid memory block
+/// or if `label` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_for_label_no_replacement(
+ label: *const u8,
+ label_len: usize,
+) -> *const Encoding {
+ let label_slice = ::std::slice::from_raw_parts(label, label_len);
+ option_to_ptr(Encoding::for_label_no_replacement(label_slice))
+}
+
+/// Performs non-incremental BOM sniffing.
+///
+/// The argument must either be a buffer representing the entire input
+/// stream (non-streaming case) or a buffer representing at least the first
+/// three bytes of the input stream (streaming case).
+///
+/// Returns `UTF_8_ENCODING`, `UTF_16LE_ENCODING` or `UTF_16BE_ENCODING` if the
+/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `NULL`
+/// otherwise. Upon return, `*buffer_len` is the length of the BOM (zero if
+/// there is no BOM).
+///
+/// `buffer` must be non-`NULL` even if `*buffer_len` is zero. When
+/// `*buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `*buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_for_bom(
+ buffer: *const u8,
+ buffer_len: *mut usize,
+) -> *const Encoding {
+ let buffer_slice = ::std::slice::from_raw_parts(buffer, *buffer_len);
+ let (encoding, bom_length) = match Encoding::for_bom(buffer_slice) {
+ Some((encoding, bom_length)) => (encoding as *const Encoding, bom_length),
+ None => (::std::ptr::null(), 0),
+ };
+ *buffer_len = bom_length;
+ encoding
+}
+
+/// Writes the name of the given `Encoding` to a caller-supplied buffer as
+/// ASCII and returns the number of bytes / ASCII characters written.
+///
+/// The output is not null-terminated.
+///
+/// The caller _MUST_ ensure that `name_out` points to a buffer whose length
+/// is at least `ENCODING_NAME_MAX_LENGTH` bytes.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL` or if `name_out` doesn't point to
+/// a valid block of memory whose length is at least
+/// `ENCODING_NAME_MAX_LENGTH` bytes.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_name(encoding: *const Encoding, name_out: *mut u8) -> usize {
+ let bytes = (*encoding).name().as_bytes();
+ ::std::ptr::copy_nonoverlapping(bytes.as_ptr(), name_out, bytes.len());
+ bytes.len()
+}
+
+/// Checks whether the _output encoding_ of this encoding can encode every
+/// Unicode scalar. (Only true if the output encoding is UTF-8.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_can_encode_everything(encoding: *const Encoding) -> bool {
+ (*encoding).can_encode_everything()
+}
+
+/// Checks whether the bytes 0x00...0x7F map exclusively to the characters
+/// U+0000...U+007F and vice versa.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_is_ascii_compatible(encoding: *const Encoding) -> bool {
+ (*encoding).is_ascii_compatible()
+}
+
+/// Checks whether this encoding maps one byte to one Basic Multilingual
+/// Plane code point (i.e. byte length equals decoded UTF-16 length) and
+/// vice versa (for mappable characters).
+///
+/// `true` iff this encoding is on the list of [Legacy single-byte
+/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
+/// in the spec or x-user-defined.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_is_single_byte(encoding: *const Encoding) -> bool {
+ (*encoding).is_single_byte()
+}
+
+/// Returns the _output encoding_ of this encoding. This is UTF-8 for
+/// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_output_encoding(encoding: *const Encoding) -> *const Encoding {
+ (*encoding).output_encoding()
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM
+/// sniffing enabled and returns a pointer to the newly-allocated `Decoder`.
+///
+/// BOM sniffing may cause the returned decoder to morph into a decoder
+/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
+///
+/// Once the allocated `Decoder` is no longer needed, the caller _MUST_
+/// deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder(encoding: *const Encoding) -> *mut Decoder {
+ Box::into_raw(Box::new((*encoding).new_decoder()))
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM
+/// removal and returns a pointer to the newly-allocated `Decoder`.
+///
+/// If the input starts with bytes that are the BOM for this encoding,
+/// those bytes are removed. However, the decoder never morphs into a
+/// decoder for another encoding: A BOM for another encoding is treated as
+/// (potentially malformed) input to the decoding algorithm for this
+/// encoding.
+///
+/// Once the allocated `Decoder` is no longer needed, the caller _MUST_
+/// deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder_with_bom_removal(
+ encoding: *const Encoding,
+) -> *mut Decoder {
+ Box::into_raw(Box::new((*encoding).new_decoder_with_bom_removal()))
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` on the heap with BOM
+/// handling disabled and returns a pointer to the newly-allocated `Decoder`.
+///
+/// If the input starts with bytes that look like a BOM, those bytes are
+/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
+/// for another encoding.)
+///
+/// _Note:_ If the caller has performed BOM sniffing on its own but has not
+/// removed the BOM, the caller should use
+/// `encoding_new_decoder_with_bom_removal()` instead of this function to cause
+/// the BOM to be removed.
+///
+/// Once the allocated `Decoder` is no longer needed, the caller _MUST_
+/// deallocate it by passing the pointer returned by this function to
+/// `decoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder_without_bom_handling(
+ encoding: *const Encoding,
+) -> *mut Decoder {
+ Box::into_raw(Box::new((*encoding).new_decoder_without_bom_handling()))
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` into memory provided by
+/// the caller with BOM sniffing enabled. (In practice, the target should
+/// likely be a pointer previously returned by `encoding_new_decoder()`.)
+///
+/// Note: If the caller has already performed BOM sniffing but has
+/// not removed the BOM, the caller should still use this function in
+/// order to cause the BOM to be ignored.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder_into(
+ encoding: *const Encoding,
+ decoder: *mut Decoder,
+) {
+ *decoder = (*encoding).new_decoder();
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` into memory provided by
+/// the caller with BOM removal.
+///
+/// If the input starts with bytes that are the BOM for this encoding,
+/// those bytes are removed. However, the decoder never morphs into a
+/// decoder for another encoding: A BOM for another encoding is treated as
+/// (potentially malformed) input to the decoding algorithm for this
+/// encoding.
+///
+/// This function returns nothing; the new `Decoder` is written into the memory
+/// that `decoder` points to. (In practice, the target should likely be a
+/// pointer previously returned by `encoding_new_decoder_with_bom_removal()`.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder_with_bom_removal_into(
+ encoding: *const Encoding,
+ decoder: *mut Decoder,
+) {
+ *decoder = (*encoding).new_decoder_with_bom_removal();
+}
+
+/// Allocates a new `Decoder` for the given `Encoding` into memory provided by
+/// the caller with BOM handling disabled.
+///
+/// If the input starts with bytes that look like a BOM, those bytes are
+/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
+/// for another encoding.)
+///
+/// _Note:_ If the caller has performed BOM sniffing on its own but has not
+/// removed the BOM, the caller should use
+/// `encoding_new_decoder_with_bom_removal_into()` instead of this function to
+/// cause the BOM to be removed.
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_decoder_without_bom_handling_into(
+ encoding: *const Encoding,
+ decoder: *mut Decoder,
+) {
+ *decoder = (*encoding).new_decoder_without_bom_handling();
+}
+
+/// Allocates a new `Encoder` for the given `Encoding` on the heap and returns a
+/// pointer to the newly-allocated `Encoder`. (Exception: if the `Encoding` is
+/// `replacement`, a new `Encoder` for UTF-8 is instantiated, and that
+/// `Encoder` reports `UTF_8` as its `Encoding`.)
+///
+/// Once the allocated `Encoder` is no longer needed, the caller _MUST_
+/// deallocate it by passing the pointer returned by this function to
+/// `encoder_free()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_encoder(encoding: *const Encoding) -> *mut Encoder {
+ Box::into_raw(Box::new((*encoding).new_encoder()))
+}
+
+/// Allocates a new `Encoder` for the given `Encoding` into memory provided by
+/// the caller. (In practice, the target should likely be a pointer previously
+/// returned by `encoding_new_encoder()`.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if either argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_new_encoder_into(
+ encoding: *const Encoding,
+ encoder: *mut Encoder,
+) {
+ *encoder = (*encoding).new_encoder();
+}
+
+/// Validates UTF-8.
+///
+/// Returns the index of the first byte that makes the input malformed as
+/// UTF-8 or `buffer_len` if `buffer` is entirely valid.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_utf8_valid_up_to(buffer: *const u8, buffer_len: usize) -> usize {
+ let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len);
+ Encoding::utf8_valid_up_to(buffer_slice)
+}
+
+/// Validates ASCII.
+///
+/// Returns the index of the first byte that makes the input malformed as
+/// ASCII or `buffer_len` if `buffer` is entirely valid.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_ascii_valid_up_to(buffer: *const u8, buffer_len: usize) -> usize {
+ let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len);
+ Encoding::ascii_valid_up_to(buffer_slice)
+}
+
+/// Validates ISO-2022-JP ASCII-state data.
+///
+/// Returns the index of the first byte that makes the input not representable
+/// in the ASCII state of ISO-2022-JP or `buffer_len` if `buffer` is entirely
+/// representable in the ASCII state of ISO-2022-JP.
+///
+/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
+/// `buffer_len` is zero, it is OK for `buffer` to be something
+/// non-dereferencable, such as `0x1`. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_iso_2022_jp_ascii_valid_up_to(
+ buffer: *const u8,
+ buffer_len: usize,
+) -> usize {
+ let buffer_slice = ::std::slice::from_raw_parts(buffer, buffer_len);
+ Encoding::iso_2022_jp_ascii_valid_up_to(buffer_slice)
+}
+
+/// Deallocates a `Decoder` previously allocated by `encoding_new_decoder()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_free(decoder: *mut Decoder) {
+ let _ = Box::from_raw(decoder);
+}
+
+/// The `Encoding` this `Decoder` is for.
+///
+/// BOM sniffing can change the return value of this method during the life
+/// of the decoder.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_encoding(decoder: *const Decoder) -> *const Encoding {
+ (*decoder).encoding()
+}
+
+/// Query the worst-case UTF-8 output size _with replacement_.
+///
+/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes when decoding with
+/// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
+/// sequence or `SIZE_MAX` if `size_t` would overflow.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_max_utf8_buffer_length(
+ decoder: *const Decoder,
+ byte_length: usize,
+) -> usize {
+ (*decoder)
+ .max_utf8_buffer_length(byte_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Query the worst-case UTF-8 output size _without replacement_.
+///
+/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes when decoding without
+/// replacement error handling or `SIZE_MAX` if `size_t` would overflow.
+///
+/// Note that this value may be too small for the `_with_replacement` case.
+/// Use `decoder_max_utf8_buffer_length()` for that case.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_max_utf8_buffer_length_without_replacement(
+ decoder: *const Decoder,
+ byte_length: usize,
+) -> usize {
+ (*decoder)
+ .max_utf8_buffer_length_without_replacement(byte_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Incrementally decode a byte stream into UTF-8 with malformed sequences
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`Decoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+#[no_mangle]
+pub unsafe extern "C" fn decoder_decode_to_utf8(
+ decoder: *mut Decoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+ had_replacements: *mut bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written, replaced) = (*decoder).decode_to_utf8(src_slice, dst_slice, last);
+ *src_len = read;
+ *dst_len = written;
+ *had_replacements = replaced;
+ coder_result_to_u32(result)
+}
+
+/// Incrementally decode a byte stream into UTF-8 _without replacement_.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`Decoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+#[no_mangle]
+pub unsafe extern "C" fn decoder_decode_to_utf8_without_replacement(
+ decoder: *mut Decoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written) =
+ (*decoder).decode_to_utf8_without_replacement(src_slice, dst_slice, last);
+ *src_len = read;
+ *dst_len = written;
+ decoder_result_to_u32(result)
+}
+
+/// Query the worst-case UTF-16 output size (with or without replacement).
+///
+/// Returns the size of the output buffer in UTF-16 code units (`char16_t`)
+/// that will not overflow given the current state of the decoder and
+/// `byte_length` number of additional input bytes or `SIZE_MAX` if `size_t`
+/// would overflow.
+///
+/// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
+/// return value of this method applies also in the
+/// `_without_replacement` case.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `decoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_max_utf16_buffer_length(
+ decoder: *const Decoder,
+ u16_length: usize,
+) -> usize {
+ (*decoder)
+ .max_utf16_buffer_length(u16_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Incrementally decode a byte stream into UTF-16 with malformed sequences
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// See the top-level FFI documentation for documentation for how the
+/// `decoder_decode_*` functions are mapped from Rust and the documentation
+/// for the [`Decoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
+/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
+/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
+/// don't designate a valid block of memory or `dst` and `dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+#[no_mangle]
+pub unsafe extern "C" fn decoder_decode_to_utf16(
+ decoder: *mut Decoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u16,
+ dst_len: *mut usize,
+ last: bool,
+ had_replacements: *mut bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written, replaced) = (*decoder).decode_to_utf16(src_slice, dst_slice, last);
+ *src_len = read;
+ *dst_len = written;
+ *had_replacements = replaced;
+ coder_result_to_u32(result)
+}
+
+/// Incrementally decode a byte stream into UTF-16 _without replacement_.
+///
+/// See the top-level FFI documentation for how the `decoder_decode_*`
+/// functions are mapped from Rust and the documentation for the
+/// [`Decoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero,
+/// it is OK for `src` to be something non-dereferenceable, such as `0x1`.
+/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len`
+/// don't designate a valid block of memory or `dst` and `*dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
+#[no_mangle]
+pub unsafe extern "C" fn decoder_decode_to_utf16_without_replacement(
+ decoder: *mut Decoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u16,
+ dst_len: *mut usize,
+ last: bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written) =
+ (*decoder).decode_to_utf16_without_replacement(src_slice, dst_slice, last);
+ *src_len = read; // in: length of `src`; out: amount read
+ *dst_len = written; // in: length of `dst`; out: amount written
+ decoder_result_to_u32(result)
+}
+
+/// Checks for compatibility with storing Unicode scalar values as unsigned
+/// bytes taking into account the state of the decoder.
+///
+/// Returns `SIZE_MAX` if the decoder is not in a neutral state, including waiting
+/// for the BOM, or if the encoding is never Latin1-byte-compatible.
+///
+/// Otherwise returns the index of the first byte whose unsigned value doesn't
+/// directly correspond to the decoded Unicode scalar value, or the length
+/// of the input if all bytes in the input decode directly to scalar values
+/// corresponding to the unsigned byte values.
+///
+/// Does not change the state of the decoder.
+///
+/// Do not use this unless you are supporting SpiderMonkey/V8-style string
+/// storage optimizations.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
+/// block or if `buffer` or `decoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn decoder_latin1_byte_compatible_up_to(
+ decoder: *const Decoder,
+ buffer: *const u8,
+ buffer_len: usize,
+) -> usize {
+ (*decoder)
+ .latin1_byte_compatible_up_to(::std::slice::from_raw_parts(buffer, buffer_len))
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Deallocates an `Encoder` previously allocated by `encoding_new_encoder()`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if the argument is `NULL` or has already been freed.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_free(encoder: *mut Encoder) {
+ let _ = Box::from_raw(encoder); // retake ownership in a `Box`; it is dropped right here
+}
+
+/// Returns the `Encoding` this `Encoder` is for.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_encoding(encoder: *const Encoder) -> *const Encoding {
+ (*encoder).encoding()
+}
+
+/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
+/// ASCII state and `false` otherwise.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_has_pending_state(encoder: *const Encoder) -> bool {
+ (*encoder).has_pending_state()
+}
+
+/// Query the worst-case output size when encoding from UTF-8 with replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `byte_length` number of
+/// additional input code units if there are no unmappable characters in
+/// the input or `SIZE_MAX` if `size_t` would overflow.
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_max_buffer_length_from_utf8_if_no_unmappables(
+ encoder: *const Encoder,
+ byte_length: usize,
+) -> usize {
+ (*encoder)
+ .max_buffer_length_from_utf8_if_no_unmappables(byte_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Query the worst-case output size when encoding from UTF-8 without replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `byte_length` number of
+/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_max_buffer_length_from_utf8_without_replacement(
+ encoder: *const Encoder,
+ byte_length: usize,
+) -> usize {
+ (*encoder)
+ .max_buffer_length_from_utf8_without_replacement(byte_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Incrementally encode into byte stream from UTF-8 with unmappable
+/// characters replaced with HTML (decimal) numeric character references.
+///
+/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
+/// If in doubt, check the validity of input before using!
+///
+/// See the top-level FFI documentation for how the `encoder_encode_*`
+/// functions are mapped from Rust and the documentation for the
+/// [`Encoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero,
+/// it is OK for `src` to be something non-dereferenceable, such as `0x1`.
+/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len`
+/// don't designate a valid block of memory or `dst` and `*dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+#[no_mangle]
+pub unsafe extern "C" fn encoder_encode_from_utf8(
+ encoder: *mut Encoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+ had_replacements: *mut bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let string = ::std::str::from_utf8_unchecked(src_slice); // not validated: caller guarantees UTF-8
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written, replaced) = (*encoder).encode_from_utf8(string, dst_slice, last);
+ *src_len = read; // in: length of `src`; out: amount read
+ *dst_len = written; // in: length of `dst`; out: amount written
+ *had_replacements = replaced; // out only
+ coder_result_to_u32(result)
+}
+
+/// Incrementally encode into byte stream from UTF-8 _without replacement_.
+///
+/// See the top-level FFI documentation for how the `encoder_encode_*`
+/// functions are mapped from Rust and the documentation for the
+/// [`Encoder`][1] struct for the semantics.
+///
+/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
+/// If in doubt, check the validity of input before using!
+///
+/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero,
+/// it is OK for `src` to be something non-dereferenceable, such as `0x1`.
+/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len`
+/// don't designate a valid block of memory or `dst` and `*dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+#[no_mangle]
+pub unsafe extern "C" fn encoder_encode_from_utf8_without_replacement(
+ encoder: *mut Encoder,
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let string = ::std::str::from_utf8_unchecked(src_slice); // not validated: caller guarantees UTF-8
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written) =
+ (*encoder).encode_from_utf8_without_replacement(string, dst_slice, last);
+ *src_len = read; // in: length of `src`; out: amount read
+ *dst_len = written; // in: length of `dst`; out: amount written
+ encoder_result_to_u32(result)
+}
+
+/// Query the worst-case output size when encoding from UTF-16 with replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `u16_length` number of
+/// additional input code units if there are no unmappable characters in
+/// the input or `SIZE_MAX` if `size_t` would overflow.
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_max_buffer_length_from_utf16_if_no_unmappables(
+ encoder: *const Encoder,
+ u16_length: usize,
+) -> usize {
+ (*encoder)
+ .max_buffer_length_from_utf16_if_no_unmappables(u16_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Query the worst-case output size when encoding from UTF-16 without replacement.
+///
+/// Returns the size of the output buffer in bytes that will not overflow
+/// given the current state of the encoder and `u16_length` number of
+/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
+/// UB ensues if `encoder` is `NULL`.
+#[no_mangle]
+pub unsafe extern "C" fn encoder_max_buffer_length_from_utf16_without_replacement(
+ encoder: *const Encoder,
+ u16_length: usize,
+) -> usize {
+ (*encoder)
+ .max_buffer_length_from_utf16_without_replacement(u16_length)
+ .unwrap_or(::std::usize::MAX)
+}
+
+/// Incrementally encode into byte stream from UTF-16 with unmappable
+/// characters replaced with HTML (decimal) numeric character references.
+///
+/// See the top-level FFI documentation for how the `encoder_encode_*`
+/// functions are mapped from Rust and the documentation for the
+/// [`Encoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero,
+/// it is OK for `src` to be something non-dereferenceable, such as `0x1`.
+/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len`
+/// don't designate a valid block of memory or `dst` and `*dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+#[no_mangle]
+pub unsafe extern "C" fn encoder_encode_from_utf16(
+ encoder: *mut Encoder,
+ src: *const u16,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+ had_replacements: *mut bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written, replaced) =
+ (*encoder).encode_from_utf16(src_slice, dst_slice, last);
+ *src_len = read; // in: length of `src`; out: amount read
+ *dst_len = written; // in: length of `dst`; out: amount written
+ *had_replacements = replaced; // out only
+ coder_result_to_u32(result)
+}
+
+/// Incrementally encode into byte stream from UTF-16 _without replacement_.
+///
+/// See the top-level FFI documentation for how the `encoder_encode_*`
+/// functions are mapped from Rust and the documentation for the
+/// [`Encoder`][1] struct for the semantics.
+///
+/// `src` must be non-`NULL` even if `*src_len` is zero. When `*src_len` is zero,
+/// it is OK for `src` to be something non-dereferenceable, such as `0x1`.
+/// Likewise for `dst` when `*dst_len` is zero. This is required due to Rust's
+/// optimization for slices within `Option`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if any of the pointer arguments is `NULL`, `src` and `*src_len`
+/// don't designate a valid block of memory or `dst` and `*dst_len` don't
+/// designate a valid block of memory.
+///
+/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
+#[no_mangle]
+pub unsafe extern "C" fn encoder_encode_from_utf16_without_replacement(
+ encoder: *mut Encoder,
+ src: *const u16,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+ last: bool,
+) -> u32 {
+ let src_slice = ::std::slice::from_raw_parts(src, *src_len);
+ let dst_slice = ::std::slice::from_raw_parts_mut(dst, *dst_len);
+ let (result, read, written) =
+ (*encoder).encode_from_utf16_without_replacement(src_slice, dst_slice, last);
+ *src_len = read; // in: length of `src`; out: amount read
+ *dst_len = written; // in: length of `dst`; out: amount written
+ encoder_result_to_u32(result)
+}
diff --git a/third_party/rust/encoding_c_mem/.cargo-checksum.json b/third_party/rust/encoding_c_mem/.cargo-checksum.json
new file mode 100644
index 0000000000..b2de1315ea
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"CONTRIBUTING.md":"d393951002340c3d98011f7b654e8133408f3f0e13b9f6470f4cb5d251e3afed","COPYRIGHT":"8667a5cdf817b0123721cc7d7ca73e97f05ac926203a13646a9e8a30c70c0989","Cargo.toml":"bc7ca08a7395d4839be804fff569d96a5cf0250be792e074af2f57c1ab1fd8d4","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"333b861c160f5328e9fb4bf506e8aaaf1a1eab8e93af3ce03998c3f6b57a2da2","build.rs":"013c85c18b035473d3a0900b833906304a8431882e5c22053684a69588adde98","include/encoding_rs_mem.h":"99f2c8d900bdb66ffd74772419a0e50d482d25f20db54a187c78d079fe483be0","include/encoding_rs_mem_cpp.h":"5a546590508d8e1cc78493d6e0a04cdb80a499d23ef192603a31aaf2e518ca3a","src/lib.rs":"7d5940a215cd93b231aafa61cc9cff474e808893a35d5236b99e7317697ac308"},"package":"3a80a16821fe8c7cab96e0c67b57cd7090e021e9615e6ce6ab0cf866c44ed1f0"} \ No newline at end of file
diff --git a/third_party/rust/encoding_c_mem/CONTRIBUTING.md b/third_party/rust/encoding_c_mem/CONTRIBUTING.md
new file mode 100644
index 0000000000..88322776f6
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/CONTRIBUTING.md
@@ -0,0 +1,33 @@
+If you send a pull request / patch, please observe the following.
+
+## Licensing
+
+Since this crate is dual-licensed,
+[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
+is considered to apply in the sense of Contributions being automatically
+under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
+That is, by the act of offering a Contribution, you place your Contribution
+under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
+file. Please do not contribute if you aren't willing or allowed to license your
+contributions in this manner.
+
+You are encouraged to dedicate test code that you contribute to the Public
+Domain using the CC0 dedication. If you contribute test code that is not
+dedicated to the Public Domain, please be sure not to put it in a part of
+source code that the comments designate as being dedicated to the Public
+Domain.
+
+## Copyright Notices
+
+If you require the addition of your copyright notice, it's up to you to edit in
+your notice as part of your Contribution. Not adding a copyright notice is
+taken as a waiver of copyright notice.
+
+## Compatibility with Stable Rust
+
+Please ensure that your Contribution compiles with the latest stable-channel
+rustc.
+
+## rustfmt
+
+Please run `cargo fmt` before creating a pull request. \ No newline at end of file
diff --git a/third_party/rust/encoding_c_mem/COPYRIGHT b/third_party/rust/encoding_c_mem/COPYRIGHT
new file mode 100644
index 0000000000..b4569d6701
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/COPYRIGHT
@@ -0,0 +1,9 @@
+encoding_c_mem is copyright Mozilla Foundation.
+
+Licensed under the Apache License, Version 2.0
+<LICENSE-APACHE or
+https://www.apache.org/licenses/LICENSE-2.0> or the MIT
+license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
+at your option. All files in the project carrying such
+notice may not be copied, modified, or distributed except
+according to those terms.
diff --git a/third_party/rust/encoding_c_mem/Cargo.toml b/third_party/rust/encoding_c_mem/Cargo.toml
new file mode 100644
index 0000000000..2284f40fd8
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/Cargo.toml
@@ -0,0 +1,27 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+edition = "2018"
+name = "encoding_c_mem"
+version = "0.2.6"
+authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
+links = "encoding_c_mem"
+description = "C API for encoding_rs::mem"
+homepage = "https://docs.rs/encoding_c_mem/"
+documentation = "https://docs.rs/encoding_c_mem/"
+readme = "README.md"
+keywords = ["ffi", "capi", "encoding", "unicode", "charset"]
+license = "Apache-2.0 OR MIT"
+repository = "https://github.com/hsivonen/encoding_c_mem"
+[dependencies.encoding_rs]
+version = "0.8.19"
diff --git a/third_party/rust/encoding_c_mem/LICENSE-APACHE b/third_party/rust/encoding_c_mem/LICENSE-APACHE
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/third_party/rust/encoding_c_mem/LICENSE-MIT b/third_party/rust/encoding_c_mem/LICENSE-MIT
new file mode 100644
index 0000000000..3317c82e2f
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright Mozilla Foundation
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/third_party/rust/encoding_c_mem/README.md b/third_party/rust/encoding_c_mem/README.md
new file mode 100644
index 0000000000..59c7abe607
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/README.md
@@ -0,0 +1,60 @@
+# encoding_c_mem
+
+[![crates.io](https://meritbadge.herokuapp.com/encoding_c_mem)](https://crates.io/crates/encoding_c_mem)
+[![docs.rs](https://docs.rs/encoding_c_mem/badge.svg)](https://docs.rs/encoding_c_mem/)
+[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_c_mem/blob/master/COPYRIGHT)
+
+encoding_c_mem is an FFI wrapper for the `mem` module of [encoding_rs](https://github.com/hsivonen/encoding_rs).
+
+## Licensing
+
+Please see the file named
+[COPYRIGHT](https://github.com/hsivonen/encoding_c_mem/blob/master/COPYRIGHT).
+
+## No Unwinding Support!
+
+This crate is meant for use in binaries compiled with `panic = 'abort'`, which
+is _required_ for correctness! Unwinding across FFI is Undefined Behavior, and
+this crate does nothing to try to prevent unwinding across the FFI if
+compiled with unwinding enabled.
+
+## Release Notes
+
+### 0.2.6
+
+* Remove year from copyright notices.
+
+### 0.2.5
+
+* Specify a `links` value in the Cargo manifest.
+* Emit an `include_dir` variable from build script so that other build scripts
+ depending on this crate can rely on it.
+
+### 0.2.4
+
+* Documentation-only fix.
+
+### 0.2.3
+
+* Documentation-only fix.
+
+### 0.2.2
+
+* Wrap `convert_utf8_to_utf16_without_replacement`, `utf8_latin1_up_to`,
+ and `str_latin1_up_to`.
+
+### 0.2.1
+
+* Fix a typo in README.
+
+### 0.2.0
+
+* Use `char` instead of `uint8_t` for 8-bit-unit text in C and C++.
+
+### 0.1.1
+
+* Add include guard to the C header.
+
+### 0.1.0
+
+* Initial release of encoding_c_mem.
diff --git a/third_party/rust/encoding_c_mem/build.rs b/third_party/rust/encoding_c_mem/build.rs
new file mode 100644
index 0000000000..962b7ae12b
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/build.rs
@@ -0,0 +1,7 @@
+fn main() {
+ println!("cargo:rerun-if-changed="); // NOTE(review): empty path looks deliberate (limits re-runs) -- confirm
+ // Publish `<manifest dir>/include` as `include-dir` metadata for dependent build scripts.
+ let cargo_manifest_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap();
+ let include_dir = std::path::PathBuf::from(cargo_manifest_dir).join("include");
+ println!("cargo:include-dir={}", include_dir.display());
+}
diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h
new file mode 100644
index 0000000000..2327a9dd0b
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem.h
@@ -0,0 +1,704 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#ifndef encoding_rs_mem_h_
+#define encoding_rs_mem_h_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * _Note:_ "Latin1" in this header refers to the Unicode range from U+0000 to
+ * U+00FF, inclusive, and does not refer to the windows-1252 range. This
+ * in-memory encoding is sometimes used as a storage optimization of text
+ * when UTF-16 indexing and length semantics are exposed.
+ */
+
+/**
+ * Classification of text as Latin1 (all code points are below U+0100),
+ * left-to-right with some non-Latin1 characters or as containing at least
+ * some right-to-left characters.
+ */
+typedef enum {
+ /**
+ * Every character is below U+0100 (i.e. in the range U+0000 to U+00FF).
+ */
+ Latin1 = 0,
+ /**
+ * There is at least one character that's U+0100 or higher, but there
+ * are no right-to-left characters.
+ */
+ LeftToRight = 1,
+ /**
+ * There is at least one right-to-left character.
+ */
+ Bidi = 2,
+} Latin1Bidi; // Returned by the encoding_mem_check_*_for_latin1_and_bidi() functions declared below.
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/**
+ * Checks whether a valid UTF-8 buffer contains code points
+ * that trigger right-to-left processing or is all-Latin1.
+ *
+ * Possibly more efficient than performing the checks separately.
+ *
+ * Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
+ * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
+ * if `buffer` is `NULL`, or if the memory designated by `buffer` and
+ * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
+ * may be bogus but still has to be non-`NULL`.)
+ */
+Latin1Bidi encoding_mem_check_str_for_latin1_and_bidi(const char* buffer,
+ size_t len);
+
+/**
+ * Checks whether a potentially invalid UTF-16 buffer contains code points
+ * that trigger right-to-left processing or is all-Latin1.
+ *
+ * Possibly more efficient than performing the checks separately.
+ *
+ * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
+ * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+Latin1Bidi encoding_mem_check_utf16_for_latin1_and_bidi(const char16_t* buffer,
+ size_t len);
+
+/**
+ * Checks whether a potentially invalid UTF-8 buffer contains code points
+ * that trigger right-to-left processing or is all-Latin1.
+ *
+ * Possibly more efficient than performing the checks separately.
+ *
+ * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
+ *
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
+ * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL`.)
+ */
+Latin1Bidi encoding_mem_check_utf8_for_latin1_and_bidi(const char* buffer,
+ size_t len);
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * The number of `char16_t`s written equals the length of the source buffer.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len,
+ char16_t* dst, size_t dst_len);
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer times two.
+ *
+ * Returns the number of bytes written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Safety
+ *
+ * Note that this function may write garbage beyond the number of bytes
+ * indicated by the return value.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_latin1_to_utf8(const char* src, size_t src_len,
+ char* dst, size_t dst_len);
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
+ * output space.
+ *
+ * Writes the number of code units read into `*src_len` and the number of
+ * bytes written into `*dst_len`.
+ *
+ * If the output isn't large enough, not all input is consumed.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+void encoding_mem_convert_latin1_to_utf8_partial(const char* src,
+ size_t* src_len, char* dst,
+ size_t* dst_len);
+
+/**
+ * Converts valid UTF-8 to valid UTF-16.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL`, if the two memory blocks overlap, or if the
+ * buffer designated by `src` and `src_len` does not contain valid UTF-8. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_str_to_utf16(const char* src, size_t src_len,
+ char16_t* dst, size_t dst_len);
+
+/**
+ * If the input is valid UTF-16 representing only Unicode code points from
+ * U+0000 to U+00FF, inclusive, converts the input into output that
+ * represents the value of each code point as the unsigned byte value of
+ * each output byte.
+ *
+ * If the input does not fulfill the condition stated above, does something
+ * that is memory-safe without any promises about any properties of the
+ * output and will probably assert in debug builds in future versions.
+ * In particular, callers shouldn't assume the output to be the same across
+ * crate versions or CPU architectures and should not assume that non-ASCII
+ * input can't map to ASCII output.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * The number of bytes written equals the length of the source buffer.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ * (Probably in future versions if debug assertions are enabled (and not
+ * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src,
+ size_t src_len, char* dst,
+ size_t dst_len);
+
+/**
+ * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+ * with the REPLACEMENT CHARACTER.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer times three.
+ *
+ * Returns the number of bytes written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len,
+ char* dst, size_t dst_len);
+
+/**
+ * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+ * with the REPLACEMENT CHARACTER with potentially insufficient output
+ * space.
+ *
+ * Writes the number of code units read into `*src_len` and the number of
+ * bytes written into `*dst_len`.
+ *
+ * Guarantees that the bytes in the destination beyond the number of
+ * bytes claimed as written (the value stored into `*dst_len`) are left
+ * unmodified.
+ *
+ * Not all code units are read if there isn't enough output space.
+ * Note that this method isn't designed for general streamability but for
+ * not allocating memory for the worst case up front. Specifically,
+ * if the input starts with or ends with an unpaired surrogate, those are
+ * replaced with the REPLACEMENT CHARACTER.
+ *
+ * Matches the semantics of `TextEncoder.encodeInto()` from the
+ * Encoding Standard.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+void encoding_mem_convert_utf16_to_utf8_partial(const char16_t* src,
+ size_t* src_len, char* dst,
+ size_t* dst_len);
+
+/**
+ * If the input is valid UTF-8 representing only Unicode code points from
+ * U+0000 to U+00FF, inclusive, converts the input into output that
+ * represents the value of each code point as the unsigned byte value of
+ * each output byte.
+ *
+ * If the input does not fulfill the condition stated above, this function
+ * panics if debug assertions are enabled (and fuzzing isn't) and otherwise
+ * does something that is memory-safe without any promises about any
+ * properties of the output. In particular, callers shouldn't assume the
+ * output to be the same across crate versions or CPU architectures and
+ * should not assume that non-ASCII input can't map to ASCII output.
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ * Also panics if debug assertions are enabled (and not fuzzing) and the
+ * input is not in the range U+0000 to U+00FF, inclusive.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src,
+ size_t src_len, char* dst,
+ size_t dst_len);
+
+/**
+ * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
+ * with the REPLACEMENT CHARACTER.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer _plus one_.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len,
+ char16_t* dst, size_t dst_len);
+
+/**
+ * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written or `SIZE_MAX` if the input was
+ * invalid.
+ *
+ * When the input was invalid, some output may have been written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_convert_utf8_to_utf16_without_replacement(const char* src,
+ size_t src_len,
+ char16_t* dst,
+ size_t dst_len);
+
+/**
+ * Copies ASCII from source to destination up to the first non-ASCII byte
+ * (or the end of the input if it is ASCII in its entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_copy_ascii_to_ascii(const char* src, size_t src_len,
+ char* dst, size_t dst_len);
+
+/**
+ * Copies ASCII from source to destination zero-extending it to UTF-16 up to
+ * the first non-ASCII byte (or the end of the input if it is ASCII in its
+ * entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_copy_ascii_to_basic_latin(const char* src, size_t src_len,
+ char16_t* dst, size_t dst_len);
+
+/**
+ * Copies Basic Latin from source to destination narrowing it to ASCII up to
+ * the first non-Basic Latin code unit (or the end of the input if it is
+ * Basic Latin in its entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `src_len` don't designate a valid memory block, if
+ * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+ * block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+ * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+ * aligned. Likewise for `dst` and `dst_len`.)
+ */
+size_t encoding_mem_copy_basic_latin_to_ascii(const char16_t* src,
+ size_t src_len, char* dst,
+ size_t dst_len);
+
+/**
+ * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t len);
+
+/**
+ * Checks whether the buffer is all-ASCII.
+ *
+ * May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL`.)
+ */
+bool encoding_mem_is_ascii(const char* buffer, size_t len);
+
+/**
+ * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
+ * only ASCII characters).
+ *
+ * May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t len);
+
+/**
+ * Checks whether a scalar value triggers right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * # Undefined behavior
+ *
+ * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value.
+ */
+bool encoding_mem_is_char_bidi(char32_t c);
+
+/**
+ * Checks whether a valid UTF-8 buffer contains code points that trigger
+ * right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
+ * if `buffer` is `NULL`, or if the memory designated by `buffer` and
+ * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
+ * may be bogus but still has to be non-`NULL`.)
+ */
+bool encoding_mem_is_str_bidi(const char* buffer, size_t len);
+
+/**
+ * Checks whether the buffer represents only code points less than or equal
+ * to U+00FF.
+ *
+ * Fails fast. (I.e. returns before having read the whole buffer if code
+ * points above U+00FF are discovered.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
+ * if `buffer` is `NULL`, or if the memory designated by `buffer` and
+ * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
+ * may be bogus but still has to be non-`NULL`.)
+ */
+bool encoding_mem_is_str_latin1(const char* buffer, size_t len);
+
+/**
+ * Checks whether a UTF-16 buffer contains code points that trigger
+ * right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ * Returns `true` if the input contains an RTL character or an unpaired
+ * high surrogate that could be the high half of an RTL character.
+ * Returns `false` if the input contains neither RTL characters nor
+ * unpaired high surrogates that could be higher halves of RTL characters.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len);
+
+/**
+ * Checks whether a UTF-16 code unit triggers right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ * Since supplementary-plane right-to-left blocks are identifiable from the
+ * high surrogate without examining the low surrogate, this function returns
+ * `true` for such high surrogates making the function suitable for handling
+ * supplementary-plane text without decoding surrogate pairs to scalar
+ * values. Obviously, such high surrogates are then reported as right-to-left
+ * even if actually unpaired.
+ */
+bool encoding_mem_is_utf16_code_unit_bidi(char16_t u);
+
+/**
+ * Checks whether the buffer represents only code points less than or equal
+ * to U+00FF.
+ *
+ * May read the entire buffer even if it isn't all-Latin1. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t len);
+
+/**
+ * Checks whether a potentially-invalid UTF-8 buffer contains code points
+ * that trigger right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ * Returns `true` if the input is invalid UTF-8 or the input contains an
+ * RTL character. Returns `false` if the input is valid UTF-8 and contains
+ * no RTL characters.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL`.)
+ */
+bool encoding_mem_is_utf8_bidi(const char* buffer, size_t len);
+
+/**
+ * Checks whether the buffer is valid UTF-8 representing only code points
+ * less than or equal to U+00FF.
+ *
+ * Fails fast. (I.e. returns before having read the whole buffer if UTF-8
+ * invalidity or code points above U+00FF are discovered.)
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL`.)
+ */
+bool encoding_mem_is_utf8_latin1(const char* buffer, size_t len);
+
+/**
+ * Returns the index of the first unpaired surrogate or, if the input is
+ * valid UTF-16 in its entirety, the length of the input.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer, size_t len);
+
+/**
+ * Returns the index of first byte that starts an invalid byte
+ * sequence or a non-Latin1 byte sequence, or the length of the
+ * string if there are neither.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
+ * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
+ * still has to be non-`NULL` and aligned.)
+ */
+size_t encoding_mem_utf8_latin1_up_to(const char* buffer, size_t len);
+
+/**
+ * Returns the index of first byte that starts a non-Latin1 byte
+ * sequence, or the length of the string if there are none.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
+ * if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8.
+ * (If `buffer_len` is `0`, `buffer` may be bogus but still has to be non-`NULL`
+ * and aligned.)
+ */
+size_t encoding_mem_str_latin1_up_to(const char* buffer, size_t len);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // encoding_rs_mem_h_
diff --git a/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h
new file mode 100644
index 0000000000..b6173d7ef4
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/include/encoding_rs_mem_cpp.h
@@ -0,0 +1,578 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#pragma once
+
+#ifndef encoding_rs_mem_cpp_h_
+#define encoding_rs_mem_cpp_h_
+
+#include <optional>
+#include <string_view>
+#include <tuple>
+#include "gsl/gsl"
+
+#include "encoding_rs_mem.h"
+
+namespace encoding_rs {
+namespace mem {
+
+namespace detail {
+/**
+ * Replaces `nullptr` with a bogus pointer suitable for use as part of a
+ * zero-length Rust slice.
+ *
+ * The placeholder's address is `alignof(T)`, which makes it both non-null
+ * and suitably aligned for `T`. It must never be dereferenced. Non-null
+ * inputs are returned unchanged.
+ */
+template <class T>
+static inline T* null_to_bogus(T* ptr) {
+ return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
+}
+}; // namespace detail
+
+/**
+ * Checks whether a potentially invalid UTF-16 buffer contains code points
+ * that trigger right-to-left processing or is all-Latin1.
+ *
+ * Possibly more efficient than performing the checks separately.
+ *
+ * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
+ * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline Latin1Bidi check_for_latin1_and_bidi(std::u16string_view buffer) {
+ return encoding_mem_check_utf16_for_latin1_and_bidi(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether a potentially invalid UTF-8 buffer contains code points
+ * that trigger right-to-left processing or is all-Latin1.
+ *
+ * Possibly more efficient than performing the checks separately.
+ *
+ * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
+ * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
+ * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline Latin1Bidi check_for_latin1_and_bidi(std::string_view buffer) {
+ return encoding_mem_check_utf8_for_latin1_and_bidi(
+ encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * The number of `char16_t`s written equals the length of the source buffer.
+ *
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline void convert_latin1_to_utf16(gsl::span<const char> src,
+ gsl::span<char16_t> dst) {
+ encoding_mem_convert_latin1_to_utf16(
+ encoding_rs::mem::detail::null_to_bogus<const char>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer times two.
+ *
+ * Returns the number of bytes written.
+ *
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Safety
+ *
+ * Note that this function may write garbage beyond the number of bytes
+ * indicated by the return value.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `dst` overlap.
+ */
+inline size_t convert_latin1_to_utf8(gsl::span<const char> src,
+ gsl::span<char> dst) {
+ return encoding_mem_convert_latin1_to_utf8(
+ encoding_rs::mem::detail::null_to_bogus<const char>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts bytes whose unsigned value is interpreted as Unicode code point
+ * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
+ * output space.
+ *
+ * Returns the number of bytes read and the number of bytes written.
+ *
+ * If the output isn't large enough, not all input is consumed.
+ *
+ * `src.size()` and `dst.size()` are passed to the C function by pointer as
+ * in/out values; the values it leaves there are what this function returns.
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `dst` overlap.
+ */
+inline std::tuple<size_t, size_t> convert_latin1_to_utf8_partial(
+ gsl::span<const char> src, gsl::span<char> dst) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ encoding_mem_convert_latin1_to_utf8_partial(
+ encoding_rs::mem::detail::null_to_bogus<const char>(src.data()),
+ &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ &dst_written);
+ return {src_read, dst_written};
+}
+
+/**
+ * Converts valid UTF-8 to valid UTF-16.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * NOTE(review): the `str`-flavored C functions in encoding_rs_mem.h document
+ * undefined behavior for input that is not valid UTF-8; presumably the same
+ * applies to `src` here -- confirm against the C header.
+ *
+ * NOTE(review): the `reinterpret_cast<const char*>` below is a no-op, as
+ * `std::string_view::data()` already returns `const char*`.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline size_t convert_str_to_utf16(std::string_view src,
+ gsl::span<char16_t> dst) {
+ return encoding_mem_convert_str_to_utf16(
+ encoding_rs::mem::detail::null_to_bogus<const char>(
+ reinterpret_cast<const char*>(src.data())),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()),
+ dst.size());
+}
+
+/**
+ * If the input is valid UTF-16 representing only Unicode code points from
+ * U+0000 to U+00FF, inclusive, converts the input into output that
+ * represents the value of each code point as the unsigned byte value of
+ * each output byte.
+ *
+ * If the input does not fulfill the condition stated above, does something
+ * that is memory-safe without any promises about any properties of the
+ * output and will probably assert in debug builds in future versions.
+ * In particular, callers shouldn't assume the output to be the same across
+ * crate versions or CPU architectures and should not assume that non-ASCII
+ * input can't map to ASCII output.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * The number of bytes written equals the length of the source buffer.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ * (Probably in future versions if debug assertions are enabled (and not
+ * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
+ */
+inline void convert_utf16_to_latin1_lossy(std::u16string_view src,
+ gsl::span<char> dst) {
+ encoding_mem_convert_utf16_to_latin1_lossy(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+ * with the REPLACEMENT CHARACTER.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer times three.
+ *
+ * Returns the number of bytes written.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline size_t convert_utf16_to_utf8(std::u16string_view src,
+ gsl::span<char> dst) {
+ return encoding_mem_convert_utf16_to_utf8(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+ * with the REPLACEMENT CHARACTER with potentially insufficient output
+ * space.
+ *
+ * Returns the number of code units read and the number of bytes written.
+ *
+ * Guarantees that the bytes in the destination beyond the number of
+ * bytes claimed as written by the second item of the return tuple
+ * are left unmodified.
+ *
+ * Not all code units are read if there isn't enough output space.
+ * Note that this method isn't designed for general streamability but for
+ * not allocating memory for the worst case up front. Specifically,
+ * if the input starts with or ends with an unpaired surrogate, those are
+ * replaced with the REPLACEMENT CHARACTER.
+ *
+ * Matches the semantics of `TextEncoder.encodeInto()` from the
+ * Encoding Standard.
+ *
+ * `src.size()` and `dst.size()` are passed to the C function by pointer as
+ * in/out values; the values it leaves there are what this function returns.
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ */
+inline std::tuple<size_t, size_t> convert_utf16_to_utf8_partial(
+ std::u16string_view src, gsl::span<char> dst) {
+ size_t src_read = src.size();
+ size_t dst_written = dst.size();
+ encoding_mem_convert_utf16_to_utf8_partial(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()),
+ &src_read, encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ &dst_written);
+ return {src_read, dst_written};
+}
+
+/**
+ * If the input is valid UTF-8 representing only Unicode code points from
+ * U+0000 to U+00FF, inclusive, converts the input into output that
+ * represents the value of each code point as the unsigned byte value of
+ * each output byte.
+ *
+ * If the input does not fulfill the condition stated above, this function
+ * panics if debug assertions are enabled (and fuzzing isn't) and otherwise
+ * does something that is memory-safe without any promises about any
+ * properties of the output. In particular, callers shouldn't assume the
+ * output to be the same across crate versions or CPU architectures and
+ * should not assume that non-ASCII input can't map to ASCII output.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * NOTE(review): the `reinterpret_cast<const char*>` below is a no-op, as
+ * `std::string_view::data()` already returns `const char*`.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above. Also
+ * panics if debug assertions are enabled (and fuzzing isn't) and the input
+ * is not in the range U+0000 to U+00FF, inclusive.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `dst` overlap.
+ */
+inline size_t convert_utf8_to_latin1_lossy(std::string_view src,
+ gsl::span<char> dst) {
+ return encoding_mem_convert_utf8_to_latin1_lossy(
+ encoding_rs::mem::detail::null_to_bogus<const char>(
+ reinterpret_cast<const char*>(src.data())),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
+ * with the REPLACEMENT CHARACTER.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer _plus one_.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * NOTE(review): the `reinterpret_cast<const char*>` below is a no-op, as
+ * `std::string_view::data()` already returns `const char*`.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline size_t convert_utf8_to_utf16(std::string_view src,
+ gsl::span<char16_t> dst) {
+ return encoding_mem_convert_utf8_to_utf16(
+ encoding_rs::mem::detail::null_to_bogus<const char>(
+ reinterpret_cast<const char*>(src.data())),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written or `std::nullopt` if the input was
+ * invalid. (A `SIZE_MAX` return value from the C function is what gets mapped
+ * to `std::nullopt`.)
+ *
+ * When the input was invalid, some output may have been written.
+ *
+ * Null `data()` pointers (e.g. from empty arguments) are swapped for
+ * non-null placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * NOTE(review): the `reinterpret_cast<const char*>` below is a no-op, as
+ * `std::string_view::data()` already returns `const char*`.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline std::optional<size_t> convert_utf8_to_utf16_without_replacement(
+ std::string_view src, gsl::span<char16_t> dst) {
+ size_t val = encoding_mem_convert_utf8_to_utf16_without_replacement(
+ encoding_rs::mem::detail::null_to_bogus<const char>(
+ reinterpret_cast<const char*>(src.data())),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()),
+ dst.size());
+ if (val == SIZE_MAX) {
+ return std::nullopt;
+ }
+ return val;
+}
+
+/**
+ * Copies ASCII from source to destination up to the first non-ASCII byte
+ * (or the end of the input if it is ASCII in its entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ *
+ * # Undefined behavior
+ *
+ * UB ensues if `src` and `dst` overlap.
+ */
+inline size_t copy_ascii_to_ascii(gsl::span<const char> src,
+ gsl::span<char> dst) {
+ return encoding_mem_copy_ascii_to_ascii(
+ encoding_rs::mem::detail::null_to_bogus<const char>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Copies ASCII from source to destination zero-extending it to UTF-16 up to
+ * the first non-ASCII byte (or the end of the input if it is ASCII in its
+ * entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of `char16_t`s written.
+ *
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline size_t copy_ascii_to_basic_latin(gsl::span<const char> src,
+ gsl::span<char16_t> dst) {
+ return encoding_mem_copy_ascii_to_basic_latin(
+ encoding_rs::mem::detail::null_to_bogus<const char>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char16_t>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Copies Basic Latin from source to destination narrowing it to ASCII up to
+ * the first non-Basic Latin code unit (or the end of the input if it is
+ * Basic Latin in its entirety).
+ *
+ * The length of the destination buffer must be at least the length of the
+ * source buffer.
+ *
+ * Returns the number of bytes written.
+ *
+ * Null `data()` pointers (e.g. from empty spans) are swapped for non-null
+ * placeholders by `detail::null_to_bogus()` before the call.
+ *
+ * # Panics
+ *
+ * Panics if the destination buffer is shorter than stated above.
+ */
+inline size_t copy_basic_latin_to_ascii(gsl::span<const char16_t> src,
+ gsl::span<char> dst) {
+ return encoding_mem_copy_basic_latin_to_ascii(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(src.data()),
+ src.size(), encoding_rs::mem::detail::null_to_bogus<char>(dst.data()),
+ dst.size());
+}
+
+/**
+ * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
+ *
+ * The buffer is modified in place. A null `data()` pointer (e.g. from an
+ * empty span) is swapped for a non-null placeholder by
+ * `detail::null_to_bogus()` before the call.
+ */
+inline void ensure_utf16_validity(gsl::span<char16_t> buffer) {
+ encoding_mem_ensure_utf16_validity(
+ encoding_rs::mem::detail::null_to_bogus<char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether the buffer is all-ASCII.
+ *
+ * May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_ascii(std::string_view buffer) {
+ return encoding_mem_is_ascii(
+ encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
+ * only ASCII characters).
+ *
+ * May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_ascii(std::u16string_view buffer) {
+ return encoding_mem_is_basic_latin(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether a scalar value triggers right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * # Undefined behavior
+ *
+ * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value.
+ */
+inline bool is_scalar_value_bidi(char32_t c) {
+ return encoding_mem_is_char_bidi(c);
+}
+
+/**
+ * Checks whether a UTF-16 buffer contains code points that trigger
+ * right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * Returns `true` if the input contains an RTL character or an unpaired
+ * high surrogate that could be the high half of an RTL character.
+ * Returns `false` if the input contains neither RTL characters nor
+ * unpaired high surrogates that could be higher halves of RTL characters.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_bidi(std::u16string_view buffer) {
+ return encoding_mem_is_utf16_bidi(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether a UTF-16 code unit triggers right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * Since supplementary-plane right-to-left blocks are identifiable from the
+ * high surrogate without examining the low surrogate, this function returns
+ * `true` for such high surrogates making the function suitable for handling
+ * supplementary-plane text without decoding surrogate pairs to scalar
+ * values. Obviously, such high surrogates are then reported as right-to-left
+ * even if actually unpaired.
+ */
+inline bool is_utf16_code_unit_bidi(char16_t u) {
+ return encoding_mem_is_utf16_code_unit_bidi(u);
+}
+
+/**
+ * Checks whether the buffer represents only code points less than or equal
+ * to U+00FF.
+ *
+ * May read the entire buffer even if it isn't all-Latin1. (I.e. the function
+ * is not guaranteed to fail fast.)
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_utf16_latin1(std::u16string_view buffer) {
+ return encoding_mem_is_utf16_latin1(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether a potentially-invalid UTF-8 buffer contains code points
+ * that trigger right-to-left processing.
+ *
+ * The check is done on a Unicode block basis without regard to assigned
+ * vs. unassigned code points in the block. Hebrew presentation forms in
+ * the Alphabetic Presentation Forms block are treated as if they formed
+ * a block on their own (i.e. it is treated as right-to-left). Additionally,
+ * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+ * for. Control characters that are technically bidi controls but do not
+ * cause right-to-left behavior without the presence of right-to-left
+ * characters or right-to-left controls are not checked for. As a special
+ * case, U+FEFF is excluded from Arabic Presentation Forms-B.
+ *
+ * Returns `true` if the input is invalid UTF-8 or the input contains an
+ * RTL character. Returns `false` if the input is valid UTF-8 and contains
+ * no RTL characters.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_bidi(std::string_view buffer) {
+ return encoding_mem_is_utf8_bidi(
+ encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Checks whether the buffer is valid UTF-8 representing only code points
+ * less than or equal to U+00FF.
+ *
+ * Fails fast. (I.e. returns before having read the whole buffer if UTF-8
+ * invalidity or code points above U+00FF are discovered.)
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline bool is_utf8_latin1(std::string_view buffer) {
+ return encoding_mem_is_utf8_latin1(
+ encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Returns the index of the first unpaired surrogate or, if the input is
+ * valid UTF-16 in its entirety, the length of the input.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline size_t utf16_valid_up_to(std::u16string_view buffer) {
+ return encoding_mem_utf16_valid_up_to(
+ encoding_rs::mem::detail::null_to_bogus<const char16_t>(buffer.data()),
+ buffer.size());
+}
+
+/**
+ * Returns the index of the first byte that starts an invalid byte
+ * sequence or a non-Latin1 byte sequence, or the length of the
+ * string if there are neither.
+ *
+ * A null `data()` pointer (e.g. from a default-constructed view) is swapped
+ * for a non-null placeholder by `detail::null_to_bogus()` before the call.
+ */
+inline size_t utf8_latin1_up_to(std::string_view buffer) {
+ return encoding_mem_utf8_latin1_up_to(
+ encoding_rs::mem::detail::null_to_bogus<const char>(buffer.data()),
+ buffer.size());
+}
+
+}; // namespace mem
+}; // namespace encoding_rs
+
+#endif // encoding_rs_mem_cpp_h_
diff --git a/third_party/rust/encoding_c_mem/src/lib.rs b/third_party/rust/encoding_c_mem/src/lib.rs
new file mode 100644
index 0000000000..e5f31c1be0
--- /dev/null
+++ b/third_party/rust/encoding_c_mem/src/lib.rs
@@ -0,0 +1,825 @@
+// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! FFI bindings for `encoding_rs::mem`.
+//!
+//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
+//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
+//! in-memory encoding is sometimes used as a storage optimization of text
+//! when UTF-16 indexing and length semantics are exposed.
+
+use encoding_rs::mem::Latin1Bidi;
+
+/// Checks whether the buffer is all-ASCII.
+///
+/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+/// is not guaranteed to fail fast.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_ascii(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_ascii(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
+/// only ASCII characters).
+///
+/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
+/// is not guaranteed to fail fast.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether the buffer is valid UTF-8 representing only code points
+/// less than or equal to U+00FF.
+///
+/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
+/// invalidity or code points above U+00FF are discovered.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_utf8_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether the buffer represents only code points less than or equal
+/// to U+00FF.
+///
+/// Fails fast. (I.e. returns before having read the whole buffer if code
+/// points above U+00FF are discovered.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block,
+/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `len`
+/// does not contain valid UTF-8. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_str_latin1(::std::str::from_utf8_unchecked(
+ ::std::slice::from_raw_parts(buffer, len),
+ ))
+}
+
+/// Checks whether the buffer represents only code points less than or equal
+/// to U+00FF.
+///
+/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
+/// is not guaranteed to fail fast.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_utf16_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether a potentially-invalid UTF-8 buffer contains code points
+/// that trigger right-to-left processing.
+///
+/// The check is done on a Unicode block basis without regard to assigned
+/// vs. unassigned code points in the block. Hebrew presentation forms in
+/// the Alphabetic Presentation Forms block are treated as if they formed
+/// a block on their own (i.e. it is treated as right-to-left). Additionally,
+/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+/// for. Control characters that are technically bidi controls but do not
+/// cause right-to-left behavior without the presence of right-to-left
+/// characters or right-to-left controls are not checked for. As a special
+/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
+///
+/// Returns `true` if the input is invalid UTF-8 or the input contains an
+/// RTL character. Returns `false` if the input is valid UTF-8 and contains
+/// no RTL characters.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf8_bidi(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_utf8_bidi(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether a valid UTF-8 buffer contains code points that trigger
+/// right-to-left processing.
+///
+/// The check is done on a Unicode block basis without regard to assigned
+/// vs. unassigned code points in the block. Hebrew presentation forms in
+/// the Alphabetic Presentation Forms block are treated as if they formed
+/// a block on their own (i.e. it is treated as right-to-left). Additionally,
+/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+/// for. Control characters that are technically bidi controls but do not
+/// cause right-to-left behavior without the presence of right-to-left
+/// characters or right-to-left controls are not checked for. As a special
+/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block,
+/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `len`
+/// does not contain valid UTF-8. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_str_bidi(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_str_bidi(::std::str::from_utf8_unchecked(
+ ::std::slice::from_raw_parts(buffer, len),
+ ))
+}
+
+/// Checks whether a UTF-16 buffer contains code points that trigger
+/// right-to-left processing.
+///
+/// The check is done on a Unicode block basis without regard to assigned
+/// vs. unassigned code points in the block. Hebrew presentation forms in
+/// the Alphabetic Presentation Forms block are treated as if they formed
+/// a block on their own (i.e. it is treated as right-to-left). Additionally,
+/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+/// for. Control characters that are technically bidi controls but do not
+/// cause right-to-left behavior without the presence of right-to-left
+/// characters or right-to-left controls are not checked for. As a special
+/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
+///
+/// Returns `true` if the input contains an RTL character or an unpaired
+/// high surrogate that could be the high half of an RTL character.
+/// Returns `false` if the input contains neither RTL characters nor
+/// unpaired high surrogates that could be higher halves of RTL characters.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_bidi(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_utf16_bidi(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether a scalar value triggers right-to-left processing.
+///
+/// The check is done on a Unicode block basis without regard to assigned
+/// vs. unassigned code points in the block. Hebrew presentation forms in
+/// the Alphabetic Presentation Forms block are treated as if they formed
+/// a block on their own (i.e. it is treated as right-to-left). Additionally,
+/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+/// for. Control characters that are technically bidi controls but do not
+/// cause right-to-left behavior without the presence of right-to-left
+/// characters or right-to-left controls are not checked for. As a special
+/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
+///
+/// # Undefined behavior
+///
+/// Undefined behavior ensues if `c` is not a valid Unicode Scalar Value.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_char_bidi(c: char) -> bool {
+ encoding_rs::mem::is_char_bidi(c)
+}
+
+/// Checks whether a UTF-16 code unit triggers right-to-left processing.
+///
+/// The check is done on a Unicode block basis without regard to assigned
+/// vs. unassigned code points in the block. Hebrew presentation forms in
+/// the Alphabetic Presentation Forms block are treated as if they formed
+/// a block on their own (i.e. it is treated as right-to-left). Additionally,
+/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
+/// for. Control characters that are technically bidi controls but do not
+/// cause right-to-left behavior without the presence of right-to-left
+/// characters or right-to-left controls are not checked for. As a special
+/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
+///
+/// Since supplementary-plane right-to-left blocks are identifiable from the
+/// high surrogate without examining the low surrogate, this function returns
+/// `true` for such high surrogates making the function suitable for handling
+/// supplementary-plane text without decoding surrogate pairs to scalar
+/// values. Obviously, such high surrogates are then reported as right-to-left
+/// even if actually unpaired.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_code_unit_bidi(u: u16) -> bool {
+ encoding_rs::mem::is_utf16_code_unit_bidi(u)
+}
+
+/// Checks whether a potentially invalid UTF-8 buffer contains code points
+/// that trigger right-to-left processing or is all-Latin1.
+///
+/// Possibly more efficient than performing the checks separately.
+///
+/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
+/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
+/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_check_utf8_for_latin1_and_bidi(
+ buffer: *const u8,
+ len: usize,
+) -> Latin1Bidi {
+ encoding_rs::mem::check_utf8_for_latin1_and_bidi(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Checks whether a valid UTF-8 buffer contains code points
+/// that trigger right-to-left processing or is all-Latin1.
+///
+/// Possibly more efficient than performing the checks separately.
+///
+/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
+/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
+/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block,
+/// if `buffer` is `NULL`, or if the memory designated by `buffer` and `len`
+/// does not contain valid UTF-8. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_check_str_for_latin1_and_bidi(
+ buffer: *const u8,
+ len: usize,
+) -> Latin1Bidi {
+ // The bytes are not validated here; well-formed UTF-8 is the caller's obligation.
+ encoding_rs::mem::check_str_for_latin1_and_bidi(::std::str::from_utf8_unchecked(
+ ::std::slice::from_raw_parts(buffer, len),
+ ))
+}
+
+/// Checks whether a potentially invalid UTF-16 buffer contains code points
+/// that trigger right-to-left processing or is all-Latin1.
+///
+/// Possibly more efficient than performing the checks separately.
+///
+/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
+/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
+/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_check_utf16_for_latin1_and_bidi(
+ buffer: *const u16,
+ len: usize,
+) -> Latin1Bidi {
+ encoding_rs::mem::check_utf16_for_latin1_and_bidi(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
+/// with the REPLACEMENT CHARACTER.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer _plus one_.
+///
+/// Returns the number of `u16`s written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) -> usize {
+ // Rebuild slices from the caller's pointer/length pairs (their validity is
+ // the caller's contract, see above) and delegate to `encoding_rs::mem`.
+ encoding_rs::mem::convert_utf8_to_utf16(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// Converts valid UTF-8 to valid UTF-16.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of `u16`s written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL`, if the two memory blocks overlap, or if the
+/// buffer designated by `src` and `src_len` does not contain valid UTF-8. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_str_to_utf16(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) -> usize {
+ // The source bytes are not validated here; well-formed UTF-8 is the
+ // caller's obligation.
+ encoding_rs::mem::convert_str_to_utf16(
+ ::std::str::from_utf8_unchecked(::std::slice::from_raw_parts(src, src_len)),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of `u16`s written or `SIZE_MAX` if the input was invalid.
+///
+/// When the input was invalid, some output may have been written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16_without_replacement(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) -> usize {
+ // When the wrapped call yields no count (invalid input), report
+ // `usize::MAX` instead, which is the `SIZE_MAX` sentinel documented above.
+ encoding_rs::mem::convert_utf8_to_utf16_without_replacement(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ ).unwrap_or(::std::usize::MAX)
+}
+
+/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+/// with the REPLACEMENT CHARACTER with potentially insufficient output
+/// space.
+///
+/// On entry, `*src_len` and `*dst_len` hold the lengths of the source and
+/// destination buffers. On return, the number of code units read has been
+/// written into `*src_len` and the number of bytes written into `*dst_len`.
+///
+/// Guarantees that the bytes in the destination beyond the number of
+/// bytes claimed as written (the value stored into `*dst_len`) are left
+/// unmodified.
+///
+/// Not all code units are read if there isn't enough output space.
+///
+/// Note that this method isn't designed for general streamability but for
+/// not allocating memory for the worst case up front. Specifically,
+/// if the input starts with or ends with an unpaired surrogate, those are
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// Matches the semantics of `TextEncoder.encodeInto()` from the
+/// Encoding Standard.
+///
+/// # Safety
+///
+/// If you want to convert into a `&mut str`, use
+/// `convert_utf16_to_str_partial()` instead of using this function
+/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `*src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `*dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL`, if `src_len` or `dst_len` is not a valid
+/// pointer to a `usize`, or if the two memory blocks overlap. (If
+/// `*src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `*dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8_partial(
+ src: *const u16,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+) {
+ // `src_len` and `dst_len` are in/out parameters: they are read as the
+ // buffer lengths first and then overwritten with the read/written counts.
+ let (read, written) = encoding_rs::mem::convert_utf16_to_utf8_partial(
+ ::std::slice::from_raw_parts(src, *src_len),
+ ::std::slice::from_raw_parts_mut(dst, *dst_len),
+ );
+ *src_len = read;
+ *dst_len = written;
+}
+
+/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+/// with the REPLACEMENT CHARACTER.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer times three.
+///
+/// Returns the number of bytes written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Safety
+///
+/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
+/// instead of using this function together with the `unsafe` method
+/// `as_bytes_mut()` on `&mut str`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(
+ src: *const u16,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ // Rebuild slices from the caller's pointer/length pairs (their validity is
+ // the caller's contract, see above) and delegate to `encoding_rs::mem`.
+ encoding_rs::mem::convert_utf16_to_utf8(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// Converts bytes whose unsigned value is interpreted as Unicode code point
+/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// The number of `u16`s written equals the length of the source buffer.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) {
+ // Nothing is returned: per the contract above, the number of code units
+ // written always equals `src_len`.
+ encoding_rs::mem::convert_latin1_to_utf16(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ );
+}
+
+/// Converts bytes whose unsigned value is interpreted as Unicode code point
+/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
+/// output space.
+///
+/// On entry, `*src_len` and `*dst_len` hold the lengths of the source and
+/// destination buffers. On return, the number of code units read has been
+/// written into `*src_len` and the number of bytes written into `*dst_len`.
+///
+/// If the output isn't large enough, not all input is consumed.
+///
+/// # Safety
+///
+/// If you want to convert into a `&mut str`, use
+/// `encoding_mem_convert_latin1_to_str_partial()` instead of using this function
+/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `*src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `*dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL`, if `src_len` or `dst_len` is not a valid
+/// pointer to a `usize`, or if the two memory blocks overlap. (If
+/// `*src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `*dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf8_partial(
+ src: *const u8,
+ src_len: *mut usize,
+ dst: *mut u8,
+ dst_len: *mut usize,
+) {
+ // `src_len` and `dst_len` are in/out parameters: they are read as the
+ // buffer lengths first and then overwritten with the read/written counts.
+ let (read, written) = encoding_rs::mem::convert_latin1_to_utf8_partial(
+ ::std::slice::from_raw_parts(src, *src_len),
+ ::std::slice::from_raw_parts_mut(dst, *dst_len),
+ );
+ *src_len = read;
+ *dst_len = written;
+}
+
+/// Converts bytes whose unsigned value is interpreted as Unicode code point
+/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer times two.
+///
+/// Returns the number of bytes written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Safety
+///
+/// Note that this function may write garbage beyond the number of bytes
+/// indicated by the return value, so using a `&mut str` interpreted as
+/// `&mut [u8]` as the destination is not safe. If you want to convert into
+/// a `&mut str`, use `convert_latin1_to_str()` instead of this function.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf8(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ encoding_rs::mem::convert_latin1_to_utf8(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// If the input is valid UTF-8 representing only Unicode code points from
+/// U+0000 to U+00FF, inclusive, converts the input into output that
+/// represents the value of each code point as the unsigned byte value of
+/// each output byte.
+///
+/// If the input does not fulfill the condition stated above, this function
+/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
+/// does something that is memory-safe without any promises about any
+/// properties of the output. In particular, callers shouldn't assume the
+/// output to be the same across crate versions or CPU architectures and
+/// should not assume that non-ASCII input can't map to ASCII output.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of bytes written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// Also panics if debug assertions are enabled (and not fuzzing) and the
+/// input is not in the range U+0000 to U+00FF, inclusive.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ encoding_rs::mem::convert_utf8_to_latin1_lossy(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// If the input is valid UTF-16 representing only Unicode code points from
+/// U+0000 to U+00FF, inclusive, converts the input into output that
+/// represents the value of each code point as the unsigned byte value of
+/// each output byte.
+///
+/// If the input does not fulfill the condition stated above, does something
+/// that is memory-safe without any promises about any properties of the
+/// output and will probably assert in debug builds in future versions.
+/// In particular, callers shouldn't assume the output to be the same across
+/// crate versions or CPU architectures and should not assume that non-ASCII
+/// input can't map to ASCII output.
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// The number of bytes written equals the length of the source buffer.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// (Will probably also panic in future versions if debug assertions are
+/// enabled (and not fuzzing) and the input is not in the range U+0000 to
+/// U+00FF, inclusive.)
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(
+ src: *const u16,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) {
+ encoding_rs::mem::convert_utf16_to_latin1_lossy(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ );
+}
+
+/// Returns the index of the first unpaired surrogate or, if the input is
+/// valid UTF-16 in its entirety, the length of the input.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_utf16_valid_up_to(buffer: *const u16, len: usize) -> usize {
+ encoding_rs::mem::utf16_valid_up_to(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Returns the index of the first byte that starts an invalid byte
+/// sequence or a non-Latin1 byte sequence, or the length of the
+/// string if there are neither.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_utf8_latin1_up_to(buffer: *const u8, len: usize) -> usize {
+ encoding_rs::mem::utf8_latin1_up_to(::std::slice::from_raw_parts(buffer, len))
+}
+
+/// Returns the index of the first byte that starts a non-Latin1 byte
+/// sequence, or the length of the string if there are none.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block,
+/// if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8.
+/// (If `len` is `0`, `buffer` may be bogus but still has to be non-`NULL`
+/// and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_str_latin1_up_to(buffer: *const u8, len: usize) -> usize {
+ // The bytes are not validated here; well-formed UTF-8 is the caller's obligation.
+ encoding_rs::mem::str_latin1_up_to(::std::str::from_utf8_unchecked(
+ ::std::slice::from_raw_parts(buffer, len),
+ ))
+}
+
+/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
+///
+/// The buffer is modified in place; nothing is returned.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `buffer` and `len` don't designate a valid memory block
+/// or if `buffer` is `NULL`. (If `len` is `0`, `buffer` may be bogus but
+/// still has to be non-`NULL` and aligned.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_ensure_utf16_validity(buffer: *mut u16, len: usize) {
+ encoding_rs::mem::ensure_utf16_validity(::std::slice::from_raw_parts_mut(buffer, len));
+}
+
+/// Copies ASCII from source to destination up to the first non-ASCII byte
+/// (or the end of the input if it is ASCII in its entirety).
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of bytes written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_copy_ascii_to_ascii(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ // Rebuild slices from the caller's pointer/length pairs (their validity is
+ // the caller's contract, see above) and delegate to `encoding_rs::mem`.
+ encoding_rs::mem::copy_ascii_to_ascii(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
+/// the first non-ASCII byte (or the end of the input if it is ASCII in its
+/// entirety).
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of `u16`s written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_copy_ascii_to_basic_latin(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) -> usize {
+ // Rebuild slices from the caller's pointer/length pairs (their validity is
+ // the caller's contract, see above) and delegate to `encoding_rs::mem`.
+ encoding_rs::mem::copy_ascii_to_basic_latin(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+/// Copies Basic Latin from source to destination narrowing it to ASCII up to
+/// the first non-Basic Latin code unit (or the end of the input if it is
+/// Basic Latin in its entirety).
+///
+/// The length of the destination buffer must be at least the length of the
+/// source buffer.
+///
+/// Returns the number of bytes written.
+///
+/// # Panics
+///
+/// Panics if the destination buffer is shorter than stated above.
+///
+/// # Undefined behavior
+///
+/// UB ensues if `src` and `src_len` don't designate a valid memory block, if
+/// `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
+/// block, if `dst` is `NULL` or if the two memory blocks overlap. (If
+/// `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
+/// aligned. Likewise for `dst` and `dst_len`.)
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_copy_basic_latin_to_ascii(
+ src: *const u16,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ // Rebuild slices from the caller's pointer/length pairs (their validity is
+ // the caller's contract, see above) and delegate to `encoding_rs::mem`.
+ encoding_rs::mem::copy_basic_latin_to_ascii(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}