summaryrefslogtreecommitdiffstats
path: root/vendor/futf
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--vendor/futf/.cargo-checksum.json1
-rw-r--r--vendor/futf/Cargo.toml24
-rw-r--r--vendor/futf/LICENSE-APACHE201
-rw-r--r--vendor/futf/LICENSE-MIT25
-rw-r--r--vendor/futf/README.md18
-rw-r--r--vendor/futf/src/lib.rs248
-rw-r--r--vendor/futf/src/test.rs270
7 files changed, 787 insertions, 0 deletions
diff --git a/vendor/futf/.cargo-checksum.json b/vendor/futf/.cargo-checksum.json
new file mode 100644
index 000000000..b42896739
--- /dev/null
+++ b/vendor/futf/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"969dea475ecafd7cdc49b65403f6dcc241ab63e50df8fff054480c9816857b03","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"60a7062291b01ba068f300612cdbdc20382ac1d4934f07bcdd7167c15299f309","README.md":"933cfbcce46af48e2dbaa75f042df2143d726b983a546f57ca0eb5fb93e220b5","src/lib.rs":"90628646d656c57504b4e7b5c3f3986fffcac31e710e7cc212fb63d836d89833","src/test.rs":"0ca773b918809aeb73f75c75c0640d78cdc824efd995f136603a567239578c47"},"package":"df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"} \ No newline at end of file
diff --git a/vendor/futf/Cargo.toml b/vendor/futf/Cargo.toml
new file mode 100644
index 000000000..e57e71bbc
--- /dev/null
+++ b/vendor/futf/Cargo.toml
@@ -0,0 +1,24 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+name = "futf"
+version = "0.1.5"
+authors = ["Keegan McAllister <kmcallister@mozilla.com>"]
+description = "Handling fragments of UTF-8"
+license = "MIT / Apache-2.0"
+repository = "https://github.com/servo/futf"
+
+[dependencies.mac]
+version = "0.1.0"
+
+[dependencies.new_debug_unreachable]
+version = "1.0.2"
diff --git a/vendor/futf/LICENSE-APACHE b/vendor/futf/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/vendor/futf/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/vendor/futf/LICENSE-MIT b/vendor/futf/LICENSE-MIT
new file mode 100644
index 000000000..2e0fee105
--- /dev/null
+++ b/vendor/futf/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2015 Keegan McAllister
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/vendor/futf/README.md b/vendor/futf/README.md
new file mode 100644
index 000000000..325b2e74d
--- /dev/null
+++ b/vendor/futf/README.md
@@ -0,0 +1,18 @@
+# futf
+
+[![Build Status](https://travis-ci.org/servo/futf.svg?branch=master)](https://travis-ci.org/servo/futf)
+
+futf is a library for *flexible* UTF-8, or UTF-8 *fragments*. I don't know.
+Check out the [API documentation](http://doc.servo.org/futf/index.html).
+
+Anyway, it takes an index into a byte buffer and tells you things about the
+UTF-8 codepoint containing that byte. It can deal with incomplete codepoint
+prefixes / suffixes at the ends of a buffer, which is useful for incremental
+I/O. It can also handle UTF-16 surrogate code units encoded in the manner of
+[CESU-8][] or [WTF-8][].
+
+This is a low-level helper for [tendril][] that might be useful more generally.
+
+[CESU-8]: http://www.unicode.org/reports/tr26/
+[WTF-8]: http://simonsapin.github.io/wtf-8/
+[tendril]: https://github.com/kmcallister/tendril
diff --git a/vendor/futf/src/lib.rs b/vendor/futf/src/lib.rs
new file mode 100644
index 000000000..4b94a35a5
--- /dev/null
+++ b/vendor/futf/src/lib.rs
@@ -0,0 +1,248 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![cfg_attr(test, feature(test))]
+
+#[macro_use]
+extern crate debug_unreachable;
+
+#[macro_use]
+extern crate mac;
+
+#[cfg(test)]
+extern crate test as std_test;
+
+use std::{slice, char};
+
+/// Meaning of a complete or partial UTF-8 codepoint.
+///
+/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
+/// `Suffix` may in reality have no valid completion.
+///
+/// The derived `PartialOrd` / `Ord` compare variants in declaration order,
+/// so reordering the variants would change comparison results.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
+pub enum Meaning {
+ /// We found a whole codepoint.
+ Whole(char),
+
+ /// We found something that isn't a valid Unicode codepoint, but
+ /// it *would* correspond to a UTF-16 leading surrogate code unit,
+ /// i.e. a value in the range `U+D800` - `U+DBFF`.
+ ///
+ /// The argument is the code unit's 10-bit index within that range
+ /// (the decoded value minus `0xD800`).
+ ///
+ /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
+ LeadSurrogate(u16),
+
+ /// We found something that isn't a valid Unicode codepoint, but
+ /// it *would* correspond to a UTF-16 trailing surrogate code unit,
+ /// i.e. a value in the range `U+DC00` - `U+DFFF`.
+ ///
+ /// The argument is the code unit's 10-bit index within that range
+ /// (the decoded value minus `0xDC00`).
+ ///
+ /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
+ TrailSurrogate(u16),
+
+ /// We found only a prefix of a codepoint before the buffer ended.
+ ///
+ /// Includes the number of additional bytes needed, i.e. the sequence
+ /// length announced by the start byte minus the bytes available.
+ Prefix(usize),
+
+ /// We found only a suffix of a codepoint before running off the
+ /// start of the buffer.
+ ///
+ /// Up to 3 more bytes may be needed; they would precede the bytes
+ /// that were found, since a sequence is at most 4 bytes long.
+ Suffix,
+}
+
+/// Represents a complete or partial UTF-8 codepoint, as reported by
+/// `classify` for a given buffer and index `idx`.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
+pub struct Codepoint<'a> {
+ /// The bytes that make up the partial or full codepoint. This is a
+ /// sub-slice borrowed from the buffer that was classified.
+ ///
+ /// For a `Suffix` this depends on `idx`. We don't scan forward
+ /// for additional continuation bytes after the reverse scan
+ /// failed to locate a multibyte sequence start.
+ pub bytes: &'a [u8],
+
+ /// Start of the codepoint in the buffer, expressed as an offset
+ /// back from `idx`. It is 0 when `idx` addresses the first byte of
+ /// `bytes`.
+ pub rewind: usize,
+
+ /// Meaning of the partial or full codepoint.
+ pub meaning: Meaning,
+}
+
+/// Role a single byte can play inside a UTF-8 stream.
+#[derive(Debug, PartialEq, Eq)]
+enum Byte {
+    /// A one-byte (ASCII) codepoint.
+    Ascii,
+    /// First byte of a multi-byte sequence; carries the total sequence length.
+    Start(usize),
+    /// Continuation byte of a multi-byte sequence.
+    Cont,
+}
+
+impl Byte {
+    /// Classifies `x` by its leading bits.
+    ///
+    /// Returns `None` for `0xF8..=0xFF`, which cannot start a sequence of at
+    /// most 4 bytes and are neither ASCII nor continuation bytes.
+    #[inline(always)]
+    fn classify(x: u8) -> Option<Byte> {
+        match x {
+            0x00..=0x7F => Some(Byte::Ascii),    // 0xxxxxxx
+            0x80..=0xBF => Some(Byte::Cont),     // 10xxxxxx
+            0xC0..=0xDF => Some(Byte::Start(2)), // 110xxxxx
+            0xE0..=0xEF => Some(Byte::Start(3)), // 1110xxxx
+            0xF0..=0xF7 => Some(Byte::Start(4)), // 11110xxx
+            _ => None,                           // 11111xxx
+        }
+    }
+}
+
+/// Returns `true` when every byte of `buf` is a UTF-8 continuation byte.
+/// An empty slice trivially satisfies this.
+#[inline(always)]
+fn all_cont(buf: &[u8]) -> bool {
+    for &byte in buf {
+        if Byte::classify(byte) != Some(Byte::Cont) {
+            return false;
+        }
+    }
+    true
+}
+
+// Decodes one multi-byte sequence into its `Meaning`.
+//
+// NOTE: The caller must pass a syntactically valid multi-byte UTF-8 sequence:
+// a starting byte followed by the correct number of continuation bytes, so
+// `buf.len()` is 2, 3 or 4. Bytes are read without bounds checks.
+//
+// Returns `None` for overlong encodings and for values that are neither a
+// `char` nor a surrogate (e.g. above U+10FFFF).
+#[inline(always)]
+unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
+    debug_assert!(buf.len() >= 2);
+    debug_assert!(buf.len() <= 4);
+
+    let value = match buf.len() {
+        2 => {
+            let b0 = (*buf.get_unchecked(0) & 0x1F) as u32;
+            let b1 = (*buf.get_unchecked(1) & 0x3F) as u32;
+            let value = b0 << 6 | b1;
+            if value < 0x80 {
+                return None; // Overlong
+            }
+            value
+        }
+        3 => {
+            let b0 = (*buf.get_unchecked(0) & 0x0F) as u32;
+            let b1 = (*buf.get_unchecked(1) & 0x3F) as u32;
+            let b2 = (*buf.get_unchecked(2) & 0x3F) as u32;
+            let value = b0 << 12 | b1 << 6 | b2;
+            if value <= 0x07FF {
+                return None; // Overlong
+            }
+            // Surrogates are not `char`s; report them with their 10-bit index.
+            if value >= 0xD800 && value <= 0xDBFF {
+                return Some(Meaning::LeadSurrogate((value - 0xD800) as u16));
+            }
+            if value >= 0xDC00 && value <= 0xDFFF {
+                return Some(Meaning::TrailSurrogate((value - 0xDC00) as u16));
+            }
+            value
+        }
+        4 => {
+            let b0 = (*buf.get_unchecked(0) & 0x07) as u32;
+            let b1 = (*buf.get_unchecked(1) & 0x3F) as u32;
+            let b2 = (*buf.get_unchecked(2) & 0x3F) as u32;
+            let b3 = (*buf.get_unchecked(3) & 0x3F) as u32;
+            let value = b0 << 18 | b1 << 12 | b2 << 6 | b3;
+            if value < 0x1_0000 {
+                return None; // Overlong
+            }
+            value
+        }
+        _ => debug_unreachable!(),
+    };
+
+    // `from_u32` rejects anything that is not a Unicode scalar value.
+    char::from_u32(value).map(Meaning::Whole)
+}
+
+/// Returns `buf[start .. start + new_len]` without bounds checks in release
+/// builds.
+///
+/// The caller must guarantee `start <= buf.len()` and
+/// `new_len <= buf.len() - start`; both are only verified by debug assertions.
+#[inline(always)]
+unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
+    debug_assert!(start <= buf.len());
+    debug_assert!(new_len <= (buf.len() - start));
+    let first = buf.as_ptr().add(start);
+    slice::from_raw_parts(first, new_len)
+}
+
+// Unwraps an `Option` expression: evaluates to the contained value, or makes
+// the enclosing function return `None`. Delegates to `unwrap_or_return!`,
+// which is presumably provided by the `mac` crate -- TODO confirm.
+macro_rules! otry {
+ ($x:expr) => { unwrap_or_return!($x, None) }
+}
+
+/// Describes the UTF-8 codepoint containing the byte at index `idx` within
+/// `buf`.
+///
+/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
+/// in the vicinity of `idx`. A continuation byte that lies past the end of
+/// the sequence opened by the nearest preceding start byte (for example
+/// index 2 of `b"\xC5\x91\x91"`) is invalid and also yields `None`.
+///
+/// A sequence cut short by the end of `buf` is reported as `Meaning::Prefix`;
+/// continuation bytes that run back to the start of `buf` without a start
+/// byte are reported as `Meaning::Suffix`.
+#[inline]
+pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
+    if idx >= buf.len() {
+        return None;
+    }
+
+    // SAFETY: `idx < buf.len()` was checked above. Every later index is
+    // either `idx` or a smaller `start`, and every sub-slice is requested
+    // only after comparing its length against the bytes available.
+    unsafe {
+        let x = *buf.get_unchecked(idx);
+        match otry!(Byte::classify(x)) {
+            Byte::Ascii => Some(Codepoint {
+                bytes: unsafe_slice(buf, idx, 1),
+                rewind: 0,
+                meaning: Meaning::Whole(x as char),
+            }),
+            Byte::Start(n) => {
+                let avail = buf.len() - idx;
+                if avail >= n {
+                    let bytes = unsafe_slice(buf, idx, n);
+                    if !all_cont(unsafe_slice(bytes, 1, n-1)) {
+                        return None;
+                    }
+                    let meaning = otry!(decode(bytes));
+                    Some(Codepoint {
+                        bytes: bytes,
+                        rewind: 0,
+                        meaning: meaning,
+                    })
+                } else {
+                    // The buffer ends before the sequence does.
+                    Some(Codepoint {
+                        bytes: unsafe_slice(buf, idx, avail),
+                        rewind: 0,
+                        meaning: Meaning::Prefix(n - avail),
+                    })
+                }
+            },
+            Byte::Cont => {
+                // Scan backwards for the start byte. `checked` counts the
+                // bytes stepped over, so it always equals `idx - start`.
+                let mut start = idx;
+                let mut checked = 0;
+                loop {
+                    if start == 0 {
+                        // Whoops, fell off the beginning.
+                        return Some(Codepoint {
+                            bytes: unsafe_slice(buf, 0, idx + 1),
+                            rewind: idx,
+                            meaning: Meaning::Suffix,
+                        });
+                    }
+
+                    start -= 1;
+                    checked += 1;
+                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
+                        Byte::Cont => (),
+                        Byte::Start(n) => {
+                            // The sequence covers `start .. start + n`. If it
+                            // ends at or before `idx`, the byte at `idx` is a
+                            // stray continuation byte and belongs to no
+                            // codepoint.
+                            if checked >= n {
+                                return None;
+                            }
+
+                            let avail = buf.len() - start;
+                            if avail >= n {
+                                let bytes = unsafe_slice(buf, start, n);
+                                // Bytes before `idx` were already seen to be
+                                // continuation bytes; verify the rest.
+                                if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
+                                    return None;
+                                }
+                                let meaning = otry!(decode(bytes));
+                                return Some(Codepoint {
+                                    bytes: bytes,
+                                    rewind: idx - start,
+                                    meaning: meaning,
+                                });
+                            } else {
+                                return Some(Codepoint {
+                                    bytes: unsafe_slice(buf, start, avail),
+                                    rewind: idx - start,
+                                    meaning: Meaning::Prefix(n - avail),
+                                });
+                            }
+                        }
+                        _ => return None,
+                    }
+
+                    if idx - start >= 3 {
+                        // We looked at 3 bytes before a continuation byte
+                        // and didn't find a start byte.
+                        return None;
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test;
diff --git a/vendor/futf/src/test.rs b/vendor/futf/src/test.rs
new file mode 100644
index 000000000..f8e0c9387
--- /dev/null
+++ b/vendor/futf/src/test.rs
@@ -0,0 +1,270 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use super::{Meaning, Byte, classify, decode, all_cont};
+
+use std::borrow::ToOwned;
+use std::io::Write;
+use std_test::Bencher;
+
+/// Every possible byte value falls into the expected class.
+#[test]
+fn classify_all_bytes() {
+    // (first byte, last byte, expected class) -- together the rows cover 0x00..=0xFF.
+    let table = [
+        (0x00u8, 0x7Fu8, Some(Byte::Ascii)),
+        (0x80, 0xBF, Some(Byte::Cont)),
+        (0xC0, 0xDF, Some(Byte::Start(2))),
+        (0xE0, 0xEF, Some(Byte::Start(3))),
+        (0xF0, 0xF7, Some(Byte::Start(4))),
+        (0xF8, 0xFF, None),
+    ];
+
+    for &(first, last, ref expected) in table.iter() {
+        for byte in first..=last {
+            assert_eq!(&Byte::classify(byte), expected);
+        }
+    }
+}
+
+/// `all_cont` accepts only slices made up entirely of continuation bytes.
+#[test]
+fn test_all_cont() {
+    let accepted: &[&[u8]] = &[
+        b"",
+        b"\x80",
+        b"\xBF",
+        b"\x80\xBF\x80\xBF",
+    ];
+    for bytes in accepted {
+        assert!(all_cont(bytes));
+    }
+
+    // A single non-continuation byte anywhere must cause rejection.
+    let rejected: &[&[u8]] = &[
+        b"z",
+        b"\xC0\xBF",
+        b"\xFF",
+        b"\x80\xBFz\x80\xBF",
+        b"\x80\xBF\xC0\x80\xBF",
+        b"\x80\xBF\xFF\x80\xBF",
+        b"\x80\xBF\x80\xBFz",
+        b"\x80\xBF\x80\xBF\xC0",
+        b"z\x80\xBF\x80\xBF",
+        b"\xC0\x80\xBF\x80\xBF",
+    ];
+    for bytes in rejected {
+        assert!(!all_cont(bytes));
+    }
+}
+
+/// `decode` maps well-formed sequences to their meaning and rejects overlong
+/// or out-of-range ones.
+#[test]
+fn test_decode() {
+    // Every input below is a start byte followed by the right number of
+    // continuation bytes, which is what `decode` requires.
+    fn expect(expected: Option<Meaning>, bytes: &[u8]) {
+        assert_eq!(expected, unsafe { decode(bytes) });
+    }
+
+    expect(Some(Meaning::Whole('ő')), b"\xC5\x91");
+    expect(Some(Meaning::Whole('\u{a66e}')), b"\xEA\x99\xAE");
+    expect(Some(Meaning::Whole('\u{1f4a9}')), b"\xF0\x9F\x92\xA9");
+    expect(Some(Meaning::Whole('\u{10ffff}')), b"\xF4\x8F\xBF\xBF");
+
+    expect(Some(Meaning::LeadSurrogate(0x0000)), b"\xED\xA0\x80");
+    expect(Some(Meaning::LeadSurrogate(0x0001)), b"\xED\xA0\x81");
+    expect(Some(Meaning::LeadSurrogate(0x03FE)), b"\xED\xAF\xBE");
+    expect(Some(Meaning::LeadSurrogate(0x03FF)), b"\xED\xAF\xBF");
+
+    expect(Some(Meaning::TrailSurrogate(0x0000)), b"\xED\xB0\x80");
+    expect(Some(Meaning::TrailSurrogate(0x0001)), b"\xED\xB0\x81");
+    expect(Some(Meaning::TrailSurrogate(0x03FE)), b"\xED\xBF\xBE");
+    expect(Some(Meaning::TrailSurrogate(0x03FF)), b"\xED\xBF\xBF");
+
+    // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of
+    // range.
+    expect(None, b"\xF7\xBF\xBF\xBF");
+
+    // First otherwise-valid sequence (would be U+110000) that is out of range
+    expect(None, b"\xF4\x90\x80\x80");
+
+    // Overlong sequences
+    expect(None, b"\xC0\x80");
+    expect(None, b"\xC1\xBF");
+    expect(None, b"\xE0\x80\x80");
+    expect(None, b"\xE0\x9F\xBF");
+    expect(None, b"\xF0\x80\x80\x80");
+    expect(None, b"\xF0\x8F\xBF\xBF");
+
+    // First not-overlong sequence for each sequence length
+    expect(Some(Meaning::Whole('\u{80}')), b"\xC2\x80");
+    expect(Some(Meaning::Whole('\u{800}')), b"\xE0\xA0\x80");
+    expect(Some(Meaning::Whole('\u{10000}')), b"\xF0\x90\x80\x80");
+}
+
+// Arbitrary filler bytes (256 of them, as asserted in `classify_whole`).
+// Tests copy this buffer and overwrite part of it with the sequence under
+// test, so that the sequence is surrounded by unrelated bytes.
+static JUNK: &'static [u8] = b"\
+ \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\
+ \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\
+ \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\
+ \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\
+ \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\
+ \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\
+ \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\
+ \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\
+ \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\
+ \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\
+ \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\
+ \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\
+ \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\
+ \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\
+ \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\
+ \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\
+ \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\
+ \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\
+ \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\
+ \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\
+ \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\
+ \xcc\x1b\xd3\xc2";
+
+/// A complete codepoint embedded anywhere in junk is recognised from each of
+/// its bytes, with `rewind` giving the offset back to its first byte.
+#[test]
+fn classify_whole() {
+    assert_eq!(JUNK.len(), 256);
+
+    let samples = ['\0', '\x01', 'o', 'z', 'ő', '\u{2764}',
+                   '\u{a66e}', '\u{1f4a9}', '\u{1f685}'];
+
+    for &c in samples.iter() {
+        // The encoding does not depend on the position, so build it once.
+        let encoded = format!("{}", c).into_bytes();
+
+        // Leave room for the longest (4-byte) sample at the end of the buffer.
+        for idx in 0 .. JUNK.len() - 3 {
+            let mut buf = JUNK.to_owned();
+            (&mut buf[idx..]).write_all(&encoded).unwrap();
+
+            for offset in 0 .. encoded.len() {
+                let class = classify(&buf, idx + offset).unwrap();
+                assert_eq!(class.bytes, &*encoded);
+                assert_eq!(class.rewind, offset);
+                assert_eq!(class.meaning, Meaning::Whole(c));
+            }
+        }
+    }
+}
+
+/// Encoded surrogates are reported as `LeadSurrogate` / `TrailSurrogate`
+/// with the right 10-bit index, wherever they sit in the buffer.
+#[test]
+fn classify_surrogates() {
+    let cases: [(Meaning, &[u8; 3]); 8] = [
+        (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"),
+        (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"),
+        (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"),
+        (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"),
+
+        (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"),
+        (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"),
+        (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"),
+        (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"),
+    ];
+
+    for &(expected, encoded) in cases.iter() {
+        // Each encoding is 3 bytes, so the last usable position is len - 3.
+        for idx in 0 .. JUNK.len() - 2 {
+            let mut buf = JUNK.to_owned();
+            (&mut buf[idx..]).write_all(encoded).unwrap();
+
+            let class = classify(&buf, idx).unwrap();
+            assert_eq!(class.bytes, encoded);
+            assert_eq!(class.rewind, 0);
+            assert_eq!(class.meaning, expected);
+        }
+    }
+}
+
+/// Truncated sequences at the buffer edges are reported as `Prefix` (cut off
+/// by the end of the buffer) or `Suffix` (cut off by its start).
+#[test]
+fn classify_prefix_suffix() {
+    for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] {
+        let ch = format!("{}", c).into_bytes();
+
+        // Every proper, non-empty prefix: 1 up to `ch.len() - 1` bytes. The
+        // range end is exclusive, so it must be `ch.len()`; with
+        // `ch.len() - 1` the longest prefix was skipped and the 2-byte
+        // sample was not exercised at all.
+        for pfx in 1 .. ch.len() {
+            let mut buf = JUNK.to_owned();
+            let buflen = buf.len();
+            (&mut buf[buflen - pfx .. buflen]).write_all(&ch[..pfx]).unwrap();
+            for j in 0 .. pfx {
+                let idx = buflen - 1 - j;
+                let class = classify(&buf, idx).unwrap();
+                assert_eq!(class.bytes, &ch[..pfx]);
+                assert_eq!(class.rewind, pfx - 1 - j);
+                assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx));
+            }
+        }
+
+        // Every proper, non-empty suffix, placed at the start of the buffer.
+        for sfx in 1 .. ch.len() {
+            let ch_bytes = &ch[ch.len() - sfx ..];
+            let mut buf = JUNK.to_owned();
+            (&mut *buf).write_all(ch_bytes).unwrap();
+            for j in 0 .. sfx {
+                let class = classify(&buf, j).unwrap();
+                // Only the bytes up to and including `j` are reported.
+                assert!(ch_bytes.starts_with(class.bytes));
+                assert_eq!(class.rewind, j);
+                assert_eq!(class.meaning, Meaning::Suffix);
+            }
+        }
+    }
+}
+
+/// An index at or past the end of the buffer never classifies.
+#[test]
+fn out_of_bounds() {
+    let empty: &[u8] = b"";
+    for &idx in &[0, 7] {
+        assert!(classify(empty, idx).is_none());
+    }
+
+    let seven: &[u8] = b"aaaaaaa";
+    assert!(classify(seven, seven.len()).is_none());
+}
+
+/// Malformed, out-of-range and overlong input is rejected with `None`.
+#[test]
+fn malformed() {
+    fn rejects(bytes: &[u8], idx: usize) {
+        assert_eq!(None, classify(bytes, idx));
+    }
+
+    rejects(b"\xFF", 0);
+    rejects(b"\xC5\xC5", 0);
+    rejects(b"x\x91", 1);
+    rejects(b"\x91\x91\x91\x91", 3);
+    rejects(b"\x91\x91\x91\x91\x91", 4);
+    rejects(b"\xEA\x91\xFF", 1);
+    for idx in 0..3 {
+        rejects(b"\xF0\x90\x90\xF0", idx);
+    }
+
+    // These must be rejected whichever byte is asked about; indexes past the
+    // end of the shorter sequences are rejected as out of range.
+    let always_invalid: &[&[u8]] = &[
+        // out of range: U+110000
+        b"\xF4\x90\x80\x80",
+        // out of range: U+1FFFFF
+        b"\xF7\xBF\xBF\xBF",
+        // Overlong sequences
+        b"\xC0\x80",
+        b"\xC1\xBF",
+        b"\xE0\x80\x80",
+        b"\xE0\x9F\xBF",
+        b"\xF0\x80\x80\x80",
+        b"\xF0\x8F\xBF\xBF",
+    ];
+    for i in 0..4 {
+        for bytes in always_invalid {
+            rejects(bytes, i);
+        }
+    }
+}
+
+// Multilingual sample text used as input by the benchmarks below. The byte
+// offsets in `IXES` index into this exact string, so its contents must not
+// be changed without updating `IXES` and `BOUNDARY`.
+static TEXT: &'static str = "
+ All human beings are born free and equal in dignity and rights.
+ They are endowed with reason and conscience and should act
+ towards one another in a spirit of brotherhood.
+
+ Minden emberi lény szabadon születik és egyenlő méltósága és
+ joga van. Az emberek, ésszel és lelkiismerettel bírván,
+ egymással szemben testvéri szellemben kell hogy viseltessenek.
+
+ เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง
+ เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน.
+
+ 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어
+ 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로
+ 형제애의 정신으로 행동하여야 한다.
+
+ ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a
+ .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei
+ jeseki'ubo ry. simyzu'e ta'i le tunba
+
+ ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ
+ ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ.";
+
+// random
+// Byte offsets into `TEXT` that the benchmarks probe.
+static IXES: &'static [usize]
+ = &[778, 156, 87, 604, 1216, 365, 884, 311,
+ 469, 515, 709, 162, 871, 206, 634, 442];
+
+// Expected answer for each entry of `IXES`, in the same order: `true` when
+// that offset is a char boundary of `TEXT` (the benchmarks assert this).
+static BOUNDARY: &'static [bool]
+ = &[false, true, true, false, false, true, true, true,
+ true, false, false, true, true, true, false, false];
+
+// Baseline for `futf_check`: answers the same boundary question for every
+// offset in `IXES` using the standard library's `str::is_char_boundary`.
+#[bench]
+fn std_utf8_check(b: &mut Bencher) {
+ b.iter(|| {
+ assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| {
+ expect == TEXT.is_char_boundary(ix)
+ }));
+ });
+}
+
+// We don't expect to be as fast as is_char_boundary, because we provide more
+// information. But we shouldn't be tremendously slower, either. A factor of
+// 5-10 is expected on this text.
+//
+// An offset counts as a boundary here when `classify` reports `rewind == 0`,
+// i.e. the offset is the first byte of its codepoint.
+#[bench]
+fn futf_check(b: &mut Bencher) {
+ b.iter(|| {
+ assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| {
+ expect == (::classify(TEXT.as_bytes(), ix).unwrap().rewind == 0)
+ }));
+ });
+}