diff options
Diffstat (limited to '')
-rw-r--r-- | vendor/futf/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | vendor/futf/Cargo.toml | 24 | ||||
-rw-r--r-- | vendor/futf/LICENSE-APACHE | 201 | ||||
-rw-r--r-- | vendor/futf/LICENSE-MIT | 25 | ||||
-rw-r--r-- | vendor/futf/README.md | 18 | ||||
-rw-r--r-- | vendor/futf/src/lib.rs | 248 | ||||
-rw-r--r-- | vendor/futf/src/test.rs | 270 |
7 files changed, 787 insertions, 0 deletions
diff --git a/vendor/futf/.cargo-checksum.json b/vendor/futf/.cargo-checksum.json new file mode 100644 index 000000000..b42896739 --- /dev/null +++ b/vendor/futf/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"969dea475ecafd7cdc49b65403f6dcc241ab63e50df8fff054480c9816857b03","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"60a7062291b01ba068f300612cdbdc20382ac1d4934f07bcdd7167c15299f309","README.md":"933cfbcce46af48e2dbaa75f042df2143d726b983a546f57ca0eb5fb93e220b5","src/lib.rs":"90628646d656c57504b4e7b5c3f3986fffcac31e710e7cc212fb63d836d89833","src/test.rs":"0ca773b918809aeb73f75c75c0640d78cdc824efd995f136603a567239578c47"},"package":"df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"}
\ No newline at end of file diff --git a/vendor/futf/Cargo.toml b/vendor/futf/Cargo.toml new file mode 100644 index 000000000..e57e71bbc --- /dev/null +++ b/vendor/futf/Cargo.toml @@ -0,0 +1,24 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +name = "futf" +version = "0.1.5" +authors = ["Keegan McAllister <kmcallister@mozilla.com>"] +description = "Handling fragments of UTF-8" +license = "MIT / Apache-2.0" +repository = "https://github.com/servo/futf" + +[dependencies.mac] +version = "0.1.0" + +[dependencies.new_debug_unreachable] +version = "1.0.2" diff --git a/vendor/futf/LICENSE-APACHE b/vendor/futf/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/vendor/futf/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/futf/LICENSE-MIT b/vendor/futf/LICENSE-MIT new file mode 100644 index 000000000..2e0fee105 --- /dev/null +++ b/vendor/futf/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2015 Keegan McAllister + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/vendor/futf/README.md b/vendor/futf/README.md new file mode 100644 index 000000000..325b2e74d --- /dev/null +++ b/vendor/futf/README.md @@ -0,0 +1,18 @@ +# futf + +[![Build Status](https://travis-ci.org/servo/futf.svg?branch=master)](https://travis-ci.org/kmcallister/futf) + +futf is a library for *flexible* UTF-8, or UTF-8 *fragments*. I don't know. +Check out the [API documentation](http://doc.servo.org/futf/index.html). + +Anyway, it takes an index into a byte buffer and tells you things about the +UTF-8 codepoint containing that byte. It can deal with incomplete codepoint +prefixes / suffixes at the ends of a buffer, which is useful for incremental +I/O. It can also handle UTF-16 surrogate code units encoded in the manner of +[CESU-8][] or [WTF-8][]. + +This is a low-level helper for [tendril][] that might be useful more generally. + +[CESU-8]: http://www.unicode.org/reports/tr26/ +[WTF-8]: http://simonsapin.github.io/wtf-8/ +[tendril]: https://github.com/kmcallister/tendril diff --git a/vendor/futf/src/lib.rs b/vendor/futf/src/lib.rs new file mode 100644 index 000000000..4b94a35a5 --- /dev/null +++ b/vendor/futf/src/lib.rs @@ -0,0 +1,248 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(test, feature(test))] + +#[macro_use] +extern crate debug_unreachable; + +#[macro_use] +extern crate mac; + +#[cfg(test)] +extern crate test as std_test; + +use std::{slice, char}; + +/// Meaning of a complete or partial UTF-8 codepoint. +/// +/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or +/// `Suffix` may in reality have no valid completion. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] +pub enum Meaning { + /// We found a whole codepoint. + Whole(char), + + /// We found something that isn't a valid Unicode codepoint, but + /// it *would* correspond to a UTF-16 leading surrogate code unit, + /// i.e. a value in the range `U+D800` - `U+DBFF`. + /// + /// The argument is the code unit's 10-bit index within that range. + /// + /// These are found in UTF-8 variants such as CESU-8 and WTF-8. + LeadSurrogate(u16), + + /// We found something that isn't a valid Unicode codepoint, but + /// it *would* correspond to a UTF-16 trailing surrogate code unit, + /// i.e. a value in the range `U+DC00` - `U+DFFF`. + /// + /// The argument is the code unit's 10-bit index within that range. + /// + /// These are found in UTF-8 variants such as CESU-8 and WTF-8. + TrailSurrogate(u16), + + /// We found only a prefix of a codepoint before the buffer ended. + /// + /// Includes the number of additional bytes needed. + Prefix(usize), + + /// We found only a suffix of a codepoint before running off the + /// start of the buffer. + /// + /// Up to 3 more bytes may be needed. + Suffix, +} + +/// Represents a complete or partial UTF-8 codepoint. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] +pub struct Codepoint<'a> { + /// The bytes that make up the partial or full codepoint. + /// + /// For a `Suffix` this depends on `idx`. We don't scan forward + /// for additional continuation bytes after the reverse scan + /// failed to locate a multibyte sequence start. + pub bytes: &'a [u8], + + /// Start of the codepoint in the buffer, expressed as an offset + /// back from `idx`. + pub rewind: usize, + + /// Meaning of the partial or full codepoint. + pub meaning: Meaning, +} + +#[derive(Debug, PartialEq, Eq)] +enum Byte { + Ascii, + Start(usize), + Cont, +} + +impl Byte { + #[inline(always)] + fn classify(x: u8) -> Option<Byte> { + match x & 0xC0 { + 0xC0 => match x { + x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)), + x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)), + x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)), + _ => None, + }, + 0x80 => Some(Byte::Cont), + _ => Some(Byte::Ascii), + } + } +} + +#[inline(always)] +fn all_cont(buf: &[u8]) -> bool { + buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont))) +} + +// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: +// a starting byte followed by the correct number of continuation bytes. +#[inline(always)] +unsafe fn decode(buf: &[u8]) -> Option<Meaning> { + debug_assert!(buf.len() >= 2); + debug_assert!(buf.len() <= 4); + let n; + match buf.len() { + 2 => { + n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 + | ((*buf.get_unchecked(1) & 0x3F) as u32); + if n < 0x80 { return None } // Overlong + } + 3 => { + n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 + | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 + | ((*buf.get_unchecked(2) & 0x3F) as u32); + match n { + 0x0000 ... 0x07FF => return None, // Overlong + 0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), + 0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), + _ => {} + } + } + 4 => { + n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 + | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 + | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 + | ((*buf.get_unchecked(3) & 0x3F) as u32); + if n < 0x1_0000 { return None } // Overlong + } + _ => debug_unreachable!(), + } + + char::from_u32(n).map(Meaning::Whole) +} + +#[inline(always)] +unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) +} + +macro_rules! otry { + ($x:expr) => { unwrap_or_return!($x, None) } +} + +/// Describes the UTF-8 codepoint containing the byte at index `idx` within +/// `buf`. +/// +/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 +/// in the vicinity of `idx`. +#[inline] +pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> { + if idx >= buf.len() { + return None; + } + + unsafe { + let x = *buf.get_unchecked(idx); + match otry!(Byte::classify(x)) { + Byte::Ascii => Some(Codepoint { + bytes: unsafe_slice(buf, idx, 1), + rewind: 0, + meaning: Meaning::Whole(x as char), + }), + Byte::Start(n) => { + let avail = buf.len() - idx; + if avail >= n { + let bytes = unsafe_slice(buf, idx, n); + if !all_cont(unsafe_slice(bytes, 1, n-1)) { + return None; + } + let meaning = otry!(decode(bytes)); + Some(Codepoint { + bytes: bytes, + rewind: 0, + meaning: meaning, + }) + } else { + Some(Codepoint { + bytes: unsafe_slice(buf, idx, avail), + rewind: 0, + meaning: Meaning::Prefix(n - avail), + }) + } + }, + Byte::Cont => { + let mut start = idx; + let mut checked = 0; + loop { + if start == 0 { + // Whoops, fell off the beginning. + return Some(Codepoint { + bytes: unsafe_slice(buf, 0, idx + 1), + rewind: idx, + meaning: Meaning::Suffix, + }); + } + + start -= 1; + checked += 1; + match otry!(Byte::classify(*buf.get_unchecked(start))) { + Byte::Cont => (), + Byte::Start(n) => { + let avail = buf.len() - start; + if avail >= n { + let bytes = unsafe_slice(buf, start, n); + if checked < n { + if !all_cont(unsafe_slice(bytes, checked, n-checked)) { + return None; + } + } + let meaning = otry!(decode(bytes)); + return Some(Codepoint { + bytes: bytes, + rewind: idx - start, + meaning: meaning, + }); + } else { + return Some(Codepoint { + bytes: unsafe_slice(buf, start, avail), + rewind: idx - start, + meaning: Meaning::Prefix(n - avail), + }); + } + } + _ => return None, + } + + if idx - start >= 3 { + // We looked at 3 bytes before a continuation byte + // and didn't find a start byte. + return None; + } + } + } + } + } +} + +#[cfg(test)] +mod test; diff --git a/vendor/futf/src/test.rs b/vendor/futf/src/test.rs new file mode 100644 index 000000000..f8e0c9387 --- /dev/null +++ b/vendor/futf/src/test.rs @@ -0,0 +1,270 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{Meaning, Byte, classify, decode, all_cont}; + +use std::borrow::ToOwned; +use std::io::Write; +use std_test::Bencher; + +#[test] +fn classify_all_bytes() { + for n in 0x00..0x80 { assert_eq!(Byte::classify(n), Some(Byte::Ascii)); } + for n in 0x80..0xC0 { assert_eq!(Byte::classify(n), Some(Byte::Cont)); } + for n in 0xC0..0xE0 { assert_eq!(Byte::classify(n), Some(Byte::Start(2))); } + for n in 0xE0..0xF0 { assert_eq!(Byte::classify(n), Some(Byte::Start(3))); } + for n in 0xF0..0xF8 { assert_eq!(Byte::classify(n), Some(Byte::Start(4))); } + for n in 0xF8..0xFF { assert_eq!(Byte::classify(n), None); } + assert_eq!(Byte::classify(0xFF), None); +} + +#[test] +fn test_all_cont() { + assert!(all_cont(b"")); + assert!(all_cont(b"\x80")); + assert!(all_cont(b"\xBF")); + assert!(all_cont(b"\x80\xBF\x80\xBF")); + + assert!(!all_cont(b"z")); + assert!(!all_cont(b"\xC0\xBF")); + assert!(!all_cont(b"\xFF")); + assert!(!all_cont(b"\x80\xBFz\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\xC0\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\xFF\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\x80\xBFz")); + assert!(!all_cont(b"\x80\xBF\x80\xBF\xC0")); + assert!(!all_cont(b"z\x80\xBF\x80\xBF")); + assert!(!all_cont(b"\xC0\x80\xBF\x80\xBF")); +} + +#[test] +fn test_decode() { + unsafe { + assert_eq!(Some(Meaning::Whole('ő')), decode(b"\xC5\x91")); + assert_eq!(Some(Meaning::Whole('\u{a66e}')), decode(b"\xEA\x99\xAE")); + assert_eq!(Some(Meaning::Whole('\u{1f4a9}')), decode(b"\xF0\x9F\x92\xA9")); + assert_eq!(Some(Meaning::Whole('\u{10ffff}')), decode(b"\xF4\x8F\xBF\xBF")); + + assert_eq!(Some(Meaning::LeadSurrogate(0x0000)), decode(b"\xED\xA0\x80")); + assert_eq!(Some(Meaning::LeadSurrogate(0x0001)), decode(b"\xED\xA0\x81")); + assert_eq!(Some(Meaning::LeadSurrogate(0x03FE)), decode(b"\xED\xAF\xBE")); + assert_eq!(Some(Meaning::LeadSurrogate(0x03FF)), decode(b"\xED\xAF\xBF")); + + assert_eq!(Some(Meaning::TrailSurrogate(0x0000)), decode(b"\xED\xB0\x80")); + assert_eq!(Some(Meaning::TrailSurrogate(0x0001)), decode(b"\xED\xB0\x81")); + assert_eq!(Some(Meaning::TrailSurrogate(0x03FE)), decode(b"\xED\xBF\xBE")); + assert_eq!(Some(Meaning::TrailSurrogate(0x03FF)), decode(b"\xED\xBF\xBF")); + + // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of + // range. + assert_eq!(None, decode(b"\xF7\xBF\xBF\xBF")); + + // First otherwise-valid sequence (would be U+110000) that is out of range + assert_eq!(None, decode(b"\xF4\x90\x80\x80")); + + // Overlong sequences + assert_eq!(None, decode(b"\xC0\x80")); + assert_eq!(None, decode(b"\xC1\xBF")); + assert_eq!(None, decode(b"\xE0\x80\x80")); + assert_eq!(None, decode(b"\xE0\x9F\xBF")); + assert_eq!(None, decode(b"\xF0\x80\x80\x80")); + assert_eq!(None, decode(b"\xF0\x8F\xBF\xBF")); + + // For not-overlong sequence for each sequence length + assert_eq!(Some(Meaning::Whole('\u{80}')), decode(b"\xC2\x80")); + assert_eq!(Some(Meaning::Whole('\u{800}')), decode(b"\xE0\xA0\x80")); + assert_eq!(Some(Meaning::Whole('\u{10000}')), decode(b"\xF0\x90\x80\x80")); + } +} + +static JUNK: &'static [u8] = b"\ + \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\ + \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\ + \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\ + \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\ + \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\ + \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\ + \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\ + \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\ + \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\ + \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\ + \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\ + \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\ + \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\ + \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\ + \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\ + \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\ + \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\ + \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\ + \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\ + \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\ + \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\ + \xcc\x1b\xd3\xc2"; + +#[test] +fn classify_whole() { + assert_eq!(JUNK.len(), 256); + + for &c in &['\0', '\x01', 'o', 'z', 'ő', '\u{2764}', + '\u{a66e}', '\u{1f4a9}', '\u{1f685}'] { + for idx in 0 .. JUNK.len() - 3 { + let mut buf = JUNK.to_owned(); + let ch = format!("{}", c).into_bytes(); + (&mut buf[idx..]).write_all(&ch).unwrap(); + + for j in 0 .. ch.len() { + let class = classify(&buf, idx+j).unwrap(); + assert_eq!(class.bytes, &*ch); + assert_eq!(class.rewind, j); + assert_eq!(class.meaning, Meaning::Whole(c)); + } + } + } +} + +#[test] +fn classify_surrogates() { + for &(s, b) in &[ + (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"), + (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"), + (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"), + (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"), + + (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"), + (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"), + (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"), + (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"), + ] { + for idx in 0 .. JUNK.len() - 2 { + let mut buf = JUNK.to_owned(); + (&mut buf[idx..]).write_all(b).unwrap(); + + let class = classify(&buf, idx).unwrap(); + assert_eq!(class.bytes, b); + assert_eq!(class.rewind, 0); + assert_eq!(class.meaning, s); + } + } +} + +#[test] +fn classify_prefix_suffix() { + for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] { + let ch = format!("{}", c).into_bytes(); + for pfx in 1 .. ch.len() - 1 { + let mut buf = JUNK.to_owned(); + let buflen = buf.len(); + (&mut buf[buflen - pfx .. buflen]).write_all(&ch[..pfx]).unwrap(); + for j in 0 .. pfx { + let idx = buflen - 1 - j; + let class = classify(&buf, idx).unwrap(); + assert_eq!(class.bytes, &ch[..pfx]); + assert_eq!(class.rewind, pfx - 1 - j); + assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx)); + } + } + for sfx in 1 .. ch.len() - 1 { + let ch_bytes = &ch[ch.len() - sfx ..]; + let mut buf = JUNK.to_owned(); + (&mut *buf).write_all(ch_bytes).unwrap(); + for j in 0 .. sfx { + let class = classify(&buf, j).unwrap(); + assert!(ch_bytes.starts_with(class.bytes)); + assert_eq!(class.rewind, j); + assert_eq!(class.meaning, Meaning::Suffix); + } + } + } +} + +#[test] +fn out_of_bounds() { + assert!(classify(b"", 0).is_none()); + assert!(classify(b"", 7).is_none()); + assert!(classify(b"aaaaaaa", 7).is_none()); +} + +#[test] +fn malformed() { + assert_eq!(None, classify(b"\xFF", 0)); + assert_eq!(None, classify(b"\xC5\xC5", 0)); + assert_eq!(None, classify(b"x\x91", 1)); + assert_eq!(None, classify(b"\x91\x91\x91\x91", 3)); + assert_eq!(None, classify(b"\x91\x91\x91\x91\x91", 4)); + assert_eq!(None, classify(b"\xEA\x91\xFF", 1)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 0)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 1)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 2)); + + for i in 0..4 { + // out of range: U+110000 + assert_eq!(None, classify(b"\xF4\x90\x80\x80", i)); + + // out of range: U+1FFFFF + assert_eq!(None, classify(b"\xF7\xBF\xBF\xBF", i)); + + // Overlong sequences + assert_eq!(None, classify(b"\xC0\x80", i)); + assert_eq!(None, classify(b"\xC1\xBF", i)); + assert_eq!(None, classify(b"\xE0\x80\x80", i)); + assert_eq!(None, classify(b"\xE0\x9F\xBF", i)); + assert_eq!(None, classify(b"\xF0\x80\x80\x80", i)); + assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i)); + } +} + +static TEXT: &'static str = " + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act + towards one another in a spirit of brotherhood. + + Minden emberi lény szabadon születik és egyenlő méltósága és + joga van. Az emberek, ésszel és lelkiismerettel bírván, + egymással szemben testvéri szellemben kell hogy viseltessenek. + + เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง + เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน. + + 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 + 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 + 형제애의 정신으로 행동하여야 한다. + + ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a + .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei + jeseki'ubo ry. simyzu'e ta'i le tunba + + ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ + ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ."; + +// random +static IXES: &'static [usize] + = &[778, 156, 87, 604, 1216, 365, 884, 311, + 469, 515, 709, 162, 871, 206, 634, 442]; + +static BOUNDARY: &'static [bool] + = &[false, true, true, false, false, true, true, true, + true, false, false, true, true, true, false, false]; + +#[bench] +fn std_utf8_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { + expect == TEXT.is_char_boundary(ix) + })); + }); +} + +// We don't expect to be as fast as is_char_boundary, because we provide more +// information. But we shouldn't be tremendously slower, either. A factor of +// 5-10 is expected on this text. +#[bench] +fn futf_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { + expect == (::classify(TEXT.as_bytes(), ix).unwrap().rewind == 0) + })); + }); +} |