diff options
Diffstat (limited to 'vendor/tendril')
-rw-r--r-- | vendor/tendril/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | vendor/tendril/Cargo.lock | 193 | ||||
-rw-r--r-- | vendor/tendril/Cargo.toml | 40 | ||||
-rw-r--r-- | vendor/tendril/LICENSE-APACHE | 201 | ||||
-rw-r--r-- | vendor/tendril/LICENSE-MIT | 25 | ||||
-rw-r--r-- | vendor/tendril/README.md | 96 | ||||
-rw-r--r-- | vendor/tendril/examples/fuzz.rs | 146 | ||||
-rw-r--r-- | vendor/tendril/src/bench.rs | 159 | ||||
-rw-r--r-- | vendor/tendril/src/buf32.rs | 120 | ||||
-rw-r--r-- | vendor/tendril/src/fmt.rs | 519 | ||||
-rw-r--r-- | vendor/tendril/src/lib.rs | 35 | ||||
-rw-r--r-- | vendor/tendril/src/stream.rs | 752 | ||||
-rw-r--r-- | vendor/tendril/src/tendril.rs | 2472 | ||||
-rw-r--r-- | vendor/tendril/src/utf8_decode.rs | 98 | ||||
-rw-r--r-- | vendor/tendril/src/util.rs | 45 |
15 files changed, 4902 insertions, 0 deletions
diff --git a/vendor/tendril/.cargo-checksum.json b/vendor/tendril/.cargo-checksum.json new file mode 100644 index 000000000..adfe81e97 --- /dev/null +++ b/vendor/tendril/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.lock":"f32ecdca092196752a395cf6ffefe1e797632f95fb99b6ead931c9b88c656e8c","Cargo.toml":"7117e609284194fac601dadb674672052d8a36904493fbdd820f9d0e256046c0","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"60a7062291b01ba068f300612cdbdc20382ac1d4934f07bcdd7167c15299f309","README.md":"f58e2c541777e9ef432da96a68d69b2e30c8c8ab309979d4f9979dc6e05f2e9f","examples/fuzz.rs":"a5495135579c6b87f7233e9f1ebc84f38dcddb99124d0fc236f7260d87fd65bc","src/bench.rs":"f0a0b14f4e757b07cf6f89a6a8569c50bea3517a029a77934776699e3d7d6e0e","src/buf32.rs":"c3a54ddb4c4eae4e5faa092262ab706f3217a10bfe218355da7cf61c439139ea","src/fmt.rs":"d48d8c590cd76e53c907f7401d08d606a373f080aa94848a0070bfdc5ef737c1","src/lib.rs":"b5ebd9a16a2c9b5786831a815f2e9b39a6730590c37a043dbfed31f900eafcc1","src/stream.rs":"caeb9fd6959685f240eecd547713ddbf765864495d8f1ecead26ef3f6674d521","src/tendril.rs":"3123532a94faac738a75e33d0584a68eb476800583d5938c3fb99230c28d8f70","src/utf8_decode.rs":"8185ff3d7ed5e77cc5a7ce9b25e57506105ee01a2b09c09c64584fe27d25e7cc","src/util.rs":"39458c2ca52bd725e841802026c291874483ccd92f02c073220450d2b904f0d5"},"package":"d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"}
\ No newline at end of file diff --git a/vendor/tendril/Cargo.lock b/vendor/tendril/Cargo.lock new file mode 100644 index 000000000..fe6983790 --- /dev/null +++ b/vendor/tendril/Cargo.lock @@ -0,0 +1,193 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = 
"encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "libc" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "rand" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" +dependencies = [ + "fuchsia-cprng", + "libc", + "rand_core 
0.3.1", + "rdrand", + "winapi", +] + +[[package]] +name = "rand_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +dependencies = [ + "rand_core 0.4.2", +] + +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "tendril" +version = "0.4.3" +dependencies = [ + "encoding", + "encoding_rs", + "futf", + "mac", + "rand", + "utf-8", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/vendor/tendril/Cargo.toml b/vendor/tendril/Cargo.toml new file mode 100644 index 000000000..bb4aa0f31 --- /dev/null +++ b/vendor/tendril/Cargo.toml @@ -0,0 +1,40 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# 
+# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +name = "tendril" +version = "0.4.3" +authors = ["Keegan McAllister <mcallister.keegan@gmail.com>", "Simon Sapin <simon.sapin@exyr.org>", "Chris Morgan <me@chrismorgan.info>"] +description = "Compact buffer/string type for zero-copy parsing" +readme = "README.md" +license = "MIT/Apache-2.0" +repository = "https://github.com/servo/tendril" +[dependencies.encoding] +version = "0.2" +optional = true + +[dependencies.encoding_rs] +version = "0.8.12" +optional = true + +[dependencies.futf] +version = "0.1.5" + +[dependencies.mac] +version = "0.1" + +[dependencies.utf-8] +version = "0.7" +[dev-dependencies.rand] +version = "0.4" + +[features] +bench = [] diff --git a/vendor/tendril/LICENSE-APACHE b/vendor/tendril/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/vendor/tendril/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/tendril/LICENSE-MIT b/vendor/tendril/LICENSE-MIT new file mode 100644 index 000000000..2e0fee105 --- /dev/null +++ b/vendor/tendril/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2015 Keegan McAllister + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/vendor/tendril/README.md b/vendor/tendril/README.md new file mode 100644 index 000000000..fced4b70d --- /dev/null +++ b/vendor/tendril/README.md @@ -0,0 +1,96 @@ +# tendril + +**Warning**: This library is at a very early stage of development, and it +contains a substantial amount of `unsafe` code. Use at your own risk! + +[![Build Status](https://github.com/servo/tendril/workflows/CI/badge.svg)](https://github.com/servo/tendril/actions) + +[API Documentation](https://doc.servo.org/tendril/index.html) + +## Introduction + +`Tendril` is a compact string/buffer type, optimized for zero-copy parsing. +Tendrils have the semantics of owned strings, but are sometimes views into +shared buffers. When you mutate a tendril, an owned copy is made if necessary. +Further mutations occur in-place until the string becomes shared, e.g. with +`clone()` or `subtendril()`. + +Buffer sharing is accomplished through thread-local (non-atomic) reference +counting, which has very low overhead. The Rust type system will prevent +you at compile time from sending a tendril between threads. (See below +for thoughts on relaxing this restriction.) + +Whereas `String` allocates in the heap for any non-empty string, `Tendril` can +store small strings (up to 8 bytes) in-line, without a heap allocation. +`Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes versus +24. `Option<Tendril>` is the same size as `Tendril`, thanks to +[`NonZero`][NonZero]. + +The maximum length of a tendril is 4 GB. The library will panic if you attempt +to go over the limit. + +## Formats and encoding + +`Tendril` uses +[phantom types](https://doc.rust-lang.org/stable/rust-by-example/generics/phantom.html) +to track a buffer's format. This determines at compile time which +operations are available on a given tendril. For example, `Tendril<UTF8>` and +`Tendril<Bytes>` can be borrowed as `&str` and `&[u8]` respectively. 
+ +`Tendril` also integrates with +[rust-encoding](https://github.com/lifthrasiir/rust-encoding) and has +preliminary support for [WTF-8][] buffers. + +## Plans for the future + +### Ropes + +[html5ever][] will use `Tendril` as a zero-copy text representation. It would +be good to preserve this all the way through to Servo's DOM. This would reduce +memory consumption, and possibly speed up text shaping and painting. However, +DOM text may conceivably be larger than 4 GB, and will anyway not be contiguous +in memory around e.g. a character entity reference. + +*Solution:* Build a **[rope][] on top of these strings** and use that as +Servo's representation of DOM text. We can perhaps do text shaping and/or +painting in parallel for different chunks of a rope. html5ever can additionally +use this rope type as a replacement for `BufferQueue`. + +Because the underlying buffers are reference-counted, the bulk of this rope +is already a [persistent data structure][]. Consider what happens when +appending two ropes to get a "new" rope. A vector-backed rope would copy a +vector of small structs, one for each chunk, and would bump the corresponding +refcounts. But it would not copy any of the string data. + +If we want more sharing, then a [2-3 finger tree][] could be a good choice. +We would probably stick with `VecDeque` for ropes under a certain size. + +### UTF-16 compatibility + +SpiderMonkey expects text to be in UCS-2 format for the most part. The +semantics of JavaScript strings are difficult to implement on UTF-8. This also +applies to HTML parsing via `document.write`. Also, passing SpiderMonkey a +string that isn't contiguous in memory will incur additional overhead and +complexity, if not a full copy. + +*Solution:* Use **WTF-8 in parsing** and in the DOM. Servo will **convert to +contiguous UTF-16 when necessary**. The conversion can easily be parallelized, +if we find a practical need to convert huge chunks of text all at once. 
+ +### Source span information + +Some html5ever API consumers want to know the originating location in the HTML +source file(s) of each token or parse error. An example application would be a +command-line HTML validator with diagnostic output similar to `rustc`'s. + +*Solution:* Accept **some metadata along with each input string**. The type of +metadata is chosen by the API consumer; it defaults to `()`, which has size +zero. For any non-inline string, we can provide the associated metadata as well +as a byte offset. + +[NonZero]: https://doc.rust-lang.org/core/nonzero/struct.NonZero.html +[html5ever]: https://github.com/servo/html5ever +[WTF-8]: https://simonsapin.github.io/wtf-8/ +[rope]: https://en.wikipedia.org/wiki/Rope_%28data_structure%29 +[persistent data structure]: https://en.wikipedia.org/wiki/Persistent_data_structure +[2-3 finger tree]: https://www.staff.city.ac.uk/~ross/papers/FingerTree.html diff --git a/vendor/tendril/examples/fuzz.rs b/vendor/tendril/examples/fuzz.rs new file mode 100644 index 000000000..37daf560b --- /dev/null +++ b/vendor/tendril/examples/fuzz.rs @@ -0,0 +1,146 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! A simple fuzz tester for the library. 
+ +#![deny(warnings)] + +extern crate rand; +extern crate tendril; + +use std::borrow::ToOwned; + +use rand::distributions::{IndependentSample, Range}; +use rand::Rng; +use tendril::StrTendril; + +fn fuzz() { + let mut rng = rand::thread_rng(); + let capacity = Range::new(0u32, 1 << 14).ind_sample(&mut rng); + let mut buf_string = String::with_capacity(capacity as usize); + let mut buf_tendril = StrTendril::with_capacity(capacity); + let mut string_slices = vec![]; + let mut tendril_slices = vec![]; + + for _ in 1..100_000 { + if buf_string.len() > (1 << 30) { + buf_string.truncate(0); + buf_tendril.clear(); + } + + let dist_action = Range::new(0, 100); + match dist_action.ind_sample(&mut rng) { + 0..=15 => { + let (start, end) = random_slice(&mut rng, TEXT); + let snip = &TEXT[start..end]; + buf_string.push_str(snip); + buf_tendril.push_slice(snip); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 16..=31 => { + let (start, end) = random_slice(&mut rng, &buf_string); + let snip = &buf_string[start..end].to_owned(); + buf_string.push_str(&snip); + buf_tendril.push_slice(&snip); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 32..=47 => { + let lenstr = format!("[length = {}]", buf_tendril.len()); + buf_string.push_str(&lenstr); + buf_tendril.push_slice(&lenstr); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 48..=63 => { + let n = random_boundary(&mut rng, &buf_string); + buf_tendril.pop_front(n as u32); + buf_string = buf_string[n..].to_owned(); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 64..=79 => { + let new_len = random_boundary(&mut rng, &buf_string); + let n = buf_string.len() - new_len; + buf_string.truncate(new_len); + buf_tendril.pop_back(n as u32); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 80..=90 => { + let (start, end) = random_slice(&mut rng, &buf_string); + buf_string = buf_string[start..end].to_owned(); + buf_tendril = buf_tendril.subtendril(start as u32, (end - start) as u32); + assert_eq!(&*buf_string, &*buf_tendril); + } 
+ + 91..=96 => { + let c = rng.gen(); + buf_string.push(c); + assert!(buf_tendril.try_push_char(c).is_ok()); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 97 => { + buf_string.truncate(0); + buf_tendril.clear(); + assert_eq!(&*buf_string, &*buf_tendril); + } + + _ => { + let (start, end) = random_slice(&mut rng, &buf_string); + string_slices.push(buf_string[start..end].to_owned()); + tendril_slices.push(buf_tendril.subtendril(start as u32, (end - start) as u32)); + assert_eq!(string_slices.len(), tendril_slices.len()); + assert!(string_slices + .iter() + .zip(tendril_slices.iter()) + .all(|(s, t)| **s == **t)); + } + } + } +} + +fn random_boundary<R: Rng>(rng: &mut R, text: &str) -> usize { + loop { + let i = Range::new(0, text.len() + 1).ind_sample(rng); + if text.is_char_boundary(i) { + return i; + } + } +} + +fn random_slice<R: Rng>(rng: &mut R, text: &str) -> (usize, usize) { + loop { + let start = Range::new(0, text.len() + 1).ind_sample(rng); + let end = Range::new(start, text.len() + 1).ind_sample(rng); + if !text.is_char_boundary(start) { + continue; + } + if end < text.len() && !text.is_char_boundary(end) { + continue; + } + return (start, end); + } +} + +static TEXT: &'static str = + "It was from the artists and poets that the pertinent answers came, and I \ + know that panic would have broken loose had they been able to compare notes. 
\ + As it was, lacking their original letters, I half suspected the compiler of \ + having asked leading questions, or of having edited the correspondence in \ + corroboration of what he had latently resolved to see.\ +\ + ˙ǝǝs oʇ pǝʌʃosǝɹ ʎʃʇuǝʇɐʃ pɐɥ ǝɥ ʇɐɥʍ ɟo uoıʇɐɹoqoɹɹoɔ uı ǝɔuǝpuodsǝɹɹoɔ ǝɥʇ \ + pǝʇıpǝ ƃuıʌɐɥ ɟo ɹo 'suoıʇsǝnb ƃuıpɐǝʃ pǝʞsɐ ƃuıʌɐɥ ɟo ɹǝʃıdɯoɔ ǝɥʇ pǝʇɔǝdsns \ + ɟʃɐɥ I 'sɹǝʇʇǝʃ ʃɐuıƃıɹo ɹıǝɥʇ ƃuıʞɔɐʃ 'sɐʍ ʇı s∀ ˙sǝʇou ǝɹɐdɯoɔ oʇ ǝʃqɐ uǝǝq \ + ʎǝɥʇ pɐɥ ǝsooʃ uǝʞoɹq ǝʌɐɥ pʃnoʍ ɔıuɐd ʇɐɥʇ ʍouʞ I puɐ 'ǝɯɐɔ sɹǝʍsuɐ ʇuǝuıʇɹǝd \ + ǝɥʇ ʇɐɥʇ sʇǝod puɐ sʇsıʇɹɐ ǝɥʇ ɯoɹɟ sɐʍ ʇI"; + +fn main() { + fuzz(); +} diff --git a/vendor/tendril/src/bench.rs b/vendor/tendril/src/bench.rs new file mode 100644 index 000000000..a9d2c30af --- /dev/null +++ b/vendor/tendril/src/bench.rs @@ -0,0 +1,159 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::borrow::ToOwned; +use std::collections::hash_map::{Entry, HashMap}; + +use tendril::StrTendril; + +fn index_words_string(input: &String) -> HashMap<char, Vec<String>> { + let mut index = HashMap::new(); + for word in input.split(|c| c == ' ') { + if word.len() == 0 { + continue; + } + let word = word.to_owned(); + match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + let x: &mut Vec<String> = e.get_mut(); + x.push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + } + } + index +} + +fn index_words_tendril(input: &StrTendril) -> HashMap<char, Vec<StrTendril>> { + let mut index = HashMap::new(); + let mut t = input.clone(); + loop { + match t.pop_front_char_run(|c| c != ' ') { + None => return index, + Some((_, false)) => (), + Some((word, true)) => match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + e.get_mut().push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + }, + } + } +} + +static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; + +static EN_2: &'static str = + "Here the notes in my laboratory journal cease. I was able to write the last \ + words only with great effort. By now it was already clear to me that LSD had \ + been the cause of the remarkable experience of the previous Friday, for the \ + altered perceptions were of the same type as before, only much more intense. I \ + had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ + informed of the self-experiment, to escort me home. We went by bicycle, no \ + automobile being available because of wartime restrictions on their use. On the \ + way home, my condition began to assume threatening forms. Everything in my \ + field of vision wavered and was distorted as if seen in a curved mirror. I also \ + had the sensation of being unable to move from the spot. Nevertheless, my \ + assistant later told me that we had traveled very rapidly. 
Finally, we arrived \ + at home safe and sound, and I was just barely capable of asking my companion to \ + summon our family doctor and request milk from the neighbors.\n\n\ + In spite of my delirious, bewildered condition, I had brief periods of clear \ + and effective thinking—and chose milk as a nonspecific antidote for poisoning."; + +static KR_1: &'static str = + "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ + 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ + 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; + +static HTML_KR_1: &'static str = + "<p>러스트(<a href=\"http://rust-lang.org\">Rust</a>)는 모질라(<a href=\"\ + https://www.mozilla.org/\">mozilla.org</a>)에서 개발하고 있는, \ + 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ + 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.</p>"; + +mod index_words { + macro_rules! bench { + ($txt:ident) => { + #[allow(non_snake_case)] + mod $txt { + const SMALL_SIZE: usize = 65536; + const LARGE_SIZE: usize = (1 << 20); + + #[bench] + fn index_words_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < SMALL_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < SMALL_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[bench] + fn index_words_big_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < LARGE_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_big_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < LARGE_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[test] + fn correctness() { + use std::borrow::ToOwned; + use 
tendril::bench::{index_words_string, index_words_tendril}; + use tendril::SliceExt; + + let txt = ::tendril::bench::$txt; + let input_string = txt.to_owned(); + let count_s = index_words_string(&input_string); + let mut keys: Vec<char> = count_s.keys().cloned().collect(); + keys.sort(); + + let input_tendril = txt.to_tendril(); + let count_t = index_words_tendril(&input_tendril); + let mut keys_t: Vec<char> = count_t.keys().cloned().collect(); + keys_t.sort(); + + assert_eq!(keys, keys_t); + + for k in &keys { + let vs = &count_s[k]; + let vt = &count_t[k]; + assert_eq!(vs.len(), vt.len()); + assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); + } + } + } + }; + } + + bench!(EN_1); + bench!(EN_2); + bench!(KR_1); + bench!(HTML_KR_1); +} diff --git a/vendor/tendril/src/buf32.rs b/vendor/tendril/src/buf32.rs new file mode 100644 index 000000000..d60a277a1 --- /dev/null +++ b/vendor/tendril/src/buf32.rs @@ -0,0 +1,120 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Provides an unsafe owned buffer type, used in implementing `Tendril`. + +use std::{mem, ptr, slice, u32}; + +use OFLOW; + +pub const MIN_CAP: u32 = 16; + +pub const MAX_LEN: usize = u32::MAX as usize; + +/// A buffer points to a header of type `H`, which is followed by `MIN_CAP` or more +/// bytes of storage. 
+pub struct Buf32<H> { + pub ptr: *mut H, + pub len: u32, + pub cap: u32, +} + +#[inline(always)] +fn bytes_to_vec_capacity<H>(x: u32) -> usize { + let header = mem::size_of::<H>(); + debug_assert!(header > 0); + let x = (x as usize).checked_add(header).expect(OFLOW); + // Integer ceil https://stackoverflow.com/a/2745086/1162888 + 1 + ((x - 1) / header) +} + +impl<H> Buf32<H> { + #[inline] + pub unsafe fn with_capacity(mut cap: u32, h: H) -> Buf32<H> { + if cap < MIN_CAP { + cap = MIN_CAP; + } + + let mut vec = Vec::<H>::with_capacity(bytes_to_vec_capacity::<H>(cap)); + let ptr = vec.as_mut_ptr(); + mem::forget(vec); + ptr::write(ptr, h); + + Buf32 { + ptr: ptr, + len: 0, + cap: cap, + } + } + + #[inline] + pub unsafe fn destroy(self) { + mem::drop(Vec::from_raw_parts( + self.ptr, + 1, + bytes_to_vec_capacity::<H>(self.cap), + )); + } + + #[inline(always)] + pub unsafe fn data_ptr(&self) -> *mut u8 { + (self.ptr as *mut u8).offset(mem::size_of::<H>() as isize) + } + + #[inline(always)] + pub unsafe fn data(&self) -> &[u8] { + slice::from_raw_parts(self.data_ptr(), self.len as usize) + } + + #[inline(always)] + pub unsafe fn data_mut(&mut self) -> &mut [u8] { + slice::from_raw_parts_mut(self.data_ptr(), self.len as usize) + } + + /// Grow the capacity to at least `new_cap`. + /// + /// This will panic if the capacity calculation overflows `u32`. 
+ #[inline] + pub unsafe fn grow(&mut self, new_cap: u32) { + if new_cap <= self.cap { + return; + } + + let new_cap = new_cap.checked_next_power_of_two().expect(OFLOW); + let mut vec = Vec::from_raw_parts(self.ptr, 0, bytes_to_vec_capacity::<H>(self.cap)); + vec.reserve_exact(bytes_to_vec_capacity::<H>(new_cap)); + self.ptr = vec.as_mut_ptr(); + self.cap = new_cap; + mem::forget(vec); + } +} + +#[cfg(test)] +mod test { + use super::Buf32; + use std::ptr; + + #[test] + fn smoke_test() { + unsafe { + let mut b = Buf32::with_capacity(0, 0u8); + assert_eq!(b"", b.data()); + + b.grow(5); + ptr::copy_nonoverlapping(b"Hello".as_ptr(), b.data_ptr(), 5); + + assert_eq!(b"", b.data()); + b.len = 5; + assert_eq!(b"Hello", b.data()); + + b.grow(1337); + assert!(b.cap >= 1337); + assert_eq!(b"Hello", b.data()); + + b.destroy(); + } + } +} diff --git a/vendor/tendril/src/fmt.rs b/vendor/tendril/src/fmt.rs new file mode 100644 index 000000000..2ff04bbca --- /dev/null +++ b/vendor/tendril/src/fmt.rs @@ -0,0 +1,519 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Marker types for formats. +//! +//! This module defines the types and traits used to mark a `Tendril` +//! with the format of data it contains. It includes those formats +//! for which `Tendril` supports at least some operations without +//! conversion. +//! +//! To convert a string tendril to/from a byte tendril in an arbitrary +//! character encoding, see the `encode` and `decode` methods on +//! `Tendril`. +//! +//! `Tendril` operations may become memory-unsafe if data invalid for +//! the format sneaks in. For that reason, these traits require +//! `unsafe impl`. 
+ +use std::default::Default; +use std::{char, mem, str}; + +use futf::{self, Codepoint, Meaning}; + +/// Implementation details. +/// +/// You don't need these unless you are implementing +/// a new format. +pub mod imp { + use std::default::Default; + use std::{iter, mem, slice}; + + /// Describes how to fix up encodings when concatenating. + /// + /// We can drop characters on either side of the splice, + /// and insert up to 4 bytes in the middle. + pub struct Fixup { + pub drop_left: u32, + pub drop_right: u32, + pub insert_len: u32, + pub insert_bytes: [u8; 4], + } + + impl Default for Fixup { + #[inline(always)] + fn default() -> Fixup { + Fixup { + drop_left: 0, + drop_right: 0, + insert_len: 0, + insert_bytes: [0; 4], + } + } + } + + #[inline(always)] + unsafe fn from_u32_unchecked(n: u32) -> char { + mem::transmute(n) + } + + pub struct SingleByteCharIndices<'a> { + inner: iter::Enumerate<slice::Iter<'a, u8>>, + } + + impl<'a> Iterator for SingleByteCharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + self.inner + .next() + .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) + } + } + + impl<'a> SingleByteCharIndices<'a> { + #[inline] + pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { + SingleByteCharIndices { + inner: buf.iter().enumerate(), + } + } + } +} + +/// Trait for format marker types. +/// +/// The type implementing this trait is usually not instantiated. +/// It's used with a phantom type parameter of `Tendril`. +pub unsafe trait Format { + /// Check whether the buffer is valid for this format. + fn validate(buf: &[u8]) -> bool; + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a prefix of a valid buffer. + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a suffix of a valid buffer. 
+ #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a contiguous subsequence + /// of a valid buffer, but not necessarily a prefix or + /// a suffix. + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Compute any fixup needed when concatenating buffers. + /// + /// The default is to do nothing. + /// + /// The function is `unsafe` because it may assume the input + /// buffers are already valid for the format. Also, no + /// bounds-checking is performed on the return value! + #[inline(always)] + unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { + Default::default() + } +} + +/// Indicates that one format is a subset of another. +/// +/// The subset format can be converted to the superset format +/// for free. +pub unsafe trait SubsetOf<Super>: Format +where + Super: Format, +{ + /// Validate the *other* direction of conversion; check if + /// this buffer from the superset format conforms to the + /// subset format. + /// + /// The default calls `Self::validate`, but some conversions + /// may implement a check which is cheaper than validating + /// from scratch. + fn revalidate_subset(x: &[u8]) -> bool { + Self::validate(x) + } +} + +/// Indicates a format which corresponds to a Rust slice type, +/// representing exactly the same invariants. +pub unsafe trait SliceFormat: Format + Sized { + type Slice: ?Sized + Slice; +} + +/// Indicates a format which contains characters from Unicode +/// (all of it, or some proper subset). +pub unsafe trait CharFormat<'a>: Format { + /// Iterator for characters and their byte indices. + type Iter: Iterator<Item = (usize, char)>; + + /// Iterate over the characters of the string and their byte + /// indices. + /// + /// You may assume the buffer is *already validated* for `Format`. 
+ unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; + + /// Encode the character as bytes and pass them to a continuation. + /// + /// Returns `Err(())` iff the character cannot be represented. + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]); +} + +/// Indicates a Rust slice type that is represented in memory as bytes. +pub unsafe trait Slice { + /// Access the raw bytes of the slice. + fn as_bytes(&self) -> &[u8]; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_bytes(x: &[u8]) -> &Self; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; +} + +/// Marker type for uninterpreted bytes. +/// +/// Validation will never fail for this format. +#[derive(Copy, Clone, Default, Debug)] +pub struct Bytes; + +unsafe impl Format for Bytes { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } +} + +unsafe impl SliceFormat for Bytes { + type Slice = [u8]; +} + +unsafe impl Slice for [u8] { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &[u8] { + x + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { + x + } +} + +/// Marker type for ASCII text. 
+#[derive(Copy, Clone, Default, Debug)] +pub struct ASCII; + +unsafe impl Format for ASCII { + #[inline] + fn validate(buf: &[u8]) -> bool { + buf.iter().all(|&n| n <= 127) + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl SubsetOf<UTF8> for ASCII {} +unsafe impl SubsetOf<Latin1> for ASCII {} + +unsafe impl<'a> CharFormat<'a> for ASCII { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0x7F { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} + +/// Marker type for UTF-8 text. +#[derive(Copy, Clone, Default, Debug)] +pub struct UTF8; + +unsafe impl Format for UTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + str::from_utf8(buf).is_ok() + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. 
+ }) => true, + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } +} + +unsafe impl SubsetOf<WTF8> for UTF8 {} + +unsafe impl SliceFormat for UTF8 { + type Slice = str; +} + +unsafe impl Slice for str { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &str { + str::from_utf8_unchecked(x) + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { + mem::transmute(x) + } +} + +unsafe impl<'a> CharFormat<'a> for UTF8 { + type Iter = str::CharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { + str::from_utf8_unchecked(buf).char_indices() + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); + Ok(()) + } +} + +/// Marker type for WTF-8 text. +/// +/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). 
+#[derive(Copy, Clone, Default, Debug)] +pub struct WTF8; + +#[inline] +fn wtf8_meaningful(m: Meaning) -> bool { + match m { + Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, + _ => false, + } +} + +unsafe impl Format for WTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + let mut i = 0; + let mut prev_lead = false; + while i < buf.len() { + let codept = unwrap_or_return!(futf::classify(buf, i), false); + if !wtf8_meaningful(codept.meaning) { + return false; + } + i += codept.bytes.len(); + prev_lead = match codept.meaning { + Meaning::TrailSurrogate(_) if prev_lead => return false, + Meaning::LeadSurrogate(_) => true, + _ => false, + }; + } + + true + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } + + #[inline] + unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { + const ERR: &'static str = "WTF8: internal error"; + + if lhs.len() >= 3 && rhs.len() >= 3 { + if let ( + Some(Codepoint { + meaning: Meaning::LeadSurrogate(hi), + .. + }), + Some(Codepoint { + meaning: Meaning::TrailSurrogate(lo), + .. 
+ }), + ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) + { + let mut fixup = imp::Fixup { + drop_left: 3, + drop_right: 3, + insert_len: 0, + insert_bytes: [0_u8; 4], + }; + + let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); + + let ch = char::from_u32(n).expect(ERR); + fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; + + return fixup; + } + } + + Default::default() + } +} + +/// Marker type for the single-byte encoding of the first 256 Unicode codepoints. +/// +/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the +/// C0 and C1 control characters from ECMA-48 / ISO 6429. +/// +/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the +/// many other aliases), which actually stand for Windows-1252. +#[derive(Copy, Clone, Default, Debug)] +pub struct Latin1; + +unsafe impl Format for Latin1 { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl<'a> CharFormat<'a> for Latin1 { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0xFF { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} diff --git a/vendor/tendril/src/lib.rs b/vendor/tendril/src/lib.rs new file mode 100644 index 000000000..33782fdc2 --- /dev/null +++ b/vendor/tendril/src/lib.rs @@ -0,0 +1,35 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// 
option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(all(test, feature = "bench"), feature(test))] +//#![cfg_attr(test, deny(warnings))] + +#[cfg(feature = "encoding")] +pub extern crate encoding; +#[cfg(feature = "encoding_rs")] +pub extern crate encoding_rs; +#[cfg(all(test, feature = "bench"))] +extern crate test; +#[macro_use] +extern crate mac; +extern crate futf; +extern crate utf8; + +pub use fmt::Format; +pub use stream::TendrilSink; +pub use tendril::{Atomic, Atomicity, NonAtomic, SendTendril}; +pub use tendril::{ByteTendril, ReadExt, SliceExt, StrTendril, SubtendrilError, Tendril}; +pub use utf8_decode::IncompleteUtf8; + +pub mod fmt; +pub mod stream; + +mod buf32; +mod tendril; +mod utf8_decode; +mod util; + +static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; diff --git a/vendor/tendril/src/stream.rs b/vendor/tendril/src/stream.rs new file mode 100644 index 000000000..469d58c9b --- /dev/null +++ b/vendor/tendril/src/stream.rs @@ -0,0 +1,752 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Streams of tendrils. + +use fmt; +use tendril::{Atomicity, NonAtomic, Tendril}; + +use std::borrow::Cow; +use std::fs::File; +use std::io; +use std::marker::PhantomData; +use std::path::Path; + +#[cfg(feature = "encoding")] +use encoding; +#[cfg(feature = "encoding_rs")] +use encoding_rs::{self, DecoderResult}; +use utf8; + +/// Trait for types that can process a tendril. +/// +/// This is a "push" interface, unlike the "pull" interface of +/// `Iterator<Item=Tendril<F>>`. The push interface matches +/// [html5ever][] and other incremental parsers with a similar +/// architecture. 
+/// +/// [html5ever]: https://github.com/servo/html5ever +pub trait TendrilSink<F, A = NonAtomic> +where + F: fmt::Format, + A: Atomicity, +{ + /// Process this tendril. + fn process(&mut self, t: Tendril<F, A>); + + /// Indicates that an error has occurred. + fn error(&mut self, desc: Cow<'static, str>); + + /// What the overall result of processing is. + type Output; + + /// Indicates the end of the stream. + fn finish(self) -> Self::Output; + + /// Process one tendril and finish. + fn one<T>(mut self, t: T) -> Self::Output + where + Self: Sized, + T: Into<Tendril<F, A>>, + { + self.process(t.into()); + self.finish() + } + + /// Consume an iterator of tendrils, processing each item, then finish. + fn from_iter<I>(mut self, i: I) -> Self::Output + where + Self: Sized, + I: IntoIterator, + I::Item: Into<Tendril<F, A>>, + { + for t in i { + self.process(t.into()) + } + self.finish() + } + + /// Read from the given stream of bytes until exhaustion and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> + where + Self: Sized, + R: io::Read, + F: fmt::SliceFormat<Slice = [u8]>, + { + const BUFFER_SIZE: u32 = 4 * 1024; + loop { + let mut tendril = Tendril::<F, A>::new(); + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec<u8>` for that reason. + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril) { + Ok(0) => return Ok(self.finish()), + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + self.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + } + } + + /// Read from the file at the given path and process incrementally, + /// then finish. Return `Err` at the first I/O error. 
+ fn from_file<P>(self, path: P) -> io::Result<Self::Output> + where + Self: Sized, + P: AsRef<Path>, + F: fmt::SliceFormat<Slice = [u8]>, + { + self.read_from(&mut File::open(path)?) + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, +/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This does not allocate memory: the output is either subtendrils of the input, +/// or inline tendrils for a single code point. +pub struct Utf8LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// The sink that receives the decoded UTF-8 tendrils and error reports. + pub inner_sink: Sink, + /// Partial code point left over from the end of the previous chunk, if any. + incomplete: Option<utf8::Incomplete>, + marker: PhantomData<A>, +} + +impl<Sink, A> Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental UTF-8 decoder. + #[inline] + pub fn new(inner_sink: Sink) -> Self { + Utf8LossyDecoder { + inner_sink: inner_sink, + incomplete: None, + marker: PhantomData, + } + } +} + +impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { + // FIXME: remove take() and map() when non-lexical borrows are stable. 
+ if let Some(mut incomplete) = self.incomplete.take() { + let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { + match result { + Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), + Err(_) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + } + t.len() - rest.len() + }); + match resume_at { + None => { + self.incomplete = Some(incomplete); + return; + } + Some(resume_at) => t.pop_front(resume_at as u32), + } + } + while !t.is_empty() { + let unborrowed_result = match utf8::decode(&t) { + Ok(s) => { + debug_assert!(s.as_ptr() == t.as_ptr()); + debug_assert!(s.len() == t.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } + return; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = t.subtendril(0, valid_len as u32); + unsafe { + self.inner_sink + .process(subtendril.reinterpret_without_validating()) + } + } + match and_then { + Ok(incomplete) => { + self.incomplete = Some(incomplete); + return; + } + Err(offset) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + t.pop_front(offset as u32); + } + } + } + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + self.inner_sink.error(desc); + } + + type Output = 
Sink::Output; + + #[inline] + fn finish(mut self) -> Sink::Output { + // A partial sequence still pending at end of stream is reported as an + // error and replaced by U+FFFD before the inner sink is finished. + if self.incomplete.is_some() { + self.inner_sink + .error("incomplete byte sequence at end of stream".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + self.inner_sink.finish() + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, +/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This allocates new tendrils for encodings other than UTF-8. +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +pub struct LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + inner: LossyDecoderInner<Sink, A>, +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +enum LossyDecoderInner<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + Utf8(Utf8LossyDecoder<Sink, A>), + #[cfg(feature = "encoding")] + Encoding(Box<encoding::RawDecoder>, Sink), + #[cfg(feature = "encoding_rs")] + EncodingRs(encoding_rs::Decoder, Sink), +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental decoder using the encoding crate. + #[cfg(feature = "encoding")] + #[inline] + pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { + if encoding.name() == "utf-8" { + LossyDecoder::utf8(sink) + } else { + LossyDecoder { + inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), + } + } + } + + /// Create a new incremental decoder using the encoding_rs crate. 
+ #[cfg(feature = "encoding_rs")] + #[inline] + pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { + // UTF-8 is routed to the dedicated `Utf8LossyDecoder` path instead of + // an `encoding_rs` decoder. + if encoding == encoding_rs::UTF_8 { + return Self::utf8(sink); + } + Self { + inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), + } + } + + /// Create a new incremental decoder for the UTF-8 encoding. + /// + /// This is useful for content that is known at run-time to be UTF-8 + /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.) + #[inline] + pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> { + LossyDecoder { + inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), + } + } + + /// Give a reference to the inner sink. + pub fn inner_sink(&self) -> &Sink { + match self.inner { + LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, + } + } + + /// Give a mutable reference to the inner sink. 
+ pub fn inner_sink_mut(&mut self) -> &mut Sink { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, + } + } +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, t: Tendril<fmt::Bytes, A>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { + let mut out = Tendril::new(); + let mut t = t; + loop { + match decoder.raw_feed(&*t, &mut out) { + (_, Some(err)) => { + out.push_char('\u{fffd}'); + sink.error(err.cause); + debug_assert!(err.upto >= 0); + t.pop_front(err.upto as u32); + // continue loop and process remainder of t + } + (_, None) => break, + } + } + if out.len() > 0 { + sink.process(out); + } + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { + if t.is_empty() { + return; + } + decode_to_sink(t, decoder, sink, false); + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), + } + } + + type Output = Sink::Output; + + #[inline] + fn finish(self) -> Sink::Output { + match self.inner { + LossyDecoderInner::Utf8(utf8) => return utf8.finish(), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(mut decoder, mut sink) => { + let mut out = Tendril::new(); + if let 
Some(err) = decoder.raw_finish(&mut out) { + out.push_char('\u{fffd}'); + sink.error(err.cause); + } + if out.len() > 0 { + sink.process(out); + } + sink.finish() + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { + decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); + sink.finish() + } + } + } +} + +/// Run `t` through `decoder`, forwarding the decoded UTF-8 to `sink`. +/// +/// Output is produced in buffers of at most 8192 bytes. Each malformed +/// sequence is reported through `sink.error` and replaced by U+FFFD. +/// `last` is passed straight to `decode_to_utf8_without_replacement`. +#[cfg(feature = "encoding_rs")] +fn decode_to_sink<Sink, A>( + mut t: Tendril<fmt::Bytes, A>, + decoder: &mut encoding_rs::Decoder, + sink: &mut Sink, + last: bool, +) where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + loop { + let mut out = <Tendril<fmt::Bytes, A>>::new(); + let max_len = decoder + .max_utf8_buffer_length_without_replacement(t.len()) + .unwrap_or(8192); + unsafe { + // Clamp before narrowing: casting first would truncate a `usize` + // above `u32::MAX`, possibly down to an empty output buffer. + out.push_uninitialized(std::cmp::min(max_len, 8192) as u32); + } + let (result, bytes_read, bytes_written) = + decoder.decode_to_utf8_without_replacement(&t, &mut out, last); + if bytes_written > 0 { + sink.process(unsafe { + out.subtendril(0, bytes_written as u32) + .reinterpret_without_validating() + }); + } + match result { + DecoderResult::InputEmpty => return, + DecoderResult::OutputFull => {} + DecoderResult::Malformed(_, _) => { + sink.error(Cow::Borrowed("invalid sequence")); + sink.process("\u{FFFD}".into()); + } + } + t.pop_front(bytes_read as u32); + if t.is_empty() { + return; + } + } +} + +#[cfg(test)] +mod test { + use super::{TendrilSink, Utf8LossyDecoder}; + use fmt; + use std::borrow::Cow; + use tendril::{Atomicity, NonAtomic, Tendril}; + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use super::LossyDecoder; + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use tendril::SliceExt; + + #[cfg(feature = "encoding")] + use encoding::all as enc; + #[cfg(feature = "encoding_rs")] + use encoding_rs as enc_rs; + + struct Accumulate<A> + where + A: Atomicity, + { + tendrils: Vec<Tendril<fmt::UTF8, A>>, + errors: Vec<String>, + } + + impl<A> Accumulate<A> + where + A: Atomicity, + { + 
fn new() -> Accumulate<A> { + Accumulate { + tendrils: vec![], + errors: vec![], + } + } + } + + impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A> + where + A: Atomicity, + { + fn process(&mut self, t: Tendril<fmt::UTF8, A>) { + self.tendrils.push(t); + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.errors.push(desc.into_owned()); + } + + type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>); + + fn finish(self) -> Self::Output { + (self.tendrils, self.errors) + } + } + + fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { + let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); + let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); + assert_eq!( + expected, + &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>() + ); + assert_eq!(errs, errors.len()); + } + + #[test] + fn utf8() { + check_utf8(&[], &[], 0); + check_utf8(&[b""], &[], 0); + check_utf8(&[b"xyz"], &["xyz"], 0); + check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); + + check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); + check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8( + &[b"xy\xEA", b"\x99", b"\xAEzw"], + &["xy", "\u{a66e}z", "w"], + 0, + ); + check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); + check_utf8( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + &["\u{a66e}"], + 0, + ); + + check_utf8( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], + 4, + ); + check_utf8( + &[b"xy\xEA\x99", b"\xFFz"], + &["xy", "\u{fffd}", "\u{fffd}", "z"], + 2, + ); + + check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); + check_utf8( + &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], + &["ő", 
/// Push every chunk of `input` through `decoder`, then assert that the
/// concatenation of everything the sink received equals `expected` and that
/// exactly `errs` errors were reported.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
fn check_decode(
    mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
    input: &[&[u8]],
    expected: &str,
    errs: usize,
) {
    for chunk in input.iter() {
        decoder.process(chunk.to_tendril());
    }
    let (pieces, errors) = decoder.finish();
    // Collecting `&Tendril`s starts from an empty tendril and appends each
    // piece in order with `push_tendril`.
    let joined: Tendril<fmt::UTF8> = pieces.iter().collect();
    assert_eq!(expected, &*joined);
    assert_eq!(errs, errors.len());
}
/// Run the shared `UTF_8` test table through the `encoding_rs` backend.
#[cfg(feature = "encoding_rs")]
#[test]
fn decode_utf8_encoding_rs() {
    for case in UTF_8.iter() {
        let (input, expected, errs) = *case;
        check_decode(
            LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()),
            input,
            expected,
            errs,
        );
    }
}
/// `read_from` should decode everything available from an `io::Read`,
/// replacing the invalid byte with U+FFFD and reporting one error.
#[test]
fn read_from() {
    let mut bytes: &[u8] = b"foo\xffbar";
    let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
    let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();

    let pieces = tendrils.iter().map(|t| &**t).collect::<Vec<_>>();
    assert_eq!(&*pieces, &["foo", "\u{FFFD}", "bar"]);
    assert_eq!(errors, &["invalid byte sequence"]);
}
+ +use std::borrow::Borrow; +use std::cell::{Cell, UnsafeCell}; +use std::cmp::Ordering; +use std::default::Default; +use std::fmt as strfmt; +use std::iter::FromIterator; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{self, AtomicUsize}; +use std::{hash, io, mem, ptr, str, u32}; + +#[cfg(feature = "encoding")] +use encoding::{self, DecoderTrap, EncoderTrap, EncodingRef}; + +use buf32::{self, Buf32}; +use fmt::imp::Fixup; +use fmt::{self, Slice}; +use util::{copy_and_advance, copy_lifetime, copy_lifetime_mut, unsafe_slice, unsafe_slice_mut}; +use OFLOW; + +const MAX_INLINE_LEN: usize = 8; +const MAX_INLINE_TAG: usize = 0xF; +const EMPTY_TAG: usize = 0xF; + +#[inline(always)] +fn inline_tag(len: u32) -> NonZeroUsize { + debug_assert!(len <= MAX_INLINE_LEN as u32); + unsafe { NonZeroUsize::new_unchecked(if len == 0 { EMPTY_TAG } else { len as usize }) } +} + +/// The multithreadedness of a tendril. +/// +/// Exactly two types implement this trait: +/// +/// - `Atomic`: use this in your tendril and you will have a `Send` tendril which works +/// across threads; this is akin to `Arc`. +/// +/// - `NonAtomic`: use this in your tendril and you will have a tendril which is neither +/// `Send` nor `Sync` but should be a tad faster; this is akin to `Rc`. +/// +/// The layout of this trait is also mandated to be that of a `usize`, +/// for it is used for reference counting. +pub unsafe trait Atomicity: 'static { + #[doc(hidden)] + fn new() -> Self; + + #[doc(hidden)] + fn increment(&self) -> usize; + + #[doc(hidden)] + fn decrement(&self) -> usize; + + #[doc(hidden)] + fn fence_acquire(); +} + +/// A marker of a non-atomic tendril. +/// +/// This is the default for the second type parameter of a `Tendril` +/// and so doesn't typically need to be written. +/// +/// This is akin to using `Rc` for reference counting. 
+#[repr(C)] +pub struct NonAtomic(Cell<usize>); + +unsafe impl Atomicity for NonAtomic { + #[inline] + fn new() -> Self { + NonAtomic(Cell::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + let value = self.0.get(); + self.0.set(value.checked_add(1).expect(OFLOW)); + value + } + + #[inline] + fn decrement(&self) -> usize { + let value = self.0.get(); + self.0.set(value - 1); + value + } + + #[inline] + fn fence_acquire() {} +} + +/// A marker of an atomic (and hence concurrent) tendril. +/// +/// This is used as the second, optional type parameter of a `Tendril`; +/// `Tendril<F, Atomic>` thus implements`Send`. +/// +/// This is akin to using `Arc` for reference counting. +pub struct Atomic(AtomicUsize); + +unsafe impl Atomicity for Atomic { + #[inline] + fn new() -> Self { + Atomic(AtomicUsize::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + // Relaxed is OK because we have a reference already. + self.0.fetch_add(1, AtomicOrdering::Relaxed) + } + + #[inline] + fn decrement(&self) -> usize { + self.0.fetch_sub(1, AtomicOrdering::Release) + } + + #[inline] + fn fence_acquire() { + atomic::fence(AtomicOrdering::Acquire); + } +} + +#[repr(C)] // Preserve field order for cross-atomicity transmutes +struct Header<A: Atomicity> { + refcount: A, + cap: u32, +} + +impl<A> Header<A> +where + A: Atomicity, +{ + #[inline(always)] + unsafe fn new() -> Header<A> { + Header { + refcount: A::new(), + cap: 0, + } + } +} + +/// Errors that can occur when slicing a `Tendril`. +#[derive(Copy, Clone, Hash, Debug, PartialEq, Eq)] +pub enum SubtendrilError { + OutOfBounds, + ValidationFailed, +} + +/// Compact string type for zero-copy parsing. +/// +/// `Tendril`s have the semantics of owned strings, but are sometimes views +/// into shared buffers. When you mutate a `Tendril`, an owned copy is made +/// if necessary. Further mutations occur in-place until the string becomes +/// shared, e.g. with `clone()` or `subtendril()`. 
/// The heap variant of `Buffer`: bookkeeping for a tendril whose bytes live
/// in a separately allocated buffer rather than inline.
#[derive(Copy, Clone)]
#[repr(C)]
struct Heap {
    /// Length of the tendril's contents (`Tendril::owned` stores the
    /// buffer's `len` here).
    len: u32,
    /// Auxiliary word whose meaning depends on ownership. `Tendril::owned`
    /// stores the buffer capacity here; `assume_buf` treats the value as the
    /// capacity for an owned buffer and as the offset into the buffer for a
    /// shared one.
    /// NOTE(review): presumably read and written via `aux()`/`set_aux()`,
    /// which are not visible in this chunk; confirm.
    aux: u32,
}
+pub type ByteTendril = Tendril<fmt::Bytes>; + +impl<F, A> Clone for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn clone(&self) -> Tendril<F, A> { + unsafe { + if self.ptr.get().get() > MAX_INLINE_TAG { + self.make_buf_shared(); + self.incref(); + } + + ptr::read(self) + } + } +} + +impl<F, A> Drop for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn drop(&mut self) { + unsafe { + let p = self.ptr.get().get(); + if p <= MAX_INLINE_TAG { + return; + } + + let (buf, shared, _) = self.assume_buf(); + if shared { + let header = self.header(); + if (*header).refcount.decrement() == 1 { + A::fence_acquire(); + buf.destroy(); + } + } else { + buf.destroy(); + } + } + } +} + +macro_rules! from_iter_method { + ($ty:ty) => { + #[inline] + fn from_iter<I>(iterable: I) -> Self + where + I: IntoIterator<Item = $ty>, + { + let mut output = Self::new(); + output.extend(iterable); + output + } + }; +} + +impl<A> Extend<char> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = char>, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for c in iterator { + self.push_char(c); + } + } +} + +impl<A> FromIterator<char> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + from_iter_method!(char); +} + +impl<A> Extend<u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = u8>, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for b in iterator { + self.push_slice(&[b]); + } + } +} + +impl<A> FromIterator<u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(u8); +} + +impl<'a, A> Extend<&'a u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a u8>, + { + let 
iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for &b in iterator { + self.push_slice(&[b]); + } + } +} + +impl<'a, A> FromIterator<&'a u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(&'a u8); +} + +impl<'a, A> Extend<&'a str> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a str>, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a str> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + from_iter_method!(&'a str); +} + +impl<'a, A> Extend<&'a [u8]> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a [u8]>, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a [u8]> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(&'a [u8]); +} + +impl<'a, F, A> Extend<&'a Tendril<F, A>> for Tendril<F, A> +where + F: fmt::Format + 'a, + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a Tendril<F, A>>, + { + for t in iterable { + self.push_tendril(t); + } + } +} + +impl<'a, F, A> FromIterator<&'a Tendril<F, A>> for Tendril<F, A> +where + F: fmt::Format + 'a, + A: Atomicity, +{ + from_iter_method!(&'a Tendril<F, A>); +} + +impl<F, A> Deref for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + type Target = F::Slice; + + #[inline] + fn deref(&self) -> &F::Slice { + unsafe { F::Slice::from_bytes(self.as_byte_slice()) } + } +} + +impl<F, A> DerefMut for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn deref_mut(&mut self) -> &mut F::Slice { + unsafe { F::Slice::from_mut_bytes(self.as_mut_byte_slice()) } + } +} + +impl<F, A> Borrow<[u8]> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + fn borrow(&self) -> &[u8] { + 
self.as_byte_slice() + } +} + +// Why not impl Borrow<str> for Tendril<fmt::UTF8>? str and [u8] hash differently, +// and so a HashMap<StrTendril, _> would silently break if we indexed by str. Ick. +// https://github.com/rust-lang/rust/issues/27108 + +impl<F, A> PartialEq for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + self.as_byte_slice() == other.as_byte_slice() + } + + #[inline] + fn ne(&self, other: &Self) -> bool { + self.as_byte_slice() != other.as_byte_slice() + } +} + +impl<F, A> Eq for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ +} + +impl<F, A> PartialOrd for Tendril<F, A> +where + F: fmt::SliceFormat, + <F as fmt::SliceFormat>::Slice: PartialOrd, + A: Atomicity, +{ + #[inline] + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + PartialOrd::partial_cmp(&**self, &**other) + } +} + +impl<F, A> Ord for Tendril<F, A> +where + F: fmt::SliceFormat, + <F as fmt::SliceFormat>::Slice: Ord, + A: Atomicity, +{ + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} + +impl<F, A> Default for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline(always)] + fn default() -> Tendril<F, A> { + Tendril::new() + } +} + +impl<F, A> strfmt::Debug for Tendril<F, A> +where + F: fmt::SliceFormat + Default + strfmt::Debug, + <F as fmt::SliceFormat>::Slice: strfmt::Debug, + A: Atomicity, +{ + #[inline] + fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { + let kind = match self.ptr.get().get() { + p if p <= MAX_INLINE_TAG => "inline", + p if p & 1 == 1 => "shared", + _ => "owned", + }; + + write!(f, "Tendril<{:?}>({}: ", <F as Default>::default(), kind)?; + <<F as fmt::SliceFormat>::Slice as strfmt::Debug>::fmt(&**self, f)?; + write!(f, ")") + } +} + +impl<F, A> hash::Hash for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn hash<H: hash::Hasher>(&self, hasher: &mut H) { + 
self.as_byte_slice().hash(hasher) + } +} + +impl<F, A> Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + /// Create a new, empty `Tendril` in any format. + #[inline(always)] + pub fn new() -> Tendril<F, A> { + unsafe { Tendril::inline(&[]) } + } + + /// Create a new, empty `Tendril` with a specified capacity. + #[inline] + pub fn with_capacity(capacity: u32) -> Tendril<F, A> { + let mut t: Tendril<F, A> = Tendril::new(); + if capacity > MAX_INLINE_LEN as u32 { + unsafe { + t.make_owned_with_capacity(capacity); + } + } + t + } + + /// Reserve space for additional bytes. + /// + /// This is only a suggestion. There are cases where `Tendril` will + /// decline to allocate until the buffer is actually modified. + #[inline] + pub fn reserve(&mut self, additional: u32) { + if !self.is_shared() { + // Don't grow a shared tendril because we'd have to copy + // right away. + self.force_reserve(additional); + } + } + + /// Reserve space for additional bytes, even for shared buffers. + #[inline] + fn force_reserve(&mut self, additional: u32) { + let new_len = self.len32().checked_add(additional).expect(OFLOW); + if new_len > MAX_INLINE_LEN as u32 { + unsafe { + self.make_owned_with_capacity(new_len); + } + } + } + + /// Get the length of the `Tendril`. + /// + /// This is named not to conflict with `len()` on the underlying + /// slice, if any. + #[inline(always)] + pub fn len32(&self) -> u32 { + match self.ptr.get().get() { + EMPTY_TAG => 0, + n if n <= MAX_INLINE_LEN => n as u32, + _ => unsafe { self.raw_len() }, + } + } + + /// Is the backing buffer shared? + #[inline] + pub fn is_shared(&self) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && ((n & 1) == 1) + } + + /// Is the backing buffer shared with this other `Tendril`? 
+ #[inline] + pub fn is_shared_with(&self, other: &Tendril<F, A>) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && (n == other.ptr.get().get()) + } + + /// Truncate to length 0 without discarding any owned storage. + #[inline] + pub fn clear(&mut self) { + if self.ptr.get().get() <= MAX_INLINE_TAG { + self.ptr + .set(unsafe { NonZeroUsize::new_unchecked(EMPTY_TAG) }); + } else { + let (_, shared, _) = unsafe { self.assume_buf() }; + if shared { + // No need to keep a reference alive for a 0-size slice. + *self = Tendril::new(); + } else { + unsafe { self.set_len(0) }; + } + } + } + + /// Build a `Tendril` by copying a byte slice, if it conforms to the format. + #[inline] + pub fn try_from_byte_slice(x: &[u8]) -> Result<Tendril<F, A>, ()> { + match F::validate(x) { + true => Ok(unsafe { Tendril::from_byte_slice_without_validating(x) }), + false => Err(()), + } + } + + /// View as uninterpreted bytes. + #[inline(always)] + pub fn as_bytes(&self) -> &Tendril<fmt::Bytes, A> { + unsafe { mem::transmute(self) } + } + + /// Convert into uninterpreted bytes. + #[inline(always)] + pub fn into_bytes(self) -> Tendril<fmt::Bytes, A> { + unsafe { mem::transmute(self) } + } + + /// Convert `self` into a type which is `Send`. + /// + /// If the tendril is owned or inline, this is free, + /// but if it's shared this will entail a copy of the contents. + #[inline] + pub fn into_send(mut self) -> SendTendril<F> { + self.make_owned(); + SendTendril { + // This changes the header.refcount from A to NonAtomic, but that's + // OK because we have defined the format of A as a usize. + tendril: unsafe { mem::transmute(self) }, + } + } + + /// View as a superset format, for free. + #[inline(always)] + pub fn as_superset<Super>(&self) -> &Tendril<Super, A> + where + F: fmt::SubsetOf<Super>, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// Convert into a superset format, for free. 
+ #[inline(always)] + pub fn into_superset<Super>(self) -> Tendril<Super, A> + where + F: fmt::SubsetOf<Super>, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// View as a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_as_subset<Sub>(&self) -> Result<&Tendril<Sub, A>, ()> + where + Sub: fmt::SubsetOf<F>, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_into_subset<Sub>(self) -> Result<Tendril<Sub, A>, Self> + where + Sub: fmt::SubsetOf<F>, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// View as another format, if the bytes of the `Tendril` are valid for + /// that format. + #[inline] + pub fn try_reinterpret_view<Other>(&self) -> Result<&Tendril<Other, A>, ()> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into another format, if the `Tendril` conforms to that format. + /// + /// This only re-validates the existing bytes under the new format. It + /// will *not* change the byte content of the tendril! + /// + /// See the `encode` and `decode` methods for character encoding conversion. + #[inline] + pub fn try_reinterpret<Other>(self) -> Result<Tendril<Other, A>, Self> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// Push some bytes onto the end of the `Tendril`, if they conform to the + /// format. 
+ #[inline] + pub fn try_push_bytes(&mut self, buf: &[u8]) -> Result<(), ()> { + match F::validate(buf) { + true => unsafe { + self.push_bytes_without_validating(buf); + Ok(()) + }, + false => Err(()), + } + } + + /// Push another `Tendril` onto the end of this one. + #[inline] + pub fn push_tendril(&mut self, other: &Tendril<F, A>) { + let new_len = self.len32().checked_add(other.len32()).expect(OFLOW); + + unsafe { + if (self.ptr.get().get() > MAX_INLINE_TAG) && (other.ptr.get().get() > MAX_INLINE_TAG) { + let (self_buf, self_shared, _) = self.assume_buf(); + let (other_buf, other_shared, _) = other.assume_buf(); + + if self_shared + && other_shared + && (self_buf.data_ptr() == other_buf.data_ptr()) + && other.aux() == self.aux() + self.raw_len() + { + self.set_len(new_len); + return; + } + } + + self.push_bytes_without_validating(other.as_byte_slice()) + } + } + + /// Attempt to slice this `Tendril` as a new `Tendril`. + /// + /// This will share the buffer when possible. Mutating a shared buffer + /// will copy the contents. + /// + /// The offset and length are in bytes. The function will return + /// `Err` if these are out of bounds, or if the resulting slice + /// does not conform to the format. + #[inline] + pub fn try_subtendril( + &self, + offset: u32, + length: u32, + ) -> Result<Tendril<F, A>, SubtendrilError> { + let self_len = self.len32(); + if offset > self_len || length > (self_len - offset) { + return Err(SubtendrilError::OutOfBounds); + } + + unsafe { + let byte_slice = unsafe_slice(self.as_byte_slice(), offset as usize, length as usize); + if !F::validate_subseq(byte_slice) { + return Err(SubtendrilError::ValidationFailed); + } + + Ok(self.unsafe_subtendril(offset, length)) + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Panics on bounds or validity check failure. 
+ #[inline] + pub fn subtendril(&self, offset: u32, length: u32) -> Tendril<F, A> { + self.try_subtendril(offset, length).unwrap() + } + + /// Try to drop `n` bytes from the front. + /// + /// Returns `Err` if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn try_pop_front(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_suffix(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_front(n); + Ok(()) + } + } + + /// Drop `n` bytes from the front. + /// + /// Panics if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn pop_front(&mut self, n: u32) { + self.try_pop_front(n).unwrap() + } + + /// Drop `n` bytes from the back. + /// + /// Returns `Err` if the bytes are not available, or the prefix fails + /// validation. + #[inline] + pub fn try_pop_back(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_prefix(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_back(n); + Ok(()) + } + } + + /// Drop `n` bytes from the back. + /// + /// Panics if the bytes are not available, or the prefix fails + /// validation. + #[inline] + pub fn pop_back(&mut self, n: u32) { + self.try_pop_back(n).unwrap() + } + + /// View as another format, without validating. 
+ #[inline(always)] + pub unsafe fn reinterpret_view_without_validating<Other>(&self) -> &Tendril<Other, A> + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Convert into another format, without validating. + #[inline(always)] + pub unsafe fn reinterpret_without_validating<Other>(self) -> Tendril<Other, A> + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Build a `Tendril` by copying a byte slice, without validating. + #[inline] + pub unsafe fn from_byte_slice_without_validating(x: &[u8]) -> Tendril<F, A> { + assert!(x.len() <= buf32::MAX_LEN); + if x.len() <= MAX_INLINE_LEN { + Tendril::inline(x) + } else { + Tendril::owned_copy(x) + } + } + + /// Push some bytes onto the end of the `Tendril`, without validating. + #[inline] + pub unsafe fn push_bytes_without_validating(&mut self, buf: &[u8]) { + assert!(buf.len() <= buf32::MAX_LEN); + + let Fixup { + drop_left, + drop_right, + insert_len, + insert_bytes, + } = F::fixup(self.as_byte_slice(), buf); + + // FIXME: think more about overflow + let adj_len = self.len32() + insert_len - drop_left; + + let new_len = adj_len.checked_add(buf.len() as u32).expect(OFLOW) - drop_right; + + let drop_left = drop_left as usize; + let drop_right = drop_right as usize; + + if new_len <= MAX_INLINE_LEN as u32 { + let mut tmp = [0_u8; MAX_INLINE_LEN]; + { + let old = self.as_byte_slice(); + let mut dest = tmp.as_mut_ptr(); + copy_and_advance(&mut dest, unsafe_slice(old, 0, old.len() - drop_left)); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + } + *self = Tendril::inline(&tmp[..new_len as usize]); + } else { + self.make_owned_with_capacity(new_len); + let (owned, _, _) = self.assume_buf(); + let mut dest = owned + .data_ptr() + .offset((owned.len as usize - drop_left) as isize); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len 
as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + self.set_len(new_len); + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_subtendril(&self, offset: u32, length: u32) -> Tendril<F, A> { + if length <= MAX_INLINE_LEN as u32 { + Tendril::inline(unsafe_slice( + self.as_byte_slice(), + offset as usize, + length as usize, + )) + } else { + self.make_buf_shared(); + self.incref(); + let (buf, _, _) = self.assume_buf(); + Tendril::shared(buf, self.aux() + offset, length) + } + } + + /// Drop `n` bytes from the front. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_pop_front(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )); + } else { + self.make_buf_shared(); + self.set_aux(self.aux() + n); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + /// Drop `n` bytes from the back. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_pop_back(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)); + } else { + self.make_buf_shared(); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + #[inline] + unsafe fn incref(&self) { + (*self.header()).refcount.increment(); + } + + #[inline] + unsafe fn make_buf_shared(&self) { + let p = self.ptr.get().get(); + if p & 1 == 0 { + let header = p as *mut Header<A>; + (*header).cap = self.aux(); + + self.ptr.set(NonZeroUsize::new_unchecked(p | 1)); + self.set_aux(0); + } + } + + // This is not public as it is of no practical value to users. 
+ // By and large they shouldn't need to worry about the distinction at all, + // and going out of your way to make it owned is pointless. + #[inline] + fn make_owned(&mut self) { + unsafe { + let ptr = self.ptr.get().get(); + if ptr <= MAX_INLINE_TAG || (ptr & 1) == 1 { + *self = Tendril::owned_copy(self.as_byte_slice()); + } + } + } + + #[inline] + unsafe fn make_owned_with_capacity(&mut self, cap: u32) { + self.make_owned(); + let mut buf = self.assume_buf().0; + buf.grow(cap); + self.ptr.set(NonZeroUsize::new_unchecked(buf.ptr as usize)); + self.set_aux(buf.cap); + } + + #[inline(always)] + unsafe fn header(&self) -> *mut Header<A> { + (self.ptr.get().get() & !1) as *mut Header<A> + } + + #[inline] + unsafe fn assume_buf(&self) -> (Buf32<Header<A>>, bool, u32) { + let ptr = self.ptr.get().get(); + let header = self.header(); + let shared = (ptr & 1) == 1; + let (cap, offset) = match shared { + true => ((*header).cap, self.aux()), + false => (self.aux(), 0), + }; + + ( + Buf32 { + ptr: header, + len: offset + self.len32(), + cap: cap, + }, + shared, + offset, + ) + } + + #[inline] + unsafe fn inline(x: &[u8]) -> Tendril<F, A> { + let len = x.len(); + let t = Tendril { + ptr: Cell::new(inline_tag(len as u32)), + buf: UnsafeCell::new(Buffer { inline: [0; 8] }), + marker: PhantomData, + refcount_marker: PhantomData, + }; + ptr::copy_nonoverlapping(x.as_ptr(), (*t.buf.get()).inline.as_mut_ptr(), len); + t + } + + #[inline] + unsafe fn owned(x: Buf32<Header<A>>) -> Tendril<F, A> { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked(x.ptr as usize)), + buf: UnsafeCell::new(Buffer { + heap: Heap { + len: x.len, + aux: x.cap, + }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + unsafe fn owned_copy(x: &[u8]) -> Tendril<F, A> { + let len32 = x.len() as u32; + let mut b = Buf32::with_capacity(len32, Header::new()); + ptr::copy_nonoverlapping(x.as_ptr(), b.data_ptr(), x.len()); + b.len = len32; + Tendril::owned(b) + } + + #[inline] 
+ unsafe fn shared(buf: Buf32<Header<A>>, off: u32, len: u32) -> Tendril<F, A> { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked((buf.ptr as usize) | 1)), + buf: UnsafeCell::new(Buffer { + heap: Heap { len, aux: off }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + fn as_byte_slice<'a>(&'a self) -> &'a [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &[], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked(..n), + _ => { + let (buf, _, offset) = self.assume_buf(); + copy_lifetime( + self, + unsafe_slice(buf.data(), offset as usize, self.len32() as usize), + ) + } + } + } + } + + // There's no need to worry about locking on an atomic Tendril, because it makes it unique as + // soon as you do that. + #[inline] + fn as_mut_byte_slice<'a>(&'a mut self) -> &'a mut [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &mut [], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked_mut(..n), + _ => { + self.make_owned(); + let (mut buf, _, offset) = self.assume_buf(); + let len = self.len32() as usize; + copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) + } + } + } + } + + unsafe fn raw_len(&self) -> u32 { + (*self.buf.get()).heap.len + } + + unsafe fn set_len(&mut self, len: u32) { + (*self.buf.get()).heap.len = len; + } + + unsafe fn aux(&self) -> u32 { + (*self.buf.get()).heap.aux + } + + unsafe fn set_aux(&self, aux: u32) { + (*self.buf.get()).heap.aux = aux; + } +} + +impl<F, A> Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + /// Build a `Tendril` by copying a slice. + #[inline] + pub fn from_slice(x: &F::Slice) -> Tendril<F, A> { + unsafe { Tendril::from_byte_slice_without_validating(x.as_bytes()) } + } + + /// Push a slice onto the end of the `Tendril`. 
#[inline]
pub fn push_slice(&mut self, x: &F::Slice) {
    let bytes = x.as_bytes();
    unsafe { self.push_bytes_without_validating(bytes) }
}
} // closes the `F: fmt::SliceFormat` impl opened above this span

/// A thin wrapper that lets a `Tendril` be `Send`.
///
/// A `SendTendril` could sensibly offer some of `Tendril`'s operations, but
/// to keep concerns separate it deliberately offers none: it is an opaque
/// value that safely encapsulates the invariants under which sending is
/// permitted.
///
/// Obtain one with `Tendril.into_send()` or `SendTendril::from(tendril)`;
/// turn it back into a `Tendril` with `Tendril::from(self)`.
#[derive(Clone)]
pub struct SendTendril<F>
where
    F: fmt::Format,
{
    tendril: Tendril<F>,
}

unsafe impl<F> Send for SendTendril<F> where F: fmt::Format {}

impl<F, A> From<Tendril<F, A>> for SendTendril<F>
where
    F: fmt::Format,
    A: Atomicity,
{
    /// Delegates to `Tendril::into_send`.
    #[inline]
    fn from(tendril: Tendril<F, A>) -> Self {
        tendril.into_send()
    }
}

impl<F, A> From<SendTendril<F>> for Tendril<F, A>
where
    F: fmt::Format,
    A: Atomicity,
{
    #[inline]
    fn from(send: SendTendril<F>) -> Self {
        // header.refcount may have been initialised as an Atomic or a NonAtomic, but the value
        // will be the same (1) regardless, because the layout is defined.
        // Thus we don't need to fiddle about resetting it or anything like that.
        let inner = send.tendril;
        unsafe { mem::transmute(inner) }
    }
}

/// `Tendril`-related methods for Rust slices.
pub trait SliceExt<F>: fmt::Slice
where
    F: fmt::SliceFormat<Slice = Self>,
{
    /// Make a `Tendril` from this slice.
+ #[inline] + fn to_tendril(&self) -> Tendril<F> { + // It should be done thusly, but at the time of writing the defaults don't help inference: + //fn to_tendril<A = NonAtomic>(&self) -> Tendril<Self::Format, A> + // where A: Atomicity, + //{ + Tendril::from_slice(self) + } +} + +impl SliceExt<fmt::UTF8> for str {} +impl SliceExt<fmt::Bytes> for [u8] {} + +impl<F, A> Tendril<F, A> +where + F: for<'a> fmt::CharFormat<'a>, + A: Atomicity, +{ + /// Remove and return the first character, if any. + #[inline] + pub fn pop_front_char<'a>(&'a mut self) -> Option<char> { + unsafe { + let next_char; // first char in iterator + let mut skip = 0; // number of bytes to skip, or 0 to clear + + { + // <--+ + // | Creating an iterator borrows self, so introduce a + // +- scope to contain the borrow (that way we can mutate + // self below, after this scope exits). + + let mut iter = F::char_indices(self.as_byte_slice()); + match iter.next() { + Some((_, c)) => { + next_char = Some(c); + if let Some((n, _)) = iter.next() { + skip = n as u32; + } + } + None => { + next_char = None; + } + } + } + + if skip != 0 { + self.unsafe_pop_front(skip); + } else { + self.clear(); + } + + next_char + } + } + + /// Remove and return a run of characters at the front of the `Tendril` + /// which are classified the same according to the function `classify`. + /// + /// Returns `None` on an empty string. 
#[inline]
pub fn pop_front_char_run<'a, C, R>(&'a mut self, mut classify: C) -> Option<(Tendril<F, A>, R)>
where
    C: FnMut(char) -> R,
    R: PartialEq,
{
    let (class, first_mismatch);
    {
        // Scope the iterator so its borrow of `self` ends before the
        // mutation below.
        let mut chars = unsafe { F::char_indices(self.as_byte_slice()) };
        let (_, first) = unwrap_or_return!(chars.next(), None);
        class = classify(first);
        // Comparison operators already take their operands by reference,
        // so no explicit `&` is needed on either side.
        first_mismatch = chars.find(|&(_, ch)| classify(ch) != class);
    }

    match first_mismatch {
        // The run ends at byte `idx`: split it off and drop it from `self`.
        Some((idx, _)) => unsafe {
            let t = self.unsafe_subtendril(0, idx as u32);
            self.unsafe_pop_front(idx as u32);
            Some((t, class))
        },
        // Every character shares the class: the run is the whole tendril.
        None => {
            let t = self.clone();
            self.clear();
            Some((t, class))
        }
    }
}

/// Push a character, if it can be represented in this format.
///
/// Returns whatever `F::encode_char` returns; the encoded bytes it hands
/// to the callback are appended without further validation.
#[inline]
pub fn try_push_char(&mut self, c: char) -> Result<(), ()> {
    F::encode_char(c, |b| unsafe {
        self.push_bytes_without_validating(b);
    })
}
} // closes the `F: fmt::CharFormat` impl opened above this span

/// Extension trait for `io::Read`.
pub trait ReadExt: io::Read {
    /// Read from `self` into `buf` until a read returns `Ok(0)`, returning
    /// the number of bytes appended.
    fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize>
    where
        A: Atomicity;
}

impl<T> ReadExt for T
where
    T: io::Read,
{
    /// Read all bytes until EOF.
    ///
    /// Bytes are appended to `buf`. Reads that fail with
    /// `io::ErrorKind::Interrupted` are retried; any other error is
    /// returned after `buf` has been trimmed back to the bytes actually
    /// read so far.
    fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize>
    where
        A: Atomicity,
    {
        // Adapted from libstd/io/mod.rs.
        const DEFAULT_BUF_SIZE: u32 = 64 * 1024;

        let start_len = buf.len();
        let mut len = start_len;
        let mut new_write_size = 16;
        let ret;
        loop {
            // The spare area is used up: grow by a chunk that doubles on
            // each growth until it has reached `DEFAULT_BUF_SIZE`.
            if len == buf.len() {
                if new_write_size < DEFAULT_BUF_SIZE {
                    new_write_size *= 2;
                }
                unsafe {
                    buf.push_uninitialized(new_write_size);
                }
                // `Read` is safe to implement, so a reader may legally look
                // at the buffer it is given. Zero the newly exposed tail so
                // no uninitialized memory is ever handed to it (the standard
                // library zero-fills for the same reason).
                for byte in &mut buf[len..] {
                    *byte = 0;
                }
            }

            match self.read(&mut buf[len..]) {
                Ok(0) => {
                    ret = Ok(len - start_len);
                    break;
                }
                Ok(n) => len += n,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => {
                    ret = Err(e);
                    break;
                }
            }
        }

        // Trim the spare area that was never filled.
        let buf_len = buf.len32();
        buf.pop_back(buf_len - (len as u32));
        ret
    }
}

impl<A> io::Write for Tendril<fmt::Bytes, A>
where
    A: Atomicity,
{
    /// Appends all of `buf` and reports it as fully written.
    #[inline]
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.push_slice(buf);
        Ok(buf.len())
    }

    /// Appends all of `buf`.
    #[inline]
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        self.push_slice(buf);
        Ok(())
    }

    /// Does nothing and returns `Ok(())`.
    #[inline(always)]
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

#[cfg(feature = "encoding")]
impl<A> encoding::ByteWriter for Tendril<fmt::Bytes, A>
where
    A: Atomicity,
{
    #[inline]
    fn write_byte(&mut self, b: u8) {
        self.push_slice(&[b]);
    }

    #[inline]
    fn write_bytes(&mut self, v: &[u8]) {
        self.push_slice(v);
    }

    // The hint is a `usize`; clamp it to what `reserve` accepts.
    #[inline]
    fn writer_hint(&mut self, additional: usize) {
        self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32);
    }
}

impl<F, A> Tendril<F, A>
where
    A: Atomicity,
    F: fmt::SliceFormat<Slice = [u8]>,
{
    /// Decode from some character encoding into UTF-8.
    ///
    /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/)
    /// for more information.
    #[inline]
    #[cfg(feature = "encoding")]
    pub fn decode(
        &self,
        encoding: EncodingRef,
        trap: DecoderTrap,
    ) -> Result<Tendril<fmt::UTF8, A>, ::std::borrow::Cow<'static, str>> {
        let mut ret = Tendril::new();
        encoding.decode_to(&*self, trap, &mut ret).map(|_| ret)
    }

    /// Push "uninitialized bytes" onto the end.
    ///
    /// Really, this grows the tendril without writing anything to the new area.
    /// It's only defined for byte tendrils because it's only useful if you
    /// plan to then mutate the buffer.
#[inline]
pub unsafe fn push_uninitialized(&mut self, n: u32) {
    let grown = self.len32().checked_add(n).expect(OFLOW);
    let fits_inline = grown <= MAX_INLINE_LEN as u32;
    let currently_inline = self.ptr.get().get() <= MAX_INLINE_TAG;

    if fits_inline && currently_inline {
        // Still inline: only the tag (which encodes the length) changes.
        self.ptr.set(inline_tag(grown));
    } else {
        self.make_owned_with_capacity(grown);
        self.set_len(grown);
    }
}
} // closes the `F: fmt::SliceFormat<Slice = [u8]>` impl opened above this span

impl<A> strfmt::Display for Tendril<fmt::UTF8, A>
where
    A: Atomicity,
{
    /// Formats exactly like the underlying `str`.
    #[inline]
    fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result {
        let text: &str = &**self;
        <str as strfmt::Display>::fmt(text, f)
    }
}

impl<A> str::FromStr for Tendril<fmt::UTF8, A>
where
    A: Atomicity,
{
    type Err = ();

    /// Copies `s` into a new tendril; this implementation never returns `Err`.
    #[inline]
    fn from_str(s: &str) -> Result<Self, ()> {
        let copied = Tendril::from_slice(s);
        Ok(copied)
    }
}

impl<A> strfmt::Write for Tendril<fmt::UTF8, A>
where
    A: Atomicity,
{
    /// Appends `s`; always succeeds.
    #[inline]
    fn write_str(&mut self, s: &str) -> strfmt::Result {
        self.push_slice(s);
        Ok(())
    }
}

#[cfg(feature = "encoding")]
impl<A> encoding::StringWriter for Tendril<fmt::UTF8, A>
where
    A: Atomicity,
{
    #[inline]
    fn write_char(&mut self, c: char) {
        self.push_char(c);
    }

    #[inline]
    fn write_str(&mut self, s: &str) {
        self.push_slice(s);
    }

    // The hint is a `usize`; clamp it to what `reserve` accepts.
    #[inline]
    fn writer_hint(&mut self, additional: usize) {
        let clamped = ::std::cmp::min(u32::MAX as usize, additional);
        self.reserve(clamped as u32);
    }
}

impl<A> Tendril<fmt::UTF8, A>
where
    A: Atomicity,
{
    /// Encode from UTF-8 into some other character encoding.
    ///
    /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/)
    /// for more information.
    #[inline]
    #[cfg(feature = "encoding")]
    pub fn encode(
        &self,
        encoding: EncodingRef,
        trap: EncoderTrap,
    ) -> Result<Tendril<fmt::Bytes, A>, ::std::borrow::Cow<'static, str>> {
        let mut encoded = Tendril::new();
        encoding
            .encode_to(&*self, trap, &mut encoded)
            .map(|_| encoded)
    }

    /// Push a character onto the end.
+ #[inline] + pub fn push_char(&mut self, c: char) { + unsafe { + self.push_bytes_without_validating(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); + } + } + + /// Create a `Tendril` from a single character. + #[inline] + pub fn from_char(c: char) -> Tendril<fmt::UTF8, A> { + let mut t: Tendril<fmt::UTF8, A> = Tendril::new(); + t.push_char(c); + t + } + + /// Helper for the `format_tendril!` macro. + #[inline] + pub fn format(args: strfmt::Arguments) -> Tendril<fmt::UTF8, A> { + use std::fmt::Write; + let mut output: Tendril<fmt::UTF8, A> = Tendril::new(); + let _ = write!(&mut output, "{}", args); + output + } +} + +/// Create a `StrTendril` through string formatting. +/// +/// Works just like the standard `format!` macro. +#[macro_export] +macro_rules! format_tendril { + ($($arg:tt)*) => ($crate::StrTendril::format(format_args!($($arg)*))) +} + +impl<'a, F, A> From<&'a F::Slice> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn from(input: &F::Slice) -> Tendril<F, A> { + Tendril::from_slice(input) + } +} + +impl<A> From<String> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn from(input: String) -> Tendril<fmt::UTF8, A> { + Tendril::from_slice(&*input) + } +} + +impl<F, A> AsRef<F::Slice> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn as_ref(&self) -> &F::Slice { + &**self + } +} + +impl<A> From<Tendril<fmt::UTF8, A>> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: Tendril<fmt::UTF8, A>) -> String { + String::from(&*input) + } +} + +impl<'a, A> From<&'a Tendril<fmt::UTF8, A>> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: &'a Tendril<fmt::UTF8, A>) -> String { + String::from(&**input) + } +} + +#[cfg(all(test, feature = "bench"))] +#[path = "bench.rs"] +mod bench; + +#[cfg(test)] +mod test { + use super::{ + Atomic, ByteTendril, Header, NonAtomic, ReadExt, SendTendril, SliceExt, StrTendril, Tendril, + }; + use fmt; + use std::iter; + use 
std::thread; + + fn assert_send<T: Send>() {} + + #[test] + fn smoke_test() { + assert_eq!("", &*"".to_tendril()); + assert_eq!("abc", &*"abc".to_tendril()); + assert_eq!("Hello, world!", &*"Hello, world!".to_tendril()); + + assert_eq!(b"", &*b"".to_tendril()); + assert_eq!(b"abc", &*b"abc".to_tendril()); + assert_eq!(b"Hello, world!", &*b"Hello, world!".to_tendril()); + } + + #[test] + fn assert_sizes() { + use std::mem; + struct EmptyWithDrop; + impl Drop for EmptyWithDrop { + fn drop(&mut self) {} + } + let compiler_uses_inline_drop_flags = mem::size_of::<EmptyWithDrop>() > 0; + + let correct = mem::size_of::<*const ()>() + + 8 + + if compiler_uses_inline_drop_flags { + 1 + } else { + 0 + }; + + assert_eq!(correct, mem::size_of::<ByteTendril>()); + assert_eq!(correct, mem::size_of::<StrTendril>()); + + assert_eq!(correct, mem::size_of::<Option<ByteTendril>>()); + assert_eq!(correct, mem::size_of::<Option<StrTendril>>()); + + assert_eq!( + mem::size_of::<*const ()>() * 2, + mem::size_of::<Header<Atomic>>(), + ); + assert_eq!( + mem::size_of::<Header<Atomic>>(), + mem::size_of::<Header<NonAtomic>>(), + ); + } + + #[test] + fn validate_utf8() { + assert!(ByteTendril::try_from_byte_slice(b"\xFF").is_ok()); + assert!(StrTendril::try_from_byte_slice(b"\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xAE\xEA").is_err()); + assert_eq!( + "\u{a66e}", + &*StrTendril::try_from_byte_slice(b"\xEA\x99\xAE").unwrap() + ); + + let mut t = StrTendril::new(); + assert!(t.try_push_bytes(b"\xEA\x99").is_err()); + assert!(t.try_push_bytes(b"\xAE").is_err()); + assert!(t.try_push_bytes(b"\xEA\x99\xAE").is_ok()); + assert_eq!("\u{a66e}", &*t); + } + + #[test] + fn share_and_unshare() { + let s = b"foobarbaz".to_tendril(); + assert_eq!(b"foobarbaz", &*s); + assert!(!s.is_shared()); + + let mut t = s.clone(); + 
assert_eq!(s.as_ptr(), t.as_ptr()); + assert!(s.is_shared()); + assert!(t.is_shared()); + + t.push_slice(b"quux"); + assert_eq!(b"foobarbaz", &*s); + assert_eq!(b"foobarbazquux", &*t); + assert!(s.as_ptr() != t.as_ptr()); + assert!(!t.is_shared()); + } + + #[test] + fn format_display() { + assert_eq!("foobar", &*format!("{}", "foobar".to_tendril())); + + let mut s = "foo".to_tendril(); + assert_eq!("foo", &*format!("{}", s)); + + let t = s.clone(); + assert_eq!("foo", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + + s.push_slice("barbaz!"); + assert_eq!("foobarbaz!", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + } + + #[test] + fn format_debug() { + assert_eq!( + r#"Tendril<UTF8>(inline: "foobar")"#, + &*format!("{:?}", "foobar".to_tendril()) + ); + assert_eq!( + r#"Tendril<Bytes>(inline: [102, 111, 111, 98, 97, 114])"#, + &*format!("{:?}", b"foobar".to_tendril()) + ); + + let t = "anextralongstring".to_tendril(); + assert_eq!( + r#"Tendril<UTF8>(owned: "anextralongstring")"#, + &*format!("{:?}", t) + ); + let _ = t.clone(); + assert_eq!( + r#"Tendril<UTF8>(shared: "anextralongstring")"#, + &*format!("{:?}", t) + ); + } + + #[test] + fn subtendril() { + assert_eq!("foo".to_tendril(), "foo-bar".to_tendril().subtendril(0, 3)); + assert_eq!("bar".to_tendril(), "foo-bar".to_tendril().subtendril(4, 3)); + + let mut t = "foo-bar".to_tendril(); + t.pop_front(2); + assert_eq!("o-bar".to_tendril(), t); + t.pop_back(1); + assert_eq!("o-ba".to_tendril(), t); + + assert_eq!( + "foo".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(0, 3) + ); + assert_eq!( + "oo-a-".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(1, 5) + ); + assert_eq!( + "bar".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(20, 3) + ); + + let mut t = "another rather long string".to_tendril(); + t.pop_front(2); + assert!(t.starts_with("other rather")); + t.pop_back(1); + assert_eq!("other rather long 
strin".to_tendril(), t); + assert!(t.is_shared()); + } + + #[test] + fn subtendril_invalid() { + assert!("\u{a66e}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{a66e}".to_tendril().try_subtendril(1, 2).is_err()); + + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(3, 1).is_err()); + + let mut t = "\u{1f4a9}zzzzzz".to_tendril(); + assert!(t.try_pop_front(1).is_err()); + assert!(t.try_pop_front(2).is_err()); + assert!(t.try_pop_front(3).is_err()); + assert!(t.try_pop_front(4).is_ok()); + assert_eq!("zzzzzz", &*t); + + let mut t = "zzzzzz\u{1f4a9}".to_tendril(); + assert!(t.try_pop_back(1).is_err()); + assert!(t.try_pop_back(2).is_err()); + assert!(t.try_pop_back(3).is_err()); + assert!(t.try_pop_back(4).is_ok()); + assert_eq!("zzzzzz", &*t); + } + + #[test] + fn conversion() { + assert_eq!( + &[0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().as_bytes() + ); + assert_eq!( + [0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().into_bytes() + ); + + let ascii: Tendril<fmt::ASCII> = b"hello".to_tendril().try_reinterpret().unwrap(); + assert_eq!(&"hello".to_tendril(), ascii.as_superset()); + assert_eq!("hello".to_tendril(), ascii.clone().into_superset()); + + assert!(b"\xFF" + .to_tendril() + .try_reinterpret::<fmt::ASCII>() + .is_err()); + + let t = "hello".to_tendril(); + let ascii: &Tendril<fmt::ASCII> = t.try_as_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő" + .to_tendril() + 
.try_reinterpret_view::<fmt::ASCII>() + .is_err()); + assert!("ő".to_tendril().try_as_subset::<fmt::ASCII>().is_err()); + + let ascii: Tendril<fmt::ASCII> = "hello".to_tendril().try_into_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő".to_tendril().try_reinterpret::<fmt::ASCII>().is_err()); + assert!("ő".to_tendril().try_into_subset::<fmt::ASCII>().is_err()); + } + + #[test] + fn clear() { + let mut t = "foo-".to_tendril(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + + let mut t = "much longer".to_tendril(); + let s = t.clone(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + assert_eq!(&*s, "much longer"); + } + + #[test] + fn push_tendril() { + let mut t = "abc".to_tendril(); + t.push_tendril(&"xyz".to_tendril()); + assert_eq!("abcxyz", &*t); + } + + #[test] + fn wtf8() { + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD").is_ok()); + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xB2\xA9").is_ok()); + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD\xED\xB2\xA9").is_err()); + + let t: Tendril<fmt::WTF8> = + Tendril::try_from_byte_slice(b"\xED\xA0\xBD\xEA\x99\xAE").unwrap(); + assert!(b"\xED\xA0\xBD".to_tendril().try_reinterpret().unwrap() == t.subtendril(0, 3)); + assert!(b"\xEA\x99\xAE".to_tendril().try_reinterpret().unwrap() == t.subtendril(3, 3)); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + + assert!(t.try_subtendril(0, 1).is_err()); + assert!(t.try_subtendril(0, 2).is_err()); + assert!(t.try_subtendril(1, 1).is_err()); + + assert!(t.try_subtendril(3, 1).is_err()); + assert!(t.try_subtendril(3, 2).is_err()); + assert!(t.try_subtendril(4, 1).is_err()); + + // paired surrogates + let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBD").unwrap(); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xF0\x9F\x92\xA9", t.as_byte_slice()); + 
assert!(t.try_reinterpret_view::<fmt::UTF8>().is_ok()); + + // unpaired surrogates + let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBB").unwrap(); + assert!(t.try_push_bytes(b"\xED\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED").is_err()); + assert!(t.try_push_bytes(b"\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED\xA0\xBD").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xED\xA0\xBD", t.as_byte_slice()); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xF0\x9F\x92\xA9", t.as_byte_slice()); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + } + + #[test] + fn front_char() { + let mut t = "".to_tendril(); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "abc".to_tendril(); + assert_eq!(Some('a'), t.pop_front_char()); + assert_eq!(Some('b'), t.pop_front_char()); + assert_eq!(Some('c'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "főo-a-longer-string-bar-baz".to_tendril(); + assert_eq!(28, t.len()); + assert_eq!(Some('f'), t.pop_front_char()); + assert_eq!(Some('ő'), t.pop_front_char()); + assert_eq!(Some('o'), t.pop_front_char()); + assert_eq!(Some('-'), t.pop_front_char()); + assert_eq!(23, t.len()); + } + + #[test] + fn char_run() { + for &(s, exp) in &[ + ("", None), + (" ", Some((" ", true))), + ("x", Some(("x", false))), + (" \t \n", Some((" \t \n", true))), + ("xyzzy", Some(("xyzzy", false))), + (" xyzzy", Some((" ", true))), + ("xyzzy ", Some(("xyzzy", false))), + (" xyzzy ", Some((" ", true))), + ("xyzzy hi", Some(("xyzzy", false))), + ("中 ", Some(("中", false))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + ] { + let mut t = s.to_tendril(); + let res = t.pop_front_char_run(char::is_whitespace); + match exp { + None => assert!(res.is_none()), + Some((es, ec)) => { + let (rt, rc) = res.unwrap(); + assert_eq!(es, &*rt); + 
assert_eq!(ec, rc); + } + } + } + } + + #[test] + fn deref_mut_inline() { + let mut t = "xyő".to_tendril().into_bytes(); + t[3] = 0xff; + assert_eq!(b"xy\xC5\xFF", &*t); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + t[3] = 0x8b; + assert_eq!("xyŋ", &**t.try_reinterpret_view::<fmt::UTF8>().unwrap()); + + unsafe { + t.push_uninitialized(3); + t[4] = 0xEA; + t[5] = 0x99; + t[6] = 0xAE; + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::<fmt::UTF8>().unwrap() + ); + t.push_uninitialized(20); + t.pop_back(20); + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::<fmt::UTF8>().unwrap() + ); + } + } + + #[test] + fn deref_mut() { + let mut t = b"0123456789".to_tendril(); + let u = t.clone(); + assert!(t.is_shared()); + t[9] = 0xff; + assert!(!t.is_shared()); + assert_eq!(b"0123456789", &*u); + assert_eq!(b"012345678\xff", &*t); + } + + #[test] + fn push_char() { + let mut t = "xyz".to_tendril(); + t.push_char('o'); + assert_eq!("xyzo", &*t); + t.push_char('ő'); + assert_eq!("xyzoő", &*t); + t.push_char('\u{a66e}'); + assert_eq!("xyzoő\u{a66e}", &*t); + t.push_char('\u{1f4a9}'); + assert_eq!("xyzoő\u{a66e}\u{1f4a9}", &*t); + assert_eq!(t.len(), 13); + } + + #[test] + #[cfg(feature = "encoding")] + fn encode() { + use encoding::{all, EncoderTrap}; + + let t = "안녕하세요 러스트".to_tendril(); + assert_eq!( + b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae", + &*t.encode(all::WINDOWS_949, EncoderTrap::Strict).unwrap() + ); + + let t = "Энергия пробуждения ия-я-я! 
\u{a66e}".to_tendril(); + assert_eq!( + b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21 ?", + &*t.encode(all::KOI8_U, EncoderTrap::Replace).unwrap() + ); + + let t = "\u{1f4a9}".to_tendril(); + assert!(t.encode(all::WINDOWS_1252, EncoderTrap::Strict).is_err()); + } + + #[test] + #[cfg(feature = "encoding")] + fn decode() { + use encoding::{all, DecoderTrap}; + + let t = b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\ + \xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae" + .to_tendril(); + assert_eq!( + "안녕하세요 러스트", + &*t.decode(all::WINDOWS_949, DecoderTrap::Strict).unwrap() + ); + + let t = b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21" + .to_tendril(); + assert_eq!( + "Энергия пробуждения ия-я-я!", + &*t.decode(all::KOI8_U, DecoderTrap::Replace).unwrap() + ); + + let t = b"x \xff y".to_tendril(); + assert!(t.decode(all::UTF_8, DecoderTrap::Strict).is_err()); + + let t = b"x \xff y".to_tendril(); + assert_eq!( + "x \u{fffd} y", + &*t.decode(all::UTF_8, DecoderTrap::Replace).unwrap() + ); + } + + #[test] + fn ascii() { + fn mk(x: &[u8]) -> Tendril<fmt::ASCII> { + x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"xyz"); + assert_eq!(Some('x'), t.pop_front_char()); + assert_eq!(Some('y'), t.pop_front_char()); + assert_eq!(Some('z'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t xyz"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"xyz"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::<fmt::ASCII>::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_err()); + assert_eq!(b"x\0", t.as_byte_slice()); + } + + #[test] + fn latin1() { + fn mk(x: &[u8]) -> Tendril<fmt::Latin1> { + 
x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"\xd8_\xd8"); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(Some('_'), t.pop_front_char()); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t \xfe\xa7z"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"\xfe\xa7z"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::<fmt::Latin1>::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_ok()); + assert!(t.try_push_char('ő').is_err()); + assert!(t.try_push_char('я').is_err()); + assert!(t.try_push_char('\u{a66e}').is_err()); + assert!(t.try_push_char('\u{1f4a9}').is_err()); + assert_eq!(b"x\0\xa0", t.as_byte_slice()); + } + + #[test] + fn format() { + assert_eq!("", &*format_tendril!("")); + assert_eq!( + "two and two make 4", + &*format_tendril!("two and two make {}", 2 + 2) + ); + } + + #[test] + fn merge_shared() { + let t = "012345678901234567890123456789".to_tendril(); + let a = t.subtendril(10, 20); + assert!(a.is_shared()); + assert_eq!("01234567890123456789", &*a); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&a); + assert!(b.is_shared()); + assert!(a.is_shared()); + assert!(a.is_shared_with(&b)); + assert!(b.is_shared_with(&a)); + assert_eq!("012345678901234567890123456789", &*b); + + assert!(t.is_shared()); + assert!(t.is_shared_with(&a)); + assert!(t.is_shared_with(&b)); + } + + #[test] + fn merge_cant_share() { + let t = "012345678901234567890123456789".to_tendril(); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&"abcd".to_tendril()); + assert!(!b.is_shared()); + assert_eq!("0123456789abcd", &*b); + } + + #[test] + fn 
shared_doesnt_reserve() {
        let mut t = "012345678901234567890123456789".to_tendril();
        let a = t.subtendril(1, 10);

        assert!(t.is_shared());
        t.reserve(10);
        assert!(t.is_shared());

        let _ = a;
    }

    /// Out-of-range arguments to `try_subtendril`, `try_pop_front` and
    /// `try_pop_back` must be reported as `Err`.
    #[test]
    fn out_of_bounds() {
        // Ranges that extend past the end of the tendril, or start past it.
        assert!("".to_tendril().try_subtendril(0, 1).is_err());
        assert!("abc".to_tendril().try_subtendril(0, 4).is_err());
        assert!("abc".to_tendril().try_subtendril(3, 1).is_err());
        assert!("abc".to_tendril().try_subtendril(7, 1).is_err());

        // Nothing can be popped from either end of an empty tendril.
        let mut t = "".to_tendril();
        assert!(t.try_pop_front(1).is_err());
        assert!(t.try_pop_front(5).is_err());
        assert!(t.try_pop_front(500).is_err());
        assert!(t.try_pop_back(1).is_err());
        assert!(t.try_pop_back(5).is_err());
        assert!(t.try_pop_back(500).is_err());

        // A successful pop is followed by requests that are expected to be
        // too large for what is left.
        let mut t = "abcd".to_tendril();
        assert!(t.try_pop_front(1).is_ok());
        assert!(t.try_pop_front(4).is_err());
        assert!(t.try_pop_front(500).is_err());
        assert!(t.try_pop_back(1).is_ok());
        assert!(t.try_pop_back(3).is_err());
        assert!(t.try_pop_back(500).is_err());
    }

    /// Every comparison operator on two tendrils must give the same answer as
    /// the same operator applied to the `&str` values they were built from.
    #[test]
    fn compare() {
        for &a in &[
            "indiscretions",
            "validity",
            "hallucinogenics",
            "timelessness",
            "original",
            "microcosms",
            "boilers",
            "mammoth",
        ] {
            for &b in &[
                "intrepidly",
                "frigid",
                "spa",
                "cardigans",
                "guileful",
                "evaporated",
                "unenthusiastic",
                "legitimate",
            ] {
                let ta = a.to_tendril();
                let tb = b.to_tendril();

                assert_eq!(a.eq(b), ta.eq(&tb));
                assert_eq!(a.ne(b), ta.ne(&tb));
                assert_eq!(a.lt(b), ta.lt(&tb));
                assert_eq!(a.le(b), ta.le(&tb));
                assert_eq!(a.gt(b), ta.gt(&tb));
                assert_eq!(a.ge(b), ta.ge(&tb));
                assert_eq!(a.partial_cmp(b), ta.partial_cmp(&tb));
                assert_eq!(a.cmp(b), ta.cmp(&tb));
            }
        }
    }

    /// For each supported item type, checks three things: extending with an
    /// empty iterator leaves the tendril unchanged, extending with several
    /// items appends them in order, and `collect` builds the same contents.
    #[test]
    fn extend_and_from_iterator() {
        // Testing Extend<T> and FromIterator<T> for the various Ts.

        // Tendril<F>
        let mut t = "Hello".to_tendril();
        t.extend(None::<&Tendril<_>>.into_iter());
        assert_eq!("Hello", &*t);
        t.extend(&[", ".to_tendril(), "world".to_tendril(), "!".to_tendril()]);
        assert_eq!("Hello, world!", &*t);
        assert_eq!(
            "Hello, world!",
            &*[
                "Hello".to_tendril(),
                ", ".to_tendril(),
                "world".to_tendril(),
                "!".to_tendril()
            ]
            .iter()
            .collect::<StrTendril>()
        );

        // &str
        let mut t = "Hello".to_tendril();
        t.extend(None::<&str>.into_iter());
        assert_eq!("Hello", &*t);
        t.extend([", ", "world", "!"].iter().map(|&s| s));
        assert_eq!("Hello, world!", &*t);
        assert_eq!(
            "Hello, world!",
            &*["Hello", ", ", "world", "!"]
                .iter()
                .map(|&s| s)
                .collect::<StrTendril>()
        );

        // &[u8]
        let mut t = b"Hello".to_tendril();
        t.extend(None::<&[u8]>.into_iter());
        assert_eq!(b"Hello", &*t);
        t.extend(
            [b", ".as_ref(), b"world".as_ref(), b"!".as_ref()]
                .iter()
                .map(|&s| s),
        );
        assert_eq!(b"Hello, world!", &*t);
        assert_eq!(
            b"Hello, world!",
            &*[
                b"Hello".as_ref(),
                b", ".as_ref(),
                b"world".as_ref(),
                b"!".as_ref()
            ]
            .iter()
            .map(|&s| s)
            .collect::<ByteTendril>()
        );

        // Shared fixtures for the per-element (char / byte) cases below.
        let string = "the quick brown fox jumps over the lazy dog";
        let string_expected = string.to_tendril();
        let bytes = string.as_bytes();
        let bytes_expected = bytes.to_tendril();

        // char
        assert_eq!(string_expected, string.chars().collect());
        let mut tendril = StrTendril::new();
        tendril.extend(string.chars());
        assert_eq!(string_expected, tendril);

        // &u8
        assert_eq!(bytes_expected, bytes.iter().collect());
        let mut tendril = ByteTendril::new();
        tendril.extend(bytes);
        assert_eq!(bytes_expected, tendril);

        // u8
        assert_eq!(bytes_expected, bytes.iter().map(|&b| b).collect());
        let mut tendril = ByteTendril::new();
        tendril.extend(bytes.iter().map(|&b| b));
        assert_eq!(bytes_expected, tendril);
    }

    /// A tendril can be built through the standard `FromStr` trait.
    #[test]
    fn from_str() {
        use std::str::FromStr;
        let t: Tendril<_> = FromStr::from_str("foo bar baz").unwrap();
        assert_eq!("foo bar baz", &*t);
    }

    /// `from_char` is exercised with characters whose UTF-8 encodings are
    /// one, two, three and four bytes long, in that order.
    #[test]
    fn from_char() {
        assert_eq!("o", &*StrTendril::from_char('o'));
        assert_eq!("ő", &*StrTendril::from_char('ő'));
        assert_eq!("\u{a66e}", &*StrTendril::from_char('\u{a66e}'));
        assert_eq!("\u{1f4a9}", &*StrTendril::from_char('\u{1f4a9}'));
    }

    /// `read_to_tendril` must return the number of bytes in the reader and
    /// leave exactly those bytes in the tendril.
    #[test]
    #[cfg_attr(miri, ignore)] // slow
    fn read() {
        // Reads `x` through an in-memory cursor into a fresh tendril and
        // checks both the reported length and the resulting contents.
        fn check(x: &[u8]) {
            use std::io::Cursor;
            let mut t = ByteTendril::new();
            assert_eq!(x.len(), Cursor::new(x).read_to_tendril(&mut t).unwrap());
            assert_eq!(x, &*t);
        }

        check(b"");
        check(b"abcd");

        // One million bytes of input.
        let long: Vec<u8> = iter::repeat(b'x').take(1_000_000).collect();
        check(&long);
    }

    /// Tendrils used as `HashMap` keys can be looked up with a byte slice,
    /// for both string and byte tendrils.
    #[test]
    fn hash_map_key() {
        use std::collections::HashMap;

        // As noted with Borrow, indexing on HashMap<StrTendril, _> is byte-based because of
        // https://github.com/rust-lang/rust/issues/27108.
        let mut map = HashMap::new();
        map.insert("foo".to_tendril(), 1);
        assert_eq!(map.get(b"foo".as_ref()), Some(&1));
        assert_eq!(map.get(b"bar".as_ref()), None);

        let mut map = HashMap::new();
        map.insert(b"foo".to_tendril(), 1);
        assert_eq!(map.get(b"foo".as_ref()), Some(&1));
        assert_eq!(map.get(b"bar".as_ref()), None);
    }

    /// A tendril with `Atomic` atomicity is `Send`; a clone moved to another
    /// thread can be appended to there without changing the original.
    #[test]
    fn atomic() {
        assert_send::<Tendril<fmt::UTF8, Atomic>>();
        let s: Tendril<fmt::UTF8, Atomic> = Tendril::from_slice("this is a string");
        assert!(!s.is_shared());
        // Cloning is what flips `s` to the shared state.
        let mut t = s.clone();
        assert!(s.is_shared());
        // Remember where the original bytes live, as an integer so the value
        // can be moved into the spawned thread.
        let sp = s.as_ptr() as usize;
        thread::spawn(move || {
            assert!(t.is_shared());
            t.push_slice(" extended");
            assert_eq!("this is a string extended", &*t);
            // After the push, `t` points at different memory than `s` and is
            // no longer reported as shared.
            assert!(t.as_ptr() as usize != sp);
            assert!(!t.is_shared());
        })
        .join()
        .unwrap();
        assert!(s.is_shared());
        assert_eq!("this is a string", &*s);
    }

    #[test]
    fn send() {
        assert_send::<SendTendril<fmt::UTF8>>();
        let s = "this is a string".to_tendril();
        let t = s.clone();
        let s2 = s.into_send();
        thread::spawn(move || {
            let s
= StrTendril::from(s2);
            assert!(!s.is_shared());
            assert_eq!("this is a string", &*s);
        })
        .join()
        .unwrap();
        assert_eq!("this is a string", &*t);
    }

    /// https://github.com/servo/tendril/issues/58
    ///
    /// Converting a non-atomic tendril to an atomic one via `into_send`
    /// must preserve its contents.
    #[test]
    fn issue_58() {
        let data = "<p><i>Hello!</p>, World!</i>";
        let s: Tendril<fmt::UTF8, NonAtomic> = data.into();
        assert_eq!(&*s, data);
        let s: Tendril<fmt::UTF8, Atomic> = s.into_send().into();
        assert_eq!(&*s, data);
    }

    /// Same round trip through `into_send` and another thread as `send`, but
    /// with a one-byte string.
    // NOTE(review): going by the test name, "x" is presumably short enough to
    // use the inline representation -- confirm against the `Tendril` layout.
    #[test]
    fn inline_send() {
        let s = "x".to_tendril();
        let t = s.clone();
        let s2 = s.into_send();
        thread::spawn(move || {
            let s = StrTendril::from(s2);
            assert!(!s.is_shared());
            assert_eq!("x", &*s);
        })
        .join()
        .unwrap();
        assert_eq!("x", &*t);
    }
}
diff --git a/vendor/tendril/src/utf8_decode.rs b/vendor/tendril/src/utf8_decode.rs
new file mode 100644
index 000000000..b682d57a3
--- /dev/null
+++ b/vendor/tendril/src/utf8_decode.rs
@@ -0,0 +1,98 @@
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use fmt;
use tendril::{Atomicity, Tendril};
use utf8;

/// The incomplete UTF-8 sequence left at the end of a chunk of input.
///
/// Returned by `decode_utf8_lossy`; feed the next chunk to `try_complete`
/// to finish it.
pub struct IncompleteUtf8(utf8::Incomplete);

impl<A> Tendril<fmt::Bytes, A>
where
    A: Atomicity,
{
    /// Decodes this byte tendril as UTF-8, handing each decoded piece to
    /// `push_utf8`.
    ///
    /// Valid runs are passed on as UTF-8 tendrils reinterpreted from `self`
    /// or a subtendril of it. Each invalid sequence is replaced by one
    /// `utf8::REPLACEMENT_CHARACTER` tendril, and decoding continues after it.
    ///
    /// Returns `None` when the whole input has been consumed. Returns
    /// `Some(IncompleteUtf8)` when the input ends with an incomplete
    /// sequence, after any valid prefix has been pushed.
    pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8>
    where
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        loop {
            if self.is_empty() {
                return None;
            }
            // `utf8::decode` borrows `self`. Reduce its result to plain
            // lengths (plus the incomplete suffix value) so that `self` can
            // be consumed or mutated in the second `match` below.
            let unborrowed_result = match utf8::decode(&self) {
                Ok(s) => {
                    debug_assert!(s.as_ptr() == self.as_ptr());
                    debug_assert!(s.len() == self.len());
                    Ok(())
                }
                Err(utf8::DecodeError::Invalid {
                    valid_prefix,
                    invalid_sequence,
                    ..
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    // Inner `Err` carries the offset just past the invalid
                    // sequence, i.e. where decoding resumes.
                    Err((
                        valid_prefix.len(),
                        Err(valid_prefix.len() + invalid_sequence.len()),
                    ))
                }
                Err(utf8::DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    // Inner `Ok` carries the unfinished trailing sequence.
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
                }
            };
            match unborrowed_result {
                Ok(()) => {
                    // `utf8::decode` accepted the entire buffer (see the
                    // debug assertions above), so skipping validation is
                    // justified here.
                    unsafe { push_utf8(self.reinterpret_without_validating()) }
                    return None;
                }
                Err((valid_len, and_then)) => {
                    if valid_len > 0 {
                        // Only the first `valid_len` bytes were accepted by
                        // `utf8::decode`; push just that prefix.
                        let subtendril = self.subtendril(0, valid_len as u32);
                        unsafe { push_utf8(subtendril.reinterpret_without_validating()) }
                    }
                    match and_then {
                        Ok(incomplete) => return Some(IncompleteUtf8(incomplete)),
                        Err(offset) => {
                            // Stand in for the invalid bytes, drop everything
                            // up to and including them, then loop on the rest.
                            push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                            self.pop_front(offset as u32)
                        }
                    }
                }
            }
        }
    }
}

impl IncompleteUtf8 {
    /// Tries to finish the pending sequence using the start of `input`.
    ///
    /// When the wrapped `utf8::Incomplete::try_complete` yields a result,
    /// either the completed text or, if that result is an error,
    /// `utf8::REPLACEMENT_CHARACTER` is passed to `push_utf8`. The bytes it
    /// consumed are popped from the front of `input`, and the remainder is
    /// returned as `Ok`.
    ///
    /// Returns `Err(())` when the wrapped call returns `None`; nothing is
    /// pushed in that case.
    // NOTE(review): `None` presumably means `input` was used up without
    // finishing the sequence -- confirm against the `utf8` crate docs.
    pub fn try_complete<A, F>(
        &mut self,
        mut input: Tendril<fmt::Bytes, A>,
        mut push_utf8: F,
    ) -> Result<Tendril<fmt::Bytes, A>, ()>
    where
        A: Atomicity,
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        // Computed while `input` is borrowed, applied once the borrow ends.
        let resume_at;
        match self.0.try_complete(&input) {
            None => return Err(()),
            Some((result, rest)) => {
                push_utf8(Tendril::from_slice(
                    result.unwrap_or(utf8::REPLACEMENT_CHARACTER),
                ));
                // Length of `input` minus the length of the returned `rest`.
                resume_at = input.len() - rest.len();
            }
        }
        input.pop_front(resume_at as u32);
        Ok(input)
    }
}
diff --git a/vendor/tendril/src/util.rs b/vendor/tendril/src/util.rs
new file mode 100644
index 000000000..28c55c128
--- /dev/null
+++ b/vendor/tendril/src/util.rs
@@ -0,0 +1,45 @@
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option.
// This file may not be copied, modified, or distributed
// except according to those terms.

use std::mem;
use std::{ptr, slice};

/// Returns `new_len` bytes of `buf` starting at index `start`, with the
/// bounds checked only by debug assertions.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Release builds do not check this, and a
/// violation produces a slice that reaches outside `buf`.
#[inline(always)]
pub unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // `add` takes the `usize` offset directly; no signed cast is needed.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}

/// Mutable counterpart of [`unsafe_slice`]: returns `new_len` bytes of `buf`
/// starting at index `start`, with the bounds checked only by debug
/// assertions.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Release builds do not check this, and a
/// violation produces a slice that reaches outside `buf`.
#[inline(always)]
pub unsafe fn unsafe_slice_mut<'a>(
    buf: &'a mut [u8],
    start: usize,
    new_len: usize,
) -> &'a mut [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    slice::from_raw_parts_mut(buf.as_mut_ptr().add(start), new_len)
}

/// Copies all of `src` to `*dest`, then moves `*dest` forward by
/// `src.len()` bytes so that the next call appends after it.
///
/// # Safety
///
/// `*dest` must be valid for writes of `src.len()` bytes, and that
/// destination range must not overlap `src`.
#[inline(always)]
pub unsafe fn copy_and_advance(dest: &mut *mut u8, src: &[u8]) {
    ptr::copy_nonoverlapping(src.as_ptr(), *dest, src.len());
    *dest = dest.add(src.len())
}

/// Returns `ptr` with its lifetime replaced by the lifetime of `_ptr`.
///
/// `_ptr` is used only to name the target lifetime `'a`; its value is
/// ignored.
///
/// # Safety
///
/// The compiler no longer tracks the real lifetime of the referent. The
/// caller must guarantee that `*ptr` stays valid, and is not otherwise
/// accessed, for as long as the returned reference is in use.
#[inline(always)]
pub unsafe fn copy_lifetime_mut<'a, S: ?Sized, T: ?Sized + 'a>(
    _ptr: &'a mut S,
    ptr: &mut T,
) -> &'a mut T {
    mem::transmute(ptr)
}

/// Returns `ptr` with its lifetime replaced by the lifetime of `_ptr`.
///
/// `_ptr` is used only to name the target lifetime `'a`; its value is
/// ignored.
///
/// # Safety
///
/// The compiler no longer tracks the real lifetime of the referent. The
/// caller must guarantee that `*ptr` stays valid, and is not mutated, for
/// as long as the returned reference is in use.
#[inline(always)]
pub unsafe fn copy_lifetime<'a, S: ?Sized, T: ?Sized + 'a>(_ptr: &'a S, ptr: &T) -> &'a T {
    mem::transmute(ptr)
}