From 9835e2ae736235810b4ea1c162ca5e65c547e770 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 04:49:50 +0200 Subject: Merging upstream version 1.71.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/bstr/.cargo-checksum.json | 2 +- vendor/bstr/Cargo.lock | 2 +- vendor/bstr/Cargo.toml | 2 +- vendor/bstr/README.md | 8 +- vendor/bstr/src/escape_bytes.rs | 445 +++++++++++++++++++++++++++++++++++++++ vendor/bstr/src/ext_slice.rs | 42 ++++ vendor/bstr/src/ext_vec.rs | 101 +++++++++ vendor/bstr/src/impls.rs | 21 ++ vendor/bstr/src/lib.rs | 2 + 9 files changed, 615 insertions(+), 10 deletions(-) create mode 100644 vendor/bstr/src/escape_bytes.rs (limited to 'vendor/bstr') diff --git a/vendor/bstr/.cargo-checksum.json b/vendor/bstr/.cargo-checksum.json index 10d518866..57f8c3c3b 100644 --- a/vendor/bstr/.cargo-checksum.json +++ b/vendor/bstr/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"COPYING":"68653aaa727a2bfa31b7a751e31701ce33c49d695c12dd291a07d1c54da4c14b","Cargo.lock":"c241cc64634e0a14b96402447de8017caf38e31adc05c7e70d457b84263b1367","Cargo.toml":"aa0235fbbc95d22ad0c296fe2987aab0d70b057208ccdd137e85bc65e59028af","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6b7374c39a57e57fc2c38eb529c4c88340152b10f51dd5ae2d819dfa67f61715","README.md":"be9317a1df98e856a05caee8c77a65b9f7cdf8d55ca4f7cda5385d7c0f676321","examples/graphemes-std.rs":"100264f623ff973be76831fb1d4519e6f371b21972d6577bb49bf7bbff4d0d5e","examples/graphemes.rs":"401c5fac813f78e4029ece9c98bccb3128637c507d8667b73e069bfbc9d7f2f4","examples/lines-std.rs":"094a48bfd483ec01f80f9c937ddfe6f0bdbf09f960ba822215ec8ed9862624df","examples/lines.rs":"65ae4edbdb0ccff8ff40cdc70b4e7a70824f5028daff2e1b2a3247f884589db8","examples/uppercase-std.rs":"33aed88e38483aa303625757304a974594476a3a659d8bdd4877aceb90ff8be3","examples/uppercase.rs":"2cdf7f173cb6a5d4c16a967e3f733bc40331f5167da519c5194ceee187ff814f","examples/words-std.rs":"ffde2fccd361890fab0e0b051915a749d5d51e95b9be700b76fada231d002f00","examples/words.rs":"aa805faa5012714428ef895596947c333417c2b16a7e0155d4a128be7428fc17","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","scripts/generate-unicode-data":"75d68617b5acf564cc681ddfbf77a6097eaa9a9f8f54af4e16905dda3dc6df77","scripts/regex/grapheme.sh":"d796bca73278f6ab04d65f285b2dc72efcad76874f98f4bfa22bf38f2eaeece7","scripts/regex/sentence.sh":"7892e07ac9e450967bd79279558cbef2f0fc0b0aefab24217910937ed6330f06","scripts/regex/word.sh":"b3f53e2331e9e50a1a7232b7d278aaecace6a42ef6c16dd0b8d0ec59fd2aaf4f","src/ascii.rs":"3a2ffadf7b0529e8a475a8d2776d02e7fd149984de22a89fa158647e9c54bb77","src/bstr.rs":"be1313d13814f3818068f1f6c96e4a1eecf1ecdec42c360f90379596804ea0ef","src/bstring.rs":"1cd7656dc3a6eded742eb7e9e43c83a5d020e6a419060c186788c8e1010f7dcc","src/byteset/mod.rs":"3f88d5594d95724c6eda96f79006a27dab80c4f849f00342b0bae3efedc32c45","src/byteset/scalar.rs":"558daee3ca4bc774ab7d808b5ac45942962ef53af97f221cc01c0fab9cb32c8b","src/ext_slice.rs":"3fba7273b2cb5c231d5175962ee69c82653e1f2772cad6355948837dc10b43fd","src/ext_vec.rs":"a4641413047627d678b69aec8ec9a85fad73ad69b2c1b9bee09f1a0c75d65477","src/impls.rs":"96b2b16de74052de1b93f076573a786d051913a0c26c055d1a8186f7c4141988","src/io.rs":"73afcb89230d940b17a5917696c3f7c55267aefcb42db4164062dbf18875b677","src/lib.rs":"ec843ece6f18760c2e679bd7d245e499458f0c4b2ed4e292b48b8881daf125b2","src/tests.rs":"8adfd1a4a9da91b2a4dff25ffafcf99d914be3f5b7d67d66cdcb40a2d72abd04","src/unicode/data/GraphemeBreakTest.txt":"ddc7d4d1f3838573b94fc5d83ff7217e63c47b22ae1cd40c5fe1a54efc15589b","src/unicode/data/LICENSE-UNICODE":"8b9babb256418ec15761d635a49f973424939affba7a0a88de2fc2690e454a23","src/unicode/data/SentenceBreakTest.txt":"7e42dd749dbb94aa44b13faf9df6319d9a16ce2ea09a3a094fcfbb5962168040","src/unicode/data/WordBreakTest.txt":"8094b544ec1580c7e41ac0187805cc1aeb330a90301ec7505563e1a59318284e","src/unicode/fsm/grapheme_break_fwd.bigendian.dfa":"ae5220a77570720fcf78e63794d4cddbeef365fc3aaeec7dde391c229bc0a840","src/unicode/fsm/grapheme_break_fwd.littleendian.dfa":"3f9ce5d78325ede1651587e24e12357740a90608c784ac59c643abd42c4d9a83","src/unicode/fsm/grapheme_break_fwd.rs":"b6d937ec3afee23ea7c01ff9c0eeff1fc4f85287b87659dca80765db49d6b09e","src/unicode/fsm/grapheme_break_rev.bigendian.dfa":"fa2c745adc61060f08e5734f19acc09de387b0abd671597a543b4d4d80fd7a04","src/unicode/fsm/grapheme_break_rev.littleendian.dfa":"a10fd82f63b0f0aa08e5e7f09000c020c7ff4cfe6240afb11a615c663100de99","src/unicode/fsm/grapheme_break_rev.rs":"d9de2be51a17c5be37142ac44b9e2f0627c05a9101d5b1e23fd78229ca0ef75d","src/unicode/fsm/mod.rs":"50b8baa692e83f909a0fe62eced9666b712a68b6c7bf42976c8cc37e49dd9b64","src/unicode/fsm/regional_indicator_rev.bigendian.dfa":"db9db4c86bced5f4aaf68d5e475e13e5d4976c237deec13c192111a399aa5858","src/unicode/fsm/regional_indicator_rev.littleendian.dfa":"0905f70acddd423c1b53bfbeb73299009f724400029d7f9a987d63c32d36e36c","src/unicode/fsm/regional_indicator_rev.rs":"50b89fc6f7d461c789e88cc6f1a769257104b7f45eb01bd31047e898f1e9587a","src/unicode/fsm/sentence_break_fwd.bigendian.dfa":"0cd36026a86ea5d2e4710b8278733982808e341c88b62c4f9ca309417a181dc9","src/unicode/fsm/sentence_break_fwd.littleendian.dfa":"f3b85da014d1c94e1b444f3fca2952d1a5fbf2a9f42e32574eb52e027a797281","src/unicode/fsm/sentence_break_fwd.rs":"2c6147825fd78c15ecdb952d368d519f81bbf196eedf3e90e927699e832c7080","src/unicode/fsm/simple_word_fwd.bigendian.dfa":"635ab3e9c589268ef91a48c8b9b038e156deaf4a9a4475fce49ca75eabddccf7","src/unicode/fsm/simple_word_fwd.littleendian.dfa":"4f92b789385027a9276498a829cc8e5a3ecdd5f3c6d88254c6cd23d95d828c57","src/unicode/fsm/simple_word_fwd.rs":"44a2b90c8b4a2fa50c66cacc1d48afd47a8f7aa4753dd391471b48a9a891be71","src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa":"593c8ad059ab0bee60a2ea25f4c1fc89c105cb19a9bda3fa98d1464b8e87cfc0","src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa":"a04ed70d5dbd969c0af0e12bec5033ca910161c486f741dd0a792d2e5b0cc6f6","src/unicode/fsm/whitespace_anchored_fwd.rs":"e0f3f0be717ff306409ea9242f507847c4c0fa7469eccbd98a849389afe7fd26","src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa":"9ab09359ce73058d22e5bfa857e040831d49f4a53dd25da804136e9db9e7f5fb","src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa":"cb5804786bd98bfe0726f3f7733d72bc1d69130c8a8b026465c03c78f8c8ac79","src/unicode/fsm/whitespace_anchored_rev.rs":"8f27a50dfe549db99e768225c2956489f233f5a4986857a5ff5f2b507375a69d","src/unicode/fsm/word_break_fwd.bigendian.dfa":"72278d401ac119d50e06c6b8a4cb29d54366d1da536cfaedc3a20bb8cc1b742c","src/unicode/fsm/word_break_fwd.littleendian.dfa":"97dcdca86472d96faadd15d1c0328a76484971f3c4ad409c29049791cb9ed3eb","src/unicode/fsm/word_break_fwd.rs":"3ce6b28015dd6f2b330759da46263e8227f921e2370d4aa6922b8be26db558a8","src/unicode/grapheme.rs":"9bac56709754b48d42ee35282d5752c554a5af3e70b08c01977872f70ffa2afd","src/unicode/mod.rs":"fc67b0d64e9395398235c8663706b246edc0742e6cfe87057eeabdb1b19ad28d","src/unicode/sentence.rs":"8af1f274f80120b04928a6e560bfb3f2c9176d882f729265494b1a796a370681","src/unicode/whitespace.rs":"1fe313906fce009060267ae14de10e0ce577f0e2e2018273ee79d008dc9cf2f5","src/unicode/word.rs":"ed1bef53cf01ef6e682898e802e1654356a763c3993b4f16898eb5ed4b5e7637","src/utf8.rs":"e759713023dc3e5f9f5b2e6c3ba601af591ce5b2ad71aba729e3c29bcf6007e3"},"package":"5ffdb39cb703212f3c11973452c2861b972f757b021158f3516ba10f2fa8b2c1"} \ No newline at end of file +{"files":{"COPYING":"68653aaa727a2bfa31b7a751e31701ce33c49d695c12dd291a07d1c54da4c14b","Cargo.lock":"01a3a44ed3940a7525419e6422b631b4158ffa3ff2c867990a37fbfd3e600bb6","Cargo.toml":"8c73af775af87d01e76538c541dd34947510ab4353c73050017a6f4db12a4413","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6b7374c39a57e57fc2c38eb529c4c88340152b10f51dd5ae2d819dfa67f61715","README.md":"5e20af8472e06926761584e3c249ebc8b9802f1eb13440276d9aa267e70e5955","examples/graphemes-std.rs":"100264f623ff973be76831fb1d4519e6f371b21972d6577bb49bf7bbff4d0d5e","examples/graphemes.rs":"401c5fac813f78e4029ece9c98bccb3128637c507d8667b73e069bfbc9d7f2f4","examples/lines-std.rs":"094a48bfd483ec01f80f9c937ddfe6f0bdbf09f960ba822215ec8ed9862624df","examples/lines.rs":"65ae4edbdb0ccff8ff40cdc70b4e7a70824f5028daff2e1b2a3247f884589db8","examples/uppercase-std.rs":"33aed88e38483aa303625757304a974594476a3a659d8bdd4877aceb90ff8be3","examples/uppercase.rs":"2cdf7f173cb6a5d4c16a967e3f733bc40331f5167da519c5194ceee187ff814f","examples/words-std.rs":"ffde2fccd361890fab0e0b051915a749d5d51e95b9be700b76fada231d002f00","examples/words.rs":"aa805faa5012714428ef895596947c333417c2b16a7e0155d4a128be7428fc17","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","scripts/generate-unicode-data":"75d68617b5acf564cc681ddfbf77a6097eaa9a9f8f54af4e16905dda3dc6df77","scripts/regex/grapheme.sh":"d796bca73278f6ab04d65f285b2dc72efcad76874f98f4bfa22bf38f2eaeece7","scripts/regex/sentence.sh":"7892e07ac9e450967bd79279558cbef2f0fc0b0aefab24217910937ed6330f06","scripts/regex/word.sh":"b3f53e2331e9e50a1a7232b7d278aaecace6a42ef6c16dd0b8d0ec59fd2aaf4f","src/ascii.rs":"3a2ffadf7b0529e8a475a8d2776d02e7fd149984de22a89fa158647e9c54bb77","src/bstr.rs":"be1313d13814f3818068f1f6c96e4a1eecf1ecdec42c360f90379596804ea0ef","src/bstring.rs":"1cd7656dc3a6eded742eb7e9e43c83a5d020e6a419060c186788c8e1010f7dcc","src/byteset/mod.rs":"3f88d5594d95724c6eda96f79006a27dab80c4f849f00342b0bae3efedc32c45","src/byteset/scalar.rs":"558daee3ca4bc774ab7d808b5ac45942962ef53af97f221cc01c0fab9cb32c8b","src/escape_bytes.rs":"3bd168a350f8e96b39c4b9433abf779e34d5f66345b1c427118f5609f5d3398d","src/ext_slice.rs":"9e849981a4b4288b3d1237372847c81a6068186f041c8b04cab901a87a2bcc82","src/ext_vec.rs":"4dd9af267c07554051787c752e22e091684eb93f1a70c70ac2537535d1a54a07","src/impls.rs":"0a58c6a02c558c0c26b895d6aeab6997b41c6dd5051ceb596e2c2283ab11dce4","src/io.rs":"73afcb89230d940b17a5917696c3f7c55267aefcb42db4164062dbf18875b677","src/lib.rs":"b4433d15492fe85983cbe1bf23249d1dac54aa5df4510989939933a43bfd07f9","src/tests.rs":"8adfd1a4a9da91b2a4dff25ffafcf99d914be3f5b7d67d66cdcb40a2d72abd04","src/unicode/data/GraphemeBreakTest.txt":"ddc7d4d1f3838573b94fc5d83ff7217e63c47b22ae1cd40c5fe1a54efc15589b","src/unicode/data/LICENSE-UNICODE":"8b9babb256418ec15761d635a49f973424939affba7a0a88de2fc2690e454a23","src/unicode/data/SentenceBreakTest.txt":"7e42dd749dbb94aa44b13faf9df6319d9a16ce2ea09a3a094fcfbb5962168040","src/unicode/data/WordBreakTest.txt":"8094b544ec1580c7e41ac0187805cc1aeb330a90301ec7505563e1a59318284e","src/unicode/fsm/grapheme_break_fwd.bigendian.dfa":"ae5220a77570720fcf78e63794d4cddbeef365fc3aaeec7dde391c229bc0a840","src/unicode/fsm/grapheme_break_fwd.littleendian.dfa":"3f9ce5d78325ede1651587e24e12357740a90608c784ac59c643abd42c4d9a83","src/unicode/fsm/grapheme_break_fwd.rs":"b6d937ec3afee23ea7c01ff9c0eeff1fc4f85287b87659dca80765db49d6b09e","src/unicode/fsm/grapheme_break_rev.bigendian.dfa":"fa2c745adc61060f08e5734f19acc09de387b0abd671597a543b4d4d80fd7a04","src/unicode/fsm/grapheme_break_rev.littleendian.dfa":"a10fd82f63b0f0aa08e5e7f09000c020c7ff4cfe6240afb11a615c663100de99","src/unicode/fsm/grapheme_break_rev.rs":"d9de2be51a17c5be37142ac44b9e2f0627c05a9101d5b1e23fd78229ca0ef75d","src/unicode/fsm/mod.rs":"50b8baa692e83f909a0fe62eced9666b712a68b6c7bf42976c8cc37e49dd9b64","src/unicode/fsm/regional_indicator_rev.bigendian.dfa":"db9db4c86bced5f4aaf68d5e475e13e5d4976c237deec13c192111a399aa5858","src/unicode/fsm/regional_indicator_rev.littleendian.dfa":"0905f70acddd423c1b53bfbeb73299009f724400029d7f9a987d63c32d36e36c","src/unicode/fsm/regional_indicator_rev.rs":"50b89fc6f7d461c789e88cc6f1a769257104b7f45eb01bd31047e898f1e9587a","src/unicode/fsm/sentence_break_fwd.bigendian.dfa":"0cd36026a86ea5d2e4710b8278733982808e341c88b62c4f9ca309417a181dc9","src/unicode/fsm/sentence_break_fwd.littleendian.dfa":"f3b85da014d1c94e1b444f3fca2952d1a5fbf2a9f42e32574eb52e027a797281","src/unicode/fsm/sentence_break_fwd.rs":"2c6147825fd78c15ecdb952d368d519f81bbf196eedf3e90e927699e832c7080","src/unicode/fsm/simple_word_fwd.bigendian.dfa":"635ab3e9c589268ef91a48c8b9b038e156deaf4a9a4475fce49ca75eabddccf7","src/unicode/fsm/simple_word_fwd.littleendian.dfa":"4f92b789385027a9276498a829cc8e5a3ecdd5f3c6d88254c6cd23d95d828c57","src/unicode/fsm/simple_word_fwd.rs":"44a2b90c8b4a2fa50c66cacc1d48afd47a8f7aa4753dd391471b48a9a891be71","src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa":"593c8ad059ab0bee60a2ea25f4c1fc89c105cb19a9bda3fa98d1464b8e87cfc0","src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa":"a04ed70d5dbd969c0af0e12bec5033ca910161c486f741dd0a792d2e5b0cc6f6","src/unicode/fsm/whitespace_anchored_fwd.rs":"e0f3f0be717ff306409ea9242f507847c4c0fa7469eccbd98a849389afe7fd26","src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa":"9ab09359ce73058d22e5bfa857e040831d49f4a53dd25da804136e9db9e7f5fb","src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa":"cb5804786bd98bfe0726f3f7733d72bc1d69130c8a8b026465c03c78f8c8ac79","src/unicode/fsm/whitespace_anchored_rev.rs":"8f27a50dfe549db99e768225c2956489f233f5a4986857a5ff5f2b507375a69d","src/unicode/fsm/word_break_fwd.bigendian.dfa":"72278d401ac119d50e06c6b8a4cb29d54366d1da536cfaedc3a20bb8cc1b742c","src/unicode/fsm/word_break_fwd.littleendian.dfa":"97dcdca86472d96faadd15d1c0328a76484971f3c4ad409c29049791cb9ed3eb","src/unicode/fsm/word_break_fwd.rs":"3ce6b28015dd6f2b330759da46263e8227f921e2370d4aa6922b8be26db558a8","src/unicode/grapheme.rs":"9bac56709754b48d42ee35282d5752c554a5af3e70b08c01977872f70ffa2afd","src/unicode/mod.rs":"fc67b0d64e9395398235c8663706b246edc0742e6cfe87057eeabdb1b19ad28d","src/unicode/sentence.rs":"8af1f274f80120b04928a6e560bfb3f2c9176d882f729265494b1a796a370681","src/unicode/whitespace.rs":"1fe313906fce009060267ae14de10e0ce577f0e2e2018273ee79d008dc9cf2f5","src/unicode/word.rs":"ed1bef53cf01ef6e682898e802e1654356a763c3993b4f16898eb5ed4b5e7637","src/utf8.rs":"e759713023dc3e5f9f5b2e6c3ba601af591ce5b2ad71aba729e3c29bcf6007e3"},"package":"c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09"} \ No newline at end of file diff --git a/vendor/bstr/Cargo.lock b/vendor/bstr/Cargo.lock index 07bf058b7..116430881 100644 --- a/vendor/bstr/Cargo.lock +++ b/vendor/bstr/Cargo.lock @@ -4,7 +4,7 @@ version = 3 [[package]] name = "bstr" -version = "1.3.0" +version = "1.4.0" dependencies = [ "memchr", "once_cell", diff --git a/vendor/bstr/Cargo.toml b/vendor/bstr/Cargo.toml index 135bd386b..97f6d733a 100644 --- a/vendor/bstr/Cargo.toml +++ b/vendor/bstr/Cargo.toml @@ -13,7 +13,7 @@ edition = "2021" rust-version = "1.60" name = "bstr" -version = "1.3.0" +version = "1.4.0" authors = ["Andrew Gallant "] exclude = ["/.github"] description = "A string type that is not required to be valid UTF-8." diff --git a/vendor/bstr/README.md b/vendor/bstr/README.md index 080926e87..69cb16379 100644 --- a/vendor/bstr/README.md +++ b/vendor/bstr/README.md @@ -25,13 +25,7 @@ incorrect to require valid UTF-8. ### Usage -Add this to your `Cargo.toml`: - -```toml -[dependencies] -bstr = "1" -``` - +`cargo add bstr` ### Examples diff --git a/vendor/bstr/src/escape_bytes.rs b/vendor/bstr/src/escape_bytes.rs new file mode 100644 index 000000000..62c1fcdda --- /dev/null +++ b/vendor/bstr/src/escape_bytes.rs @@ -0,0 +1,445 @@ +/// An iterator of `char` values that represent an escaping of arbitrary bytes. +/// +/// The lifetime parameter `'a` refers to the lifetime of the bytes being +/// escaped. +/// +/// This iterator is created by the +/// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method. +#[derive(Clone, Debug)] +pub struct EscapeBytes<'a> { + remaining: &'a [u8], + state: EscapeState, +} + +impl<'a> EscapeBytes<'a> { + pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes { + EscapeBytes { remaining: bytes, state: EscapeState::Start } + } +} + +impl<'a> Iterator for EscapeBytes<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + use self::EscapeState::*; + + match self.state { + Start => { + let byte = match crate::decode_utf8(self.remaining) { + (None, 0) => return None, + // If we see invalid UTF-8 or ASCII, then we always just + // peel one byte off. If it's printable ASCII, we'll pass + // it through as-is below. Otherwise, below, it will get + // escaped in some way. + (None, _) | (Some(_), 1) => { + let byte = self.remaining[0]; + self.remaining = &self.remaining[1..]; + byte + } + // For any valid UTF-8 that is not ASCII, we pass it + // through as-is. We don't do any Unicode escaping. + (Some(ch), size) => { + self.remaining = &self.remaining[size..]; + return Some(ch); + } + }; + self.state = match byte { + 0x21..=0x5B | 0x5D..=0x7E => { + return Some(char::from(byte)) + } + b'\0' => SpecialEscape('0'), + b'\n' => SpecialEscape('n'), + b'\r' => SpecialEscape('r'), + b'\t' => SpecialEscape('t'), + b'\\' => SpecialEscape('\\'), + _ => HexEscapeX(byte), + }; + Some('\\') + } + SpecialEscape(ch) => { + self.state = Start; + Some(ch) + } + HexEscapeX(byte) => { + self.state = HexEscapeHighNybble(byte); + Some('x') + } + HexEscapeHighNybble(byte) => { + self.state = HexEscapeLowNybble(byte); + let nybble = byte >> 4; + Some(hexdigit_to_char(nybble)) + } + HexEscapeLowNybble(byte) => { + self.state = Start; + let nybble = byte & 0xF; + Some(hexdigit_to_char(nybble)) + } + } + } +} + +impl<'a> core::fmt::Display for EscapeBytes<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use core::fmt::Write; + for ch in self.clone() { + f.write_char(ch)?; + } + Ok(()) + } +} + +/// The state used by the FSM in the escaping iterator. +#[derive(Clone, Debug)] +enum EscapeState { + /// Read and remove the next byte from 'remaining'. If 'remaining' is + /// empty, then return None. Otherwise, escape the byte according to the + /// following rules or emit it as-is. + /// + /// If it's \n, \r, \t, \\ or \0, then emit a '\' and set the current + /// state to 'SpecialEscape(n | r | t | \ | 0)'. Otherwise, if the 'byte' + /// is not in [\x21-\x5B\x5D-\x7E], then emit a '\' and set the state to + /// to 'HexEscapeX(byte)'. + Start, + /// Emit the given codepoint as is. This assumes '\' has just been emitted. + /// Then set the state to 'Start'. + SpecialEscape(char), + /// Emit the 'x' part of a hex escape. This assumes '\' has just been + /// emitted. Then set the state to 'HexEscapeHighNybble(byte)'. + HexEscapeX(u8), + /// Emit the high nybble of the byte as a hexadecimal digit. This + /// assumes '\x' has just been emitted. Then set the state to + /// 'HexEscapeLowNybble(byte)'. + HexEscapeHighNybble(u8), + /// Emit the low nybble of the byte as a hexadecimal digit. This assume + /// '\xZ' has just been emitted, where 'Z' is the high nybble of this byte. + /// Then set the state to 'Start'. + HexEscapeLowNybble(u8), +} + +/// An iterator of `u8` values that represent an unescaping of a sequence of +/// codepoints. +/// +/// The type parameter `I` refers to the iterator of codepoints that is +/// unescaped. +/// +/// Currently this iterator is not exposed in the crate API, and instead all +/// we expose is a `ByteVec::unescape` method. Which of course requires an +/// alloc. That's the most convenient form of this, but in theory, we could +/// expose this for core-only use cases too. I'm just not quite sure what the +/// API should be. +#[derive(Clone, Debug)] +pub(crate) struct UnescapeBytes { + it: I, + state: UnescapeState, +} + +impl> UnescapeBytes { + pub(crate) fn new>( + t: T, + ) -> UnescapeBytes { + UnescapeBytes { it: t.into_iter(), state: UnescapeState::Start } + } +} + +impl> Iterator for UnescapeBytes { + type Item = u8; + + fn next(&mut self) -> Option { + use self::UnescapeState::*; + + loop { + match self.state { + Start => { + let ch = self.it.next()?; + match ch { + '\\' => { + self.state = Escape; + } + ch => { + self.state = UnescapeState::bytes(&[], ch); + } + } + } + Bytes { buf, mut cur, len } => { + let byte = buf[cur]; + cur += 1; + if cur >= len { + self.state = Start; + } else { + self.state = Bytes { buf, cur, len }; + } + return Some(byte); + } + Escape => { + let ch = match self.it.next() { + Some(ch) => ch, + None => { + self.state = Start; + // Incomplete escape sequences unescape as + // themselves. + return Some(b'\\'); + } + }; + match ch { + '0' => { + self.state = Start; + return Some(b'\x00'); + } + '\\' => { + self.state = Start; + return Some(b'\\'); + } + 'r' => { + self.state = Start; + return Some(b'\r'); + } + 'n' => { + self.state = Start; + return Some(b'\n'); + } + 't' => { + self.state = Start; + return Some(b'\t'); + } + 'x' => { + self.state = HexFirst; + } + ch => { + // An invalid escape sequence unescapes as itself. + self.state = UnescapeState::bytes(&[b'\\'], ch); + } + } + } + HexFirst => { + let ch = match self.it.next() { + Some(ch) => ch, + None => { + // An incomplete escape sequence unescapes as + // itself. + self.state = UnescapeState::bytes_raw(&[b'x']); + return Some(b'\\'); + } + }; + match ch { + '0'..='9' | 'A'..='F' | 'a'..='f' => { + self.state = HexSecond(ch); + } + ch => { + // An invalid escape sequence unescapes as itself. + self.state = UnescapeState::bytes(&[b'x'], ch); + return Some(b'\\'); + } + } + } + HexSecond(first) => { + let second = match self.it.next() { + Some(ch) => ch, + None => { + // An incomplete escape sequence unescapes as + // itself. + self.state = UnescapeState::bytes(&[b'x'], first); + return Some(b'\\'); + } + }; + match second { + '0'..='9' | 'A'..='F' | 'a'..='f' => { + self.state = Start; + let hinybble = char_to_hexdigit(first); + let lonybble = char_to_hexdigit(second); + let byte = hinybble << 4 | lonybble; + return Some(byte); + } + ch => { + // An invalid escape sequence unescapes as itself. + self.state = + UnescapeState::bytes2(&[b'x'], first, ch); + return Some(b'\\'); + } + } + } + } + } + } +} + +/// The state used by the FSM in the unescaping iterator. +#[derive(Clone, Debug)] +enum UnescapeState { + /// The start state. Look for an escape sequence, otherwise emit the next + /// codepoint as-is. + Start, + /// Emit the byte at `buf[cur]`. + /// + /// This state should never be created when `cur >= len`. That is, when + /// this state is visited, it is assumed that `cur < len`. + Bytes { buf: [u8; 11], cur: usize, len: usize }, + /// This state is entered after a `\` is seen. + Escape, + /// This state is entered after a `\x` is seen. + HexFirst, + /// This state is entered after a `\xN` is seen, where `N` is in + /// `[0-9A-Fa-f]`. The given codepoint corresponds to `N`. + HexSecond(char), +} + +impl UnescapeState { + /// Create a new `Bytes` variant with the given slice. + /// + /// # Panics + /// + /// Panics if `bytes.len() > 11`. + fn bytes_raw(bytes: &[u8]) -> UnescapeState { + // This can be increased, you just need to make sure 'buf' in the + // 'Bytes' state has enough room. + assert!(bytes.len() <= 11, "no more than 11 bytes allowed"); + let mut buf = [0; 11]; + buf[..bytes.len()].copy_from_slice(bytes); + UnescapeState::Bytes { buf, cur: 0, len: bytes.len() } + } + + /// Create a new `Bytes` variant with the prefix byte slice, followed by + /// the UTF-8 encoding of the given char. + /// + /// # Panics + /// + /// Panics if `prefix.len() > 3`. + fn bytes(prefix: &[u8], ch: char) -> UnescapeState { + // This can be increased, you just need to make sure 'buf' in the + // 'Bytes' state has enough room. + assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); + let mut buf = [0; 11]; + buf[..prefix.len()].copy_from_slice(prefix); + let chlen = ch.encode_utf8(&mut buf[prefix.len()..]).len(); + UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + chlen } + } + + /// Create a new `Bytes` variant with the prefix byte slice, followed by + /// the UTF-8 encoding of `ch1` and then `ch2`. + /// + /// # Panics + /// + /// Panics if `prefix.len() > 3`. + fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState { + // This can be increased, you just need to make sure 'buf' in the + // 'Bytes' state has enough room. + assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); + let mut buf = [0; 11]; + buf[..prefix.len()].copy_from_slice(prefix); + let len1 = ch1.encode_utf8(&mut buf[prefix.len()..]).len(); + let len2 = ch2.encode_utf8(&mut buf[prefix.len() + len1..]).len(); + UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + len1 + len2 } + } +} + +/// Convert the given codepoint to its corresponding hexadecimal digit. +/// +/// # Panics +/// +/// This panics if `ch` is not in `[0-9A-Fa-f]`. +fn char_to_hexdigit(ch: char) -> u8 { + u8::try_from(ch.to_digit(16).unwrap()).unwrap() +} + +/// Convert the given hexadecimal digit to its corresponding codepoint. +/// +/// # Panics +/// +/// This panics when `digit > 15`. +fn hexdigit_to_char(digit: u8) -> char { + char::from_digit(u32::from(digit), 16).unwrap().to_ascii_uppercase() +} + +#[cfg(all(test, feature = "std"))] +mod tests { + use crate::BString; + + use super::*; + + #[allow(non_snake_case)] + fn B>(bytes: B) -> BString { + BString::from(bytes.as_ref()) + } + + fn e>(bytes: B) -> String { + EscapeBytes::new(bytes.as_ref()).to_string() + } + + fn u(string: &str) -> BString { + UnescapeBytes::new(string.chars()).collect() + } + + #[test] + fn escape() { + assert_eq!(r"a", e(br"a")); + assert_eq!(r"\\x61", e(br"\x61")); + assert_eq!(r"a", e(b"\x61")); + assert_eq!(r"~", e(b"\x7E")); + assert_eq!(r"\x7F", e(b"\x7F")); + + assert_eq!(r"\n", e(b"\n")); + assert_eq!(r"\r", e(b"\r")); + assert_eq!(r"\t", e(b"\t")); + assert_eq!(r"\\", e(b"\\")); + assert_eq!(r"\0", e(b"\0")); + assert_eq!(r"\0", e(b"\x00")); + + assert_eq!(r"\x88", e(b"\x88")); + assert_eq!(r"\x8F", e(b"\x8F")); + assert_eq!(r"\xF8", e(b"\xF8")); + assert_eq!(r"\xFF", e(b"\xFF")); + + assert_eq!(r"\xE2", e(b"\xE2")); + assert_eq!(r"\xE2\x98", e(b"\xE2\x98")); + assert_eq!(r"☃", e(b"\xE2\x98\x83")); + + assert_eq!(r"\xF0", e(b"\xF0")); + assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F")); + assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92")); + assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9")); + } + + #[test] + fn unescape() { + assert_eq!(B(r"a"), u(r"a")); + assert_eq!(B(r"\x61"), u(r"\\x61")); + assert_eq!(B(r"a"), u(r"\x61")); + assert_eq!(B(r"~"), u(r"\x7E")); + assert_eq!(B(b"\x7F"), u(r"\x7F")); + + assert_eq!(B(b"\n"), u(r"\n")); + assert_eq!(B(b"\r"), u(r"\r")); + assert_eq!(B(b"\t"), u(r"\t")); + assert_eq!(B(b"\\"), u(r"\\")); + assert_eq!(B(b"\0"), u(r"\0")); + assert_eq!(B(b"\0"), u(r"\x00")); + + assert_eq!(B(b"\x88"), u(r"\x88")); + assert_eq!(B(b"\x8F"), u(r"\x8F")); + assert_eq!(B(b"\xF8"), u(r"\xF8")); + assert_eq!(B(b"\xFF"), u(r"\xFF")); + + assert_eq!(B(b"\xE2"), u(r"\xE2")); + assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98")); + assert_eq!(B("☃"), u(r"\xE2\x98\x83")); + + assert_eq!(B(b"\xF0"), u(r"\xf0")); + assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f")); + assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92")); + assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9")); + } + + #[test] + fn unescape_weird() { + assert_eq!(B(b"\\"), u(r"\")); + assert_eq!(B(b"\\"), u(r"\\")); + assert_eq!(B(b"\\x"), u(r"\x")); + assert_eq!(B(b"\\xA"), u(r"\xA")); + + assert_eq!(B(b"\\xZ"), u(r"\xZ")); + assert_eq!(B(b"\\xZZ"), u(r"\xZZ")); + assert_eq!(B(b"\\i"), u(r"\i")); + assert_eq!(B(b"\\u"), u(r"\u")); + assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}")); + } +} diff --git a/vendor/bstr/src/ext_slice.rs b/vendor/bstr/src/ext_slice.rs index 91af45083..5e1801971 100644 --- a/vendor/bstr/src/ext_slice.rs +++ b/vendor/bstr/src/ext_slice.rs @@ -10,6 +10,7 @@ use std::{ffi::OsStr, path::Path}; use memchr::{memchr, memmem, memrchr}; +use crate::escape_bytes::EscapeBytes; #[cfg(feature = "alloc")] use crate::ext_vec::ByteVec; #[cfg(feature = "unicode")] @@ -2765,6 +2766,47 @@ pub trait ByteSlice: private::Sealed { self.as_bytes_mut().make_ascii_uppercase(); } + /// Escapes this byte string into a sequence of `char` values. + /// + /// When the sequence of `char` values is concatenated into a string, the + /// result is always valid UTF-8. Any unprintable or invalid UTF-8 in this + /// byte string are escaped using using `\xNN` notation. Moreover, the + /// characters `\0`, `\r`, `\n`, `\t` and `\` are escaped as well. + /// + /// This is useful when one wants to get a human readable view of the raw + /// bytes that is also valid UTF-8. + /// + /// The iterator returned implements the `Display` trait. So one can do + /// `b"foo\xFFbar".escape_bytes().to_string()` to get a `String` with its + /// bytes escaped. + /// + /// The dual of this function is [`ByteVec::unescape_bytes`]. + /// + /// Note that this is similar to, but not equivalent to the `Debug` + /// implementation on [`BStr`] and [`BString`]. The `Debug` implementations + /// also use the debug representation for all Unicode codepoints. However, + /// this escaping routine only escapes individual bytes. All Unicode + /// codepoints above `U+007F` are passed through unchanged without any + /// escaping. + /// + /// # Examples + /// + /// ``` + /// # #[cfg(feature = "alloc")] { + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!(r"foo\xFFbar", b"foo\xFFbar".escape_bytes().to_string()); + /// assert_eq!(r"foo\nbar", b"foo\nbar".escape_bytes().to_string()); + /// assert_eq!(r"foo\tbar", b"foo\tbar".escape_bytes().to_string()); + /// assert_eq!(r"foo\\bar", b"foo\\bar".escape_bytes().to_string()); + /// assert_eq!(r"foo☃bar", B("foo☃bar").escape_bytes().to_string()); + /// # } + /// ``` + #[inline] + fn escape_bytes(&self) -> EscapeBytes<'_> { + EscapeBytes::new(self.as_bytes()) + } + /// Reverse the bytes in this string, in place. /// /// This is not necessarily a well formed operation! For example, if this diff --git a/vendor/bstr/src/ext_vec.rs b/vendor/bstr/src/ext_vec.rs index b8e2be2cf..0c18121e0 100644 --- a/vendor/bstr/src/ext_vec.rs +++ b/vendor/bstr/src/ext_vec.rs @@ -296,6 +296,107 @@ pub trait ByteVec: private::Sealed { Vec::from_os_str_lossy(path.as_os_str()) } + /// Unescapes the given string into its raw bytes. + /// + /// This looks for the escape sequences `\xNN`, `\0`, `\r`, `\n`, `\t` + /// and `\` and translates them into their corresponding unescaped form. + /// + /// Incomplete escape sequences or things that look like escape sequences + /// but are not (for example, `\i` or `\xYZ`) are passed through literally. + /// + /// This is the dual of [`ByteSlice::escape_bytes`]. + /// + /// Note that the zero or NUL byte may be represented as either `\0` or + /// `\x00`. Both will be unescaped into the zero byte. + /// + /// # Examples + /// + /// This shows basic usage: + /// + /// ``` + /// # #[cfg(feature = "alloc")] { + /// use bstr::{B, BString, ByteVec}; + /// + /// assert_eq!( + /// BString::from(b"foo\xFFbar"), + /// Vec::unescape_bytes(r"foo\xFFbar"), + /// ); + /// assert_eq!( + /// BString::from(b"foo\nbar"), + /// Vec::unescape_bytes(r"foo\nbar"), + /// ); + /// assert_eq!( + /// BString::from(b"foo\tbar"), + /// Vec::unescape_bytes(r"foo\tbar"), + /// ); + /// assert_eq!( + /// BString::from(b"foo\\bar"), + /// Vec::unescape_bytes(r"foo\\bar"), + /// ); + /// assert_eq!( + /// BString::from("foo☃bar"), + /// Vec::unescape_bytes(r"foo☃bar"), + /// ); + /// + /// # } + /// ``` + /// + /// This shows some examples of how incomplete or "incorrect" escape + /// sequences get passed through literally. + /// + /// ``` + /// # #[cfg(feature = "alloc")] { + /// use bstr::{B, BString, ByteVec}; + /// + /// // Show some incomplete escape sequences. + /// assert_eq!( + /// BString::from(br"\"), + /// Vec::unescape_bytes(r"\"), + /// ); + /// assert_eq!( + /// BString::from(br"\"), + /// Vec::unescape_bytes(r"\\"), + /// ); + /// assert_eq!( + /// BString::from(br"\x"), + /// Vec::unescape_bytes(r"\x"), + /// ); + /// assert_eq!( + /// BString::from(br"\xA"), + /// Vec::unescape_bytes(r"\xA"), + /// ); + /// // And now some that kind of look like escape + /// // sequences, but aren't. + /// assert_eq!( + /// BString::from(br"\xZ"), + /// Vec::unescape_bytes(r"\xZ"), + /// ); + /// assert_eq!( + /// BString::from(br"\xZZ"), + /// Vec::unescape_bytes(r"\xZZ"), + /// ); + /// assert_eq!( + /// BString::from(br"\i"), + /// Vec::unescape_bytes(r"\i"), + /// ); + /// assert_eq!( + /// BString::from(br"\u"), + /// Vec::unescape_bytes(r"\u"), + /// ); + /// assert_eq!( + /// BString::from(br"\u{2603}"), + /// Vec::unescape_bytes(r"\u{2603}"), + /// ); + /// + /// # } + /// ``` + #[inline] + #[cfg(feature = "alloc")] + fn unescape_bytes>(escaped: S) -> Vec { + let s = escaped.as_ref(); + crate::escape_bytes::UnescapeBytes::new(s.chars()).collect() + } + /// Appends the given byte to the end of this byte string. /// /// Note that this is equivalent to the generic `Vec::push` method. This diff --git a/vendor/bstr/src/impls.rs b/vendor/bstr/src/impls.rs index c063cb6b6..e017cf1ac 100644 --- a/vendor/bstr/src/impls.rs +++ b/vendor/bstr/src/impls.rs @@ -156,6 +156,20 @@ mod bstring { } } + impl<'a, const N: usize> From<&'a [u8; N]> for BString { + #[inline] + fn from(s: &'a [u8; N]) -> BString { + BString::from(&s[..]) + } + } + + impl From<[u8; N]> for BString { + #[inline] + fn from(s: [u8; N]) -> BString { + BString::from(&s[..]) + } + } + impl<'a> From<&'a [u8]> for BString { #[inline] fn from(s: &'a [u8]) -> BString { @@ -610,6 +624,13 @@ mod bstr { } } + impl<'a, const N: usize> From<&'a [u8; N]> for &'a BStr { + #[inline] + fn from(s: &'a [u8; N]) -> &'a BStr { + BStr::from_bytes(s) + } + } + impl<'a> From<&'a [u8]> for &'a BStr { #[inline] fn from(s: &'a [u8]) -> &'a BStr { diff --git a/vendor/bstr/src/lib.rs b/vendor/bstr/src/lib.rs index 3d334ac63..8598e4458 100644 --- a/vendor/bstr/src/lib.rs +++ b/vendor/bstr/src/lib.rs @@ -414,6 +414,7 @@ extern crate alloc; pub use crate::bstr::BStr; #[cfg(feature = "alloc")] pub use crate::bstring::BString; +pub use crate::escape_bytes::EscapeBytes; #[cfg(feature = "unicode")] pub use crate::ext_slice::Fields; pub use crate::ext_slice::{ @@ -437,6 +438,7 @@ mod bstr; #[cfg(feature = "alloc")] mod bstring; mod byteset; +mod escape_bytes; mod ext_slice; #[cfg(feature = "alloc")] mod ext_vec; -- cgit v1.2.3