summaryrefslogtreecommitdiffstats
path: root/third_party/rust/xmldecl/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/xmldecl/src/lib.rs')
-rw-r--r--third_party/rust/xmldecl/src/lib.rs371
1 files changed, 371 insertions, 0 deletions
diff --git a/third_party/rust/xmldecl/src/lib.rs b/third_party/rust/xmldecl/src/lib.rs
new file mode 100644
index 0000000000..ed4fb43a85
--- /dev/null
+++ b/third_party/rust/xmldecl/src/lib.rs
@@ -0,0 +1,371 @@
+// Copyright Mozilla Foundation
+//
+// Licensed under the Apache License (Version 2.0), or the MIT license,
+// (the "Licenses") at your option. You may not use this file except in
+// compliance with one of the Licenses. You may obtain copies of the
+// Licenses at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Licenses is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the Licenses for the specific language governing permissions and
+// limitations under the Licenses.
+
+//! `xmldecl::parse()` extracts an encoding from an ASCII-based bogo-XML
+//! declaration in `text/html` in a WebKit-compatible way.
+
+extern crate encoding_rs;
+
+fn position(needle: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().position(|&x| x == needle)
+}
+
+// The standard library lacks subslice search.
+// Since our needle, "encoding" is short, 'g' occurs in it only once,
+// and the other letters we expect to skip over are "version", let's
+// search for 'g' and verify match.
+fn skip_encoding(hay: &[u8]) -> Option<&[u8]> {
+ let mut haystack = hay;
+ loop {
+ if let Some(g) = position(b'g', haystack) {
+ let (head, tail) = haystack.split_at(g + 1);
+ if let Some(_) = head.strip_suffix(b"encoding") {
+ return Some(tail);
+ }
+ haystack = tail;
+ } else {
+ return None;
+ }
+ }
+}
+
+/// Extracts an encoding from an ASCII-based bogo-XML declaration.
+/// `bytes` must the prefix of a `text/html` resource.
+///
+/// The intended use is that when the `meta` prescan fails, the HTML
+/// parser will have buffered the head section or the first 1024
+/// bytes (whichever is larger) at which point the should be passed to
+/// this function.
+pub fn parse(bytes: &[u8]) -> Option<&'static encoding_rs::Encoding> {
+ if let Some(after_xml) = bytes.strip_prefix(b"<?xml") {
+ if let Some(gt) = position(b'>', after_xml) {
+ let until_gt = &after_xml[..gt];
+ if let Some(tail) = skip_encoding(until_gt) {
+ let mut pos = 0;
+ loop {
+ if pos >= tail.len() {
+ return None;
+ }
+ let c = tail[pos];
+ pos += 1;
+ if c == b'=' {
+ break;
+ }
+ if c <= b' ' {
+ continue;
+ }
+ return None;
+ }
+ // pos is now the index of the byte after =
+ let is_single_quoted;
+ let label_start;
+ loop {
+ if pos >= tail.len() {
+ return None;
+ }
+ let c = tail[pos];
+ pos += 1;
+ if c == b'"' {
+ is_single_quoted = false;
+ label_start = pos;
+ break;
+ }
+ if c == b'\'' {
+ is_single_quoted = true;
+ label_start = pos;
+ break;
+ }
+ if c <= b' ' {
+ continue;
+ }
+ return None;
+ }
+ loop {
+ if pos >= tail.len() {
+ return None;
+ }
+ let c = tail[pos];
+ if c <= b' ' {
+ return None;
+ }
+ if (c == b'"' && !is_single_quoted) || (c == b'\'' && is_single_quoted) {
+ let encoding = encoding_rs::Encoding::for_label(&tail[label_start..pos]);
+ if encoding == Some(encoding_rs::UTF_16LE)
+ || encoding == Some(encoding_rs::UTF_16BE)
+ {
+ return Some(encoding_rs::UTF_8);
+ }
+ return encoding;
+ }
+ pos += 1;
+ }
+ }
+ }
+ }
+ None
+}
+
+// Any copyright to the test code below this comment is dedicated to the
+// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
+
+#[cfg(test)]
+mod tests {
+ use super::parse;
+ #[test]
+ fn baseline() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn meta_encoding_before_charset() {
+ assert_eq!(parse(b"<?xml version=\"1.0\" <meta encoding=\"windows-1251\" charset=\"windows-1253\"?>AAAA"), Some(encoding_rs::WINDOWS_1251));
+ }
+ #[test]
+ fn lt() {
+ assert_eq!(
+ parse(b"<?xml<encoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn unmatched_quotes() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"windows-1251'?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn no_version() {
+ assert_eq!(
+ parse(b"<?xml encoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_quotes_space() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=windows-1251 ?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn no_quotes() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=windows-1251?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn no_space_no_version_line_breaks_trailing_body() {
+ assert_eq!(
+ parse(b"<?xmlencoding \n = \n 'windows-1251'<body>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn space_before_label() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\" windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn space_after_label() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"windows-1251 \"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn one_around_label() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"\x01windows-1251\x01\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn one_around_equals() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding\x01=\x01\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space_trailing_lt_without_question_mark() {
+ assert_eq!(
+ parse(b"<?xmlencoding=\"windows-1251\"<>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space_spaces_around_equals_single_quotes_trailing_body() {
+ assert_eq!(
+ parse(b"<?xmlencoding = 'windows-1251'<body>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space_single_quotes_trailing_body() {
+ assert_eq!(
+ parse(b"<?xmlencoding='windows-1251'<body>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space_double_quotes_trailing_body() {
+ assert_eq!(
+ parse(b"<?xmlencoding=\"windows-1251\"<body>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space_no_trailing_question_mark() {
+ assert_eq!(
+ parse(b"<?xmlencoding=\"windows-1251\">AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn no_version_no_space() {
+ assert_eq!(
+ parse(b"<?xmlencoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn upper_case_xml() {
+ assert_eq!(
+ parse(b"<?XML version=\"1.0\" encoding=\"windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn meta_charset_before_encoding() {
+ assert_eq!(parse(b"<?xml version=\"1.0\" <meta charset=\"windows-1253\" encoding=\"windows-1251\"?>AAAA"), Some(encoding_rs::WINDOWS_1251));
+ }
+ #[test]
+ fn lt_between_xml_and_encoding() {
+ assert_eq!(
+ parse(b"<?xml<encoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn letter_between_xml_and_encoding() {
+ assert_eq!(
+ parse(b"<?xmlaencoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn gt_between_xml_and_encoding() {
+ assert_eq!(parse(b"<?xml>encoding=\"windows-1251\"?>"), None);
+ }
+ #[test]
+ fn non_primary_label() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"cp1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn upper_case_label() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"WINDOWS-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn upper_case_version() {
+ assert_eq!(
+ parse(b"<?xml VERSION=\"1.0\" encoding=\"windows-1251\"?>AAAA"),
+ Some(encoding_rs::WINDOWS_1251)
+ );
+ }
+ #[test]
+ fn upper_case_encoding() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" ENCODING=\"windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn space_before() {
+ assert_eq!(
+ parse(b" <?xml version=\"1.0\" encoding=\"windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn encoding_equals_encoding() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=encoding=\"windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn encodingencoding() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encodingencoding=\"windows-1251\"?>AAAA"),
+ None
+ );
+ }
+ #[test]
+ fn utf16() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"UTF-16\"?>AAAA"),
+ Some(encoding_rs::UTF_8)
+ );
+ }
+ #[test]
+ fn utf16le() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"UTF-16LE\"?>AAAA"),
+ Some(encoding_rs::UTF_8)
+ );
+ }
+ #[test]
+ fn utf16be() {
+ assert_eq!(
+ parse(b"<?xml version=\"1.0\" encoding=\"UTF-16BE\"?>AAAA"),
+ Some(encoding_rs::UTF_8)
+ );
+ }
+ #[test]
+ fn bytes_1024() {
+ let mut v = Vec::new();
+ v.extend_from_slice(b"<?xml version=\"1.0\" encoding=\"windows-1251\"");
+ while v.len() < 1022 {
+ v.push(b' ');
+ }
+ v.extend_from_slice(b"?>AAAA");
+ assert_eq!(v.len(), 1028);
+ assert_eq!(parse(&v), Some(encoding_rs::WINDOWS_1251));
+ }
+ #[test]
+ fn bytes_1025() {
+ let mut v = Vec::new();
+ v.extend_from_slice(b"<?xml version=\"1.0\" encoding=\"windows-1251\"");
+ while v.len() < 1023 {
+ v.push(b' ');
+ }
+ v.extend_from_slice(b"?>AAAA");
+ assert_eq!(v.len(), 1029);
+ assert_eq!(parse(&v), Some(encoding_rs::WINDOWS_1251));
+ }
+}