1 files changed, 218 insertions, 0 deletions
diff --git a/vendor/content_inspector/src/lib.rs b/vendor/content_inspector/src/lib.rs
new file mode 100644
index 000000000..de846cce1
--- /dev/null
+++ b/vendor/content_inspector/src/lib.rs
@@ -0,0 +1,218 @@
+//! A simple library for *fast* inspection of binary buffers to guess the type of content.
+//!
+//! This is mainly intended to quickly determine whether a given buffer contains "binary"
+//! or "text" data. Programs like `grep` or `git diff` use similar mechanisms to decide whether
+//! to treat some files as "binary data" or not.
+//!
+//! The analysis is based on a very simple heuristic: searching for NULL bytes (indicating "binary"
+//! content), detecting special [byte order marks](https://en.wikipedia.org/wiki/Byte_order_mark)
+//! (indicating a particular kind of textual encoding), and matching the `%PDF` and PNG magic
+//! numbers (also reported as "binary"). Note that **this analysis can fail**. For example, even
+//! if unlikely, UTF-8-encoded text can legally contain NULL bytes. Conversely, some particular
+//! binary formats (like binary [PGM](https://en.wikipedia.org/wiki/Netpbm_format)) may not
+//! contain NULL bytes. Also, for performance reasons, only the first 1024 bytes are checked for
+//! NULL bytes (if no BOM was detected).
+//!
+//! If this library reports a certain type of encoding (say `UTF_16LE`), there is **no guarantee**
+//! that the binary buffer can *actually* be decoded as UTF-16LE.
+//!
+//! # Example
+//! ```
+//! use content_inspector::{ContentType, inspect};
+//!
+//! assert_eq!(ContentType::UTF_8, inspect(b"Hello"));
+//! assert_eq!(ContentType::BINARY, inspect(b"\xFF\xE0\x00\x10\x4A\x46\x49\x46\x00"));
+//!
+//! assert!(inspect(b"Hello").is_text());
+//! ```
+
+extern crate memchr;
+
+use memchr::memchr;
+use std::cmp::min;
+use std::fmt;
+
+const MAX_SCAN_SIZE: usize = 1024; // max. number of leading bytes searched for a zero byte
+
+/// The type of encoding that `inspect` guessed (for "text" data) or `BINARY` for "binary" data.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum ContentType {
+    /// "binary" data (a zero byte in the scanned prefix, or a known magic number)
+    BINARY,
+
+    /// "text" data without a byte order mark (the fallback; UTF-8 validity is not checked)
+    UTF_8,
+
+    /// UTF-8 encoded "text" data that starts with the byte order mark `EF BB BF`
+    UTF_8_BOM,
+
+    /// UTF-16 encoded "text" data (little endian), detected by the byte order mark `FF FE`
+    UTF_16LE,
+
+    /// UTF-16 encoded "text" data (big endian), detected by the byte order mark `FE FF`
+    UTF_16BE,
+
+    /// UTF-32 encoded "text" data (little endian), detected by the byte order mark `FF FE 00 00`
+    UTF_32LE,
+
+    /// UTF-32 encoded "text" data (big endian), detected by the byte order mark `00 00 FE FF`
+    UTF_32BE,
+}
+
+impl ContentType {
+    /// Reports whether the content was classified as "binary" data.
+    pub fn is_binary(self) -> bool {
+        ContentType::BINARY == self
+    }
+
+    /// Reports whether the content was classified as "text" data, i.e. as anything but `BINARY`.
+    pub fn is_text(self) -> bool {
+        self != ContentType::BINARY
+    }
+}
+
+impl fmt::Display for ContentType {
+    /// Writes the human-readable name of the content type, e.g. `UTF-16LE` or `binary`.
+    ///
+    /// The name is written verbatim; width, fill and precision flags are not applied.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str(match *self {
+            ContentType::BINARY => "binary",
+            ContentType::UTF_8 => "UTF-8",
+            ContentType::UTF_8_BOM => "UTF-8-BOM",
+            ContentType::UTF_16LE => "UTF-16LE",
+            ContentType::UTF_16BE => "UTF-16BE",
+            ContentType::UTF_32LE => "UTF-32LE",
+            ContentType::UTF_32BE => "UTF-32BE",
+        })
+    }
+}
+
+/// Common byte order marks, paired with the content type each one indicates. `inspect` tries them
+/// in order and the first match wins (see https://en.wikipedia.org/wiki/Byte_order_mark).
+static BYTE_ORDER_MARKS: &[(&[u8], ContentType)] = &[
+    (&[0xEF, 0xBB, 0xBF], ContentType::UTF_8_BOM),
+    // UTF-32 must come before UTF-16: the UTF-32LE mark starts with the UTF-16LE mark (FF FE).
+    (&[0x00, 0x00, 0xFE, 0xFF], ContentType::UTF_32BE),
+    (&[0xFF, 0xFE, 0x00, 0x00], ContentType::UTF_32LE),
+    (&[0xFE, 0xFF], ContentType::UTF_16BE),
+    (&[0xFF, 0xFE], ContentType::UTF_16LE),
+];
+
+/// Leading magic numbers (PDF, PNG) of file types that could otherwise be characterized as text.
+static MAGIC_NUMBERS: [&[u8]; 2] = [b"%PDF", b"\x89PNG"];
+
+/// Guess the type of content in `buffer`. See the crate documentation for a usage example.
+///
+/// The checks run in this order: a leading byte order mark selects the matching encoding; a zero
+/// byte within the first `MAX_SCAN_SIZE` bytes, or a leading magic number, yields `BINARY`;
+/// everything else, including an empty buffer, is reported as `UTF_8`.
+pub fn inspect(buffer: &[u8]) -> ContentType {
+    // A byte order mark takes precedence over every other check.
+    let by_bom = BYTE_ORDER_MARKS
+        .iter()
+        .find(|&&(mark, _)| buffer.starts_with(mark));
+    if let Some(&(_, encoding)) = by_bom {
+        return encoding;
+    }
+
+    // Only a bounded prefix is searched for zero bytes, for performance reasons.
+    let prefix = &buffer[..min(buffer.len(), MAX_SCAN_SIZE)];
+    let contains_nul = memchr(0x00, prefix).is_some();
+
+    // Formats listed in `MAGIC_NUMBERS` may pass the zero-byte scan; check their signature too.
+    let known_magic = || MAGIC_NUMBERS.iter().any(|magic| buffer.starts_with(magic));
+
+    if contains_nul || known_magic() {
+        ContentType::BINARY
+    } else {
+        ContentType::UTF_8
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    // Fixtures are embedded at compile time from `../testdata/`, relative to this source file.
+    // `super::` is spelled out so that the import resolves on every Rust edition.
+    use super::{inspect, ContentType::*};
+
+    // No BOM, no zero byte, no magic number: the `UTF_8` default applies.
+    #[test]
+    fn test_empty_buffer_utf_8() {
+        assert_eq!(UTF_8, inspect(b""));
+    }
+
+    // Multi-byte UTF-8 sequences never contain a zero byte.
+    #[test]
+    fn test_text_simple() {
+        assert_eq!(UTF_8, inspect("Simple UTF-8 string ☔".as_bytes()));
+    }
+
+    #[test]
+    fn test_text_utf8() {
+        assert_eq!(UTF_8, inspect(include_bytes!("../testdata/text_UTF-8.txt")));
+    }
+
+    // The `*-BOM.txt` fixtures are expected to be classified by their byte order mark.
+    #[test]
+    fn test_text_utf8_bom() {
+        let data = include_bytes!("../testdata/text_UTF-8-BOM.txt");
+        assert_eq!(UTF_8_BOM, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf16le() {
+        let data = include_bytes!("../testdata/text_UTF-16LE-BOM.txt");
+        assert_eq!(UTF_16LE, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf16be() {
+        let data = include_bytes!("../testdata/text_UTF-16BE-BOM.txt");
+        assert_eq!(UTF_16BE, inspect(data));
+    }
+
+    // The UTF-32LE mark starts with the UTF-16LE mark; the longer one must win.
+    #[test]
+    fn test_text_utf32le() {
+        let data = include_bytes!("../testdata/text_UTF-32LE-BOM.txt");
+        assert_eq!(UTF_32LE, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf32be() {
+        let data = include_bytes!("../testdata/text_UTF-32BE-BOM.txt");
+        assert_eq!(UTF_32BE, inspect(data));
+    }
+
+    // PNG is listed in `MAGIC_NUMBERS`.
+    #[test]
+    fn test_png() {
+        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.png")));
+    }
+
+    // JPEG has no magic-number entry; presumably the fixture has a zero byte near its start.
+    #[test]
+    fn test_jpg() {
+        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.jpg")));
+    }
+
+    // PDF is listed in `MAGIC_NUMBERS`.
+    #[test]
+    fn test_pdf() {
+        assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.pdf")));
+    }
+
+    // Every variant other than `BINARY` counts as text.
+    #[test]
+    fn test_is_text() {
+        assert!(UTF_8.is_text());
+        assert!(UTF_32LE.is_text());
+    }
+
+    #[test]
+    fn test_is_binary() {
+        assert!(BINARY.is_binary());
+    }
+}