diff options
Diffstat (limited to 'vendor/content_inspector/src/lib.rs')
-rw-r--r-- | vendor/content_inspector/src/lib.rs | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/vendor/content_inspector/src/lib.rs b/vendor/content_inspector/src/lib.rs new file mode 100644 index 000000000..de846cce1 --- /dev/null +++ b/vendor/content_inspector/src/lib.rs @@ -0,0 +1,218 @@ +//! A simple library for *fast* inspection of binary buffers to guess the type of content. +//! +//! This is mainly intended to quickly determine whether a given buffer contains "binary" +//! or "text" data. Programs like `grep` or `git diff` use similar mechanisms to decide whether +//! to treat some files as "binary data" or not. +//! +//! The analysis is based on a very simple heuristic: Searching for NULL bytes +//! (indicating "binary" content) and the detection of special [byte order +//! marks](https://en.wikipedia.org/wiki/Byte_order_mark) (indicating a particular kind of textual +//! encoding). Note that **this analysis can fail**. For example, even if unlikely, UTF-8-encoded +//! text can legally contain NULL bytes. Conversely, some particular binary formats (like binary +//! [PGM](https://en.wikipedia.org/wiki/Netpbm_format)) may not contain NULL bytes. Also, for +//! performance reasons, only the first 1024 bytes are checked for the NULL-byte (if no BOM was +//! detected). +//! +//! If this library reports a certain type of encoding (say `UTF_16LE`), there is **no guarantee** +//! that the binary buffer can *actually* be decoded as UTF-16LE. +//! +//! # Example +//! ``` +//! use content_inspector::{ContentType, inspect}; +//! +//! assert_eq!(ContentType::UTF_8, inspect(b"Hello")); +//! assert_eq!(ContentType::BINARY, inspect(b"\xFF\xE0\x00\x10\x4A\x46\x49\x46\x00")); +//! +//! assert!(inspect(b"Hello").is_text()); +//! ``` + +extern crate memchr; + +use memchr::memchr; +use std::cmp::min; +use std::fmt; + +const MAX_SCAN_SIZE: usize = 1024; + +/// The type of encoding that was detected (for "text" data) or `BINARY` for "binary" data. +#[allow(non_camel_case_types)] +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum ContentType { + /// "binary" data + BINARY, + + /// UTF-8 encoded "text" data + UTF_8, + + /// UTF-8 encoded "text" data with a byte order mark. + UTF_8_BOM, + + /// UTF-16 encoded "text" data (little endian) + UTF_16LE, + + /// UTF-16 encoded "text" data (big endian) + UTF_16BE, + + /// UTF-32 encoded "text" data (little endian) + UTF_32LE, + + /// UTF-32 encoded "text" data (big endian) + UTF_32BE, +} + +impl ContentType { + /// Returns `true`, if the `ContentType` is `BINARY`. + pub fn is_binary(self) -> bool { + self == ContentType::BINARY + } + + /// Returns `true`, if the `ContentType` is __not__ `BINARY`. + pub fn is_text(self) -> bool { + !self.is_binary() + } +} + +impl fmt::Display for ContentType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use ContentType::*; + + let name: &str = match *self { + BINARY => "binary", + UTF_8 => "UTF-8", + UTF_8_BOM => "UTF-8-BOM", + UTF_16LE => "UTF-16LE", + UTF_16BE => "UTF-16BE", + UTF_32LE => "UTF-32LE", + UTF_32BE => "UTF-32BE", + }; + write!(f, "{}", name) + } +} + +/// Common byte order marks +/// (see https://en.wikipedia.org/wiki/Byte_order_mark) +static BYTE_ORDER_MARKS: &[(&[u8], ContentType)] = &[ + (&[0xEF, 0xBB, 0xBF], ContentType::UTF_8_BOM), + // UTF-32 needs to be checked before UTF-16 (overlapping BOMs) + (&[0x00, 0x00, 0xFE, 0xFF], ContentType::UTF_32BE), + (&[0xFF, 0xFE, 0x00, 0x00], ContentType::UTF_32LE), + (&[0xFE, 0xFF], ContentType::UTF_16BE), + (&[0xFF, 0xFE], ContentType::UTF_16LE), +]; + +/// Magic numbers for some filetypes that could otherwise be characterized as text. +static MAGIC_NUMBERS: [&[u8]; 2] = [b"%PDF", b"\x89PNG"]; + +/// Try to determine the type of content in the given buffer. See the crate documentation for a +/// usage example and for more details on how this analysis is performed. +/// +/// If the buffer is empty, the content type will be reported as `UTF_8`. +pub fn inspect(buffer: &[u8]) -> ContentType { + use ContentType::*; + + for &(bom, content_type) in BYTE_ORDER_MARKS { + if buffer.starts_with(bom) { + return content_type; + } + } + + // Scan the first few bytes for zero-bytes + let scan_size = min(buffer.len(), MAX_SCAN_SIZE); + let has_zero_bytes = memchr(0x00, &buffer[..scan_size]).is_some(); + + if has_zero_bytes { + return BINARY; + } + + if MAGIC_NUMBERS.iter().any(|magic| buffer.starts_with(magic)) { + return BINARY; + } + + UTF_8 +} + +#[cfg(test)] +mod tests { + use {inspect, ContentType::*}; + + #[test] + fn test_empty_buffer_utf_8() { + assert_eq!(UTF_8, inspect(b"")); + } + + #[test] + fn test_text_simple() { + assert_eq!(UTF_8, inspect("Simple UTF-8 string ☔".as_bytes())); + } + + #[test] + fn test_text_utf8() { + assert_eq!(UTF_8, inspect(include_bytes!("../testdata/text_UTF-8.txt"))); + } + + #[test] + fn test_text_utf8_bom() { + assert_eq!( + UTF_8_BOM, + inspect(include_bytes!("../testdata/text_UTF-8-BOM.txt")) + ); + } + + #[test] + fn test_text_utf16le() { + assert_eq!( + UTF_16LE, + inspect(include_bytes!("../testdata/text_UTF-16LE-BOM.txt")) + ); + } + + #[test] + fn test_text_utf16be() { + assert_eq!( + UTF_16BE, + inspect(include_bytes!("../testdata/text_UTF-16BE-BOM.txt")) + ); + } + + #[test] + fn test_text_utf32le() { + assert_eq!( + UTF_32LE, + inspect(include_bytes!("../testdata/text_UTF-32LE-BOM.txt")) + ); + } + + #[test] + fn test_text_utf32be() { + assert_eq!( + UTF_32BE, + inspect(include_bytes!("../testdata/text_UTF-32BE-BOM.txt")) + ); + } + + #[test] + fn test_png() { + assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.png"))); + } + + #[test] + fn test_jpg() { + assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.jpg"))); + } + + #[test] + fn test_pdf() { + assert_eq!(BINARY, inspect(include_bytes!("../testdata/test.pdf"))); + } + + #[test] + fn test_is_text() { + assert!(UTF_8.is_text()); + assert!(UTF_32LE.is_text()); + } + + #[test] + fn test_is_binary() { + assert!(BINARY.is_binary()); + } +} |