summaryrefslogtreecommitdiffstats
path: root/vendor/content_inspector/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/content_inspector/src/lib.rs')
-rw-r--r--vendor/content_inspector/src/lib.rs218
1 files changed, 218 insertions, 0 deletions
diff --git a/vendor/content_inspector/src/lib.rs b/vendor/content_inspector/src/lib.rs
new file mode 100644
index 000000000..de846cce1
--- /dev/null
+++ b/vendor/content_inspector/src/lib.rs
@@ -0,0 +1,218 @@
+//! A simple library for *fast* inspection of binary buffers to guess the type of content.
+//!
+//! This is mainly intended to quickly determine whether a given buffer contains "binary"
+//! or "text" data. Programs like `grep` or `git diff` use similar mechanisms to decide whether
+//! to treat some files as "binary data" or not.
+//!
+//! The analysis is based on a very simple heuristic: Searching for NULL bytes
+//! (indicating "binary" content) and the detection of special [byte order
+//! marks](https://en.wikipedia.org/wiki/Byte_order_mark) (indicating a particular kind of textual
+//! encoding). Note that **this analysis can fail**. For example, even if unlikely, UTF-8-encoded
+//! text can legally contain NULL bytes. Conversely, some particular binary formats (like binary
+//! [PGM](https://en.wikipedia.org/wiki/Netpbm_format)) may not contain NULL bytes. Also, for
+//! performance reasons, only the first 1024 bytes are checked for the NULL-byte (if no BOM was
+//! detected).
+//!
+//! If this library reports a certain type of encoding (say `UTF_16LE`), there is **no guarantee**
+//! that the binary buffer can *actually* be decoded as UTF-16LE.
+//!
+//! # Example
+//! ```
+//! use content_inspector::{ContentType, inspect};
+//!
+//! assert_eq!(ContentType::UTF_8, inspect(b"Hello"));
+//! assert_eq!(ContentType::BINARY, inspect(b"\xFF\xE0\x00\x10\x4A\x46\x49\x46\x00"));
+//!
+//! assert!(inspect(b"Hello").is_text());
+//! ```
+
+extern crate memchr;
+
+use memchr::memchr;
+use std::cmp::min;
+use std::fmt;
+
+const MAX_SCAN_SIZE: usize = 1024; // upper bound on the leading bytes `inspect` searches for a NUL
+
+/// The encoding detected for "text" data, or `BINARY` when the content looks like "binary" data.
+#[allow(non_camel_case_types)] // variants use SCREAMING_SNAKE_CASE names such as `UTF_16LE`
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum ContentType {
+    /// "binary" data
+    BINARY,
+
+    /// UTF-8 encoded "text" data (no byte order mark)
+    UTF_8,
+
+    /// UTF-8 encoded "text" data that starts with a byte order mark
+    UTF_8_BOM,
+
+    /// UTF-16 encoded "text" data (little endian)
+    UTF_16LE,
+
+    /// UTF-16 encoded "text" data (big endian)
+    UTF_16BE,
+
+    /// UTF-32 encoded "text" data (little endian)
+    UTF_32LE,
+
+    /// UTF-32 encoded "text" data (big endian)
+    UTF_32BE,
+}
+
+impl ContentType {
+    /// Returns `true` only for the `BINARY` variant.
+    pub fn is_binary(self) -> bool {
+        ContentType::BINARY == self
+    }
+
+    /// Returns `true` for every variant other than `BINARY`.
+    pub fn is_text(self) -> bool {
+        self != ContentType::BINARY
+    }
+}
+
+impl fmt::Display for ContentType {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // Human-readable label of the variant. The match has no catch-all arm, so adding a
+        // variant without a label is a compile error.
+        let label = match *self {
+            ContentType::BINARY => "binary",
+            ContentType::UTF_8 => "UTF-8",
+            ContentType::UTF_8_BOM => "UTF-8-BOM",
+            ContentType::UTF_16LE => "UTF-16LE",
+            ContentType::UTF_16BE => "UTF-16BE",
+            ContentType::UTF_32LE => "UTF-32LE",
+            ContentType::UTF_32BE => "UTF-32BE",
+        };
+        f.write_str(label)
+    }
+}
+
+/// Byte order marks recognised at the very start of a buffer, each paired with the encoding it
+/// announces (see https://en.wikipedia.org/wiki/Byte_order_mark).
+static BYTE_ORDER_MARKS: &[(&[u8], ContentType)] = &[
+    (b"\xEF\xBB\xBF", ContentType::UTF_8_BOM),
+    // UTF-32 comes first: the UTF-32LE mark starts with the UTF-16LE mark.
+    (b"\x00\x00\xFE\xFF", ContentType::UTF_32BE),
+    (b"\xFF\xFE\x00\x00", ContentType::UTF_32LE),
+    (b"\xFE\xFF", ContentType::UTF_16BE),
+    (b"\xFF\xFE", ContentType::UTF_16LE),
+];
+
+/// Magic numbers of binary file types (PDF, PNG) that could otherwise pass as text.
+static MAGIC_NUMBERS: [&[u8]; 2] = [b"%PDF", b"\x89PNG"];
+
+/// Guess the type of content held in `buffer`. The crate documentation has a usage example and
+/// more details on the heuristic.
+///
+/// A leading byte order mark selects the matching encoding. Otherwise a NUL byte within the first
+/// `MAX_SCAN_SIZE` bytes, or a known magic number at the start, yields `BINARY`; anything else
+/// (including an empty buffer) is reported as `UTF_8`.
+pub fn inspect(buffer: &[u8]) -> ContentType {
+    // A recognised byte order mark decides the result immediately.
+    let detected = BYTE_ORDER_MARKS.iter().find(|&&(bom, _)| buffer.starts_with(bom));
+    if let Some(&(_, content_type)) = detected {
+        return content_type;
+    }
+
+    // Only the leading `MAX_SCAN_SIZE` bytes are searched for a NUL byte.
+    let head = &buffer[..min(buffer.len(), MAX_SCAN_SIZE)];
+    if memchr(0x00, head).is_some() {
+        return ContentType::BINARY;
+    }
+
+    // Without a NUL byte, a known magic number at the start still marks the data as binary.
+    let is_known_binary = MAGIC_NUMBERS.iter().any(|magic| buffer.starts_with(magic));
+    if is_known_binary {
+        ContentType::BINARY
+    } else {
+        ContentType::UTF_8
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    // Fixture files are embedded from `../testdata` at compile time via `include_bytes!`.
+    use {inspect, ContentType::*};
+
+    #[test]
+    fn test_empty_buffer_utf_8() {
+        // `inspect` documents that an empty buffer is reported as `UTF_8`.
+        assert_eq!(UTF_8, inspect(b""));
+    }
+
+    #[test]
+    fn test_text_simple() {
+        let text = "Simple UTF-8 string ☔";
+        assert_eq!(UTF_8, inspect(text.as_bytes()));
+    }
+
+    #[test]
+    fn test_text_utf8() {
+        let data = include_bytes!("../testdata/text_UTF-8.txt");
+        assert_eq!(UTF_8, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf8_bom() {
+        let data = include_bytes!("../testdata/text_UTF-8-BOM.txt");
+        assert_eq!(UTF_8_BOM, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf16le() {
+        let data = include_bytes!("../testdata/text_UTF-16LE-BOM.txt");
+        assert_eq!(UTF_16LE, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf16be() {
+        let data = include_bytes!("../testdata/text_UTF-16BE-BOM.txt");
+        assert_eq!(UTF_16BE, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf32le() {
+        let data = include_bytes!("../testdata/text_UTF-32LE-BOM.txt");
+        assert_eq!(UTF_32LE, inspect(data));
+    }
+
+    #[test]
+    fn test_text_utf32be() {
+        let data = include_bytes!("../testdata/text_UTF-32BE-BOM.txt");
+        assert_eq!(UTF_32BE, inspect(data));
+    }
+
+    // The remaining fixtures are image/document files that must be reported as `BINARY`.
+    #[test]
+    fn test_png() {
+        let data = include_bytes!("../testdata/test.png");
+        assert_eq!(BINARY, inspect(data));
+    }
+
+    #[test]
+    fn test_jpg() {
+        let data = include_bytes!("../testdata/test.jpg");
+        assert_eq!(BINARY, inspect(data));
+    }
+
+    #[test]
+    fn test_pdf() {
+        let data = include_bytes!("../testdata/test.pdf");
+        assert_eq!(BINARY, inspect(data));
+    }
+
+    #[test]
+    fn test_is_text() {
+        // Two representative non-`BINARY` variants are checked.
+        for encoding in &[UTF_8, UTF_32LE] {
+            assert!(encoding.is_text());
+        }
+    }
+
+    #[test]
+    fn test_is_binary() {
+        assert!(BINARY.is_binary());
+    }
+}