diff options
Diffstat (limited to '')
-rw-r--r-- | src/net/http/sniff.go | 304 |
1 files changed, 304 insertions, 0 deletions
diff --git a/src/net/http/sniff.go b/src/net/http/sniff.go new file mode 100644 index 0000000..ac18ab9 --- /dev/null +++ b/src/net/http/sniff.go @@ -0,0 +1,304 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package http + +import ( + "bytes" + "encoding/binary" +) + +// The algorithm uses at most sniffLen bytes to make its decision. +const sniffLen = 512 + +// DetectContentType implements the algorithm described +// at https://mimesniff.spec.whatwg.org/ to determine the +// Content-Type of the given data. It considers at most the +// first 512 bytes of data. DetectContentType always returns +// a valid MIME type: if it cannot determine a more specific one, it +// returns "application/octet-stream". +func DetectContentType(data []byte) string { + if len(data) > sniffLen { + data = data[:sniffLen] + } + + // Index of the first non-whitespace byte in data. + firstNonWS := 0 + for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ { + } + + for _, sig := range sniffSignatures { + if ct := sig.match(data, firstNonWS); ct != "" { + return ct + } + } + + return "application/octet-stream" // fallback +} + +// isWS reports whether the provided byte is a whitespace byte (0xWS) +// as defined in https://mimesniff.spec.whatwg.org/#terminology. +func isWS(b byte) bool { + switch b { + case '\t', '\n', '\x0c', '\r', ' ': + return true + } + return false +} + +// isTT reports whether the provided byte is a tag-terminating byte (0xTT) +// as defined in https://mimesniff.spec.whatwg.org/#terminology. +func isTT(b byte) bool { + switch b { + case ' ', '>': + return true + } + return false +} + +type sniffSig interface { + // match returns the MIME type of the data, or "" if unknown. + match(data []byte, firstNonWS int) string +} + +// Data matching the table in section 6. +var sniffSignatures = []sniffSig{ + htmlSig("<!DOCTYPE HTML"), + htmlSig("<HTML"), + htmlSig("<HEAD"), + htmlSig("<SCRIPT"), + htmlSig("<IFRAME"), + htmlSig("<H1"), + htmlSig("<DIV"), + htmlSig("<FONT"), + htmlSig("<TABLE"), + htmlSig("<A"), + htmlSig("<STYLE"), + htmlSig("<TITLE"), + htmlSig("<B"), + htmlSig("<BODY"), + htmlSig("<BR"), + htmlSig("<P"), + htmlSig("<!--"), + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\xFF"), + pat: []byte("<?xml"), + skipWS: true, + ct: "text/xml; charset=utf-8"}, + &exactSig{[]byte("%PDF-"), "application/pdf"}, + &exactSig{[]byte("%!PS-Adobe-"), "application/postscript"}, + + // UTF BOMs. + &maskedSig{ + mask: []byte("\xFF\xFF\x00\x00"), + pat: []byte("\xFE\xFF\x00\x00"), + ct: "text/plain; charset=utf-16be", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\x00\x00"), + pat: []byte("\xFF\xFE\x00\x00"), + ct: "text/plain; charset=utf-16le", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\x00"), + pat: []byte("\xEF\xBB\xBF\x00"), + ct: "text/plain; charset=utf-8", + }, + + // Image types + // For posterity, we originally returned "image/vnd.microsoft.icon" from + // https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7 + // https://codereview.appspot.com/4746042 + // but that has since been replaced with "image/x-icon" in Section 6.2 + // of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern + &exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"}, + &exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"}, + &exactSig{[]byte("BM"), "image/bmp"}, + &exactSig{[]byte("GIF87a"), "image/gif"}, + &exactSig{[]byte("GIF89a"), "image/gif"}, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"), + pat: []byte("RIFF\x00\x00\x00\x00WEBPVP"), + ct: "image/webp", + }, + &exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"}, + &exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"}, + + // Audio and Video types + // Enforce the pattern match ordering as prescribed in + // https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"), + pat: []byte("FORM\x00\x00\x00\x00AIFF"), + ct: "audio/aiff", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF"), + pat: []byte("ID3"), + ct: "audio/mpeg", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\xFF"), + pat: []byte("OggS\x00"), + ct: "application/ogg", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"), + pat: []byte("MThd\x00\x00\x00\x06"), + ct: "audio/midi", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"), + pat: []byte("RIFF\x00\x00\x00\x00AVI "), + ct: "video/avi", + }, + &maskedSig{ + mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"), + pat: []byte("RIFF\x00\x00\x00\x00WAVE"), + ct: "audio/wave", + }, + // 6.2.0.2. video/mp4 + mp4Sig{}, + // 6.2.0.3. video/webm + &exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"}, + + // Font types + &maskedSig{ + // 34 NULL bytes followed by the string "LP" + pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"), + // 34 NULL bytes followed by \xF\xF + mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"), + ct: "application/vnd.ms-fontobject", + }, + &exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"}, + &exactSig{[]byte("OTTO"), "font/otf"}, + &exactSig{[]byte("ttcf"), "font/collection"}, + &exactSig{[]byte("wOFF"), "font/woff"}, + &exactSig{[]byte("wOF2"), "font/woff2"}, + + // Archive types + &exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"}, + &exactSig{[]byte("PK\x03\x04"), "application/zip"}, + // RAR's signatures are incorrectly defined by the MIME spec as per + // https://github.com/whatwg/mimesniff/issues/63 + // However, RAR Labs correctly defines it at: + // https://www.rarlab.com/technote.htm#rarsign + // so we use the definition from RAR Labs. + // TODO: do whatever the spec ends up doing. + &exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"}, // RAR v1.5-v4.0 + &exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+ + + &exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"}, + + textSig{}, // should be last +} + +type exactSig struct { + sig []byte + ct string +} + +func (e *exactSig) match(data []byte, firstNonWS int) string { + if bytes.HasPrefix(data, e.sig) { + return e.ct + } + return "" +} + +type maskedSig struct { + mask, pat []byte + skipWS bool + ct string +} + +func (m *maskedSig) match(data []byte, firstNonWS int) string { + // pattern matching algorithm section 6 + // https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm + + if m.skipWS { + data = data[firstNonWS:] + } + if len(m.pat) != len(m.mask) { + return "" + } + if len(data) < len(m.pat) { + return "" + } + for i, pb := range m.pat { + maskedData := data[i] & m.mask[i] + if maskedData != pb { + return "" + } + } + return m.ct +} + +type htmlSig []byte + +func (h htmlSig) match(data []byte, firstNonWS int) string { + data = data[firstNonWS:] + if len(data) < len(h)+1 { + return "" + } + for i, b := range h { + db := data[i] + if 'A' <= b && b <= 'Z' { + db &= 0xDF + } + if b != db { + return "" + } + } + // Next byte must be a tag-terminating byte(0xTT). + if !isTT(data[len(h)]) { + return "" + } + return "text/html; charset=utf-8" +} + +var mp4ftype = []byte("ftyp") +var mp4 = []byte("mp4") + +type mp4Sig struct{} + +func (mp4Sig) match(data []byte, firstNonWS int) string { + // https://mimesniff.spec.whatwg.org/#signature-for-mp4 + // c.f. section 6.2.1 + if len(data) < 12 { + return "" + } + boxSize := int(binary.BigEndian.Uint32(data[:4])) + if len(data) < boxSize || boxSize%4 != 0 { + return "" + } + if !bytes.Equal(data[4:8], mp4ftype) { + return "" + } + for st := 8; st < boxSize; st += 4 { + if st == 12 { + // Ignores the four bytes that correspond to the version number of the "major brand". + continue + } + if bytes.Equal(data[st:st+3], mp4) { + return "video/mp4" + } + } + return "" +} + +type textSig struct{} + +func (textSig) match(data []byte, firstNonWS int) string { + // c.f. section 5, step 4. + for _, b := range data[firstNonWS:] { + switch { + case b <= 0x08, + b == 0x0B, + 0x0E <= b && b <= 0x1A, + 0x1C <= b && b <= 0x1F: + return "" + } + } + return "text/plain; charset=utf-8" +} |