1 files changed, 307 insertions, 0 deletions
diff --git a/src/archive/tar/format.go b/src/archive/tar/format.go
new file mode 100644
index 0000000..e50124d
--- /dev/null
+++ b/src/archive/tar/format.go
@@ -0,0 +1,307 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tar
+
+import "strings"
+
+// Format represents the tar archive format.
+//
+// The original tar format was introduced in Unix V7.
+// Since then, there have been multiple competing formats attempting to
+// standardize or extend the V7 format to overcome its limitations.
+// The most common formats are the USTAR, PAX, and GNU formats,
+// each with its own advantages and limitations.
+//
+// The following table captures the capabilities of each format:
+//
+//	                  |  USTAR |       PAX |       GNU
+//	------------------+--------+-----------+----------
+//	Name              |   256B | unlimited | unlimited
+//	Linkname          |   100B | unlimited | unlimited
+//	Size              | uint33 | unlimited |    uint89
+//	Mode              | uint21 |    uint21 |    uint57
+//	Uid/Gid           | uint21 | unlimited |    uint57
+//	Uname/Gname       |    32B | unlimited |       32B
+//	ModTime           | uint33 | unlimited |     int89
+//	AccessTime        |    n/a | unlimited |     int89
+//	ChangeTime        |    n/a | unlimited |     int89
+//	Devmajor/Devminor | uint21 |    uint21 |    uint57
+//	------------------+--------+-----------+----------
+//	string encoding   |  ASCII |     UTF-8 |    binary
+//	sub-second times  |     no |       yes |        no
+//	sparse files      |     no |       yes |       yes
+//
+// The table's upper portion shows the Header fields, where each format reports
+// the maximum number of bytes allowed for each string field and
+// the integer type used to store each numeric field
+// (where timestamps are stored as the number of seconds since the Unix epoch).
+//
+// The table's lower portion shows specialized features of each format,
+// such as supported string encodings, support for sub-second timestamps,
+// or support for sparse files.
+//
+// The Writer currently provides no support for sparse files.
+type Format int
+
+// Constants to identify various tar formats.
+const (
+	// Deliberately hide the meaning of constants from public API; each format is a single bit.
+	_ Format = (1 << iota) / 4 // Sequence of 0, 0, 1, 2, 4, 8, etc...
+
+	// FormatUnknown indicates that the format is unknown; it has no format bits set.
+	FormatUnknown
+
+	// formatV7 is the format of the original Unix V7 tar tool prior to standardization.
+	formatV7
+
+	// FormatUSTAR represents the USTAR header format defined in POSIX.1-1988.
+	//
+	// While this format is compatible with most tar readers,
+	// the format has several limitations making it unsuitable for some usages.
+	// Most notably, it cannot support sparse files, files larger than 8GiB,
+	// filenames larger than 256 characters, and non-ASCII filenames.
+	//
+	// Reference:
+	//	http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13_06
+	FormatUSTAR
+
+	// FormatPAX represents the PAX header format defined in POSIX.1-2001.
+	//
+	// PAX extends USTAR by writing a special file with Typeflag TypeXHeader
+	// preceding the original header. This file contains a set of key-value
+	// records, which are used to overcome USTAR's shortcomings, in addition to
+	// providing the ability to have sub-second resolution for timestamps.
+	//
+	// Some newer formats add their own extensions to PAX by defining their
+	// own keys and assigning certain semantic meaning to the associated values.
+	// For example, sparse file support in PAX is implemented using keys
+	// defined by the GNU manual (e.g., "GNU.sparse.map").
+	//
+	// Reference:
+	//	http://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
+	FormatPAX
+
+	// FormatGNU represents the GNU header format.
+	//
+	// The GNU header format is older than the USTAR and PAX standards and
+	// is not compatible with them. The GNU format supports
+	// arbitrary file sizes, filenames of arbitrary encoding and length,
+	// sparse files, and other features.
+	//
+	// It is recommended that PAX be chosen over GNU unless the target
+	// application can only parse GNU formatted archives.
+	//
+	// Reference:
+	//	https://www.gnu.org/software/tar/manual/html_node/Standard.html
+	FormatGNU
+
+	// formatSTAR is Schily's tar format, which is incompatible with USTAR.
+	// This does not cover STAR extensions to the PAX format; these fall under
+	// the PAX format.
+	formatSTAR
+
+	formatMax // Not a format: the next bit after formatSTAR, used as an exclusive bound when looping over format bits.
+)
+
+func (f Format) has(f2 Format) bool   { return f2&f != 0 } // reports whether f and f2 share any bit
+func (f *Format) mayBe(f2 Format)     { *f = *f | f2 }     // adds the bits of f2 to f
+func (f *Format) mayOnlyBe(f2 Format) { *f = *f & f2 }     // keeps only the bits of f that are also in f2
+func (f *Format) mustNotBe(f2 Format) { *f = *f &^ f2 }    // clears the bits of f2 from f
+
+// formatNames maps each single-bit Format value to the name that
+// Format.String prints for it.
+var formatNames = map[Format]string{formatV7: "V7", FormatUSTAR: "USTAR", FormatPAX: "PAX", FormatGNU: "GNU", formatSTAR: "STAR"}
+
+// String returns the single format name, "(A | B)" when several bits are set, or "<unknown>" when none is.
+func (f Format) String() string {
+	var names []string
+	for bit := Format(1); bit < formatMax; bit <<= 1 {
+		if f.has(bit) {
+			names = append(names, formatNames[bit])
+		}
+	}
+	if len(names) == 0 {
+		return "<unknown>"
+	}
+	if len(names) == 1 {
+		return names[0]
+	}
+	return "(" + strings.Join(names, " | ") + ")"
+}
+
+// Magics that getFormat matches and setFormat writes to identify each header format.
+const (
+	magicGNU, versionGNU     = "ustar ", " \x00" // GNU magic (6 bytes) and version (2 bytes)
+	magicUSTAR, versionUSTAR = "ustar\x00", "00" // USTAR magic and version; getFormat also sees this magic on PAX and STAR headers
+	trailerSTAR              = "tar\x00"         // Stored in the last 4 bytes of a STAR header block
+)
+
+// Size constants from various tar specifications.
+const (
+	blockSize  = 512 // Size in bytes of each block in a tar stream
+	nameSize   = 100 // Max length in bytes of the name field in USTAR format
+	prefixSize = 155 // Max length in bytes of the prefix field in USTAR format
+
+	// Max length of a special file (PAX header, GNU long name or link).
+	// This matches the limit used by libarchive.
+	maxSpecialFileSize = 1 << 20 // 1048576; NOTE(review): unit is presumably bytes - confirm at the use sites
+)
+
+// blockPadding returns n, 0 <= n < blockSize, the bytes needed to round offset up to a block boundary.
+func blockPadding(offset int64) (n int64) {
+	const mask = blockSize - 1 // blockSize is a power of two, so mask keeps the value modulo blockSize
+	return -offset & mask
+}
+
+var zeroBlock block // declared without an initializer, so it starts as blockSize zero bytes
+
+type block [blockSize]byte // raw bytes of one tar block; the to* methods reinterpret it as a specific header layout
+
+// Reinterpret the block as a format-specific header or sparse array; every view aliases b's bytes.
+func (b *block) toV7() *headerV7       { return (*headerV7)(b) }
+func (b *block) toGNU() *headerGNU     { return (*headerGNU)(b) }
+func (b *block) toSTAR() *headerSTAR   { return (*headerSTAR)(b) }
+func (b *block) toUSTAR() *headerUSTAR { return (*headerUSTAR)(b) }
+func (b *block) toSparse() sparseArray { return sparseArray(b[:]) }
+
+// getFormat validates the header checksum and, when it matches, guesses the
+// header format from the magic, version, and trailer values.
+// FormatUnknown is returned if the checksum cannot be parsed or does not match.
+func (b *block) getFormat() Format {
+	// Accept the stored checksum if it equals either the unsigned or signed sum.
+	var p parser
+	stored := p.parseOctal(b.toV7().chksum())
+	unsigned, signed := b.computeChecksum()
+	matches := stored == unsigned || stored == signed
+	if p.err != nil || !matches {
+		return FormatUnknown
+	}
+
+	// Classify by magic values; headers without a recognized magic count as V7.
+	ustar := b.toUSTAR()
+	magic, version := string(ustar.magic()), string(ustar.version())
+	if magic == magicGNU && version == versionGNU {
+		return FormatGNU
+	}
+	if magic != magicUSTAR {
+		return formatV7
+	}
+	if string(b.toSTAR().trailer()) == trailerSTAR {
+		return formatSTAR
+	}
+	return FormatUSTAR | FormatPAX
+}
+
+// setFormat writes the magic values that identify format into the block
+// and then rewrites the checksum field to match the updated contents.
+func (b *block) setFormat(format Format) {
+	switch {
+	case format.has(formatV7): // Nothing is written for V7.
+	case format.has(FormatGNU):
+		gnu := b.toGNU()
+		copy(gnu.magic(), magicGNU)
+		copy(gnu.version(), versionGNU)
+	case format.has(formatSTAR):
+		star := b.toSTAR()
+		copy(star.magic(), magicUSTAR)
+		copy(star.version(), versionUSTAR)
+		copy(star.trailer(), trailerSTAR)
+	case format.has(FormatUSTAR | FormatPAX):
+		ustar := b.toUSTAR()
+		copy(ustar.magic(), magicUSTAR)
+		copy(ustar.version(), versionUSTAR)
+	default:
+		panic("invalid format")
+	}
+
+	// Checksum last so it covers the bytes written above; the field is terminated by a NULL then space.
+	var f formatter
+	field := b.toV7().chksum()
+	sum, _ := b.computeChecksum() // Possible values are 256..128776
+	f.formatOctal(field[:7], sum) // Never fails since 128776 < 262143
+	field[7] = ' '
+}
+
+// computeChecksum sums every byte of the header block, counting the
+// checksum field itself (bytes 148 through 155) as if it held spaces.
+// POSIX specifies a sum of the unsigned byte values, but the Sun tar used
+// signed byte values, so both sums are returned.
+func (b *block) computeChecksum() (unsigned, signed int64) {
+	for i := 0; i < len(b); i++ {
+		c := b[i]
+		if i >= 148 && i < 156 {
+			c = ' ' // The checksum field counts as all spaces.
+		}
+		unsigned, signed = unsigned+int64(c), signed+int64(int8(c))
+	}
+	return unsigned, signed
+}
+
+// reset overwrites every byte of the block with zero.
+func (b *block) reset() {
+	*b = [blockSize]byte{}
+}
+
+type headerV7 [blockSize]byte // V7 header layout; each accessor returns a slice aliasing that field's bytes
+
+func (h *headerV7) name() []byte     { return h[:100] }
+func (h *headerV7) mode() []byte     { return h[100 : 100+8] }
+func (h *headerV7) uid() []byte      { return h[108 : 108+8] }
+func (h *headerV7) gid() []byte      { return h[116 : 116+8] }
+func (h *headerV7) size() []byte     { return h[124 : 124+12] }
+func (h *headerV7) modTime() []byte  { return h[136 : 136+12] }
+func (h *headerV7) chksum() []byte   { return h[148 : 148+8] }
+func (h *headerV7) typeFlag() []byte { return h[156 : 156+1] }
+func (h *headerV7) linkName() []byte { return h[157 : 157+100] }
+
+type headerGNU [blockSize]byte // GNU header layout; each accessor returns a view aliasing that field's bytes
+
+func (h *headerGNU) v7() *headerV7       { return (*headerV7)(h) }
+func (h *headerGNU) magic() []byte       { return h[257 : 257+6] }
+func (h *headerGNU) version() []byte     { return h[263 : 263+2] }
+func (h *headerGNU) userName() []byte    { return h[265 : 265+32] }
+func (h *headerGNU) groupName() []byte   { return h[297 : 297+32] }
+func (h *headerGNU) devMajor() []byte    { return h[329 : 329+8] }
+func (h *headerGNU) devMinor() []byte    { return h[337 : 337+8] }
+func (h *headerGNU) accessTime() []byte  { return h[345 : 345+12] }
+func (h *headerGNU) changeTime() []byte  { return h[357 : 357+12] }
+func (h *headerGNU) sparse() sparseArray { return sparseArray(h[386 : 386+24*4+1]) }
+func (h *headerGNU) realSize() []byte    { return h[483 : 483+12] }
+
+type headerSTAR [blockSize]byte // STAR header layout; each accessor returns a view aliasing that field's bytes
+
+func (h *headerSTAR) v7() *headerV7      { return (*headerV7)(h) }
+func (h *headerSTAR) magic() []byte      { return h[257 : 257+6] }
+func (h *headerSTAR) version() []byte    { return h[263 : 263+2] }
+func (h *headerSTAR) userName() []byte   { return h[265 : 265+32] }
+func (h *headerSTAR) groupName() []byte  { return h[297 : 297+32] }
+func (h *headerSTAR) devMajor() []byte   { return h[329 : 329+8] }
+func (h *headerSTAR) devMinor() []byte   { return h[337 : 337+8] }
+func (h *headerSTAR) prefix() []byte     { return h[345 : 345+131] }
+func (h *headerSTAR) accessTime() []byte { return h[476 : 476+12] }
+func (h *headerSTAR) changeTime() []byte { return h[488 : 488+12] }
+func (h *headerSTAR) trailer() []byte    { return h[508 : 508+4] }
+
+type headerUSTAR [blockSize]byte // USTAR header layout; each accessor returns a view aliasing that field's bytes
+
+func (h *headerUSTAR) v7() *headerV7     { return (*headerV7)(h) }
+func (h *headerUSTAR) magic() []byte     { return h[257 : 257+6] }
+func (h *headerUSTAR) version() []byte   { return h[263 : 263+2] }
+func (h *headerUSTAR) userName() []byte  { return h[265 : 265+32] }
+func (h *headerUSTAR) groupName() []byte { return h[297 : 297+32] }
+func (h *headerUSTAR) devMajor() []byte  { return h[329 : 329+8] }
+func (h *headerUSTAR) devMinor() []byte  { return h[337 : 337+8] }
+func (h *headerUSTAR) prefix() []byte    { return h[345 : 345+155] }
+
+type sparseArray []byte // consecutive 24-byte sparse entries; the byte after the last whole entry is the isExtended flag
+
+func (s sparseArray) entry(i int) sparseElem { return sparseElem(s[24*i:]) }
+func (s sparseArray) isExtended() []byte     { return s[s.maxEntries()*24:][:1] }
+func (s sparseArray) maxEntries() int        { return len(s) / 24 }
+
+type sparseElem []byte // view starting at a sparse entry: bytes 0-11 are the offset field, bytes 12-23 the length field
+
+func (s sparseElem) offset() []byte { return s[:12] }
+func (s sparseElem) length() []byte { return s[12:][:12] }