summaryrefslogtreecommitdiffstats
path: root/src/compress/bzip2/bzip2.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/compress/bzip2/bzip2.go')
-rw-r--r--src/compress/bzip2/bzip2.go502
1 files changed, 502 insertions, 0 deletions
diff --git a/src/compress/bzip2/bzip2.go b/src/compress/bzip2/bzip2.go
new file mode 100644
index 0000000..8f88e38
--- /dev/null
+++ b/src/compress/bzip2/bzip2.go
@@ -0,0 +1,502 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package bzip2 implements bzip2 decompression.
+package bzip2
+
+import "io"
+
+// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
+// of guessing: https://en.wikipedia.org/wiki/Bzip2
+// The source code to pyflate was useful for debugging:
+// http://www.paul.sladen.org/projects/pyflate
+
+// A StructuralError is returned when the bzip2 data is found to be
+// syntactically invalid.
+type StructuralError string
+
+func (s StructuralError) Error() string {
+ return "bzip2 data invalid: " + string(s)
+}
+
+// A reader decompresses bzip2 compressed data.
+type reader struct {
+ br bitReader
+ fileCRC uint32
+ blockCRC uint32
+ wantBlockCRC uint32
+ setupDone bool // true if we have parsed the bzip2 header.
+ blockSize int // blockSize in bytes, i.e. 900 * 1000.
+ eof bool
+ c [256]uint // the ``C'' array for the inverse BWT.
+ tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
+ tPos uint32 // Index of the next output byte in tt.
+
+ preRLE []uint32 // contains the RLE data still to be processed.
+ preRLEUsed int // number of entries of preRLE used.
+ lastByte int // the last byte value seen.
+ byteRepeats uint // the number of repeats of lastByte seen.
+ repeats uint // the number of copies of lastByte to output.
+}
+
+// NewReader returns an io.Reader which decompresses bzip2 data from r.
+// If r does not also implement [io.ByteReader],
+// the decompressor may read more data than necessary from r.
+func NewReader(r io.Reader) io.Reader {
+ bz2 := new(reader)
+ bz2.br = newBitReader(r)
+ return bz2
+}
+
+const bzip2FileMagic = 0x425a // "BZ"
+const bzip2BlockMagic = 0x314159265359
+const bzip2FinalMagic = 0x177245385090
+
+// setup parses the bzip2 header.
+func (bz2 *reader) setup(needMagic bool) error {
+ br := &bz2.br
+
+ if needMagic {
+ magic := br.ReadBits(16)
+ if magic != bzip2FileMagic {
+ return StructuralError("bad magic value")
+ }
+ }
+
+ t := br.ReadBits(8)
+ if t != 'h' {
+ return StructuralError("non-Huffman entropy encoding")
+ }
+
+ level := br.ReadBits(8)
+ if level < '1' || level > '9' {
+ return StructuralError("invalid compression level")
+ }
+
+ bz2.fileCRC = 0
+ bz2.blockSize = 100 * 1000 * (level - '0')
+ if bz2.blockSize > len(bz2.tt) {
+ bz2.tt = make([]uint32, bz2.blockSize)
+ }
+ return nil
+}
+
+func (bz2 *reader) Read(buf []byte) (n int, err error) {
+ if bz2.eof {
+ return 0, io.EOF
+ }
+
+ if !bz2.setupDone {
+ err = bz2.setup(true)
+ brErr := bz2.br.Err()
+ if brErr != nil {
+ err = brErr
+ }
+ if err != nil {
+ return 0, err
+ }
+ bz2.setupDone = true
+ }
+
+ n, err = bz2.read(buf)
+ brErr := bz2.br.Err()
+ if brErr != nil {
+ err = brErr
+ }
+ return
+}
+
+func (bz2 *reader) readFromBlock(buf []byte) int {
+ // bzip2 is a block based compressor, except that it has a run-length
+ // preprocessing step. The block based nature means that we can
+ // preallocate fixed-size buffers and reuse them. However, the RLE
+ // preprocessing would require allocating huge buffers to store the
+ // maximum expansion. Thus we process blocks all at once, except for
+ // the RLE which we decompress as required.
+ n := 0
+ for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
+ // We have RLE data pending.
+
+ // The run-length encoding works like this:
+ // Any sequence of four equal bytes is followed by a length
+ // byte which contains the number of repeats of that byte to
+ // include. (The number of repeats can be zero.) Because we are
+ // decompressing on-demand our state is kept in the reader
+ // object.
+
+ if bz2.repeats > 0 {
+ buf[n] = byte(bz2.lastByte)
+ n++
+ bz2.repeats--
+ if bz2.repeats == 0 {
+ bz2.lastByte = -1
+ }
+ continue
+ }
+
+ bz2.tPos = bz2.preRLE[bz2.tPos]
+ b := byte(bz2.tPos)
+ bz2.tPos >>= 8
+ bz2.preRLEUsed++
+
+ if bz2.byteRepeats == 3 {
+ bz2.repeats = uint(b)
+ bz2.byteRepeats = 0
+ continue
+ }
+
+ if bz2.lastByte == int(b) {
+ bz2.byteRepeats++
+ } else {
+ bz2.byteRepeats = 0
+ }
+ bz2.lastByte = int(b)
+
+ buf[n] = b
+ n++
+ }
+
+ return n
+}
+
+func (bz2 *reader) read(buf []byte) (int, error) {
+ for {
+ n := bz2.readFromBlock(buf)
+ if n > 0 || len(buf) == 0 {
+ bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
+ return n, nil
+ }
+
+ // End of block. Check CRC.
+ if bz2.blockCRC != bz2.wantBlockCRC {
+ bz2.br.err = StructuralError("block checksum mismatch")
+ return 0, bz2.br.err
+ }
+
+ // Find next block.
+ br := &bz2.br
+ switch br.ReadBits64(48) {
+ default:
+ return 0, StructuralError("bad magic value found")
+
+ case bzip2BlockMagic:
+ // Start of block.
+ err := bz2.readBlock()
+ if err != nil {
+ return 0, err
+ }
+
+ case bzip2FinalMagic:
+ // Check end-of-file CRC.
+ wantFileCRC := uint32(br.ReadBits64(32))
+ if br.err != nil {
+ return 0, br.err
+ }
+ if bz2.fileCRC != wantFileCRC {
+ br.err = StructuralError("file checksum mismatch")
+ return 0, br.err
+ }
+
+ // Skip ahead to byte boundary.
+ // Is there a file concatenated to this one?
+ // It would start with BZ.
+ if br.bits%8 != 0 {
+ br.ReadBits(br.bits % 8)
+ }
+ b, err := br.r.ReadByte()
+ if err == io.EOF {
+ br.err = io.EOF
+ bz2.eof = true
+ return 0, io.EOF
+ }
+ if err != nil {
+ br.err = err
+ return 0, err
+ }
+ z, err := br.r.ReadByte()
+ if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ br.err = err
+ return 0, err
+ }
+ if b != 'B' || z != 'Z' {
+ return 0, StructuralError("bad magic value in continuation file")
+ }
+ if err := bz2.setup(false); err != nil {
+ return 0, err
+ }
+ }
+ }
+}
+
+// readBlock reads a bzip2 block. The magic number should already have been consumed.
+func (bz2 *reader) readBlock() (err error) {
+ br := &bz2.br
+ bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
+ bz2.blockCRC = 0
+ bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
+ randomized := br.ReadBits(1)
+ if randomized != 0 {
+ return StructuralError("deprecated randomized files")
+ }
+ origPtr := uint(br.ReadBits(24))
+
+ // If not every byte value is used in the block (i.e., it's text) then
+ // the symbol set is reduced. The symbols used are stored as a
+ // two-level, 16x16 bitmap.
+ symbolRangeUsedBitmap := br.ReadBits(16)
+ symbolPresent := make([]bool, 256)
+ numSymbols := 0
+ for symRange := uint(0); symRange < 16; symRange++ {
+ if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
+ bits := br.ReadBits(16)
+ for symbol := uint(0); symbol < 16; symbol++ {
+ if bits&(1<<(15-symbol)) != 0 {
+ symbolPresent[16*symRange+symbol] = true
+ numSymbols++
+ }
+ }
+ }
+ }
+
+ if numSymbols == 0 {
+ // There must be an EOF symbol.
+ return StructuralError("no symbols in input")
+ }
+
+ // A block uses between two and six different Huffman trees.
+ numHuffmanTrees := br.ReadBits(3)
+ if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
+ return StructuralError("invalid number of Huffman trees")
+ }
+
+ // The Huffman tree can switch every 50 symbols so there's a list of
+ // tree indexes telling us which tree to use for each 50 symbol block.
+ numSelectors := br.ReadBits(15)
+ treeIndexes := make([]uint8, numSelectors)
+
+ // The tree indexes are move-to-front transformed and stored as unary
+ // numbers.
+ mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
+ for i := range treeIndexes {
+ c := 0
+ for {
+ inc := br.ReadBits(1)
+ if inc == 0 {
+ break
+ }
+ c++
+ }
+ if c >= numHuffmanTrees {
+ return StructuralError("tree index too large")
+ }
+ treeIndexes[i] = mtfTreeDecoder.Decode(c)
+ }
+
+ // The list of symbols for the move-to-front transform is taken from
+ // the previously decoded symbol bitmap.
+ symbols := make([]byte, numSymbols)
+ nextSymbol := 0
+ for i := 0; i < 256; i++ {
+ if symbolPresent[i] {
+ symbols[nextSymbol] = byte(i)
+ nextSymbol++
+ }
+ }
+ mtf := newMTFDecoder(symbols)
+
+ numSymbols += 2 // to account for RUNA and RUNB symbols
+ huffmanTrees := make([]huffmanTree, numHuffmanTrees)
+
+ // Now we decode the arrays of code-lengths for each tree.
+ lengths := make([]uint8, numSymbols)
+ for i := range huffmanTrees {
+ // The code lengths are delta encoded from a 5-bit base value.
+ length := br.ReadBits(5)
+ for j := range lengths {
+ for {
+ if length < 1 || length > 20 {
+ return StructuralError("Huffman length out of range")
+ }
+ if !br.ReadBit() {
+ break
+ }
+ if br.ReadBit() {
+ length--
+ } else {
+ length++
+ }
+ }
+ lengths[j] = uint8(length)
+ }
+ huffmanTrees[i], err = newHuffmanTree(lengths)
+ if err != nil {
+ return err
+ }
+ }
+
+ selectorIndex := 1 // the next tree index to use
+ if len(treeIndexes) == 0 {
+ return StructuralError("no tree selectors given")
+ }
+ if int(treeIndexes[0]) >= len(huffmanTrees) {
+ return StructuralError("tree selector out of range")
+ }
+ currentHuffmanTree := huffmanTrees[treeIndexes[0]]
+ bufIndex := 0 // indexes bz2.buf, the output buffer.
+ // The output of the move-to-front transform is run-length encoded and
+ // we merge the decoding into the Huffman parsing loop. These two
+ // variables accumulate the repeat count. See the Wikipedia page for
+ // details.
+ repeat := 0
+ repeatPower := 0
+
+ // The `C' array (used by the inverse BWT) needs to be zero initialized.
+ for i := range bz2.c {
+ bz2.c[i] = 0
+ }
+
+ decoded := 0 // counts the number of symbols decoded by the current tree.
+ for {
+ if decoded == 50 {
+ if selectorIndex >= numSelectors {
+ return StructuralError("insufficient selector indices for number of symbols")
+ }
+ if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
+ return StructuralError("tree selector out of range")
+ }
+ currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
+ selectorIndex++
+ decoded = 0
+ }
+
+ v := currentHuffmanTree.Decode(br)
+ decoded++
+
+ if v < 2 {
+ // This is either the RUNA or RUNB symbol.
+ if repeat == 0 {
+ repeatPower = 1
+ }
+ repeat += repeatPower << v
+ repeatPower <<= 1
+
+ // This limit of 2 million comes from the bzip2 source
+ // code. It prevents repeat from overflowing.
+ if repeat > 2*1024*1024 {
+ return StructuralError("repeat count too large")
+ }
+ continue
+ }
+
+ if repeat > 0 {
+ // We have decoded a complete run-length so we need to
+ // replicate the last output symbol.
+ if repeat > bz2.blockSize-bufIndex {
+ return StructuralError("repeats past end of block")
+ }
+ for i := 0; i < repeat; i++ {
+ b := mtf.First()
+ bz2.tt[bufIndex] = uint32(b)
+ bz2.c[b]++
+ bufIndex++
+ }
+ repeat = 0
+ }
+
+ if int(v) == numSymbols-1 {
+ // This is the EOF symbol. Because it's always at the
+ // end of the move-to-front list, and never gets moved
+ // to the front, it has this unique value.
+ break
+ }
+
+ // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
+ // one would expect |v-2| to be passed to the MTF decoder.
+ // However, the front of the MTF list is never referenced as 0,
+ // it's always referenced with a run-length of 1. Thus 0
+ // doesn't need to be encoded and we have |v-1| in the next
+ // line.
+ b := mtf.Decode(int(v - 1))
+ if bufIndex >= bz2.blockSize {
+ return StructuralError("data exceeds block size")
+ }
+ bz2.tt[bufIndex] = uint32(b)
+ bz2.c[b]++
+ bufIndex++
+ }
+
+ if origPtr >= uint(bufIndex) {
+ return StructuralError("origPtr out of bounds")
+ }
+
+ // We have completed the entropy decoding. Now we can perform the
+ // inverse BWT and setup the RLE buffer.
+ bz2.preRLE = bz2.tt[:bufIndex]
+ bz2.preRLEUsed = 0
+ bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
+ bz2.lastByte = -1
+ bz2.byteRepeats = 0
+ bz2.repeats = 0
+
+ return nil
+}
+
+// inverseBWT implements the inverse Burrows-Wheeler transform as described in
+// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
+// In that document, origPtr is called “I” and c is the “C” array after the
+// first pass over the data. It's an argument here because we merge the first
+// pass with the Huffman decoding.
+//
+// This also implements the “single array” method from the bzip2 source code
+// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
+// index of the next byte in the top 24-bits. The index of the first byte is
+// returned.
+func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
+ sum := uint(0)
+ for i := 0; i < 256; i++ {
+ sum += c[i]
+ c[i] = sum - c[i]
+ }
+
+ for i := range tt {
+ b := tt[i] & 0xff
+ tt[c[b]] |= uint32(i) << 8
+ c[b]++
+ }
+
+ return tt[origPtr] >> 8
+}
+
+// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
+// causing the bits in the input to be processed in the reverse of the usual order.
+
+var crctab [256]uint32
+
+func init() {
+ const poly = 0x04C11DB7
+ for i := range crctab {
+ crc := uint32(i) << 24
+ for j := 0; j < 8; j++ {
+ if crc&0x80000000 != 0 {
+ crc = (crc << 1) ^ poly
+ } else {
+ crc <<= 1
+ }
+ }
+ crctab[i] = crc
+ }
+}
+
+// updateCRC updates the crc value to incorporate the data in b.
+// The initial value is 0.
+func updateCRC(val uint32, b []byte) uint32 {
+ crc := ^val
+ for _, v := range b {
+ crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
+ }
+ return ^crc
+}