diff options
Diffstat (limited to 'src/compress/bzip2')
-rw-r--r-- | src/compress/bzip2/bit_reader.go | 82 | ||||
-rw-r--r-- | src/compress/bzip2/bzip2.go | 502 | ||||
-rw-r--r-- | src/compress/bzip2/bzip2_test.go | 240 | ||||
-rw-r--r-- | src/compress/bzip2/huffman.go | 237 | ||||
-rw-r--r-- | src/compress/bzip2/move_to_front.go | 53 | ||||
-rw-r--r-- | src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 | bin | 0 -> 132469 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/e.txt.bz2 | bin | 0 -> 43149 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/fail-issue5747.bz2 | bin | 0 -> 7232 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/pass-random1.bin | bin | 0 -> 1024 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/pass-random1.bz2 | bin | 0 -> 1309 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/pass-random2.bin | 1 | ||||
-rw-r--r-- | src/compress/bzip2/testdata/pass-random2.bz2 | bin | 0 -> 125 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/pass-sawtooth.bz2 | bin | 0 -> 2017 bytes | |||
-rw-r--r-- | src/compress/bzip2/testdata/random.data.bz2 | bin | 0 -> 16846 bytes |
14 files changed, 1115 insertions, 0 deletions
diff --git a/src/compress/bzip2/bit_reader.go b/src/compress/bzip2/bit_reader.go new file mode 100644 index 0000000..ab1d606 --- /dev/null +++ b/src/compress/bzip2/bit_reader.go @@ -0,0 +1,82 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bzip2 + +import ( + "bufio" + "io" +) + +// bitReader wraps an io.Reader and provides the ability to read values, +// bit-by-bit, from it. Its Read* methods don't return the usual error +// because the error handling was verbose. Instead, any error is kept and can +// be checked afterwards. +type bitReader struct { + r io.ByteReader + n uint64 + bits uint + err error +} + +// newBitReader returns a new bitReader reading from r. If r is not +// already an io.ByteReader, it will be converted via a bufio.Reader. +func newBitReader(r io.Reader) bitReader { + byter, ok := r.(io.ByteReader) + if !ok { + byter = bufio.NewReader(r) + } + return bitReader{r: byter} +} + +// ReadBits64 reads the given number of bits and returns them in the +// least-significant part of a uint64. In the event of an error, it returns 0 +// and the error can be obtained by calling Err(). +func (br *bitReader) ReadBits64(bits uint) (n uint64) { + for bits > br.bits { + b, err := br.r.ReadByte() + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + if err != nil { + br.err = err + return 0 + } + br.n <<= 8 + br.n |= uint64(b) + br.bits += 8 + } + + // br.n looks like this (assuming that br.bits = 14 and bits = 6): + // Bit: 111111 + // 5432109876543210 + // + // (6 bits, the desired output) + // |-----| + // V V + // 0101101101001110 + // ^ ^ + // |------------| + // br.bits (num valid bits) + // + // This the next line right shifts the desired bits into the + // least-significant places and masks off anything above. + n = (br.n >> (br.bits - bits)) & ((1 << bits) - 1) + br.bits -= bits + return +} + +func (br *bitReader) ReadBits(bits uint) (n int) { + n64 := br.ReadBits64(bits) + return int(n64) +} + +func (br *bitReader) ReadBit() bool { + n := br.ReadBits(1) + return n != 0 +} + +func (br *bitReader) Err() error { + return br.err +} diff --git a/src/compress/bzip2/bzip2.go b/src/compress/bzip2/bzip2.go new file mode 100644 index 0000000..51054cc --- /dev/null +++ b/src/compress/bzip2/bzip2.go @@ -0,0 +1,502 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package bzip2 implements bzip2 decompression. +package bzip2 + +import "io" + +// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot +// of guessing: https://en.wikipedia.org/wiki/Bzip2 +// The source code to pyflate was useful for debugging: +// http://www.paul.sladen.org/projects/pyflate + +// A StructuralError is returned when the bzip2 data is found to be +// syntactically invalid. +type StructuralError string + +func (s StructuralError) Error() string { + return "bzip2 data invalid: " + string(s) +} + +// A reader decompresses bzip2 compressed data. +type reader struct { + br bitReader + fileCRC uint32 + blockCRC uint32 + wantBlockCRC uint32 + setupDone bool // true if we have parsed the bzip2 header. + blockSize int // blockSize in bytes, i.e. 900 * 1000. + eof bool + c [256]uint // the ``C'' array for the inverse BWT. + tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. + tPos uint32 // Index of the next output byte in tt. + + preRLE []uint32 // contains the RLE data still to be processed. + preRLEUsed int // number of entries of preRLE used. + lastByte int // the last byte value seen. + byteRepeats uint // the number of repeats of lastByte seen. + repeats uint // the number of copies of lastByte to output. +} + +// NewReader returns an io.Reader which decompresses bzip2 data from r. +// If r does not also implement io.ByteReader, +// the decompressor may read more data than necessary from r. +func NewReader(r io.Reader) io.Reader { + bz2 := new(reader) + bz2.br = newBitReader(r) + return bz2 +} + +const bzip2FileMagic = 0x425a // "BZ" +const bzip2BlockMagic = 0x314159265359 +const bzip2FinalMagic = 0x177245385090 + +// setup parses the bzip2 header. +func (bz2 *reader) setup(needMagic bool) error { + br := &bz2.br + + if needMagic { + magic := br.ReadBits(16) + if magic != bzip2FileMagic { + return StructuralError("bad magic value") + } + } + + t := br.ReadBits(8) + if t != 'h' { + return StructuralError("non-Huffman entropy encoding") + } + + level := br.ReadBits(8) + if level < '1' || level > '9' { + return StructuralError("invalid compression level") + } + + bz2.fileCRC = 0 + bz2.blockSize = 100 * 1000 * (level - '0') + if bz2.blockSize > len(bz2.tt) { + bz2.tt = make([]uint32, bz2.blockSize) + } + return nil +} + +func (bz2 *reader) Read(buf []byte) (n int, err error) { + if bz2.eof { + return 0, io.EOF + } + + if !bz2.setupDone { + err = bz2.setup(true) + brErr := bz2.br.Err() + if brErr != nil { + err = brErr + } + if err != nil { + return 0, err + } + bz2.setupDone = true + } + + n, err = bz2.read(buf) + brErr := bz2.br.Err() + if brErr != nil { + err = brErr + } + return +} + +func (bz2 *reader) readFromBlock(buf []byte) int { + // bzip2 is a block based compressor, except that it has a run-length + // preprocessing step. The block based nature means that we can + // preallocate fixed-size buffers and reuse them. However, the RLE + // preprocessing would require allocating huge buffers to store the + // maximum expansion. Thus we process blocks all at once, except for + // the RLE which we decompress as required. + n := 0 + for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { + // We have RLE data pending. + + // The run-length encoding works like this: + // Any sequence of four equal bytes is followed by a length + // byte which contains the number of repeats of that byte to + // include. (The number of repeats can be zero.) Because we are + // decompressing on-demand our state is kept in the reader + // object. + + if bz2.repeats > 0 { + buf[n] = byte(bz2.lastByte) + n++ + bz2.repeats-- + if bz2.repeats == 0 { + bz2.lastByte = -1 + } + continue + } + + bz2.tPos = bz2.preRLE[bz2.tPos] + b := byte(bz2.tPos) + bz2.tPos >>= 8 + bz2.preRLEUsed++ + + if bz2.byteRepeats == 3 { + bz2.repeats = uint(b) + bz2.byteRepeats = 0 + continue + } + + if bz2.lastByte == int(b) { + bz2.byteRepeats++ + } else { + bz2.byteRepeats = 0 + } + bz2.lastByte = int(b) + + buf[n] = b + n++ + } + + return n +} + +func (bz2 *reader) read(buf []byte) (int, error) { + for { + n := bz2.readFromBlock(buf) + if n > 0 || len(buf) == 0 { + bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) + return n, nil + } + + // End of block. Check CRC. + if bz2.blockCRC != bz2.wantBlockCRC { + bz2.br.err = StructuralError("block checksum mismatch") + return 0, bz2.br.err + } + + // Find next block. + br := &bz2.br + switch br.ReadBits64(48) { + default: + return 0, StructuralError("bad magic value found") + + case bzip2BlockMagic: + // Start of block. + err := bz2.readBlock() + if err != nil { + return 0, err + } + + case bzip2FinalMagic: + // Check end-of-file CRC. + wantFileCRC := uint32(br.ReadBits64(32)) + if br.err != nil { + return 0, br.err + } + if bz2.fileCRC != wantFileCRC { + br.err = StructuralError("file checksum mismatch") + return 0, br.err + } + + // Skip ahead to byte boundary. + // Is there a file concatenated to this one? + // It would start with BZ. + if br.bits%8 != 0 { + br.ReadBits(br.bits % 8) + } + b, err := br.r.ReadByte() + if err == io.EOF { + br.err = io.EOF + bz2.eof = true + return 0, io.EOF + } + if err != nil { + br.err = err + return 0, err + } + z, err := br.r.ReadByte() + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + br.err = err + return 0, err + } + if b != 'B' || z != 'Z' { + return 0, StructuralError("bad magic value in continuation file") + } + if err := bz2.setup(false); err != nil { + return 0, err + } + } + } +} + +// readBlock reads a bzip2 block. The magic number should already have been consumed. +func (bz2 *reader) readBlock() (err error) { + br := &bz2.br + bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. + bz2.blockCRC = 0 + bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC + randomized := br.ReadBits(1) + if randomized != 0 { + return StructuralError("deprecated randomized files") + } + origPtr := uint(br.ReadBits(24)) + + // If not every byte value is used in the block (i.e., it's text) then + // the symbol set is reduced. The symbols used are stored as a + // two-level, 16x16 bitmap. + symbolRangeUsedBitmap := br.ReadBits(16) + symbolPresent := make([]bool, 256) + numSymbols := 0 + for symRange := uint(0); symRange < 16; symRange++ { + if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { + bits := br.ReadBits(16) + for symbol := uint(0); symbol < 16; symbol++ { + if bits&(1<<(15-symbol)) != 0 { + symbolPresent[16*symRange+symbol] = true + numSymbols++ + } + } + } + } + + if numSymbols == 0 { + // There must be an EOF symbol. + return StructuralError("no symbols in input") + } + + // A block uses between two and six different Huffman trees. + numHuffmanTrees := br.ReadBits(3) + if numHuffmanTrees < 2 || numHuffmanTrees > 6 { + return StructuralError("invalid number of Huffman trees") + } + + // The Huffman tree can switch every 50 symbols so there's a list of + // tree indexes telling us which tree to use for each 50 symbol block. + numSelectors := br.ReadBits(15) + treeIndexes := make([]uint8, numSelectors) + + // The tree indexes are move-to-front transformed and stored as unary + // numbers. + mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) + for i := range treeIndexes { + c := 0 + for { + inc := br.ReadBits(1) + if inc == 0 { + break + } + c++ + } + if c >= numHuffmanTrees { + return StructuralError("tree index too large") + } + treeIndexes[i] = mtfTreeDecoder.Decode(c) + } + + // The list of symbols for the move-to-front transform is taken from + // the previously decoded symbol bitmap. + symbols := make([]byte, numSymbols) + nextSymbol := 0 + for i := 0; i < 256; i++ { + if symbolPresent[i] { + symbols[nextSymbol] = byte(i) + nextSymbol++ + } + } + mtf := newMTFDecoder(symbols) + + numSymbols += 2 // to account for RUNA and RUNB symbols + huffmanTrees := make([]huffmanTree, numHuffmanTrees) + + // Now we decode the arrays of code-lengths for each tree. + lengths := make([]uint8, numSymbols) + for i := range huffmanTrees { + // The code lengths are delta encoded from a 5-bit base value. + length := br.ReadBits(5) + for j := range lengths { + for { + if length < 1 || length > 20 { + return StructuralError("Huffman length out of range") + } + if !br.ReadBit() { + break + } + if br.ReadBit() { + length-- + } else { + length++ + } + } + lengths[j] = uint8(length) + } + huffmanTrees[i], err = newHuffmanTree(lengths) + if err != nil { + return err + } + } + + selectorIndex := 1 // the next tree index to use + if len(treeIndexes) == 0 { + return StructuralError("no tree selectors given") + } + if int(treeIndexes[0]) >= len(huffmanTrees) { + return StructuralError("tree selector out of range") + } + currentHuffmanTree := huffmanTrees[treeIndexes[0]] + bufIndex := 0 // indexes bz2.buf, the output buffer. + // The output of the move-to-front transform is run-length encoded and + // we merge the decoding into the Huffman parsing loop. These two + // variables accumulate the repeat count. See the Wikipedia page for + // details. + repeat := 0 + repeatPower := 0 + + // The `C' array (used by the inverse BWT) needs to be zero initialized. + for i := range bz2.c { + bz2.c[i] = 0 + } + + decoded := 0 // counts the number of symbols decoded by the current tree. + for { + if decoded == 50 { + if selectorIndex >= numSelectors { + return StructuralError("insufficient selector indices for number of symbols") + } + if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { + return StructuralError("tree selector out of range") + } + currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] + selectorIndex++ + decoded = 0 + } + + v := currentHuffmanTree.Decode(br) + decoded++ + + if v < 2 { + // This is either the RUNA or RUNB symbol. + if repeat == 0 { + repeatPower = 1 + } + repeat += repeatPower << v + repeatPower <<= 1 + + // This limit of 2 million comes from the bzip2 source + // code. It prevents repeat from overflowing. + if repeat > 2*1024*1024 { + return StructuralError("repeat count too large") + } + continue + } + + if repeat > 0 { + // We have decoded a complete run-length so we need to + // replicate the last output symbol. + if repeat > bz2.blockSize-bufIndex { + return StructuralError("repeats past end of block") + } + for i := 0; i < repeat; i++ { + b := mtf.First() + bz2.tt[bufIndex] = uint32(b) + bz2.c[b]++ + bufIndex++ + } + repeat = 0 + } + + if int(v) == numSymbols-1 { + // This is the EOF symbol. Because it's always at the + // end of the move-to-front list, and never gets moved + // to the front, it has this unique value. + break + } + + // Since two metasymbols (RUNA and RUNB) have values 0 and 1, + // one would expect |v-2| to be passed to the MTF decoder. + // However, the front of the MTF list is never referenced as 0, + // it's always referenced with a run-length of 1. Thus 0 + // doesn't need to be encoded and we have |v-1| in the next + // line. + b := mtf.Decode(int(v - 1)) + if bufIndex >= bz2.blockSize { + return StructuralError("data exceeds block size") + } + bz2.tt[bufIndex] = uint32(b) + bz2.c[b]++ + bufIndex++ + } + + if origPtr >= uint(bufIndex) { + return StructuralError("origPtr out of bounds") + } + + // We have completed the entropy decoding. Now we can perform the + // inverse BWT and setup the RLE buffer. + bz2.preRLE = bz2.tt[:bufIndex] + bz2.preRLEUsed = 0 + bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) + bz2.lastByte = -1 + bz2.byteRepeats = 0 + bz2.repeats = 0 + + return nil +} + +// inverseBWT implements the inverse Burrows-Wheeler transform as described in +// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. +// In that document, origPtr is called “I” and c is the “C” array after the +// first pass over the data. It's an argument here because we merge the first +// pass with the Huffman decoding. +// +// This also implements the “single array” method from the bzip2 source code +// which leaves the output, still shuffled, in the bottom 8 bits of tt with the +// index of the next byte in the top 24-bits. The index of the first byte is +// returned. +func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { + sum := uint(0) + for i := 0; i < 256; i++ { + sum += c[i] + c[i] = sum - c[i] + } + + for i := range tt { + b := tt[i] & 0xff + tt[c[b]] |= uint32(i) << 8 + c[b]++ + } + + return tt[origPtr] >> 8 +} + +// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, +// causing the bits in the input to be processed in the reverse of the usual order. + +var crctab [256]uint32 + +func init() { + const poly = 0x04C11DB7 + for i := range crctab { + crc := uint32(i) << 24 + for j := 0; j < 8; j++ { + if crc&0x80000000 != 0 { + crc = (crc << 1) ^ poly + } else { + crc <<= 1 + } + } + crctab[i] = crc + } +} + +// updateCRC updates the crc value to incorporate the data in b. +// The initial value is 0. +func updateCRC(val uint32, b []byte) uint32 { + crc := ^val + for _, v := range b { + crc = crctab[byte(crc>>24)^v] ^ (crc << 8) + } + return ^crc +} diff --git a/src/compress/bzip2/bzip2_test.go b/src/compress/bzip2/bzip2_test.go new file mode 100644 index 0000000..e6065cb --- /dev/null +++ b/src/compress/bzip2/bzip2_test.go @@ -0,0 +1,240 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bzip2 + +import ( + "bytes" + "encoding/hex" + "fmt" + "io" + "os" + "testing" +) + +func mustDecodeHex(s string) []byte { + b, err := hex.DecodeString(s) + if err != nil { + panic(err) + } + return b +} + +func mustLoadFile(f string) []byte { + b, err := os.ReadFile(f) + if err != nil { + panic(err) + } + return b +} + +func trim(b []byte) string { + const limit = 1024 + if len(b) < limit { + return fmt.Sprintf("%q", b) + } + return fmt.Sprintf("%q...", b[:limit]) +} + +func TestReader(t *testing.T) { + var vectors = []struct { + desc string + input []byte + output []byte + fail bool + }{{ + desc: "hello world", + input: mustDecodeHex("" + + "425a68393141592653594eece83600000251800010400006449080200031064c" + + "4101a7a9a580bb9431f8bb9229c28482776741b0", + ), + output: []byte("hello world\n"), + }, { + desc: "concatenated files", + input: mustDecodeHex("" + + "425a68393141592653594eece83600000251800010400006449080200031064c" + + "4101a7a9a580bb9431f8bb9229c28482776741b0425a68393141592653594eec" + + "e83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb92" + + "29c28482776741b0", + ), + output: []byte("hello world\nhello world\n"), + }, { + desc: "32B zeros", + input: mustDecodeHex("" + + "425a6839314159265359b5aa5098000000600040000004200021008283177245" + + "385090b5aa5098", + ), + output: make([]byte, 32), + }, { + desc: "1MiB zeros", + input: mustDecodeHex("" + + "425a683931415926535938571ce50008084000c0040008200030cc0529a60806" + + "c4201e2ee48a70a12070ae39ca", + ), + output: make([]byte, 1<<20), + }, { + desc: "random data", + input: mustLoadFile("testdata/pass-random1.bz2"), + output: mustLoadFile("testdata/pass-random1.bin"), + }, { + desc: "random data - full symbol range", + input: mustLoadFile("testdata/pass-random2.bz2"), + output: mustLoadFile("testdata/pass-random2.bin"), + }, { + desc: "random data - uses RLE1 stage", + input: mustDecodeHex("" + + "425a6839314159265359d992d0f60000137dfe84020310091c1e280e100e0428" + + "01099210094806c0110002e70806402000546034000034000000f28300000320" + + "00d3403264049270eb7a9280d308ca06ad28f6981bee1bf8160727c7364510d7" + + "3a1e123083421b63f031f63993a0f40051fbf177245385090d992d0f60", + ), + output: mustDecodeHex("" + + "92d5652616ac444a4a04af1a8a3964aca0450d43d6cf233bd03233f4ba92f871" + + "9e6c2a2bd4f5f88db07ecd0da3a33b263483db9b2c158786ad6363be35d17335" + + "ba", + ), + }, { + desc: "1MiB sawtooth", + input: mustLoadFile("testdata/pass-sawtooth.bz2"), + output: func() []byte { + b := make([]byte, 1<<20) + for i := range b { + b[i] = byte(i) + } + return b + }(), + }, { + desc: "RLE2 buffer overrun - issue 5747", + input: mustLoadFile("testdata/fail-issue5747.bz2"), + fail: true, + }, { + desc: "out-of-range selector - issue 8363", + input: mustDecodeHex("" + + "425a68393141592653594eece83600000251800010400006449080200031064c" + + "4101a7a9a580bb943117724538509000000000", + ), + fail: true, + }, { + desc: "bad block size - issue 13941", + input: mustDecodeHex("" + + "425a683131415926535936dc55330063ffc0006000200020a40830008b0008b8" + + "bb9229c28481b6e2a998", + ), + fail: true, + }, { + desc: "bad huffman delta", + input: mustDecodeHex("" + + "425a6836314159265359b1f7404b000000400040002000217d184682ee48a70a" + + "12163ee80960", + ), + fail: true, + }} + + for i, v := range vectors { + rd := NewReader(bytes.NewReader(v.input)) + buf, err := io.ReadAll(rd) + + if fail := bool(err != nil); fail != v.fail { + if fail { + t.Errorf("test %d (%s), unexpected failure: %v", i, v.desc, err) + } else { + t.Errorf("test %d (%s), unexpected success", i, v.desc) + } + } + if !v.fail && !bytes.Equal(buf, v.output) { + t.Errorf("test %d (%s), output mismatch:\ngot %s\nwant %s", i, v.desc, trim(buf), trim(v.output)) + } + } +} + +func TestBitReader(t *testing.T) { + var vectors = []struct { + nbits uint // Number of bits to read + value int // Expected output value (0 for error) + fail bool // Expected operation failure? + }{ + {nbits: 1, value: 1}, + {nbits: 1, value: 0}, + {nbits: 1, value: 1}, + {nbits: 5, value: 11}, + {nbits: 32, value: 0x12345678}, + {nbits: 15, value: 14495}, + {nbits: 3, value: 6}, + {nbits: 6, value: 13}, + {nbits: 1, fail: true}, + } + + rd := bytes.NewReader([]byte{0xab, 0x12, 0x34, 0x56, 0x78, 0x71, 0x3f, 0x8d}) + br := newBitReader(rd) + for i, v := range vectors { + val := br.ReadBits(v.nbits) + if fail := bool(br.err != nil); fail != v.fail { + if fail { + t.Errorf("test %d, unexpected failure: ReadBits(%d) = %v", i, v.nbits, br.err) + } else { + t.Errorf("test %d, unexpected success: ReadBits(%d) = nil", i, v.nbits) + } + } + if !v.fail && val != v.value { + t.Errorf("test %d, mismatching value: ReadBits(%d) = %d, want %d", i, v.nbits, val, v.value) + } + } +} + +func TestMTF(t *testing.T) { + var vectors = []struct { + idx int // Input index + sym uint8 // Expected output symbol + }{ + {idx: 1, sym: 1}, // [1 0 2 3 4] + {idx: 0, sym: 1}, // [1 0 2 3 4] + {idx: 1, sym: 0}, // [0 1 2 3 4] + {idx: 4, sym: 4}, // [4 0 1 2 3] + {idx: 1, sym: 0}, // [0 4 1 2 3] + } + + mtf := newMTFDecoderWithRange(5) + for i, v := range vectors { + sym := mtf.Decode(v.idx) + t.Log(mtf) + if sym != v.sym { + t.Errorf("test %d, symbol mismatch: Decode(%d) = %d, want %d", i, v.idx, sym, v.sym) + } + } +} + +func TestZeroRead(t *testing.T) { + b := mustDecodeHex("425a6839314159265359b5aa5098000000600040000004200021008283177245385090b5aa5098") + r := NewReader(bytes.NewReader(b)) + if n, err := r.Read(nil); n != 0 || err != nil { + t.Errorf("Read(nil) = (%d, %v), want (0, nil)", n, err) + } +} + +var ( + digits = mustLoadFile("testdata/e.txt.bz2") + newton = mustLoadFile("testdata/Isaac.Newton-Opticks.txt.bz2") + random = mustLoadFile("testdata/random.data.bz2") +) + +func benchmarkDecode(b *testing.B, compressed []byte) { + // Determine the uncompressed size of testfile. + uncompressedSize, err := io.Copy(io.Discard, NewReader(bytes.NewReader(compressed))) + if err != nil { + b.Fatal(err) + } + + b.SetBytes(uncompressedSize) + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + r := bytes.NewReader(compressed) + io.Copy(io.Discard, NewReader(r)) + } +} + +func BenchmarkDecodeDigits(b *testing.B) { benchmarkDecode(b, digits) } +func BenchmarkDecodeNewton(b *testing.B) { benchmarkDecode(b, newton) } +func BenchmarkDecodeRand(b *testing.B) { benchmarkDecode(b, random) } diff --git a/src/compress/bzip2/huffman.go b/src/compress/bzip2/huffman.go new file mode 100644 index 0000000..447fc4d --- /dev/null +++ b/src/compress/bzip2/huffman.go @@ -0,0 +1,237 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bzip2 + +import "sort" + +// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a +// symbol. +type huffmanTree struct { + // nodes contains all the non-leaf nodes in the tree. nodes[0] is the + // root of the tree and nextNode contains the index of the next element + // of nodes to use when the tree is being constructed. + nodes []huffmanNode + nextNode int +} + +// A huffmanNode is a node in the tree. left and right contain indexes into the +// nodes slice of the tree. If left or right is invalidNodeValue then the child +// is a left node and its value is in leftValue/rightValue. +// +// The symbols are uint16s because bzip2 encodes not only MTF indexes in the +// tree, but also two magic values for run-length encoding and an EOF symbol. +// Thus there are more than 256 possible symbols. +type huffmanNode struct { + left, right uint16 + leftValue, rightValue uint16 +} + +// invalidNodeValue is an invalid index which marks a leaf node in the tree. +const invalidNodeValue = 0xffff + +// Decode reads bits from the given bitReader and navigates the tree until a +// symbol is found. +func (t *huffmanTree) Decode(br *bitReader) (v uint16) { + nodeIndex := uint16(0) // node 0 is the root of the tree. + + for { + node := &t.nodes[nodeIndex] + + var bit uint16 + if br.bits > 0 { + // Get next bit - fast path. + br.bits-- + bit = uint16(br.n>>(br.bits&63)) & 1 + } else { + // Get next bit - slow path. + // Use ReadBits to retrieve a single bit + // from the underling io.ByteReader. + bit = uint16(br.ReadBits(1)) + } + + // Trick a compiler into generating conditional move instead of branch, + // by making both loads unconditional. + l, r := node.left, node.right + + if bit == 1 { + nodeIndex = l + } else { + nodeIndex = r + } + + if nodeIndex == invalidNodeValue { + // We found a leaf. Use the value of bit to decide + // whether is a left or a right value. + l, r := node.leftValue, node.rightValue + if bit == 1 { + v = l + } else { + v = r + } + return + } + } +} + +// newHuffmanTree builds a Huffman tree from a slice containing the code +// lengths of each symbol. The maximum code length is 32 bits. +func newHuffmanTree(lengths []uint8) (huffmanTree, error) { + // There are many possible trees that assign the same code length to + // each symbol (consider reflecting a tree down the middle, for + // example). Since the code length assignments determine the + // efficiency of the tree, each of these trees is equally good. In + // order to minimize the amount of information needed to build a tree + // bzip2 uses a canonical tree so that it can be reconstructed given + // only the code length assignments. + + if len(lengths) < 2 { + panic("newHuffmanTree: too few symbols") + } + + var t huffmanTree + + // First we sort the code length assignments by ascending code length, + // using the symbol value to break ties. + pairs := make([]huffmanSymbolLengthPair, len(lengths)) + for i, length := range lengths { + pairs[i].value = uint16(i) + pairs[i].length = length + } + + sort.Slice(pairs, func(i, j int) bool { + if pairs[i].length < pairs[j].length { + return true + } + if pairs[i].length > pairs[j].length { + return false + } + if pairs[i].value < pairs[j].value { + return true + } + return false + }) + + // Now we assign codes to the symbols, starting with the longest code. + // We keep the codes packed into a uint32, at the most-significant end. + // So branches are taken from the MSB downwards. This makes it easy to + // sort them later. + code := uint32(0) + length := uint8(32) + + codes := make([]huffmanCode, len(lengths)) + for i := len(pairs) - 1; i >= 0; i-- { + if length > pairs[i].length { + length = pairs[i].length + } + codes[i].code = code + codes[i].codeLen = length + codes[i].value = pairs[i].value + // We need to 'increment' the code, which means treating |code| + // like a |length| bit number. + code += 1 << (32 - length) + } + + // Now we can sort by the code so that the left half of each branch are + // grouped together, recursively. + sort.Slice(codes, func(i, j int) bool { + return codes[i].code < codes[j].code + }) + + t.nodes = make([]huffmanNode, len(codes)) + _, err := buildHuffmanNode(&t, codes, 0) + return t, err +} + +// huffmanSymbolLengthPair contains a symbol and its code length. +type huffmanSymbolLengthPair struct { + value uint16 + length uint8 +} + +// huffmanCode contains a symbol, its code and code length. +type huffmanCode struct { + code uint32 + codeLen uint8 + value uint16 +} + +// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in +// the Huffman tree at the given level. It returns the index of the newly +// constructed node. +func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) { + test := uint32(1) << (31 - level) + + // We have to search the list of codes to find the divide between the left and right sides. + firstRightIndex := len(codes) + for i, code := range codes { + if code.code&test != 0 { + firstRightIndex = i + break + } + } + + left := codes[:firstRightIndex] + right := codes[firstRightIndex:] + + if len(left) == 0 || len(right) == 0 { + // There is a superfluous level in the Huffman tree indicating + // a bug in the encoder. However, this bug has been observed in + // the wild so we handle it. + + // If this function was called recursively then we know that + // len(codes) >= 2 because, otherwise, we would have hit the + // "leaf node" case, below, and not recurred. + // + // However, for the initial call it's possible that len(codes) + // is zero or one. Both cases are invalid because a zero length + // tree cannot encode anything and a length-1 tree can only + // encode EOF and so is superfluous. We reject both. + if len(codes) < 2 { + return 0, StructuralError("empty Huffman tree") + } + + // In this case the recursion doesn't always reduce the length + // of codes so we need to ensure termination via another + // mechanism. + if level == 31 { + // Since len(codes) >= 2 the only way that the values + // can match at all 32 bits is if they are equal, which + // is invalid. This ensures that we never enter + // infinite recursion. + return 0, StructuralError("equal symbols in Huffman tree") + } + + if len(left) == 0 { + return buildHuffmanNode(t, right, level+1) + } + return buildHuffmanNode(t, left, level+1) + } + + nodeIndex = uint16(t.nextNode) + node := &t.nodes[t.nextNode] + t.nextNode++ + + if len(left) == 1 { + // leaf node + node.left = invalidNodeValue + node.leftValue = left[0].value + } else { + node.left, err = buildHuffmanNode(t, left, level+1) + } + + if err != nil { + return + } + + if len(right) == 1 { + // leaf node + node.right = invalidNodeValue + node.rightValue = right[0].value + } else { + node.right, err = buildHuffmanNode(t, right, level+1) + } + + return +} diff --git a/src/compress/bzip2/move_to_front.go b/src/compress/bzip2/move_to_front.go new file mode 100644 index 0000000..526dfb3 --- /dev/null +++ b/src/compress/bzip2/move_to_front.go @@ -0,0 +1,53 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bzip2 + +// moveToFrontDecoder implements a move-to-front list. Such a list is an +// efficient way to transform a string with repeating elements into one with +// many small valued numbers, which is suitable for entropy encoding. It works +// by starting with an initial list of symbols and references symbols by their +// index into that list. When a symbol is referenced, it's moved to the front +// of the list. Thus, a repeated symbol ends up being encoded with many zeros, +// as the symbol will be at the front of the list after the first access. +type moveToFrontDecoder []byte + +// newMTFDecoder creates a move-to-front decoder with an explicit initial list +// of symbols. +func newMTFDecoder(symbols []byte) moveToFrontDecoder { + if len(symbols) > 256 { + panic("too many symbols") + } + return moveToFrontDecoder(symbols) +} + +// newMTFDecoderWithRange creates a move-to-front decoder with an initial +// symbol list of 0...n-1. +func newMTFDecoderWithRange(n int) moveToFrontDecoder { + if n > 256 { + panic("newMTFDecoderWithRange: cannot have > 256 symbols") + } + + m := make([]byte, n) + for i := 0; i < n; i++ { + m[i] = byte(i) + } + return moveToFrontDecoder(m) +} + +func (m moveToFrontDecoder) Decode(n int) (b byte) { + // Implement move-to-front with a simple copy. This approach + // beats more sophisticated approaches in benchmarking, probably + // because it has high locality of reference inside of a + // single cache line (most move-to-front operations have n < 64). + b = m[n] + copy(m[1:], m[:n]) + m[0] = b + return +} + +// First returns the symbol at the front of the list. +func (m moveToFrontDecoder) First() byte { + return m[0] +} diff --git a/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 b/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 Binary files differnew file mode 100644 index 0000000..6c56de3 --- /dev/null +++ b/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 diff --git a/src/compress/bzip2/testdata/e.txt.bz2 b/src/compress/bzip2/testdata/e.txt.bz2 Binary files differnew file mode 100644 index 0000000..65bf3b4 --- /dev/null +++ b/src/compress/bzip2/testdata/e.txt.bz2 diff --git a/src/compress/bzip2/testdata/fail-issue5747.bz2 b/src/compress/bzip2/testdata/fail-issue5747.bz2 Binary files differnew file mode 100644 index 0000000..2bf2b6a --- /dev/null +++ b/src/compress/bzip2/testdata/fail-issue5747.bz2 diff --git a/src/compress/bzip2/testdata/pass-random1.bin b/src/compress/bzip2/testdata/pass-random1.bin Binary files differnew file mode 100644 index 0000000..6e17879 --- /dev/null +++ b/src/compress/bzip2/testdata/pass-random1.bin diff --git a/src/compress/bzip2/testdata/pass-random1.bz2 b/src/compress/bzip2/testdata/pass-random1.bz2 Binary files differnew file mode 100644 index 0000000..f6a9dc7 --- /dev/null +++ b/src/compress/bzip2/testdata/pass-random1.bz2 diff --git a/src/compress/bzip2/testdata/pass-random2.bin b/src/compress/bzip2/testdata/pass-random2.bin new file mode 100644 index 0000000..f152d40 --- /dev/null +++ b/src/compress/bzip2/testdata/pass-random2.bin @@ -0,0 +1 @@ +e&DJJ9dE
C#;23ql*+~
;&4ۛ,cc5s5
\ No newline at end of file diff --git a/src/compress/bzip2/testdata/pass-random2.bz2 b/src/compress/bzip2/testdata/pass-random2.bz2 Binary files differnew file mode 100644 index 0000000..91ef775 --- /dev/null +++ b/src/compress/bzip2/testdata/pass-random2.bz2 diff --git a/src/compress/bzip2/testdata/pass-sawtooth.bz2 b/src/compress/bzip2/testdata/pass-sawtooth.bz2 Binary files differnew file mode 100644 index 0000000..579a378 --- /dev/null +++ b/src/compress/bzip2/testdata/pass-sawtooth.bz2 diff --git a/src/compress/bzip2/testdata/random.data.bz2 b/src/compress/bzip2/testdata/random.data.bz2 Binary files differnew file mode 100644 index 0000000..1ef2300 --- /dev/null +++ b/src/compress/bzip2/testdata/random.data.bz2 |