From f6ad4dcef54c5ce997a4bad5a6d86de229015700 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Tue, 16 Apr 2024 21:25:22 +0200
Subject: Adding upstream version 1.22.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/compress/bzip2/bit_reader.go                   |  82 ++++
 src/compress/bzip2/bzip2.go                        | 502 +++++++++++++++++++++
 src/compress/bzip2/bzip2_test.go                   | 240 ++++++++++
 src/compress/bzip2/huffman.go                      | 237 ++++++++++
 src/compress/bzip2/move_to_front.go                |  53 +++
 .../bzip2/testdata/Isaac.Newton-Opticks.txt.bz2    | Bin 0 -> 132469 bytes
 src/compress/bzip2/testdata/e.txt.bz2              | Bin 0 -> 43149 bytes
 src/compress/bzip2/testdata/fail-issue5747.bz2     | Bin 0 -> 7232 bytes
 src/compress/bzip2/testdata/pass-random1.bin       | Bin 0 -> 1024 bytes
 src/compress/bzip2/testdata/pass-random1.bz2       | Bin 0 -> 1309 bytes
 src/compress/bzip2/testdata/pass-random2.bin       |   1 +
 src/compress/bzip2/testdata/pass-random2.bz2       | Bin 0 -> 125 bytes
 src/compress/bzip2/testdata/pass-sawtooth.bz2      | Bin 0 -> 2017 bytes
 src/compress/bzip2/testdata/random.data.bz2        | Bin 0 -> 16846 bytes
 14 files changed, 1115 insertions(+)
 create mode 100644 src/compress/bzip2/bit_reader.go
 create mode 100644 src/compress/bzip2/bzip2.go
 create mode 100644 src/compress/bzip2/bzip2_test.go
 create mode 100644 src/compress/bzip2/huffman.go
 create mode 100644 src/compress/bzip2/move_to_front.go
 create mode 100644 src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2
 create mode 100644 src/compress/bzip2/testdata/e.txt.bz2
 create mode 100644 src/compress/bzip2/testdata/fail-issue5747.bz2
 create mode 100644 src/compress/bzip2/testdata/pass-random1.bin
 create mode 100644 src/compress/bzip2/testdata/pass-random1.bz2
 create mode 100644 src/compress/bzip2/testdata/pass-random2.bin
 create mode 100644 src/compress/bzip2/testdata/pass-random2.bz2
 create mode 100644 src/compress/bzip2/testdata/pass-sawtooth.bz2
 create mode 100644 src/compress/bzip2/testdata/random.data.bz2

(limited to 'src/compress/bzip2')

diff --git a/src/compress/bzip2/bit_reader.go b/src/compress/bzip2/bit_reader.go
new file mode 100644
index 0000000..8c40777
--- /dev/null
+++ b/src/compress/bzip2/bit_reader.go
@@ -0,0 +1,82 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bzip2
+
+import (
+	"bufio"
+	"io"
+)
+
+// bitReader wraps an io.Reader and provides the ability to read values,
+// bit-by-bit, from it. Its Read* methods don't return the usual error
+// because the error handling was verbose. Instead, any error is kept and can
+// be checked afterwards.
+type bitReader struct {
+	r    io.ByteReader
+	n    uint64
+	bits uint
+	err  error
+}
+
+// newBitReader returns a new bitReader reading from r. If r is not
+// already an io.ByteReader, it will be converted via a bufio.Reader.
+func newBitReader(r io.Reader) bitReader {
+	byter, ok := r.(io.ByteReader)
+	if !ok {
+		byter = bufio.NewReader(r)
+	}
+	return bitReader{r: byter}
+}
+
+// ReadBits64 reads the given number of bits and returns them in the
+// least-significant part of a uint64. In the event of an error, it returns 0
+// and the error can be obtained by calling bitReader.Err().
+func (br *bitReader) ReadBits64(bits uint) (n uint64) {
+	for bits > br.bits {
+		b, err := br.r.ReadByte()
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
+		if err != nil {
+			br.err = err
+			return 0
+		}
+		br.n <<= 8
+		br.n |= uint64(b)
+		br.bits += 8
+	}
+
+	// br.n looks like this (assuming that br.bits = 14 and bits = 6):
+	// Bit: 111111
+	//      5432109876543210
+	//
+	//         (6 bits, the desired output)
+	//        |-----|
+	//        V     V
+	//      0101101101001110
+	//        ^            ^
+	//        |------------|
+	//           br.bits (num valid bits)
+	//
+	// The next line right shifts the desired bits into the
+	// least-significant places and masks off anything above.
+	n = (br.n >> (br.bits - bits)) & ((1 << bits) - 1)
+	br.bits -= bits
+	return
+}
+
+func (br *bitReader) ReadBits(bits uint) (n int) {
+	n64 := br.ReadBits64(bits)
+	return int(n64)
+}
+
+func (br *bitReader) ReadBit() bool {
+	n := br.ReadBits(1)
+	return n != 0
+}
+
+func (br *bitReader) Err() error {
+	return br.err
+}
diff --git a/src/compress/bzip2/bzip2.go b/src/compress/bzip2/bzip2.go
new file mode 100644
index 0000000..8f88e38
--- /dev/null
+++ b/src/compress/bzip2/bzip2.go
@@ -0,0 +1,502 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package bzip2 implements bzip2 decompression.
+package bzip2
+
+import "io"
+
+// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
+// of guessing: https://en.wikipedia.org/wiki/Bzip2
+// The source code to pyflate was useful for debugging:
+// http://www.paul.sladen.org/projects/pyflate
+
+// A StructuralError is returned when the bzip2 data is found to be
+// syntactically invalid.
+type StructuralError string
+
+func (s StructuralError) Error() string {
+	return "bzip2 data invalid: " + string(s)
+}
+
+// A reader decompresses bzip2 compressed data.
+type reader struct {
+	br           bitReader
+	fileCRC      uint32
+	blockCRC     uint32
+	wantBlockCRC uint32
+	setupDone    bool // true if we have parsed the bzip2 header.
+	blockSize    int  // blockSize in bytes, i.e. 900 * 1000.
+	eof          bool
+	c            [256]uint // the ``C'' array for the inverse BWT.
+	tt           []uint32  // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
+	tPos         uint32    // Index of the next output byte in tt.
+
+	preRLE      []uint32 // contains the RLE data still to be processed.
+	preRLEUsed  int      // number of entries of preRLE used.
+	lastByte    int      // the last byte value seen.
+	byteRepeats uint     // the number of repeats of lastByte seen.
+	repeats     uint     // the number of copies of lastByte to output.
+}
+
+// NewReader returns an io.Reader which decompresses bzip2 data from r.
+// If r does not also implement [io.ByteReader],
+// the decompressor may read more data than necessary from r.
+func NewReader(r io.Reader) io.Reader {
+	bz2 := new(reader)
+	bz2.br = newBitReader(r)
+	return bz2
+}
+
+const bzip2FileMagic = 0x425a // "BZ"
+const bzip2BlockMagic = 0x314159265359
+const bzip2FinalMagic = 0x177245385090
+
+// setup parses the bzip2 header.
+func (bz2 *reader) setup(needMagic bool) error {
+	br := &bz2.br
+
+	if needMagic {
+		magic := br.ReadBits(16)
+		if magic != bzip2FileMagic {
+			return StructuralError("bad magic value")
+		}
+	}
+
+	t := br.ReadBits(8)
+	if t != 'h' {
+		return StructuralError("non-Huffman entropy encoding")
+	}
+
+	level := br.ReadBits(8)
+	if level < '1' || level > '9' {
+		return StructuralError("invalid compression level")
+	}
+
+	bz2.fileCRC = 0
+	bz2.blockSize = 100 * 1000 * (level - '0')
+	if bz2.blockSize > len(bz2.tt) {
+		bz2.tt = make([]uint32, bz2.blockSize)
+	}
+	return nil
+}
+
+func (bz2 *reader) Read(buf []byte) (n int, err error) {
+	if bz2.eof {
+		return 0, io.EOF
+	}
+
+	if !bz2.setupDone {
+		err = bz2.setup(true)
+		brErr := bz2.br.Err()
+		if brErr != nil {
+			err = brErr
+		}
+		if err != nil {
+			return 0, err
+		}
+		bz2.setupDone = true
+	}
+
+	n, err = bz2.read(buf)
+	brErr := bz2.br.Err()
+	if brErr != nil {
+		err = brErr
+	}
+	return
+}
+
+func (bz2 *reader) readFromBlock(buf []byte) int {
+	// bzip2 is a block based compressor, except that it has a run-length
+	// preprocessing step. The block based nature means that we can
+	// preallocate fixed-size buffers and reuse them. However, the RLE
+	// preprocessing would require allocating huge buffers to store the
+	// maximum expansion. Thus we process blocks all at once, except for
+	// the RLE which we decompress as required.
+	n := 0
+	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
+		// We have RLE data pending.
+
+		// The run-length encoding works like this:
+		// Any sequence of four equal bytes is followed by a length
+		// byte which contains the number of repeats of that byte to
+		// include. (The number of repeats can be zero.) Because we are
+		// decompressing on-demand our state is kept in the reader
+		// object.
+
+		if bz2.repeats > 0 {
+			buf[n] = byte(bz2.lastByte)
+			n++
+			bz2.repeats--
+			if bz2.repeats == 0 {
+				bz2.lastByte = -1
+			}
+			continue
+		}
+
+		bz2.tPos = bz2.preRLE[bz2.tPos]
+		b := byte(bz2.tPos)
+		bz2.tPos >>= 8
+		bz2.preRLEUsed++
+
+		if bz2.byteRepeats == 3 {
+			bz2.repeats = uint(b)
+			bz2.byteRepeats = 0
+			continue
+		}
+
+		if bz2.lastByte == int(b) {
+			bz2.byteRepeats++
+		} else {
+			bz2.byteRepeats = 0
+		}
+		bz2.lastByte = int(b)
+
+		buf[n] = b
+		n++
+	}
+
+	return n
+}
+
+func (bz2 *reader) read(buf []byte) (int, error) {
+	for {
+		n := bz2.readFromBlock(buf)
+		if n > 0 || len(buf) == 0 {
+			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
+			return n, nil
+		}
+
+		// End of block. Check CRC.
+		if bz2.blockCRC != bz2.wantBlockCRC {
+			bz2.br.err = StructuralError("block checksum mismatch")
+			return 0, bz2.br.err
+		}
+
+		// Find next block.
+		br := &bz2.br
+		switch br.ReadBits64(48) {
+		default:
+			return 0, StructuralError("bad magic value found")
+
+		case bzip2BlockMagic:
+			// Start of block.
+			err := bz2.readBlock()
+			if err != nil {
+				return 0, err
+			}
+
+		case bzip2FinalMagic:
+			// Check end-of-file CRC.
+			wantFileCRC := uint32(br.ReadBits64(32))
+			if br.err != nil {
+				return 0, br.err
+			}
+			if bz2.fileCRC != wantFileCRC {
+				br.err = StructuralError("file checksum mismatch")
+				return 0, br.err
+			}
+
+			// Skip ahead to byte boundary.
+			// Is there a file concatenated to this one?
+			// It would start with BZ.
+			if br.bits%8 != 0 {
+				br.ReadBits(br.bits % 8)
+			}
+			b, err := br.r.ReadByte()
+			if err == io.EOF {
+				br.err = io.EOF
+				bz2.eof = true
+				return 0, io.EOF
+			}
+			if err != nil {
+				br.err = err
+				return 0, err
+			}
+			z, err := br.r.ReadByte()
+			if err != nil {
+				if err == io.EOF {
+					err = io.ErrUnexpectedEOF
+				}
+				br.err = err
+				return 0, err
+			}
+			if b != 'B' || z != 'Z' {
+				return 0, StructuralError("bad magic value in continuation file")
+			}
+			if err := bz2.setup(false); err != nil {
+				return 0, err
+			}
+		}
+	}
+}
+
+// readBlock reads a bzip2 block. The magic number should already have been consumed.
+func (bz2 *reader) readBlock() (err error) {
+	br := &bz2.br
+	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
+	bz2.blockCRC = 0
+	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
+	randomized := br.ReadBits(1)
+	if randomized != 0 {
+		return StructuralError("deprecated randomized files")
+	}
+	origPtr := uint(br.ReadBits(24))
+
+	// If not every byte value is used in the block (i.e., it's text) then
+	// the symbol set is reduced. The symbols used are stored as a
+	// two-level, 16x16 bitmap.
+	symbolRangeUsedBitmap := br.ReadBits(16)
+	symbolPresent := make([]bool, 256)
+	numSymbols := 0
+	for symRange := uint(0); symRange < 16; symRange++ {
+		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
+			bits := br.ReadBits(16)
+			for symbol := uint(0); symbol < 16; symbol++ {
+				if bits&(1<<(15-symbol)) != 0 {
+					symbolPresent[16*symRange+symbol] = true
+					numSymbols++
+				}
+			}
+		}
+	}
+
+	if numSymbols == 0 {
+		// There must be an EOF symbol.
+		return StructuralError("no symbols in input")
+	}
+
+	// A block uses between two and six different Huffman trees.
+	numHuffmanTrees := br.ReadBits(3)
+	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
+		return StructuralError("invalid number of Huffman trees")
+	}
+
+	// The Huffman tree can switch every 50 symbols so there's a list of
+	// tree indexes telling us which tree to use for each 50 symbol block.
+	numSelectors := br.ReadBits(15)
+	treeIndexes := make([]uint8, numSelectors)
+
+	// The tree indexes are move-to-front transformed and stored as unary
+	// numbers.
+	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
+	for i := range treeIndexes {
+		c := 0
+		for {
+			inc := br.ReadBits(1)
+			if inc == 0 {
+				break
+			}
+			c++
+		}
+		if c >= numHuffmanTrees {
+			return StructuralError("tree index too large")
+		}
+		treeIndexes[i] = mtfTreeDecoder.Decode(c)
+	}
+
+	// The list of symbols for the move-to-front transform is taken from
+	// the previously decoded symbol bitmap.
+	symbols := make([]byte, numSymbols)
+	nextSymbol := 0
+	for i := 0; i < 256; i++ {
+		if symbolPresent[i] {
+			symbols[nextSymbol] = byte(i)
+			nextSymbol++
+		}
+	}
+	mtf := newMTFDecoder(symbols)
+
+	numSymbols += 2 // to account for RUNA and RUNB symbols
+	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
+
+	// Now we decode the arrays of code-lengths for each tree.
+	lengths := make([]uint8, numSymbols)
+	for i := range huffmanTrees {
+		// The code lengths are delta encoded from a 5-bit base value.
+		length := br.ReadBits(5)
+		for j := range lengths {
+			for {
+				if length < 1 || length > 20 {
+					return StructuralError("Huffman length out of range")
+				}
+				if !br.ReadBit() {
+					break
+				}
+				if br.ReadBit() {
+					length--
+				} else {
+					length++
+				}
+			}
+			lengths[j] = uint8(length)
+		}
+		huffmanTrees[i], err = newHuffmanTree(lengths)
+		if err != nil {
+			return err
+		}
+	}
+
+	selectorIndex := 1 // the next tree index to use
+	if len(treeIndexes) == 0 {
+		return StructuralError("no tree selectors given")
+	}
+	if int(treeIndexes[0]) >= len(huffmanTrees) {
+		return StructuralError("tree selector out of range")
+	}
+	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
+	bufIndex := 0 // indexes bz2.buf, the output buffer.
+	// The output of the move-to-front transform is run-length encoded and
+	// we merge the decoding into the Huffman parsing loop. These two
+	// variables accumulate the repeat count. See the Wikipedia page for
+	// details.
+	repeat := 0
+	repeatPower := 0
+
+	// The `C' array (used by the inverse BWT) needs to be zero initialized.
+	for i := range bz2.c {
+		bz2.c[i] = 0
+	}
+
+	decoded := 0 // counts the number of symbols decoded by the current tree.
+	for {
+		if decoded == 50 {
+			if selectorIndex >= numSelectors {
+				return StructuralError("insufficient selector indices for number of symbols")
+			}
+			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
+				return StructuralError("tree selector out of range")
+			}
+			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
+			selectorIndex++
+			decoded = 0
+		}
+
+		v := currentHuffmanTree.Decode(br)
+		decoded++
+
+		if v < 2 {
+			// This is either the RUNA or RUNB symbol.
+			if repeat == 0 {
+				repeatPower = 1
+			}
+			repeat += repeatPower << v
+			repeatPower <<= 1
+
+			// This limit of 2 million comes from the bzip2 source
+			// code. It prevents repeat from overflowing.
+			if repeat > 2*1024*1024 {
+				return StructuralError("repeat count too large")
+			}
+			continue
+		}
+
+		if repeat > 0 {
+			// We have decoded a complete run-length so we need to
+			// replicate the last output symbol.
+			if repeat > bz2.blockSize-bufIndex {
+				return StructuralError("repeats past end of block")
+			}
+			for i := 0; i < repeat; i++ {
+				b := mtf.First()
+				bz2.tt[bufIndex] = uint32(b)
+				bz2.c[b]++
+				bufIndex++
+			}
+			repeat = 0
+		}
+
+		if int(v) == numSymbols-1 {
+			// This is the EOF symbol. Because it's always at the
+			// end of the move-to-front list, and never gets moved
+			// to the front, it has this unique value.
+			break
+		}
+
+		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
+		// one would expect |v-2| to be passed to the MTF decoder.
+		// However, the front of the MTF list is never referenced as 0,
+		// it's always referenced with a run-length of 1. Thus 0
+		// doesn't need to be encoded and we have |v-1| in the next
+		// line.
+		b := mtf.Decode(int(v - 1))
+		if bufIndex >= bz2.blockSize {
+			return StructuralError("data exceeds block size")
+		}
+		bz2.tt[bufIndex] = uint32(b)
+		bz2.c[b]++
+		bufIndex++
+	}
+
+	if origPtr >= uint(bufIndex) {
+		return StructuralError("origPtr out of bounds")
+	}
+
+	// We have completed the entropy decoding. Now we can perform the
+	// inverse BWT and setup the RLE buffer.
+	bz2.preRLE = bz2.tt[:bufIndex]
+	bz2.preRLEUsed = 0
+	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
+	bz2.lastByte = -1
+	bz2.byteRepeats = 0
+	bz2.repeats = 0
+
+	return nil
+}
+
+// inverseBWT implements the inverse Burrows-Wheeler transform as described in
+// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
+// In that document, origPtr is called “I” and c is the “C” array after the
+// first pass over the data. It's an argument here because we merge the first
+// pass with the Huffman decoding.
+//
+// This also implements the “single array” method from the bzip2 source code
+// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
+// index of the next byte in the top 24-bits. The index of the first byte is
+// returned.
+func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
+	sum := uint(0)
+	for i := 0; i < 256; i++ {
+		sum += c[i]
+		c[i] = sum - c[i]
+	}
+
+	for i := range tt {
+		b := tt[i] & 0xff
+		tt[c[b]] |= uint32(i) << 8
+		c[b]++
+	}
+
+	return tt[origPtr] >> 8
+}
+
+// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
+// causing the bits in the input to be processed in the reverse of the usual order.
+
+var crctab [256]uint32
+
+func init() {
+	const poly = 0x04C11DB7
+	for i := range crctab {
+		crc := uint32(i) << 24
+		for j := 0; j < 8; j++ {
+			if crc&0x80000000 != 0 {
+				crc = (crc << 1) ^ poly
+			} else {
+				crc <<= 1
+			}
+		}
+		crctab[i] = crc
+	}
+}
+
+// updateCRC updates the crc value to incorporate the data in b.
+// The initial value is 0.
+func updateCRC(val uint32, b []byte) uint32 {
+	crc := ^val
+	for _, v := range b {
+		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
+	}
+	return ^crc
+}
diff --git a/src/compress/bzip2/bzip2_test.go b/src/compress/bzip2/bzip2_test.go
new file mode 100644
index 0000000..e6065cb
--- /dev/null
+++ b/src/compress/bzip2/bzip2_test.go
@@ -0,0 +1,240 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bzip2
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"os"
+	"testing"
+)
+
+func mustDecodeHex(s string) []byte {
+	b, err := hex.DecodeString(s)
+	if err != nil {
+		panic(err)
+	}
+	return b
+}
+
+func mustLoadFile(f string) []byte {
+	b, err := os.ReadFile(f)
+	if err != nil {
+		panic(err)
+	}
+	return b
+}
+
+func trim(b []byte) string {
+	const limit = 1024
+	if len(b) < limit {
+		return fmt.Sprintf("%q", b)
+	}
+	return fmt.Sprintf("%q...", b[:limit])
+}
+
+func TestReader(t *testing.T) {
+	var vectors = []struct {
+		desc   string
+		input  []byte
+		output []byte
+		fail   bool
+	}{{
+		desc: "hello world",
+		input: mustDecodeHex("" +
+			"425a68393141592653594eece83600000251800010400006449080200031064c" +
+			"4101a7a9a580bb9431f8bb9229c28482776741b0",
+		),
+		output: []byte("hello world\n"),
+	}, {
+		desc: "concatenated files",
+		input: mustDecodeHex("" +
+			"425a68393141592653594eece83600000251800010400006449080200031064c" +
+			"4101a7a9a580bb9431f8bb9229c28482776741b0425a68393141592653594eec" +
+			"e83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb92" +
+			"29c28482776741b0",
+		),
+		output: []byte("hello world\nhello world\n"),
+	}, {
+		desc: "32B zeros",
+		input: mustDecodeHex("" +
+			"425a6839314159265359b5aa5098000000600040000004200021008283177245" +
+			"385090b5aa5098",
+		),
+		output: make([]byte, 32),
+	}, {
+		desc: "1MiB zeros",
+		input: mustDecodeHex("" +
+			"425a683931415926535938571ce50008084000c0040008200030cc0529a60806" +
+			"c4201e2ee48a70a12070ae39ca",
+		),
+		output: make([]byte, 1<<20),
+	}, {
+		desc:   "random data",
+		input:  mustLoadFile("testdata/pass-random1.bz2"),
+		output: mustLoadFile("testdata/pass-random1.bin"),
+	}, {
+		desc:   "random data - full symbol range",
+		input:  mustLoadFile("testdata/pass-random2.bz2"),
+		output: mustLoadFile("testdata/pass-random2.bin"),
+	}, {
+		desc: "random data - uses RLE1 stage",
+		input: mustDecodeHex("" +
+			"425a6839314159265359d992d0f60000137dfe84020310091c1e280e100e0428" +
+			"01099210094806c0110002e70806402000546034000034000000f28300000320" +
+			"00d3403264049270eb7a9280d308ca06ad28f6981bee1bf8160727c7364510d7" +
+			"3a1e123083421b63f031f63993a0f40051fbf177245385090d992d0f60",
+		),
+		output: mustDecodeHex("" +
+			"92d5652616ac444a4a04af1a8a3964aca0450d43d6cf233bd03233f4ba92f871" +
+			"9e6c2a2bd4f5f88db07ecd0da3a33b263483db9b2c158786ad6363be35d17335" +
+			"ba",
+		),
+	}, {
+		desc:  "1MiB sawtooth",
+		input: mustLoadFile("testdata/pass-sawtooth.bz2"),
+		output: func() []byte {
+			b := make([]byte, 1<<20)
+			for i := range b {
+				b[i] = byte(i)
+			}
+			return b
+		}(),
+	}, {
+		desc:  "RLE2 buffer overrun - issue 5747",
+		input: mustLoadFile("testdata/fail-issue5747.bz2"),
+		fail:  true,
+	}, {
+		desc: "out-of-range selector - issue 8363",
+		input: mustDecodeHex("" +
+			"425a68393141592653594eece83600000251800010400006449080200031064c" +
+			"4101a7a9a580bb943117724538509000000000",
+		),
+		fail: true,
+	}, {
+		desc: "bad block size - issue 13941",
+		input: mustDecodeHex("" +
+			"425a683131415926535936dc55330063ffc0006000200020a40830008b0008b8" +
+			"bb9229c28481b6e2a998",
+		),
+		fail: true,
+	}, {
+		desc: "bad huffman delta",
+		input: mustDecodeHex("" +
+			"425a6836314159265359b1f7404b000000400040002000217d184682ee48a70a" +
+			"12163ee80960",
+		),
+		fail: true,
+	}}
+
+	for i, v := range vectors {
+		rd := NewReader(bytes.NewReader(v.input))
+		buf, err := io.ReadAll(rd)
+
+		if fail := bool(err != nil); fail != v.fail {
+			if fail {
+				t.Errorf("test %d (%s), unexpected failure: %v", i, v.desc, err)
+			} else {
+				t.Errorf("test %d (%s), unexpected success", i, v.desc)
+			}
+		}
+		if !v.fail && !bytes.Equal(buf, v.output) {
+			t.Errorf("test %d (%s), output mismatch:\ngot  %s\nwant %s", i, v.desc, trim(buf), trim(v.output))
+		}
+	}
+}
+
+func TestBitReader(t *testing.T) {
+	var vectors = []struct {
+		nbits uint // Number of bits to read
+		value int  // Expected output value (0 for error)
+		fail  bool // Expected operation failure?
+	}{
+		{nbits: 1, value: 1},
+		{nbits: 1, value: 0},
+		{nbits: 1, value: 1},
+		{nbits: 5, value: 11},
+		{nbits: 32, value: 0x12345678},
+		{nbits: 15, value: 14495},
+		{nbits: 3, value: 6},
+		{nbits: 6, value: 13},
+		{nbits: 1, fail: true},
+	}
+
+	rd := bytes.NewReader([]byte{0xab, 0x12, 0x34, 0x56, 0x78, 0x71, 0x3f, 0x8d})
+	br := newBitReader(rd)
+	for i, v := range vectors {
+		val := br.ReadBits(v.nbits)
+		if fail := bool(br.err != nil); fail != v.fail {
+			if fail {
+				t.Errorf("test %d, unexpected failure: ReadBits(%d) = %v", i, v.nbits, br.err)
+			} else {
+				t.Errorf("test %d, unexpected success: ReadBits(%d) = nil", i, v.nbits)
+			}
+		}
+		if !v.fail && val != v.value {
+			t.Errorf("test %d, mismatching value: ReadBits(%d) = %d, want %d", i, v.nbits, val, v.value)
+		}
+	}
+}
+
+func TestMTF(t *testing.T) {
+	var vectors = []struct {
+		idx int   // Input index
+		sym uint8 // Expected output symbol
+	}{
+		{idx: 1, sym: 1}, // [1 0 2 3 4]
+		{idx: 0, sym: 1}, // [1 0 2 3 4]
+		{idx: 1, sym: 0}, // [0 1 2 3 4]
+		{idx: 4, sym: 4}, // [4 0 1 2 3]
+		{idx: 1, sym: 0}, // [0 4 1 2 3]
+	}
+
+	mtf := newMTFDecoderWithRange(5)
+	for i, v := range vectors {
+		sym := mtf.Decode(v.idx)
+		t.Log(mtf)
+		if sym != v.sym {
+			t.Errorf("test %d, symbol mismatch: Decode(%d) = %d, want %d", i, v.idx, sym, v.sym)
+		}
+	}
+}
+
+func TestZeroRead(t *testing.T) {
+	b := mustDecodeHex("425a6839314159265359b5aa5098000000600040000004200021008283177245385090b5aa5098")
+	r := NewReader(bytes.NewReader(b))
+	if n, err := r.Read(nil); n != 0 || err != nil {
+		t.Errorf("Read(nil) = (%d, %v), want (0, nil)", n, err)
+	}
+}
+
+var (
+	digits = mustLoadFile("testdata/e.txt.bz2")
+	newton = mustLoadFile("testdata/Isaac.Newton-Opticks.txt.bz2")
+	random = mustLoadFile("testdata/random.data.bz2")
+)
+
+func benchmarkDecode(b *testing.B, compressed []byte) {
+	// Determine the uncompressed size of testfile.
+	uncompressedSize, err := io.Copy(io.Discard, NewReader(bytes.NewReader(compressed)))
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.SetBytes(uncompressedSize)
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		r := bytes.NewReader(compressed)
+		io.Copy(io.Discard, NewReader(r))
+	}
+}
+
+func BenchmarkDecodeDigits(b *testing.B) { benchmarkDecode(b, digits) }
+func BenchmarkDecodeNewton(b *testing.B) { benchmarkDecode(b, newton) }
+func BenchmarkDecodeRand(b *testing.B)   { benchmarkDecode(b, random) }
diff --git a/src/compress/bzip2/huffman.go b/src/compress/bzip2/huffman.go
new file mode 100644
index 0000000..447fc4d
--- /dev/null
+++ b/src/compress/bzip2/huffman.go
@@ -0,0 +1,237 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bzip2
+
+import "sort"
+
+// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
+// symbol.
+type huffmanTree struct {
+	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
+	// root of the tree and nextNode contains the index of the next element
+	// of nodes to use when the tree is being constructed.
+	nodes    []huffmanNode
+	nextNode int
+}
+
+// A huffmanNode is a node in the tree. left and right contain indexes into the
+// nodes slice of the tree. If left or right is invalidNodeValue then the child
+// is a left node and its value is in leftValue/rightValue.
+//
+// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
+// tree, but also two magic values for run-length encoding and an EOF symbol.
+// Thus there are more than 256 possible symbols.
+type huffmanNode struct {
+	left, right           uint16
+	leftValue, rightValue uint16
+}
+
+// invalidNodeValue is an invalid index which marks a leaf node in the tree.
+const invalidNodeValue = 0xffff
+
+// Decode reads bits from the given bitReader and navigates the tree until a
+// symbol is found.
+func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
+	nodeIndex := uint16(0) // node 0 is the root of the tree.
+
+	for {
+		node := &t.nodes[nodeIndex]
+
+		var bit uint16
+		if br.bits > 0 {
+			// Get next bit - fast path.
+			br.bits--
+			bit = uint16(br.n>>(br.bits&63)) & 1
+		} else {
+			// Get next bit - slow path.
+			// Use ReadBits to retrieve a single bit
+			// from the underling io.ByteReader.
+			bit = uint16(br.ReadBits(1))
+		}
+
+		// Trick a compiler into generating conditional move instead of branch,
+		// by making both loads unconditional.
+		l, r := node.left, node.right
+
+		if bit == 1 {
+			nodeIndex = l
+		} else {
+			nodeIndex = r
+		}
+
+		if nodeIndex == invalidNodeValue {
+			// We found a leaf. Use the value of bit to decide
+			// whether is a left or a right value.
+			l, r := node.leftValue, node.rightValue
+			if bit == 1 {
+				v = l
+			} else {
+				v = r
+			}
+			return
+		}
+	}
+}
+
+// newHuffmanTree builds a Huffman tree from a slice containing the code
+// lengths of each symbol. The maximum code length is 32 bits.
+func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
+	// There are many possible trees that assign the same code length to
+	// each symbol (consider reflecting a tree down the middle, for
+	// example). Since the code length assignments determine the
+	// efficiency of the tree, each of these trees is equally good. In
+	// order to minimize the amount of information needed to build a tree
+	// bzip2 uses a canonical tree so that it can be reconstructed given
+	// only the code length assignments.
+
+	if len(lengths) < 2 {
+		panic("newHuffmanTree: too few symbols")
+	}
+
+	var t huffmanTree
+
+	// First we sort the code length assignments by ascending code length,
+	// using the symbol value to break ties.
+	pairs := make([]huffmanSymbolLengthPair, len(lengths))
+	for i, length := range lengths {
+		pairs[i].value = uint16(i)
+		pairs[i].length = length
+	}
+
+	sort.Slice(pairs, func(i, j int) bool {
+		if pairs[i].length < pairs[j].length {
+			return true
+		}
+		if pairs[i].length > pairs[j].length {
+			return false
+		}
+		if pairs[i].value < pairs[j].value {
+			return true
+		}
+		return false
+	})
+
+	// Now we assign codes to the symbols, starting with the longest code.
+	// We keep the codes packed into a uint32, at the most-significant end.
+	// So branches are taken from the MSB downwards. This makes it easy to
+	// sort them later.
+	code := uint32(0)
+	length := uint8(32)
+
+	codes := make([]huffmanCode, len(lengths))
+	for i := len(pairs) - 1; i >= 0; i-- {
+		if length > pairs[i].length {
+			length = pairs[i].length
+		}
+		codes[i].code = code
+		codes[i].codeLen = length
+		codes[i].value = pairs[i].value
+		// We need to 'increment' the code, which means treating |code|
+		// like a |length| bit number.
+		code += 1 << (32 - length)
+	}
+
+	// Now we can sort by the code so that the left half of each branch are
+	// grouped together, recursively.
+	sort.Slice(codes, func(i, j int) bool {
+		return codes[i].code < codes[j].code
+	})
+
+	t.nodes = make([]huffmanNode, len(codes))
+	_, err := buildHuffmanNode(&t, codes, 0)
+	return t, err
+}
+
+// huffmanSymbolLengthPair contains a symbol and its code length.
+type huffmanSymbolLengthPair struct {
+	value  uint16
+	length uint8
+}
+
+// huffmanCode contains a symbol, its code and code length.
+type huffmanCode struct {
+	code    uint32
+	codeLen uint8
+	value   uint16
+}
+
+// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
+// the Huffman tree at the given level. It returns the index of the newly
+// constructed node.
+func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
+	test := uint32(1) << (31 - level)
+
+	// We have to search the list of codes to find the divide between the left and right sides.
+	firstRightIndex := len(codes)
+	for i, code := range codes {
+		if code.code&test != 0 {
+			firstRightIndex = i
+			break
+		}
+	}
+
+	left := codes[:firstRightIndex]
+	right := codes[firstRightIndex:]
+
+	if len(left) == 0 || len(right) == 0 {
+		// There is a superfluous level in the Huffman tree indicating
+		// a bug in the encoder. However, this bug has been observed in
+		// the wild so we handle it.
+
+		// If this function was called recursively then we know that
+		// len(codes) >= 2 because, otherwise, we would have hit the
+		// "leaf node" case, below, and not recurred.
+		//
+		// However, for the initial call it's possible that len(codes)
+		// is zero or one. Both cases are invalid because a zero length
+		// tree cannot encode anything and a length-1 tree can only
+		// encode EOF and so is superfluous. We reject both.
+		if len(codes) < 2 {
+			return 0, StructuralError("empty Huffman tree")
+		}
+
+		// In this case the recursion doesn't always reduce the length
+		// of codes so we need to ensure termination via another
+		// mechanism.
+		if level == 31 {
+			// Since len(codes) >= 2 the only way that the values
+			// can match at all 32 bits is if they are equal, which
+			// is invalid. This ensures that we never enter
+			// infinite recursion.
+			return 0, StructuralError("equal symbols in Huffman tree")
+		}
+
+		if len(left) == 0 {
+			return buildHuffmanNode(t, right, level+1)
+		}
+		return buildHuffmanNode(t, left, level+1)
+	}
+
+	nodeIndex = uint16(t.nextNode)
+	node := &t.nodes[t.nextNode]
+	t.nextNode++
+
+	if len(left) == 1 {
+		// leaf node
+		node.left = invalidNodeValue
+		node.leftValue = left[0].value
+	} else {
+		node.left, err = buildHuffmanNode(t, left, level+1)
+	}
+
+	if err != nil {
+		return
+	}
+
+	if len(right) == 1 {
+		// leaf node
+		node.right = invalidNodeValue
+		node.rightValue = right[0].value
+	} else {
+		node.right, err = buildHuffmanNode(t, right, level+1)
+	}
+
+	return
+}
diff --git a/src/compress/bzip2/move_to_front.go b/src/compress/bzip2/move_to_front.go
new file mode 100644
index 0000000..526dfb3
--- /dev/null
+++ b/src/compress/bzip2/move_to_front.go
@@ -0,0 +1,53 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bzip2
+
+// moveToFrontDecoder implements a move-to-front list. Such a list is an
+// efficient way to transform a string with repeating elements into one with
+// many small valued numbers, which is suitable for entropy encoding. It works
+// by starting with an initial list of symbols and references symbols by their
+// index into that list. When a symbol is referenced, it's moved to the front
+// of the list. Thus, a repeated symbol ends up being encoded with many zeros,
+// as the symbol will be at the front of the list after the first access.
+type moveToFrontDecoder []byte
+
+// newMTFDecoder creates a move-to-front decoder with an explicit initial list
+// of symbols.
+func newMTFDecoder(symbols []byte) moveToFrontDecoder {
+	if len(symbols) > 256 {
+		panic("too many symbols")
+	}
+	return moveToFrontDecoder(symbols)
+}
+
+// newMTFDecoderWithRange creates a move-to-front decoder with an initial
+// symbol list of 0...n-1.
+func newMTFDecoderWithRange(n int) moveToFrontDecoder {
+	if n > 256 {
+		panic("newMTFDecoderWithRange: cannot have > 256 symbols")
+	}
+
+	m := make([]byte, n)
+	for i := 0; i < n; i++ {
+		m[i] = byte(i)
+	}
+	return moveToFrontDecoder(m)
+}
+
+func (m moveToFrontDecoder) Decode(n int) (b byte) {
+	// Implement move-to-front with a simple copy. This approach
+	// beats more sophisticated approaches in benchmarking, probably
+	// because it has high locality of reference inside of a
+	// single cache line (most move-to-front operations have n < 64).
+	b = m[n]
+	copy(m[1:], m[:n])
+	m[0] = b
+	return
+}
+
+// First returns the symbol at the front of the list.
+func (m moveToFrontDecoder) First() byte {
+	return m[0]
+}
diff --git a/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 b/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2
new file mode 100644
index 0000000..6c56de3
Binary files /dev/null and b/src/compress/bzip2/testdata/Isaac.Newton-Opticks.txt.bz2 differ
diff --git a/src/compress/bzip2/testdata/e.txt.bz2 b/src/compress/bzip2/testdata/e.txt.bz2
new file mode 100644
index 0000000..65bf3b4
Binary files /dev/null and b/src/compress/bzip2/testdata/e.txt.bz2 differ
diff --git a/src/compress/bzip2/testdata/fail-issue5747.bz2 b/src/compress/bzip2/testdata/fail-issue5747.bz2
new file mode 100644
index 0000000..2bf2b6a
Binary files /dev/null and b/src/compress/bzip2/testdata/fail-issue5747.bz2 differ
diff --git a/src/compress/bzip2/testdata/pass-random1.bin b/src/compress/bzip2/testdata/pass-random1.bin
new file mode 100644
index 0000000..6e17879
Binary files /dev/null and b/src/compress/bzip2/testdata/pass-random1.bin differ
diff --git a/src/compress/bzip2/testdata/pass-random1.bz2 b/src/compress/bzip2/testdata/pass-random1.bz2
new file mode 100644
index 0000000..f6a9dc7
Binary files /dev/null and b/src/compress/bzip2/testdata/pass-random1.bz2 differ
diff --git a/src/compress/bzip2/testdata/pass-random2.bin b/src/compress/bzip2/testdata/pass-random2.bin
new file mode 100644
index 0000000..f152d40
--- /dev/null
+++ b/src/compress/bzip2/testdata/pass-random2.bin
@@ -0,0 +1 @@
+��e&�DJJ��9d��EC��#;�23����q�l*+�����~���;&4�ۛ,���cc�5�s5�
\ No newline at end of file
diff --git a/src/compress/bzip2/testdata/pass-random2.bz2 b/src/compress/bzip2/testdata/pass-random2.bz2
new file mode 100644
index 0000000..91ef775
Binary files /dev/null and b/src/compress/bzip2/testdata/pass-random2.bz2 differ
diff --git a/src/compress/bzip2/testdata/pass-sawtooth.bz2 b/src/compress/bzip2/testdata/pass-sawtooth.bz2
new file mode 100644
index 0000000..579a378
Binary files /dev/null and b/src/compress/bzip2/testdata/pass-sawtooth.bz2 differ
diff --git a/src/compress/bzip2/testdata/random.data.bz2 b/src/compress/bzip2/testdata/random.data.bz2
new file mode 100644
index 0000000..1ef2300
Binary files /dev/null and b/src/compress/bzip2/testdata/random.data.bz2 differ
-- 
cgit v1.2.3