summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2017-05-07 15:50:01 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2017-05-07 15:50:01 +0000
commitefe11df4b26426c3db4f96592e899b1f005a878e (patch)
tree4195bbbd046bc3e44a30202dc3284e45bad0f516
parentReleasing debian version 1.8-5. (diff)
downloadclzip-efe11df4b26426c3db4f96592e899b1f005a878e.tar.xz
clzip-efe11df4b26426c3db4f96592e899b1f005a878e.zip
Merging upstream version 1.9.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
-rw-r--r--ChangeLog18
-rw-r--r--INSTALL2
-rw-r--r--Makefile.in14
-rw-r--r--NEWS30
-rw-r--r--README39
-rw-r--r--carg_parser.c9
-rw-r--r--carg_parser.h2
-rwxr-xr-xconfigure21
-rw-r--r--decoder.c89
-rw-r--r--decoder.h209
-rw-r--r--doc/clzip.17
-rw-r--r--doc/clzip.info1029
-rw-r--r--doc/clzip.texi1004
-rw-r--r--encoder.c96
-rw-r--r--encoder.h90
-rw-r--r--encoder_base.c20
-rw-r--r--encoder_base.h217
-rw-r--r--fast_encoder.c44
-rw-r--r--fast_encoder.h8
-rw-r--r--file_index.c268
-rw-r--r--file_index.h90
-rw-r--r--list.c123
-rw-r--r--lzip.h27
-rw-r--r--main.c267
-rwxr-xr-xtestsuite/check.sh264
-rw-r--r--testsuite/test.txt.lzbin7376 -> 7376 bytes
26 files changed, 3243 insertions, 744 deletions
diff --git a/ChangeLog b/ChangeLog
index 4377266..88b6ab2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2017-04-13 Antonio Diaz Diaz <antonio@gnu.org>
+
+ * Version 1.9 released.
+ * The option '-l, --list' has been ported from lziprecover.
+ * Don't allow mixing different operations (-d, -l or -t).
+ * Compression time of option '-0' has been reduced by 6%.
+ * Compression time of options -1 to -9 has been reduced by 1%.
+ * Decompression time has been reduced by 7%.
+ * main.c: Continue testing if any input file is a terminal.
+ * main.c: Show trailing data in both hexadecimal and ASCII.
+ * file_index.c: Improve detection of bad dict and trailing data.
+ * lzip.h: Unified messages for bad magic, trailing data, etc.
+ * clzip.texi: Added missing chapters from lzip.texi.
+
2016-05-13 Antonio Diaz Diaz <antonio@gnu.org>
* Version 1.8 released.
@@ -7,7 +21,7 @@
* decoder.c (LZd_verify_trailer): Removed test of final code.
* main.c (main): Delete '--output' file if infd is a terminal.
* main.c (main): Don't use stdin more than once.
- * lzip.texi: Added chapter 'Trailing data'.
+ * clzip.texi: Added chapter 'Trailing data'.
* configure: Avoid warning on some shells when testing for gcc.
* Makefile.in: Detect the existence of install-info.
* testsuite/check.sh: A POSIX shell is required to run the tests.
@@ -94,7 +108,7 @@
* Translated to C from the C++ source of lzip 1.10.
-Copyright (C) 2010-2016 Antonio Diaz Diaz.
+Copyright (C) 2010-2017 Antonio Diaz Diaz.
This file is a collection of facts, and thus it is not copyrightable,
but just in case, you have unlimited permission to copy, distribute and
diff --git a/INSTALL b/INSTALL
index ed6f68a..0808525 100644
--- a/INSTALL
+++ b/INSTALL
@@ -62,7 +62,7 @@ After running 'configure', you can run 'make' and 'make install' as
explained above.
-Copyright (C) 2010-2016 Antonio Diaz Diaz.
+Copyright (C) 2010-2017 Antonio Diaz Diaz.
This file is free documentation: you have unlimited permission to copy,
distribute and modify it.
diff --git a/Makefile.in b/Makefile.in
index d028148..08274dd 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -7,13 +7,15 @@ INSTALL_DIR = $(INSTALL) -d -m 755
SHELL = /bin/sh
CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1
-objs = carg_parser.o encoder_base.o encoder.o fast_encoder.o decoder.o main.o
+objs = carg_parser.o file_index.o list.o encoder_base.o encoder.o \
+ fast_encoder.o decoder.o main.o
.PHONY : all install install-bin install-info install-man \
install-strip install-compress install-strip-compress \
install-bin-strip install-info-compress install-man-compress \
- install-as-lzip uninstall uninstall-bin uninstall-info uninstall-man \
+ install-as-lzip \
+ uninstall uninstall-bin uninstall-info uninstall-man \
doc info man check dist clean distclean
all : $(progname)
@@ -33,6 +35,8 @@ decoder.o : lzip.h decoder.h
encoder_base.o : lzip.h encoder_base.h
encoder.o : lzip.h encoder_base.h encoder.h
fast_encoder.o : lzip.h encoder_base.h fast_encoder.h
+file_index.o : lzip.h file_index.h
+list.o : lzip.h file_index.h
main.o : carg_parser.h lzip.h decoder.h encoder_base.h encoder.h fast_encoder.h
@@ -117,11 +121,11 @@ dist : doc
$(DISTNAME)/doc/$(progname).1 \
$(DISTNAME)/doc/$(pkgname).info \
$(DISTNAME)/doc/$(pkgname).texi \
+ $(DISTNAME)/*.h \
+ $(DISTNAME)/*.c \
$(DISTNAME)/testsuite/check.sh \
$(DISTNAME)/testsuite/test.txt \
- $(DISTNAME)/testsuite/test.txt.lz \
- $(DISTNAME)/*.h \
- $(DISTNAME)/*.c
+ $(DISTNAME)/testsuite/test.txt.lz
rm -f $(DISTNAME)
lzip -v -9 $(DISTNAME).tar
diff --git a/NEWS b/NEWS
index 7f49444..1f85396 100644
--- a/NEWS
+++ b/NEWS
@@ -1,21 +1,21 @@
-Changes in version 1.8:
+Changes in version 1.9:
-The option "-a, --trailing-error", which makes clzip exit with error
-status 2 if any remaining input is detected after decompressing the last
-member, has been added.
+The option '-l, --list' has been ported from lziprecover.
-When decompressing or testing, up to 6 bytes of trailing data are
-printed if "-vvvv" is specified.
+It is now an error to specify two or more different operations in the
+command line (--decompress, --list or --test).
-The test of the value remaining in the range decoder has been removed.
-(After extensive testing it has been found useless to detect corruption
-in the decompressed data. Eliminating it reduces the number of false
-positives for corruption and makes error detection more accurate).
+Compression time of option '-0' has been reduced by 6%.
-When decompressing, the file specified with the '--output' option is now
-deleted if the input is a terminal.
+Compression time of options '-1' to '-9' has been reduced by 1%.
-The new chapter "Trailing data" has been added to the manual.
+Decompression time has been reduced by 7%.
-A harmless check failure on Windows, caused by the failed comparison of
-a message in text mode, has been fixed.
+In test mode, clzip now continues checking the rest of the files if any
+input file is a terminal.
+
+Trailing data are now shown both in hexadecimal and as a string of
+printable ASCII characters.
+
+Three missing chapters have been added to the manual, which now contains
+all the chapters of the lzip manual.
diff --git a/README b/README
index 9316c4e..cc5fd73 100644
--- a/README
+++ b/README
@@ -1,14 +1,15 @@
Description
-Clzip is a lossless data compressor with a user interface similar to the
-one of gzip or bzip2. Clzip is about as fast as gzip, compresses most
-files more than bzip2, and is better than both from a data recovery
-perspective.
+Clzip is a C language version of lzip, fully compatible with lzip-1.4 or
+newer. As clzip is written in C, it may be easier to integrate in
+applications like package managers, embedded devices, or systems lacking
+a C++ compiler.
-Clzip uses the lzip file format; the files produced by clzip are fully
-compatible with lzip-1.4 or newer, and can be rescued with lziprecover.
-Clzip is in fact a C language version of lzip, intended for embedded
-devices or systems lacking a C++ compiler.
+Lzip is a lossless data compressor with a user interface similar to the
+one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0),
+or compress most files more than bzip2 (lzip -9). Decompression speed is
+intermediate between gzip and bzip2. Lzip is better than gzip and bzip2
+from a data recovery perspective.
The lzip file format is designed for data sharing and long-term
archiving, taking into account both data integrity and decoder
@@ -21,11 +22,11 @@ availability:
merging of damaged copies of a file.
* The lzip format is as simple as possible (but not simpler). The
- lzip manual provides the code of a simple decompressor along with a
- detailed explanation of how it works, so that with the only help of
- the lzip manual it would be possible for a digital archaeologist to
- extract the data from a lzip file long after quantum computers
- eventually render LZMA obsolete.
+ lzip manual provides the source code of a simple decompressor along
+ with a detailed explanation of how it works, so that with the only
+ help of the lzip manual it would be possible for a digital
+ archaeologist to extract the data from a lzip file long after
+ quantum computers eventually render LZMA obsolete.
* Additionally the lzip reference implementation is copylefted, which
guarantees that it will remain free forever.
@@ -80,11 +81,11 @@ or more compressed files. The result is the concatenation of the
corresponding uncompressed files. Integrity testing of concatenated
compressed files is also supported.
-Clzip can produce multimember files and safely recover, with
-lziprecover, the undamaged members in case of file damage. Clzip can
-also split the compressed output in volumes of a given size, even when
-reading from standard input. This allows the direct creation of
-multivolume compressed tar archives.
+Clzip can produce multimember files, and lziprecover can safely recover
+the undamaged members in case of file damage. Clzip can also split the
+compressed output in volumes of a given size, even when reading from
+standard input. This allows the direct creation of multivolume
+compressed tar archives.
Clzip is able to compress and decompress streams of unlimited size by
automatically creating multimember output. The members so created are
@@ -115,7 +116,7 @@ range encoding), Igor Pavlov (for putting all the above together in
LZMA), and Julian Seward (for bzip2's CLI).
-Copyright (C) 2010-2016 Antonio Diaz Diaz.
+Copyright (C) 2010-2017 Antonio Diaz Diaz.
This file is free documentation: you have unlimited permission to copy,
distribute and modify it.
diff --git a/carg_parser.c b/carg_parser.c
index 3d4e89f..6850643 100644
--- a/carg_parser.c
+++ b/carg_parser.c
@@ -1,5 +1,5 @@
/* Arg_parser - POSIX/GNU command line argument parser. (C version)
- Copyright (C) 2006-2016 Antonio Diaz Diaz.
+ Copyright (C) 2006-2017 Antonio Diaz Diaz.
This library is free software. Redistribution and use in source and
binary forms, with or without modification, are permitted provided
@@ -94,7 +94,7 @@ static char parse_long_option( struct Arg_parser * const ap,
else if( index < 0 ) index = i; /* First nonexact match found */
else if( options[index].code != options[i].code ||
options[index].has_arg != options[i].has_arg )
- ambig = 1; /* Second or later nonexact match found */
+ ambig = 1; /* Second or later nonexact match found */
}
if( ambig && !exact )
@@ -230,7 +230,9 @@ char ap_init( struct Arg_parser * const ap,
}
else
{
- if( !in_order )
+ if( in_order )
+ { if( !push_back_record( ap, 0, argv[argind++] ) ) return 0; }
+ else
{
void * tmp = ap_resize_buffer( non_options,
( non_options_size + 1 ) * sizeof *non_options );
@@ -238,7 +240,6 @@ char ap_init( struct Arg_parser * const ap,
non_options = (const char **)tmp;
non_options[non_options_size++] = argv[argind++];
}
- else if( !push_back_record( ap, 0, argv[argind++] ) ) return 0;
}
}
if( ap->error ) free_data( ap );
diff --git a/carg_parser.h b/carg_parser.h
index e918942..c4ce31d 100644
--- a/carg_parser.h
+++ b/carg_parser.h
@@ -1,5 +1,5 @@
/* Arg_parser - POSIX/GNU command line argument parser. (C version)
- Copyright (C) 2006-2016 Antonio Diaz Diaz.
+ Copyright (C) 2006-2017 Antonio Diaz Diaz.
This library is free software. Redistribution and use in source and
binary forms, with or without modification, are permitted provided
diff --git a/configure b/configure
index bdfc775..5ef6211 100755
--- a/configure
+++ b/configure
@@ -1,12 +1,12 @@
#! /bin/sh
# configure script for Clzip - LZMA lossless data compressor
-# Copyright (C) 2010-2016 Antonio Diaz Diaz.
+# Copyright (C) 2010-2017 Antonio Diaz Diaz.
#
# This configure script is free software: you have unlimited permission
# to copy, distribute and modify it.
pkgname=clzip
-pkgversion=1.8
+pkgversion=1.9
progname=clzip
srctrigger=doc/${pkgname}.texi
@@ -26,11 +26,11 @@ CFLAGS='-Wall -W -O2'
LDFLAGS=
# checking whether we are using GNU C.
-if /bin/sh -c "${CC} --version" > /dev/null 2>&1 ; then true
-else
+/bin/sh -c "${CC} --version" > /dev/null 2>&1 ||
+ {
CC=cc
- CFLAGS='-W -O2'
-fi
+ CFLAGS=-O2
+ }
# Loop over all args
args=
@@ -52,9 +52,12 @@ while [ $# != 0 ] ; do
# Process the options
case ${option} in
--help | -h)
- echo "Usage: configure [options]"
+ echo "Usage: $0 [OPTION]... [VAR=VALUE]..."
+ echo
+ echo "To assign makefile variables (e.g., CC, CFLAGS...), specify them as"
+ echo "arguments to configure in the form VAR=VALUE."
echo
- echo "Options: [defaults in brackets]"
+ echo "Options and variables: [defaults in brackets]"
echo " -h, --help display this help and exit"
echo " -V, --version output version information and exit"
echo " --srcdir=DIR find the sources in DIR [. or ..]"
@@ -165,7 +168,7 @@ echo "LDFLAGS = ${LDFLAGS}"
rm -f Makefile
cat > Makefile << EOF
# Makefile for Clzip - LZMA lossless data compressor
-# Copyright (C) 2010-2016 Antonio Diaz Diaz.
+# Copyright (C) 2010-2017 Antonio Diaz Diaz.
# This file was generated automatically by configure. Don't edit.
#
# This Makefile is free software: you have unlimited permission
diff --git a/decoder.c b/decoder.c
index 942ec60..453d271 100644
--- a/decoder.c
+++ b/decoder.c
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -172,7 +172,7 @@ static bool LZd_verify_trailer( struct LZ_decoder * const d,
( 8.0 * member_size ) / data_size,
100.0 * ( 1.0 - ( (double)member_size / data_size ) ) );
if( !error && verbosity >= 4 )
- fprintf( stderr, "data CRC %08X, data size %9llu, member size %8llu. ",
+ fprintf( stderr, "CRC %08X, decompressed %9llu, compressed %8llu. ",
LZd_crc( d ), data_size, member_size );
return !error;
}
@@ -184,46 +184,74 @@ int LZd_decode_member( struct LZ_decoder * const d,
struct Pretty_print * const pp )
{
struct Range_decoder * const rdec = d->rdec;
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[states][pos_states];
+ Bit_model bm_rep[states];
+ Bit_model bm_rep0[states];
+ Bit_model bm_rep1[states];
+ Bit_model bm_rep2[states];
+ Bit_model bm_len[states][pos_states];
+ Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model+1];
+ Bit_model bm_align[dis_align_size];
+ struct Len_model match_len_model;
+ struct Len_model rep_len_model;
unsigned rep0 = 0; /* rep[0-3] latest four distances */
unsigned rep1 = 0; /* used for efficient coding of */
unsigned rep2 = 0; /* repeated distances */
unsigned rep3 = 0;
State state = 0;
+ Bm_array_init( bm_literal[0], (1 << literal_context_bits) * 0x300 );
+ Bm_array_init( bm_match[0], states * pos_states );
+ Bm_array_init( bm_rep, states );
+ Bm_array_init( bm_rep0, states );
+ Bm_array_init( bm_rep1, states );
+ Bm_array_init( bm_rep2, states );
+ Bm_array_init( bm_len[0], states * pos_states );
+ Bm_array_init( bm_dis_slot[0], len_states * (1 << dis_slot_bits) );
+ Bm_array_init( bm_dis, modeled_distances - end_dis_model + 1 );
+ Bm_array_init( bm_align, dis_align_size );
+ Lm_init( &match_len_model );
+ Lm_init( &rep_len_model );
+
Rd_load( rdec );
while( !Rd_finished( rdec ) )
{
const int pos_state = LZd_data_position( d ) & pos_state_mask;
- if( Rd_decode_bit( rdec, &d->bm_match[state][pos_state] ) == 0 ) /* 1st bit */
+ if( Rd_decode_bit( rdec, &bm_match[state][pos_state] ) == 0 ) /* 1st bit */
{
- const uint8_t prev_byte = LZd_peek_prev( d );
+ Bit_model * const bm = bm_literal[get_lit_state(LZd_peek_prev( d ))];
if( St_is_char( state ) )
{
state -= ( state < 4 ) ? state : 3;
- LZd_put_byte( d, Rd_decode_tree( rdec,
- d->bm_literal[get_lit_state(prev_byte)], 8 ) );
+ LZd_put_byte( d, Rd_decode_tree8( rdec, bm ) );
}
else
{
state -= ( state < 10 ) ? 3 : 6;
- LZd_put_byte( d, Rd_decode_matched( rdec,
- d->bm_literal[get_lit_state(prev_byte)],
- LZd_peek( d, rep0 ) ) );
+ LZd_put_byte( d, Rd_decode_matched( rdec, bm, LZd_peek( d, rep0 ) ) );
}
}
else /* match or repeated match */
{
int len;
- if( Rd_decode_bit( rdec, &d->bm_rep[state] ) != 0 ) /* 2nd bit */
+ if( Rd_decode_bit( rdec, &bm_rep[state] ) != 0 ) /* 2nd bit */
{
- if( Rd_decode_bit( rdec, &d->bm_rep0[state] ) != 0 ) /* 3rd bit */
+ if( Rd_decode_bit( rdec, &bm_rep0[state] ) == 0 ) /* 3rd bit */
+ {
+ if( Rd_decode_bit( rdec, &bm_len[state][pos_state] ) == 0 ) /* 4th bit */
+ { state = St_set_short_rep( state );
+ LZd_put_byte( d, LZd_peek( d, rep0 ) ); continue; }
+ }
+ else
{
unsigned distance;
- if( Rd_decode_bit( rdec, &d->bm_rep1[state] ) == 0 ) /* 4th bit */
+ if( Rd_decode_bit( rdec, &bm_rep1[state] ) == 0 ) /* 4th bit */
distance = rep1;
else
{
- if( Rd_decode_bit( rdec, &d->bm_rep2[state] ) == 0 ) /* 5th bit */
+ if( Rd_decode_bit( rdec, &bm_rep2[state] ) == 0 ) /* 5th bit */
distance = rep2;
else
{ distance = rep3; rep3 = rep2; }
@@ -232,36 +260,29 @@ int LZd_decode_member( struct LZ_decoder * const d,
rep1 = rep0;
rep0 = distance;
}
- else
- {
- if( Rd_decode_bit( rdec, &d->bm_len[state][pos_state] ) == 0 ) /* 4th bit */
- { state = St_set_short_rep( state );
- LZd_put_byte( d, LZd_peek( d, rep0 ) ); continue; }
- }
state = St_set_rep( state );
- len = min_match_len + Rd_decode_len( rdec, &d->rep_len_model, pos_state );
+ len = min_match_len + Rd_decode_len( rdec, &rep_len_model, pos_state );
}
else /* match */
{
- const unsigned rep0_saved = rep0;
- int dis_slot;
- len = min_match_len + Rd_decode_len( rdec, &d->match_len_model, pos_state );
- dis_slot = Rd_decode_tree6( rdec, d->bm_dis_slot[get_len_state(len)] );
- if( dis_slot < start_dis_model ) rep0 = dis_slot;
- else
+ unsigned distance;
+ len = min_match_len + Rd_decode_len( rdec, &match_len_model, pos_state );
+ distance = Rd_decode_tree6( rdec, bm_dis_slot[get_len_state(len)] );
+ if( distance >= start_dis_model )
{
+ const unsigned dis_slot = distance;
const int direct_bits = ( dis_slot >> 1 ) - 1;
- rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
if( dis_slot < end_dis_model )
- rep0 += Rd_decode_tree_reversed( rdec,
- d->bm_dis + rep0 - dis_slot - 1, direct_bits );
+ distance += Rd_decode_tree_reversed( rdec,
+ bm_dis + ( distance - dis_slot ), direct_bits );
else
{
- rep0 += Rd_decode( rdec, direct_bits - dis_align_bits ) << dis_align_bits;
- rep0 += Rd_decode_tree_reversed4( rdec, d->bm_align );
- if( rep0 == 0xFFFFFFFFU ) /* marker found */
+ distance +=
+ Rd_decode( rdec, direct_bits - dis_align_bits ) << dis_align_bits;
+ distance += Rd_decode_tree_reversed4( rdec, bm_align );
+ if( distance == 0xFFFFFFFFU ) /* marker found */
{
- rep0 = rep0_saved;
Rd_normalize( rdec );
LZd_flush_data( d );
if( len == min_match_len ) /* End Of Stream marker */
@@ -281,7 +302,7 @@ int LZd_decode_member( struct LZ_decoder * const d,
}
}
}
- rep3 = rep2; rep2 = rep1; rep1 = rep0_saved;
+ rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance;
state = St_set_match( state );
if( rep0 >= d->dictionary_size || ( rep0 >= d->pos && !d->pos_wrapped ) )
{ LZd_flush_data( d ); return 1; }
diff --git a/decoder.h b/decoder.h
index 662aaf9..2f96575 100644
--- a/decoder.h
+++ b/decoder.h
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -56,7 +56,7 @@ Rd_member_position( const struct Range_decoder * const rdec )
{ return rdec->partial_member_pos + rdec->pos; }
static inline void Rd_reset_member_position( struct Range_decoder * const rdec )
- { rdec->partial_member_pos = -rdec->pos; }
+ { rdec->partial_member_pos = 0; rdec->partial_member_pos -= rdec->pos; }
static inline uint8_t Rd_get_byte( struct Range_decoder * const rdec )
{
@@ -68,23 +68,22 @@ static inline uint8_t Rd_get_byte( struct Range_decoder * const rdec )
static inline int Rd_read_data( struct Range_decoder * const rdec,
uint8_t * const outbuf, const int size )
{
- int rest = size;
- while( rest > 0 && !Rd_finished( rdec ) )
+ int sz = 0;
+ while( sz < size && !Rd_finished( rdec ) )
{
- const int rd = min( rest, rdec->stream_pos - rdec->pos );
- memcpy( outbuf + size - rest, rdec->buffer + rdec->pos, rd );
+ const int rd = min( size - sz, rdec->stream_pos - rdec->pos );
+ memcpy( outbuf + sz, rdec->buffer + rdec->pos, rd );
rdec->pos += rd;
- rest -= rd;
+ sz += rd;
}
- return size - rest;
+ return sz;
}
static inline void Rd_load( struct Range_decoder * const rdec )
{
int i;
rdec->code = 0;
- for( i = 0; i < 5; ++i )
- rdec->code = (rdec->code << 8) | Rd_get_byte( rdec );
+ for( i = 0; i < 5; ++i ) rdec->code = (rdec->code << 8) | Rd_get_byte( rdec );
rdec->range = 0xFFFFFFFFU;
rdec->code &= rdec->range; /* make sure that first byte is discarded */
}
@@ -92,34 +91,30 @@ static inline void Rd_load( struct Range_decoder * const rdec )
static inline void Rd_normalize( struct Range_decoder * const rdec )
{
if( rdec->range <= 0x00FFFFFFU )
- {
- rdec->range <<= 8;
- rdec->code = (rdec->code << 8) | Rd_get_byte( rdec );
- }
+ { rdec->range <<= 8; rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); }
}
-static inline int Rd_decode( struct Range_decoder * const rdec,
- const int num_bits )
+static inline unsigned Rd_decode( struct Range_decoder * const rdec,
+ const int num_bits )
{
- int symbol = 0;
+ unsigned symbol = 0;
int i;
for( i = num_bits; i > 0; --i )
{
- uint32_t mask;
+ bool bit;
Rd_normalize( rdec );
rdec->range >>= 1;
/* symbol <<= 1; */
/* if( rdec->code >= rdec->range ) { rdec->code -= rdec->range; symbol |= 1; } */
- mask = 0U - (rdec->code < rdec->range);
- rdec->code -= rdec->range;
- rdec->code += rdec->range & mask;
- symbol = (symbol << 1) + (mask + 1);
+ bit = ( rdec->code >= rdec->range );
+ symbol = ( symbol << 1 ) + bit;
+ rdec->code -= rdec->range & ( 0U - bit );
}
return symbol;
}
-static inline int Rd_decode_bit( struct Range_decoder * const rdec,
- Bit_model * const probability )
+static inline unsigned Rd_decode_bit( struct Range_decoder * const rdec,
+ Bit_model * const probability )
{
uint32_t bound;
Rd_normalize( rdec );
@@ -139,20 +134,20 @@ static inline int Rd_decode_bit( struct Range_decoder * const rdec,
}
}
-static inline int Rd_decode_tree( struct Range_decoder * const rdec,
- Bit_model bm[], const int num_bits )
+static inline unsigned Rd_decode_tree3( struct Range_decoder * const rdec,
+ Bit_model bm[] )
{
- int symbol = 1;
- int i;
- for( i = num_bits; i > 0; --i )
- symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
- return symbol - (1 << num_bits);
+ unsigned symbol = 1;
+ symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
+ symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
+ symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
+ return symbol & 7;
}
-static inline int Rd_decode_tree6( struct Range_decoder * const rdec,
- Bit_model bm[] )
+static inline unsigned Rd_decode_tree6( struct Range_decoder * const rdec,
+ Bit_model bm[] )
{
- int symbol = 1;
+ unsigned symbol = 1;
symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
@@ -162,69 +157,69 @@ static inline int Rd_decode_tree6( struct Range_decoder * const rdec,
return symbol & 0x3F;
}
-static inline int Rd_decode_tree_reversed( struct Range_decoder * const rdec,
- Bit_model bm[], const int num_bits )
+static inline unsigned Rd_decode_tree8( struct Range_decoder * const rdec,
+ Bit_model bm[] )
{
- int model = 1;
- int symbol = 0;
+ unsigned symbol = 1;
+ int i;
+ for( i = 0; i < 8; ++i )
+ symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
+ return symbol & 0xFF;
+ }
+
+static inline unsigned
+Rd_decode_tree_reversed( struct Range_decoder * const rdec,
+ Bit_model bm[], const int num_bits )
+ {
+ unsigned model = 1;
+ unsigned symbol = 0;
int i;
for( i = 0; i < num_bits; ++i )
{
- const bool bit = Rd_decode_bit( rdec, &bm[model] );
- model <<= 1;
- if( bit ) { ++model; symbol |= (1 << i); }
+ const unsigned bit = Rd_decode_bit( rdec, &bm[model] );
+ model = ( model << 1 ) + bit;
+ symbol |= ( bit << i );
}
return symbol;
}
-static inline int Rd_decode_tree_reversed4( struct Range_decoder * const rdec,
- Bit_model bm[] )
+static inline unsigned
+Rd_decode_tree_reversed4( struct Range_decoder * const rdec, Bit_model bm[] )
{
- int model = 1;
- int symbol = Rd_decode_bit( rdec, &bm[model] );
- int bit;
- model = (model << 1) + symbol;
- bit = Rd_decode_bit( rdec, &bm[model] );
- model = (model << 1) + bit; symbol |= (bit << 1);
+ unsigned symbol = Rd_decode_bit( rdec, &bm[1] );
+ unsigned model = 2 + symbol;
+ unsigned bit = Rd_decode_bit( rdec, &bm[model] );
+ model = ( model << 1 ) + bit; symbol |= ( bit << 1 );
bit = Rd_decode_bit( rdec, &bm[model] );
- model = (model << 1) + bit; symbol |= (bit << 2);
- if( Rd_decode_bit( rdec, &bm[model] ) ) symbol |= 8;
+ model = ( model << 1 ) + bit; symbol |= ( bit << 2 );
+ symbol |= ( Rd_decode_bit( rdec, &bm[model] ) << 3 );
return symbol;
}
-static inline int Rd_decode_matched( struct Range_decoder * const rdec,
- Bit_model bm[], int match_byte )
+static inline unsigned Rd_decode_matched( struct Range_decoder * const rdec,
+ Bit_model bm[], unsigned match_byte )
{
- Bit_model * const bm1 = bm + 0x100;
- int symbol = 1;
- while( symbol < 0x100 )
+ unsigned symbol = 1;
+ unsigned mask = 0x100;
+ while( true )
{
- int match_bit, bit;
- match_byte <<= 1;
- match_bit = match_byte & 0x100;
- bit = Rd_decode_bit( rdec, &bm1[match_bit+symbol] );
- symbol = ( symbol << 1 ) | bit;
- if( match_bit != bit << 8 )
- {
- while( symbol < 0x100 )
- symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] );
- break;
- }
+ const unsigned match_bit = ( match_byte <<= 1 ) & mask;
+ const unsigned bit = Rd_decode_bit( rdec, &bm[symbol+match_bit+mask] );
+ symbol = ( symbol << 1 ) + bit;
+ if( symbol > 0xFF ) return symbol & 0xFF;
+ mask &= ~(match_bit ^ (bit << 8)); /* if( match_bit != bit ) mask = 0; */
}
- return symbol & 0xFF;
}
-static inline int Rd_decode_len( struct Range_decoder * const rdec,
- struct Len_model * const lm,
- const int pos_state )
+static inline unsigned Rd_decode_len( struct Range_decoder * const rdec,
+ struct Len_model * const lm,
+ const int pos_state )
{
if( Rd_decode_bit( rdec, &lm->choice1 ) == 0 )
- return Rd_decode_tree( rdec, lm->bm_low[pos_state], len_low_bits );
+ return Rd_decode_tree3( rdec, lm->bm_low[pos_state] );
if( Rd_decode_bit( rdec, &lm->choice2 ) == 0 )
- return len_low_symbols +
- Rd_decode_tree( rdec, lm->bm_mid[pos_state], len_mid_bits );
- return len_low_symbols + len_mid_symbols +
- Rd_decode_tree( rdec, lm->bm_high, len_high_bits );
+ return len_low_symbols + Rd_decode_tree3( rdec, lm->bm_mid[pos_state] );
+ return len_low_symbols + len_mid_symbols + Rd_decode_tree8( rdec, lm->bm_high );
}
@@ -239,35 +234,22 @@ struct LZ_decoder
uint32_t crc;
int outfd; /* output file descriptor */
bool pos_wrapped;
-
- Bit_model bm_literal[1<<literal_context_bits][0x300];
- Bit_model bm_match[states][pos_states];
- Bit_model bm_rep[states];
- Bit_model bm_rep0[states];
- Bit_model bm_rep1[states];
- Bit_model bm_rep2[states];
- Bit_model bm_len[states][pos_states];
- Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
- Bit_model bm_dis[modeled_distances-end_dis_model];
- Bit_model bm_align[dis_align_size];
-
- struct Len_model match_len_model;
- struct Len_model rep_len_model;
};
void LZd_flush_data( struct LZ_decoder * const d );
static inline uint8_t LZd_peek_prev( const struct LZ_decoder * const d )
{
- const unsigned i = ( ( d->pos > 0 ) ? d->pos : d->dictionary_size ) - 1;
- return d->buffer[i];
+ if( d->pos > 0 ) return d->buffer[d->pos-1];
+ if( d->pos_wrapped ) return d->buffer[d->dictionary_size-1];
+ return 0; /* prev_byte of first byte */
}
static inline uint8_t LZd_peek( const struct LZ_decoder * const d,
const unsigned distance )
{
- unsigned i = d->pos - distance - 1;
- if( d->pos <= distance ) i += d->dictionary_size;
+ const unsigned i = ( ( d->pos > distance ) ? 0 : d->dictionary_size ) +
+ d->pos - distance - 1;
return d->buffer[i];
}
@@ -280,17 +262,26 @@ static inline void LZd_put_byte( struct LZ_decoder * const d, const uint8_t b )
static inline void LZd_copy_block( struct LZ_decoder * const d,
const unsigned distance, unsigned len )
{
- unsigned i = d->pos - distance - 1;
- bool fast;
- if( d->pos <= distance )
- { i += d->dictionary_size;
- fast = ( len <= d->dictionary_size - i && len <= i - d->pos ); }
+ unsigned lpos = d->pos, i = lpos - distance - 1;
+ bool fast, fast2;
+ if( lpos > distance )
+ {
+ fast = ( len < d->dictionary_size - lpos );
+ fast2 = ( fast && len <= lpos - i );
+ }
else
- fast = ( len < d->dictionary_size - d->pos && len <= d->pos - i );
- if( fast ) /* no wrap, no overlap */
{
- memcpy( d->buffer + d->pos, d->buffer + i, len );
+ i += d->dictionary_size;
+ fast = ( len < d->dictionary_size - i ); /* (i == pos) may happen */
+ fast2 = ( fast && len <= i - lpos );
+ }
+ if( fast ) /* no wrap */
+ {
d->pos += len;
+ if( fast2 ) /* no wrap, no overlap */
+ memcpy( d->buffer + lpos, d->buffer + i, len );
+ else
+ for( ; len > 0; --len ) d->buffer[lpos++] = d->buffer[i++];
}
else for( ; len > 0; --len )
{
@@ -314,20 +305,6 @@ static inline bool LZd_init( struct LZ_decoder * const d,
d->crc = 0xFFFFFFFFU;
d->outfd = ofd;
d->pos_wrapped = false;
-
- Bm_array_init( d->bm_literal[0], (1 << literal_context_bits) * 0x300 );
- Bm_array_init( d->bm_match[0], states * pos_states );
- Bm_array_init( d->bm_rep, states );
- Bm_array_init( d->bm_rep0, states );
- Bm_array_init( d->bm_rep1, states );
- Bm_array_init( d->bm_rep2, states );
- Bm_array_init( d->bm_len[0], states * pos_states );
- Bm_array_init( d->bm_dis_slot[0], len_states * (1 << dis_slot_bits) );
- Bm_array_init( d->bm_dis, modeled_distances - end_dis_model );
- Bm_array_init( d->bm_align, dis_align_size );
- Lm_init( &d->match_len_model );
- Lm_init( &d->rep_len_model );
- d->buffer[d->dictionary_size-1] = 0; /* prev_byte of first byte */
return true;
}
diff --git a/doc/clzip.1 b/doc/clzip.1
index 5dbb695..8522ad0 100644
--- a/doc/clzip.1
+++ b/doc/clzip.1
@@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1.
-.TH CLZIP "1" "May 2016" "clzip 1.8" "User Commands"
+.TH CLZIP "1" "April 2017" "clzip 1.9" "User Commands"
.SH NAME
clzip \- reduces the size of files
.SH SYNOPSIS
@@ -36,6 +36,9 @@ force re\-compression of compressed files
\fB\-k\fR, \fB\-\-keep\fR
keep (don't delete) input files
.TP
+\fB\-l\fR, \fB\-\-list\fR
+print (un)compressed file sizes
+.TP
\fB\-m\fR, \fB\-\-match\-length=\fR<bytes>
set match length limit in bytes [36]
.TP
@@ -87,7 +90,7 @@ Report bugs to lzip\-bug@nongnu.org
.br
Clzip home page: http://www.nongnu.org/lzip/clzip.html
.SH COPYRIGHT
-Copyright \(co 2016 Antonio Diaz Diaz.
+Copyright \(co 2017 Antonio Diaz Diaz.
License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html>
.br
This is free software: you are free to change and redistribute it.
diff --git a/doc/clzip.info b/doc/clzip.info
index c590473..f6e483f 100644
--- a/doc/clzip.info
+++ b/doc/clzip.info
@@ -11,21 +11,24 @@ File: clzip.info, Node: Top, Next: Introduction, Up: (dir)
Clzip Manual
************
-This manual is for Clzip (version 1.8, 13 May 2016).
+This manual is for Clzip (version 1.9, 13 April 2017).
* Menu:
* Introduction:: Purpose and features of clzip
* Invoking clzip:: Command line interface
+* Quality assurance:: Design, development and testing of lzip
* File format:: Detailed format of the compressed file
* Algorithm:: How clzip compresses the data
+* Stream format:: Format of the LZMA stream in lzip files
* Trailing data:: Extra data appended to the file
* Examples:: A small tutorial with examples
* Problems:: Reporting bugs
+* Reference source code:: Source code illustrating stream format
* Concept index:: Index of concepts
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This manual is free documentation: you have unlimited permission to
copy, distribute and modify it.
@@ -36,15 +39,16 @@ File: clzip.info, Node: Introduction, Next: Invoking clzip, Prev: Top, Up: T
1 Introduction
**************
-Clzip is a lossless data compressor with a user interface similar to the
-one of gzip or bzip2. Clzip is about as fast as gzip, compresses most
-files more than bzip2, and is better than both from a data recovery
-perspective.
+Clzip is a C language version of lzip, fully compatible with lzip-1.4 or
+newer. As clzip is written in C, it may be easier to integrate in
+applications like package managers, embedded devices, or systems lacking
+a C++ compiler.
- Clzip uses the lzip file format; the files produced by clzip are
-fully compatible with lzip-1.4 or newer, and can be rescued with
-lziprecover. Clzip is in fact a C language version of lzip, intended
-for embedded devices or systems lacking a C++ compiler.
+ Lzip is a lossless data compressor with a user interface similar to
+the one of gzip or bzip2. Lzip can compress about as fast as gzip
+(lzip -0), or compress most files more than bzip2 (lzip -9).
+Decompression speed is intermediate between gzip and bzip2. Lzip is
+better than gzip and bzip2 from a data recovery perspective.
The lzip file format is designed for data sharing and long-term
archiving, taking into account both data integrity and decoder
@@ -58,11 +62,11 @@ availability:
(lziprecover)Data safety.
* The lzip format is as simple as possible (but not simpler). The
- lzip manual provides the code of a simple decompressor along with
- a detailed explanation of how it works, so that with the only help
- of the lzip manual it would be possible for a digital
- archaeologist to extract the data from a lzip file long after
- quantum computers eventually render LZMA obsolete.
+ lzip manual provides the source code of a simple decompressor
+ along with a detailed explanation of how it works, so that with
+ the only help of the lzip manual it would be possible for a
+ digital archaeologist to extract the data from a lzip file long
+ after quantum computers eventually render LZMA obsolete.
* Additionally the lzip reference implementation is copylefted, which
guarantees that it will remain free forever.
@@ -128,9 +132,9 @@ two or more compressed files. The result is the concatenation of the
corresponding uncompressed files. Integrity testing of concatenated
compressed files is also supported.
- Clzip can produce multimember files and safely recover, with
-lziprecover, the undamaged members in case of file damage. Clzip can
-also split the compressed output in volumes of a given size, even when
+ Clzip can produce multimember files, and lziprecover can safely
+recover the undamaged members in case of file damage. Clzip can also
+split the compressed output in volumes of a given size, even when
reading from standard input. This allows the direct creation of
multivolume compressed tar archives.
@@ -138,8 +142,12 @@ multivolume compressed tar archives.
automatically creating multimember output. The members so created are
large, about 2 PiB each.
+ LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may
+never have been compressed. Decompressed is used to refer to data which
+have undergone the process of decompression.
+

-File: clzip.info, Node: Invoking clzip, Next: File format, Prev: Introduction, Up: Top
+File: clzip.info, Node: Invoking clzip, Next: Quality assurance, Prev: Introduction, Up: Top
2 Invoking clzip
****************
@@ -205,6 +213,21 @@ command line.
Keep (don't delete) input files during compression or
decompression.
+'-l'
+'--list'
+ Print the uncompressed size, compressed size and percentage saved
+ of the specified file(s). Trailing data are ignored. The values
+ produced are correct even for multimember files. If more than one
+ file is given, a final line containing the cumulative sizes is
+ printed. With '-v', the dictionary size, the number of members in
+ the file, and the amount of trailing data (if any) are also
+ printed. With '-vv', the positions and sizes of each member in
+ multimember files are also printed. '-lq' can be used to verify
+ quickly (without decompressing) the structural integrity of the
+ specified files. (Use '--test' to verify the data integrity).
+ '-alq' additionally verifies that none of the specified files
+ contain trailing data.
+
'-m BYTES'
'--match-length=BYTES'
Set the match length limit in bytes. After a match this long is
@@ -254,8 +277,9 @@ command line.
Check integrity of the specified file(s), but don't decompress
them. This really performs a trial decompression and throws away
the result. Use it together with '-v' to see information about
- the file(s). If a file fails the test, clzip continues checking
- the rest of the files.
+ the file(s). If a file fails the test, does not exist, can't be
+ opened, or is a terminal, clzip continues checking the rest of the
+ files.
'-v'
'--verbose'
@@ -265,7 +289,8 @@ command line.
When decompressing or testing, further -v's (up to 4) increase the
verbosity level, showing status, compression ratio, dictionary
size, trailer contents (CRC, data size, member size), and up to 6
- bytes of trailing data (if any).
+ bytes of trailing data (if any) both in hexadecimal and as a
+ string of printable ASCII characters.
'-0 .. -9'
Set the compression parameters (dictionary size and match length
@@ -317,9 +342,186 @@ invalid input file, 3 for an internal consistency error (eg, bug) which
caused clzip to panic.

-File: clzip.info, Node: File format, Next: Algorithm, Prev: Invoking clzip, Up: Top
+File: clzip.info, Node: Quality assurance, Next: File format, Prev: Invoking clzip, Up: Top
+
+3 Design, development and testing of lzip
+*****************************************
+
+There are two ways of constructing a software design: One way is to make
+it so simple that there are obviously no deficiencies and the other way
+is to make it so complicated that there are no obvious deficiencies. The
+first method is far more difficult.
+-- C.A.R. Hoare
+
+ Lzip has been designed, written and tested with great care to be the
+standard general-purpose compressor for unix-like systems. This chapter
+describes the lessons learned from previous compressors (gzip and
+bzip2), and their application to the design of lzip.
+
+
+3.1 Format design
+=================
+
+When gzip was designed in 1992, computers and operating systems were
+much less capable than they are today. Gzip tried to work around some of
+those limitations, like 8.3 file names, with additional fields in its
+file format.
+
+ Today those limitations have mostly disappeared, and the format of
+gzip has proved to be unnecessarily complicated. It includes fields
+that were never used, others that have lost their usefulness, and
+finally others that have become too limited.
+
+ Bzip2 was designed 5 years later, and its format is simpler than the
+one of gzip.
+
+ Probably the worst defect of the gzip format from the point of view
+of data safety is the variable size of its header. If the byte at
+offset 3 (flags) of a gzip member gets corrupted, it may become very
+difficult to recover the data, even if the compressed blocks are
+intact, because it can't be known with certainty where the compressed
+blocks begin.
+
+ By contrast, the header of a lzip member has a fixed length of 6. The
+LZMA stream in a lzip member always starts at offset 6, making it
+trivial to recover the data even if the whole header becomes corrupt.
+
+ Bzip2 also provides a header of fixed length and marks the begin and
+end of each compressed block with six magic bytes, making it possible to
+find the compressed blocks even in case of file damage. But bzip2 does
+not store the size of each compressed block, as lzip does.
+
+ Lzip provides better data recovery capabilities than any other
+gzip-like compressor because its format has been designed from the
+beginning to be simple and safe. It also helps that the LZMA data
+stream as used by lzip is extraordinarily safe. It provides embedded
+error detection. Any distance larger than the dictionary size acts as a
+forbidden symbol, allowing the decompressor to detect the approximate
+position of errors, and leaving very little work for the check sequence
+(CRC and data sizes) in the detection of errors. Lzip is usually able
+to detect all posible bit-flips in the compressed data without
+resorting to the check sequence. It would be very difficult to write an
+automatic recovery tool like lziprecover for the gzip format. And, as
+far as I know, it has never been written.
+
+ Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the
+decompressed data because it provides more accurate error detection than
+CRC64 up to a compressed size of about 16 GiB, a size larger than that
+of most files. In the case of lzip, the additional detection capability
+of the decompressor reduces the probability of undetected errors more
+than a million times, making CRC32 more accurate than CRC64 up to about
+20 PiB of compressed size.
+
+ The lzip format is designed for long-term archiving. Therefore it
+excludes any unneeded features that may interfere with the future
+extraction of the uncompressed data.
+
+
+3.1.1 Gzip format (mis)features not present in lzip
+---------------------------------------------------
+
+'Multiple algorithms'
+ Gzip provides a CM (Compression Method) field that has never been
+ used because it is a bad idea to begin with. New compression
+ methods may require additional fields, making it impossible to
+ implement new methods and, at the same time, keep the same format.
+ This field does not solve the problem of format proliferation; it
+ just makes the problem less obvious.
+
+'Optional fields in header'
+ Unless special precautions are taken, optional fields are
+ generally a bad idea because they produce a header of variable
+ size. The gzip header has 2 fields that, in addition to being
+ optional, are zero-terminated. This means that if any byte inside
+ the field gets zeroed, or if the terminating zero gets altered,
+ gzip won't be able to find neither the header CRC nor the
+ compressed blocks.
+
+'Optional CRC for the header'
+ Using an optional checksum for the header is not only a bad idea,
+ it is an error; it may prevent the extraction of perfectly good
+ data. For example, if the checksum is used and the bit enabling it
+ is reset by a bit-flip, the header will appear to be intact (in
+ spite of being corrupt) while the compressed blocks will appear to
+ be totally unrecoverable (in spite of being intact). Very
+ misleading indeed.
+
+
+3.1.2 Lzip format improvements over gzip and bzip2
+--------------------------------------------------
+
+'64-bit size field'
+ Probably the most frequently reported shortcoming of the gzip
+ format is that it only stores the least significant 32 bits of the
+ uncompressed size. The size of any file larger than 4 GiB gets
+ truncated.
+
+ Bzip2 does not store the uncompressed size of the file.
+
+ The lzip format provides a 64-bit field for the uncompressed size.
+ Additionaly, lzip produces multimember output automatically when
+ the size is too large for a single member, allowing for an
+ unlimited uncompressed size.
+
+'Distributed index'
+ The lzip format provides a distributed index that, among other
+ things, helps plzip to decompress several times faster than pigz
+ and helps lziprecover do its job. Neither the gzip format nor the
+ bzip2 format do provide an index.
+
+ A distributed index is safer and more scalable than a monolithic
+ index. The monolithic index introduces a single point of failure
+ in the compressed file and may limit the number of members or the
+ total uncompressed size.
+
+
+3.2 Quality of implementation
+=============================
+
+'Accurate and robust error detection'
+ The lzip format provides 3 factor integrity checking and the
+ decompressors report mismatches in each factor separately. This
+ way if just one byte in one factor fails but the other two factors
+ match the data, it probably means that the data are intact and the
+ corruption just affects the mismatching factor (CRC or data size)
+ in the check sequence.
+
+'Multiple implementations'
+ Just like the lzip format provides 3 factor protection against
+ undetected data corruption, the development methodology of the lzip
+ family of compressors provides 3 factor protection against
+ undetected programming errors.
+
+ Three related but independent compressor implementations, lzip,
+ clzip and minilzip/lzlib, are developed concurrently. Every stable
+ release of any of them is subjected to a hundred hours of
+ intensive testing to verify that it produces identical output to
+ the other two. This guarantees that all three implement the same
+ algorithm, and makes it unlikely that any of them may contain
+ serious undiscovered errors. In fact, no errors have been
+ discovered in lzip since 2009.
+
+ Additionally, the three implementations have been extensively
+ tested with unzcrash, valgrind and 'american fuzzy lop' without
+ finding a single vulnerability or false negative. *Note Unzcrash:
+ (lziprecover)Unzcrash.
+
+'Dictionary size'
+ Lzip automatically uses the smallest possible dictionary size for
+ each file. In addition to reducing the amount of memory required
+ for decompression, this feature also minimizes the probability of
+ being affected by RAM errors during compression.
+
+'Exit status'
+ Returning a warning status of 2 is a design flaw of compress that
+ leaked into the design of gzip. Both bzip2 and lzip are free from
+ this flaw.
+
+
+
+File: clzip.info, Node: File format, Next: Algorithm, Prev: Quality assurance, Up: Top
-3 File format
+4 File format
*************
Perfection is reached, not when there is no longer anything to add, but
@@ -371,8 +573,8 @@ additional information before, between, or after them.
'LZMA stream'
The LZMA stream, finished by an end of stream marker. Uses default
- values for encoder properties. *Note Stream format: (lzip)Stream
- format, for a complete description.
+ values for encoder properties. *Note Stream format::, for a
+ complete description.
'CRC32 (4 bytes)'
CRC of the uncompressed original data.
@@ -388,9 +590,9 @@ additional information before, between, or after them.

-File: clzip.info, Node: Algorithm, Next: Trailing data, Prev: File format, Up: Top
+File: clzip.info, Node: Algorithm, Next: Stream format, Prev: File format, Up: Top
-4 Algorithm
+5 Algorithm
***********
In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a
@@ -454,21 +656,264 @@ range encoding), Igor Pavlov (for putting all the above together in
LZMA), and Julian Seward (for bzip2's CLI).

-File: clzip.info, Node: Trailing data, Next: Examples, Prev: Algorithm, Up: Top
+File: clzip.info, Node: Stream format, Next: Trailing data, Prev: Algorithm, Up: Top
+
+6 Format of the LZMA stream in lzip files
+*****************************************
+
+The LZMA algorithm has three parameters, called "special LZMA
+properties", to adjust it for some kinds of binary data. These
+parameters are; 'literal_context_bits' (with a default value of 3),
+'literal_pos_state_bits' (with a default value of 0), and
+'pos_state_bits' (with a default value of 2). As a general purpose
+compressor, lzip only uses the default values for these parameters. In
+particular 'literal_pos_state_bits' has been optimized away and does
+not even appear in the code.
+
+ Lzip also finishes the LZMA stream with an "End Of Stream" marker
+(the distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the
+"member size" field in the member trailer allows the verification of
+stream integrity. The LZMA stream in lzip files always has these two
+features (default properties and EOS marker) and is referred to in this
+document as LZMA-302eos or LZMA-lzip.
+
+ The second stage of LZMA is a range encoder that uses a different
+probability model for each type of symbol; distances, lengths, literal
+bytes, etc. Range encoding conceptually encodes all the symbols of the
+message into one number. Unlike Huffman coding, which assigns to each
+symbol a bit-pattern and concatenates all the bit-patterns together,
+range encoding can compress one symbol to less than one bit. Therefore
+the compressed data produced by a range encoder can't be split in pieces
+that could be individually described.
+
+ It seems that the only way of describing the LZMA-302eos stream is
+describing the algorithm that decodes it. And given the many details
+about the range decoder that need to be described accurately, the source
+code of a real decoder seems the only appropriate reference to use.
+
+ What follows is a description of the decoding algorithm for
+LZMA-302eos streams using as reference the source code of "lzd", an
+educational decompressor for lzip files which can be downloaded from
+the lzip download directory. The source code of lzd is included in
+appendix A. *Note Reference source code::.
+
+
+6.1 What is coded
+=================
+
+The LZMA stream includes literals, matches and repeated matches (matches
+reusing a recently used distance). There are 7 different coding
+sequences:
+
+Bit sequence Name Description
+---------------------------------------------------------------------------
+0 + byte literal literal byte
+1 + 0 + len + dis match distance-length pair
+1 + 1 + 0 + 0 shortrep 1 byte match at latest used distance
+1 + 1 + 0 + 1 + len rep0 len bytes match at latest used
+ distance
+1 + 1 + 1 + 0 + len rep1 len bytes match at second latest
+ used distance
+1 + 1 + 1 + 1 + 0 + len rep2 len bytes match at third latest used
+ distance
+1 + 1 + 1 + 1 + 1 + len rep3 len bytes match at fourth latest
+ used distance
+
+
+ In the following tables, multibit sequences are coded in normal
+order, from MSB to LSB, except where noted otherwise.
+
+ Lengths (the 'len' in the table above) are coded as follows:
+
+Bit sequence Description
+--------------------------------------------------------------------------
+0 + 3 bits lengths from 2 to 9
+1 + 0 + 3 bits lengths from 10 to 17
+1 + 1 + 8 bits lengths from 18 to 273
+
+
+ The coding of distances is a little more complicated, so I'll begin
+explaining a simpler version of the encoding.
+
+ Imagine you need to code a number from 0 to 2^32 - 1, and you want
+to do it in a way that produces shorter codes for the smaller numbers.
+You may first send the position of the most significant bit that is set
+to 1, which you may find by making a bit scan from the left (from the
+MSB). A position of 0 means that the number is 0 (no bit is set), 1
+means the LSB is the first bit set (the number is 1), and 32 means the
+MSB is set (i.e., the number is >= 0x80000000). Let's call this bit
+position a "slot". Then, if slot is > 1, you send the remaining
+slot - 1 bits. Let's call these bits "direct_bits" because they are
+coded directly by value instead of indirectly by position.
+
+ The inconvenient of this simple method is that it needs 6 bits to
+code the slot, but it just uses 33 of the 64 possible values, wasting
+almost half of the codes.
+
+ The intelligent trick of LZMA is that it encodes the position of the
+most significant bit set, along with the value of the next bit, in the
+same 6 bits that would take to encode the position alone. This seems to
+need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is
+no next bit, so the number of needed slots is 64 (0 to 63).
+
+ The 6 bits representing this "slot number" are then context-coded. If
+the distance is >= 4, the remaining bits are coded as follows.
+'direct_bits' is the amount of remaining bits (from 0 to 30) needed to
+form a complete distance, and is calculated as (slot >> 1) - 1. If a
+distance needs 6 or more direct_bits, the last 4 bits are coded
+separately. The last piece (all the direct_bits for distances 4 to 127
+or the last 4 bits for distances >= 128) is context-coded in reverse
+order (from LSB to MSB). For distances >= 128, the 'direct_bits - 4'
+part is coded with fixed 0.5 probability.
+
+Bit sequence Description
+--------------------------------------------------------------------------
+slot distances from 0 to 3
+slot + direct_bits distances from 4 to 127
+slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1
+
+
+6.2 The coding contexts
+=======================
+
+These contexts ('Bit_model' in the source), are integers or arrays of
+integers representing the probability of the corresponding bit being 0.
+
+ The indices used in these arrays are:
+
+'state'
+ A state machine ('State' in the source) with 12 states (0 to 11),
+ coding the latest 2 to 4 types of sequences processed. The initial
+ state is 0.
+
+'pos_state'
+ Value of the 2 least significant bits of the current position in
+ the decoded data.
+
+'literal_state'
+ Value of the 3 most significant bits of the latest byte decoded.
+
+'len_state'
+ Coded value of length (length - 2), with a maximum of 3. The
+ resulting value is in the range 0 to 3.
+
+
+ In the following table, '!literal' is any sequence except a literal
+byte. 'rep' is any one of 'rep0', 'rep1', 'rep2' or 'rep3'. The types
+of previous sequences corresponding to each state are:
+
+State Types of previous sequences
+--------------------------------------------------------
+0 literal, literal, literal
+1 match, literal, literal
+2 rep or (!literal, shortrep), literal, literal
+3 literal, shortrep, literal, literal
+4 match, literal
+5 rep or (!literal, shortrep), literal
+6 literal, shortrep, literal
+7 literal, match
+8 literal, rep
+9 literal, shortrep
+10 !literal, match
+11 !literal, (rep or shortrep)
+
+
+ The contexts for decoding the type of coding sequence are:
+
+Name Indices Used when
+---------------------------------------------------------------------------
+bm_match state, pos_state sequence start
+bm_rep state after sequence 1
+bm_rep0 state after sequence 11
+bm_rep1 state after sequence 111
+bm_rep2 state after sequence 1111
+bm_len state, pos_state after sequence 110
+
+
+ The contexts for decoding distances are:
+
+Name Indices Used when
+---------------------------------------------------------------------------
+bm_dis_slot len_state, bit tree distance start
+bm_dis reverse bit tree after slots 4 to 13
+bm_align reverse bit tree for distances >= 128, after
+ fixed probability bits
+
+
+ There are two separate sets of contexts for lengths ('Len_model' in
+the source). One for normal matches, the other for repeated matches. The
+contexts in each Len_model are (see 'decode_len' in the source):
+
+Name Indices Used when
+---------------------------------------------------------------------------
+choice1 none length start
+choice2 none after sequence 1
+bm_low pos_state, bit tree after sequence 0
+bm_mid pos_state, bit tree after sequence 10
+bm_high bit tree after sequence 11
+
+
+ The context array 'bm_literal' is special. In principle it acts as a
+normal bit tree context, the one selected by 'literal_state'. But if
+the previous decoded byte was not a literal, two other bit tree
+contexts are used depending on the value of each bit in 'match_byte'
+(the byte at the latest used distance), until a bit is decoded that is
+different from its corresponding bit in 'match_byte'. After the first
+difference is found, the rest of the byte is decoded using the normal
+bit tree context. (See 'decode_matched' in the source).
+
+
+6.3 The range decoder
+=====================
+
+The LZMA stream is consumed one byte at a time by the range decoder.
+(See 'normalize' in the source). Every byte consumed produces a
+variable number of decoded bits, depending on how well these bits agree
+with their context. (See 'decode_bit' in the source).
+
+ The range decoder state consists of two unsigned 32-bit variables;
+'range' (representing the most significant part of the range size not
+yet decoded), and 'code' (representing the current point within
+'range'). 'range' is initialized to (2^32 - 1), and 'code' is
+initialized to 0.
+
+ The range encoder produces a first 0 byte that must be ignored by the
+range decoder. This is done by shifting 5 bytes in the initialization of
+'code' instead of 4. (See the 'Range_decoder' constructor in the
+source).
+
+
+6.4 Decoding the LZMA stream
+============================
+
+After decoding the member header and obtaining the dictionary size, the
+range decoder is initialized and then the LZMA decoder enters a loop
+(See 'decode_member' in the source) where it invokes the range decoder
+with the appropriate contexts to decode the different coding sequences
+(matches, repeated matches, and literal bytes), until the "End Of
+Stream" marker is decoded.
+
+
+File: clzip.info, Node: Trailing data, Next: Examples, Prev: Stream format, Up: Top
-5 Extra data appended to the file
+7 Extra data appended to the file
*********************************
-Sometimes extra data is found appended to a lzip file after the last
+Sometimes extra data are found appended to a lzip file after the last
member. Such trailing data may be:
* Padding added to make the file size a multiple of some block size,
- for example when writing to a tape.
-
- * Garbage added by some not totally successful copy operation.
+ for example when writing to a tape. It is safe to append any
+ amount of padding zero bytes to a lzip file.
* Useful data added by the user; a cryptographically secure hash, a
- description of file contents, etc.
+ description of file contents, etc. It is safe to append any amount
+ of text to a lzip file as long as the text does not begin with the
+ string "LZIP", and does not contain any zero bytes (null
+ characters). Nonzero bytes and zero bytes can't be safely mixed in
+ trailing data.
+
+ * Garbage added by some not totally successful copy operation.
* Malicious data added to the file in order to make its total size
and hash value (for a chosen hash) coincide with those of another
@@ -481,15 +926,19 @@ member. Such trailing data may be:
the corruption of the integrity information itself. Therefore it
can be considered to be below the noise level.
+ Trailing data are in no way part of the lzip file format, but tools
+reading lzip files are expected to behave as correctly and usefully as
+possible in the presence of trailing data.
+
Trailing data can be safely ignored in most cases. In some cases,
-like that of user-added data, it is expected to be ignored. In those
+like that of user-added data, they are expected to be ignored. In those
cases where a file containing trailing data must be rejected, the option
'--trailing-error' can be used. *Note --trailing-error::.

File: clzip.info, Node: Examples, Next: Problems, Prev: Trailing data, Up: Top
-6 A small tutorial with examples
+8 A small tutorial with examples
********************************
WARNING! Even if clzip is bug-free, other causes may result in a corrupt
@@ -530,8 +979,8 @@ Example 5: Compress a whole device in /dev/sdc and send the output to
clzip -c /dev/sdc > file.lz
-Example 6: The right way of concatenating compressed files. *Note
-Trailing data::.
+Example 6: The right way of concatenating the decompressed output of two
+or more compressed files. *Note Trailing data::.
Don't do this
cat file1.lz file2.lz file3.lz | clzip -d
@@ -569,9 +1018,9 @@ file with a member size of 32 MiB.
clzip -b 32MiB -S 650MB big_db

-File: clzip.info, Node: Problems, Next: Concept index, Prev: Examples, Up: Top
+File: clzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top
-7 Reporting bugs
+9 Reporting bugs
****************
There are probably bugs in clzip. There are certainly errors and
@@ -584,7 +1033,477 @@ for all eternity, if not longer.
by running 'clzip --version'.

-File: clzip.info, Node: Concept index, Prev: Problems, Up: Top
+File: clzip.info, Node: Reference source code, Next: Concept index, Prev: Problems, Up: Top
+
+Appendix A Reference source code
+********************************
+
+/* Lzd - Educational decompressor for the lzip format
+ Copyright (C) 2013-2017 Antonio Diaz Diaz.
+
+ This program is free software. Redistribution and use in source and
+ binary forms, with or without modification, are permitted provided
+ that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+/*
+ Exit status: 0 for a normal exit, 1 for environmental problems
+ (file not found, invalid flags, I/O errors, etc), 2 to indicate a
+ corrupt or invalid input file.
+*/
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+
+class State
+ {
+ int st;
+
+public:
+ enum { states = 12 };
+ State() : st( 0 ) {}
+ int operator()() const { return st; }
+ bool is_char() const { return st < 7; }
+
+ void set_char()
+ {
+ static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+ st = next[st];
+ }
+ void set_match() { st = ( st < 7 ) ? 7 : 10; }
+ void set_rep() { st = ( st < 7 ) ? 8 : 11; }
+ void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
+ };
+
+
+enum {
+ min_dictionary_size = 1 << 12,
+ max_dictionary_size = 1 << 29,
+ literal_context_bits = 3,
+ literal_pos_state_bits = 0, // not used
+ pos_state_bits = 2,
+ pos_states = 1 << pos_state_bits,
+ pos_state_mask = pos_states - 1,
+
+ len_states = 4,
+ dis_slot_bits = 6,
+ start_dis_model = 4,
+ end_dis_model = 14,
+ modeled_distances = 1 << (end_dis_model / 2), // 128
+ dis_align_bits = 4,
+ dis_align_size = 1 << dis_align_bits,
+
+ len_low_bits = 3,
+ len_mid_bits = 3,
+ len_high_bits = 8,
+ len_low_symbols = 1 << len_low_bits,
+ len_mid_symbols = 1 << len_mid_bits,
+ len_high_symbols = 1 << len_high_bits,
+ max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+
+ min_match_len = 2, // must be 2
+
+ bit_model_move_bits = 5,
+ bit_model_total_bits = 11,
+ bit_model_total = 1 << bit_model_total_bits };
+
+struct Bit_model
+ {
+ int probability;
+ Bit_model() : probability( bit_model_total / 2 ) {}
+ };
+
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
+
+class CRC32
+ {
+ uint32_t data[256]; // Table of CRCs of all 8-bit messages.
+
+public:
+ CRC32()
+ {
+ for( unsigned n = 0; n < 256; ++n )
+ {
+ unsigned c = n;
+ for( int k = 0; k < 8; ++k )
+ { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+ data[n] = c;
+ }
+ }
+
+ void update_buf( uint32_t & crc, const uint8_t * const buffer,
+ const int size ) const
+ {
+ for( int i = 0; i < size; ++i )
+ crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+ }
+ };
+
+const CRC32 crc32;
+
+
+typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size
+
+typedef uint8_t File_trailer[20];
+ // 0-3 CRC32 of the uncompressed data
+ // 4-11 size of the uncompressed data
+ // 12-19 member size including header and trailer
+
+class Range_decoder
+ {
+ uint32_t code;
+ uint32_t range;
+
+public:
+ Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+ {
+ for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte();
+ }
+
+ uint8_t get_byte() { return std::getc( stdin ); }
+
+ unsigned decode( const int num_bits )
+ {
+ unsigned symbol = 0;
+ for( int i = num_bits; i > 0; --i )
+ {
+ range >>= 1;
+ symbol <<= 1;
+ if( code >= range ) { code -= range; symbol |= 1; }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | get_byte(); }
+ }
+ return symbol;
+ }
+
+ unsigned decode_bit( Bit_model & bm )
+ {
+ unsigned symbol;
+ const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
+ if( code < bound )
+ {
+ range = bound;
+ bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+ symbol = 0;
+ }
+ else
+ {
+ range -= bound;
+ code -= bound;
+ bm.probability -= bm.probability >> bit_model_move_bits;
+ symbol = 1;
+ }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | get_byte(); }
+ return symbol;
+ }
+
+ unsigned decode_tree( Bit_model bm[], const int num_bits )
+ {
+ unsigned symbol = 1;
+ for( int i = 0; i < num_bits; ++i )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
+ }
+
+ unsigned decode_tree_reversed( Bit_model bm[], const int num_bits )
+ {
+ unsigned symbol = decode_tree( bm, num_bits );
+ unsigned reversed_symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
+ symbol >>= 1;
+ }
+ return reversed_symbol;
+ }
+
+ unsigned decode_matched( Bit_model bm[], const unsigned match_byte )
+ {
+ unsigned symbol = 1;
+ for( int i = 7; i >= 0; --i )
+ {
+ const unsigned match_bit = ( match_byte >> i ) & 1;
+ const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] );
+ symbol = ( symbol << 1 ) | bit;
+ if( match_bit != bit )
+ {
+ while( symbol < 0x100 )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ break;
+ }
+ }
+ return symbol & 0xFF;
+ }
+
+ unsigned decode_len( Len_model & lm, const int pos_state )
+ {
+ if( decode_bit( lm.choice1 ) == 0 )
+ return decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
+ return len_low_symbols +
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
+ return len_low_symbols + len_mid_symbols +
+ decode_tree( lm.bm_high, len_high_bits );
+ }
+ };
+
+
+class LZ_decoder
+ {
+ unsigned long long partial_data_pos;
+ Range_decoder rdec;
+ const unsigned dictionary_size;
+ uint8_t * const buffer; // output buffer
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to stdout
+ uint32_t crc_;
+ bool pos_wrapped;
+
+ void flush_data();
+
+ uint8_t peek( const unsigned distance ) const
+ {
+ if( pos > distance ) return buffer[pos - distance - 1];
+ if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1];
+ return 0; // prev_byte of first byte
+ }
+
+ void put_byte( const uint8_t b )
+ {
+ buffer[pos] = b;
+ if( ++pos >= dictionary_size ) flush_data();
+ }
+
+public:
+ explicit LZ_decoder( const unsigned dict_size )
+ :
+ partial_data_pos( 0 ),
+ dictionary_size( dict_size ),
+ buffer( new uint8_t[dictionary_size] ),
+ pos( 0 ),
+ stream_pos( 0 ),
+ crc_( 0xFFFFFFFFU ),
+ pos_wrapped( false )
+ {}
+
+ ~LZ_decoder() { delete[] buffer; }
+
+ unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
+ unsigned long long data_position() const { return partial_data_pos + pos; }
+
+ bool decode_member();
+ };
+
+
+void LZ_decoder::flush_data()
+ {
+ if( pos > stream_pos )
+ {
+ const unsigned size = pos - stream_pos;
+ crc32.update_buf( crc_, buffer + stream_pos, size );
+ errno = 0;
+ if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
+ { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
+ std::exit( 1 ); }
+ if( pos >= dictionary_size )
+ { partial_data_pos += pos; pos = 0; pos_wrapped = true; }
+ stream_pos = pos;
+ }
+ }
+
+
+bool LZ_decoder::decode_member() // Returns false if error
+ {
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[State::states][pos_states];
+ Bit_model bm_rep[State::states];
+ Bit_model bm_rep0[State::states];
+ Bit_model bm_rep1[State::states];
+ Bit_model bm_rep2[State::states];
+ Bit_model bm_len[State::states][pos_states];
+ Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model+1];
+ Bit_model bm_align[dis_align_size];
+ Len_model match_len_model;
+ Len_model rep_len_model;
+ unsigned rep0 = 0; // rep[0-3] latest four distances
+ unsigned rep1 = 0; // used for efficient coding of
+ unsigned rep2 = 0; // repeated distances
+ unsigned rep3 = 0;
+ State state;
+
+ while( !std::feof( stdin ) && !std::ferror( stdin ) )
+ {
+ const int pos_state = data_position() & pos_state_mask;
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
+ {
+ const uint8_t prev_byte = peek( 0 );
+ const int literal_state = prev_byte >> ( 8 - literal_context_bits );
+ Bit_model * const bm = bm_literal[literal_state];
+ if( state.is_char() )
+ put_byte( rdec.decode_tree( bm, 8 ) );
+ else
+ put_byte( rdec.decode_matched( bm, peek( rep0 ) ) );
+ state.set_char();
+ }
+ else // match or repeated match
+ {
+ int len;
+ if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit
+ {
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
+ { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; }
+ }
+ else
+ {
+ unsigned distance;
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
+ distance = rep1;
+ else
+ {
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
+ distance = rep2;
+ else
+ { distance = rep3; rep3 = rep2; }
+ rep2 = rep1;
+ }
+ rep1 = rep0;
+ rep0 = distance;
+ }
+ state.set_rep();
+ len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
+ }
+ else // match
+ {
+ rep3 = rep2; rep2 = rep1; rep1 = rep0;
+ len = min_match_len + rdec.decode_len( match_len_model, pos_state );
+ const int len_state = std::min( len - min_match_len, len_states - 1 );
+ rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits );
+ if( rep0 >= start_dis_model )
+ {
+ const unsigned dis_slot = rep0;
+ const int direct_bits = ( dis_slot >> 1 ) - 1;
+ rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ if( dis_slot < end_dis_model )
+ rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ),
+ direct_bits );
+ else
+ {
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
+ if( rep0 == 0xFFFFFFFFU ) // marker found
+ {
+ flush_data();
+ return ( len == min_match_len ); // End Of Stream marker
+ }
+ }
+ }
+ state.set_match();
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) )
+ { flush_data(); return false; }
+ }
+ for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) );
+ }
+ }
+ flush_data();
+ return false;
+ }
+
+
+int main( const int argc, const char * const argv[] )
+ {
+ if( argc > 1 )
+ {
+ std::printf( "Lzd %s - Educational decompressor for the lzip format.\n",
+ PROGVERSION );
+ std::printf( "Study the source to learn how a lzip decompressor works.\n"
+ "See the lzip manual for an explanation of the code.\n"
+ "It is not safe to use lzd for any real work.\n"
+ "\nUsage: %s < file.lz > file\n", argv[0] );
+ std::printf( "Lzd decompresses from standard input to standard output.\n"
+ "\nCopyright (C) 2017 Antonio Diaz Diaz.\n"
+ "This is free software: you are free to change and redistribute it.\n"
+ "There is NO WARRANTY, to the extent permitted by law.\n"
+ "Report bugs to lzip-bug@nongnu.org\n"
+ "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" );
+ return 0;
+ }
+
+#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
+ setmode( fileno( stdin ), O_BINARY );
+ setmode( fileno( stdout ), O_BINARY );
+#endif
+
+ for( bool first_member = true; ; first_member = false )
+ {
+ File_header header; // verify header
+ for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin );
+ if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 )
+ {
+ if( first_member )
+ { std::fputs( "Bad magic number (file not in lzip format).\n", stderr );
+ return 2; }
+ break;
+ }
+ unsigned dict_size = 1 << ( header[5] & 0x1F );
+ dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
+ if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
+ { std::fputs( "Invalid dictionary size in member header.\n", stderr );
+ return 2; }
+
+ LZ_decoder decoder( dict_size ); // decode LZMA stream
+ if( !decoder.decode_member() )
+ { std::fputs( "Data error\n", stderr ); return 2; }
+
+ File_trailer trailer; // verify trailer
+ for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+ unsigned crc = 0;
+ for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+ unsigned long long data_size = 0;
+ for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
+ if( crc != decoder.crc() || data_size != decoder.data_position() )
+ { std::fputs( "CRC error\n", stderr ); return 2; }
+ }
+
+ if( std::fclose( stdout ) != 0 )
+ { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
+ return 1; }
+ return 0;
+ }
+
+
+File: clzip.info, Node: Concept index, Prev: Reference source code, Up: Top
Concept index
*************
@@ -596,10 +1515,13 @@ Concept index
* bugs: Problems. (line 6)
* examples: Examples. (line 6)
* file format: File format. (line 6)
+* format of the LZMA stream: Stream format. (line 6)
* getting help: Problems. (line 6)
* introduction: Introduction. (line 6)
* invoking: Invoking clzip. (line 6)
* options: Invoking clzip. (line 6)
+* quality assurance: Quality assurance. (line 6)
+* reference source code: Reference source code. (line 6)
* trailing data: Trailing data. (line 6)
* usage: Invoking clzip. (line 6)
* version: Invoking clzip. (line 6)
@@ -608,16 +1530,19 @@ Concept index

Tag Table:
Node: Top210
-Node: Introduction952
-Node: Invoking clzip6164
-Ref: --trailing-error6730
-Node: File format12728
-Node: Algorithm15150
-Node: Trailing data17980
-Node: Examples19355
-Ref: concat-example20537
-Node: Problems21544
-Node: Concept index22070
+Node: Introduction1154
+Node: Invoking clzip6630
+Ref: --trailing-error7202
+Node: Quality assurance14125
+Node: File format22281
+Node: Algorithm24686
+Node: Stream format27516
+Node: Trailing data38257
+Node: Examples40159
+Ref: concat-example41341
+Node: Problems42386
+Node: Reference source code42920
+Node: Concept index57238

End Tag Table
diff --git a/doc/clzip.texi b/doc/clzip.texi
index 331d4eb..2cac199 100644
--- a/doc/clzip.texi
+++ b/doc/clzip.texi
@@ -6,8 +6,8 @@
@finalout
@c %**end of header
-@set UPDATED 13 May 2016
-@set VERSION 1.8
+@set UPDATED 13 April 2017
+@set VERSION 1.9
@dircategory Data Compression
@direntry
@@ -37,16 +37,19 @@ This manual is for Clzip (version @value{VERSION}, @value{UPDATED}).
@menu
* Introduction:: Purpose and features of clzip
* Invoking clzip:: Command line interface
+* Quality assurance:: Design, development and testing of lzip
* File format:: Detailed format of the compressed file
* Algorithm:: How clzip compresses the data
+* Stream format:: Format of the LZMA stream in lzip files
* Trailing data:: Extra data appended to the file
* Examples:: A small tutorial with examples
* Problems:: Reporting bugs
+* Reference source code:: Source code illustrating stream format
* Concept index:: Index of concepts
@end menu
@sp 1
-Copyright @copyright{} 2010-2016 Antonio Diaz Diaz.
+Copyright @copyright{} 2010-2017 Antonio Diaz Diaz.
This manual is free documentation: you have unlimited permission
to copy, distribute and modify it.
@@ -56,15 +59,16 @@ to copy, distribute and modify it.
@chapter Introduction
@cindex introduction
-Clzip is a lossless data compressor with a user interface similar to the
-one of gzip or bzip2. Clzip is about as fast as gzip, compresses most
-files more than bzip2, and is better than both from a data recovery
-perspective.
+Clzip is a C language version of lzip, fully compatible with lzip-1.4 or
+newer. As clzip is written in C, it may be easier to integrate in
+applications like package managers, embedded devices, or systems lacking
+a C++ compiler.
-Clzip uses the lzip file format; the files produced by clzip are fully
-compatible with lzip-1.4 or newer, and can be rescued with lziprecover.
-Clzip is in fact a C language version of lzip, intended for embedded
-devices or systems lacking a C++ compiler.
+Lzip is a lossless data compressor with a user interface similar to the
+one of gzip or bzip2. Lzip can compress about as fast as gzip
+@w{(lzip -0)}, or compress most files more than bzip2 @w{(lzip -9)}.
+Decompression speed is intermediate between gzip and bzip2. Lzip is
+better than gzip and bzip2 from a data recovery perspective.
The lzip file format is designed for data sharing and long-term
archiving, taking into account both data integrity and decoder
@@ -84,10 +88,10 @@ including error-checked merging of damaged copies of a file.
@item
The lzip format is as simple as possible (but not simpler). The lzip
-manual provides the code of a simple decompressor along with a detailed
-explanation of how it works, so that with the only help of the lzip
-manual it would be possible for a digital archaeologist to extract the
-data from a lzip file long after quantum computers eventually render
+manual provides the source code of a simple decompressor along with a
+detailed explanation of how it works, so that with the only help of the
+lzip manual it would be possible for a digital archaeologist to extract
+the data from a lzip file long after quantum computers eventually render
LZMA obsolete.
@item
@@ -158,16 +162,20 @@ or more compressed files. The result is the concatenation of the
corresponding uncompressed files. Integrity testing of concatenated
compressed files is also supported.
-Clzip can produce multimember files and safely recover, with
-lziprecover, the undamaged members in case of file damage. Clzip can
-also split the compressed output in volumes of a given size, even when
-reading from standard input. This allows the direct creation of
-multivolume compressed tar archives.
+Clzip can produce multimember files, and lziprecover can safely recover
+the undamaged members in case of file damage. Clzip can also split the
+compressed output in volumes of a given size, even when reading from
+standard input. This allows the direct creation of multivolume
+compressed tar archives.
Clzip is able to compress and decompress streams of unlimited size by
automatically creating multimember output. The members so created are
large, about 2 PiB each.
+LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never
+have been compressed. Decompressed is used to refer to data which have
+undergone the process of decompression.
+
@node Invoking clzip
@chapter Invoking clzip
@@ -239,6 +247,20 @@ Force re-compression of files whose name already has the @samp{.lz} or
@itemx --keep
Keep (don't delete) input files during compression or decompression.
+@item -l
+@itemx --list
+Print the uncompressed size, compressed size and percentage saved of the
+specified file(s). Trailing data are ignored. The values produced are
+correct even for multimember files. If more than one file is given, a
+final line containing the cumulative sizes is printed. With @samp{-v},
+the dictionary size, the number of members in the file, and the amount
+of trailing data (if any) are also printed. With @samp{-vv}, the
+positions and sizes of each member in multimember files are also
+printed. @samp{-lq} can be used to verify quickly (without
+decompressing) the structural integrity of the specified files. (Use
+@samp{--test} to verify the data integrity). @samp{-alq} additionally
+verifies that none of the specified files contain trailing data.
+
@item -m @var{bytes}
@itemx --match-length=@var{bytes}
Set the match length limit in bytes. After a match this long is found,
@@ -286,7 +308,8 @@ EiB.
Check integrity of the specified file(s), but don't decompress them.
This really performs a trial decompression and throws away the result.
Use it together with @samp{-v} to see information about the file(s). If
-a file fails the test, clzip continues checking the rest of the files.
+a file fails the test, does not exist, can't be opened, or is a
+terminal, clzip continues checking the rest of the files.
@item -v
@itemx --verbose
@@ -296,7 +319,8 @@ second @samp{-v} shows the progress of compression.@*
When decompressing or testing, further -v's (up to 4) increase the
verbosity level, showing status, compression ratio, dictionary size,
trailer contents (CRC, data size, member size), and up to 6 bytes of
-trailing data (if any).
+trailing data (if any) both in hexadecimal and as a string of printable
+ASCII characters.
@item -0 .. -9
Set the compression parameters (dictionary size and match length limit)
@@ -353,6 +377,190 @@ invalid input file, 3 for an internal consistency error (eg, bug) which
caused clzip to panic.
+@node Quality assurance
+@chapter Design, development and testing of lzip
+@cindex quality assurance
+
+There are two ways of constructing a software design: One way is to make
+it so simple that there are obviously no deficiencies and the other way
+is to make it so complicated that there are no obvious deficiencies. The
+first method is far more difficult.@*
+--- C.A.R. Hoare
+
+Lzip has been designed, written and tested with great care to be the
+standard general-purpose compressor for unix-like systems. This chapter
+describes the lessons learned from previous compressors (gzip and
+bzip2), and their application to the design of lzip.
+
+@sp 1
+@section Format design
+
+When gzip was designed in 1992, computers and operating systems were
+much less capable than they are today. Gzip tried to work around some of
+those limitations, like 8.3 file names, with additional fields in its
+file format.
+
+Today those limitations have mostly disappeared, and the format of gzip
+has proved to be unnecessarily complicated. It includes fields that were
+never used, others that have lost their usefulness, and finally others
+that have become too limited.
+
+Bzip2 was designed 5 years later, and its format is simpler than the one
+of gzip.
+
+Probably the worst defect of the gzip format from the point of view of
+data safety is the variable size of its header. If the byte at offset 3
+(flags) of a gzip member gets corrupted, it may become very difficult to
+recover the data, even if the compressed blocks are intact, because it
+can't be known with certainty where the compressed blocks begin.
+
+By contrast, the header of a lzip member has a fixed length of 6. The
+LZMA stream in a lzip member always starts at offset 6, making it
+trivial to recover the data even if the whole header becomes corrupt.
+
+Bzip2 also provides a header of fixed length and marks the begin and end
+of each compressed block with six magic bytes, making it possible to
+find the compressed blocks even in case of file damage. But bzip2 does
+not store the size of each compressed block, as lzip does.
+
+Lzip provides better data recovery capabilities than any other gzip-like
+compressor because its format has been designed from the beginning to be
+simple and safe. It also helps that the LZMA data stream as used by lzip
+is extraordinarily safe. It provides embedded error detection. Any
+distance larger than the dictionary size acts as a forbidden symbol,
+allowing the decompressor to detect the approximate position of errors,
+and leaving very little work for the check sequence (CRC and data sizes)
+in the detection of errors. Lzip is usually able to detect all posible
+bit-flips in the compressed data without resorting to the check
+sequence. It would be very difficult to write an automatic recovery tool
+like lziprecover for the gzip format. And, as far as I know, it has
+never been written.
+
+Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the
+decompressed data because it provides more accurate error detection than
+CRC64 up to a compressed size of about 16 GiB, a size larger than that
+of most files. In the case of lzip, the additional detection capability
+of the decompressor reduces the probability of undetected errors more
+than a million times, making CRC32 more accurate than CRC64 up to about
+20 PiB of compressed size.
+
+The lzip format is designed for long-term archiving. Therefore it
+excludes any unneeded features that may interfere with the future
+extraction of the uncompressed data.
+
+@sp 1
+@subsection Gzip format (mis)features not present in lzip
+
+@table @samp
+@item Multiple algorithms
+
+Gzip provides a CM (Compression Method) field that has never been used
+because it is a bad idea to begin with. New compression methods may
+require additional fields, making it impossible to implement new methods
+and, at the same time, keep the same format. This field does not solve
+the problem of format proliferation; it just makes the problem less
+obvious.
+
+@item Optional fields in header
+
+Unless special precautions are taken, optional fields are generally a
+bad idea because they produce a header of variable size. The gzip header
+has 2 fields that, in addition to being optional, are zero-terminated.
+This means that if any byte inside the field gets zeroed, or if the
+terminating zero gets altered, gzip won't be able to find neither the
+header CRC nor the compressed blocks.
+
+@item Optional CRC for the header
+
+Using an optional checksum for the header is not only a bad idea, it is
+an error; it may prevent the extraction of perfectly good data. For
+example, if the checksum is used and the bit enabling it is reset by a
+bit-flip, the header will appear to be intact (in spite of being
+corrupt) while the compressed blocks will appear to be totally
+unrecoverable (in spite of being intact). Very misleading indeed.
+
+@end table
+
+@subsection Lzip format improvements over gzip and bzip2
+
+@table @samp
+@item 64-bit size field
+
+Probably the most frequently reported shortcoming of the gzip format is
+that it only stores the least significant 32 bits of the uncompressed
+size. The size of any file larger than 4 GiB gets truncated.
+
+Bzip2 does not store the uncompressed size of the file.
+
+The lzip format provides a 64-bit field for the uncompressed size.
+Additionaly, lzip produces multimember output automatically when the
+size is too large for a single member, allowing for an unlimited
+uncompressed size.
+
+@item Distributed index
+
+The lzip format provides a distributed index that, among other things,
+helps plzip to decompress several times faster than pigz and helps
+lziprecover do its job. Neither the gzip format nor the bzip2 format do
+provide an index.
+
+A distributed index is safer and more scalable than a monolithic index.
+The monolithic index introduces a single point of failure in the
+compressed file and may limit the number of members or the total
+uncompressed size.
+
+@end table
+
+@section Quality of implementation
+
+@table @samp
+@item Accurate and robust error detection
+
+The lzip format provides 3 factor integrity checking and the
+decompressors report mismatches in each factor separately. This way if
+just one byte in one factor fails but the other two factors match the
+data, it probably means that the data are intact and the corruption just
+affects the mismatching factor (CRC or data size) in the check sequence.
+
+@item Multiple implementations
+
+Just like the lzip format provides 3 factor protection against
+undetected data corruption, the development methodology of the lzip
+family of compressors provides 3 factor protection against undetected
+programming errors.
+
+Three related but independent compressor implementations, lzip, clzip
+and minilzip/lzlib, are developed concurrently. Every stable release of
+any of them is subjected to a hundred hours of intensive testing to
+verify that it produces identical output to the other two. This
+guarantees that all three implement the same algorithm, and makes it
+unlikely that any of them may contain serious undiscovered errors. In
+fact, no errors have been discovered in lzip since 2009.
+
+Additionally, the three implementations have been extensively tested
+with
+@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash},
+valgrind and @samp{american fuzzy lop} without finding a single
+vulnerability or false negative.
+@ifnothtml
+@xref{Unzcrash,,,lziprecover}.
+@end ifnothtml
+
+@item Dictionary size
+
+Lzip automatically uses the smallest possible dictionary size for each
+file. In addition to reducing the amount of memory required for
+decompression, this feature also minimizes the probability of being
+affected by RAM errors during compression.
+
+@item Exit status
+
+Returning a warning status of 2 is a design flaw of compress that leaked
+into the design of gzip. Both bzip2 and lzip are free from this flaw.
+
+@end table
+
+
@node File format
@chapter File format
@cindex file format
@@ -412,15 +620,8 @@ Valid values for dictionary size range from 4 KiB to 512 MiB.
@item LZMA stream
The LZMA stream, finished by an end of stream marker. Uses default
-values for encoder properties.
-@ifnothtml
-@xref{Stream format,,,lzip},
-@end ifnothtml
-@ifhtml
-See
-@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html#Stream-format,,Stream format}
-@end ifhtml
-for a complete description.
+values for encoder properties. @xref{Stream format}, for a complete
+description.
@item CRC32 (4 bytes)
CRC of the uncompressed original data.
@@ -502,24 +703,274 @@ range encoding), Igor Pavlov (for putting all the above together in
LZMA), and Julian Seward (for bzip2's CLI).
+@node Stream format
+@chapter Format of the LZMA stream in lzip files
+@cindex format of the LZMA stream
+
+The LZMA algorithm has three parameters, called "special LZMA
+properties", to adjust it for some kinds of binary data. These
+parameters are; @samp{literal_context_bits} (with a default value of 3),
+@samp{literal_pos_state_bits} (with a default value of 0), and
+@samp{pos_state_bits} (with a default value of 2). As a general purpose
+compressor, lzip only uses the default values for these parameters. In
+particular @samp{literal_pos_state_bits} has been optimized away and
+does not even appear in the code.
+
+Lzip also finishes the LZMA stream with an "End Of Stream" marker (the
+distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the
+"member size" field in the member trailer allows the verification of
+stream integrity. The LZMA stream in lzip files always has these two
+features (default properties and EOS marker) and is referred to in this
+document as LZMA-302eos or LZMA-lzip.
+
+The second stage of LZMA is a range encoder that uses a different
+probability model for each type of symbol; distances, lengths, literal
+bytes, etc. Range encoding conceptually encodes all the symbols of the
+message into one number. Unlike Huffman coding, which assigns to each
+symbol a bit-pattern and concatenates all the bit-patterns together,
+range encoding can compress one symbol to less than one bit. Therefore
+the compressed data produced by a range encoder can't be split in pieces
+that could be individually described.
+
+It seems that the only way of describing the LZMA-302eos stream is
+describing the algorithm that decodes it. And given the many details
+about the range decoder that need to be described accurately, the source
+code of a real decoder seems the only appropriate reference to use.
+
+What follows is a description of the decoding algorithm for LZMA-302eos
+streams using as reference the source code of "lzd", an educational
+decompressor for lzip files which can be downloaded from the lzip
+download directory. The source code of lzd is included in appendix A.
+@xref{Reference source code}.
+
+@sp 1
+@section What is coded
+
+The LZMA stream includes literals, matches and repeated matches (matches
+reusing a recently used distance). There are 7 different coding sequences:
+
+@multitable @columnfractions .35 .14 .51
+@headitem Bit sequence @tab Name @tab Description
+@item 0 + byte @tab literal @tab literal byte
+@item 1 + 0 + len + dis @tab match @tab distance-length pair
+@item 1 + 1 + 0 + 0 @tab shortrep @tab 1 byte match at latest used
+distance
+@item 1 + 1 + 0 + 1 + len @tab rep0 @tab len bytes match at latest used
+distance
+@item 1 + 1 + 1 + 0 + len @tab rep1 @tab len bytes match at second
+latest used distance
+@item 1 + 1 + 1 + 1 + 0 + len @tab rep2 @tab len bytes match at third
+latest used distance
+@item 1 + 1 + 1 + 1 + 1 + len @tab rep3 @tab len bytes match at fourth
+latest used distance
+@end multitable
+
+@sp 1
+In the following tables, multibit sequences are coded in normal order,
+from MSB to LSB, except where noted otherwise.
+
+Lengths (the @samp{len} in the table above) are coded as follows:
+
+@multitable @columnfractions .5 .5
+@headitem Bit sequence @tab Description
+@item 0 + 3 bits @tab lengths from 2 to 9
+@item 1 + 0 + 3 bits @tab lengths from 10 to 17
+@item 1 + 1 + 8 bits @tab lengths from 18 to 273
+@end multitable
+
+@sp 1
+The coding of distances is a little more complicated, so I'll begin
+explaining a simpler version of the encoding.
+
+Imagine you need to code a number from 0 to @w{2^32 - 1}, and you want
+to do it in a way that produces shorter codes for the smaller numbers.
+You may first send the position of the most significant bit that is set
+to 1, which you may find by making a bit scan from the left (from the
+MSB). A position of 0 means that the number is 0 (no bit is set), 1
+means the LSB is the first bit set (the number is 1), and 32 means the
+MSB is set (i.e., the number is @w{>= 0x80000000}). Let's call this bit
+position a "slot". Then, if slot is @w{> 1}, you send the remaining
+@w{slot - 1} bits. Let's call these bits "direct_bits" because they are
+coded directly by value instead of indirectly by position.
+
+The inconvenient of this simple method is that it needs 6 bits to code
+the slot, but it just uses 33 of the 64 possible values, wasting almost
+half of the codes.
+
+The intelligent trick of LZMA is that it encodes the position of the
+most significant bit set, along with the value of the next bit, in the
+same 6 bits that would take to encode the position alone. This seems to
+need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is
+no next bit, so the number of needed slots is 64 (0 to 63).
+
+The 6 bits representing this "slot number" are then context-coded. If
+the distance is @w{>= 4}, the remaining bits are coded as follows.
+@samp{direct_bits} is the amount of remaining bits (from 0 to 30) needed
+to form a complete distance, and is calculated as @w{(slot >> 1) - 1}.
+If a distance needs 6 or more direct_bits, the last 4 bits are coded
+separately. The last piece (all the direct_bits for distances 4 to 127
+or the last 4 bits for distances @w{>= 128}) is context-coded in reverse
+order (from LSB to MSB). For distances @w{>= 128}, the
+@w{@samp{direct_bits - 4}} part is coded with fixed 0.5 probability.
+
+@multitable @columnfractions .5 .5
+@headitem Bit sequence @tab Description
+@item slot @tab distances from 0 to 3
+@item slot + direct_bits @tab distances from 4 to 127
+@item slot + (direct_bits - 4) + 4 bits @tab distances from 128 to
+2^32 - 1
+@end multitable
+
+@sp 1
+@section The coding contexts
+
+These contexts (@samp{Bit_model} in the source), are integers or arrays
+of integers representing the probability of the corresponding bit being 0.
+
+The indices used in these arrays are:
+
+@table @samp
+@item state
+A state machine (@samp{State} in the source) with 12 states (0 to 11),
+coding the latest 2 to 4 types of sequences processed. The initial state
+is 0.
+
+@item pos_state
+Value of the 2 least significant bits of the current position in the
+decoded data.
+
+@item literal_state
+Value of the 3 most significant bits of the latest byte decoded.
+
+@item len_state
+Coded value of length @w{(length - 2)}, with a maximum of 3. The
+resulting value is in the range 0 to 3.
+
+@end table
+
+
+In the following table, @samp{!literal} is any sequence except a literal
+byte. @samp{rep} is any one of @samp{rep0}, @samp{rep1}, @samp{rep2} or
+@samp{rep3}. The types of previous sequences corresponding to each state
+are:
+
+@multitable {State} {rep or (!literal, shortrep), literal, literal}
+@headitem State @tab Types of previous sequences
+@item 0 @tab literal, literal, literal
+@item 1 @tab match, literal, literal
+@item 2 @tab rep or (!literal, shortrep), literal, literal
+@item 3 @tab literal, shortrep, literal, literal
+@item 4 @tab match, literal
+@item 5 @tab rep or (!literal, shortrep), literal
+@item 6 @tab literal, shortrep, literal
+@item 7 @tab literal, match
+@item 8 @tab literal, rep
+@item 9 @tab literal, shortrep
+@item 10 @tab !literal, match
+@item 11 @tab !literal, (rep or shortrep)
+@end multitable
+
+@sp 1
+The contexts for decoding the type of coding sequence are:
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item bm_match @tab state, pos_state @tab sequence start
+@item bm_rep @tab state @tab after sequence 1
+@item bm_rep0 @tab state @tab after sequence 11
+@item bm_rep1 @tab state @tab after sequence 111
+@item bm_rep2 @tab state @tab after sequence 1111
+@item bm_len @tab state, pos_state @tab after sequence 110
+@end multitable
+
+@sp 1
+The contexts for decoding distances are:
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item bm_dis_slot @tab len_state, bit tree @tab distance start
+@item bm_dis @tab reverse bit tree @tab after slots 4 to 13
+@item bm_align @tab reverse bit tree @tab for distances >= 128, after
+fixed probability bits
+@end multitable
+
+@sp 1
+There are two separate sets of contexts for lengths (@samp{Len_model} in
+the source). One for normal matches, the other for repeated matches. The
+contexts in each Len_model are (see @samp{decode_len} in the source):
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item choice1 @tab none @tab length start
+@item choice2 @tab none @tab after sequence 1
+@item bm_low @tab pos_state, bit tree @tab after sequence 0
+@item bm_mid @tab pos_state, bit tree @tab after sequence 10
+@item bm_high @tab bit tree @tab after sequence 11
+@end multitable
+
+@sp 1
+The context array @samp{bm_literal} is special. In principle it acts as
+a normal bit tree context, the one selected by @samp{literal_state}. But
+if the previous decoded byte was not a literal, two other bit tree
+contexts are used depending on the value of each bit in
+@samp{match_byte} (the byte at the latest used distance), until a bit is
+decoded that is different from its corresponding bit in
+@samp{match_byte}. After the first difference is found, the rest of the
+byte is decoded using the normal bit tree context. (See
+@samp{decode_matched} in the source).
+
+@sp 1
+@section The range decoder
+
+The LZMA stream is consumed one byte at a time by the range decoder.
+(See @samp{normalize} in the source). Every byte consumed produces a
+variable number of decoded bits, depending on how well these bits agree
+with their context. (See @samp{decode_bit} in the source).
+
+The range decoder state consists of two unsigned 32-bit variables;
+@code{range} (representing the most significant part of the range size
+not yet decoded), and @code{code} (representing the current point within
+@code{range}). @code{range} is initialized to @w{(2^32 - 1)}, and
+@code{code} is initialized to 0.
+
+The range encoder produces a first 0 byte that must be ignored by the
+range decoder. This is done by shifting 5 bytes in the initialization of
+@code{code} instead of 4. (See the @samp{Range_decoder} constructor in
+the source).
+
+@sp 1
+@section Decoding the LZMA stream
+
+After decoding the member header and obtaining the dictionary size, the
+range decoder is initialized and then the LZMA decoder enters a loop
+(See @samp{decode_member} in the source) where it invokes the range
+decoder with the appropriate contexts to decode the different coding
+sequences (matches, repeated matches, and literal bytes), until the "End
+Of Stream" marker is decoded.
+
+
@node Trailing data
@chapter Extra data appended to the file
@cindex trailing data
-Sometimes extra data is found appended to a lzip file after the last
+Sometimes extra data are found appended to a lzip file after the last
member. Such trailing data may be:
@itemize @bullet
@item
Padding added to make the file size a multiple of some block size, for
-example when writing to a tape.
+example when writing to a tape. It is safe to append any amount of
+padding zero bytes to a lzip file.
@item
-Garbage added by some not totally successful copy operation.
+Useful data added by the user; a cryptographically secure hash, a
+description of file contents, etc. It is safe to append any amount of
+text to a lzip file as long as the text does not begin with the string
+"LZIP", and does not contain any zero bytes (null characters). Nonzero
+bytes and zero bytes can't be safely mixed in trailing data.
@item
-Useful data added by the user; a cryptographically secure hash, a
-description of file contents, etc.
+Garbage added by some not totally successful copy operation.
@item
Malicious data added to the file in order to make its total size and
@@ -534,8 +985,12 @@ integrity information itself. Therefore it can be considered to be below
the noise level.
@end itemize
+Trailing data are in no way part of the lzip file format, but tools
+reading lzip files are expected to behave as correctly and usefully as
+possible in the presence of trailing data.
+
Trailing data can be safely ignored in most cases. In some cases, like
-that of user-added data, it is expected to be ignored. In those cases
+that of user-added data, they are expected to be ignored. In those cases
where a file containing trailing data must be rejected, the option
@samp{--trailing-error} can be used. @xref{--trailing-error}.
@@ -600,8 +1055,8 @@ clzip -c /dev/sdc > file.lz
@sp 1
@anchor{concat-example}
@noindent
-Example 6: The right way of concatenating compressed files.
-@xref{Trailing data}.
+Example 6: The right way of concatenating the decompressed output of two
+or more compressed files. @xref{Trailing data}.
@example
Don't do this
@@ -671,6 +1126,477 @@ If you find a bug in clzip, please send electronic mail to
find by running @w{@code{clzip --version}}.
+@node Reference source code
+@appendix Reference source code
+@cindex reference source code
+
+@verbatim
+/* Lzd - Educational decompressor for the lzip format
+ Copyright (C) 2013-2017 Antonio Diaz Diaz.
+
+ This program is free software. Redistribution and use in source and
+ binary forms, with or without modification, are permitted provided
+ that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+/*
+ Exit status: 0 for a normal exit, 1 for environmental problems
+ (file not found, invalid flags, I/O errors, etc), 2 to indicate a
+ corrupt or invalid input file.
+*/
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+
+class State
+ {
+ int st;
+
+public:
+ enum { states = 12 };
+ State() : st( 0 ) {}
+ int operator()() const { return st; }
+ bool is_char() const { return st < 7; }
+
+ void set_char()
+ {
+ static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+ st = next[st];
+ }
+ void set_match() { st = ( st < 7 ) ? 7 : 10; }
+ void set_rep() { st = ( st < 7 ) ? 8 : 11; }
+ void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
+ };
+
+
+enum {
+ min_dictionary_size = 1 << 12,
+ max_dictionary_size = 1 << 29,
+ literal_context_bits = 3,
+ literal_pos_state_bits = 0, // not used
+ pos_state_bits = 2,
+ pos_states = 1 << pos_state_bits,
+ pos_state_mask = pos_states - 1,
+
+ len_states = 4,
+ dis_slot_bits = 6,
+ start_dis_model = 4,
+ end_dis_model = 14,
+ modeled_distances = 1 << (end_dis_model / 2), // 128
+ dis_align_bits = 4,
+ dis_align_size = 1 << dis_align_bits,
+
+ len_low_bits = 3,
+ len_mid_bits = 3,
+ len_high_bits = 8,
+ len_low_symbols = 1 << len_low_bits,
+ len_mid_symbols = 1 << len_mid_bits,
+ len_high_symbols = 1 << len_high_bits,
+ max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+
+ min_match_len = 2, // must be 2
+
+ bit_model_move_bits = 5,
+ bit_model_total_bits = 11,
+ bit_model_total = 1 << bit_model_total_bits };
+
+struct Bit_model
+ {
+ int probability;
+ Bit_model() : probability( bit_model_total / 2 ) {}
+ };
+
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
+
+class CRC32
+ {
+ uint32_t data[256]; // Table of CRCs of all 8-bit messages.
+
+public:
+ CRC32()
+ {
+ for( unsigned n = 0; n < 256; ++n )
+ {
+ unsigned c = n;
+ for( int k = 0; k < 8; ++k )
+ { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+ data[n] = c;
+ }
+ }
+
+ void update_buf( uint32_t & crc, const uint8_t * const buffer,
+ const int size ) const
+ {
+ for( int i = 0; i < size; ++i )
+ crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+ }
+ };
+
+const CRC32 crc32;
+
+
+typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size
+
+typedef uint8_t File_trailer[20];
+ // 0-3 CRC32 of the uncompressed data
+ // 4-11 size of the uncompressed data
+ // 12-19 member size including header and trailer
+
+class Range_decoder
+ {
+ uint32_t code;
+ uint32_t range;
+
+public:
+ Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+ {
+ for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte();
+ }
+
+ uint8_t get_byte() { return std::getc( stdin ); }
+
+ unsigned decode( const int num_bits )
+ {
+ unsigned symbol = 0;
+ for( int i = num_bits; i > 0; --i )
+ {
+ range >>= 1;
+ symbol <<= 1;
+ if( code >= range ) { code -= range; symbol |= 1; }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | get_byte(); }
+ }
+ return symbol;
+ }
+
+ unsigned decode_bit( Bit_model & bm )
+ {
+ unsigned symbol;
+ const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
+ if( code < bound )
+ {
+ range = bound;
+ bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+ symbol = 0;
+ }
+ else
+ {
+ range -= bound;
+ code -= bound;
+ bm.probability -= bm.probability >> bit_model_move_bits;
+ symbol = 1;
+ }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | get_byte(); }
+ return symbol;
+ }
+
+ unsigned decode_tree( Bit_model bm[], const int num_bits )
+ {
+ unsigned symbol = 1;
+ for( int i = 0; i < num_bits; ++i )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
+ }
+
+ unsigned decode_tree_reversed( Bit_model bm[], const int num_bits )
+ {
+ unsigned symbol = decode_tree( bm, num_bits );
+ unsigned reversed_symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
+ symbol >>= 1;
+ }
+ return reversed_symbol;
+ }
+
+ unsigned decode_matched( Bit_model bm[], const unsigned match_byte )
+ {
+ unsigned symbol = 1;
+ for( int i = 7; i >= 0; --i )
+ {
+ const unsigned match_bit = ( match_byte >> i ) & 1;
+ const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] );
+ symbol = ( symbol << 1 ) | bit;
+ if( match_bit != bit )
+ {
+ while( symbol < 0x100 )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ break;
+ }
+ }
+ return symbol & 0xFF;
+ }
+
+ unsigned decode_len( Len_model & lm, const int pos_state )
+ {
+ if( decode_bit( lm.choice1 ) == 0 )
+ return decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
+ return len_low_symbols +
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
+ return len_low_symbols + len_mid_symbols +
+ decode_tree( lm.bm_high, len_high_bits );
+ }
+ };
+
+
+class LZ_decoder
+ {
+ unsigned long long partial_data_pos;
+ Range_decoder rdec;
+ const unsigned dictionary_size;
+ uint8_t * const buffer; // output buffer
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to stdout
+ uint32_t crc_;
+ bool pos_wrapped;
+
+ void flush_data();
+
+ uint8_t peek( const unsigned distance ) const
+ {
+ if( pos > distance ) return buffer[pos - distance - 1];
+ if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1];
+ return 0; // prev_byte of first byte
+ }
+
+ void put_byte( const uint8_t b )
+ {
+ buffer[pos] = b;
+ if( ++pos >= dictionary_size ) flush_data();
+ }
+
+public:
+ explicit LZ_decoder( const unsigned dict_size )
+ :
+ partial_data_pos( 0 ),
+ dictionary_size( dict_size ),
+ buffer( new uint8_t[dictionary_size] ),
+ pos( 0 ),
+ stream_pos( 0 ),
+ crc_( 0xFFFFFFFFU ),
+ pos_wrapped( false )
+ {}
+
+ ~LZ_decoder() { delete[] buffer; }
+
+ unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
+ unsigned long long data_position() const { return partial_data_pos + pos; }
+
+ bool decode_member();
+ };
+
+
+void LZ_decoder::flush_data()
+ {
+ if( pos > stream_pos )
+ {
+ const unsigned size = pos - stream_pos;
+ crc32.update_buf( crc_, buffer + stream_pos, size );
+ errno = 0;
+ if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
+ { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
+ std::exit( 1 ); }
+ if( pos >= dictionary_size )
+ { partial_data_pos += pos; pos = 0; pos_wrapped = true; }
+ stream_pos = pos;
+ }
+ }
+
+
+bool LZ_decoder::decode_member() // Returns false if error
+ {
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[State::states][pos_states];
+ Bit_model bm_rep[State::states];
+ Bit_model bm_rep0[State::states];
+ Bit_model bm_rep1[State::states];
+ Bit_model bm_rep2[State::states];
+ Bit_model bm_len[State::states][pos_states];
+ Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model+1];
+ Bit_model bm_align[dis_align_size];
+ Len_model match_len_model;
+ Len_model rep_len_model;
+ unsigned rep0 = 0; // rep[0-3] latest four distances
+ unsigned rep1 = 0; // used for efficient coding of
+ unsigned rep2 = 0; // repeated distances
+ unsigned rep3 = 0;
+ State state;
+
+ while( !std::feof( stdin ) && !std::ferror( stdin ) )
+ {
+ const int pos_state = data_position() & pos_state_mask;
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
+ {
+ const uint8_t prev_byte = peek( 0 );
+ const int literal_state = prev_byte >> ( 8 - literal_context_bits );
+ Bit_model * const bm = bm_literal[literal_state];
+ if( state.is_char() )
+ put_byte( rdec.decode_tree( bm, 8 ) );
+ else
+ put_byte( rdec.decode_matched( bm, peek( rep0 ) ) );
+ state.set_char();
+ }
+ else // match or repeated match
+ {
+ int len;
+ if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit
+ {
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
+ { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; }
+ }
+ else
+ {
+ unsigned distance;
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
+ distance = rep1;
+ else
+ {
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
+ distance = rep2;
+ else
+ { distance = rep3; rep3 = rep2; }
+ rep2 = rep1;
+ }
+ rep1 = rep0;
+ rep0 = distance;
+ }
+ state.set_rep();
+ len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
+ }
+ else // match
+ {
+ rep3 = rep2; rep2 = rep1; rep1 = rep0;
+ len = min_match_len + rdec.decode_len( match_len_model, pos_state );
+ const int len_state = std::min( len - min_match_len, len_states - 1 );
+ rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits );
+ if( rep0 >= start_dis_model )
+ {
+ const unsigned dis_slot = rep0;
+ const int direct_bits = ( dis_slot >> 1 ) - 1;
+ rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ if( dis_slot < end_dis_model )
+ rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ),
+ direct_bits );
+ else
+ {
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
+ if( rep0 == 0xFFFFFFFFU ) // marker found
+ {
+ flush_data();
+ return ( len == min_match_len ); // End Of Stream marker
+ }
+ }
+ }
+ state.set_match();
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) )
+ { flush_data(); return false; }
+ }
+ for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) );
+ }
+ }
+ flush_data();
+ return false;
+ }
+
+
+int main( const int argc, const char * const argv[] )
+ {
+ if( argc > 1 )
+ {
+ std::printf( "Lzd %s - Educational decompressor for the lzip format.\n",
+ PROGVERSION );
+ std::printf( "Study the source to learn how a lzip decompressor works.\n"
+ "See the lzip manual for an explanation of the code.\n"
+ "It is not safe to use lzd for any real work.\n"
+ "\nUsage: %s < file.lz > file\n", argv[0] );
+ std::printf( "Lzd decompresses from standard input to standard output.\n"
+ "\nCopyright (C) 2017 Antonio Diaz Diaz.\n"
+ "This is free software: you are free to change and redistribute it.\n"
+ "There is NO WARRANTY, to the extent permitted by law.\n"
+ "Report bugs to lzip-bug@nongnu.org\n"
+ "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" );
+ return 0;
+ }
+
+#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
+ setmode( fileno( stdin ), O_BINARY );
+ setmode( fileno( stdout ), O_BINARY );
+#endif
+
+ for( bool first_member = true; ; first_member = false )
+ {
+ File_header header; // verify header
+ for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin );
+ if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 )
+ {
+ if( first_member )
+ { std::fputs( "Bad magic number (file not in lzip format).\n", stderr );
+ return 2; }
+ break;
+ }
+ unsigned dict_size = 1 << ( header[5] & 0x1F );
+ dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
+ if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
+ { std::fputs( "Invalid dictionary size in member header.\n", stderr );
+ return 2; }
+
+ LZ_decoder decoder( dict_size ); // decode LZMA stream
+ if( !decoder.decode_member() )
+ { std::fputs( "Data error\n", stderr ); return 2; }
+
+ File_trailer trailer; // verify trailer
+ for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+ unsigned crc = 0;
+ for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+ unsigned long long data_size = 0;
+ for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
+ if( crc != decoder.crc() || data_size != decoder.data_position() )
+ { std::fputs( "CRC error\n", stderr ); return 2; }
+ }
+
+ if( std::fclose( stdout ) != 0 )
+ { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
+ return 1; }
+ return 0;
+ }
+@end verbatim
+
+
@node Concept index
@unnumbered Concept index
diff --git a/encoder.c b/encoder.c
index ce0ddf2..4b9d7ec 100644
--- a/encoder.c
+++ b/encoder.c
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -43,7 +43,7 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs )
const int min_pos = ( e->eb.mb.pos > e->eb.mb.dictionary_size ) ?
e->eb.mb.pos - e->eb.mb.dictionary_size : 0;
const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb );
- int count, key2, key3, key4, newpos;
+ int count, key2, key3, key4, newpos1;
unsigned tmp;
int len_limit = e->match_len_limit;
@@ -90,15 +90,15 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs )
e->eb.mb.prev_positions[key2] = pos1;
e->eb.mb.prev_positions[key3] = pos1;
- newpos = e->eb.mb.prev_positions[key4];
+ newpos1 = e->eb.mb.prev_positions[key4];
e->eb.mb.prev_positions[key4] = pos1;
for( count = e->cycles; ; )
{
int delta;
- if( newpos <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; }
+ if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; }
- delta = pos1 - newpos;
+ delta = pos1 - newpos1;
newptr = e->eb.mb.pos_array +
( ( e->eb.mb.cyclic_pos - delta +
( (e->eb.mb.cyclic_pos >= delta) ? 0 : e->eb.mb.dictionary_size + 1 ) ) << 1 );
@@ -120,16 +120,16 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs )
}
if( data[len-delta] < data[len] )
{
- *ptr0 = newpos;
+ *ptr0 = newpos1;
ptr0 = newptr + 1;
- newpos = *ptr0;
+ newpos1 = *ptr0;
len0 = len; if( len1 < len ) len = len1;
}
else
{
- *ptr1 = newpos;
+ *ptr1 = newpos1;
ptr1 = newptr;
- newpos = *ptr1;
+ newpos1 = *ptr1;
len1 = len; if( len0 < len ) len = len0;
}
}
@@ -145,7 +145,7 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e )
const int dis_slot = dis_slots[dis];
const int direct_bits = ( dis_slot >> 1 ) - 1;
const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
- const int price = price_symbol_reversed( e->eb.bm_dis + base - dis_slot - 1,
+ const int price = price_symbol_reversed( e->eb.bm_dis + ( base - dis_slot ),
dis - base, direct_bits );
for( len_state = 0; len_state < len_states; ++len_state )
e->dis_prices[len_state][dis] = price;
@@ -158,9 +158,9 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e )
const Bit_model * const bmds = e->eb.bm_dis_slot[len_state];
int slot = 0;
for( ; slot < end_dis_model; ++slot )
- dsp[slot] = price_symbol( bmds, slot, dis_slot_bits );
+ dsp[slot] = price_symbol6( bmds, slot );
for( ; slot < e->num_dis_slots; ++slot )
- dsp[slot] = price_symbol( bmds, slot, dis_slot_bits ) +
+ dsp[slot] = price_symbol6( bmds, slot ) +
(((( slot >> 1 ) - 1 ) - dis_align_bits ) << price_shift_bits );
for( dis = 0; dis < start_dis_model; ++dis )
@@ -173,16 +173,16 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e )
/* Returns the number of bytes advanced (ahead).
trials[0]..trials[ahead-1] contain the steps to encode.
- ( trials[0].dis == -1 ) means literal.
+ ( trials[0].dis4 == -1 ) means literal.
A match/rep longer or equal than match_len_limit finishes the sequence.
*/
static int LZe_sequence_optimizer( struct LZ_encoder * const e,
const int reps[num_rep_distances],
const State state )
{
- int main_len, num_pairs, i, rep, cur = 0, num_trials, len;
+ int main_len, num_pairs, i, rep, num_trials, len;
+ int rep_index = 0, cur = 0;
int replens[num_rep_distances];
- int rep_index = 0;
if( e->pending_num_pairs > 0 ) /* from previous call */
{
@@ -195,13 +195,13 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
for( i = 0; i < num_rep_distances; ++i )
{
- replens[i] = Mb_true_match_len( &e->eb.mb, 0, reps[i] + 1, max_match_len );
+ replens[i] = Mb_true_match_len( &e->eb.mb, 0, reps[i] + 1 );
if( replens[i] > replens[rep_index] ) rep_index = i;
}
if( replens[rep_index] >= e->match_len_limit )
{
e->trials[0].price = replens[rep_index];
- e->trials[0].dis = rep_index;
+ e->trials[0].dis4 = rep_index;
LZe_move_and_update( e, replens[rep_index] );
return replens[rep_index];
}
@@ -209,7 +209,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
if( main_len >= e->match_len_limit )
{
e->trials[0].price = main_len;
- e->trials[0].dis = e->pairs[num_pairs-1].dis + num_rep_distances;
+ e->trials[0].dis4 = e->pairs[num_pairs-1].dis + num_rep_distances;
LZe_move_and_update( e, main_len );
return main_len;
}
@@ -227,7 +227,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
e->trials[1].price += LZeb_price_literal( &e->eb, prev_byte, cur_byte );
else
e->trials[1].price += LZeb_price_matched( &e->eb, prev_byte, cur_byte, match_byte );
- e->trials[1].dis = -1; /* literal */
+ e->trials[1].dis4 = -1; /* literal */
if( match_byte == cur_byte )
Tr_update( &e->trials[1], rep_match_price +
@@ -238,7 +238,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
if( num_trials < min_match_len )
{
e->trials[0].price = 1;
- e->trials[0].dis = e->trials[1].dis;
+ e->trials[0].dis4 = e->trials[1].dis4;
Mb_move_pos( &e->eb.mb );
return 1;
}
@@ -263,7 +263,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
if( main_len > replens[0] )
{
const int normal_match_price = match_price + price0( e->eb.bm_rep[state] );
- i = 0, len = max( replens[0] + 1, min_match_len );
+ int i = 0, len = max( replens[0] + 1, min_match_len );
while( len > e->pairs[i].len ) ++i;
while( true )
{
@@ -304,7 +304,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
/* give final values to current trial */
cur_trial = &e->trials[cur];
{
- int dis = cur_trial->dis;
+ const int dis4 = cur_trial->dis4;
int prev_index = cur_trial->prev_index;
const int prev_index2 = cur_trial->prev_index2;
@@ -313,32 +313,24 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
cur_state = e->trials[prev_index].state;
if( prev_index + 1 == cur ) /* len == 1 */
{
- if( dis == 0 ) cur_state = St_set_short_rep( cur_state );
+ if( dis4 == 0 ) cur_state = St_set_short_rep( cur_state );
else cur_state = St_set_char( cur_state ); /* literal */
}
- else if( dis < num_rep_distances ) cur_state = St_set_rep( cur_state );
+ else if( dis4 < num_rep_distances ) cur_state = St_set_rep( cur_state );
else cur_state = St_set_match( cur_state );
}
- else if( prev_index2 == dual_step_trial ) /* dis == 0 */
+ else
{
- --prev_index;
- cur_state = e->trials[prev_index].state;
- cur_state = St_set_char( cur_state );
- cur_state = St_set_rep( cur_state );
- }
- else /* if( prev_index2 >= 0 ) */
- {
- prev_index = prev_index2;
- cur_state = e->trials[prev_index].state;
- if( dis < num_rep_distances ) cur_state = St_set_rep( cur_state );
- else cur_state = St_set_match( cur_state );
- cur_state = St_set_char( cur_state );
- cur_state = St_set_rep( cur_state );
+ if( prev_index2 == dual_step_trial ) /* dis4 == 0 (rep0) */
+ --prev_index;
+ else /* prev_index2 >= 0 */
+ prev_index = prev_index2;
+ cur_state = 8; /* St_set_char_rep(); */
}
cur_trial->state = cur_state;
for( i = 0; i < num_rep_distances; ++i )
cur_trial->reps[i] = e->trials[prev_index].reps[i];
- mtf_reps( dis, cur_trial->reps );
+ mtf_reps( dis4, cur_trial->reps ); /* literal is ignored */
}
pos_state = Mb_data_position( &e->eb.mb ) & pos_state_mask;
@@ -361,7 +353,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
match_price = cur_trial->price + price1( e->eb.bm_match[cur_state][pos_state] );
rep_match_price = match_price + price1( e->eb.bm_rep[cur_state] );
- if( match_byte == cur_byte && next_trial->dis != 0 &&
+ if( match_byte == cur_byte && next_trial->dis4 != 0 &&
next_trial->prev_index2 == single_step_trial )
{
const int price = rep_match_price +
@@ -369,7 +361,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
if( price <= next_trial->price )
{
next_trial->price = price;
- next_trial->dis = 0;
+ next_trial->dis4 = 0; /* rep0 */
next_trial->prev_index = cur;
}
}
@@ -386,16 +378,16 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb );
const int dis = cur_trial->reps[0] + 1;
const int limit = min( e->match_len_limit + 1, triable_bytes );
- len = 1;
+ int len = 1;
while( len < limit && data[len-dis] == data[len] ) ++len;
if( --len >= min_match_len )
{
const int pos_state2 = ( pos_state + 1 ) & pos_state_mask;
const State state2 = St_set_char( cur_state );
const int price = next_price +
- price1( e->eb.bm_match[state2][pos_state2] ) +
- price1( e->eb.bm_rep[state2] ) +
- LZe_price_rep0_len( e, len, state2, pos_state2 );
+ price1( e->eb.bm_match[state2][pos_state2] ) +
+ price1( e->eb.bm_rep[state2] ) +
+ LZe_price_rep0_len( e, len, state2, pos_state2 );
while( num_trials < cur + 1 + len )
e->trials[++num_trials].price = infinite_price;
Tr_update2( &e->trials[cur+1+len], price, cur + 1 );
@@ -406,8 +398,8 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
for( rep = 0; rep < num_rep_distances; ++rep )
{
const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb );
- int price;
const int dis = cur_trial->reps[rep] + 1;
+ int price;
if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue;
for( len = min_match_len; len < len_limit; ++len )
@@ -463,7 +455,6 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e,
for( len = start_len; ; ++len )
{
int price = normal_match_price + LZe_price_pair( e, dis, len, pos_state );
-
Tr_update( &e->trials[cur+len], price, dis + num_rep_distances, cur );
/* try match + literal + rep0 */
@@ -510,7 +501,7 @@ bool LZe_encode_member( struct LZ_encoder * const e,
const int dis_price_count = best ? 1 : 512;
const int align_price_count = best ? 1 : dis_align_size;
const int price_count = ( e->match_len_limit > 36 ) ? 1013 : 4093;
- int price_counter = 0;
+ int price_counter = 0; /* counters may decrement below 0 */
int dis_price_counter = 0;
int align_price_counter = 0;
int ahead, i;
@@ -551,7 +542,6 @@ bool LZe_encode_member( struct LZ_encoder * const e,
}
ahead = LZe_sequence_optimizer( e, reps, state );
- if( ahead <= 0 ) return false; /* can't happen */
price_counter -= ahead;
for( i = 0; ahead > 0; )
@@ -559,7 +549,7 @@ bool LZe_encode_member( struct LZ_encoder * const e,
const int pos_state =
( Mb_data_position( &e->eb.mb ) - ahead ) & pos_state_mask;
const int len = e->trials[i].price;
- const int dis = e->trials[i].dis;
+ int dis = e->trials[i].dis4;
bool bit = ( dis < 0 );
Re_encode_bit( &e->eb.renc, &e->eb.bm_match[state][pos_state], !bit );
@@ -605,9 +595,9 @@ bool LZe_encode_member( struct LZ_encoder * const e,
}
else /* match */
{
- LZeb_encode_pair( &e->eb, dis - num_rep_distances, len, pos_state );
- if( get_slot( dis - num_rep_distances ) >= end_dis_model )
- --align_price_counter;
+ dis -= num_rep_distances;
+ LZeb_encode_pair( &e->eb, dis, len, pos_state );
+ if( dis >= modeled_distances ) --align_price_counter;
--dis_price_counter;
Lp_decrement_counter( &e->match_len_prices, pos_state );
state = St_set_match( state );
diff --git a/encoder.h b/encoder.h
index 99670b1..c65022a 100644
--- a/encoder.h
+++ b/encoder.h
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -21,7 +21,7 @@ struct Len_prices
int len_symbols;
int count;
int prices[pos_states][max_len_symbols];
- int counters[pos_states];
+ int counters[pos_states]; /* may decrement below 0 */
};
static inline void Lp_update_low_mid_prices( struct Len_prices * const lp,
@@ -30,15 +30,14 @@ static inline void Lp_update_low_mid_prices( struct Len_prices * const lp,
int * const pps = lp->prices[pos_state];
int tmp = price0( lp->lm->choice1 );
int len = 0;
- lp->counters[pos_state] = lp->count;
for( ; len < len_low_symbols && len < lp->len_symbols; ++len )
- pps[len] = tmp + price_symbol( lp->lm->bm_low[pos_state], len, len_low_bits );
+ pps[len] = tmp + price_symbol3( lp->lm->bm_low[pos_state], len );
if( len >= lp->len_symbols ) return;
tmp = price1( lp->lm->choice1 ) + price0( lp->lm->choice2 );
for( ; len < len_low_symbols + len_mid_symbols && len < lp->len_symbols; ++len )
pps[len] = tmp +
- price_symbol( lp->lm->bm_mid[pos_state], len - len_low_symbols, len_mid_bits );
- }
+ price_symbol3( lp->lm->bm_mid[pos_state], len - len_low_symbols );
+ }
static inline void Lp_update_high_prices( struct Len_prices * const lp )
{
@@ -48,7 +47,7 @@ static inline void Lp_update_high_prices( struct Len_prices * const lp )
/* using 4 slots per value makes "Lp_price" faster */
lp->prices[3][len] = lp->prices[2][len] =
lp->prices[1][len] = lp->prices[0][len] = tmp +
- price_symbol( lp->lm->bm_high, len - len_low_symbols - len_mid_symbols, len_high_bits );
+ price_symbol8( lp->lm->bm_high, len - len_low_symbols - len_mid_symbols );
}
static inline void Lp_reset( struct Len_prices * const lp )
@@ -74,14 +73,15 @@ static inline void Lp_update_prices( struct Len_prices * const lp )
bool high_pending = false;
for( pos_state = 0; pos_state < pos_states; ++pos_state )
if( lp->counters[pos_state] <= 0 )
- { Lp_update_low_mid_prices( lp, pos_state ); high_pending = true; }
+ { lp->counters[pos_state] = lp->count;
+ Lp_update_low_mid_prices( lp, pos_state ); high_pending = true; }
if( high_pending && lp->len_symbols > len_low_symbols + len_mid_symbols )
Lp_update_high_prices( lp );
}
static inline int Lp_price( const struct Len_prices * const lp,
- const int symbol, const int pos_state )
- { return lp->prices[pos_state][symbol - min_match_len]; }
+ const int len, const int pos_state )
+ { return lp->prices[pos_state][len - min_match_len]; }
struct Pair /* distance-length pair */
@@ -99,7 +99,7 @@ struct Trial
{
State state;
int price; /* dual use var; cumulative price, match length */
- int dis; /* rep index or match distance. (-1 for literal) */
+ int dis4; /* -1 for literal, or rep, or match distance + 4 */
int prev_index; /* index of prev trial in trials[] */
int prev_index2; /* -2 trial is single step */
/* -1 literal + rep0 */
@@ -108,34 +108,28 @@ struct Trial
};
static inline void Tr_update( struct Trial * const trial, const int pr,
- const int distance, const int p_i )
+ const int distance4, const int p_i )
{
if( pr < trial->price )
- {
- trial->price = pr; trial->dis = distance; trial->prev_index = p_i;
- trial->prev_index2 = single_step_trial;
- }
+ { trial->price = pr; trial->dis4 = distance4; trial->prev_index = p_i;
+ trial->prev_index2 = single_step_trial; }
}
static inline void Tr_update2( struct Trial * const trial, const int pr,
const int p_i )
{
if( pr < trial->price )
- {
- trial->price = pr; trial->dis = 0; trial->prev_index = p_i;
- trial->prev_index2 = dual_step_trial;
- }
+ { trial->price = pr; trial->dis4 = 0; trial->prev_index = p_i;
+ trial->prev_index2 = dual_step_trial; }
}
static inline void Tr_update3( struct Trial * const trial, const int pr,
- const int distance, const int p_i,
+ const int distance4, const int p_i,
const int p_i2 )
{
if( pr < trial->price )
- {
- trial->price = pr; trial->dis = distance; trial->prev_index = p_i;
- trial->prev_index2 = p_i2;
- }
+ { trial->price = pr; trial->dis4 = distance4; trial->prev_index = p_i;
+ trial->prev_index2 = p_i2; }
}
@@ -161,26 +155,25 @@ static inline bool Mb_dec_pos( struct Matchfinder_base * const mb,
{
if( ahead < 0 || mb->pos < ahead ) return false;
mb->pos -= ahead;
+ if( mb->cyclic_pos < ahead ) mb->cyclic_pos += mb->dictionary_size + 1;
mb->cyclic_pos -= ahead;
- if( mb->cyclic_pos < 0 ) mb->cyclic_pos += mb->dictionary_size + 1;
return true;
}
int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs );
- /* move-to-front dis in/into reps if( dis > 0 ) */
-static inline void mtf_reps( const int dis, int reps[num_rep_distances] )
+ /* move-to-front dis in/into reps; do nothing if( dis4 <= 0 ) */
+static inline void mtf_reps( const int dis4, int reps[num_rep_distances] )
{
- int i;
- if( dis >= num_rep_distances )
+ if( dis4 >= num_rep_distances ) /* match */
{
- for( i = num_rep_distances - 1; i > 0; --i ) reps[i] = reps[i-1];
- reps[0] = dis - num_rep_distances;
+ reps[3] = reps[2]; reps[2] = reps[1]; reps[1] = reps[0];
+ reps[0] = dis4 - num_rep_distances;
}
- else if( dis > 0 )
+ else if( dis4 > 0 ) /* repeated match */
{
- const int distance = reps[dis];
- for( i = dis; i > 0; --i ) reps[i] = reps[i-1];
+ const int distance = reps[dis4];
+ int i; for( i = dis4; i > 0; --i ) reps[i] = reps[i-1];
reps[0] = distance;
}
}
@@ -192,8 +185,8 @@ static inline int LZeb_price_shortrep( const struct LZ_encoder_base * const eb,
}
static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb,
- const int rep,
- const State state, const int pos_state )
+ const int rep, const State state,
+ const int pos_state )
{
int price;
if( rep == 0 ) return price0( eb->bm_rep0[state] ) +
@@ -210,8 +203,8 @@ static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb,
}
static inline int LZe_price_rep0_len( const struct LZ_encoder * const e,
- const int len,
- const State state, const int pos_state )
+ const int len, const State state,
+ const int pos_state )
{
return LZeb_price_rep( &e->eb, 0, state, pos_state ) +
Lp_price( &e->rep_len_prices, len, pos_state );
@@ -235,13 +228,10 @@ static inline int LZe_read_match_distances( struct LZ_encoder * const e )
const int num_pairs = LZe_get_match_pairs( e, e->pairs );
if( num_pairs > 0 )
{
- int len = e->pairs[num_pairs-1].len;
+ const int len = e->pairs[num_pairs-1].len;
if( len == e->match_len_limit && len < max_match_len )
- {
- len += Mb_true_match_len( &e->eb.mb, len, e->pairs[num_pairs-1].dis + 1,
- max_match_len - len );
- e->pairs[num_pairs-1].len = len;
- }
+ e->pairs[num_pairs-1].len =
+ Mb_true_match_len( &e->eb.mb, len, e->pairs[num_pairs-1].dis + 1 );
}
return num_pairs;
}
@@ -258,7 +248,7 @@ static inline void LZe_move_and_update( struct LZ_encoder * const e, int n )
static inline void LZe_backward( struct LZ_encoder * const e, int cur )
{
- int * const dis = &e->trials[cur].dis;
+ int dis4 = e->trials[cur].dis4;
while( cur > 0 )
{
const int prev_index = e->trials[cur].prev_index;
@@ -266,19 +256,19 @@ static inline void LZe_backward( struct LZ_encoder * const e, int cur )
if( e->trials[cur].prev_index2 != single_step_trial )
{
- prev_trial->dis = -1;
+ prev_trial->dis4 = -1; /* literal */
prev_trial->prev_index = prev_index - 1;
prev_trial->prev_index2 = single_step_trial;
if( e->trials[cur].prev_index2 >= 0 )
{
struct Trial * const prev_trial2 = &e->trials[prev_index-1];
- prev_trial2->dis = *dis; *dis = 0;
+ prev_trial2->dis4 = dis4; dis4 = 0; /* rep0 */
prev_trial2->prev_index = e->trials[cur].prev_index2;
prev_trial2->prev_index2 = single_step_trial;
}
}
prev_trial->price = cur - prev_index; /* len */
- cur = *dis; *dis = prev_trial->dis; prev_trial->dis = cur;
+ cur = dis4; dis4 = prev_trial->dis4; prev_trial->dis4 = cur;
cur = prev_index;
}
}
@@ -290,7 +280,7 @@ static inline bool LZe_init( struct LZ_encoder * const e,
const int dict_size, const int len_limit,
const int ifd, const int outfd )
{
- enum { before = max_num_trials + 1,
+ enum { before = max_num_trials,
/* bytes to keep in buffer after pos */
after_size = ( 2 * max_match_len ) + 1,
dict_factor = 2,
diff --git a/encoder_base.c b/encoder_base.c
index 31cad3f..e384d22 100644
--- a/encoder_base.c
+++ b/encoder_base.c
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -54,11 +54,11 @@ void Mb_normalize_pos( struct Matchfinder_base * const mb )
if( !mb->at_stream_end )
{
int i;
- const int offset = mb->pos - mb->dictionary_size - mb->before_size;
+ const int offset = mb->pos - mb->before_size - mb->dictionary_size;
const int size = mb->stream_pos - offset;
memmove( mb->buffer, mb->buffer + offset, size );
mb->partial_data_pos += offset;
- mb->pos -= offset;
+ mb->pos -= offset; /* pos = before_size + dictionary_size */
mb->stream_pos -= offset;
for( i = 0; i < mb->num_prev_positions; ++i )
mb->prev_positions[i] -= min( mb->prev_positions[i], offset );
@@ -69,10 +69,9 @@ void Mb_normalize_pos( struct Matchfinder_base * const mb )
}
-bool Mb_init( struct Matchfinder_base * const mb,
- const int before, const int dict_size,
- const int after_size, const int dict_factor,
- const int num_prev_positions23,
+bool Mb_init( struct Matchfinder_base * const mb, const int before,
+ const int dict_size, const int after_size,
+ const int dict_factor, const int num_prev_positions23,
const int pos_array_factor, const int ifd )
{
const int buffer_size_limit =
@@ -116,8 +115,9 @@ bool Mb_init( struct Matchfinder_base * const mb,
mb->num_prev_positions = size;
mb->pos_array_size = pos_array_factor * ( mb->dictionary_size + 1 );
size += mb->pos_array_size;
- if( size * sizeof (int32_t) <= size ) mb->prev_positions = 0;
- else mb->prev_positions = (int32_t *)malloc( size * sizeof (int32_t) );
+ if( size * sizeof mb->prev_positions[0] <= size ) mb->prev_positions = 0;
+ else mb->prev_positions =
+ (int32_t *)malloc( size * sizeof mb->prev_positions[0] );
if( !mb->prev_positions ) { free( mb->buffer ); return false; }
mb->pos_array = mb->prev_positions + mb->num_prev_positions;
for( i = 0; i < mb->num_prev_positions; ++i ) mb->prev_positions[i] = 0;
@@ -184,7 +184,7 @@ void LZeb_reset( struct LZ_encoder_base * const eb )
Bm_array_init( eb->bm_rep2, states );
Bm_array_init( eb->bm_len[0], states * pos_states );
Bm_array_init( eb->bm_dis_slot[0], len_states * (1 << dis_slot_bits) );
- Bm_array_init( eb->bm_dis, modeled_distances - end_dis_model );
+ Bm_array_init( eb->bm_dis, modeled_distances - end_dis_model + 1 );
Bm_array_init( eb->bm_align, dis_align_size );
Lm_init( &eb->match_len_model );
Lm_init( &eb->rep_len_model );
diff --git a/encoder_base.h b/encoder_base.h
index 54fecd1..b2985f1 100644
--- a/encoder_base.h
+++ b/encoder_base.h
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -77,22 +77,48 @@ static inline int price0( const Bit_model probability )
static inline int price1( const Bit_model probability )
{ return get_price( bit_model_total - probability ); }
-static inline int price_bit( const Bit_model bm, const int bit )
- { if( bit ) return price1( bm ); else return price0( bm ); }
+static inline int price_bit( const Bit_model bm, const bool bit )
+ { return ( bit ? price1( bm ) : price0( bm ) ); }
-static inline int price_symbol( const Bit_model bm[], int symbol,
- const int num_bits )
+static inline int price_symbol3( const Bit_model bm[], int symbol )
{
- int price = 0;
- symbol |= ( 1 << num_bits );
- while( symbol > 1 )
- {
- const int bit = symbol & 1;
- symbol >>= 1;
- price += price_bit( bm[symbol], bit );
- }
- return price;
+ int price;
+ bool bit = symbol & 1;
+ symbol |= 8; symbol >>= 1;
+ price = price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ return price + price_bit( bm[1], symbol & 1 );
+ }
+
+
+static inline int price_symbol6( const Bit_model bm[], unsigned symbol )
+ {
+ int price;
+ bool bit = symbol & 1;
+ symbol |= 64; symbol >>= 1;
+ price = price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ return price + price_bit( bm[1], symbol & 1 );
+ }
+
+
+static inline int price_symbol8( const Bit_model bm[], int symbol )
+ {
+ int price;
+ bool bit = symbol & 1;
+ symbol |= 0x100; symbol >>= 1;
+ price = price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
+ return price + price_bit( bm[1], symbol & 1 );
}
@@ -104,32 +130,29 @@ static inline int price_symbol_reversed( const Bit_model bm[], int symbol,
int i;
for( i = num_bits; i > 0; --i )
{
- const int bit = symbol & 1;
+ const bool bit = symbol & 1;
+ symbol >>= 1;
price += price_bit( bm[model], bit );
model = ( model << 1 ) | bit;
- symbol >>= 1;
}
return price;
}
-static inline int price_matched( const Bit_model bm[], int symbol, int match_byte )
+static inline int price_matched( const Bit_model bm[], unsigned symbol,
+ unsigned match_byte )
{
int price = 0;
- int mask = 0x100;
+ unsigned mask = 0x100;
symbol |= mask;
-
- do {
- int match_bit, bit;
- match_byte <<= 1;
- match_bit = match_byte & mask;
- symbol <<= 1;
- bit = symbol & 0x100;
- price += price_bit( bm[match_bit+(symbol>>9)+mask], bit );
- mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */
+ while( true )
+ {
+ const unsigned match_bit = ( match_byte <<= 1 ) & mask;
+ const bool bit = ( symbol <<= 1 ) & 0x100;
+ price += price_bit( bm[(symbol>>9)+match_bit+mask], bit );
+ if( symbol >= 0x10000 ) return price;
+ mask &= ~(match_bit ^ symbol); /* if( match_bit != bit ) mask = 0; */
}
- while( symbol < 0x10000 );
- return price;
}
@@ -156,10 +179,9 @@ struct Matchfinder_base
bool Mb_read_block( struct Matchfinder_base * const mb );
void Mb_normalize_pos( struct Matchfinder_base * const mb );
-bool Mb_init( struct Matchfinder_base * const mb,
- const int before, const int dict_size,
- const int after_size, const int dict_factor,
- const int num_prev_positions23,
+bool Mb_init( struct Matchfinder_base * const mb, const int before,
+ const int dict_size, const int after_size,
+ const int dict_factor, const int num_prev_positions23,
const int pos_array_factor, const int ifd );
static inline void Mb_free( struct Matchfinder_base * const mb )
@@ -184,13 +206,11 @@ Mb_ptr_to_current_pos( const struct Matchfinder_base * const mb )
{ return mb->buffer + mb->pos; }
static inline int Mb_true_match_len( const struct Matchfinder_base * const mb,
- const int index, const int distance,
- int len_limit )
+ const int index, const int distance )
{
- const uint8_t * const data = mb->buffer + mb->pos + index;
- int i = 0;
- if( index + len_limit > Mb_available_bytes( mb ) )
- len_limit = Mb_available_bytes( mb ) - index;
+ const uint8_t * const data = mb->buffer + mb->pos;
+ int i = index;
+ const int len_limit = min( Mb_available_bytes( mb ), max_match_len );
while( i < len_limit && data[i-distance] == data[i] ) ++i;
return i;
}
@@ -230,9 +250,9 @@ static inline void Re_put_byte( struct Range_encoder * const renc,
static inline void Re_shift_low( struct Range_encoder * const renc )
{
- const bool carry = ( renc->low > 0xFFFFFFFFU );
- if( carry || renc->low < 0xFF000000U )
+ if( renc->low >> 24 != 0xFF )
{
+ const bool carry = ( renc->low > 0xFFFFFFFFU );
Re_put_byte( renc, renc->cache + carry );
for( ; renc->ff_count > 0; --renc->ff_count )
Re_put_byte( renc, 0xFF + carry );
@@ -280,18 +300,18 @@ static inline void Re_flush( struct Range_encoder * const renc )
static inline void Re_encode( struct Range_encoder * const renc,
const int symbol, const int num_bits )
{
- int i;
- for( i = num_bits - 1; i >= 0; --i )
+ unsigned mask;
+ for( mask = 1 << ( num_bits - 1 ); mask > 0; mask >>= 1 )
{
renc->range >>= 1;
- if( (symbol >> i) & 1 ) renc->low += renc->range;
+ if( symbol & mask ) renc->low += renc->range;
if( renc->range <= 0x00FFFFFFU )
{ renc->range <<= 8; Re_shift_low( renc ); }
}
}
static inline void Re_encode_bit( struct Range_encoder * const renc,
- Bit_model * const probability, const int bit )
+ Bit_model * const probability, const bool bit )
{
const uint32_t bound = ( renc->range >> bit_model_total_bits ) * *probability;
if( !bit )
@@ -305,56 +325,78 @@ static inline void Re_encode_bit( struct Range_encoder * const renc,
renc->range -= bound;
*probability -= *probability >> bit_model_move_bits;
}
- if( renc->range <= 0x00FFFFFFU )
- { renc->range <<= 8; Re_shift_low( renc ); }
+ if( renc->range <= 0x00FFFFFFU ) { renc->range <<= 8; Re_shift_low( renc ); }
+ }
+
+static inline void Re_encode_tree3( struct Range_encoder * const renc,
+ Bit_model bm[], const int symbol )
+ {
+ int model = 1;
+ bool bit = ( symbol >> 2 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ bit = ( symbol >> 1 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ Re_encode_bit( renc, &bm[model], symbol & 1 );
+ }
+
+static inline void Re_encode_tree6( struct Range_encoder * const renc,
+ Bit_model bm[], const unsigned symbol )
+ {
+ int model = 1;
+ bool bit = ( symbol >> 5 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ bit = ( symbol >> 4 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ bit = ( symbol >> 3 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ bit = ( symbol >> 2 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ bit = ( symbol >> 1 ) & 1;
+ Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit;
+ Re_encode_bit( renc, &bm[model], symbol & 1 );
}
-static inline void Re_encode_tree( struct Range_encoder * const renc,
- Bit_model bm[], const int symbol, const int num_bits )
+static inline void Re_encode_tree8( struct Range_encoder * const renc,
+ Bit_model bm[], const int symbol )
{
- int mask = ( 1 << ( num_bits - 1 ) );
int model = 1;
int i;
- for( i = num_bits; i > 0; --i, mask >>= 1 )
+ for( i = 7; i >= 0; --i )
{
- const int bit = ( symbol & mask );
+ const bool bit = ( symbol >> i ) & 1;
Re_encode_bit( renc, &bm[model], bit );
- model <<= 1;
- if( bit ) model |= 1;
+ model = ( model << 1 ) | bit;
}
}
static inline void Re_encode_tree_reversed( struct Range_encoder * const renc,
- Bit_model bm[], int symbol, const int num_bits )
+ Bit_model bm[], int symbol, const int num_bits )
{
int model = 1;
int i;
for( i = num_bits; i > 0; --i )
{
- const int bit = symbol & 1;
+ const bool bit = symbol & 1;
+ symbol >>= 1;
Re_encode_bit( renc, &bm[model], bit );
model = ( model << 1 ) | bit;
- symbol >>= 1;
}
}
static inline void Re_encode_matched( struct Range_encoder * const renc,
- Bit_model bm[], int symbol,
- int match_byte )
+ Bit_model bm[], unsigned symbol,
+ unsigned match_byte )
{
- int mask = 0x100;
+ unsigned mask = 0x100;
symbol |= mask;
-
- do {
- int match_bit, bit;
- match_byte <<= 1;
- match_bit = match_byte & mask;
- symbol <<= 1;
- bit = symbol & 0x100;
- Re_encode_bit( renc, &bm[match_bit+(symbol>>9)+mask], bit );
- mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */
+ while( true )
+ {
+ const unsigned match_bit = ( match_byte <<= 1 ) & mask;
+ const bool bit = ( symbol <<= 1 ) & 0x100;
+ Re_encode_bit( renc, &bm[(symbol>>9)+match_bit+mask], bit );
+ if( symbol >= 0x10000 ) break;
+ mask &= ~(match_bit ^ symbol); /* if( match_bit != bit ) mask = 0; */
}
- while( symbol < 0x10000 );
}
static inline void Re_encode_len( struct Range_encoder * const renc,
@@ -364,17 +406,15 @@ static inline void Re_encode_len( struct Range_encoder * const renc,
bool bit = ( ( symbol -= min_match_len ) >= len_low_symbols );
Re_encode_bit( renc, &lm->choice1, bit );
if( !bit )
- Re_encode_tree( renc, lm->bm_low[pos_state], symbol, len_low_bits );
+ Re_encode_tree3( renc, lm->bm_low[pos_state], symbol );
else
{
- bit = ( symbol >= len_low_symbols + len_mid_symbols );
+ bit = ( ( symbol -= len_low_symbols ) >= len_mid_symbols );
Re_encode_bit( renc, &lm->choice2, bit );
if( !bit )
- Re_encode_tree( renc, lm->bm_mid[pos_state],
- symbol - len_low_symbols, len_mid_bits );
+ Re_encode_tree3( renc, lm->bm_mid[pos_state], symbol );
else
- Re_encode_tree( renc, lm->bm_high,
- symbol - len_low_symbols - len_mid_symbols, len_high_bits );
+ Re_encode_tree8( renc, lm->bm_high, symbol - len_mid_symbols );
}
}
@@ -395,7 +435,7 @@ struct LZ_encoder_base
Bit_model bm_rep2[states];
Bit_model bm_len[states][pos_states];
Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
- Bit_model bm_dis[modeled_distances-end_dis_model];
+ Bit_model bm_dis[modeled_distances-end_dis_model+1];
Bit_model bm_align[dis_align_size];
struct Len_model match_len_model;
struct Len_model rep_len_model;
@@ -425,23 +465,21 @@ static inline unsigned LZeb_crc( const struct LZ_encoder_base * const eb )
{ return eb->crc ^ 0xFFFFFFFFU; }
static inline int LZeb_price_literal( const struct LZ_encoder_base * const eb,
- uint8_t prev_byte, uint8_t symbol )
- { return price_symbol( eb->bm_literal[get_lit_state(prev_byte)], symbol, 8 ); }
+ const uint8_t prev_byte, const uint8_t symbol )
+ { return price_symbol8( eb->bm_literal[get_lit_state(prev_byte)], symbol ); }
static inline int LZeb_price_matched( const struct LZ_encoder_base * const eb,
- uint8_t prev_byte, uint8_t symbol,
- uint8_t match_byte )
+ const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte )
{ return price_matched( eb->bm_literal[get_lit_state(prev_byte)], symbol,
match_byte ); }
static inline void LZeb_encode_literal( struct LZ_encoder_base * const eb,
- uint8_t prev_byte, uint8_t symbol )
- { Re_encode_tree( &eb->renc,
- eb->bm_literal[get_lit_state(prev_byte)], symbol, 8 ); }
+ const uint8_t prev_byte, const uint8_t symbol )
+ { Re_encode_tree8( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
+ symbol ); }
static inline void LZeb_encode_matched( struct LZ_encoder_base * const eb,
- uint8_t prev_byte, uint8_t symbol,
- uint8_t match_byte )
+ const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte )
{ Re_encode_matched( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
symbol, match_byte ); }
@@ -449,10 +487,9 @@ static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb,
const unsigned dis, const int len,
const int pos_state )
{
- const int dis_slot = get_slot( dis );
+ const unsigned dis_slot = get_slot( dis );
Re_encode_len( &eb->renc, &eb->match_len_model, len, pos_state );
- Re_encode_tree( &eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot,
- dis_slot_bits );
+ Re_encode_tree6( &eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot );
if( dis_slot >= start_dis_model )
{
@@ -461,7 +498,7 @@ static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb,
const unsigned direct_dis = dis - base;
if( dis_slot < end_dis_model )
- Re_encode_tree_reversed( &eb->renc, eb->bm_dis + base - dis_slot - 1,
+ Re_encode_tree_reversed( &eb->renc, eb->bm_dis + ( base - dis_slot ),
direct_dis, direct_bits );
else
{
diff --git a/fast_encoder.c b/fast_encoder.c
index 941c0e2..399cc1a 100644
--- a/fast_encoder.c
+++ b/fast_encoder.c
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -33,21 +33,22 @@ int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance
enum { len_limit = 16 };
const uint8_t * const data = Mb_ptr_to_current_pos( &fe->eb.mb );
int32_t * ptr0 = fe->eb.mb.pos_array + fe->eb.mb.cyclic_pos;
- int32_t * newptr;
const int pos1 = fe->eb.mb.pos + 1;
- int maxlen = 0;
- int count, delta, newpos;
- if( len_limit > Mb_available_bytes( &fe->eb.mb ) ) return 0;
+ int maxlen = 0, newpos1, count;
+ const int available = min( Mb_available_bytes( &fe->eb.mb ), max_match_len );
+ if( available < len_limit ) return 0;
fe->key4 = ( ( fe->key4 << 4 ) ^ data[3] ) & fe->eb.mb.key4_mask;
- newpos = fe->eb.mb.prev_positions[fe->key4];
+ newpos1 = fe->eb.mb.prev_positions[fe->key4];
fe->eb.mb.prev_positions[fe->key4] = pos1;
for( count = 4; ; )
{
- if( --count < 0 || newpos <= 0 ) { *ptr0 = 0; break; }
- delta = pos1 - newpos;
- if( delta > fe->eb.mb.dictionary_size ) { *ptr0 = 0; break; }
+ int32_t * newptr;
+ int delta;
+ if( newpos1 <= 0 || --count < 0 ||
+ ( delta = pos1 - newpos1 ) > fe->eb.mb.dictionary_size )
+ { *ptr0 = 0; break; }
newptr = fe->eb.mb.pos_array +
( fe->eb.mb.cyclic_pos - delta +
( ( fe->eb.mb.cyclic_pos >= delta ) ? 0 : fe->eb.mb.dictionary_size + 1 ) );
@@ -55,23 +56,15 @@ int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance
if( data[maxlen-delta] == data[maxlen] )
{
int len = 0;
- while( len < len_limit && data[len-delta] == data[len] ) ++len;
- if( maxlen < len ) { maxlen = len; *distance = delta - 1; }
+ while( len < available && data[len-delta] == data[len] ) ++len;
+ if( maxlen < len )
+ { maxlen = len; *distance = delta - 1;
+ if( maxlen >= len_limit ) { *ptr0 = *newptr; break; } }
}
- if( maxlen < len_limit )
- {
- *ptr0 = newpos;
- ptr0 = newptr;
- newpos = *ptr0;
- }
- else
- {
- *ptr0 = *newptr;
- maxlen += Mb_true_match_len( &fe->eb.mb, maxlen, *distance + 1,
- max_match_len - maxlen );
- break;
- }
+ *ptr0 = newpos1;
+ ptr0 = newptr;
+ newpos1 = *ptr0;
}
return maxlen;
}
@@ -112,8 +105,7 @@ bool FLZe_encode_member( struct FLZ_encoder * const fe,
for( i = 0; i < num_rep_distances; ++i )
{
- const int tlen = Mb_true_match_len( &fe->eb.mb, 0,
- reps[i] + 1, max_match_len );
+ const int tlen = Mb_true_match_len( &fe->eb.mb, 0, reps[i] + 1 );
if( tlen > len ) { len = tlen; rep = i; }
}
if( len > min_match_len && len + 3 > main_len )
diff --git a/fast_encoder.h b/fast_encoder.h
index df1741d..9825068 100644
--- a/fast_encoder.h
+++ b/fast_encoder.h
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@
struct FLZ_encoder
{
struct LZ_encoder_base eb;
- int key4; /* key made from latest 4 bytes */
+ unsigned key4; /* key made from latest 4 bytes */
};
static inline void FLZe_reset_key4( struct FLZ_encoder * const fe )
@@ -37,12 +37,10 @@ static inline void FLZe_update_and_move( struct FLZ_encoder * const fe, int n )
{
if( Mb_available_bytes( &fe->eb.mb ) >= 4 )
{
- int newpos;
fe->key4 = ( ( fe->key4 << 4 ) ^ fe->eb.mb.buffer[fe->eb.mb.pos+3] ) &
fe->eb.mb.key4_mask;
- newpos = fe->eb.mb.prev_positions[fe->key4];
+ fe->eb.mb.pos_array[fe->eb.mb.cyclic_pos] = fe->eb.mb.prev_positions[fe->key4];
fe->eb.mb.prev_positions[fe->key4] = fe->eb.mb.pos + 1;
- fe->eb.mb.pos_array[fe->eb.mb.cyclic_pos] = newpos;
}
Mb_move_pos( &fe->eb.mb );
}
diff --git a/file_index.c b/file_index.c
new file mode 100644
index 0000000..74adf95
--- /dev/null
+++ b/file_index.c
@@ -0,0 +1,268 @@
+/* Clzip - LZMA lossless data compressor
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "lzip.h"
+#include "file_index.h"
+
+
+static int seek_read( const int fd, uint8_t * const buf, const int size,
+ const long long pos )
+ {
+ if( lseek( fd, pos, SEEK_SET ) == pos )
+ return readblock( fd, buf, size );
+ return 0;
+ }
+
+
+static bool add_error( struct File_index * const fi, const char * const msg )
+ {
+ const int len = strlen( msg );
+ void * tmp = resize_buffer( fi->error, fi->error_size + len + 1 );
+ if( !tmp ) return false;
+ fi->error = (char *)tmp;
+ strncpy( fi->error + fi->error_size, msg, len + 1 );
+ fi->error_size += len;
+ return true;
+ }
+
+
+static bool push_back_member( struct File_index * const fi,
+ const long long dp, const long long ds,
+ const long long mp, const long long ms,
+ const unsigned dict_size )
+ {
+ struct Member * p;
+ void * tmp = resize_buffer( fi->member_vector,
+ ( fi->members + 1 ) * sizeof fi->member_vector[0] );
+ if( !tmp )
+ { add_error( fi, "Not enough memory." ); fi->retval = 1; return false; }
+ fi->member_vector = (struct Member *)tmp;
+ p = &(fi->member_vector[fi->members]);
+ init_member( p, dp, ds, mp, ms, dict_size );
+ ++fi->members;
+ return true;
+ }
+
+
+static void Fi_free_member_vector( struct File_index * const fi )
+ {
+ if( fi->member_vector )
+ { free( fi->member_vector ); fi->member_vector = 0; }
+ fi->members = 0;
+ }
+
+
+static void Fi_reverse_member_vector( struct File_index * const fi )
+ {
+ struct Member tmp;
+ long i;
+ for( i = 0; i < fi->members / 2; ++i )
+ {
+ tmp = fi->member_vector[i];
+ fi->member_vector[i] = fi->member_vector[fi->members-i-1];
+ fi->member_vector[fi->members-i-1] = tmp;
+ }
+ }
+
+
+static void Fi_set_errno_error( struct File_index * const fi,
+ const char * const msg )
+ {
+ add_error( fi, msg ); add_error( fi, strerror( errno ) );
+ fi->retval = 1;
+ }
+
+static void Fi_set_num_error( struct File_index * const fi,
+ const char * const msg, unsigned long long num )
+ {
+ char buf[80];
+ snprintf( buf, sizeof buf, "%s%llu", msg, num );
+ add_error( fi, buf );
+ fi->retval = 2;
+ }
+
+
+/* If successful, push last member and set pos to member header. */
+static bool Fi_skip_trailing_data( struct File_index * const fi,
+ const int fd, long long * const pos )
+ {
+ enum { block_size = 16384,
+ buffer_size = block_size + Ft_size - 1 + Fh_size };
+ uint8_t buffer[buffer_size];
+ int bsize = *pos % block_size; /* total bytes in buffer */
+ int search_size, rd_size;
+ unsigned long long ipos;
+ int i;
+ if( bsize <= buffer_size - block_size ) bsize += block_size;
+ search_size = bsize; /* bytes to search for trailer */
+ rd_size = bsize; /* bytes to read from file */
+ ipos = *pos - rd_size; /* aligned to block_size */
+ if( *pos < min_member_size ) return false;
+
+ while( true )
+ {
+ const uint8_t max_msb = ( ipos + search_size ) >> 56;
+ if( seek_read( fd, buffer, rd_size, ipos ) != rd_size )
+ { Fi_set_errno_error( fi, "Error seeking member trailer: " );
+ return false; }
+ for( i = search_size; i >= Ft_size; --i )
+ if( buffer[i-1] <= max_msb ) /* most significant byte of member_size */
+ {
+ File_header header;
+ File_trailer * trailer = (File_trailer *)( buffer + i - Ft_size );
+ const unsigned long long member_size = Ft_get_member_size( *trailer );
+ unsigned dictionary_size;
+ if( member_size == 0 )
+ { while( i > Ft_size && buffer[i-9] == 0 ) --i; continue; }
+ if( member_size < min_member_size || member_size > ipos + i )
+ continue;
+ if( seek_read( fd, header, Fh_size,
+ ipos + i - member_size ) != Fh_size )
+ { Fi_set_errno_error( fi, "Error reading member header: " );
+ return false; }
+ dictionary_size = Fh_get_dictionary_size( header );
+ if( !Fh_verify_magic( header ) || !Fh_verify_version( header ) ||
+ !isvalid_ds( dictionary_size ) ) continue;
+ if( Fh_verify_prefix( buffer + i, bsize - i ) )
+ {
+ add_error( fi, "Last member in input file is truncated or corrupt." );
+ fi->retval = 2; return false;
+ }
+ *pos = ipos + i - member_size;
+ return push_back_member( fi, 0, Ft_get_data_size( *trailer ), *pos,
+ member_size, dictionary_size );
+ }
+ if( ipos <= 0 )
+ { Fi_set_num_error( fi, "Member size in trailer is corrupt at pos ",
+ *pos - 8 );
+ return false; }
+ bsize = buffer_size;
+ search_size = bsize - Fh_size;
+ rd_size = block_size;
+ ipos -= rd_size;
+ memcpy( buffer + rd_size, buffer, buffer_size - rd_size );
+ }
+ }
+
+
+bool Fi_init( struct File_index * const fi, const int infd,
+ const bool ignore_trailing )
+ {
+ File_header header;
+ long long pos;
+ long i;
+ fi->member_vector = 0;
+ fi->error = 0;
+ fi->isize = lseek( infd, 0, SEEK_END );
+ fi->members = 0;
+ fi->error_size = 0;
+ fi->retval = 0;
+ if( fi->isize < 0 )
+ { Fi_set_errno_error( fi, "Input file is not seekable: " ); return false; }
+ if( fi->isize < min_member_size )
+ { add_error( fi, "Input file is too short." ); fi->retval = 2;
+ return false; }
+ if( fi->isize > INT64_MAX )
+ { add_error( fi, "Input file is too long (2^63 bytes or more)." );
+ fi->retval = 2; return false; }
+
+ if( seek_read( infd, header, Fh_size, 0 ) != Fh_size )
+ { Fi_set_errno_error( fi, "Error reading member header: " ); return false; }
+ if( !Fh_verify_magic( header ) )
+ { add_error( fi, bad_magic_msg ); fi->retval = 2; return false; }
+ if( !Fh_verify_version( header ) )
+ { add_error( fi, bad_version( Fh_version( header ) ) ); fi->retval = 2;
+ return false; }
+ if( !isvalid_ds( Fh_get_dictionary_size( header ) ) )
+ { add_error( fi, bad_dict_msg ); fi->retval = 2; return false; }
+
+ pos = fi->isize; /* always points to a header or to EOF */
+ while( pos >= min_member_size )
+ {
+ File_trailer trailer;
+ unsigned long long member_size;
+ unsigned dictionary_size;
+ if( seek_read( infd, trailer, Ft_size, pos - Ft_size ) != Ft_size )
+ { Fi_set_errno_error( fi, "Error reading member trailer: " ); break; }
+ member_size = Ft_get_member_size( trailer );
+ if( member_size < min_member_size || member_size > (unsigned long long)pos )
+ {
+ if( fi->members > 0 )
+ Fi_set_num_error( fi, "Member size in trailer is corrupt at pos ",
+ pos - 8 );
+ else if( Fi_skip_trailing_data( fi, infd, &pos ) )
+ { if( ignore_trailing ) continue;
+ add_error( fi, trailing_msg ); fi->retval = 2; return false; }
+ break;
+ }
+ if( seek_read( infd, header, Fh_size, pos - member_size ) != Fh_size )
+ { Fi_set_errno_error( fi, "Error reading member header: " ); break; }
+ dictionary_size = Fh_get_dictionary_size( header );
+ if( !Fh_verify_magic( header ) || !Fh_verify_version( header ) ||
+ !isvalid_ds( dictionary_size ) )
+ {
+ if( fi->members > 0 )
+ Fi_set_num_error( fi, "Bad header at pos ", pos - member_size );
+ else if( Fi_skip_trailing_data( fi, infd, &pos ) )
+ { if( ignore_trailing ) continue;
+ add_error( fi, trailing_msg ); fi->retval = 2; return false; }
+ break;
+ }
+ pos -= member_size;
+ if( !push_back_member( fi, 0, Ft_get_data_size( trailer ), pos,
+ member_size, dictionary_size ) )
+ return false;
+ }
+ if( pos != 0 || fi->members <= 0 )
+ {
+ Fi_free_member_vector( fi );
+ if( fi->retval == 0 )
+ { add_error( fi, "Can't create file index." ); fi->retval = 2; }
+ return false;
+ }
+ Fi_reverse_member_vector( fi );
+ for( i = 0; i < fi->members - 1; ++i )
+ {
+ const long long end = block_end( fi->member_vector[i].dblock );
+ if( end < 0 || end > INT64_MAX )
+ {
+ Fi_free_member_vector( fi );
+ add_error( fi, "Data in input file is too long (2^63 bytes or more)." );
+ fi->retval = 2; return false;
+ }
+ fi->member_vector[i+1].dblock.pos = end;
+ }
+ return true;
+ }
+
+
+void Fi_free( struct File_index * const fi )
+ {
+ Fi_free_member_vector( fi );
+ if( fi->error ) { free( fi->error ); fi->error = 0; }
+ fi->error_size = 0;
+ }
diff --git a/file_index.h b/file_index.h
new file mode 100644
index 0000000..d093fa9
--- /dev/null
+++ b/file_index.h
@@ -0,0 +1,90 @@
+/* Clzip - LZMA lossless data compressor
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef INT64_MAX
+#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL
+#endif
+
+
+struct Block
+ {
+ long long pos, size; /* pos + size <= INT64_MAX */
+ };
+
+static inline void init_block( struct Block * const b,
+ const long long p, const long long s )
+ { b->pos = p; b->size = s; }
+
+static inline long long block_end( const struct Block b )
+ { return b.pos + b.size; }
+
+
+struct Member
+ {
+ struct Block dblock, mblock; /* data block, member block */
+ unsigned dictionary_size;
+ };
+
+static inline void init_member( struct Member * const m,
+ const long long dp, const long long ds,
+ const long long mp, const long long ms,
+ const unsigned dict_size )
+ { init_block( &m->dblock, dp, ds ); init_block( &m->mblock, mp, ms );
+ m->dictionary_size = dict_size; }
+
+struct File_index
+ {
+ struct Member * member_vector;
+ char * error;
+ long long isize;
+ long members;
+ int error_size;
+ int retval;
+ };
+
+bool Fi_init( struct File_index * const fi, const int infd,
+ const bool ignore_trailing );
+
+void Fi_free( struct File_index * const fi );
+
+static inline long long Fi_udata_size( const struct File_index * const fi )
+ {
+ if( fi->members <= 0 ) return 0;
+ return block_end( fi->member_vector[fi->members-1].dblock );
+ }
+
+static inline long long Fi_cdata_size( const struct File_index * const fi )
+ {
+ if( fi->members <= 0 ) return 0;
+ return block_end( fi->member_vector[fi->members-1].mblock );
+ }
+
+ /* total size including trailing data (if any) */
+static inline long long Fi_file_size( const struct File_index * const fi )
+ { if( fi->isize >= 0 ) return fi->isize; else return 0; }
+
+static inline const struct Block * Fi_dblock( const struct File_index * const fi,
+ const long i )
+ { return &fi->member_vector[i].dblock; }
+
+static inline const struct Block * Fi_mblock( const struct File_index * const fi,
+ const long i )
+ { return &fi->member_vector[i].mblock; }
+
+static inline unsigned Fi_dictionary_size( const struct File_index * const fi,
+ const long i )
+ { return fi->member_vector[i].dictionary_size; }
diff --git a/list.c b/list.c
new file mode 100644
index 0000000..c72360d
--- /dev/null
+++ b/list.c
@@ -0,0 +1,123 @@
+/* Clzip - LZMA lossless data compressor
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "lzip.h"
+#include "file_index.h"
+
+
+static void list_line( const unsigned long long uncomp_size,
+ const unsigned long long comp_size,
+ const char * const input_filename )
+ {
+ if( uncomp_size > 0 )
+ printf( "%15llu %15llu %6.2f%% %s\n", uncomp_size, comp_size,
+ 100.0 * ( 1.0 - ( (double)comp_size / uncomp_size ) ),
+ input_filename );
+ else
+ printf( "%15llu %15llu -INF%% %s\n", uncomp_size, comp_size,
+ input_filename );
+ }
+
+
+int list_files( const char * const filenames[], const int num_filenames,
+ const bool ignore_trailing )
+ {
+ unsigned long long total_comp = 0, total_uncomp = 0;
+ int files = 0, retval = 0;
+ int i;
+ bool first_post = true;
+ bool stdin_used = false;
+ for( i = 0; i < num_filenames; ++i )
+ {
+ const char * input_filename;
+ struct File_index file_index;
+ struct stat in_stats; /* not used */
+ int infd;
+ const bool from_stdin = ( strcmp( filenames[i], "-" ) == 0 );
+ if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; }
+ input_filename = from_stdin ? "(stdin)" : filenames[i];
+ infd = from_stdin ? STDIN_FILENO :
+ open_instream( input_filename, &in_stats, true, true );
+ if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; }
+
+ Fi_init( &file_index, infd, ignore_trailing );
+ close( infd );
+ if( file_index.retval != 0 )
+ {
+ show_file_error( input_filename, file_index.error, 0 );
+ if( retval < file_index.retval ) retval = file_index.retval;
+ Fi_free( &file_index ); continue;
+ }
+ if( verbosity >= 0 )
+ {
+ const unsigned long long udata_size = Fi_udata_size( &file_index );
+ const unsigned long long cdata_size = Fi_cdata_size( &file_index );
+ total_comp += cdata_size; total_uncomp += udata_size; ++files;
+ if( first_post )
+ {
+ first_post = false;
+ if( verbosity >= 1 ) fputs( " dict memb trail ", stdout );
+ fputs( " uncompressed compressed saved name\n", stdout );
+ }
+ if( verbosity >= 1 )
+ {
+ long long trailing_size;
+ unsigned dictionary_size = 0;
+ long i;
+ for( i = 0; i < file_index.members; ++i )
+ dictionary_size =
+ max( dictionary_size, Fi_dictionary_size( &file_index, i ) );
+ trailing_size = Fi_file_size( &file_index ) - cdata_size;
+ printf( "%s %5ld %6lld ", format_ds( dictionary_size ),
+ file_index.members, trailing_size );
+ }
+ list_line( udata_size, cdata_size, input_filename );
+
+ if( verbosity >= 2 && file_index.members > 1 )
+ {
+ long i;
+ fputs( " member data_pos data_size member_pos member_size\n", stdout );
+ for( i = 0; i < file_index.members; ++i )
+ {
+ const struct Block * db = Fi_dblock( &file_index, i );
+ const struct Block * mb = Fi_mblock( &file_index, i );
+ printf( "%5ld %15llu %15llu %15llu %15llu\n",
+ i + 1, db->pos, db->size, mb->pos, mb->size );
+ }
+ first_post = true; /* reprint heading after list of members */
+ }
+ fflush( stdout );
+ }
+ Fi_free( &file_index );
+ }
+ if( verbosity >= 0 && files > 1 )
+ {
+ if( verbosity >= 1 ) fputs( " ", stdout );
+ list_line( total_uncomp, total_comp, "(totals)" );
+ fflush( stdout );
+ }
+ return retval;
+ }
diff --git a/lzip.h b/lzip.h
index 5274500..49b6b93 100644
--- a/lzip.h
+++ b/lzip.h
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -49,6 +49,7 @@ enum {
min_dictionary_size = 1 << min_dictionary_bits, /* >= modeled_distances */
max_dictionary_bits = 29,
max_dictionary_size = 1 << max_dictionary_bits,
+ min_member_size = 36,
literal_context_bits = 3,
literal_pos_state_bits = 0, /* not used */
pos_state_bits = 2,
@@ -131,9 +132,9 @@ static inline void Pp_init( struct Pretty_print * const pp,
pp->stdin_name = "(stdin)";
pp->longest_name = 0;
pp->first_post = false;
- stdin_name_len = strlen( pp->stdin_name );
if( verbosity <= 0 ) return;
+ stdin_name_len = strlen( pp->stdin_name );
for( i = 0; i < num_filenames; ++i )
{
const char * const s = filenames[i];
@@ -182,8 +183,10 @@ static inline void CRC32_update_buf( uint32_t * const crc,
const int size )
{
int i;
+ uint32_t c = *crc;
for( i = 0; i < size; ++i )
- *crc = crc32[(*crc^buffer[i])&0xFF] ^ ( *crc >> 8 );
+ c = crc32[(c^buffer[i])&0xFF] ^ ( c >> 8 );
+ *crc = c;
}
@@ -243,7 +246,7 @@ static inline bool Fh_set_dictionary_size( File_header data, const unsigned sz )
{
const unsigned base_size = 1 << data[5];
const unsigned fraction = base_size / 16;
- int i;
+ unsigned i;
for( i = 7; i >= 1; --i )
if( base_size - ( i * fraction ) >= sz )
{ data[5] |= ( i << 5 ); break; }
@@ -290,14 +293,30 @@ static inline void Ft_set_member_size( File_trailer data, unsigned long long sz
{ int i; for( i = 12; i <= 19; ++i ) { data[i] = (uint8_t)sz; sz >>= 8; } }
+static const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
+static const char * const bad_dict_msg = "Invalid dictionary size in member header.";
+static const char * const trailing_msg = "Trailing data not allowed.";
+
/* defined in decoder.c */
int readblock( const int fd, uint8_t * const buf, const int size );
int writeblock( const int fd, const uint8_t * const buf, const int size );
+/* defined in list.c */
+int list_files( const char * const filenames[], const int num_filenames,
+ const bool ignore_trailing );
+
/* defined in main.c */
extern int verbosity;
+struct stat;
+const char * bad_version( const unsigned version );
+const char * format_ds( const unsigned dictionary_size );
+int open_instream( const char * const name, struct stat * const in_statsp,
+ const bool no_ofile, const bool reg_only );
+void * resize_buffer( void * buf, const unsigned min_size );
void cleanup_and_fail( const int retval );
void show_error( const char * const msg, const int errcode, const bool help );
+void show_file_error( const char * const filename, const char * const msg,
+ const int errcode );
void internal_error( const char * const msg );
struct Matchfinder_base;
void show_progress( const unsigned long long partial_size,
diff --git a/main.c b/main.c
index ecf8dd8..1af09f3 100644
--- a/main.c
+++ b/main.c
@@ -1,5 +1,5 @@
/* Clzip - LZMA lossless data compressor
- Copyright (C) 2010-2016 Antonio Diaz Diaz.
+ Copyright (C) 2010-2017 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -71,10 +71,10 @@ int verbosity = 0;
const char * const Program_name = "Clzip";
const char * const program_name = "clzip";
-const char * const program_year = "2016";
+const char * const program_year = "2017";
const char * invocation_name = 0;
-struct { const char * from; const char * to; } const known_extensions[] = {
+const struct { const char * from; const char * to; } known_extensions[] = {
{ ".lz", "" },
{ ".tlz", ".tar" },
{ 0, 0 } };
@@ -85,7 +85,7 @@ struct Lzma_options
int match_len_limit; /* 5 .. 273 */
};
-enum Mode { m_compress, m_decompress, m_test };
+enum Mode { m_compress, m_decompress, m_list, m_test };
char * output_filename = 0;
int outfd = -1;
@@ -106,6 +106,7 @@ static void show_help( void )
" -f, --force overwrite existing output files\n"
" -F, --recompress force re-compression of compressed files\n"
" -k, --keep keep (don't delete) input files\n"
+ " -l, --list print (un)compressed file sizes\n"
" -m, --match-length=<bytes> set match length limit in bytes [36]\n"
" -o, --output=<file> if reading standard input, write to <file>\n"
" -q, --quiet suppress all messages\n"
@@ -145,23 +146,38 @@ static void show_version( void )
}
+const char * bad_version( const unsigned version )
+ {
+ static char buf[80];
+ snprintf( buf, sizeof buf, "Version %u member format not supported.",
+ version );
+ return buf;
+ }
+
+
+const char * format_ds( const unsigned dictionary_size )
+ {
+ enum { bufsize = 16, factor = 1024 };
+ static char buf[bufsize];
+ const char * const prefix[8] =
+ { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" };
+ const char * p = "";
+ const char * np = " ";
+ unsigned num = dictionary_size, i;
+ bool exact = ( num % factor == 0 );
+
+ for( i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i )
+ { num /= factor; if( num % factor != 0 ) exact = false;
+ p = prefix[i]; np = ""; }
+ snprintf( buf, bufsize, "%s%4u %sB", np, num, p );
+ return buf;
+ }
+
+
static void show_header( const unsigned dictionary_size )
{
if( verbosity >= 3 )
- {
- const char * const prefix[8] =
- { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" };
- enum { factor = 1024 };
- const char * p = "";
- const char * np = " ";
- unsigned num = dictionary_size, i;
- bool exact = ( num % factor == 0 );
-
- for( i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i )
- { num /= factor; if( num % factor != 0 ) exact = false;
- p = prefix[i]; np = ""; }
- fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p );
- }
+ fprintf( stderr, "dictionary %s. ", format_ds( dictionary_size ) );
}
@@ -181,8 +197,8 @@ static unsigned long long getnum( const char * const ptr,
if( !errno && tail[0] )
{
- const int factor = ( tail[1] == 'i' ) ? 1024 : 1000;
- int exponent = 0; /* 0 = bad multiplier */
+ const unsigned factor = ( tail[1] == 'i' ) ? 1024 : 1000;
+ int exponent = 0; /* 0 = bad multiplier */
int i;
switch( tail[0] )
{
@@ -220,7 +236,7 @@ static unsigned long long getnum( const char * const ptr,
static int get_dict_size( const char * const arg )
{
char * tail;
- const int bits = strtol( arg, &tail, 0 );
+ const long bits = strtol( arg, &tail, 0 );
if( bits >= min_dictionary_bits &&
bits <= max_dictionary_bits && *tail == 0 )
return ( 1 << bits );
@@ -228,68 +244,79 @@ static int get_dict_size( const char * const arg )
}
+void set_mode( enum Mode * const program_modep, const enum Mode new_mode )
+ {
+ if( *program_modep != m_compress && *program_modep != new_mode )
+ {
+ show_error( "Only one operation can be specified.", 0, true );
+ exit( 1 );
+ }
+ *program_modep = new_mode;
+ }
+
+
static int extension_index( const char * const name )
{
- int i;
- for( i = 0; known_extensions[i].from; ++i )
+ int eindex;
+ for( eindex = 0; known_extensions[eindex].from; ++eindex )
{
- const char * const ext = known_extensions[i].from;
+ const char * const ext = known_extensions[eindex].from;
const unsigned name_len = strlen( name );
const unsigned ext_len = strlen( ext );
if( name_len > ext_len &&
strncmp( name + name_len - ext_len, ext, ext_len ) == 0 )
- return i;
+ return eindex;
}
return -1;
}
-static int open_instream( const char * const name, struct stat * const in_statsp,
- const enum Mode program_mode, const int eindex,
- const bool recompress, const bool to_stdout )
+int open_instream( const char * const name, struct stat * const in_statsp,
+ const bool no_ofile, const bool reg_only )
{
- int infd = -1;
- if( program_mode == m_compress && !recompress && eindex >= 0 )
- {
- if( verbosity >= 0 )
- fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n",
- program_name, name, known_extensions[eindex].from );
- }
+ int infd = open( name, O_RDONLY | O_BINARY );
+ if( infd < 0 )
+ show_file_error( name, "Can't open input file", errno );
else
{
- infd = open( name, O_RDONLY | O_BINARY );
- if( infd < 0 )
+ const int i = fstat( infd, in_statsp );
+ const mode_t mode = in_statsp->st_mode;
+ const bool can_read = ( i == 0 && !reg_only &&
+ ( S_ISBLK( mode ) || S_ISCHR( mode ) ||
+ S_ISFIFO( mode ) || S_ISSOCK( mode ) ) );
+ if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) )
{
if( verbosity >= 0 )
- fprintf( stderr, "%s: Can't open input file '%s': %s\n",
- program_name, name, strerror( errno ) );
- }
- else
- {
- const int i = fstat( infd, in_statsp );
- const mode_t mode = in_statsp->st_mode;
- const bool can_read = ( i == 0 &&
- ( S_ISBLK( mode ) || S_ISCHR( mode ) ||
- S_ISFIFO( mode ) || S_ISSOCK( mode ) ) );
- const bool no_ofile = ( to_stdout || program_mode == m_test );
- if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) )
- {
- if( verbosity >= 0 )
- fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n",
- program_name, name,
- ( can_read && !no_ofile ) ?
- ",\n and '--stdout' was not specified" : "" );
- close( infd );
- infd = -1;
- }
+ fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n",
+ program_name, name,
+ ( can_read && !no_ofile ) ?
+ ",\n and '--stdout' was not specified" : "" );
+ close( infd );
+ infd = -1;
}
}
return infd;
}
+static int open_instream2( const char * const name, struct stat * const in_statsp,
+ const enum Mode program_mode, const int eindex,
+ const bool recompress, const bool to_stdout )
+ {
+ const bool no_ofile = ( to_stdout || program_mode == m_test );
+ if( program_mode == m_compress && !recompress && eindex >= 0 )
+ {
+ if( verbosity >= 0 )
+ fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n",
+ program_name, name, known_extensions[eindex].from );
+ return -1;
+ }
+ return open_instream( name, in_statsp, no_ofile, false );
+ }
+
+
/* assure at least a minimum size for buffer 'buf' */
-static void * resize_buffer( void * buf, const int min_size )
+void * resize_buffer( void * buf, const unsigned min_size )
{
if( buf ) buf = realloc( buf, min_size );
else buf = malloc( min_size );
@@ -312,19 +339,19 @@ static void set_c_outname( const char * const name, const bool multifile )
}
-static void set_d_outname( const char * const name, const int i )
+static void set_d_outname( const char * const name, const int eindex )
{
const unsigned name_len = strlen( name );
- if( i >= 0 )
+ if( eindex >= 0 )
{
- const char * const from = known_extensions[i].from;
+ const char * const from = known_extensions[eindex].from;
const unsigned from_len = strlen( from );
if( name_len > from_len )
{
output_filename = resize_buffer( output_filename, name_len +
- strlen( known_extensions[0].to ) + 1 );
+ strlen( known_extensions[eindex].to ) + 1 );
strcpy( output_filename, name );
- strcpy( output_filename + name_len - from_len, known_extensions[i].to );
+ strcpy( output_filename + name_len - from_len, known_extensions[eindex].to );
return;
}
}
@@ -360,7 +387,8 @@ static bool open_outstream( const bool force, const bool from_stdin )
}
-static bool check_tty( const int infd, const enum Mode program_mode )
+static bool check_tty( const char * const input_filename, const int infd,
+ const enum Mode program_mode )
{
if( program_mode == m_compress && isatty( outfd ) )
{
@@ -370,7 +398,8 @@ static bool check_tty( const int infd, const enum Mode program_mode )
if( ( program_mode == m_decompress || program_mode == m_test ) &&
isatty( infd ) )
{
- show_error( "I won't read compressed data from a terminal.", 0, true );
+ show_file_error( input_filename,
+ "I won't read compressed data from a terminal.", 0 );
return false;
}
return true;
@@ -467,7 +496,7 @@ static int compress( const unsigned long long member_size,
bool error = false;
if( zero )
{
- encoder.fe = (struct FLZ_encoder *)malloc( sizeof (struct FLZ_encoder) );
+ encoder.fe = (struct FLZ_encoder *)malloc( sizeof *encoder.fe );
if( !encoder.fe || !FLZe_init( encoder.fe, infd, outfd ) ) error = true;
else encoder.eb = &encoder.fe->eb;
}
@@ -477,7 +506,7 @@ static int compress( const unsigned long long member_size,
if( Fh_set_dictionary_size( header, encoder_options->dictionary_size ) &&
encoder_options->match_len_limit >= min_match_len_limit &&
encoder_options->match_len_limit <= max_match_len )
- encoder.e = (struct LZ_encoder *)malloc( sizeof (struct LZ_encoder) );
+ encoder.e = (struct LZ_encoder *)malloc( sizeof *encoder.e );
else internal_error( "invalid argument to encoder." );
if( !encoder.e || !LZe_init( encoder.e, Fh_get_dictionary_size( header ),
encoder_options->match_len_limit, infd, outfd ) )
@@ -538,10 +567,10 @@ static int compress( const unsigned long long member_size,
}
-static unsigned char xdigit( const int value )
+static unsigned char xdigit( const unsigned value )
{
- if( value >= 0 && value <= 9 ) return '0' + value;
- if( value >= 10 && value <= 15 ) return 'A' + value - 10;
+ if( value <= 9 ) return '0' + value;
+ if( value <= 15 ) return 'A' + value - 10;
return 0;
}
@@ -554,28 +583,21 @@ static bool show_trailing_data( const uint8_t * const data, const int size,
{
int i;
char buf[80];
- int len = snprintf( buf, sizeof buf, "%strailing data = ",
- all ? "" : "first bytes of " );
- bool text = true;
- for( i = 0; i < size; ++i )
- if( !isprint( data[i] ) ) { text = false; break; }
- if( text )
+ unsigned len = max( 0, snprintf( buf, sizeof buf, "%strailing data = ",
+ all ? "" : "first bytes of " ) );
+ for( i = 0; i < size && len + 2 < sizeof buf; ++i )
{
- if( len > 0 && len < (int)sizeof buf )
- snprintf( buf + len, sizeof buf - len, "'%.*s'", size, (const char *)data );
- }
- else
- {
- for( i = 0; i < size && len > 0 && len + 3 < (int)sizeof buf; ++i )
- {
- if( i > 0 ) buf[len++] = ' ';
- buf[len++] = xdigit( data[i] >> 4 );
- buf[len++] = xdigit( data[i] & 0x0F );
- buf[len] = 0;
- }
+ buf[len++] = xdigit( data[i] >> 4 );
+ buf[len++] = xdigit( data[i] & 0x0F );
+ buf[len++] = ' ';
}
+ if( len < sizeof buf ) buf[len++] = '\'';
+ for( i = 0; i < size && len < sizeof buf; ++i )
+ { if( isprint( data[i] ) ) buf[len++] = data[i]; else buf[len++] = '.'; }
+ if( len < sizeof buf ) buf[len++] = '\'';
+ if( len < sizeof buf ) buf[len] = 0; else buf[sizeof buf - 1] = 0;
Pp_show_msg( pp, buf );
- if( !ignore_trailing ) show_error( "Trailing data not allowed.", 0, false );
+ if( !ignore_trailing ) show_file_error( pp->name, trailing_msg, 0 );
}
return ignore_trailing;
}
@@ -615,24 +637,19 @@ static int decompress( const int infd, struct Pretty_print * const pp,
if( !Fh_verify_magic( header ) )
{
if( first_member )
- { Pp_show_msg( pp, "Bad magic number (file not in lzip format)." );
- retval = 2; }
+ { show_file_error( pp->name, bad_magic_msg, 0 ); retval = 2; }
else if( !show_trailing_data( header, size, pp, false, ignore_trailing ) )
retval = 2;
break;
}
if( !Fh_verify_version( header ) )
{
- if( verbosity >= 0 )
- { Pp_show_msg( pp, 0 );
- fprintf( stderr, "Version %d member format not supported.\n",
- Fh_version( header ) ); }
+ Pp_show_msg( pp, bad_version( Fh_version( header ) ) );
retval = 2; break;
}
dictionary_size = Fh_get_dictionary_size( header );
if( !isvalid_ds( dictionary_size ) )
- { Pp_show_msg( pp, "Invalid dictionary size in member header." );
- retval = 2; break; }
+ { Pp_show_msg( pp, bad_dict_msg ); retval = 2; break; }
if( verbosity >= 2 || ( verbosity == 1 && first_member ) )
{ Pp_show_msg( pp, 0 ); show_header( dictionary_size ); }
@@ -693,6 +710,16 @@ void show_error( const char * const msg, const int errcode, const bool help )
}
+void show_file_error( const char * const filename, const char * const msg,
+ const int errcode )
+ {
+ if( verbosity < 0 ) return;
+ fprintf( stderr, "%s: %s: %s", program_name, filename, msg );
+ if( errcode > 0 ) fprintf( stderr, ": %s", strerror( errcode ) );
+ fputc( '\n', stderr );
+ }
+
+
void internal_error( const char * const msg )
{
if( verbosity >= 0 )
@@ -746,7 +773,6 @@ int main( const int argc, const char * const argv[] )
const unsigned long long max_volume_size = 0x4000000000000000ULL;
unsigned long long member_size = max_member_size;
unsigned long long volume_size = 0;
- const char * input_filename = "";
const char * default_output_filename = "";
const char ** filenames = 0;
int num_filenames = 0;
@@ -759,8 +785,8 @@ int main( const int argc, const char * const argv[] )
bool force = false;
bool ignore_trailing = true;
bool keep_input_files = false;
- bool stdin_used = false;
bool recompress = false;
+ bool stdin_used = false;
bool to_stdout = false;
bool zero = false;
struct Pretty_print pp;
@@ -785,6 +811,7 @@ int main( const int argc, const char * const argv[] )
{ 'F', "recompress", ap_no },
{ 'h', "help", ap_no },
{ 'k', "keep", ap_no },
+ { 'l', "list", ap_no },
{ 'm', "match-length", ap_yes },
{ 'n', "threads", ap_yes },
{ 'o', "output", ap_yes },
@@ -794,7 +821,7 @@ int main( const int argc, const char * const argv[] )
{ 't', "test", ap_no },
{ 'v', "verbose", ap_no },
{ 'V', "version", ap_no },
- { 0 , 0, ap_no } };
+ { 0 , 0, ap_no } };
struct Arg_parser parser;
@@ -820,11 +847,12 @@ int main( const int argc, const char * const argv[] )
case 'a': ignore_trailing = false; break;
case 'b': member_size = getnum( arg, 100000, max_member_size ); break;
case 'c': to_stdout = true; break;
- case 'd': program_mode = m_decompress; break;
+ case 'd': set_mode( &program_mode, m_decompress ); break;
case 'f': force = true; break;
case 'F': recompress = true; break;
case 'h': show_help(); return 0;
case 'k': keep_input_files = true; break;
+ case 'l': set_mode( &program_mode, m_list ); break;
case 'm': encoder_options.match_len_limit =
getnum( arg, min_match_len_limit, max_match_len );
zero = false; break;
@@ -834,7 +862,7 @@ int main( const int argc, const char * const argv[] )
case 's': encoder_options.dictionary_size = get_dict_size( arg );
zero = false; break;
case 'S': volume_size = getnum( arg, 100000, max_volume_size ); break;
- case 't': program_mode = m_test; break;
+ case 't': set_mode( &program_mode, m_test ); break;
case 'v': if( verbosity < 4 ) ++verbosity; break;
case 'V': show_version(); return 0;
default : internal_error( "uncaught option." );
@@ -846,14 +874,6 @@ int main( const int argc, const char * const argv[] )
setmode( STDOUT_FILENO, O_BINARY );
#endif
- if( program_mode == m_test )
- outfd = -1;
- else if( program_mode == m_compress )
- {
- Dis_slots_init();
- Prob_prices_init();
- }
-
num_filenames = max( 1, ap_arguments( &parser ) - argind );
filenames = resize_buffer( filenames, num_filenames * sizeof filenames[0] );
filenames[0] = "-";
@@ -864,6 +884,17 @@ int main( const int argc, const char * const argv[] )
if( strcmp( filenames[i], "-" ) != 0 ) filenames_given = true;
}
+ if( program_mode == m_list )
+ return list_files( filenames, num_filenames, ignore_trailing );
+
+ if( program_mode == m_test )
+ outfd = -1;
+ else if( program_mode == m_compress )
+ {
+ Dis_slots_init();
+ Prob_prices_init();
+ }
+
if( !to_stdout && program_mode != m_test &&
( filenames_given || default_output_filename[0] ) )
set_signals();
@@ -873,6 +904,7 @@ int main( const int argc, const char * const argv[] )
output_filename = resize_buffer( output_filename, 1 );
for( i = 0; i < num_filenames; ++i )
{
+ const char * input_filename = "";
int tmp;
struct stat in_stats;
const struct stat * in_statsp;
@@ -881,7 +913,6 @@ int main( const int argc, const char * const argv[] )
if( !filenames[i][0] || strcmp( filenames[i], "-" ) == 0 )
{
if( stdin_used ) continue; else stdin_used = true;
- input_filename = "";
infd = STDIN_FILENO;
if( program_mode != m_test )
{
@@ -908,10 +939,9 @@ int main( const int argc, const char * const argv[] )
}
else
{
- const int eindex = extension_index( filenames[i] );
- input_filename = filenames[i];
- infd = open_instream( input_filename, &in_stats, program_mode,
- eindex, recompress, to_stdout );
+ const int eindex = extension_index( input_filename = filenames[i] );
+ infd = open_instream2( input_filename, &in_stats, program_mode,
+ eindex, recompress, to_stdout );
if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; }
if( program_mode != m_test )
{
@@ -931,14 +961,15 @@ int main( const int argc, const char * const argv[] )
}
}
- if( !check_tty( infd, program_mode ) )
+ Pp_set_name( &pp, input_filename );
+ if( !check_tty( pp.name, infd, program_mode ) )
{
if( retval < 1 ) retval = 1;
+ if( program_mode == m_test ) { close( infd ); infd = -1; continue; }
cleanup_and_fail( retval );
}
in_statsp = input_filename[0] ? &in_stats : 0;
- Pp_set_name( &pp, input_filename );
if( program_mode == m_compress )
tmp = compress( member_size, volume_size, infd, &encoder_options, &pp,
in_statsp, zero );
diff --git a/testsuite/check.sh b/testsuite/check.sh
index 52347b4..add0722 100755
--- a/testsuite/check.sh
+++ b/testsuite/check.sh
@@ -1,6 +1,6 @@
#! /bin/sh
# check script for Clzip - LZMA lossless data compressor
-# Copyright (C) 2010-2016 Antonio Diaz Diaz.
+# Copyright (C) 2010-2017 Antonio Diaz Diaz.
#
# This script is free software: you have unlimited permission
# to copy, distribute and modify it.
@@ -17,12 +17,12 @@ if [ ! -f "${LZIP}" ] || [ ! -x "${LZIP}" ] ; then
exit 1
fi
-if [ -e "${LZIP}" ] 2> /dev/null ; then true
-else
+[ -e "${LZIP}" ] 2> /dev/null ||
+ {
echo "$0: a POSIX shell is required to run the tests"
echo "Try bash -c \"$0 $1 $2\""
exit 1
-fi
+ }
if [ -d tmp ] ; then rm -rf tmp ; fi
mkdir tmp
@@ -31,143 +31,229 @@ cd "${objdir}"/tmp || framework_failure
cat "${testdir}"/test.txt > in || framework_failure
in_lz="${testdir}"/test.txt.lz
fail=0
+test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; }
printf "testing clzip-%s..." "$2"
"${LZIP}" -fkqm4 in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO
"${LZIP}" -fkqm274 in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -fkqs-1 in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -fkqs0 in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -fkqs4095 in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -fkqs513MiB in
-if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO
+for i in bad_size -1 0 4095 513MiB 1G 1T 1P 1E 1Z 1Y 10KB ; do
+ "${LZIP}" -fkqs $i in
+ { [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO $i
+done
+"${LZIP}" -lq in
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -tq in
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -tq < in
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -cdq in
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -cdq < in
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
-dd if="${in_lz}" bs=1 count=6 2> /dev/null | "${LZIP}" -tq
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
-dd if="${in_lz}" bs=1 count=20 2> /dev/null | "${LZIP}" -tq
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
+# these are for code coverage
+"${LZIP}" -lt "${in_lz}" 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" -cdl "${in_lz}" > out 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" -cdt "${in_lz}" > out 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" -t -- nx_file 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --help > /dev/null || test_failed $LINENO
+"${LZIP}" -n1 -V > /dev/null || test_failed $LINENO
+"${LZIP}" -m 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" -z 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --bad_option 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --t 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --test=2 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --output= 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" --output 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
+printf "LZIP\001-.............................." | "${LZIP}" -t 2> /dev/null
+printf "LZIP\002-.............................." | "${LZIP}" -t 2> /dev/null
+printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null
printf "\ntesting decompression..."
-"${LZIP}" -t "${in_lz}"
-if [ $? = 0 ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -cd "${in_lz}" > copy || fail=1
-cmp in copy || fail=1
-printf .
+"${LZIP}" -lq "${in_lz}" || test_failed $LINENO
+"${LZIP}" -t "${in_lz}" || test_failed $LINENO
+"${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO
+cmp in copy || test_failed $LINENO
rm -f copy
cat "${in_lz}" > copy.lz || framework_failure
-"${LZIP}" -dk copy.lz || fail=1
-cmp in copy || fail=1
+"${LZIP}" -dk copy.lz || test_failed $LINENO
+cmp in copy || test_failed $LINENO
printf "to be overwritten" > copy || framework_failure
-"${LZIP}" -dq copy.lz
-if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" -d copy.lz 2> /dev/null
+[ $? = 1 ] || test_failed $LINENO
"${LZIP}" -df copy.lz
-if [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; then
- printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO
printf "to be overwritten" > copy || framework_failure
-"${LZIP}" -df -o copy < "${in_lz}" || fail=1
-cmp in copy || fail=1
-printf .
+"${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO
+cmp in copy || test_failed $LINENO
rm -f copy
-"${LZIP}" < in > anyothername || fail=1
-"${LZIP}" -d -o copy - anyothername - < "${in_lz}"
-if [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; then
- printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" < in > anyothername || test_failed $LINENO
+"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null
+{ [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; } ||
+ test_failed $LINENO
rm -f copy anyothername.out
+"${LZIP}" -lq in "${in_lz}"
+[ $? = 2 ] || test_failed $LINENO
+"${LZIP}" -lq nx_file.lz "${in_lz}"
+[ $? = 1 ] || test_failed $LINENO
"${LZIP}" -tq in "${in_lz}"
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -tq foo.lz "${in_lz}"
-if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
+"${LZIP}" -tq nx_file.lz "${in_lz}"
+[ $? = 1 ] || test_failed $LINENO
"${LZIP}" -cdq in "${in_lz}" > copy
-if [ $? = 2 ] && cat copy in | cmp in - ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -cdq foo.lz "${in_lz}" > copy
-if [ $? = 1 ] && cmp in copy ; then printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 2 ] && cat copy in | cmp in - ; } || test_failed $LINENO
+"${LZIP}" -cdq nx_file.lz "${in_lz}" > copy
+{ [ $? = 1 ] && cmp in copy ; } || test_failed $LINENO
rm -f copy
cat "${in_lz}" > copy.lz || framework_failure
+for i in 1 2 3 4 5 6 7 ; do
+ printf "g" >> copy.lz || framework_failure
+ "${LZIP}" -alvv copy.lz "${in_lz}" > /dev/null 2>&1
+ [ $? = 2 ] || test_failed $LINENO $i
+ "${LZIP}" -atvvvv copy.lz "${in_lz}" 2> /dev/null
+ [ $? = 2 ] || test_failed $LINENO $i
+done
"${LZIP}" -dq in copy.lz
-if [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; then
- printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -dq foo.lz copy.lz
-if [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e foo ] && cmp in copy ; then
- printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; } ||
+ test_failed $LINENO
+"${LZIP}" -dq nx_file.lz copy.lz
+{ [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e nx_file ] && cmp in copy ; } ||
+ test_failed $LINENO
cat in in > in2 || framework_failure
-"${LZIP}" -o copy2 < in2 || fail=1
-"${LZIP}" -t copy2.lz || fail=1
-"${LZIP}" -cd copy2.lz > copy2 || fail=1
-cmp in2 copy2 || fail=1
-printf .
+cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure
+"${LZIP}" -lq in2.lz || test_failed $LINENO
+"${LZIP}" -t in2.lz || test_failed $LINENO
+"${LZIP}" -cd in2.lz > copy2 || test_failed $LINENO
+cmp in2 copy2 || test_failed $LINENO
+
+"${LZIP}" --output=copy2 < in2 || test_failed $LINENO
+"${LZIP}" -lq copy2.lz || test_failed $LINENO
+"${LZIP}" -t copy2.lz || test_failed $LINENO
+"${LZIP}" -cd copy2.lz > copy2 || test_failed $LINENO
+cmp in2 copy2 || test_failed $LINENO
-printf "garbage" >> copy2.lz || framework_failure
+printf "\ngarbage" >> copy2.lz || framework_failure
+"${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO
rm -f copy2
+"${LZIP}" -alq copy2.lz
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -atq copy2.lz
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -atq < copy2.lz
-if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+[ $? = 2 ] || test_failed $LINENO
"${LZIP}" -adkq copy2.lz
-if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO
"${LZIP}" -adkq -o copy2 < copy2.lz
-if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi
+{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO
printf "to be overwritten" > copy2 || framework_failure
-"${LZIP}" -df copy2.lz || fail=1
-cmp in2 copy2 || fail=1
-printf .
+"${LZIP}" -df copy2.lz || test_failed $LINENO
+cmp in2 copy2 || test_failed $LINENO
printf "\ntesting compression..."
-"${LZIP}" -cfq "${in_lz}" > out # /dev/null is a tty on OS/2
-if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
-"${LZIP}" -cF "${in_lz}" > out || fail=1
-"${LZIP}" -cd out | "${LZIP}" -d > copy || fail=1
-cmp in copy || fail=1
-printf .
+"${LZIP}" -cf "${in_lz}" > out 2> /dev/null # /dev/null is a tty on OS/2
+[ $? = 1 ] || test_failed $LINENO
+"${LZIP}" -cFvvm36 "${in_lz}" > out 2> /dev/null || test_failed $LINENO
+"${LZIP}" -cd out | "${LZIP}" -d > copy || test_failed $LINENO
+cmp in copy || test_failed $LINENO
for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do
- "${LZIP}" -k -$i in || fail=1
- mv -f in.lz copy.lz || fail=1
- printf "garbage" >> copy.lz || fail=1
- "${LZIP}" -df copy.lz || fail=1
- cmp in copy || fail=1
+ "${LZIP}" -k -$i in || test_failed $LINENO $i
+ mv -f in.lz copy.lz || test_failed $LINENO $i
+ printf "garbage" >> copy.lz || framework_failure
+ "${LZIP}" -df copy.lz || test_failed $LINENO $i
+ cmp in copy || test_failed $LINENO $i
done
-printf .
for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do
- "${LZIP}" -c -$i in > out || fail=1
- printf "g" >> out || fail=1
- "${LZIP}" -cd out > copy || fail=1
- cmp in copy || fail=1
+ "${LZIP}" -c -$i in > out || test_failed $LINENO $i
+ printf "g" >> out || framework_failure
+ "${LZIP}" -cd out > copy || test_failed $LINENO $i
+ cmp in copy || test_failed $LINENO $i
done
-printf .
for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do
- "${LZIP}" -$i < in > out || fail=1
- "${LZIP}" -d < out > copy || fail=1
- cmp in copy || fail=1
+ "${LZIP}" -$i < in > out || test_failed $LINENO $i
+ "${LZIP}" -d < out > copy || test_failed $LINENO $i
+ cmp in copy || test_failed $LINENO $i
done
-printf .
for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do
- "${LZIP}" -f -$i -o out < in || fail=1
- "${LZIP}" -df -o copy < out.lz || fail=1
- cmp in copy || fail=1
+ "${LZIP}" -f -$i -o out < in || test_failed $LINENO $i
+ "${LZIP}" -df -o copy < out.lz || test_failed $LINENO $i
+ cmp in copy || test_failed $LINENO $i
done
-printf .
+
+cat in in in in in in in in > in8 || framework_failure
+"${LZIP}" -1s12 -S100k -o out < in8 || test_failed $LINENO
+"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO
+"${LZIP}" -cd out00001.lz out00002.lz | cmp in8 - || test_failed $LINENO
+rm -f out00001.lz
+"${LZIP}" -1ks4Ki -b100000 in8 || test_failed $LINENO
+"${LZIP}" -t in8.lz || test_failed $LINENO
+"${LZIP}" -cd in8.lz | cmp in8 - || test_failed $LINENO
+rm -f in8
+"${LZIP}" -0 -S100k -o out < in8.lz || test_failed $LINENO
+"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO
+"${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO
+rm -f out00001.lz out00002.lz
+"${LZIP}" -0kF -b100k in8.lz || test_failed $LINENO
+"${LZIP}" -t in8.lz.lz || test_failed $LINENO
+"${LZIP}" -cd in8.lz.lz | cmp in8.lz - || test_failed $LINENO
+rm -f in8.lz in8.lz.lz
+
+printf "\ntesting bad input..."
+
+cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure
+if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null &&
+ [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then
+ for i in 6 20 14734 14753 14754 14755 14756 14757 14758 ; do
+ dd if=in3.lz of=trunc.lz bs=$i count=1 2> /dev/null
+ "${LZIP}" -lq trunc.lz
+ [ $? = 2 ] || test_failed $LINENO $i
+ "${LZIP}" -t trunc.lz 2> /dev/null
+ [ $? = 2 ] || test_failed $LINENO $i
+ "${LZIP}" -tq < trunc.lz
+ [ $? = 2 ] || test_failed $LINENO $i
+ "${LZIP}" -cdq trunc.lz > out
+ [ $? = 2 ] || test_failed $LINENO $i
+ "${LZIP}" -dq < trunc.lz > out
+ [ $? = 2 ] || test_failed $LINENO $i
+ done
+else
+ printf "\nwarning: skipping truncation test: 'dd' does not work on your system."
+fi
+
+cat "${in_lz}" > ingin.lz || framework_failure
+printf "g" >> ingin.lz || framework_failure
+cat "${in_lz}" >> ingin.lz || framework_failure
+"${LZIP}" -lq ingin.lz
+[ $? = 2 ] || test_failed $LINENO
+"${LZIP}" -t ingin.lz || test_failed $LINENO
+"${LZIP}" -cd ingin.lz > copy || test_failed $LINENO
+cmp in copy || test_failed $LINENO
+"${LZIP}" -t < ingin.lz || test_failed $LINENO
+"${LZIP}" -d < ingin.lz > copy || test_failed $LINENO
+cmp in copy || test_failed $LINENO
echo
if [ ${fail} = 0 ] ; then
diff --git a/testsuite/test.txt.lz b/testsuite/test.txt.lz
index 41d2e39..22cea6e 100644
--- a/testsuite/test.txt.lz
+++ b/testsuite/test.txt.lz
Binary files differ