From efe11df4b26426c3db4f96592e899b1f005a878e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 May 2017 17:50:01 +0200 Subject: Merging upstream version 1.9. Signed-off-by: Daniel Baumann --- ChangeLog | 18 +- INSTALL | 2 +- Makefile.in | 14 +- NEWS | 30 +- README | 39 +- carg_parser.c | 9 +- carg_parser.h | 2 +- configure | 21 +- decoder.c | 89 +++-- decoder.h | 209 +++++----- doc/clzip.1 | 7 +- doc/clzip.info | 1029 ++++++++++++++++++++++++++++++++++++++++++++++--- doc/clzip.texi | 1004 +++++++++++++++++++++++++++++++++++++++++++++-- encoder.c | 96 +++-- encoder.h | 90 ++--- encoder_base.c | 20 +- encoder_base.h | 217 ++++++----- fast_encoder.c | 44 +-- fast_encoder.h | 8 +- file_index.c | 268 +++++++++++++ file_index.h | 90 +++++ list.c | 123 ++++++ lzip.h | 27 +- main.c | 267 +++++++------ testsuite/check.sh | 264 ++++++++----- testsuite/test.txt.lz | Bin 7376 -> 7376 bytes 26 files changed, 3243 insertions(+), 744 deletions(-) create mode 100644 file_index.c create mode 100644 file_index.h create mode 100644 list.c diff --git a/ChangeLog b/ChangeLog index 4377266..88b6ab2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2017-04-13 Antonio Diaz Diaz + + * Version 1.9 released. + * The option '-l, --list' has been ported from lziprecover. + * Don't allow mixing different operations (-d, -l or -t). + * Compression time of option '-0' has been reduced by 6%. + * Compression time of options -1 to -9 has been reduced by 1%. + * Decompression time has been reduced by 7%. + * main.c: Continue testing if any input file is a terminal. + * main.c: Show trailing data in both hexadecimal and ASCII. + * file_index.c: Improve detection of bad dict and trailing data. + * lzip.h: Unified messages for bad magic, trailing data, etc. + * clzip.texi: Added missing chapters from lzip.texi. + 2016-05-13 Antonio Diaz Diaz * Version 1.8 released. @@ -7,7 +21,7 @@ * decoder.c (LZd_verify_trailer): Removed test of final code. * main.c (main): Delete '--output' file if infd is a terminal. * main.c (main): Don't use stdin more than once. - * lzip.texi: Added chapter 'Trailing data'. + * clzip.texi: Added chapter 'Trailing data'. * configure: Avoid warning on some shells when testing for gcc. * Makefile.in: Detect the existence of install-info. * testsuite/check.sh: A POSIX shell is required to run the tests. @@ -94,7 +108,7 @@ * Translated to C from the C++ source of lzip 1.10. -Copyright (C) 2010-2016 Antonio Diaz Diaz. +Copyright (C) 2010-2017 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, but just in case, you have unlimited permission to copy, distribute and diff --git a/INSTALL b/INSTALL index ed6f68a..0808525 100644 --- a/INSTALL +++ b/INSTALL @@ -62,7 +62,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2010-2016 Antonio Diaz Diaz. +Copyright (C) 2010-2017 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index d028148..08274dd 100644 --- a/Makefile.in +++ b/Makefile.in @@ -7,13 +7,15 @@ INSTALL_DIR = $(INSTALL) -d -m 755 SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 -objs = carg_parser.o encoder_base.o encoder.o fast_encoder.o decoder.o main.o +objs = carg_parser.o file_index.o list.o encoder_base.o encoder.o \ + fast_encoder.o decoder.o main.o .PHONY : all install install-bin install-info install-man \ install-strip install-compress install-strip-compress \ install-bin-strip install-info-compress install-man-compress \ - install-as-lzip uninstall uninstall-bin uninstall-info uninstall-man \ + install-as-lzip \ + uninstall uninstall-bin uninstall-info uninstall-man \ doc info man check dist clean distclean all : $(progname) @@ -33,6 +35,8 @@ decoder.o : lzip.h decoder.h encoder_base.o : lzip.h encoder_base.h encoder.o : lzip.h encoder_base.h encoder.h fast_encoder.o : lzip.h encoder_base.h fast_encoder.h +file_index.o : lzip.h file_index.h +list.o : lzip.h file_index.h main.o : carg_parser.h lzip.h decoder.h encoder_base.h encoder.h fast_encoder.h @@ -117,11 +121,11 @@ dist : doc $(DISTNAME)/doc/$(progname).1 \ $(DISTNAME)/doc/$(pkgname).info \ $(DISTNAME)/doc/$(pkgname).texi \ + $(DISTNAME)/*.h \ + $(DISTNAME)/*.c \ $(DISTNAME)/testsuite/check.sh \ $(DISTNAME)/testsuite/test.txt \ - $(DISTNAME)/testsuite/test.txt.lz \ - $(DISTNAME)/*.h \ - $(DISTNAME)/*.c + $(DISTNAME)/testsuite/test.txt.lz rm -f $(DISTNAME) lzip -v -9 $(DISTNAME).tar diff --git a/NEWS b/NEWS index 7f49444..1f85396 100644 --- a/NEWS +++ b/NEWS @@ -1,21 +1,21 @@ -Changes in version 1.8: +Changes in version 1.9: -The option "-a, --trailing-error", which makes clzip exit with error -status 2 if any remaining input is detected after decompressing the last -member, has been added. +The option '-l, --list' has been ported from lziprecover. -When decompressing or testing, up to 6 bytes of trailing data are -printed if "-vvvv" is specified. +It is now an error to specify two or more different operations in the +command line (--decompress, --list or --test). -The test of the value remaining in the range decoder has been removed. -(After extensive testing it has been found useless to detect corruption -in the decompressed data. Eliminating it reduces the number of false -positives for corruption and makes error detection more accurate). +Compression time of option '-0' has been reduced by 6%. -When decompressing, the file specified with the '--output' option is now -deleted if the input is a terminal. +Compression time of options '-1' to '-9' has been reduced by 1%. -The new chapter "Trailing data" has been added to the manual. +Decompression time has been reduced by 7%. -A harmless check failure on Windows, caused by the failed comparison of -a message in text mode, has been fixed. +In test mode, clzip now continues checking the rest of the files if any +input file is a terminal. + +Trailing data are now shown both in hexadecimal and as a string of +printable ASCII characters. + +Three missing chapters have been added to the manual, which now contains +all the chapters of the lzip manual. diff --git a/README b/README index 9316c4e..cc5fd73 100644 --- a/README +++ b/README @@ -1,14 +1,15 @@ Description -Clzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Clzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +Clzip is a C language version of lzip, fully compatible with lzip-1.4 or +newer. As clzip is written in C, it may be easier to integrate in +applications like package managers, embedded devices, or systems lacking +a C++ compiler. -Clzip uses the lzip file format; the files produced by clzip are fully -compatible with lzip-1.4 or newer, and can be rescued with lziprecover. -Clzip is in fact a C language version of lzip, intended for embedded -devices or systems lacking a C++ compiler. +Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0), +or compress most files more than bzip2 (lzip -9). Decompression speed is +intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 +from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -21,11 +22,11 @@ availability: merging of damaged copies of a file. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the code of a simple decompressor along with a - detailed explanation of how it works, so that with the only help of - the lzip manual it would be possible for a digital archaeologist to - extract the data from a lzip file long after quantum computers - eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor along + with a detailed explanation of how it works, so that with the only + help of the lzip manual it would be possible for a digital + archaeologist to extract the data from a lzip file long after + quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @@ -80,11 +81,11 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Clzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Clzip can -also split the compressed output in volumes of a given size, even when -reading from standard input. This allows the direct creation of -multivolume compressed tar archives. +Clzip can produce multimember files, and lziprecover can safely recover +the undamaged members in case of file damage. Clzip can also split the +compressed output in volumes of a given size, even when reading from +standard input. This allows the direct creation of multivolume +compressed tar archives. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are @@ -115,7 +116,7 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). -Copyright (C) 2010-2016 Antonio Diaz Diaz. +Copyright (C) 2010-2017 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/carg_parser.c b/carg_parser.c index 3d4e89f..6850643 100644 --- a/carg_parser.c +++ b/carg_parser.c @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C version) - Copyright (C) 2006-2016 Antonio Diaz Diaz. + Copyright (C) 2006-2017 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -94,7 +94,7 @@ static char parse_long_option( struct Arg_parser * const ap, else if( index < 0 ) index = i; /* First nonexact match found */ else if( options[index].code != options[i].code || options[index].has_arg != options[i].has_arg ) - ambig = 1; /* Second or later nonexact match found */ + ambig = 1; /* Second or later nonexact match found */ } if( ambig && !exact ) @@ -230,7 +230,9 @@ char ap_init( struct Arg_parser * const ap, } else { - if( !in_order ) + if( in_order ) + { if( !push_back_record( ap, 0, argv[argind++] ) ) return 0; } + else { void * tmp = ap_resize_buffer( non_options, ( non_options_size + 1 ) * sizeof *non_options ); @@ -238,7 +240,6 @@ char ap_init( struct Arg_parser * const ap, non_options = (const char **)tmp; non_options[non_options_size++] = argv[argind++]; } - else if( !push_back_record( ap, 0, argv[argind++] ) ) return 0; } } if( ap->error ) free_data( ap ); diff --git a/carg_parser.h b/carg_parser.h index e918942..c4ce31d 100644 --- a/carg_parser.h +++ b/carg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C version) - Copyright (C) 2006-2016 Antonio Diaz Diaz. + Copyright (C) 2006-2017 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided diff --git a/configure b/configure index bdfc775..5ef6211 100755 --- a/configure +++ b/configure @@ -1,12 +1,12 @@ #! /bin/sh # configure script for Clzip - LZMA lossless data compressor -# Copyright (C) 2010-2016 Antonio Diaz Diaz. +# Copyright (C) 2010-2017 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. pkgname=clzip -pkgversion=1.8 +pkgversion=1.9 progname=clzip srctrigger=doc/${pkgname}.texi @@ -26,11 +26,11 @@ CFLAGS='-Wall -W -O2' LDFLAGS= # checking whether we are using GNU C. -if /bin/sh -c "${CC} --version" > /dev/null 2>&1 ; then true -else +/bin/sh -c "${CC} --version" > /dev/null 2>&1 || + { CC=cc - CFLAGS='-W -O2' -fi + CFLAGS=-O2 + } # Loop over all args args= @@ -52,9 +52,12 @@ while [ $# != 0 ] ; do # Process the options case ${option} in --help | -h) - echo "Usage: configure [options]" + echo "Usage: $0 [OPTION]... [VAR=VALUE]..." + echo + echo "To assign makefile variables (e.g., CC, CFLAGS...), specify them as" + echo "arguments to configure in the form VAR=VALUE." echo - echo "Options: [defaults in brackets]" + echo "Options and variables: [defaults in brackets]" echo " -h, --help display this help and exit" echo " -V, --version output version information and exit" echo " --srcdir=DIR find the sources in DIR [. or ..]" @@ -165,7 +168,7 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Clzip - LZMA lossless data compressor -# Copyright (C) 2010-2016 Antonio Diaz Diaz. +# Copyright (C) 2010-2017 Antonio Diaz Diaz. # This file was generated automatically by configure. Don't edit. # # This Makefile is free software: you have unlimited permission diff --git a/decoder.c b/decoder.c index 942ec60..453d271 100644 --- a/decoder.c +++ b/decoder.c @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -172,7 +172,7 @@ static bool LZd_verify_trailer( struct LZ_decoder * const d, ( 8.0 * member_size ) / data_size, 100.0 * ( 1.0 - ( (double)member_size / data_size ) ) ); if( !error && verbosity >= 4 ) - fprintf( stderr, "data CRC %08X, data size %9llu, member size %8llu. ", + fprintf( stderr, "CRC %08X, decompressed %9llu, compressed %8llu. ", LZd_crc( d ), data_size, member_size ); return !error; } @@ -184,46 +184,74 @@ int LZd_decode_member( struct LZ_decoder * const d, struct Pretty_print * const pp ) { struct Range_decoder * const rdec = d->rdec; + Bit_model bm_literal[1<bm_match[state][pos_state] ) == 0 ) /* 1st bit */ + if( Rd_decode_bit( rdec, &bm_match[state][pos_state] ) == 0 ) /* 1st bit */ { - const uint8_t prev_byte = LZd_peek_prev( d ); + Bit_model * const bm = bm_literal[get_lit_state(LZd_peek_prev( d ))]; if( St_is_char( state ) ) { state -= ( state < 4 ) ? state : 3; - LZd_put_byte( d, Rd_decode_tree( rdec, - d->bm_literal[get_lit_state(prev_byte)], 8 ) ); + LZd_put_byte( d, Rd_decode_tree8( rdec, bm ) ); } else { state -= ( state < 10 ) ? 3 : 6; - LZd_put_byte( d, Rd_decode_matched( rdec, - d->bm_literal[get_lit_state(prev_byte)], - LZd_peek( d, rep0 ) ) ); + LZd_put_byte( d, Rd_decode_matched( rdec, bm, LZd_peek( d, rep0 ) ) ); } } else /* match or repeated match */ { int len; - if( Rd_decode_bit( rdec, &d->bm_rep[state] ) != 0 ) /* 2nd bit */ + if( Rd_decode_bit( rdec, &bm_rep[state] ) != 0 ) /* 2nd bit */ { - if( Rd_decode_bit( rdec, &d->bm_rep0[state] ) != 0 ) /* 3rd bit */ + if( Rd_decode_bit( rdec, &bm_rep0[state] ) == 0 ) /* 3rd bit */ + { + if( Rd_decode_bit( rdec, &bm_len[state][pos_state] ) == 0 ) /* 4th bit */ + { state = St_set_short_rep( state ); + LZd_put_byte( d, LZd_peek( d, rep0 ) ); continue; } + } + else { unsigned distance; - if( Rd_decode_bit( rdec, &d->bm_rep1[state] ) == 0 ) /* 4th bit */ + if( Rd_decode_bit( rdec, &bm_rep1[state] ) == 0 ) /* 4th bit */ distance = rep1; else { - if( Rd_decode_bit( rdec, &d->bm_rep2[state] ) == 0 ) /* 5th bit */ + if( Rd_decode_bit( rdec, &bm_rep2[state] ) == 0 ) /* 5th bit */ distance = rep2; else { distance = rep3; rep3 = rep2; } @@ -232,36 +260,29 @@ int LZd_decode_member( struct LZ_decoder * const d, rep1 = rep0; rep0 = distance; } - else - { - if( Rd_decode_bit( rdec, &d->bm_len[state][pos_state] ) == 0 ) /* 4th bit */ - { state = St_set_short_rep( state ); - LZd_put_byte( d, LZd_peek( d, rep0 ) ); continue; } - } state = St_set_rep( state ); - len = min_match_len + Rd_decode_len( rdec, &d->rep_len_model, pos_state ); + len = min_match_len + Rd_decode_len( rdec, &rep_len_model, pos_state ); } else /* match */ { - const unsigned rep0_saved = rep0; - int dis_slot; - len = min_match_len + Rd_decode_len( rdec, &d->match_len_model, pos_state ); - dis_slot = Rd_decode_tree6( rdec, d->bm_dis_slot[get_len_state(len)] ); - if( dis_slot < start_dis_model ) rep0 = dis_slot; - else + unsigned distance; + len = min_match_len + Rd_decode_len( rdec, &match_len_model, pos_state ); + distance = Rd_decode_tree6( rdec, bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) { + const unsigned dis_slot = distance; const int direct_bits = ( dis_slot >> 1 ) - 1; - rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - rep0 += Rd_decode_tree_reversed( rdec, - d->bm_dis + rep0 - dis_slot - 1, direct_bits ); + distance += Rd_decode_tree_reversed( rdec, + bm_dis + ( distance - dis_slot ), direct_bits ); else { - rep0 += Rd_decode( rdec, direct_bits - dis_align_bits ) << dis_align_bits; - rep0 += Rd_decode_tree_reversed4( rdec, d->bm_align ); - if( rep0 == 0xFFFFFFFFU ) /* marker found */ + distance += + Rd_decode( rdec, direct_bits - dis_align_bits ) << dis_align_bits; + distance += Rd_decode_tree_reversed4( rdec, bm_align ); + if( distance == 0xFFFFFFFFU ) /* marker found */ { - rep0 = rep0_saved; Rd_normalize( rdec ); LZd_flush_data( d ); if( len == min_match_len ) /* End Of Stream marker */ @@ -281,7 +302,7 @@ int LZd_decode_member( struct LZ_decoder * const d, } } } - rep3 = rep2; rep2 = rep1; rep1 = rep0_saved; + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; state = St_set_match( state ); if( rep0 >= d->dictionary_size || ( rep0 >= d->pos && !d->pos_wrapped ) ) { LZd_flush_data( d ); return 1; } diff --git a/decoder.h b/decoder.h index 662aaf9..2f96575 100644 --- a/decoder.h +++ b/decoder.h @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -56,7 +56,7 @@ Rd_member_position( const struct Range_decoder * const rdec ) { return rdec->partial_member_pos + rdec->pos; } static inline void Rd_reset_member_position( struct Range_decoder * const rdec ) - { rdec->partial_member_pos = -rdec->pos; } + { rdec->partial_member_pos = 0; rdec->partial_member_pos -= rdec->pos; } static inline uint8_t Rd_get_byte( struct Range_decoder * const rdec ) { @@ -68,23 +68,22 @@ static inline uint8_t Rd_get_byte( struct Range_decoder * const rdec ) static inline int Rd_read_data( struct Range_decoder * const rdec, uint8_t * const outbuf, const int size ) { - int rest = size; - while( rest > 0 && !Rd_finished( rdec ) ) + int sz = 0; + while( sz < size && !Rd_finished( rdec ) ) { - const int rd = min( rest, rdec->stream_pos - rdec->pos ); - memcpy( outbuf + size - rest, rdec->buffer + rdec->pos, rd ); + const int rd = min( size - sz, rdec->stream_pos - rdec->pos ); + memcpy( outbuf + sz, rdec->buffer + rdec->pos, rd ); rdec->pos += rd; - rest -= rd; + sz += rd; } - return size - rest; + return sz; } static inline void Rd_load( struct Range_decoder * const rdec ) { int i; rdec->code = 0; - for( i = 0; i < 5; ++i ) - rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); + for( i = 0; i < 5; ++i ) rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); rdec->range = 0xFFFFFFFFU; rdec->code &= rdec->range; /* make sure that first byte is discarded */ } @@ -92,34 +91,30 @@ static inline void Rd_load( struct Range_decoder * const rdec ) static inline void Rd_normalize( struct Range_decoder * const rdec ) { if( rdec->range <= 0x00FFFFFFU ) - { - rdec->range <<= 8; - rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); - } + { rdec->range <<= 8; rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); } } -static inline int Rd_decode( struct Range_decoder * const rdec, - const int num_bits ) +static inline unsigned Rd_decode( struct Range_decoder * const rdec, + const int num_bits ) { - int symbol = 0; + unsigned symbol = 0; int i; for( i = num_bits; i > 0; --i ) { - uint32_t mask; + bool bit; Rd_normalize( rdec ); rdec->range >>= 1; /* symbol <<= 1; */ /* if( rdec->code >= rdec->range ) { rdec->code -= rdec->range; symbol |= 1; } */ - mask = 0U - (rdec->code < rdec->range); - rdec->code -= rdec->range; - rdec->code += rdec->range & mask; - symbol = (symbol << 1) + (mask + 1); + bit = ( rdec->code >= rdec->range ); + symbol = ( symbol << 1 ) + bit; + rdec->code -= rdec->range & ( 0U - bit ); } return symbol; } -static inline int Rd_decode_bit( struct Range_decoder * const rdec, - Bit_model * const probability ) +static inline unsigned Rd_decode_bit( struct Range_decoder * const rdec, + Bit_model * const probability ) { uint32_t bound; Rd_normalize( rdec ); @@ -139,20 +134,20 @@ static inline int Rd_decode_bit( struct Range_decoder * const rdec, } } -static inline int Rd_decode_tree( struct Range_decoder * const rdec, - Bit_model bm[], const int num_bits ) +static inline unsigned Rd_decode_tree3( struct Range_decoder * const rdec, + Bit_model bm[] ) { - int symbol = 1; - int i; - for( i = num_bits; i > 0; --i ) - symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); - return symbol - (1 << num_bits); + unsigned symbol = 1; + symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); + symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); + symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); + return symbol & 7; } -static inline int Rd_decode_tree6( struct Range_decoder * const rdec, - Bit_model bm[] ) +static inline unsigned Rd_decode_tree6( struct Range_decoder * const rdec, + Bit_model bm[] ) { - int symbol = 1; + unsigned symbol = 1; symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); @@ -162,69 +157,69 @@ static inline int Rd_decode_tree6( struct Range_decoder * const rdec, return symbol & 0x3F; } -static inline int Rd_decode_tree_reversed( struct Range_decoder * const rdec, - Bit_model bm[], const int num_bits ) +static inline unsigned Rd_decode_tree8( struct Range_decoder * const rdec, + Bit_model bm[] ) { - int model = 1; - int symbol = 0; + unsigned symbol = 1; + int i; + for( i = 0; i < 8; ++i ) + symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); + return symbol & 0xFF; + } + +static inline unsigned +Rd_decode_tree_reversed( struct Range_decoder * const rdec, + Bit_model bm[], const int num_bits ) + { + unsigned model = 1; + unsigned symbol = 0; int i; for( i = 0; i < num_bits; ++i ) { - const bool bit = Rd_decode_bit( rdec, &bm[model] ); - model <<= 1; - if( bit ) { ++model; symbol |= (1 << i); } + const unsigned bit = Rd_decode_bit( rdec, &bm[model] ); + model = ( model << 1 ) + bit; + symbol |= ( bit << i ); } return symbol; } -static inline int Rd_decode_tree_reversed4( struct Range_decoder * const rdec, - Bit_model bm[] ) +static inline unsigned +Rd_decode_tree_reversed4( struct Range_decoder * const rdec, Bit_model bm[] ) { - int model = 1; - int symbol = Rd_decode_bit( rdec, &bm[model] ); - int bit; - model = (model << 1) + symbol; - bit = Rd_decode_bit( rdec, &bm[model] ); - model = (model << 1) + bit; symbol |= (bit << 1); + unsigned symbol = Rd_decode_bit( rdec, &bm[1] ); + unsigned model = 2 + symbol; + unsigned bit = Rd_decode_bit( rdec, &bm[model] ); + model = ( model << 1 ) + bit; symbol |= ( bit << 1 ); bit = Rd_decode_bit( rdec, &bm[model] ); - model = (model << 1) + bit; symbol |= (bit << 2); - if( Rd_decode_bit( rdec, &bm[model] ) ) symbol |= 8; + model = ( model << 1 ) + bit; symbol |= ( bit << 2 ); + symbol |= ( Rd_decode_bit( rdec, &bm[model] ) << 3 ); return symbol; } -static inline int Rd_decode_matched( struct Range_decoder * const rdec, - Bit_model bm[], int match_byte ) +static inline unsigned Rd_decode_matched( struct Range_decoder * const rdec, + Bit_model bm[], unsigned match_byte ) { - Bit_model * const bm1 = bm + 0x100; - int symbol = 1; - while( symbol < 0x100 ) + unsigned symbol = 1; + unsigned mask = 0x100; + while( true ) { - int match_bit, bit; - match_byte <<= 1; - match_bit = match_byte & 0x100; - bit = Rd_decode_bit( rdec, &bm1[match_bit+symbol] ); - symbol = ( symbol << 1 ) | bit; - if( match_bit != bit << 8 ) - { - while( symbol < 0x100 ) - symbol = ( symbol << 1 ) | Rd_decode_bit( rdec, &bm[symbol] ); - break; - } + const unsigned match_bit = ( match_byte <<= 1 ) & mask; + const unsigned bit = Rd_decode_bit( rdec, &bm[symbol+match_bit+mask] ); + symbol = ( symbol << 1 ) + bit; + if( symbol > 0xFF ) return symbol & 0xFF; + mask &= ~(match_bit ^ (bit << 8)); /* if( match_bit != bit ) mask = 0; */ } - return symbol & 0xFF; } -static inline int Rd_decode_len( struct Range_decoder * const rdec, - struct Len_model * const lm, - const int pos_state ) +static inline unsigned Rd_decode_len( struct Range_decoder * const rdec, + struct Len_model * const lm, + const int pos_state ) { if( Rd_decode_bit( rdec, &lm->choice1 ) == 0 ) - return Rd_decode_tree( rdec, lm->bm_low[pos_state], len_low_bits ); + return Rd_decode_tree3( rdec, lm->bm_low[pos_state] ); if( Rd_decode_bit( rdec, &lm->choice2 ) == 0 ) - return len_low_symbols + - Rd_decode_tree( rdec, lm->bm_mid[pos_state], len_mid_bits ); - return len_low_symbols + len_mid_symbols + - Rd_decode_tree( rdec, lm->bm_high, len_high_bits ); + return len_low_symbols + Rd_decode_tree3( rdec, lm->bm_mid[pos_state] ); + return len_low_symbols + len_mid_symbols + Rd_decode_tree8( rdec, lm->bm_high ); } @@ -239,35 +234,22 @@ struct LZ_decoder uint32_t crc; int outfd; /* output file descriptor */ bool pos_wrapped; - - Bit_model bm_literal[1<pos > 0 ) ? d->pos : d->dictionary_size ) - 1; - return d->buffer[i]; + if( d->pos > 0 ) return d->buffer[d->pos-1]; + if( d->pos_wrapped ) return d->buffer[d->dictionary_size-1]; + return 0; /* prev_byte of first byte */ } static inline uint8_t LZd_peek( const struct LZ_decoder * const d, const unsigned distance ) { - unsigned i = d->pos - distance - 1; - if( d->pos <= distance ) i += d->dictionary_size; + const unsigned i = ( ( d->pos > distance ) ? 0 : d->dictionary_size ) + + d->pos - distance - 1; return d->buffer[i]; } @@ -280,17 +262,26 @@ static inline void LZd_put_byte( struct LZ_decoder * const d, const uint8_t b ) static inline void LZd_copy_block( struct LZ_decoder * const d, const unsigned distance, unsigned len ) { - unsigned i = d->pos - distance - 1; - bool fast; - if( d->pos <= distance ) - { i += d->dictionary_size; - fast = ( len <= d->dictionary_size - i && len <= i - d->pos ); } + unsigned lpos = d->pos, i = lpos - distance - 1; + bool fast, fast2; + if( lpos > distance ) + { + fast = ( len < d->dictionary_size - lpos ); + fast2 = ( fast && len <= lpos - i ); + } else - fast = ( len < d->dictionary_size - d->pos && len <= d->pos - i ); - if( fast ) /* no wrap, no overlap */ { - memcpy( d->buffer + d->pos, d->buffer + i, len ); + i += d->dictionary_size; + fast = ( len < d->dictionary_size - i ); /* (i == pos) may happen */ + fast2 = ( fast && len <= i - lpos ); + } + if( fast ) /* no wrap */ + { d->pos += len; + if( fast2 ) /* no wrap, no overlap */ + memcpy( d->buffer + lpos, d->buffer + i, len ); + else + for( ; len > 0; --len ) d->buffer[lpos++] = d->buffer[i++]; } else for( ; len > 0; --len ) { @@ -314,20 +305,6 @@ static inline bool LZd_init( struct LZ_decoder * const d, d->crc = 0xFFFFFFFFU; d->outfd = ofd; d->pos_wrapped = false; - - Bm_array_init( d->bm_literal[0], (1 << literal_context_bits) * 0x300 ); - Bm_array_init( d->bm_match[0], states * pos_states ); - Bm_array_init( d->bm_rep, states ); - Bm_array_init( d->bm_rep0, states ); - Bm_array_init( d->bm_rep1, states ); - Bm_array_init( d->bm_rep2, states ); - Bm_array_init( d->bm_len[0], states * pos_states ); - Bm_array_init( d->bm_dis_slot[0], len_states * (1 << dis_slot_bits) ); - Bm_array_init( d->bm_dis, modeled_distances - end_dis_model ); - Bm_array_init( d->bm_align, dis_align_size ); - Lm_init( &d->match_len_model ); - Lm_init( &d->rep_len_model ); - d->buffer[d->dictionary_size-1] = 0; /* prev_byte of first byte */ return true; } diff --git a/doc/clzip.1 b/doc/clzip.1 index 5dbb695..8522ad0 100644 --- a/doc/clzip.1 +++ b/doc/clzip.1 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH CLZIP "1" "May 2016" "clzip 1.8" "User Commands" +.TH CLZIP "1" "April 2017" "clzip 1.9" "User Commands" .SH NAME clzip \- reduces the size of files .SH SYNOPSIS @@ -36,6 +36,9 @@ force re\-compression of compressed files \fB\-k\fR, \fB\-\-keep\fR keep (don't delete) input files .TP +\fB\-l\fR, \fB\-\-list\fR +print (un)compressed file sizes +.TP \fB\-m\fR, \fB\-\-match\-length=\fR set match length limit in bytes [36] .TP @@ -87,7 +90,7 @@ Report bugs to lzip\-bug@nongnu.org .br Clzip home page: http://www.nongnu.org/lzip/clzip.html .SH COPYRIGHT -Copyright \(co 2016 Antonio Diaz Diaz. +Copyright \(co 2017 Antonio Diaz Diaz. License GPLv2+: GNU GPL version 2 or later .br This is free software: you are free to change and redistribute it. diff --git a/doc/clzip.info b/doc/clzip.info index c590473..f6e483f 100644 --- a/doc/clzip.info +++ b/doc/clzip.info @@ -11,21 +11,24 @@ File: clzip.info, Node: Top, Next: Introduction, Up: (dir) Clzip Manual ************ -This manual is for Clzip (version 1.8, 13 May 2016). +This manual is for Clzip (version 1.9, 13 April 2017). * Menu: * Introduction:: Purpose and features of clzip * Invoking clzip:: Command line interface +* Quality assurance:: Design, development and testing of lzip * File format:: Detailed format of the compressed file * Algorithm:: How clzip compresses the data +* Stream format:: Format of the LZMA stream in lzip files * Trailing data:: Extra data appended to the file * Examples:: A small tutorial with examples * Problems:: Reporting bugs +* Reference source code:: Source code illustrating stream format * Concept index:: Index of concepts - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -36,15 +39,16 @@ File: clzip.info, Node: Introduction, Next: Invoking clzip, Prev: Top, Up: T 1 Introduction ************** -Clzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Clzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +Clzip is a C language version of lzip, fully compatible with lzip-1.4 or +newer. As clzip is written in C, it may be easier to integrate in +applications like package managers, embedded devices, or systems lacking +a C++ compiler. - Clzip uses the lzip file format; the files produced by clzip are -fully compatible with lzip-1.4 or newer, and can be rescued with -lziprecover. Clzip is in fact a C language version of lzip, intended -for embedded devices or systems lacking a C++ compiler. + Lzip is a lossless data compressor with a user interface similar to +the one of gzip or bzip2. Lzip can compress about as fast as gzip +(lzip -0), or compress most files more than bzip2 (lzip -9). +Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -58,11 +62,11 @@ availability: (lziprecover)Data safety. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the code of a simple decompressor along with - a detailed explanation of how it works, so that with the only help - of the lzip manual it would be possible for a digital - archaeologist to extract the data from a lzip file long after - quantum computers eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor + along with a detailed explanation of how it works, so that with + the only help of the lzip manual it would be possible for a + digital archaeologist to extract the data from a lzip file long + after quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @@ -128,9 +132,9 @@ two or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. - Clzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Clzip can -also split the compressed output in volumes of a given size, even when + Clzip can produce multimember files, and lziprecover can safely +recover the undamaged members in case of file damage. Clzip can also +split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume compressed tar archives. @@ -138,8 +142,12 @@ multivolume compressed tar archives. automatically creating multimember output. The members so created are large, about 2 PiB each. + LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may +never have been compressed. Decompressed is used to refer to data which +have undergone the process of decompression. +  -File: clzip.info, Node: Invoking clzip, Next: File format, Prev: Introduction, Up: Top +File: clzip.info, Node: Invoking clzip, Next: Quality assurance, Prev: Introduction, Up: Top 2 Invoking clzip **************** @@ -205,6 +213,21 @@ command line. Keep (don't delete) input files during compression or decompression. +'-l' +'--list' + Print the uncompressed size, compressed size and percentage saved + of the specified file(s). Trailing data are ignored. The values + produced are correct even for multimember files. If more than one + file is given, a final line containing the cumulative sizes is + printed. With '-v', the dictionary size, the number of members in + the file, and the amount of trailing data (if any) are also + printed. With '-vv', the positions and sizes of each member in + multimember files are also printed. '-lq' can be used to verify + quickly (without decompressing) the structural integrity of the + specified files. (Use '--test' to verify the data integrity). + '-alq' additionally verifies that none of the specified files + contain trailing data. + '-m BYTES' '--match-length=BYTES' Set the match length limit in bytes. After a match this long is @@ -254,8 +277,9 @@ command line. Check integrity of the specified file(s), but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with '-v' to see information about - the file(s). If a file fails the test, clzip continues checking - the rest of the files. + the file(s). If a file fails the test, does not exist, can't be + opened, or is a terminal, clzip continues checking the rest of the + files. '-v' '--verbose' @@ -265,7 +289,8 @@ command line. When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 - bytes of trailing data (if any). + bytes of trailing data (if any) both in hexadecimal and as a + string of printable ASCII characters. '-0 .. -9' Set the compression parameters (dictionary size and match length @@ -317,9 +342,186 @@ invalid input file, 3 for an internal consistency error (eg, bug) which caused clzip to panic.  -File: clzip.info, Node: File format, Next: Algorithm, Prev: Invoking clzip, Up: Top +File: clzip.info, Node: Quality assurance, Next: File format, Prev: Invoking clzip, Up: Top + +3 Design, development and testing of lzip +***************************************** + +There are two ways of constructing a software design: One way is to make +it so simple that there are obviously no deficiencies and the other way +is to make it so complicated that there are no obvious deficiencies. The +first method is far more difficult. +-- C.A.R. Hoare + + Lzip has been designed, written and tested with great care to be the +standard general-purpose compressor for unix-like systems. This chapter +describes the lessons learned from previous compressors (gzip and +bzip2), and their application to the design of lzip. + + +3.1 Format design +================= + +When gzip was designed in 1992, computers and operating systems were +much less capable than they are today. Gzip tried to work around some of +those limitations, like 8.3 file names, with additional fields in its +file format. + + Today those limitations have mostly disappeared, and the format of +gzip has proved to be unnecessarily complicated. It includes fields +that were never used, others that have lost their usefulness, and +finally others that have become too limited. + + Bzip2 was designed 5 years later, and its format is simpler than the +one of gzip. + + Probably the worst defect of the gzip format from the point of view +of data safety is the variable size of its header. If the byte at +offset 3 (flags) of a gzip member gets corrupted, it may become very +difficult to recover the data, even if the compressed blocks are +intact, because it can't be known with certainty where the compressed +blocks begin. + + By contrast, the header of a lzip member has a fixed length of 6. The +LZMA stream in a lzip member always starts at offset 6, making it +trivial to recover the data even if the whole header becomes corrupt. + + Bzip2 also provides a header of fixed length and marks the begin and +end of each compressed block with six magic bytes, making it possible to +find the compressed blocks even in case of file damage. But bzip2 does +not store the size of each compressed block, as lzip does. + + Lzip provides better data recovery capabilities than any other +gzip-like compressor because its format has been designed from the +beginning to be simple and safe. It also helps that the LZMA data +stream as used by lzip is extraordinarily safe. It provides embedded +error detection. Any distance larger than the dictionary size acts as a +forbidden symbol, allowing the decompressor to detect the approximate +position of errors, and leaving very little work for the check sequence +(CRC and data sizes) in the detection of errors. Lzip is usually able +to detect all posible bit-flips in the compressed data without +resorting to the check sequence. It would be very difficult to write an +automatic recovery tool like lziprecover for the gzip format. And, as +far as I know, it has never been written. + + Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the +decompressed data because it provides more accurate error detection than +CRC64 up to a compressed size of about 16 GiB, a size larger than that +of most files. In the case of lzip, the additional detection capability +of the decompressor reduces the probability of undetected errors more +than a million times, making CRC32 more accurate than CRC64 up to about +20 PiB of compressed size. + + The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the uncompressed data. + + +3.1.1 Gzip format (mis)features not present in lzip +--------------------------------------------------- + +'Multiple algorithms' + Gzip provides a CM (Compression Method) field that has never been + used because it is a bad idea to begin with. New compression + methods may require additional fields, making it impossible to + implement new methods and, at the same time, keep the same format. + This field does not solve the problem of format proliferation; it + just makes the problem less obvious. + +'Optional fields in header' + Unless special precautions are taken, optional fields are + generally a bad idea because they produce a header of variable + size. The gzip header has 2 fields that, in addition to being + optional, are zero-terminated. This means that if any byte inside + the field gets zeroed, or if the terminating zero gets altered, + gzip won't be able to find neither the header CRC nor the + compressed blocks. + +'Optional CRC for the header' + Using an optional checksum for the header is not only a bad idea, + it is an error; it may prevent the extraction of perfectly good + data. For example, if the checksum is used and the bit enabling it + is reset by a bit-flip, the header will appear to be intact (in + spite of being corrupt) while the compressed blocks will appear to + be totally unrecoverable (in spite of being intact). Very + misleading indeed. + + +3.1.2 Lzip format improvements over gzip and bzip2 +-------------------------------------------------- + +'64-bit size field' + Probably the most frequently reported shortcoming of the gzip + format is that it only stores the least significant 32 bits of the + uncompressed size. The size of any file larger than 4 GiB gets + truncated. + + Bzip2 does not store the uncompressed size of the file. + + The lzip format provides a 64-bit field for the uncompressed size. + Additionaly, lzip produces multimember output automatically when + the size is too large for a single member, allowing for an + unlimited uncompressed size. + +'Distributed index' + The lzip format provides a distributed index that, among other + things, helps plzip to decompress several times faster than pigz + and helps lziprecover do its job. Neither the gzip format nor the + bzip2 format do provide an index. + + A distributed index is safer and more scalable than a monolithic + index. The monolithic index introduces a single point of failure + in the compressed file and may limit the number of members or the + total uncompressed size. + + +3.2 Quality of implementation +============================= + +'Accurate and robust error detection' + The lzip format provides 3 factor integrity checking and the + decompressors report mismatches in each factor separately. This + way if just one byte in one factor fails but the other two factors + match the data, it probably means that the data are intact and the + corruption just affects the mismatching factor (CRC or data size) + in the check sequence. + +'Multiple implementations' + Just like the lzip format provides 3 factor protection against + undetected data corruption, the development methodology of the lzip + family of compressors provides 3 factor protection against + undetected programming errors. + + Three related but independent compressor implementations, lzip, + clzip and minilzip/lzlib, are developed concurrently. Every stable + release of any of them is subjected to a hundred hours of + intensive testing to verify that it produces identical output to + the other two. This guarantees that all three implement the same + algorithm, and makes it unlikely that any of them may contain + serious undiscovered errors. In fact, no errors have been + discovered in lzip since 2009. + + Additionally, the three implementations have been extensively + tested with unzcrash, valgrind and 'american fuzzy lop' without + finding a single vulnerability or false negative. *Note Unzcrash: + (lziprecover)Unzcrash. + +'Dictionary size' + Lzip automatically uses the smallest possible dictionary size for + each file. In addition to reducing the amount of memory required + for decompression, this feature also minimizes the probability of + being affected by RAM errors during compression. + +'Exit status' + Returning a warning status of 2 is a design flaw of compress that + leaked into the design of gzip. Both bzip2 and lzip are free from + this flaw. + + + +File: clzip.info, Node: File format, Next: Algorithm, Prev: Quality assurance, Up: Top -3 File format +4 File format ************* Perfection is reached, not when there is no longer anything to add, but @@ -371,8 +573,8 @@ additional information before, between, or after them. 'LZMA stream' The LZMA stream, finished by an end of stream marker. Uses default - values for encoder properties. *Note Stream format: (lzip)Stream - format, for a complete description. + values for encoder properties. *Note Stream format::, for a + complete description. 'CRC32 (4 bytes)' CRC of the uncompressed original data. @@ -388,9 +590,9 @@ additional information before, between, or after them.  -File: clzip.info, Node: Algorithm, Next: Trailing data, Prev: File format, Up: Top +File: clzip.info, Node: Algorithm, Next: Stream format, Prev: File format, Up: Top -4 Algorithm +5 Algorithm *********** In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a @@ -454,21 +656,264 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI).  -File: clzip.info, Node: Trailing data, Next: Examples, Prev: Algorithm, Up: Top +File: clzip.info, Node: Stream format, Next: Trailing data, Prev: Algorithm, Up: Top + +6 Format of the LZMA stream in lzip files +***************************************** + +The LZMA algorithm has three parameters, called "special LZMA +properties", to adjust it for some kinds of binary data. These +parameters are; 'literal_context_bits' (with a default value of 3), +'literal_pos_state_bits' (with a default value of 0), and +'pos_state_bits' (with a default value of 2). As a general purpose +compressor, lzip only uses the default values for these parameters. In +particular 'literal_pos_state_bits' has been optimized away and does +not even appear in the code. + + Lzip also finishes the LZMA stream with an "End Of Stream" marker +(the distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the +"member size" field in the member trailer allows the verification of +stream integrity. The LZMA stream in lzip files always has these two +features (default properties and EOS marker) and is referred to in this +document as LZMA-302eos or LZMA-lzip. + + The second stage of LZMA is a range encoder that uses a different +probability model for each type of symbol; distances, lengths, literal +bytes, etc. Range encoding conceptually encodes all the symbols of the +message into one number. Unlike Huffman coding, which assigns to each +symbol a bit-pattern and concatenates all the bit-patterns together, +range encoding can compress one symbol to less than one bit. Therefore +the compressed data produced by a range encoder can't be split in pieces +that could be individually described. + + It seems that the only way of describing the LZMA-302eos stream is +describing the algorithm that decodes it. And given the many details +about the range decoder that need to be described accurately, the source +code of a real decoder seems the only appropriate reference to use. + + What follows is a description of the decoding algorithm for +LZMA-302eos streams using as reference the source code of "lzd", an +educational decompressor for lzip files which can be downloaded from +the lzip download directory. The source code of lzd is included in +appendix A. *Note Reference source code::. + + +6.1 What is coded +================= + +The LZMA stream includes literals, matches and repeated matches (matches +reusing a recently used distance). There are 7 different coding +sequences: + +Bit sequence Name Description +--------------------------------------------------------------------------- +0 + byte literal literal byte +1 + 0 + len + dis match distance-length pair +1 + 1 + 0 + 0 shortrep 1 byte match at latest used distance +1 + 1 + 0 + 1 + len rep0 len bytes match at latest used + distance +1 + 1 + 1 + 0 + len rep1 len bytes match at second latest + used distance +1 + 1 + 1 + 1 + 0 + len rep2 len bytes match at third latest used + distance +1 + 1 + 1 + 1 + 1 + len rep3 len bytes match at fourth latest + used distance + + + In the following tables, multibit sequences are coded in normal +order, from MSB to LSB, except where noted otherwise. + + Lengths (the 'len' in the table above) are coded as follows: + +Bit sequence Description +-------------------------------------------------------------------------- +0 + 3 bits lengths from 2 to 9 +1 + 0 + 3 bits lengths from 10 to 17 +1 + 1 + 8 bits lengths from 18 to 273 + + + The coding of distances is a little more complicated, so I'll begin +explaining a simpler version of the encoding. + + Imagine you need to code a number from 0 to 2^32 - 1, and you want +to do it in a way that produces shorter codes for the smaller numbers. +You may first send the position of the most significant bit that is set +to 1, which you may find by making a bit scan from the left (from the +MSB). A position of 0 means that the number is 0 (no bit is set), 1 +means the LSB is the first bit set (the number is 1), and 32 means the +MSB is set (i.e., the number is >= 0x80000000). Let's call this bit +position a "slot". Then, if slot is > 1, you send the remaining +slot - 1 bits. Let's call these bits "direct_bits" because they are +coded directly by value instead of indirectly by position. + + The inconvenient of this simple method is that it needs 6 bits to +code the slot, but it just uses 33 of the 64 possible values, wasting +almost half of the codes. + + The intelligent trick of LZMA is that it encodes the position of the +most significant bit set, along with the value of the next bit, in the +same 6 bits that would take to encode the position alone. This seems to +need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is +no next bit, so the number of needed slots is 64 (0 to 63). + + The 6 bits representing this "slot number" are then context-coded. If +the distance is >= 4, the remaining bits are coded as follows. +'direct_bits' is the amount of remaining bits (from 0 to 30) needed to +form a complete distance, and is calculated as (slot >> 1) - 1. If a +distance needs 6 or more direct_bits, the last 4 bits are coded +separately. The last piece (all the direct_bits for distances 4 to 127 +or the last 4 bits for distances >= 128) is context-coded in reverse +order (from LSB to MSB). For distances >= 128, the 'direct_bits - 4' +part is coded with fixed 0.5 probability. + +Bit sequence Description +-------------------------------------------------------------------------- +slot distances from 0 to 3 +slot + direct_bits distances from 4 to 127 +slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1 + + +6.2 The coding contexts +======================= + +These contexts ('Bit_model' in the source), are integers or arrays of +integers representing the probability of the corresponding bit being 0. + + The indices used in these arrays are: + +'state' + A state machine ('State' in the source) with 12 states (0 to 11), + coding the latest 2 to 4 types of sequences processed. The initial + state is 0. + +'pos_state' + Value of the 2 least significant bits of the current position in + the decoded data. + +'literal_state' + Value of the 3 most significant bits of the latest byte decoded. + +'len_state' + Coded value of length (length - 2), with a maximum of 3. The + resulting value is in the range 0 to 3. + + + In the following table, '!literal' is any sequence except a literal +byte. 'rep' is any one of 'rep0', 'rep1', 'rep2' or 'rep3'. The types +of previous sequences corresponding to each state are: + +State Types of previous sequences +-------------------------------------------------------- +0 literal, literal, literal +1 match, literal, literal +2 rep or (!literal, shortrep), literal, literal +3 literal, shortrep, literal, literal +4 match, literal +5 rep or (!literal, shortrep), literal +6 literal, shortrep, literal +7 literal, match +8 literal, rep +9 literal, shortrep +10 !literal, match +11 !literal, (rep or shortrep) + + + The contexts for decoding the type of coding sequence are: + +Name Indices Used when +--------------------------------------------------------------------------- +bm_match state, pos_state sequence start +bm_rep state after sequence 1 +bm_rep0 state after sequence 11 +bm_rep1 state after sequence 111 +bm_rep2 state after sequence 1111 +bm_len state, pos_state after sequence 110 + + + The contexts for decoding distances are: + +Name Indices Used when +--------------------------------------------------------------------------- +bm_dis_slot len_state, bit tree distance start +bm_dis reverse bit tree after slots 4 to 13 +bm_align reverse bit tree for distances >= 128, after + fixed probability bits + + + There are two separate sets of contexts for lengths ('Len_model' in +the source). One for normal matches, the other for repeated matches. The +contexts in each Len_model are (see 'decode_len' in the source): + +Name Indices Used when +--------------------------------------------------------------------------- +choice1 none length start +choice2 none after sequence 1 +bm_low pos_state, bit tree after sequence 0 +bm_mid pos_state, bit tree after sequence 10 +bm_high bit tree after sequence 11 + + + The context array 'bm_literal' is special. In principle it acts as a +normal bit tree context, the one selected by 'literal_state'. But if +the previous decoded byte was not a literal, two other bit tree +contexts are used depending on the value of each bit in 'match_byte' +(the byte at the latest used distance), until a bit is decoded that is +different from its corresponding bit in 'match_byte'. After the first +difference is found, the rest of the byte is decoded using the normal +bit tree context. (See 'decode_matched' in the source). + + +6.3 The range decoder +===================== + +The LZMA stream is consumed one byte at a time by the range decoder. +(See 'normalize' in the source). Every byte consumed produces a +variable number of decoded bits, depending on how well these bits agree +with their context. (See 'decode_bit' in the source). + + The range decoder state consists of two unsigned 32-bit variables; +'range' (representing the most significant part of the range size not +yet decoded), and 'code' (representing the current point within +'range'). 'range' is initialized to (2^32 - 1), and 'code' is +initialized to 0. + + The range encoder produces a first 0 byte that must be ignored by the +range decoder. This is done by shifting 5 bytes in the initialization of +'code' instead of 4. (See the 'Range_decoder' constructor in the +source). + + +6.4 Decoding the LZMA stream +============================ + +After decoding the member header and obtaining the dictionary size, the +range decoder is initialized and then the LZMA decoder enters a loop +(See 'decode_member' in the source) where it invokes the range decoder +with the appropriate contexts to decode the different coding sequences +(matches, repeated matches, and literal bytes), until the "End Of +Stream" marker is decoded. + + +File: clzip.info, Node: Trailing data, Next: Examples, Prev: Stream format, Up: Top -5 Extra data appended to the file +7 Extra data appended to the file ********************************* -Sometimes extra data is found appended to a lzip file after the last +Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: * Padding added to make the file size a multiple of some block size, - for example when writing to a tape. - - * Garbage added by some not totally successful copy operation. + for example when writing to a tape. It is safe to append any + amount of padding zero bytes to a lzip file. * Useful data added by the user; a cryptographically secure hash, a - description of file contents, etc. + description of file contents, etc. It is safe to append any amount + of text to a lzip file as long as the text does not begin with the + string "LZIP", and does not contain any zero bytes (null + characters). Nonzero bytes and zero bytes can't be safely mixed in + trailing data. + + * Garbage added by some not totally successful copy operation. * Malicious data added to the file in order to make its total size and hash value (for a chosen hash) coincide with those of another @@ -481,15 +926,19 @@ member. Such trailing data may be: the corruption of the integrity information itself. Therefore it can be considered to be below the noise level. + Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + Trailing data can be safely ignored in most cases. In some cases, -like that of user-added data, it is expected to be ignored. In those +like that of user-added data, they are expected to be ignored. In those cases where a file containing trailing data must be rejected, the option '--trailing-error' can be used. *Note --trailing-error::.  File: clzip.info, Node: Examples, Next: Problems, Prev: Trailing data, Up: Top -6 A small tutorial with examples +8 A small tutorial with examples ******************************** WARNING! Even if clzip is bug-free, other causes may result in a corrupt @@ -530,8 +979,8 @@ Example 5: Compress a whole device in /dev/sdc and send the output to clzip -c /dev/sdc > file.lz -Example 6: The right way of concatenating compressed files. *Note -Trailing data::. +Example 6: The right way of concatenating the decompressed output of two +or more compressed files. *Note Trailing data::. Don't do this cat file1.lz file2.lz file3.lz | clzip -d @@ -569,9 +1018,9 @@ file with a member size of 32 MiB. clzip -b 32MiB -S 650MB big_db  -File: clzip.info, Node: Problems, Next: Concept index, Prev: Examples, Up: Top +File: clzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top -7 Reporting bugs +9 Reporting bugs **************** There are probably bugs in clzip. There are certainly errors and @@ -584,7 +1033,477 @@ for all eternity, if not longer. by running 'clzip --version'.  -File: clzip.info, Node: Concept index, Prev: Problems, Up: Top +File: clzip.info, Node: Reference source code, Next: Concept index, Prev: Problems, Up: Top + +Appendix A Reference source code +******************************** + +/* Lzd - Educational decompressor for the lzip format + Copyright (C) 2013-2017 Antonio Diaz Diaz. + + This program is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file. +*/ + +#include +#include +#include +#include +#include +#include +#include +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) +#include +#include +#endif + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + void set_match() { st = ( st < 7 ) ? 7 : 10; } + void set_rep() { st = ( st < 7 ) ? 8 : 11; } + void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } + }; + + +enum { + min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29, + literal_context_bits = 3, + literal_pos_state_bits = 0, // not used + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + len_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << (end_dis_model / 2), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update_buf( uint32_t & crc, const uint8_t * const buffer, + const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size + +typedef uint8_t File_trailer[20]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + +class Range_decoder + { + uint32_t code; + uint32_t range; + +public: + Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) + { + for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); + } + + uint8_t get_byte() { return std::getc( stdin ); } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + } + return symbol; + } + + unsigned decode_bit( Bit_model & bm ) + { + unsigned symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + symbol = 0; + } + else + { + range -= bound; + code -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + return symbol; + } + + unsigned decode_tree( Bit_model bm[], const int num_bits ) + { + unsigned symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) + { + unsigned symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const unsigned match_bit = ( match_byte >> i ) & 1; + const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + bool pos_wrapped; + + void flush_data(); + + uint8_t peek( const unsigned distance ) const + { + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + explicit LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + pos_wrapped( false ) + {} + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + bool decode_member(); + }; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + errno = 0; + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Returns false if error + { + Bit_model bm_literal[1<> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + state.set_char(); + } + else // match or repeated match + { + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) + { + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else + { + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // marker found + { + flush_data(); + return ( len == min_match_len ); // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } + } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + } + } + flush_data(); + return false; + } + + +int main( const int argc, const char * const argv[] ) + { + if( argc > 1 ) + { + std::printf( "Lzd %s - Educational decompressor for the lzip format.\n", + PROGVERSION ); + std::printf( "Study the source to learn how a lzip decompressor works.\n" + "See the lzip manual for an explanation of the code.\n" + "It is not safe to use lzd for any real work.\n" + "\nUsage: %s < file.lz > file\n", argv[0] ); + std::printf( "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2017 Antonio Diaz Diaz.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" ); + return 0; + } + +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) + setmode( fileno( stdin ), O_BINARY ); + setmode( fileno( stdout ), O_BINARY ); +#endif + + for( bool first_member = true; ; first_member = false ) + { + File_header header; // verify header + for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) + { + if( first_member ) + { std::fputs( "Bad magic number (file not in lzip format).\n", stderr ); + return 2; } + break; + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fputs( "Invalid dictionary size in member header.\n", stderr ); + return 2; } + + LZ_decoder decoder( dict_size ); // decode LZMA stream + if( !decoder.decode_member() ) + { std::fputs( "Data error\n", stderr ); return 2; } + + File_trailer trailer; // verify trailer + for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } + if( crc != decoder.crc() || data_size != decoder.data_position() ) + { std::fputs( "CRC error\n", stderr ); return 2; } + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); + return 1; } + return 0; + } + + +File: clzip.info, Node: Concept index, Prev: Reference source code, Up: Top Concept index ************* @@ -596,10 +1515,13 @@ Concept index * bugs: Problems. (line 6) * examples: Examples. (line 6) * file format: File format. (line 6) +* format of the LZMA stream: Stream format. (line 6) * getting help: Problems. (line 6) * introduction: Introduction. (line 6) * invoking: Invoking clzip. (line 6) * options: Invoking clzip. (line 6) +* quality assurance: Quality assurance. (line 6) +* reference source code: Reference source code. (line 6) * trailing data: Trailing data. (line 6) * usage: Invoking clzip. (line 6) * version: Invoking clzip. (line 6) @@ -608,16 +1530,19 @@ Concept index  Tag Table: Node: Top210 -Node: Introduction952 -Node: Invoking clzip6164 -Ref: --trailing-error6730 -Node: File format12728 -Node: Algorithm15150 -Node: Trailing data17980 -Node: Examples19355 -Ref: concat-example20537 -Node: Problems21544 -Node: Concept index22070 +Node: Introduction1154 +Node: Invoking clzip6630 +Ref: --trailing-error7202 +Node: Quality assurance14125 +Node: File format22281 +Node: Algorithm24686 +Node: Stream format27516 +Node: Trailing data38257 +Node: Examples40159 +Ref: concat-example41341 +Node: Problems42386 +Node: Reference source code42920 +Node: Concept index57238  End Tag Table diff --git a/doc/clzip.texi b/doc/clzip.texi index 331d4eb..2cac199 100644 --- a/doc/clzip.texi +++ b/doc/clzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 13 May 2016 -@set VERSION 1.8 +@set UPDATED 13 April 2017 +@set VERSION 1.9 @dircategory Data Compression @direntry @@ -37,16 +37,19 @@ This manual is for Clzip (version @value{VERSION}, @value{UPDATED}). @menu * Introduction:: Purpose and features of clzip * Invoking clzip:: Command line interface +* Quality assurance:: Design, development and testing of lzip * File format:: Detailed format of the compressed file * Algorithm:: How clzip compresses the data +* Stream format:: Format of the LZMA stream in lzip files * Trailing data:: Extra data appended to the file * Examples:: A small tutorial with examples * Problems:: Reporting bugs +* Reference source code:: Source code illustrating stream format * Concept index:: Index of concepts @end menu @sp 1 -Copyright @copyright{} 2010-2016 Antonio Diaz Diaz. +Copyright @copyright{} 2010-2017 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -56,15 +59,16 @@ to copy, distribute and modify it. @chapter Introduction @cindex introduction -Clzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Clzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +Clzip is a C language version of lzip, fully compatible with lzip-1.4 or +newer. As clzip is written in C, it may be easier to integrate in +applications like package managers, embedded devices, or systems lacking +a C++ compiler. -Clzip uses the lzip file format; the files produced by clzip are fully -compatible with lzip-1.4 or newer, and can be rescued with lziprecover. -Clzip is in fact a C language version of lzip, intended for embedded -devices or systems lacking a C++ compiler. +Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip +@w{(lzip -0)}, or compress most files more than bzip2 @w{(lzip -9)}. +Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -84,10 +88,10 @@ including error-checked merging of damaged copies of a file. @item The lzip format is as simple as possible (but not simpler). The lzip -manual provides the code of a simple decompressor along with a detailed -explanation of how it works, so that with the only help of the lzip -manual it would be possible for a digital archaeologist to extract the -data from a lzip file long after quantum computers eventually render +manual provides the source code of a simple decompressor along with a +detailed explanation of how it works, so that with the only help of the +lzip manual it would be possible for a digital archaeologist to extract +the data from a lzip file long after quantum computers eventually render LZMA obsolete. @item @@ -158,16 +162,20 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Clzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Clzip can -also split the compressed output in volumes of a given size, even when -reading from standard input. This allows the direct creation of -multivolume compressed tar archives. +Clzip can produce multimember files, and lziprecover can safely recover +the undamaged members in case of file damage. Clzip can also split the +compressed output in volumes of a given size, even when reading from +standard input. This allows the direct creation of multivolume +compressed tar archives. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, about 2 PiB each. +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. + @node Invoking clzip @chapter Invoking clzip @@ -239,6 +247,20 @@ Force re-compression of files whose name already has the @samp{.lz} or @itemx --keep Keep (don't delete) input files during compression or decompression. +@item -l +@itemx --list +Print the uncompressed size, compressed size and percentage saved of the +specified file(s). Trailing data are ignored. The values produced are +correct even for multimember files. If more than one file is given, a +final line containing the cumulative sizes is printed. With @samp{-v}, +the dictionary size, the number of members in the file, and the amount +of trailing data (if any) are also printed. With @samp{-vv}, the +positions and sizes of each member in multimember files are also +printed. @samp{-lq} can be used to verify quickly (without +decompressing) the structural integrity of the specified files. (Use +@samp{--test} to verify the data integrity). @samp{-alq} additionally +verifies that none of the specified files contain trailing data. + @item -m @var{bytes} @itemx --match-length=@var{bytes} Set the match length limit in bytes. After a match this long is found, @@ -286,7 +308,8 @@ EiB. Check integrity of the specified file(s), but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with @samp{-v} to see information about the file(s). If -a file fails the test, clzip continues checking the rest of the files. +a file fails the test, does not exist, can't be opened, or is a +terminal, clzip continues checking the rest of the files. @item -v @itemx --verbose @@ -296,7 +319,8 @@ second @samp{-v} shows the progress of compression.@* When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 bytes of -trailing data (if any). +trailing data (if any) both in hexadecimal and as a string of printable +ASCII characters. @item -0 .. -9 Set the compression parameters (dictionary size and match length limit) @@ -353,6 +377,190 @@ invalid input file, 3 for an internal consistency error (eg, bug) which caused clzip to panic. +@node Quality assurance +@chapter Design, development and testing of lzip +@cindex quality assurance + +There are two ways of constructing a software design: One way is to make +it so simple that there are obviously no deficiencies and the other way +is to make it so complicated that there are no obvious deficiencies. The +first method is far more difficult.@* +--- C.A.R. Hoare + +Lzip has been designed, written and tested with great care to be the +standard general-purpose compressor for unix-like systems. This chapter +describes the lessons learned from previous compressors (gzip and +bzip2), and their application to the design of lzip. + +@sp 1 +@section Format design + +When gzip was designed in 1992, computers and operating systems were +much less capable than they are today. Gzip tried to work around some of +those limitations, like 8.3 file names, with additional fields in its +file format. + +Today those limitations have mostly disappeared, and the format of gzip +has proved to be unnecessarily complicated. It includes fields that were +never used, others that have lost their usefulness, and finally others +that have become too limited. + +Bzip2 was designed 5 years later, and its format is simpler than the one +of gzip. + +Probably the worst defect of the gzip format from the point of view of +data safety is the variable size of its header. If the byte at offset 3 +(flags) of a gzip member gets corrupted, it may become very difficult to +recover the data, even if the compressed blocks are intact, because it +can't be known with certainty where the compressed blocks begin. + +By contrast, the header of a lzip member has a fixed length of 6. The +LZMA stream in a lzip member always starts at offset 6, making it +trivial to recover the data even if the whole header becomes corrupt. + +Bzip2 also provides a header of fixed length and marks the begin and end +of each compressed block with six magic bytes, making it possible to +find the compressed blocks even in case of file damage. But bzip2 does +not store the size of each compressed block, as lzip does. + +Lzip provides better data recovery capabilities than any other gzip-like +compressor because its format has been designed from the beginning to be +simple and safe. It also helps that the LZMA data stream as used by lzip +is extraordinarily safe. It provides embedded error detection. Any +distance larger than the dictionary size acts as a forbidden symbol, +allowing the decompressor to detect the approximate position of errors, +and leaving very little work for the check sequence (CRC and data sizes) +in the detection of errors. Lzip is usually able to detect all posible +bit-flips in the compressed data without resorting to the check +sequence. It would be very difficult to write an automatic recovery tool +like lziprecover for the gzip format. And, as far as I know, it has +never been written. + +Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the +decompressed data because it provides more accurate error detection than +CRC64 up to a compressed size of about 16 GiB, a size larger than that +of most files. In the case of lzip, the additional detection capability +of the decompressor reduces the probability of undetected errors more +than a million times, making CRC32 more accurate than CRC64 up to about +20 PiB of compressed size. + +The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the uncompressed data. + +@sp 1 +@subsection Gzip format (mis)features not present in lzip + +@table @samp +@item Multiple algorithms + +Gzip provides a CM (Compression Method) field that has never been used +because it is a bad idea to begin with. New compression methods may +require additional fields, making it impossible to implement new methods +and, at the same time, keep the same format. This field does not solve +the problem of format proliferation; it just makes the problem less +obvious. + +@item Optional fields in header + +Unless special precautions are taken, optional fields are generally a +bad idea because they produce a header of variable size. The gzip header +has 2 fields that, in addition to being optional, are zero-terminated. +This means that if any byte inside the field gets zeroed, or if the +terminating zero gets altered, gzip won't be able to find neither the +header CRC nor the compressed blocks. + +@item Optional CRC for the header + +Using an optional checksum for the header is not only a bad idea, it is +an error; it may prevent the extraction of perfectly good data. For +example, if the checksum is used and the bit enabling it is reset by a +bit-flip, the header will appear to be intact (in spite of being +corrupt) while the compressed blocks will appear to be totally +unrecoverable (in spite of being intact). Very misleading indeed. + +@end table + +@subsection Lzip format improvements over gzip and bzip2 + +@table @samp +@item 64-bit size field + +Probably the most frequently reported shortcoming of the gzip format is +that it only stores the least significant 32 bits of the uncompressed +size. The size of any file larger than 4 GiB gets truncated. + +Bzip2 does not store the uncompressed size of the file. + +The lzip format provides a 64-bit field for the uncompressed size. +Additionaly, lzip produces multimember output automatically when the +size is too large for a single member, allowing for an unlimited +uncompressed size. + +@item Distributed index + +The lzip format provides a distributed index that, among other things, +helps plzip to decompress several times faster than pigz and helps +lziprecover do its job. Neither the gzip format nor the bzip2 format do +provide an index. + +A distributed index is safer and more scalable than a monolithic index. +The monolithic index introduces a single point of failure in the +compressed file and may limit the number of members or the total +uncompressed size. + +@end table + +@section Quality of implementation + +@table @samp +@item Accurate and robust error detection + +The lzip format provides 3 factor integrity checking and the +decompressors report mismatches in each factor separately. This way if +just one byte in one factor fails but the other two factors match the +data, it probably means that the data are intact and the corruption just +affects the mismatching factor (CRC or data size) in the check sequence. + +@item Multiple implementations + +Just like the lzip format provides 3 factor protection against +undetected data corruption, the development methodology of the lzip +family of compressors provides 3 factor protection against undetected +programming errors. + +Three related but independent compressor implementations, lzip, clzip +and minilzip/lzlib, are developed concurrently. Every stable release of +any of them is subjected to a hundred hours of intensive testing to +verify that it produces identical output to the other two. This +guarantees that all three implement the same algorithm, and makes it +unlikely that any of them may contain serious undiscovered errors. In +fact, no errors have been discovered in lzip since 2009. + +Additionally, the three implementations have been extensively tested +with +@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash}, +valgrind and @samp{american fuzzy lop} without finding a single +vulnerability or false negative. +@ifnothtml +@xref{Unzcrash,,,lziprecover}. +@end ifnothtml + +@item Dictionary size + +Lzip automatically uses the smallest possible dictionary size for each +file. In addition to reducing the amount of memory required for +decompression, this feature also minimizes the probability of being +affected by RAM errors during compression. + +@item Exit status + +Returning a warning status of 2 is a design flaw of compress that leaked +into the design of gzip. Both bzip2 and lzip are free from this flaw. + +@end table + + @node File format @chapter File format @cindex file format @@ -412,15 +620,8 @@ Valid values for dictionary size range from 4 KiB to 512 MiB. @item LZMA stream The LZMA stream, finished by an end of stream marker. Uses default -values for encoder properties. -@ifnothtml -@xref{Stream format,,,lzip}, -@end ifnothtml -@ifhtml -See -@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html#Stream-format,,Stream format} -@end ifhtml -for a complete description. +values for encoder properties. @xref{Stream format}, for a complete +description. @item CRC32 (4 bytes) CRC of the uncompressed original data. @@ -502,24 +703,274 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). +@node Stream format +@chapter Format of the LZMA stream in lzip files +@cindex format of the LZMA stream + +The LZMA algorithm has three parameters, called "special LZMA +properties", to adjust it for some kinds of binary data. These +parameters are; @samp{literal_context_bits} (with a default value of 3), +@samp{literal_pos_state_bits} (with a default value of 0), and +@samp{pos_state_bits} (with a default value of 2). As a general purpose +compressor, lzip only uses the default values for these parameters. In +particular @samp{literal_pos_state_bits} has been optimized away and +does not even appear in the code. + +Lzip also finishes the LZMA stream with an "End Of Stream" marker (the +distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the +"member size" field in the member trailer allows the verification of +stream integrity. The LZMA stream in lzip files always has these two +features (default properties and EOS marker) and is referred to in this +document as LZMA-302eos or LZMA-lzip. + +The second stage of LZMA is a range encoder that uses a different +probability model for each type of symbol; distances, lengths, literal +bytes, etc. Range encoding conceptually encodes all the symbols of the +message into one number. Unlike Huffman coding, which assigns to each +symbol a bit-pattern and concatenates all the bit-patterns together, +range encoding can compress one symbol to less than one bit. Therefore +the compressed data produced by a range encoder can't be split in pieces +that could be individually described. + +It seems that the only way of describing the LZMA-302eos stream is +describing the algorithm that decodes it. And given the many details +about the range decoder that need to be described accurately, the source +code of a real decoder seems the only appropriate reference to use. + +What follows is a description of the decoding algorithm for LZMA-302eos +streams using as reference the source code of "lzd", an educational +decompressor for lzip files which can be downloaded from the lzip +download directory. The source code of lzd is included in appendix A. +@xref{Reference source code}. + +@sp 1 +@section What is coded + +The LZMA stream includes literals, matches and repeated matches (matches +reusing a recently used distance). There are 7 different coding sequences: + +@multitable @columnfractions .35 .14 .51 +@headitem Bit sequence @tab Name @tab Description +@item 0 + byte @tab literal @tab literal byte +@item 1 + 0 + len + dis @tab match @tab distance-length pair +@item 1 + 1 + 0 + 0 @tab shortrep @tab 1 byte match at latest used +distance +@item 1 + 1 + 0 + 1 + len @tab rep0 @tab len bytes match at latest used +distance +@item 1 + 1 + 1 + 0 + len @tab rep1 @tab len bytes match at second +latest used distance +@item 1 + 1 + 1 + 1 + 0 + len @tab rep2 @tab len bytes match at third +latest used distance +@item 1 + 1 + 1 + 1 + 1 + len @tab rep3 @tab len bytes match at fourth +latest used distance +@end multitable + +@sp 1 +In the following tables, multibit sequences are coded in normal order, +from MSB to LSB, except where noted otherwise. + +Lengths (the @samp{len} in the table above) are coded as follows: + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item 0 + 3 bits @tab lengths from 2 to 9 +@item 1 + 0 + 3 bits @tab lengths from 10 to 17 +@item 1 + 1 + 8 bits @tab lengths from 18 to 273 +@end multitable + +@sp 1 +The coding of distances is a little more complicated, so I'll begin +explaining a simpler version of the encoding. + +Imagine you need to code a number from 0 to @w{2^32 - 1}, and you want +to do it in a way that produces shorter codes for the smaller numbers. +You may first send the position of the most significant bit that is set +to 1, which you may find by making a bit scan from the left (from the +MSB). A position of 0 means that the number is 0 (no bit is set), 1 +means the LSB is the first bit set (the number is 1), and 32 means the +MSB is set (i.e., the number is @w{>= 0x80000000}). Let's call this bit +position a "slot". Then, if slot is @w{> 1}, you send the remaining +@w{slot - 1} bits. Let's call these bits "direct_bits" because they are +coded directly by value instead of indirectly by position. + +The inconvenient of this simple method is that it needs 6 bits to code +the slot, but it just uses 33 of the 64 possible values, wasting almost +half of the codes. + +The intelligent trick of LZMA is that it encodes the position of the +most significant bit set, along with the value of the next bit, in the +same 6 bits that would take to encode the position alone. This seems to +need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is +no next bit, so the number of needed slots is 64 (0 to 63). + +The 6 bits representing this "slot number" are then context-coded. If +the distance is @w{>= 4}, the remaining bits are coded as follows. +@samp{direct_bits} is the amount of remaining bits (from 0 to 30) needed +to form a complete distance, and is calculated as @w{(slot >> 1) - 1}. +If a distance needs 6 or more direct_bits, the last 4 bits are coded +separately. The last piece (all the direct_bits for distances 4 to 127 +or the last 4 bits for distances @w{>= 128}) is context-coded in reverse +order (from LSB to MSB). For distances @w{>= 128}, the +@w{@samp{direct_bits - 4}} part is coded with fixed 0.5 probability. + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item slot @tab distances from 0 to 3 +@item slot + direct_bits @tab distances from 4 to 127 +@item slot + (direct_bits - 4) + 4 bits @tab distances from 128 to +2^32 - 1 +@end multitable + +@sp 1 +@section The coding contexts + +These contexts (@samp{Bit_model} in the source), are integers or arrays +of integers representing the probability of the corresponding bit being 0. + +The indices used in these arrays are: + +@table @samp +@item state +A state machine (@samp{State} in the source) with 12 states (0 to 11), +coding the latest 2 to 4 types of sequences processed. The initial state +is 0. + +@item pos_state +Value of the 2 least significant bits of the current position in the +decoded data. + +@item literal_state +Value of the 3 most significant bits of the latest byte decoded. + +@item len_state +Coded value of length @w{(length - 2)}, with a maximum of 3. The +resulting value is in the range 0 to 3. + +@end table + + +In the following table, @samp{!literal} is any sequence except a literal +byte. @samp{rep} is any one of @samp{rep0}, @samp{rep1}, @samp{rep2} or +@samp{rep3}. The types of previous sequences corresponding to each state +are: + +@multitable {State} {rep or (!literal, shortrep), literal, literal} +@headitem State @tab Types of previous sequences +@item 0 @tab literal, literal, literal +@item 1 @tab match, literal, literal +@item 2 @tab rep or (!literal, shortrep), literal, literal +@item 3 @tab literal, shortrep, literal, literal +@item 4 @tab match, literal +@item 5 @tab rep or (!literal, shortrep), literal +@item 6 @tab literal, shortrep, literal +@item 7 @tab literal, match +@item 8 @tab literal, rep +@item 9 @tab literal, shortrep +@item 10 @tab !literal, match +@item 11 @tab !literal, (rep or shortrep) +@end multitable + +@sp 1 +The contexts for decoding the type of coding sequence are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_match @tab state, pos_state @tab sequence start +@item bm_rep @tab state @tab after sequence 1 +@item bm_rep0 @tab state @tab after sequence 11 +@item bm_rep1 @tab state @tab after sequence 111 +@item bm_rep2 @tab state @tab after sequence 1111 +@item bm_len @tab state, pos_state @tab after sequence 110 +@end multitable + +@sp 1 +The contexts for decoding distances are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_dis_slot @tab len_state, bit tree @tab distance start +@item bm_dis @tab reverse bit tree @tab after slots 4 to 13 +@item bm_align @tab reverse bit tree @tab for distances >= 128, after +fixed probability bits +@end multitable + +@sp 1 +There are two separate sets of contexts for lengths (@samp{Len_model} in +the source). One for normal matches, the other for repeated matches. The +contexts in each Len_model are (see @samp{decode_len} in the source): + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item choice1 @tab none @tab length start +@item choice2 @tab none @tab after sequence 1 +@item bm_low @tab pos_state, bit tree @tab after sequence 0 +@item bm_mid @tab pos_state, bit tree @tab after sequence 10 +@item bm_high @tab bit tree @tab after sequence 11 +@end multitable + +@sp 1 +The context array @samp{bm_literal} is special. In principle it acts as +a normal bit tree context, the one selected by @samp{literal_state}. But +if the previous decoded byte was not a literal, two other bit tree +contexts are used depending on the value of each bit in +@samp{match_byte} (the byte at the latest used distance), until a bit is +decoded that is different from its corresponding bit in +@samp{match_byte}. After the first difference is found, the rest of the +byte is decoded using the normal bit tree context. (See +@samp{decode_matched} in the source). + +@sp 1 +@section The range decoder + +The LZMA stream is consumed one byte at a time by the range decoder. +(See @samp{normalize} in the source). Every byte consumed produces a +variable number of decoded bits, depending on how well these bits agree +with their context. (See @samp{decode_bit} in the source). + +The range decoder state consists of two unsigned 32-bit variables; +@code{range} (representing the most significant part of the range size +not yet decoded), and @code{code} (representing the current point within +@code{range}). @code{range} is initialized to @w{(2^32 - 1)}, and +@code{code} is initialized to 0. + +The range encoder produces a first 0 byte that must be ignored by the +range decoder. This is done by shifting 5 bytes in the initialization of +@code{code} instead of 4. (See the @samp{Range_decoder} constructor in +the source). + +@sp 1 +@section Decoding the LZMA stream + +After decoding the member header and obtaining the dictionary size, the +range decoder is initialized and then the LZMA decoder enters a loop +(See @samp{decode_member} in the source) where it invokes the range +decoder with the appropriate contexts to decode the different coding +sequences (matches, repeated matches, and literal bytes), until the "End +Of Stream" marker is decoded. + + @node Trailing data @chapter Extra data appended to the file @cindex trailing data -Sometimes extra data is found appended to a lzip file after the last +Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: @itemize @bullet @item Padding added to make the file size a multiple of some block size, for -example when writing to a tape. +example when writing to a tape. It is safe to append any amount of +padding zero bytes to a lzip file. @item -Garbage added by some not totally successful copy operation. +Useful data added by the user; a cryptographically secure hash, a +description of file contents, etc. It is safe to append any amount of +text to a lzip file as long as the text does not begin with the string +"LZIP", and does not contain any zero bytes (null characters). Nonzero +bytes and zero bytes can't be safely mixed in trailing data. @item -Useful data added by the user; a cryptographically secure hash, a -description of file contents, etc. +Garbage added by some not totally successful copy operation. @item Malicious data added to the file in order to make its total size and @@ -534,8 +985,12 @@ integrity information itself. Therefore it can be considered to be below the noise level. @end itemize +Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + Trailing data can be safely ignored in most cases. In some cases, like -that of user-added data, it is expected to be ignored. In those cases +that of user-added data, they are expected to be ignored. In those cases where a file containing trailing data must be rejected, the option @samp{--trailing-error} can be used. @xref{--trailing-error}. @@ -600,8 +1055,8 @@ clzip -c /dev/sdc > file.lz @sp 1 @anchor{concat-example} @noindent -Example 6: The right way of concatenating compressed files. -@xref{Trailing data}. +Example 6: The right way of concatenating the decompressed output of two +or more compressed files. @xref{Trailing data}. @example Don't do this @@ -671,6 +1126,477 @@ If you find a bug in clzip, please send electronic mail to find by running @w{@code{clzip --version}}. +@node Reference source code +@appendix Reference source code +@cindex reference source code + +@verbatim +/* Lzd - Educational decompressor for the lzip format + Copyright (C) 2013-2017 Antonio Diaz Diaz. + + This program is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file. +*/ + +#include +#include +#include +#include +#include +#include +#include +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) +#include +#include +#endif + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + void set_match() { st = ( st < 7 ) ? 7 : 10; } + void set_rep() { st = ( st < 7 ) ? 8 : 11; } + void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } + }; + + +enum { + min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29, + literal_context_bits = 3, + literal_pos_state_bits = 0, // not used + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + len_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << (end_dis_model / 2), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update_buf( uint32_t & crc, const uint8_t * const buffer, + const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size + +typedef uint8_t File_trailer[20]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + +class Range_decoder + { + uint32_t code; + uint32_t range; + +public: + Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) + { + for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); + } + + uint8_t get_byte() { return std::getc( stdin ); } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + } + return symbol; + } + + unsigned decode_bit( Bit_model & bm ) + { + unsigned symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + symbol = 0; + } + else + { + range -= bound; + code -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + return symbol; + } + + unsigned decode_tree( Bit_model bm[], const int num_bits ) + { + unsigned symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) + { + unsigned symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const unsigned match_bit = ( match_byte >> i ) & 1; + const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + bool pos_wrapped; + + void flush_data(); + + uint8_t peek( const unsigned distance ) const + { + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + explicit LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + pos_wrapped( false ) + {} + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + bool decode_member(); + }; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + errno = 0; + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Returns false if error + { + Bit_model bm_literal[1<> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + state.set_char(); + } + else // match or repeated match + { + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) + { + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else + { + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // marker found + { + flush_data(); + return ( len == min_match_len ); // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } + } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + } + } + flush_data(); + return false; + } + + +int main( const int argc, const char * const argv[] ) + { + if( argc > 1 ) + { + std::printf( "Lzd %s - Educational decompressor for the lzip format.\n", + PROGVERSION ); + std::printf( "Study the source to learn how a lzip decompressor works.\n" + "See the lzip manual for an explanation of the code.\n" + "It is not safe to use lzd for any real work.\n" + "\nUsage: %s < file.lz > file\n", argv[0] ); + std::printf( "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2017 Antonio Diaz Diaz.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" ); + return 0; + } + +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) + setmode( fileno( stdin ), O_BINARY ); + setmode( fileno( stdout ), O_BINARY ); +#endif + + for( bool first_member = true; ; first_member = false ) + { + File_header header; // verify header + for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) + { + if( first_member ) + { std::fputs( "Bad magic number (file not in lzip format).\n", stderr ); + return 2; } + break; + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fputs( "Invalid dictionary size in member header.\n", stderr ); + return 2; } + + LZ_decoder decoder( dict_size ); // decode LZMA stream + if( !decoder.decode_member() ) + { std::fputs( "Data error\n", stderr ); return 2; } + + File_trailer trailer; // verify trailer + for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } + if( crc != decoder.crc() || data_size != decoder.data_position() ) + { std::fputs( "CRC error\n", stderr ); return 2; } + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); + return 1; } + return 0; + } +@end verbatim + + @node Concept index @unnumbered Concept index diff --git a/encoder.c b/encoder.c index ce0ddf2..4b9d7ec 100644 --- a/encoder.c +++ b/encoder.c @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -43,7 +43,7 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ) const int min_pos = ( e->eb.mb.pos > e->eb.mb.dictionary_size ) ? e->eb.mb.pos - e->eb.mb.dictionary_size : 0; const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb ); - int count, key2, key3, key4, newpos; + int count, key2, key3, key4, newpos1; unsigned tmp; int len_limit = e->match_len_limit; @@ -90,15 +90,15 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ) e->eb.mb.prev_positions[key2] = pos1; e->eb.mb.prev_positions[key3] = pos1; - newpos = e->eb.mb.prev_positions[key4]; + newpos1 = e->eb.mb.prev_positions[key4]; e->eb.mb.prev_positions[key4] = pos1; for( count = e->cycles; ; ) { int delta; - if( newpos <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; } + if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; } - delta = pos1 - newpos; + delta = pos1 - newpos1; newptr = e->eb.mb.pos_array + ( ( e->eb.mb.cyclic_pos - delta + ( (e->eb.mb.cyclic_pos >= delta) ? 0 : e->eb.mb.dictionary_size + 1 ) ) << 1 ); @@ -120,16 +120,16 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ) } if( data[len-delta] < data[len] ) { - *ptr0 = newpos; + *ptr0 = newpos1; ptr0 = newptr + 1; - newpos = *ptr0; + newpos1 = *ptr0; len0 = len; if( len1 < len ) len = len1; } else { - *ptr1 = newpos; + *ptr1 = newpos1; ptr1 = newptr; - newpos = *ptr1; + newpos1 = *ptr1; len1 = len; if( len0 < len ) len = len0; } } @@ -145,7 +145,7 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e ) const int dis_slot = dis_slots[dis]; const int direct_bits = ( dis_slot >> 1 ) - 1; const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - const int price = price_symbol_reversed( e->eb.bm_dis + base - dis_slot - 1, + const int price = price_symbol_reversed( e->eb.bm_dis + ( base - dis_slot ), dis - base, direct_bits ); for( len_state = 0; len_state < len_states; ++len_state ) e->dis_prices[len_state][dis] = price; @@ -158,9 +158,9 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e ) const Bit_model * const bmds = e->eb.bm_dis_slot[len_state]; int slot = 0; for( ; slot < end_dis_model; ++slot ) - dsp[slot] = price_symbol( bmds, slot, dis_slot_bits ); + dsp[slot] = price_symbol6( bmds, slot ); for( ; slot < e->num_dis_slots; ++slot ) - dsp[slot] = price_symbol( bmds, slot, dis_slot_bits ) + + dsp[slot] = price_symbol6( bmds, slot ) + (((( slot >> 1 ) - 1 ) - dis_align_bits ) << price_shift_bits ); for( dis = 0; dis < start_dis_model; ++dis ) @@ -173,16 +173,16 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e ) /* Returns the number of bytes advanced (ahead). trials[0]..trials[ahead-1] contain the steps to encode. - ( trials[0].dis == -1 ) means literal. + ( trials[0].dis4 == -1 ) means literal. A match/rep longer or equal than match_len_limit finishes the sequence. */ static int LZe_sequence_optimizer( struct LZ_encoder * const e, const int reps[num_rep_distances], const State state ) { - int main_len, num_pairs, i, rep, cur = 0, num_trials, len; + int main_len, num_pairs, i, rep, num_trials, len; + int rep_index = 0, cur = 0; int replens[num_rep_distances]; - int rep_index = 0; if( e->pending_num_pairs > 0 ) /* from previous call */ { @@ -195,13 +195,13 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, for( i = 0; i < num_rep_distances; ++i ) { - replens[i] = Mb_true_match_len( &e->eb.mb, 0, reps[i] + 1, max_match_len ); + replens[i] = Mb_true_match_len( &e->eb.mb, 0, reps[i] + 1 ); if( replens[i] > replens[rep_index] ) rep_index = i; } if( replens[rep_index] >= e->match_len_limit ) { e->trials[0].price = replens[rep_index]; - e->trials[0].dis = rep_index; + e->trials[0].dis4 = rep_index; LZe_move_and_update( e, replens[rep_index] ); return replens[rep_index]; } @@ -209,7 +209,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, if( main_len >= e->match_len_limit ) { e->trials[0].price = main_len; - e->trials[0].dis = e->pairs[num_pairs-1].dis + num_rep_distances; + e->trials[0].dis4 = e->pairs[num_pairs-1].dis + num_rep_distances; LZe_move_and_update( e, main_len ); return main_len; } @@ -227,7 +227,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, e->trials[1].price += LZeb_price_literal( &e->eb, prev_byte, cur_byte ); else e->trials[1].price += LZeb_price_matched( &e->eb, prev_byte, cur_byte, match_byte ); - e->trials[1].dis = -1; /* literal */ + e->trials[1].dis4 = -1; /* literal */ if( match_byte == cur_byte ) Tr_update( &e->trials[1], rep_match_price + @@ -238,7 +238,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, if( num_trials < min_match_len ) { e->trials[0].price = 1; - e->trials[0].dis = e->trials[1].dis; + e->trials[0].dis4 = e->trials[1].dis4; Mb_move_pos( &e->eb.mb ); return 1; } @@ -263,7 +263,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, if( main_len > replens[0] ) { const int normal_match_price = match_price + price0( e->eb.bm_rep[state] ); - i = 0, len = max( replens[0] + 1, min_match_len ); + int i = 0, len = max( replens[0] + 1, min_match_len ); while( len > e->pairs[i].len ) ++i; while( true ) { @@ -304,7 +304,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, /* give final values to current trial */ cur_trial = &e->trials[cur]; { - int dis = cur_trial->dis; + const int dis4 = cur_trial->dis4; int prev_index = cur_trial->prev_index; const int prev_index2 = cur_trial->prev_index2; @@ -313,32 +313,24 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, cur_state = e->trials[prev_index].state; if( prev_index + 1 == cur ) /* len == 1 */ { - if( dis == 0 ) cur_state = St_set_short_rep( cur_state ); + if( dis4 == 0 ) cur_state = St_set_short_rep( cur_state ); else cur_state = St_set_char( cur_state ); /* literal */ } - else if( dis < num_rep_distances ) cur_state = St_set_rep( cur_state ); + else if( dis4 < num_rep_distances ) cur_state = St_set_rep( cur_state ); else cur_state = St_set_match( cur_state ); } - else if( prev_index2 == dual_step_trial ) /* dis == 0 */ + else { - --prev_index; - cur_state = e->trials[prev_index].state; - cur_state = St_set_char( cur_state ); - cur_state = St_set_rep( cur_state ); - } - else /* if( prev_index2 >= 0 ) */ - { - prev_index = prev_index2; - cur_state = e->trials[prev_index].state; - if( dis < num_rep_distances ) cur_state = St_set_rep( cur_state ); - else cur_state = St_set_match( cur_state ); - cur_state = St_set_char( cur_state ); - cur_state = St_set_rep( cur_state ); + if( prev_index2 == dual_step_trial ) /* dis4 == 0 (rep0) */ + --prev_index; + else /* prev_index2 >= 0 */ + prev_index = prev_index2; + cur_state = 8; /* St_set_char_rep(); */ } cur_trial->state = cur_state; for( i = 0; i < num_rep_distances; ++i ) cur_trial->reps[i] = e->trials[prev_index].reps[i]; - mtf_reps( dis, cur_trial->reps ); + mtf_reps( dis4, cur_trial->reps ); /* literal is ignored */ } pos_state = Mb_data_position( &e->eb.mb ) & pos_state_mask; @@ -361,7 +353,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, match_price = cur_trial->price + price1( e->eb.bm_match[cur_state][pos_state] ); rep_match_price = match_price + price1( e->eb.bm_rep[cur_state] ); - if( match_byte == cur_byte && next_trial->dis != 0 && + if( match_byte == cur_byte && next_trial->dis4 != 0 && next_trial->prev_index2 == single_step_trial ) { const int price = rep_match_price + @@ -369,7 +361,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, if( price <= next_trial->price ) { next_trial->price = price; - next_trial->dis = 0; + next_trial->dis4 = 0; /* rep0 */ next_trial->prev_index = cur; } } @@ -386,16 +378,16 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb ); const int dis = cur_trial->reps[0] + 1; const int limit = min( e->match_len_limit + 1, triable_bytes ); - len = 1; + int len = 1; while( len < limit && data[len-dis] == data[len] ) ++len; if( --len >= min_match_len ) { const int pos_state2 = ( pos_state + 1 ) & pos_state_mask; const State state2 = St_set_char( cur_state ); const int price = next_price + - price1( e->eb.bm_match[state2][pos_state2] ) + - price1( e->eb.bm_rep[state2] ) + - LZe_price_rep0_len( e, len, state2, pos_state2 ); + price1( e->eb.bm_match[state2][pos_state2] ) + + price1( e->eb.bm_rep[state2] ) + + LZe_price_rep0_len( e, len, state2, pos_state2 ); while( num_trials < cur + 1 + len ) e->trials[++num_trials].price = infinite_price; Tr_update2( &e->trials[cur+1+len], price, cur + 1 ); @@ -406,8 +398,8 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, for( rep = 0; rep < num_rep_distances; ++rep ) { const uint8_t * const data = Mb_ptr_to_current_pos( &e->eb.mb ); - int price; const int dis = cur_trial->reps[rep] + 1; + int price; if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue; for( len = min_match_len; len < len_limit; ++len ) @@ -463,7 +455,6 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, for( len = start_len; ; ++len ) { int price = normal_match_price + LZe_price_pair( e, dis, len, pos_state ); - Tr_update( &e->trials[cur+len], price, dis + num_rep_distances, cur ); /* try match + literal + rep0 */ @@ -510,7 +501,7 @@ bool LZe_encode_member( struct LZ_encoder * const e, const int dis_price_count = best ? 1 : 512; const int align_price_count = best ? 1 : dis_align_size; const int price_count = ( e->match_len_limit > 36 ) ? 1013 : 4093; - int price_counter = 0; + int price_counter = 0; /* counters may decrement below 0 */ int dis_price_counter = 0; int align_price_counter = 0; int ahead, i; @@ -551,7 +542,6 @@ bool LZe_encode_member( struct LZ_encoder * const e, } ahead = LZe_sequence_optimizer( e, reps, state ); - if( ahead <= 0 ) return false; /* can't happen */ price_counter -= ahead; for( i = 0; ahead > 0; ) @@ -559,7 +549,7 @@ bool LZe_encode_member( struct LZ_encoder * const e, const int pos_state = ( Mb_data_position( &e->eb.mb ) - ahead ) & pos_state_mask; const int len = e->trials[i].price; - const int dis = e->trials[i].dis; + int dis = e->trials[i].dis4; bool bit = ( dis < 0 ); Re_encode_bit( &e->eb.renc, &e->eb.bm_match[state][pos_state], !bit ); @@ -605,9 +595,9 @@ bool LZe_encode_member( struct LZ_encoder * const e, } else /* match */ { - LZeb_encode_pair( &e->eb, dis - num_rep_distances, len, pos_state ); - if( get_slot( dis - num_rep_distances ) >= end_dis_model ) - --align_price_counter; + dis -= num_rep_distances; + LZeb_encode_pair( &e->eb, dis, len, pos_state ); + if( dis >= modeled_distances ) --align_price_counter; --dis_price_counter; Lp_decrement_counter( &e->match_len_prices, pos_state ); state = St_set_match( state ); diff --git a/encoder.h b/encoder.h index 99670b1..c65022a 100644 --- a/encoder.h +++ b/encoder.h @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ struct Len_prices int len_symbols; int count; int prices[pos_states][max_len_symbols]; - int counters[pos_states]; + int counters[pos_states]; /* may decrement below 0 */ }; static inline void Lp_update_low_mid_prices( struct Len_prices * const lp, @@ -30,15 +30,14 @@ static inline void Lp_update_low_mid_prices( struct Len_prices * const lp, int * const pps = lp->prices[pos_state]; int tmp = price0( lp->lm->choice1 ); int len = 0; - lp->counters[pos_state] = lp->count; for( ; len < len_low_symbols && len < lp->len_symbols; ++len ) - pps[len] = tmp + price_symbol( lp->lm->bm_low[pos_state], len, len_low_bits ); + pps[len] = tmp + price_symbol3( lp->lm->bm_low[pos_state], len ); if( len >= lp->len_symbols ) return; tmp = price1( lp->lm->choice1 ) + price0( lp->lm->choice2 ); for( ; len < len_low_symbols + len_mid_symbols && len < lp->len_symbols; ++len ) pps[len] = tmp + - price_symbol( lp->lm->bm_mid[pos_state], len - len_low_symbols, len_mid_bits ); - } + price_symbol3( lp->lm->bm_mid[pos_state], len - len_low_symbols ); + } static inline void Lp_update_high_prices( struct Len_prices * const lp ) { @@ -48,7 +47,7 @@ static inline void Lp_update_high_prices( struct Len_prices * const lp ) /* using 4 slots per value makes "Lp_price" faster */ lp->prices[3][len] = lp->prices[2][len] = lp->prices[1][len] = lp->prices[0][len] = tmp + - price_symbol( lp->lm->bm_high, len - len_low_symbols - len_mid_symbols, len_high_bits ); + price_symbol8( lp->lm->bm_high, len - len_low_symbols - len_mid_symbols ); } static inline void Lp_reset( struct Len_prices * const lp ) @@ -74,14 +73,15 @@ static inline void Lp_update_prices( struct Len_prices * const lp ) bool high_pending = false; for( pos_state = 0; pos_state < pos_states; ++pos_state ) if( lp->counters[pos_state] <= 0 ) - { Lp_update_low_mid_prices( lp, pos_state ); high_pending = true; } + { lp->counters[pos_state] = lp->count; + Lp_update_low_mid_prices( lp, pos_state ); high_pending = true; } if( high_pending && lp->len_symbols > len_low_symbols + len_mid_symbols ) Lp_update_high_prices( lp ); } static inline int Lp_price( const struct Len_prices * const lp, - const int symbol, const int pos_state ) - { return lp->prices[pos_state][symbol - min_match_len]; } + const int len, const int pos_state ) + { return lp->prices[pos_state][len - min_match_len]; } struct Pair /* distance-length pair */ @@ -99,7 +99,7 @@ struct Trial { State state; int price; /* dual use var; cumulative price, match length */ - int dis; /* rep index or match distance. (-1 for literal) */ + int dis4; /* -1 for literal, or rep, or match distance + 4 */ int prev_index; /* index of prev trial in trials[] */ int prev_index2; /* -2 trial is single step */ /* -1 literal + rep0 */ @@ -108,34 +108,28 @@ struct Trial }; static inline void Tr_update( struct Trial * const trial, const int pr, - const int distance, const int p_i ) + const int distance4, const int p_i ) { if( pr < trial->price ) - { - trial->price = pr; trial->dis = distance; trial->prev_index = p_i; - trial->prev_index2 = single_step_trial; - } + { trial->price = pr; trial->dis4 = distance4; trial->prev_index = p_i; + trial->prev_index2 = single_step_trial; } } static inline void Tr_update2( struct Trial * const trial, const int pr, const int p_i ) { if( pr < trial->price ) - { - trial->price = pr; trial->dis = 0; trial->prev_index = p_i; - trial->prev_index2 = dual_step_trial; - } + { trial->price = pr; trial->dis4 = 0; trial->prev_index = p_i; + trial->prev_index2 = dual_step_trial; } } static inline void Tr_update3( struct Trial * const trial, const int pr, - const int distance, const int p_i, + const int distance4, const int p_i, const int p_i2 ) { if( pr < trial->price ) - { - trial->price = pr; trial->dis = distance; trial->prev_index = p_i; - trial->prev_index2 = p_i2; - } + { trial->price = pr; trial->dis4 = distance4; trial->prev_index = p_i; + trial->prev_index2 = p_i2; } } @@ -161,26 +155,25 @@ static inline bool Mb_dec_pos( struct Matchfinder_base * const mb, { if( ahead < 0 || mb->pos < ahead ) return false; mb->pos -= ahead; + if( mb->cyclic_pos < ahead ) mb->cyclic_pos += mb->dictionary_size + 1; mb->cyclic_pos -= ahead; - if( mb->cyclic_pos < 0 ) mb->cyclic_pos += mb->dictionary_size + 1; return true; } int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ); - /* move-to-front dis in/into reps if( dis > 0 ) */ -static inline void mtf_reps( const int dis, int reps[num_rep_distances] ) + /* move-to-front dis in/into reps; do nothing if( dis4 <= 0 ) */ +static inline void mtf_reps( const int dis4, int reps[num_rep_distances] ) { - int i; - if( dis >= num_rep_distances ) + if( dis4 >= num_rep_distances ) /* match */ { - for( i = num_rep_distances - 1; i > 0; --i ) reps[i] = reps[i-1]; - reps[0] = dis - num_rep_distances; + reps[3] = reps[2]; reps[2] = reps[1]; reps[1] = reps[0]; + reps[0] = dis4 - num_rep_distances; } - else if( dis > 0 ) + else if( dis4 > 0 ) /* repeated match */ { - const int distance = reps[dis]; - for( i = dis; i > 0; --i ) reps[i] = reps[i-1]; + const int distance = reps[dis4]; + int i; for( i = dis4; i > 0; --i ) reps[i] = reps[i-1]; reps[0] = distance; } } @@ -192,8 +185,8 @@ static inline int LZeb_price_shortrep( const struct LZ_encoder_base * const eb, } static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb, - const int rep, - const State state, const int pos_state ) + const int rep, const State state, + const int pos_state ) { int price; if( rep == 0 ) return price0( eb->bm_rep0[state] ) + @@ -210,8 +203,8 @@ static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb, } static inline int LZe_price_rep0_len( const struct LZ_encoder * const e, - const int len, - const State state, const int pos_state ) + const int len, const State state, + const int pos_state ) { return LZeb_price_rep( &e->eb, 0, state, pos_state ) + Lp_price( &e->rep_len_prices, len, pos_state ); @@ -235,13 +228,10 @@ static inline int LZe_read_match_distances( struct LZ_encoder * const e ) const int num_pairs = LZe_get_match_pairs( e, e->pairs ); if( num_pairs > 0 ) { - int len = e->pairs[num_pairs-1].len; + const int len = e->pairs[num_pairs-1].len; if( len == e->match_len_limit && len < max_match_len ) - { - len += Mb_true_match_len( &e->eb.mb, len, e->pairs[num_pairs-1].dis + 1, - max_match_len - len ); - e->pairs[num_pairs-1].len = len; - } + e->pairs[num_pairs-1].len = + Mb_true_match_len( &e->eb.mb, len, e->pairs[num_pairs-1].dis + 1 ); } return num_pairs; } @@ -258,7 +248,7 @@ static inline void LZe_move_and_update( struct LZ_encoder * const e, int n ) static inline void LZe_backward( struct LZ_encoder * const e, int cur ) { - int * const dis = &e->trials[cur].dis; + int dis4 = e->trials[cur].dis4; while( cur > 0 ) { const int prev_index = e->trials[cur].prev_index; @@ -266,19 +256,19 @@ static inline void LZe_backward( struct LZ_encoder * const e, int cur ) if( e->trials[cur].prev_index2 != single_step_trial ) { - prev_trial->dis = -1; + prev_trial->dis4 = -1; /* literal */ prev_trial->prev_index = prev_index - 1; prev_trial->prev_index2 = single_step_trial; if( e->trials[cur].prev_index2 >= 0 ) { struct Trial * const prev_trial2 = &e->trials[prev_index-1]; - prev_trial2->dis = *dis; *dis = 0; + prev_trial2->dis4 = dis4; dis4 = 0; /* rep0 */ prev_trial2->prev_index = e->trials[cur].prev_index2; prev_trial2->prev_index2 = single_step_trial; } } prev_trial->price = cur - prev_index; /* len */ - cur = *dis; *dis = prev_trial->dis; prev_trial->dis = cur; + cur = dis4; dis4 = prev_trial->dis4; prev_trial->dis4 = cur; cur = prev_index; } } @@ -290,7 +280,7 @@ static inline bool LZe_init( struct LZ_encoder * const e, const int dict_size, const int len_limit, const int ifd, const int outfd ) { - enum { before = max_num_trials + 1, + enum { before = max_num_trials, /* bytes to keep in buffer after pos */ after_size = ( 2 * max_match_len ) + 1, dict_factor = 2, diff --git a/encoder_base.c b/encoder_base.c index 31cad3f..e384d22 100644 --- a/encoder_base.c +++ b/encoder_base.c @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -54,11 +54,11 @@ void Mb_normalize_pos( struct Matchfinder_base * const mb ) if( !mb->at_stream_end ) { int i; - const int offset = mb->pos - mb->dictionary_size - mb->before_size; + const int offset = mb->pos - mb->before_size - mb->dictionary_size; const int size = mb->stream_pos - offset; memmove( mb->buffer, mb->buffer + offset, size ); mb->partial_data_pos += offset; - mb->pos -= offset; + mb->pos -= offset; /* pos = before_size + dictionary_size */ mb->stream_pos -= offset; for( i = 0; i < mb->num_prev_positions; ++i ) mb->prev_positions[i] -= min( mb->prev_positions[i], offset ); @@ -69,10 +69,9 @@ void Mb_normalize_pos( struct Matchfinder_base * const mb ) } -bool Mb_init( struct Matchfinder_base * const mb, - const int before, const int dict_size, - const int after_size, const int dict_factor, - const int num_prev_positions23, +bool Mb_init( struct Matchfinder_base * const mb, const int before, + const int dict_size, const int after_size, + const int dict_factor, const int num_prev_positions23, const int pos_array_factor, const int ifd ) { const int buffer_size_limit = @@ -116,8 +115,9 @@ bool Mb_init( struct Matchfinder_base * const mb, mb->num_prev_positions = size; mb->pos_array_size = pos_array_factor * ( mb->dictionary_size + 1 ); size += mb->pos_array_size; - if( size * sizeof (int32_t) <= size ) mb->prev_positions = 0; - else mb->prev_positions = (int32_t *)malloc( size * sizeof (int32_t) ); + if( size * sizeof mb->prev_positions[0] <= size ) mb->prev_positions = 0; + else mb->prev_positions = + (int32_t *)malloc( size * sizeof mb->prev_positions[0] ); if( !mb->prev_positions ) { free( mb->buffer ); return false; } mb->pos_array = mb->prev_positions + mb->num_prev_positions; for( i = 0; i < mb->num_prev_positions; ++i ) mb->prev_positions[i] = 0; @@ -184,7 +184,7 @@ void LZeb_reset( struct LZ_encoder_base * const eb ) Bm_array_init( eb->bm_rep2, states ); Bm_array_init( eb->bm_len[0], states * pos_states ); Bm_array_init( eb->bm_dis_slot[0], len_states * (1 << dis_slot_bits) ); - Bm_array_init( eb->bm_dis, modeled_distances - end_dis_model ); + Bm_array_init( eb->bm_dis, modeled_distances - end_dis_model + 1 ); Bm_array_init( eb->bm_align, dis_align_size ); Lm_init( &eb->match_len_model ); Lm_init( &eb->rep_len_model ); diff --git a/encoder_base.h b/encoder_base.h index 54fecd1..b2985f1 100644 --- a/encoder_base.h +++ b/encoder_base.h @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -77,22 +77,48 @@ static inline int price0( const Bit_model probability ) static inline int price1( const Bit_model probability ) { return get_price( bit_model_total - probability ); } -static inline int price_bit( const Bit_model bm, const int bit ) - { if( bit ) return price1( bm ); else return price0( bm ); } +static inline int price_bit( const Bit_model bm, const bool bit ) + { return ( bit ? price1( bm ) : price0( bm ) ); } -static inline int price_symbol( const Bit_model bm[], int symbol, - const int num_bits ) +static inline int price_symbol3( const Bit_model bm[], int symbol ) { - int price = 0; - symbol |= ( 1 << num_bits ); - while( symbol > 1 ) - { - const int bit = symbol & 1; - symbol >>= 1; - price += price_bit( bm[symbol], bit ); - } - return price; + int price; + bool bit = symbol & 1; + symbol |= 8; symbol >>= 1; + price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); + } + + +static inline int price_symbol6( const Bit_model bm[], unsigned symbol ) + { + int price; + bool bit = symbol & 1; + symbol |= 64; symbol >>= 1; + price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); + } + + +static inline int price_symbol8( const Bit_model bm[], int symbol ) + { + int price; + bool bit = symbol & 1; + symbol |= 0x100; symbol >>= 1; + price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); } @@ -104,32 +130,29 @@ static inline int price_symbol_reversed( const Bit_model bm[], int symbol, int i; for( i = num_bits; i > 0; --i ) { - const int bit = symbol & 1; + const bool bit = symbol & 1; + symbol >>= 1; price += price_bit( bm[model], bit ); model = ( model << 1 ) | bit; - symbol >>= 1; } return price; } -static inline int price_matched( const Bit_model bm[], int symbol, int match_byte ) +static inline int price_matched( const Bit_model bm[], unsigned symbol, + unsigned match_byte ) { int price = 0; - int mask = 0x100; + unsigned mask = 0x100; symbol |= mask; - - do { - int match_bit, bit; - match_byte <<= 1; - match_bit = match_byte & mask; - symbol <<= 1; - bit = symbol & 0x100; - price += price_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + while( true ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & mask; + const bool bit = ( symbol <<= 1 ) & 0x100; + price += price_bit( bm[(symbol>>9)+match_bit+mask], bit ); + if( symbol >= 0x10000 ) return price; + mask &= ~(match_bit ^ symbol); /* if( match_bit != bit ) mask = 0; */ } - while( symbol < 0x10000 ); - return price; } @@ -156,10 +179,9 @@ struct Matchfinder_base bool Mb_read_block( struct Matchfinder_base * const mb ); void Mb_normalize_pos( struct Matchfinder_base * const mb ); -bool Mb_init( struct Matchfinder_base * const mb, - const int before, const int dict_size, - const int after_size, const int dict_factor, - const int num_prev_positions23, +bool Mb_init( struct Matchfinder_base * const mb, const int before, + const int dict_size, const int after_size, + const int dict_factor, const int num_prev_positions23, const int pos_array_factor, const int ifd ); static inline void Mb_free( struct Matchfinder_base * const mb ) @@ -184,13 +206,11 @@ Mb_ptr_to_current_pos( const struct Matchfinder_base * const mb ) { return mb->buffer + mb->pos; } static inline int Mb_true_match_len( const struct Matchfinder_base * const mb, - const int index, const int distance, - int len_limit ) + const int index, const int distance ) { - const uint8_t * const data = mb->buffer + mb->pos + index; - int i = 0; - if( index + len_limit > Mb_available_bytes( mb ) ) - len_limit = Mb_available_bytes( mb ) - index; + const uint8_t * const data = mb->buffer + mb->pos; + int i = index; + const int len_limit = min( Mb_available_bytes( mb ), max_match_len ); while( i < len_limit && data[i-distance] == data[i] ) ++i; return i; } @@ -230,9 +250,9 @@ static inline void Re_put_byte( struct Range_encoder * const renc, static inline void Re_shift_low( struct Range_encoder * const renc ) { - const bool carry = ( renc->low > 0xFFFFFFFFU ); - if( carry || renc->low < 0xFF000000U ) + if( renc->low >> 24 != 0xFF ) { + const bool carry = ( renc->low > 0xFFFFFFFFU ); Re_put_byte( renc, renc->cache + carry ); for( ; renc->ff_count > 0; --renc->ff_count ) Re_put_byte( renc, 0xFF + carry ); @@ -280,18 +300,18 @@ static inline void Re_flush( struct Range_encoder * const renc ) static inline void Re_encode( struct Range_encoder * const renc, const int symbol, const int num_bits ) { - int i; - for( i = num_bits - 1; i >= 0; --i ) + unsigned mask; + for( mask = 1 << ( num_bits - 1 ); mask > 0; mask >>= 1 ) { renc->range >>= 1; - if( (symbol >> i) & 1 ) renc->low += renc->range; + if( symbol & mask ) renc->low += renc->range; if( renc->range <= 0x00FFFFFFU ) { renc->range <<= 8; Re_shift_low( renc ); } } } static inline void Re_encode_bit( struct Range_encoder * const renc, - Bit_model * const probability, const int bit ) + Bit_model * const probability, const bool bit ) { const uint32_t bound = ( renc->range >> bit_model_total_bits ) * *probability; if( !bit ) @@ -305,56 +325,78 @@ static inline void Re_encode_bit( struct Range_encoder * const renc, renc->range -= bound; *probability -= *probability >> bit_model_move_bits; } - if( renc->range <= 0x00FFFFFFU ) - { renc->range <<= 8; Re_shift_low( renc ); } + if( renc->range <= 0x00FFFFFFU ) { renc->range <<= 8; Re_shift_low( renc ); } + } + +static inline void Re_encode_tree3( struct Range_encoder * const renc, + Bit_model bm[], const int symbol ) + { + int model = 1; + bool bit = ( symbol >> 2 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 1 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + Re_encode_bit( renc, &bm[model], symbol & 1 ); + } + +static inline void Re_encode_tree6( struct Range_encoder * const renc, + Bit_model bm[], const unsigned symbol ) + { + int model = 1; + bool bit = ( symbol >> 5 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 4 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 3 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 2 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 1 ) & 1; + Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; + Re_encode_bit( renc, &bm[model], symbol & 1 ); } -static inline void Re_encode_tree( struct Range_encoder * const renc, - Bit_model bm[], const int symbol, const int num_bits ) +static inline void Re_encode_tree8( struct Range_encoder * const renc, + Bit_model bm[], const int symbol ) { - int mask = ( 1 << ( num_bits - 1 ) ); int model = 1; int i; - for( i = num_bits; i > 0; --i, mask >>= 1 ) + for( i = 7; i >= 0; --i ) { - const int bit = ( symbol & mask ); + const bool bit = ( symbol >> i ) & 1; Re_encode_bit( renc, &bm[model], bit ); - model <<= 1; - if( bit ) model |= 1; + model = ( model << 1 ) | bit; } } static inline void Re_encode_tree_reversed( struct Range_encoder * const renc, - Bit_model bm[], int symbol, const int num_bits ) + Bit_model bm[], int symbol, const int num_bits ) { int model = 1; int i; for( i = num_bits; i > 0; --i ) { - const int bit = symbol & 1; + const bool bit = symbol & 1; + symbol >>= 1; Re_encode_bit( renc, &bm[model], bit ); model = ( model << 1 ) | bit; - symbol >>= 1; } } static inline void Re_encode_matched( struct Range_encoder * const renc, - Bit_model bm[], int symbol, - int match_byte ) + Bit_model bm[], unsigned symbol, + unsigned match_byte ) { - int mask = 0x100; + unsigned mask = 0x100; symbol |= mask; - - do { - int match_bit, bit; - match_byte <<= 1; - match_bit = match_byte & mask; - symbol <<= 1; - bit = symbol & 0x100; - Re_encode_bit( renc, &bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + while( true ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & mask; + const bool bit = ( symbol <<= 1 ) & 0x100; + Re_encode_bit( renc, &bm[(symbol>>9)+match_bit+mask], bit ); + if( symbol >= 0x10000 ) break; + mask &= ~(match_bit ^ symbol); /* if( match_bit != bit ) mask = 0; */ } - while( symbol < 0x10000 ); } static inline void Re_encode_len( struct Range_encoder * const renc, @@ -364,17 +406,15 @@ static inline void Re_encode_len( struct Range_encoder * const renc, bool bit = ( ( symbol -= min_match_len ) >= len_low_symbols ); Re_encode_bit( renc, &lm->choice1, bit ); if( !bit ) - Re_encode_tree( renc, lm->bm_low[pos_state], symbol, len_low_bits ); + Re_encode_tree3( renc, lm->bm_low[pos_state], symbol ); else { - bit = ( symbol >= len_low_symbols + len_mid_symbols ); + bit = ( ( symbol -= len_low_symbols ) >= len_mid_symbols ); Re_encode_bit( renc, &lm->choice2, bit ); if( !bit ) - Re_encode_tree( renc, lm->bm_mid[pos_state], - symbol - len_low_symbols, len_mid_bits ); + Re_encode_tree3( renc, lm->bm_mid[pos_state], symbol ); else - Re_encode_tree( renc, lm->bm_high, - symbol - len_low_symbols - len_mid_symbols, len_high_bits ); + Re_encode_tree8( renc, lm->bm_high, symbol - len_mid_symbols ); } } @@ -395,7 +435,7 @@ struct LZ_encoder_base Bit_model bm_rep2[states]; Bit_model bm_len[states][pos_states]; Bit_model bm_dis_slot[len_states][1<crc ^ 0xFFFFFFFFU; } static inline int LZeb_price_literal( const struct LZ_encoder_base * const eb, - uint8_t prev_byte, uint8_t symbol ) - { return price_symbol( eb->bm_literal[get_lit_state(prev_byte)], symbol, 8 ); } + const uint8_t prev_byte, const uint8_t symbol ) + { return price_symbol8( eb->bm_literal[get_lit_state(prev_byte)], symbol ); } static inline int LZeb_price_matched( const struct LZ_encoder_base * const eb, - uint8_t prev_byte, uint8_t symbol, - uint8_t match_byte ) + const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte ) { return price_matched( eb->bm_literal[get_lit_state(prev_byte)], symbol, match_byte ); } static inline void LZeb_encode_literal( struct LZ_encoder_base * const eb, - uint8_t prev_byte, uint8_t symbol ) - { Re_encode_tree( &eb->renc, - eb->bm_literal[get_lit_state(prev_byte)], symbol, 8 ); } + const uint8_t prev_byte, const uint8_t symbol ) + { Re_encode_tree8( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)], + symbol ); } static inline void LZeb_encode_matched( struct LZ_encoder_base * const eb, - uint8_t prev_byte, uint8_t symbol, - uint8_t match_byte ) + const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte ) { Re_encode_matched( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)], symbol, match_byte ); } @@ -449,10 +487,9 @@ static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb, const unsigned dis, const int len, const int pos_state ) { - const int dis_slot = get_slot( dis ); + const unsigned dis_slot = get_slot( dis ); Re_encode_len( &eb->renc, &eb->match_len_model, len, pos_state ); - Re_encode_tree( &eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot, - dis_slot_bits ); + Re_encode_tree6( &eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot ); if( dis_slot >= start_dis_model ) { @@ -461,7 +498,7 @@ static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb, const unsigned direct_dis = dis - base; if( dis_slot < end_dis_model ) - Re_encode_tree_reversed( &eb->renc, eb->bm_dis + base - dis_slot - 1, + Re_encode_tree_reversed( &eb->renc, eb->bm_dis + ( base - dis_slot ), direct_dis, direct_bits ); else { diff --git a/fast_encoder.c b/fast_encoder.c index 941c0e2..399cc1a 100644 --- a/fast_encoder.c +++ b/fast_encoder.c @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,21 +33,22 @@ int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance enum { len_limit = 16 }; const uint8_t * const data = Mb_ptr_to_current_pos( &fe->eb.mb ); int32_t * ptr0 = fe->eb.mb.pos_array + fe->eb.mb.cyclic_pos; - int32_t * newptr; const int pos1 = fe->eb.mb.pos + 1; - int maxlen = 0; - int count, delta, newpos; - if( len_limit > Mb_available_bytes( &fe->eb.mb ) ) return 0; + int maxlen = 0, newpos1, count; + const int available = min( Mb_available_bytes( &fe->eb.mb ), max_match_len ); + if( available < len_limit ) return 0; fe->key4 = ( ( fe->key4 << 4 ) ^ data[3] ) & fe->eb.mb.key4_mask; - newpos = fe->eb.mb.prev_positions[fe->key4]; + newpos1 = fe->eb.mb.prev_positions[fe->key4]; fe->eb.mb.prev_positions[fe->key4] = pos1; for( count = 4; ; ) { - if( --count < 0 || newpos <= 0 ) { *ptr0 = 0; break; } - delta = pos1 - newpos; - if( delta > fe->eb.mb.dictionary_size ) { *ptr0 = 0; break; } + int32_t * newptr; + int delta; + if( newpos1 <= 0 || --count < 0 || + ( delta = pos1 - newpos1 ) > fe->eb.mb.dictionary_size ) + { *ptr0 = 0; break; } newptr = fe->eb.mb.pos_array + ( fe->eb.mb.cyclic_pos - delta + ( ( fe->eb.mb.cyclic_pos >= delta ) ? 0 : fe->eb.mb.dictionary_size + 1 ) ); @@ -55,23 +56,15 @@ int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance if( data[maxlen-delta] == data[maxlen] ) { int len = 0; - while( len < len_limit && data[len-delta] == data[len] ) ++len; - if( maxlen < len ) { maxlen = len; *distance = delta - 1; } + while( len < available && data[len-delta] == data[len] ) ++len; + if( maxlen < len ) + { maxlen = len; *distance = delta - 1; + if( maxlen >= len_limit ) { *ptr0 = *newptr; break; } } } - if( maxlen < len_limit ) - { - *ptr0 = newpos; - ptr0 = newptr; - newpos = *ptr0; - } - else - { - *ptr0 = *newptr; - maxlen += Mb_true_match_len( &fe->eb.mb, maxlen, *distance + 1, - max_match_len - maxlen ); - break; - } + *ptr0 = newpos1; + ptr0 = newptr; + newpos1 = *ptr0; } return maxlen; } @@ -112,8 +105,7 @@ bool FLZe_encode_member( struct FLZ_encoder * const fe, for( i = 0; i < num_rep_distances; ++i ) { - const int tlen = Mb_true_match_len( &fe->eb.mb, 0, - reps[i] + 1, max_match_len ); + const int tlen = Mb_true_match_len( &fe->eb.mb, 0, reps[i] + 1 ); if( tlen > len ) { len = tlen; rep = i; } } if( len > min_match_len && len + 3 > main_len ) diff --git a/fast_encoder.h b/fast_encoder.h index df1741d..9825068 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ struct FLZ_encoder { struct LZ_encoder_base eb; - int key4; /* key made from latest 4 bytes */ + unsigned key4; /* key made from latest 4 bytes */ }; static inline void FLZe_reset_key4( struct FLZ_encoder * const fe ) @@ -37,12 +37,10 @@ static inline void FLZe_update_and_move( struct FLZ_encoder * const fe, int n ) { if( Mb_available_bytes( &fe->eb.mb ) >= 4 ) { - int newpos; fe->key4 = ( ( fe->key4 << 4 ) ^ fe->eb.mb.buffer[fe->eb.mb.pos+3] ) & fe->eb.mb.key4_mask; - newpos = fe->eb.mb.prev_positions[fe->key4]; + fe->eb.mb.pos_array[fe->eb.mb.cyclic_pos] = fe->eb.mb.prev_positions[fe->key4]; fe->eb.mb.prev_positions[fe->key4] = fe->eb.mb.pos + 1; - fe->eb.mb.pos_array[fe->eb.mb.cyclic_pos] = newpos; } Mb_move_pos( &fe->eb.mb ); } diff --git a/file_index.c b/file_index.c new file mode 100644 index 0000000..74adf95 --- /dev/null +++ b/file_index.c @@ -0,0 +1,268 @@ +/* Clzip - LZMA lossless data compressor + Copyright (C) 2010-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include + +#include "lzip.h" +#include "file_index.h" + + +static int seek_read( const int fd, uint8_t * const buf, const int size, + const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return readblock( fd, buf, size ); + return 0; + } + + +static bool add_error( struct File_index * const fi, const char * const msg ) + { + const int len = strlen( msg ); + void * tmp = resize_buffer( fi->error, fi->error_size + len + 1 ); + if( !tmp ) return false; + fi->error = (char *)tmp; + strncpy( fi->error + fi->error_size, msg, len + 1 ); + fi->error_size += len; + return true; + } + + +static bool push_back_member( struct File_index * const fi, + const long long dp, const long long ds, + const long long mp, const long long ms, + const unsigned dict_size ) + { + struct Member * p; + void * tmp = resize_buffer( fi->member_vector, + ( fi->members + 1 ) * sizeof fi->member_vector[0] ); + if( !tmp ) + { add_error( fi, "Not enough memory." ); fi->retval = 1; return false; } + fi->member_vector = (struct Member *)tmp; + p = &(fi->member_vector[fi->members]); + init_member( p, dp, ds, mp, ms, dict_size ); + ++fi->members; + return true; + } + + +static void Fi_free_member_vector( struct File_index * const fi ) + { + if( fi->member_vector ) + { free( fi->member_vector ); fi->member_vector = 0; } + fi->members = 0; + } + + +static void Fi_reverse_member_vector( struct File_index * const fi ) + { + struct Member tmp; + long i; + for( i = 0; i < fi->members / 2; ++i ) + { + tmp = fi->member_vector[i]; + fi->member_vector[i] = fi->member_vector[fi->members-i-1]; + fi->member_vector[fi->members-i-1] = tmp; + } + } + + +static void Fi_set_errno_error( struct File_index * const fi, + const char * const msg ) + { + add_error( fi, msg ); add_error( fi, strerror( errno ) ); + fi->retval = 1; + } + +static void Fi_set_num_error( struct File_index * const fi, + const char * const msg, unsigned long long num ) + { + char buf[80]; + snprintf( buf, sizeof buf, "%s%llu", msg, num ); + add_error( fi, buf ); + fi->retval = 2; + } + + +/* If successful, push last member and set pos to member header. */ +static bool Fi_skip_trailing_data( struct File_index * const fi, + const int fd, long long * const pos ) + { + enum { block_size = 16384, + buffer_size = block_size + Ft_size - 1 + Fh_size }; + uint8_t buffer[buffer_size]; + int bsize = *pos % block_size; /* total bytes in buffer */ + int search_size, rd_size; + unsigned long long ipos; + int i; + if( bsize <= buffer_size - block_size ) bsize += block_size; + search_size = bsize; /* bytes to search for trailer */ + rd_size = bsize; /* bytes to read from file */ + ipos = *pos - rd_size; /* aligned to block_size */ + if( *pos < min_member_size ) return false; + + while( true ) + { + const uint8_t max_msb = ( ipos + search_size ) >> 56; + if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) + { Fi_set_errno_error( fi, "Error seeking member trailer: " ); + return false; } + for( i = search_size; i >= Ft_size; --i ) + if( buffer[i-1] <= max_msb ) /* most significant byte of member_size */ + { + File_header header; + File_trailer * trailer = (File_trailer *)( buffer + i - Ft_size ); + const unsigned long long member_size = Ft_get_member_size( *trailer ); + unsigned dictionary_size; + if( member_size == 0 ) + { while( i > Ft_size && buffer[i-9] == 0 ) --i; continue; } + if( member_size < min_member_size || member_size > ipos + i ) + continue; + if( seek_read( fd, header, Fh_size, + ipos + i - member_size ) != Fh_size ) + { Fi_set_errno_error( fi, "Error reading member header: " ); + return false; } + dictionary_size = Fh_get_dictionary_size( header ); + if( !Fh_verify_magic( header ) || !Fh_verify_version( header ) || + !isvalid_ds( dictionary_size ) ) continue; + if( Fh_verify_prefix( buffer + i, bsize - i ) ) + { + add_error( fi, "Last member in input file is truncated or corrupt." ); + fi->retval = 2; return false; + } + *pos = ipos + i - member_size; + return push_back_member( fi, 0, Ft_get_data_size( *trailer ), *pos, + member_size, dictionary_size ); + } + if( ipos <= 0 ) + { Fi_set_num_error( fi, "Member size in trailer is corrupt at pos ", + *pos - 8 ); + return false; } + bsize = buffer_size; + search_size = bsize - Fh_size; + rd_size = block_size; + ipos -= rd_size; + memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); + } + } + + +bool Fi_init( struct File_index * const fi, const int infd, + const bool ignore_trailing ) + { + File_header header; + long long pos; + long i; + fi->member_vector = 0; + fi->error = 0; + fi->isize = lseek( infd, 0, SEEK_END ); + fi->members = 0; + fi->error_size = 0; + fi->retval = 0; + if( fi->isize < 0 ) + { Fi_set_errno_error( fi, "Input file is not seekable: " ); return false; } + if( fi->isize < min_member_size ) + { add_error( fi, "Input file is too short." ); fi->retval = 2; + return false; } + if( fi->isize > INT64_MAX ) + { add_error( fi, "Input file is too long (2^63 bytes or more)." ); + fi->retval = 2; return false; } + + if( seek_read( infd, header, Fh_size, 0 ) != Fh_size ) + { Fi_set_errno_error( fi, "Error reading member header: " ); return false; } + if( !Fh_verify_magic( header ) ) + { add_error( fi, bad_magic_msg ); fi->retval = 2; return false; } + if( !Fh_verify_version( header ) ) + { add_error( fi, bad_version( Fh_version( header ) ) ); fi->retval = 2; + return false; } + if( !isvalid_ds( Fh_get_dictionary_size( header ) ) ) + { add_error( fi, bad_dict_msg ); fi->retval = 2; return false; } + + pos = fi->isize; /* always points to a header or to EOF */ + while( pos >= min_member_size ) + { + File_trailer trailer; + unsigned long long member_size; + unsigned dictionary_size; + if( seek_read( infd, trailer, Ft_size, pos - Ft_size ) != Ft_size ) + { Fi_set_errno_error( fi, "Error reading member trailer: " ); break; } + member_size = Ft_get_member_size( trailer ); + if( member_size < min_member_size || member_size > (unsigned long long)pos ) + { + if( fi->members > 0 ) + Fi_set_num_error( fi, "Member size in trailer is corrupt at pos ", + pos - 8 ); + else if( Fi_skip_trailing_data( fi, infd, &pos ) ) + { if( ignore_trailing ) continue; + add_error( fi, trailing_msg ); fi->retval = 2; return false; } + break; + } + if( seek_read( infd, header, Fh_size, pos - member_size ) != Fh_size ) + { Fi_set_errno_error( fi, "Error reading member header: " ); break; } + dictionary_size = Fh_get_dictionary_size( header ); + if( !Fh_verify_magic( header ) || !Fh_verify_version( header ) || + !isvalid_ds( dictionary_size ) ) + { + if( fi->members > 0 ) + Fi_set_num_error( fi, "Bad header at pos ", pos - member_size ); + else if( Fi_skip_trailing_data( fi, infd, &pos ) ) + { if( ignore_trailing ) continue; + add_error( fi, trailing_msg ); fi->retval = 2; return false; } + break; + } + pos -= member_size; + if( !push_back_member( fi, 0, Ft_get_data_size( trailer ), pos, + member_size, dictionary_size ) ) + return false; + } + if( pos != 0 || fi->members <= 0 ) + { + Fi_free_member_vector( fi ); + if( fi->retval == 0 ) + { add_error( fi, "Can't create file index." ); fi->retval = 2; } + return false; + } + Fi_reverse_member_vector( fi ); + for( i = 0; i < fi->members - 1; ++i ) + { + const long long end = block_end( fi->member_vector[i].dblock ); + if( end < 0 || end > INT64_MAX ) + { + Fi_free_member_vector( fi ); + add_error( fi, "Data in input file is too long (2^63 bytes or more)." ); + fi->retval = 2; return false; + } + fi->member_vector[i+1].dblock.pos = end; + } + return true; + } + + +void Fi_free( struct File_index * const fi ) + { + Fi_free_member_vector( fi ); + if( fi->error ) { free( fi->error ); fi->error = 0; } + fi->error_size = 0; + } diff --git a/file_index.h b/file_index.h new file mode 100644 index 0000000..d093fa9 --- /dev/null +++ b/file_index.h @@ -0,0 +1,90 @@ +/* Clzip - LZMA lossless data compressor + Copyright (C) 2010-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef INT64_MAX +#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL +#endif + + +struct Block + { + long long pos, size; /* pos + size <= INT64_MAX */ + }; + +static inline void init_block( struct Block * const b, + const long long p, const long long s ) + { b->pos = p; b->size = s; } + +static inline long long block_end( const struct Block b ) + { return b.pos + b.size; } + + +struct Member + { + struct Block dblock, mblock; /* data block, member block */ + unsigned dictionary_size; + }; + +static inline void init_member( struct Member * const m, + const long long dp, const long long ds, + const long long mp, const long long ms, + const unsigned dict_size ) + { init_block( &m->dblock, dp, ds ); init_block( &m->mblock, mp, ms ); + m->dictionary_size = dict_size; } + +struct File_index + { + struct Member * member_vector; + char * error; + long long isize; + long members; + int error_size; + int retval; + }; + +bool Fi_init( struct File_index * const fi, const int infd, + const bool ignore_trailing ); + +void Fi_free( struct File_index * const fi ); + +static inline long long Fi_udata_size( const struct File_index * const fi ) + { + if( fi->members <= 0 ) return 0; + return block_end( fi->member_vector[fi->members-1].dblock ); + } + +static inline long long Fi_cdata_size( const struct File_index * const fi ) + { + if( fi->members <= 0 ) return 0; + return block_end( fi->member_vector[fi->members-1].mblock ); + } + + /* total size including trailing data (if any) */ +static inline long long Fi_file_size( const struct File_index * const fi ) + { if( fi->isize >= 0 ) return fi->isize; else return 0; } + +static inline const struct Block * Fi_dblock( const struct File_index * const fi, + const long i ) + { return &fi->member_vector[i].dblock; } + +static inline const struct Block * Fi_mblock( const struct File_index * const fi, + const long i ) + { return &fi->member_vector[i].mblock; } + +static inline unsigned Fi_dictionary_size( const struct File_index * const fi, + const long i ) + { return fi->member_vector[i].dictionary_size; } diff --git a/list.c b/list.c new file mode 100644 index 0000000..c72360d --- /dev/null +++ b/list.c @@ -0,0 +1,123 @@ +/* Clzip - LZMA lossless data compressor + Copyright (C) 2010-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include + +#include "lzip.h" +#include "file_index.h" + + +static void list_line( const unsigned long long uncomp_size, + const unsigned long long comp_size, + const char * const input_filename ) + { + if( uncomp_size > 0 ) + printf( "%15llu %15llu %6.2f%% %s\n", uncomp_size, comp_size, + 100.0 * ( 1.0 - ( (double)comp_size / uncomp_size ) ), + input_filename ); + else + printf( "%15llu %15llu -INF%% %s\n", uncomp_size, comp_size, + input_filename ); + } + + +int list_files( const char * const filenames[], const int num_filenames, + const bool ignore_trailing ) + { + unsigned long long total_comp = 0, total_uncomp = 0; + int files = 0, retval = 0; + int i; + bool first_post = true; + bool stdin_used = false; + for( i = 0; i < num_filenames; ++i ) + { + const char * input_filename; + struct File_index file_index; + struct stat in_stats; /* not used */ + int infd; + const bool from_stdin = ( strcmp( filenames[i], "-" ) == 0 ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + input_filename = from_stdin ? "(stdin)" : filenames[i]; + infd = from_stdin ? STDIN_FILENO : + open_instream( input_filename, &in_stats, true, true ); + if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } + + Fi_init( &file_index, infd, ignore_trailing ); + close( infd ); + if( file_index.retval != 0 ) + { + show_file_error( input_filename, file_index.error, 0 ); + if( retval < file_index.retval ) retval = file_index.retval; + Fi_free( &file_index ); continue; + } + if( verbosity >= 0 ) + { + const unsigned long long udata_size = Fi_udata_size( &file_index ); + const unsigned long long cdata_size = Fi_cdata_size( &file_index ); + total_comp += cdata_size; total_uncomp += udata_size; ++files; + if( first_post ) + { + first_post = false; + if( verbosity >= 1 ) fputs( " dict memb trail ", stdout ); + fputs( " uncompressed compressed saved name\n", stdout ); + } + if( verbosity >= 1 ) + { + long long trailing_size; + unsigned dictionary_size = 0; + long i; + for( i = 0; i < file_index.members; ++i ) + dictionary_size = + max( dictionary_size, Fi_dictionary_size( &file_index, i ) ); + trailing_size = Fi_file_size( &file_index ) - cdata_size; + printf( "%s %5ld %6lld ", format_ds( dictionary_size ), + file_index.members, trailing_size ); + } + list_line( udata_size, cdata_size, input_filename ); + + if( verbosity >= 2 && file_index.members > 1 ) + { + long i; + fputs( " member data_pos data_size member_pos member_size\n", stdout ); + for( i = 0; i < file_index.members; ++i ) + { + const struct Block * db = Fi_dblock( &file_index, i ); + const struct Block * mb = Fi_mblock( &file_index, i ); + printf( "%5ld %15llu %15llu %15llu %15llu\n", + i + 1, db->pos, db->size, mb->pos, mb->size ); + } + first_post = true; /* reprint heading after list of members */ + } + fflush( stdout ); + } + Fi_free( &file_index ); + } + if( verbosity >= 0 && files > 1 ) + { + if( verbosity >= 1 ) fputs( " ", stdout ); + list_line( total_uncomp, total_comp, "(totals)" ); + fflush( stdout ); + } + return retval; + } diff --git a/lzip.h b/lzip.h index 5274500..49b6b93 100644 --- a/lzip.h +++ b/lzip.h @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,6 +49,7 @@ enum { min_dictionary_size = 1 << min_dictionary_bits, /* >= modeled_distances */ max_dictionary_bits = 29, max_dictionary_size = 1 << max_dictionary_bits, + min_member_size = 36, literal_context_bits = 3, literal_pos_state_bits = 0, /* not used */ pos_state_bits = 2, @@ -131,9 +132,9 @@ static inline void Pp_init( struct Pretty_print * const pp, pp->stdin_name = "(stdin)"; pp->longest_name = 0; pp->first_post = false; - stdin_name_len = strlen( pp->stdin_name ); if( verbosity <= 0 ) return; + stdin_name_len = strlen( pp->stdin_name ); for( i = 0; i < num_filenames; ++i ) { const char * const s = filenames[i]; @@ -182,8 +183,10 @@ static inline void CRC32_update_buf( uint32_t * const crc, const int size ) { int i; + uint32_t c = *crc; for( i = 0; i < size; ++i ) - *crc = crc32[(*crc^buffer[i])&0xFF] ^ ( *crc >> 8 ); + c = crc32[(c^buffer[i])&0xFF] ^ ( c >> 8 ); + *crc = c; } @@ -243,7 +246,7 @@ static inline bool Fh_set_dictionary_size( File_header data, const unsigned sz ) { const unsigned base_size = 1 << data[5]; const unsigned fraction = base_size / 16; - int i; + unsigned i; for( i = 7; i >= 1; --i ) if( base_size - ( i * fraction ) >= sz ) { data[5] |= ( i << 5 ); break; } @@ -290,14 +293,30 @@ static inline void Ft_set_member_size( File_trailer data, unsigned long long sz { int i; for( i = 12; i <= 19; ++i ) { data[i] = (uint8_t)sz; sz >>= 8; } } +static const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; +static const char * const bad_dict_msg = "Invalid dictionary size in member header."; +static const char * const trailing_msg = "Trailing data not allowed."; + /* defined in decoder.c */ int readblock( const int fd, uint8_t * const buf, const int size ); int writeblock( const int fd, const uint8_t * const buf, const int size ); +/* defined in list.c */ +int list_files( const char * const filenames[], const int num_filenames, + const bool ignore_trailing ); + /* defined in main.c */ extern int verbosity; +struct stat; +const char * bad_version( const unsigned version ); +const char * format_ds( const unsigned dictionary_size ); +int open_instream( const char * const name, struct stat * const in_statsp, + const bool no_ofile, const bool reg_only ); +void * resize_buffer( void * buf, const unsigned min_size ); void cleanup_and_fail( const int retval ); void show_error( const char * const msg, const int errcode, const bool help ); +void show_file_error( const char * const filename, const char * const msg, + const int errcode ); void internal_error( const char * const msg ); struct Matchfinder_base; void show_progress( const unsigned long long partial_size, diff --git a/main.c b/main.c index ecf8dd8..1af09f3 100644 --- a/main.c +++ b/main.c @@ -1,5 +1,5 @@ /* Clzip - LZMA lossless data compressor - Copyright (C) 2010-2016 Antonio Diaz Diaz. + Copyright (C) 2010-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -71,10 +71,10 @@ int verbosity = 0; const char * const Program_name = "Clzip"; const char * const program_name = "clzip"; -const char * const program_year = "2016"; +const char * const program_year = "2017"; const char * invocation_name = 0; -struct { const char * from; const char * to; } const known_extensions[] = { +const struct { const char * from; const char * to; } known_extensions[] = { { ".lz", "" }, { ".tlz", ".tar" }, { 0, 0 } }; @@ -85,7 +85,7 @@ struct Lzma_options int match_len_limit; /* 5 .. 273 */ }; -enum Mode { m_compress, m_decompress, m_test }; +enum Mode { m_compress, m_decompress, m_list, m_test }; char * output_filename = 0; int outfd = -1; @@ -106,6 +106,7 @@ static void show_help( void ) " -f, --force overwrite existing output files\n" " -F, --recompress force re-compression of compressed files\n" " -k, --keep keep (don't delete) input files\n" + " -l, --list print (un)compressed file sizes\n" " -m, --match-length= set match length limit in bytes [36]\n" " -o, --output= if reading standard input, write to \n" " -q, --quiet suppress all messages\n" @@ -145,23 +146,38 @@ static void show_version( void ) } +const char * bad_version( const unsigned version ) + { + static char buf[80]; + snprintf( buf, sizeof buf, "Version %u member format not supported.", + version ); + return buf; + } + + +const char * format_ds( const unsigned dictionary_size ) + { + enum { bufsize = 16, factor = 1024 }; + static char buf[bufsize]; + const char * const prefix[8] = + { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; + const char * p = ""; + const char * np = " "; + unsigned num = dictionary_size, i; + bool exact = ( num % factor == 0 ); + + for( i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) + { num /= factor; if( num % factor != 0 ) exact = false; + p = prefix[i]; np = ""; } + snprintf( buf, bufsize, "%s%4u %sB", np, num, p ); + return buf; + } + + static void show_header( const unsigned dictionary_size ) { if( verbosity >= 3 ) - { - const char * const prefix[8] = - { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; - enum { factor = 1024 }; - const char * p = ""; - const char * np = " "; - unsigned num = dictionary_size, i; - bool exact = ( num % factor == 0 ); - - for( i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) - { num /= factor; if( num % factor != 0 ) exact = false; - p = prefix[i]; np = ""; } - fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p ); - } + fprintf( stderr, "dictionary %s. ", format_ds( dictionary_size ) ); } @@ -181,8 +197,8 @@ static unsigned long long getnum( const char * const ptr, if( !errno && tail[0] ) { - const int factor = ( tail[1] == 'i' ) ? 1024 : 1000; - int exponent = 0; /* 0 = bad multiplier */ + const unsigned factor = ( tail[1] == 'i' ) ? 1024 : 1000; + int exponent = 0; /* 0 = bad multiplier */ int i; switch( tail[0] ) { @@ -220,7 +236,7 @@ static unsigned long long getnum( const char * const ptr, static int get_dict_size( const char * const arg ) { char * tail; - const int bits = strtol( arg, &tail, 0 ); + const long bits = strtol( arg, &tail, 0 ); if( bits >= min_dictionary_bits && bits <= max_dictionary_bits && *tail == 0 ) return ( 1 << bits ); @@ -228,68 +244,79 @@ static int get_dict_size( const char * const arg ) } +void set_mode( enum Mode * const program_modep, const enum Mode new_mode ) + { + if( *program_modep != m_compress && *program_modep != new_mode ) + { + show_error( "Only one operation can be specified.", 0, true ); + exit( 1 ); + } + *program_modep = new_mode; + } + + static int extension_index( const char * const name ) { - int i; - for( i = 0; known_extensions[i].from; ++i ) + int eindex; + for( eindex = 0; known_extensions[eindex].from; ++eindex ) { - const char * const ext = known_extensions[i].from; + const char * const ext = known_extensions[eindex].from; const unsigned name_len = strlen( name ); const unsigned ext_len = strlen( ext ); if( name_len > ext_len && strncmp( name + name_len - ext_len, ext, ext_len ) == 0 ) - return i; + return eindex; } return -1; } -static int open_instream( const char * const name, struct stat * const in_statsp, - const enum Mode program_mode, const int eindex, - const bool recompress, const bool to_stdout ) +int open_instream( const char * const name, struct stat * const in_statsp, + const bool no_ofile, const bool reg_only ) { - int infd = -1; - if( program_mode == m_compress && !recompress && eindex >= 0 ) - { - if( verbosity >= 0 ) - fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n", - program_name, name, known_extensions[eindex].from ); - } + int infd = open( name, O_RDONLY | O_BINARY ); + if( infd < 0 ) + show_file_error( name, "Can't open input file", errno ); else { - infd = open( name, O_RDONLY | O_BINARY ); - if( infd < 0 ) + const int i = fstat( infd, in_statsp ); + const mode_t mode = in_statsp->st_mode; + const bool can_read = ( i == 0 && !reg_only && + ( S_ISBLK( mode ) || S_ISCHR( mode ) || + S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); + if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) ) { if( verbosity >= 0 ) - fprintf( stderr, "%s: Can't open input file '%s': %s\n", - program_name, name, strerror( errno ) ); - } - else - { - const int i = fstat( infd, in_statsp ); - const mode_t mode = in_statsp->st_mode; - const bool can_read = ( i == 0 && - ( S_ISBLK( mode ) || S_ISCHR( mode ) || - S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); - const bool no_ofile = ( to_stdout || program_mode == m_test ); - if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) ) - { - if( verbosity >= 0 ) - fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", - program_name, name, - ( can_read && !no_ofile ) ? - ",\n and '--stdout' was not specified" : "" ); - close( infd ); - infd = -1; - } + fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", + program_name, name, + ( can_read && !no_ofile ) ? + ",\n and '--stdout' was not specified" : "" ); + close( infd ); + infd = -1; } } return infd; } +static int open_instream2( const char * const name, struct stat * const in_statsp, + const enum Mode program_mode, const int eindex, + const bool recompress, const bool to_stdout ) + { + const bool no_ofile = ( to_stdout || program_mode == m_test ); + if( program_mode == m_compress && !recompress && eindex >= 0 ) + { + if( verbosity >= 0 ) + fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n", + program_name, name, known_extensions[eindex].from ); + return -1; + } + return open_instream( name, in_statsp, no_ofile, false ); + } + + /* assure at least a minimum size for buffer 'buf' */ -static void * resize_buffer( void * buf, const int min_size ) +void * resize_buffer( void * buf, const unsigned min_size ) { if( buf ) buf = realloc( buf, min_size ); else buf = malloc( min_size ); @@ -312,19 +339,19 @@ static void set_c_outname( const char * const name, const bool multifile ) } -static void set_d_outname( const char * const name, const int i ) +static void set_d_outname( const char * const name, const int eindex ) { const unsigned name_len = strlen( name ); - if( i >= 0 ) + if( eindex >= 0 ) { - const char * const from = known_extensions[i].from; + const char * const from = known_extensions[eindex].from; const unsigned from_len = strlen( from ); if( name_len > from_len ) { output_filename = resize_buffer( output_filename, name_len + - strlen( known_extensions[0].to ) + 1 ); + strlen( known_extensions[eindex].to ) + 1 ); strcpy( output_filename, name ); - strcpy( output_filename + name_len - from_len, known_extensions[i].to ); + strcpy( output_filename + name_len - from_len, known_extensions[eindex].to ); return; } } @@ -360,7 +387,8 @@ static bool open_outstream( const bool force, const bool from_stdin ) } -static bool check_tty( const int infd, const enum Mode program_mode ) +static bool check_tty( const char * const input_filename, const int infd, + const enum Mode program_mode ) { if( program_mode == m_compress && isatty( outfd ) ) { @@ -370,7 +398,8 @@ static bool check_tty( const int infd, const enum Mode program_mode ) if( ( program_mode == m_decompress || program_mode == m_test ) && isatty( infd ) ) { - show_error( "I won't read compressed data from a terminal.", 0, true ); + show_file_error( input_filename, + "I won't read compressed data from a terminal.", 0 ); return false; } return true; @@ -467,7 +496,7 @@ static int compress( const unsigned long long member_size, bool error = false; if( zero ) { - encoder.fe = (struct FLZ_encoder *)malloc( sizeof (struct FLZ_encoder) ); + encoder.fe = (struct FLZ_encoder *)malloc( sizeof *encoder.fe ); if( !encoder.fe || !FLZe_init( encoder.fe, infd, outfd ) ) error = true; else encoder.eb = &encoder.fe->eb; } @@ -477,7 +506,7 @@ static int compress( const unsigned long long member_size, if( Fh_set_dictionary_size( header, encoder_options->dictionary_size ) && encoder_options->match_len_limit >= min_match_len_limit && encoder_options->match_len_limit <= max_match_len ) - encoder.e = (struct LZ_encoder *)malloc( sizeof (struct LZ_encoder) ); + encoder.e = (struct LZ_encoder *)malloc( sizeof *encoder.e ); else internal_error( "invalid argument to encoder." ); if( !encoder.e || !LZe_init( encoder.e, Fh_get_dictionary_size( header ), encoder_options->match_len_limit, infd, outfd ) ) @@ -538,10 +567,10 @@ static int compress( const unsigned long long member_size, } -static unsigned char xdigit( const int value ) +static unsigned char xdigit( const unsigned value ) { - if( value >= 0 && value <= 9 ) return '0' + value; - if( value >= 10 && value <= 15 ) return 'A' + value - 10; + if( value <= 9 ) return '0' + value; + if( value <= 15 ) return 'A' + value - 10; return 0; } @@ -554,28 +583,21 @@ static bool show_trailing_data( const uint8_t * const data, const int size, { int i; char buf[80]; - int len = snprintf( buf, sizeof buf, "%strailing data = ", - all ? "" : "first bytes of " ); - bool text = true; - for( i = 0; i < size; ++i ) - if( !isprint( data[i] ) ) { text = false; break; } - if( text ) + unsigned len = max( 0, snprintf( buf, sizeof buf, "%strailing data = ", + all ? "" : "first bytes of " ) ); + for( i = 0; i < size && len + 2 < sizeof buf; ++i ) { - if( len > 0 && len < (int)sizeof buf ) - snprintf( buf + len, sizeof buf - len, "'%.*s'", size, (const char *)data ); - } - else - { - for( i = 0; i < size && len > 0 && len + 3 < (int)sizeof buf; ++i ) - { - if( i > 0 ) buf[len++] = ' '; - buf[len++] = xdigit( data[i] >> 4 ); - buf[len++] = xdigit( data[i] & 0x0F ); - buf[len] = 0; - } + buf[len++] = xdigit( data[i] >> 4 ); + buf[len++] = xdigit( data[i] & 0x0F ); + buf[len++] = ' '; } + if( len < sizeof buf ) buf[len++] = '\''; + for( i = 0; i < size && len < sizeof buf; ++i ) + { if( isprint( data[i] ) ) buf[len++] = data[i]; else buf[len++] = '.'; } + if( len < sizeof buf ) buf[len++] = '\''; + if( len < sizeof buf ) buf[len] = 0; else buf[sizeof buf - 1] = 0; Pp_show_msg( pp, buf ); - if( !ignore_trailing ) show_error( "Trailing data not allowed.", 0, false ); + if( !ignore_trailing ) show_file_error( pp->name, trailing_msg, 0 ); } return ignore_trailing; } @@ -615,24 +637,19 @@ static int decompress( const int infd, struct Pretty_print * const pp, if( !Fh_verify_magic( header ) ) { if( first_member ) - { Pp_show_msg( pp, "Bad magic number (file not in lzip format)." ); - retval = 2; } + { show_file_error( pp->name, bad_magic_msg, 0 ); retval = 2; } else if( !show_trailing_data( header, size, pp, false, ignore_trailing ) ) retval = 2; break; } if( !Fh_verify_version( header ) ) { - if( verbosity >= 0 ) - { Pp_show_msg( pp, 0 ); - fprintf( stderr, "Version %d member format not supported.\n", - Fh_version( header ) ); } + Pp_show_msg( pp, bad_version( Fh_version( header ) ) ); retval = 2; break; } dictionary_size = Fh_get_dictionary_size( header ); if( !isvalid_ds( dictionary_size ) ) - { Pp_show_msg( pp, "Invalid dictionary size in member header." ); - retval = 2; break; } + { Pp_show_msg( pp, bad_dict_msg ); retval = 2; break; } if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) { Pp_show_msg( pp, 0 ); show_header( dictionary_size ); } @@ -693,6 +710,16 @@ void show_error( const char * const msg, const int errcode, const bool help ) } +void show_file_error( const char * const filename, const char * const msg, + const int errcode ) + { + if( verbosity < 0 ) return; + fprintf( stderr, "%s: %s: %s", program_name, filename, msg ); + if( errcode > 0 ) fprintf( stderr, ": %s", strerror( errcode ) ); + fputc( '\n', stderr ); + } + + void internal_error( const char * const msg ) { if( verbosity >= 0 ) @@ -746,7 +773,6 @@ int main( const int argc, const char * const argv[] ) const unsigned long long max_volume_size = 0x4000000000000000ULL; unsigned long long member_size = max_member_size; unsigned long long volume_size = 0; - const char * input_filename = ""; const char * default_output_filename = ""; const char ** filenames = 0; int num_filenames = 0; @@ -759,8 +785,8 @@ int main( const int argc, const char * const argv[] ) bool force = false; bool ignore_trailing = true; bool keep_input_files = false; - bool stdin_used = false; bool recompress = false; + bool stdin_used = false; bool to_stdout = false; bool zero = false; struct Pretty_print pp; @@ -785,6 +811,7 @@ int main( const int argc, const char * const argv[] ) { 'F', "recompress", ap_no }, { 'h', "help", ap_no }, { 'k', "keep", ap_no }, + { 'l', "list", ap_no }, { 'm', "match-length", ap_yes }, { 'n', "threads", ap_yes }, { 'o', "output", ap_yes }, @@ -794,7 +821,7 @@ int main( const int argc, const char * const argv[] ) { 't', "test", ap_no }, { 'v', "verbose", ap_no }, { 'V', "version", ap_no }, - { 0 , 0, ap_no } }; + { 0 , 0, ap_no } }; struct Arg_parser parser; @@ -820,11 +847,12 @@ int main( const int argc, const char * const argv[] ) case 'a': ignore_trailing = false; break; case 'b': member_size = getnum( arg, 100000, max_member_size ); break; case 'c': to_stdout = true; break; - case 'd': program_mode = m_decompress; break; + case 'd': set_mode( &program_mode, m_decompress ); break; case 'f': force = true; break; case 'F': recompress = true; break; case 'h': show_help(); return 0; case 'k': keep_input_files = true; break; + case 'l': set_mode( &program_mode, m_list ); break; case 'm': encoder_options.match_len_limit = getnum( arg, min_match_len_limit, max_match_len ); zero = false; break; @@ -834,7 +862,7 @@ int main( const int argc, const char * const argv[] ) case 's': encoder_options.dictionary_size = get_dict_size( arg ); zero = false; break; case 'S': volume_size = getnum( arg, 100000, max_volume_size ); break; - case 't': program_mode = m_test; break; + case 't': set_mode( &program_mode, m_test ); break; case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; default : internal_error( "uncaught option." ); @@ -846,14 +874,6 @@ int main( const int argc, const char * const argv[] ) setmode( STDOUT_FILENO, O_BINARY ); #endif - if( program_mode == m_test ) - outfd = -1; - else if( program_mode == m_compress ) - { - Dis_slots_init(); - Prob_prices_init(); - } - num_filenames = max( 1, ap_arguments( &parser ) - argind ); filenames = resize_buffer( filenames, num_filenames * sizeof filenames[0] ); filenames[0] = "-"; @@ -864,6 +884,17 @@ int main( const int argc, const char * const argv[] ) if( strcmp( filenames[i], "-" ) != 0 ) filenames_given = true; } + if( program_mode == m_list ) + return list_files( filenames, num_filenames, ignore_trailing ); + + if( program_mode == m_test ) + outfd = -1; + else if( program_mode == m_compress ) + { + Dis_slots_init(); + Prob_prices_init(); + } + if( !to_stdout && program_mode != m_test && ( filenames_given || default_output_filename[0] ) ) set_signals(); @@ -873,6 +904,7 @@ int main( const int argc, const char * const argv[] ) output_filename = resize_buffer( output_filename, 1 ); for( i = 0; i < num_filenames; ++i ) { + const char * input_filename = ""; int tmp; struct stat in_stats; const struct stat * in_statsp; @@ -881,7 +913,6 @@ int main( const int argc, const char * const argv[] ) if( !filenames[i][0] || strcmp( filenames[i], "-" ) == 0 ) { if( stdin_used ) continue; else stdin_used = true; - input_filename = ""; infd = STDIN_FILENO; if( program_mode != m_test ) { @@ -908,10 +939,9 @@ int main( const int argc, const char * const argv[] ) } else { - const int eindex = extension_index( filenames[i] ); - input_filename = filenames[i]; - infd = open_instream( input_filename, &in_stats, program_mode, - eindex, recompress, to_stdout ); + const int eindex = extension_index( input_filename = filenames[i] ); + infd = open_instream2( input_filename, &in_stats, program_mode, + eindex, recompress, to_stdout ); if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } if( program_mode != m_test ) { @@ -931,14 +961,15 @@ int main( const int argc, const char * const argv[] ) } } - if( !check_tty( infd, program_mode ) ) + Pp_set_name( &pp, input_filename ); + if( !check_tty( pp.name, infd, program_mode ) ) { if( retval < 1 ) retval = 1; + if( program_mode == m_test ) { close( infd ); infd = -1; continue; } cleanup_and_fail( retval ); } in_statsp = input_filename[0] ? &in_stats : 0; - Pp_set_name( &pp, input_filename ); if( program_mode == m_compress ) tmp = compress( member_size, volume_size, infd, &encoder_options, &pp, in_statsp, zero ); diff --git a/testsuite/check.sh b/testsuite/check.sh index 52347b4..add0722 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh # check script for Clzip - LZMA lossless data compressor -# Copyright (C) 2010-2016 Antonio Diaz Diaz. +# Copyright (C) 2010-2017 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -17,12 +17,12 @@ if [ ! -f "${LZIP}" ] || [ ! -x "${LZIP}" ] ; then exit 1 fi -if [ -e "${LZIP}" ] 2> /dev/null ; then true -else +[ -e "${LZIP}" ] 2> /dev/null || + { echo "$0: a POSIX shell is required to run the tests" echo "Try bash -c \"$0 $1 $2\"" exit 1 -fi + } if [ -d tmp ] ; then rm -rf tmp ; fi mkdir tmp @@ -31,143 +31,229 @@ cd "${objdir}"/tmp || framework_failure cat "${testdir}"/test.txt > in || framework_failure in_lz="${testdir}"/test.txt.lz fail=0 +test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } printf "testing clzip-%s..." "$2" "${LZIP}" -fkqm4 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO "${LZIP}" -fkqm274 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs-1 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs0 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs4095 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs513MiB in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +for i in bad_size -1 0 4095 513MiB 1G 1T 1P 1E 1Z 1Y 10KB ; do + "${LZIP}" -fkqs $i in + { [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO $i +done +"${LZIP}" -lq in +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -tq in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -tq < in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -cdq in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -cdq < in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -dd if="${in_lz}" bs=1 count=6 2> /dev/null | "${LZIP}" -tq -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -dd if="${in_lz}" bs=1 count=20 2> /dev/null | "${LZIP}" -tq -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO +# these are for code coverage +"${LZIP}" -lt "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdl "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdt "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -t -- nx_file 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --help > /dev/null || test_failed $LINENO +"${LZIP}" -n1 -V > /dev/null || test_failed $LINENO +"${LZIP}" -m 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -z 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --bad_option 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --t 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --test=2 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output= 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +printf "LZIP\001-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\002-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null printf "\ntesting decompression..." -"${LZIP}" -t "${in_lz}" -if [ $? = 0 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cd "${in_lz}" > copy || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -lq "${in_lz}" || test_failed $LINENO +"${LZIP}" -t "${in_lz}" || test_failed $LINENO +"${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO rm -f copy cat "${in_lz}" > copy.lz || framework_failure -"${LZIP}" -dk copy.lz || fail=1 -cmp in copy || fail=1 +"${LZIP}" -dk copy.lz || test_failed $LINENO +cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure -"${LZIP}" -dq copy.lz -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi +"${LZIP}" -d copy.lz 2> /dev/null +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -df copy.lz -if [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; then - printf . ; else printf - ; fail=1 ; fi +{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO printf "to be overwritten" > copy || framework_failure -"${LZIP}" -df -o copy < "${in_lz}" || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO +cmp in copy || test_failed $LINENO rm -f copy -"${LZIP}" < in > anyothername || fail=1 -"${LZIP}" -d -o copy - anyothername - < "${in_lz}" -if [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; then - printf . ; else printf - ; fail=1 ; fi +"${LZIP}" < in > anyothername || test_failed $LINENO +"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null +{ [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; } || + test_failed $LINENO rm -f copy anyothername.out +"${LZIP}" -lq in "${in_lz}" +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -lq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -tq in "${in_lz}" -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -tq foo.lz "${in_lz}" -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -cdq in "${in_lz}" > copy -if [ $? = 2 ] && cat copy in | cmp in - ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cdq foo.lz "${in_lz}" > copy -if [ $? = 1 ] && cmp in copy ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && cat copy in | cmp in - ; } || test_failed $LINENO +"${LZIP}" -cdq nx_file.lz "${in_lz}" > copy +{ [ $? = 1 ] && cmp in copy ; } || test_failed $LINENO rm -f copy cat "${in_lz}" > copy.lz || framework_failure +for i in 1 2 3 4 5 6 7 ; do + printf "g" >> copy.lz || framework_failure + "${LZIP}" -alvv copy.lz "${in_lz}" > /dev/null 2>&1 + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -atvvvv copy.lz "${in_lz}" 2> /dev/null + [ $? = 2 ] || test_failed $LINENO $i +done "${LZIP}" -dq in copy.lz -if [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; then - printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -dq foo.lz copy.lz -if [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e foo ] && cmp in copy ; then - printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; } || + test_failed $LINENO +"${LZIP}" -dq nx_file.lz copy.lz +{ [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e nx_file ] && cmp in copy ; } || + test_failed $LINENO cat in in > in2 || framework_failure -"${LZIP}" -o copy2 < in2 || fail=1 -"${LZIP}" -t copy2.lz || fail=1 -"${LZIP}" -cd copy2.lz > copy2 || fail=1 -cmp in2 copy2 || fail=1 -printf . +cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure +"${LZIP}" -lq in2.lz || test_failed $LINENO +"${LZIP}" -t in2.lz || test_failed $LINENO +"${LZIP}" -cd in2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO + +"${LZIP}" --output=copy2 < in2 || test_failed $LINENO +"${LZIP}" -lq copy2.lz || test_failed $LINENO +"${LZIP}" -t copy2.lz || test_failed $LINENO +"${LZIP}" -cd copy2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO -printf "garbage" >> copy2.lz || framework_failure +printf "\ngarbage" >> copy2.lz || framework_failure +"${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO rm -f copy2 +"${LZIP}" -alq copy2.lz +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq copy2.lz -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq < copy2.lz -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -adkq copy2.lz -if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO "${LZIP}" -adkq -o copy2 < copy2.lz -if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO printf "to be overwritten" > copy2 || framework_failure -"${LZIP}" -df copy2.lz || fail=1 -cmp in2 copy2 || fail=1 -printf . +"${LZIP}" -df copy2.lz || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO printf "\ntesting compression..." -"${LZIP}" -cfq "${in_lz}" > out # /dev/null is a tty on OS/2 -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cF "${in_lz}" > out || fail=1 -"${LZIP}" -cd out | "${LZIP}" -d > copy || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -cf "${in_lz}" > out 2> /dev/null # /dev/null is a tty on OS/2 +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cFvvm36 "${in_lz}" > out 2> /dev/null || test_failed $LINENO +"${LZIP}" -cd out | "${LZIP}" -d > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -k -$i in || fail=1 - mv -f in.lz copy.lz || fail=1 - printf "garbage" >> copy.lz || fail=1 - "${LZIP}" -df copy.lz || fail=1 - cmp in copy || fail=1 + "${LZIP}" -k -$i in || test_failed $LINENO $i + mv -f in.lz copy.lz || test_failed $LINENO $i + printf "garbage" >> copy.lz || framework_failure + "${LZIP}" -df copy.lz || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -c -$i in > out || fail=1 - printf "g" >> out || fail=1 - "${LZIP}" -cd out > copy || fail=1 - cmp in copy || fail=1 + "${LZIP}" -c -$i in > out || test_failed $LINENO $i + printf "g" >> out || framework_failure + "${LZIP}" -cd out > copy || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -$i < in > out || fail=1 - "${LZIP}" -d < out > copy || fail=1 - cmp in copy || fail=1 + "${LZIP}" -$i < in > out || test_failed $LINENO $i + "${LZIP}" -d < out > copy || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -f -$i -o out < in || fail=1 - "${LZIP}" -df -o copy < out.lz || fail=1 - cmp in copy || fail=1 + "${LZIP}" -f -$i -o out < in || test_failed $LINENO $i + "${LZIP}" -df -o copy < out.lz || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . + +cat in in in in in in in in > in8 || framework_failure +"${LZIP}" -1s12 -S100k -o out < in8 || test_failed $LINENO +"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO +"${LZIP}" -cd out00001.lz out00002.lz | cmp in8 - || test_failed $LINENO +rm -f out00001.lz +"${LZIP}" -1ks4Ki -b100000 in8 || test_failed $LINENO +"${LZIP}" -t in8.lz || test_failed $LINENO +"${LZIP}" -cd in8.lz | cmp in8 - || test_failed $LINENO +rm -f in8 +"${LZIP}" -0 -S100k -o out < in8.lz || test_failed $LINENO +"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO +"${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO +rm -f out00001.lz out00002.lz +"${LZIP}" -0kF -b100k in8.lz || test_failed $LINENO +"${LZIP}" -t in8.lz.lz || test_failed $LINENO +"${LZIP}" -cd in8.lz.lz | cmp in8.lz - || test_failed $LINENO +rm -f in8.lz in8.lz.lz + +printf "\ntesting bad input..." + +cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure +if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && + [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then + for i in 6 20 14734 14753 14754 14755 14756 14757 14758 ; do + dd if=in3.lz of=trunc.lz bs=$i count=1 2> /dev/null + "${LZIP}" -lq trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -t trunc.lz 2> /dev/null + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -tq < trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -cdq trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -dq < trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + done +else + printf "\nwarning: skipping truncation test: 'dd' does not work on your system." +fi + +cat "${in_lz}" > ingin.lz || framework_failure +printf "g" >> ingin.lz || framework_failure +cat "${in_lz}" >> ingin.lz || framework_failure +"${LZIP}" -lq ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -t ingin.lz || test_failed $LINENO +"${LZIP}" -cd ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO +"${LZIP}" -t < ingin.lz || test_failed $LINENO +"${LZIP}" -d < ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO echo if [ ${fail} = 0 ] ; then diff --git a/testsuite/test.txt.lz b/testsuite/test.txt.lz index 41d2e39..22cea6e 100644 Binary files a/testsuite/test.txt.lz and b/testsuite/test.txt.lz differ -- cgit v1.2.3