diff options
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | INSTALL | 2 | ||||
-rw-r--r-- | Makefile.in | 11 | ||||
-rw-r--r-- | NEWS | 26 | ||||
-rw-r--r-- | README | 29 | ||||
-rw-r--r-- | arg_parser.cc | 16 | ||||
-rw-r--r-- | arg_parser.h | 5 | ||||
-rwxr-xr-x | configure | 21 | ||||
-rw-r--r-- | decoder.cc | 56 | ||||
-rw-r--r-- | decoder.h | 119 | ||||
-rw-r--r-- | doc/lzip.1 | 7 | ||||
-rw-r--r-- | doc/lzip.info | 202 | ||||
-rw-r--r-- | doc/lzip.texi | 186 | ||||
-rw-r--r-- | encoder.cc | 85 | ||||
-rw-r--r-- | encoder.h | 62 | ||||
-rw-r--r-- | encoder_base.cc | 11 | ||||
-rw-r--r-- | encoder_base.h | 121 | ||||
-rw-r--r-- | fast_encoder.cc | 41 | ||||
-rw-r--r-- | fast_encoder.h | 7 | ||||
-rw-r--r-- | file_index.cc | 192 | ||||
-rw-r--r-- | file_index.h | 85 | ||||
-rw-r--r-- | list.cc | 121 | ||||
-rw-r--r-- | lzip.h | 32 | ||||
-rw-r--r-- | main.cc | 266 | ||||
-rwxr-xr-x | testsuite/check.sh | 264 | ||||
-rw-r--r-- | testsuite/test.txt.lz | bin | 7376 -> 7376 bytes |
26 files changed, 1286 insertions, 696 deletions
@@ -1,3 +1,16 @@ +2017-04-13 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.19 released. + * The option '-l, --list' has been ported from lziprecover. + * Don't allow mixing different operations (-d, -l or -t). + * Compression time of option '-0' has been slightly reduced. + * Decompression time has been reduced by 2%. + * main.cc: Continue testing if any input file is a terminal. + * main.cc: Show trailing data in both hexadecimal and ASCII. + * encoder.cc (Matchfinder_base): Verify size passed to new. + * file_index.cc: Improve detection of bad dict and trailing data. + * lzip.h: Unified messages for bad magic, trailing data, etc. + 2016-05-14 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.18 released. @@ -263,7 +276,7 @@ * Version 0.1 released. -Copyright (C) 2008-2016 Antonio Diaz Diaz. +Copyright (C) 2008-2017 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, but just in case, you have unlimited permission to copy, distribute and @@ -58,7 +58,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2008-2016 Antonio Diaz Diaz. +Copyright (C) 2008-2017 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index 88109fb..9db3f63 100644 --- a/Makefile.in +++ b/Makefile.in @@ -7,7 +7,8 @@ INSTALL_DIR = $(INSTALL) -d -m 755 SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 -objs = arg_parser.o encoder_base.o encoder.o fast_encoder.o decoder.o main.o +objs = arg_parser.o file_index.o list.o encoder_base.o encoder.o \ + fast_encoder.o decoder.o main.o .PHONY : all install install-bin install-info install-man \ @@ -33,6 +34,8 @@ decoder.o : lzip.h decoder.h encoder_base.o : lzip.h encoder_base.h encoder.o : lzip.h encoder_base.h encoder.h fast_encoder.o : lzip.h encoder_base.h fast_encoder.h +file_index.o : lzip.h file_index.h +list.o : lzip.h file_index.h main.o : arg_parser.h lzip.h decoder.h encoder_base.h encoder.h fast_encoder.h @@ -113,11 +116,11 @@ dist : doc $(DISTNAME)/doc/$(progname).1 \ $(DISTNAME)/doc/$(pkgname).info \ $(DISTNAME)/doc/$(pkgname).texi \ + $(DISTNAME)/*.h \ + $(DISTNAME)/*.cc \ $(DISTNAME)/testsuite/check.sh \ $(DISTNAME)/testsuite/test.txt \ - $(DISTNAME)/testsuite/test.txt.lz \ - $(DISTNAME)/*.h \ - $(DISTNAME)/*.cc + $(DISTNAME)/testsuite/test.txt.lz rm -f $(DISTNAME) lzip -v -9 $(DISTNAME).tar @@ -1,22 +1,16 @@ -Changes in version 1.18: +Changes in version 1.19: -The option "-a, --trailing-error", which makes lzip exit with error -status 2 if any remaining input is detected after decompressing the last -member, has been added. +The option '-l, --list' has been ported from lziprecover. -Decompression time has been reduced by 2%. - -The test of the value remaining in the range decoder has been removed. -(After extensive testing it has been found useless to detect corruption -in the decompressed data. Eliminating it reduces the number of false -positives for corruption and makes error detection more accurate). +It is now an error to specify two or more different operations in the +command line (--decompress, --list or --test). -When decompressing, the file specified with the '--output' option is now -deleted if the input is a terminal. +Compression time of option '-0' has been slightly reduced. -Decompression support for version 0 files has been removed. +Decompression time has been reduced by 2%. -The new chapter "Trailing data" has been added to the manual. +In test mode, lzip now continues checking the rest of the files if any +input file is a terminal. -A harmless check failure on Windows, caused by the failed comparison of -a message in text mode, has been fixed. +Trailing data are now shown both in hexadecimal and as a string of +printable ASCII characters. @@ -1,9 +1,10 @@ Description Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0), +or compress most files more than bzip2 (lzip -9). Decompression speed is +intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 +from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -16,11 +17,11 @@ availability: merging of damaged copies of a file. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the code of a simple decompressor along with a - detailed explanation of how it works, so that with the only help of - the lzip manual it would be possible for a digital archaeologist to - extract the data from a lzip file long after quantum computers - eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor along + with a detailed explanation of how it works, so that with the only + help of the lzip manual it would be possible for a digital + archaeologist to extract the data from a lzip file long after + quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @@ -75,11 +76,11 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Lzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Lzip can -also split the compressed output in volumes of a given size, even when -reading from standard input. This allows the direct creation of -multivolume compressed tar archives. +Lzip can produce multimember files, and lziprecover can safely recover +the undamaged members in case of file damage. Lzip can also split the +compressed output in volumes of a given size, even when reading from +standard input. This allows the direct creation of multivolume +compressed tar archives. Lzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are @@ -110,7 +111,7 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). -Copyright (C) 2008-2016 Antonio Diaz Diaz. +Copyright (C) 2008-2017 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/arg_parser.cc b/arg_parser.cc index 82972ad..cc7d1e2 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2016 Antonio Diaz Diaz. + Copyright (C) 2006-2017 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -42,7 +42,7 @@ bool Arg_parser::parse_long_option( const char * const opt, const char * const a else if( index < 0 ) index = i; // First nonexact match found else if( options[index].code != options[i].code || options[index].has_arg != options[i].has_arg ) - ambig = true; // Second or later nonexact match found + ambig = true; // Second or later nonexact match found } if( ambig && !exact ) @@ -142,7 +142,7 @@ Arg_parser::Arg_parser( const int argc, const char * const argv[], { if( argc < 2 || !argv || !options ) return; - std::vector< std::string > non_options; // skipped non-options + std::vector< const char * > non_options; // skipped non-options int argind = 1; // index in argv while( argind < argc ) @@ -163,17 +163,17 @@ Arg_parser::Arg_parser( const int argc, const char * const argv[], } else { - if( !in_order ) non_options.push_back( argv[argind++] ); - else { data.push_back( Record() ); data.back().argument = argv[argind++]; } + if( in_order ) data.push_back( Record( argv[argind++] ) ); + else non_options.push_back( argv[argind++] ); } } if( error_.size() ) data.clear(); else { for( unsigned i = 0; i < non_options.size(); ++i ) - { data.push_back( Record() ); data.back().argument.swap( non_options[i] ); } + data.push_back( Record( non_options[i] ) ); while( argind < argc ) - { data.push_back( Record() ); data.back().argument = argv[argind++]; } + data.push_back( Record( argv[argind++] ) ); } } @@ -192,5 +192,5 @@ Arg_parser::Arg_parser( const char * const opt, const char * const arg, parse_short_option( opt, arg, options, argind ); if( error_.size() ) data.clear(); } - else { data.push_back( Record() ); data.back().argument = opt; } + else data.push_back( Record( opt ) ); } diff --git a/arg_parser.h b/arg_parser.h index f45b9ac..95b0320 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2016 Antonio Diaz Diaz. + Copyright (C) 2006-2017 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -57,7 +57,8 @@ private: { int code; std::string argument; - explicit Record( const int c = 0 ) : code( c ) {} + explicit Record( const int c ) : code( c ) {} + explicit Record( const char * const arg ) : code( 0 ), argument( arg ) {} }; std::string error_; @@ -1,12 +1,12 @@ #! /bin/sh # configure script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2016 Antonio Diaz Diaz. +# Copyright (C) 2008-2017 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. pkgname=lzip -pkgversion=1.18 +pkgversion=1.19 progname=lzip srctrigger=doc/${pkgname}.texi @@ -26,11 +26,11 @@ CXXFLAGS='-Wall -W -O2' LDFLAGS= # checking whether we are using GNU C++. -if /bin/sh -c "${CXX} --version" > /dev/null 2>&1 ; then true -else +/bin/sh -c "${CXX} --version" > /dev/null 2>&1 || + { CXX=c++ - CXXFLAGS='-W -O2' -fi + CXXFLAGS=-O2 + } # Loop over all args args= @@ -52,9 +52,12 @@ while [ $# != 0 ] ; do # Process the options case ${option} in --help | -h) - echo "Usage: configure [options]" + echo "Usage: $0 [OPTION]... [VAR=VALUE]..." + echo + echo "To assign makefile variables (e.g., CXX, CXXFLAGS...), specify them as" + echo "arguments to configure in the form VAR=VALUE." echo - echo "Options: [defaults in brackets]" + echo "Options and variables: [defaults in brackets]" echo " -h, --help display this help and exit" echo " -V, --version output version information and exit" echo " --srcdir=DIR find the sources in DIR [. or ..]" @@ -165,7 +168,7 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2016 Antonio Diaz Diaz. +# Copyright (C) 2008-2017 Antonio Diaz Diaz. # This file was generated automatically by configure. Don't edit. # # This Makefile is free software: you have unlimited permission @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -170,7 +170,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const ( 8.0 * member_size ) / data_size, 100.0 * ( 1.0 - ( (double)member_size / data_size ) ) ); if( !error && verbosity >= 4 ) - std::fprintf( stderr, "data CRC %08X, data size %9llu, member size %8llu. ", + std::fprintf( stderr, "CRC %08X, decompressed %9llu, compressed %8llu. ", crc(), data_size, member_size ); return !error; } @@ -188,7 +188,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) Bit_model bm_rep2[State::states]; Bit_model bm_len[State::states][pos_states]; Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; - Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; Len_model match_len_model; Len_model rep_len_model; @@ -204,25 +204,23 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { - const uint8_t prev_byte = peek_prev(); - if( state.is_char() ) - { - state.set_char1(); - put_byte( rdec.decode_tree8( bm_literal[get_lit_state(prev_byte)] ) ); - } + Bit_model * const bm = bm_literal[get_lit_state(peek_prev())]; + if( state.is_char_set_char() ) + put_byte( rdec.decode_tree8( bm ) ); else - { - state.set_char2(); - put_byte( rdec.decode_matched( bm_literal[get_lit_state(prev_byte)], - peek( rep0 ) ) ); - } + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); } else // match or repeated match { int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit { - if( rdec.decode_bit( bm_rep0[state()] ) != 0 ) // 3rd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else { unsigned distance; if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit @@ -238,34 +236,28 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) rep1 = rep0; rep0 = distance; } - else - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } else // match { - const unsigned rep0_saved = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); - const int dis_slot = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); - if( dis_slot < start_dis_model ) rep0 = dis_slot; - else + unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) { + const unsigned dis_slot = distance; const int direct_bits = ( dis_slot >> 1 ) - 1; - rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, - direct_bits ); + distance += rdec.decode_tree_reversed( + bm_dis + ( distance - dis_slot ), direct_bits ); else { - rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; - rep0 += rdec.decode_tree_reversed4( bm_align ); - if( rep0 == 0xFFFFFFFFU ) // marker found + distance += + rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + distance += rdec.decode_tree_reversed4( bm_align ); + if( distance == 0xFFFFFFFFU ) // marker found { - rep0 = rep0_saved; rdec.normalize(); flush_data(); if( len == min_match_len ) // End Of Stream marker @@ -285,7 +277,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) } } } - rep3 = rep2; rep2 = rep1; rep1 = rep0_saved; + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; state.set_match(); if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) { flush_data(); return 1; } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,7 +49,9 @@ public: bool finished() { return pos >= stream_pos && !read_block(); } unsigned long long member_position() const { return partial_member_pos + pos; } - void reset_member_position() { partial_member_pos = -pos; } + + void reset_member_position() + { partial_member_pos = 0; partial_member_pos -= pos; } uint8_t get_byte() { @@ -60,15 +62,15 @@ public: int read_data( uint8_t * const outbuf, const int size ) { - int rest = size; - while( rest > 0 && !finished() ) + int sz = 0; + while( sz < size && !finished() ) { - const int rd = std::min( rest, stream_pos - pos ); - std::memcpy( outbuf + size - rest, buffer + pos, rd ); + const int rd = std::min( size - sz, stream_pos - pos ); + std::memcpy( outbuf + sz, buffer + pos, rd ); pos += rd; - rest -= rd; + sz += rd; } - return size - rest; + return sz; } void load() @@ -85,24 +87,23 @@ public: { range <<= 8; code = (code << 8) | get_byte(); } } - int decode( const int num_bits ) + unsigned decode( const int num_bits ) { - int symbol = 0; + unsigned symbol = 0; for( int i = num_bits; i > 0; --i ) { normalize(); range >>= 1; // symbol <<= 1; // if( code >= range ) { code -= range; symbol |= 1; } - const uint32_t mask = 0U - (code < range); - code -= range; - code += range & mask; - symbol = (symbol << 1) + (mask + 1); + const bool bit = ( code >= range ); + symbol = ( symbol << 1 ) + bit; + code -= range & ( 0U - bit ); } return symbol; } - int decode_bit( Bit_model & bm ) + unsigned decode_bit( Bit_model & bm ) { normalize(); const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; @@ -121,18 +122,18 @@ public: } } - int decode_tree3( Bit_model bm[] ) + unsigned decode_tree3( Bit_model bm[] ) { - int symbol = 1; + unsigned symbol = 1; symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol & 7; } - int decode_tree6( Bit_model bm[] ) + unsigned decode_tree6( Bit_model bm[] ) { - int symbol = 1; + unsigned symbol = 1; symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); @@ -142,49 +143,47 @@ public: return symbol & 0x3F; } - int decode_tree8( Bit_model bm[] ) + unsigned decode_tree8( Bit_model bm[] ) { - int symbol = 1; - while( symbol < 0x100 ) + unsigned symbol = 1; + for( int i = 0; i < 8; ++i ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol & 0xFF; } - int decode_tree_reversed( Bit_model bm[], const int num_bits ) + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) { - int model = 1; - int symbol = 0; + unsigned model = 1; + unsigned symbol = 0; for( int i = 0; i < num_bits; ++i ) { - const bool bit = decode_bit( bm[model] ); - model <<= 1; - if( bit ) { ++model; symbol |= (1 << i); } + const unsigned bit = decode_bit( bm[model] ); + model = ( model << 1 ) + bit; + symbol |= ( bit << i ); } return symbol; } - int decode_tree_reversed4( Bit_model bm[] ) + unsigned decode_tree_reversed4( Bit_model bm[] ) { - int model = 1; - int symbol = decode_bit( bm[model] ); - model = (model << 1) + symbol; - int bit = decode_bit( bm[model] ); - model = (model << 1) + bit; symbol |= (bit << 1); + unsigned symbol = decode_bit( bm[1] ); + unsigned model = 2 + symbol; + unsigned bit = decode_bit( bm[model] ); + model = ( model << 1 ) + bit; symbol |= ( bit << 1 ); bit = decode_bit( bm[model] ); - model = (model << 1) + bit; symbol |= (bit << 2); - if( decode_bit( bm[model] ) ) symbol |= 8; + model = ( model << 1 ) + bit; symbol |= ( bit << 2 ); + symbol |= ( decode_bit( bm[model] ) << 3 ); return symbol; } - int decode_matched( Bit_model bm[], int match_byte ) + unsigned decode_matched( Bit_model bm[], unsigned match_byte ) { Bit_model * const bm1 = bm + 0x100; - int symbol = 1; + unsigned symbol = 1; while( symbol < 0x100 ) { - match_byte <<= 1; - const int match_bit = match_byte & 0x100; - const int bit = decode_bit( bm1[match_bit+symbol] ); + const unsigned match_bit = ( match_byte <<= 1 ) & 0x100; + const unsigned bit = decode_bit( bm1[match_bit+symbol] ); symbol = ( symbol << 1 ) | bit; if( match_bit != bit << 8 ) { @@ -196,7 +195,7 @@ public: return symbol & 0xFF; } - int decode_len( Len_model & lm, const int pos_state ) + unsigned decode_len( Len_model & lm, const int pos_state ) { if( decode_bit( lm.choice1 ) == 0 ) return decode_tree3( lm.bm_low[pos_state] ); @@ -224,14 +223,15 @@ class LZ_decoder uint8_t peek_prev() const { - const unsigned i = ( ( pos > 0 ) ? pos : dictionary_size ) - 1; - return buffer[i]; + if( pos > 0 ) return buffer[pos-1]; + if( pos_wrapped ) return buffer[dictionary_size-1]; + return 0; // prev_byte of first byte } uint8_t peek( const unsigned distance ) const { - unsigned i = pos - distance - 1; - if( pos <= distance ) i += dictionary_size; + const unsigned i = ( ( pos > distance ) ? 0 : dictionary_size ) + + pos - distance - 1; return buffer[i]; } @@ -243,17 +243,26 @@ class LZ_decoder void copy_block( const unsigned distance, unsigned len ) { - unsigned i = pos - distance - 1; - bool fast; - if( pos <= distance ) - { i += dictionary_size; - fast = ( len <= dictionary_size - i && len <= i - pos ); } + unsigned lpos = pos, i = lpos - distance - 1; + bool fast, fast2; + if( lpos > distance ) + { + fast = ( len < dictionary_size - lpos ); + fast2 = ( fast && len <= lpos - i ); + } else - fast = ( len < dictionary_size - pos && len <= pos - i ); - if( fast ) // no wrap, no overlap { - std::memcpy( buffer + pos, buffer + i, len ); + i += dictionary_size; + fast = ( len < dictionary_size - i ); // (i == pos) may happen + fast2 = ( fast && len <= i - lpos ); + } + if( fast ) // no wrap + { pos += len; + if( fast2 ) // no wrap, no overlap + std::memcpy( buffer + lpos, buffer + i, len ); + else + for( ; len > 0; --len ) buffer[lpos++] = buffer[i++]; } else for( ; len > 0; --len ) { @@ -278,7 +287,7 @@ public: crc_( 0xFFFFFFFFU ), outfd( ofd ), pos_wrapped( false ) - { buffer[dictionary_size-1] = 0; } // prev_byte of first byte + {} ~LZ_decoder() { delete[] buffer; } @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH LZIP "1" "May 2016" "lzip 1.18" "User Commands" +.TH LZIP "1" "April 2017" "lzip 1.19" "User Commands" .SH NAME lzip \- reduces the size of files .SH SYNOPSIS @@ -36,6 +36,9 @@ force re\-compression of compressed files \fB\-k\fR, \fB\-\-keep\fR keep (don't delete) input files .TP +\fB\-l\fR, \fB\-\-list\fR +print (un)compressed file sizes +.TP \fB\-m\fR, \fB\-\-match\-length=\fR<bytes> set match length limit in bytes [36] .TP @@ -87,7 +90,7 @@ Report bugs to lzip\-bug@nongnu.org .br Lzip home page: http://www.nongnu.org/lzip/lzip.html .SH COPYRIGHT -Copyright \(co 2016 Antonio Diaz Diaz. +Copyright \(co 2017 Antonio Diaz Diaz. License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html> .br This is free software: you are free to change and redistribute it. diff --git a/doc/lzip.info b/doc/lzip.info index 0210f9e..cac370c 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,7 +11,7 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.18, 14 May 2016). +This manual is for Lzip (version 1.19, 13 April 2017). * Menu: @@ -28,7 +28,7 @@ This manual is for Lzip (version 1.18, 14 May 2016). * Concept index:: Index of concepts - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -40,9 +40,10 @@ File: lzip.info, Node: Introduction, Next: Invoking lzip, Prev: Top, Up: Top ************** Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +one of gzip or bzip2. Lzip can compress about as fast as gzip +(lzip -0), or compress most files more than bzip2 (lzip -9). +Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -56,11 +57,11 @@ availability: (lziprecover)Data safety. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the code of a simple decompressor along with - a detailed explanation of how it works, so that with the only help - of the lzip manual it would be possible for a digital - archaeologist to extract the data from a lzip file long after - quantum computers eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor + along with a detailed explanation of how it works, so that with + the only help of the lzip manual it would be possible for a + digital archaeologist to extract the data from a lzip file long + after quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @@ -126,9 +127,9 @@ two or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. - Lzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Lzip can -also split the compressed output in volumes of a given size, even when + Lzip can produce multimember files, and lziprecover can safely +recover the undamaged members in case of file damage. Lzip can also +split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume compressed tar archives. @@ -136,6 +137,10 @@ multivolume compressed tar archives. automatically creating multimember output. The members so created are large, about 2 PiB each. + LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may +never have been compressed. Decompressed is used to refer to data which +have undergone the process of decompression. + File: lzip.info, Node: Invoking lzip, Next: Quality assurance, Prev: Introduction, Up: Top @@ -203,6 +208,21 @@ command line. Keep (don't delete) input files during compression or decompression. +'-l' +'--list' + Print the uncompressed size, compressed size and percentage saved + of the specified file(s). Trailing data are ignored. The values + produced are correct even for multimember files. If more than one + file is given, a final line containing the cumulative sizes is + printed. With '-v', the dictionary size, the number of members in + the file, and the amount of trailing data (if any) are also + printed. With '-vv', the positions and sizes of each member in + multimember files are also printed. '-lq' can be used to verify + quickly (without decompressing) the structural integrity of the + specified files. (Use '--test' to verify the data integrity). + '-alq' additionally verifies that none of the specified files + contain trailing data. + '-m BYTES' '--match-length=BYTES' Set the match length limit in bytes. After a match this long is @@ -252,8 +272,9 @@ command line. Check integrity of the specified file(s), but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with '-v' to see information about - the file(s). If a file fails the test, lzip continues checking the - rest of the files. + the file(s). If a file fails the test, does not exist, can't be + opened, or is a terminal, lzip continues checking the rest of the + files. '-v' '--verbose' @@ -263,7 +284,8 @@ command line. When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 - bytes of trailing data (if any). + bytes of trailing data (if any) both in hexadecimal and as a + string of printable ASCII characters. '-0 .. -9' Set the compression parameters (dictionary size and match length @@ -714,10 +736,10 @@ You may first send the position of the most significant bit that is set to 1, which you may find by making a bit scan from the left (from the MSB). A position of 0 means that the number is 0 (no bit is set), 1 means the LSB is the first bit set (the number is 1), and 32 means the -MSB is set (i.e., the number is >= 0x80000000). Lets call this bit -position a "slot". Then, if slot is > 1, you send the remaining slot - -1 bits. Lets call these bits "direct_bits" because they are coded -directly by value instead of indirectly by position. +MSB is set (i.e., the number is >= 0x80000000). Let's call this bit +position a "slot". Then, if slot is > 1, you send the remaining +slot - 1 bits. Let's call these bits "direct_bits" because they are +coded directly by value instead of indirectly by position. The inconvenient of this simple method is that it needs 6 bits to code the slot, but it just uses 33 of the 64 possible values, wasting @@ -729,14 +751,15 @@ same 6 bits that would take to encode the position alone. This seems to need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is no next bit, so the number of needed slots is 64 (0 to 63). - The slot number is context-coded in 6 bits. 'direct_bits' is the -amount of remaining bits (from 0 to 30) needed to form a complete -distance, and is calculated as (slot >> 1) - 1. If a distance needs 6 or -more direct_bits, the last 4 bits are coded separately. The last piece -(all the direct_bits for distances 4 to 127 or the last 4 bits for -distances >= 128) is context-coded in reverse order (from LSB to MSB). -For distances >= 128, the 'direct_bits - 4' part is coded with fixed -0.5 probability. + The 6 bits representing this "slot number" are then context-coded. If +the distance is >= 4, the remaining bits are coded as follows. +'direct_bits' is the amount of remaining bits (from 0 to 30) needed to +form a complete distance, and is calculated as (slot >> 1) - 1. If a +distance needs 6 or more direct_bits, the last 4 bits are coded +separately. The last piece (all the direct_bits for distances 4 to 127 +or the last 4 bits for distances >= 128) is context-coded in reverse +order (from LSB to MSB). For distances >= 128, the 'direct_bits - 4' +part is coded with fixed 0.5 probability. Bit sequence Description -------------------------------------------------------------------------- @@ -871,16 +894,21 @@ File: lzip.info, Node: Trailing data, Next: Examples, Prev: Stream format, U 7 Extra data appended to the file ********************************* -Sometimes extra data is found appended to a lzip file after the last +Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: * Padding added to make the file size a multiple of some block size, - for example when writing to a tape. - - * Garbage added by some not totally successful copy operation. + for example when writing to a tape. It is safe to append any + amount of padding zero bytes to a lzip file. * Useful data added by the user; a cryptographically secure hash, a - description of file contents, etc. + description of file contents, etc. It is safe to append any amount + of text to a lzip file as long as the text does not begin with the + string "LZIP", and does not contain any zero bytes (null + characters). Nonzero bytes and zero bytes can't be safely mixed in + trailing data. + + * Garbage added by some not totally successful copy operation. * Malicious data added to the file in order to make its total size and hash value (for a chosen hash) coincide with those of another @@ -893,8 +921,12 @@ member. Such trailing data may be: the corruption of the integrity information itself. Therefore it can be considered to be below the noise level. + Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + Trailing data can be safely ignored in most cases. In some cases, -like that of user-added data, it is expected to be ignored. In those +like that of user-added data, they are expected to be ignored. In those cases where a file containing trailing data must be rejected, the option '--trailing-error' can be used. *Note --trailing-error::. @@ -942,8 +974,8 @@ Example 5: Compress a whole device in /dev/sdc and send the output to lzip -c /dev/sdc > file.lz -Example 6: The right way of concatenating compressed files. *Note -Trailing data::. +Example 6: The right way of concatenating the decompressed output of two +or more compressed files. *Note Trailing data::. Don't do this cat file1.lz file2.lz file3.lz | lzip -d @@ -1002,7 +1034,7 @@ Appendix A Reference source code ******************************** /* Lzd - Educational decompressor for the lzip format - Copyright (C) 2013-2016 Antonio Diaz Diaz. + Copyright (C) 2013-2017 Antonio Diaz Diaz. This program is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -1153,10 +1185,10 @@ public: uint8_t get_byte() { return std::getc( stdin ); } - int decode( const int num_bits ) + unsigned decode( const int num_bits ) { - int symbol = 0; - for( int i = 0; i < num_bits; ++i ) + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) { range >>= 1; symbol <<= 1; @@ -1167,9 +1199,9 @@ public: return symbol; } - int decode_bit( Bit_model & bm ) + unsigned decode_bit( Bit_model & bm ) { - int symbol; + unsigned symbol; const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; if( code < bound ) { @@ -1189,18 +1221,18 @@ public: return symbol; } - int decode_tree( Bit_model bm[], const int num_bits ) + unsigned decode_tree( Bit_model bm[], const int num_bits ) { - int symbol = 1; + unsigned symbol = 1; for( int i = 0; i < num_bits; ++i ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol - (1 << num_bits); } - int decode_tree_reversed( Bit_model bm[], const int num_bits ) + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) { - int symbol = decode_tree( bm, num_bits ); - int reversed_symbol = 0; + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; for( int i = 0; i < num_bits; ++i ) { reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); @@ -1209,14 +1241,13 @@ public: return reversed_symbol; } - int decode_matched( Bit_model bm[], const int match_byte ) + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) { - Bit_model * const bm1 = bm + 0x100; - int symbol = 1; + unsigned symbol = 1; for( int i = 7; i >= 0; --i ) { - const int match_bit = ( match_byte >> i ) & 1; - const int bit = decode_bit( bm1[(match_bit<<8)+symbol] ); + const unsigned match_bit = ( match_byte >> i ) & 1; + const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); symbol = ( symbol << 1 ) | bit; if( match_bit != bit ) { @@ -1228,7 +1259,7 @@ public: return symbol & 0xFF; } - int decode_len( Len_model & lm, const int pos_state ) + unsigned decode_len( Len_model & lm, const int pos_state ) { if( decode_bit( lm.choice1 ) == 0 ) return decode_tree( lm.bm_low[pos_state], len_low_bits ); @@ -1256,9 +1287,9 @@ class LZ_decoder uint8_t peek( const unsigned distance ) const { - unsigned i = pos - distance - 1; - if( pos <= distance ) i += dictionary_size; - return buffer[i]; + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte } void put_byte( const uint8_t b ) @@ -1277,7 +1308,7 @@ public: stream_pos( 0 ), crc_( 0xFFFFFFFFU ), pos_wrapped( false ) - { buffer[dictionary_size-1] = 0; } // prev_byte of first byte + {} ~LZ_decoder() { delete[] buffer; } @@ -1315,7 +1346,7 @@ bool LZ_decoder::decode_member() // Returns false if error Bit_model bm_rep2[State::states]; Bit_model bm_len[State::states][pos_states]; Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; - Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; Len_model match_len_model; Len_model rep_len_model; @@ -1344,7 +1375,12 @@ bool LZ_decoder::decode_member() // Returns false if error int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit { - if( rdec.decode_bit( bm_rep0[state()] ) != 0 ) // 3rd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else { unsigned distance; if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit @@ -1360,11 +1396,6 @@ bool LZ_decoder::decode_member() // Returns false if error rep1 = rep0; rep0 = distance; } - else - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } @@ -1373,15 +1404,14 @@ bool LZ_decoder::decode_member() // Returns false if error rep3 = rep2; rep2 = rep1; rep1 = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); const int len_state = std::min( len - min_match_len, len_states - 1 ); - const int dis_slot = - rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); - if( dis_slot < start_dis_model ) rep0 = dis_slot; - else + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) { + const unsigned dis_slot = rep0; const int direct_bits = ( dis_slot >> 1 ) - 1; rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), direct_bits ); else { @@ -1417,7 +1447,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2016 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2017 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1432,7 +1462,7 @@ int main( const int argc, const char * const argv[] ) for( bool first_member = true; ; first_member = false ) { - File_header header; + File_header header; // verify header for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { @@ -1447,11 +1477,11 @@ int main( const int argc, const char * const argv[] ) { std::fputs( "Invalid dictionary size in member header.\n", stderr ); return 2; } - LZ_decoder decoder( dict_size ); + LZ_decoder decoder( dict_size ); // decode LZMA stream if( !decoder.decode_member() ) { std::fputs( "Data error\n", stderr ); return 2; } - File_trailer trailer; + File_trailer trailer; // verify trailer for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); unsigned crc = 0; for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } @@ -1495,19 +1525,19 @@ Concept index Tag Table: Node: Top208 -Node: Introduction1145 -Node: Invoking lzip6071 -Ref: --trailing-error6635 -Node: Quality assurance12628 -Node: File format20782 -Node: Algorithm23186 -Node: Stream format26012 -Node: Trailing data36660 -Node: Examples38038 -Ref: concat-example39211 -Node: Problems40211 -Node: Reference source code40741 -Node: Concept index54957 +Node: Introduction1147 +Node: Invoking lzip6367 +Ref: --trailing-error6931 +Node: Quality assurance13849 +Node: File format22003 +Node: Algorithm24407 +Node: Stream format27233 +Node: Trailing data37973 +Node: Examples39874 +Ref: concat-example41047 +Node: Problems42085 +Node: Reference source code42615 +Node: Concept index56932 End Tag Table diff --git a/doc/lzip.texi b/doc/lzip.texi index 27feeff..17a2b1e 100644 --- a/doc/lzip.texi +++ b/doc/lzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 14 May 2016 -@set VERSION 1.18 +@set UPDATED 13 April 2017 +@set VERSION 1.19 @dircategory Data Compression @direntry @@ -49,7 +49,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @end menu @sp 1 -Copyright @copyright{} 2008-2016 Antonio Diaz Diaz. +Copyright @copyright{} 2008-2017 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -60,9 +60,10 @@ to copy, distribute and modify it. @cindex introduction Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +one of gzip or bzip2. Lzip can compress about as fast as gzip +@w{(lzip -0)}, or compress most files more than bzip2 @w{(lzip -9)}. +Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -82,10 +83,10 @@ including error-checked merging of damaged copies of a file. @item The lzip format is as simple as possible (but not simpler). The lzip -manual provides the code of a simple decompressor along with a detailed -explanation of how it works, so that with the only help of the lzip -manual it would be possible for a digital archaeologist to extract the -data from a lzip file long after quantum computers eventually render +manual provides the source code of a simple decompressor along with a +detailed explanation of how it works, so that with the only help of the +lzip manual it would be possible for a digital archaeologist to extract +the data from a lzip file long after quantum computers eventually render LZMA obsolete. @item @@ -156,7 +157,7 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Lzip can produce multimember files and safely recover, with lziprecover, +Lzip can produce multimember files, and lziprecover can safely recover the undamaged members in case of file damage. Lzip can also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume @@ -166,6 +167,10 @@ Lzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, about 2 PiB each. +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. + @node Invoking lzip @chapter Invoking lzip @@ -237,6 +242,20 @@ Force re-compression of files whose name already has the @samp{.lz} or @itemx --keep Keep (don't delete) input files during compression or decompression. +@item -l +@itemx --list +Print the uncompressed size, compressed size and percentage saved of the +specified file(s). Trailing data are ignored. The values produced are +correct even for multimember files. If more than one file is given, a +final line containing the cumulative sizes is printed. With @samp{-v}, +the dictionary size, the number of members in the file, and the amount +of trailing data (if any) are also printed. With @samp{-vv}, the +positions and sizes of each member in multimember files are also +printed. @samp{-lq} can be used to verify quickly (without +decompressing) the structural integrity of the specified files. (Use +@samp{--test} to verify the data integrity). @samp{-alq} additionally +verifies that none of the specified files contain trailing data. + @item -m @var{bytes} @itemx --match-length=@var{bytes} Set the match length limit in bytes. After a match this long is found, @@ -284,7 +303,8 @@ EiB. Check integrity of the specified file(s), but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with @samp{-v} to see information about the file(s). If -a file fails the test, lzip continues checking the rest of the files. +a file fails the test, does not exist, can't be opened, or is a +terminal, lzip continues checking the rest of the files. @item -v @itemx --verbose @@ -294,7 +314,8 @@ second @samp{-v} shows the progress of compression.@* When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 bytes of -trailing data (if any). +trailing data (if any) both in hexadecimal and as a string of printable +ASCII characters. @item -0 .. -9 Set the compression parameters (dictionary size and match length limit) @@ -756,16 +777,16 @@ Lengths (the @samp{len} in the table above) are coded as follows: The coding of distances is a little more complicated, so I'll begin explaining a simpler version of the encoding. -Imagine you need to code a number from 0 to 2^32 - 1, and you want to do -it in a way that produces shorter codes for the smaller numbers. You may -first send the position of the most significant bit that is set to 1, -which you may find by making a bit scan from the left (from the MSB). A -position of 0 means that the number is 0 (no bit is set), 1 means the -LSB is the first bit set (the number is 1), and 32 means the MSB is set -(i.e., the number is >= 0x80000000). Lets call this bit position a -"slot". Then, if slot is > 1, you send the remaining slot - 1 bits. Lets -call these bits "direct_bits" because they are coded directly by value -instead of indirectly by position. +Imagine you need to code a number from 0 to @w{2^32 - 1}, and you want +to do it in a way that produces shorter codes for the smaller numbers. +You may first send the position of the most significant bit that is set +to 1, which you may find by making a bit scan from the left (from the +MSB). A position of 0 means that the number is 0 (no bit is set), 1 +means the LSB is the first bit set (the number is 1), and 32 means the +MSB is set (i.e., the number is @w{>= 0x80000000}). Let's call this bit +position a "slot". Then, if slot is @w{> 1}, you send the remaining +@w{slot - 1} bits. Let's call these bits "direct_bits" because they are +coded directly by value instead of indirectly by position. The inconvenient of this simple method is that it needs 6 bits to code the slot, but it just uses 33 of the 64 possible values, wasting almost @@ -777,14 +798,15 @@ same 6 bits that would take to encode the position alone. This seems to need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is no next bit, so the number of needed slots is 64 (0 to 63). -The slot number is context-coded in 6 bits. @samp{direct_bits} is the -amount of remaining bits (from 0 to 30) needed to form a complete -distance, and is calculated as (slot >> 1) - 1. If a distance needs 6 or -more direct_bits, the last 4 bits are coded separately. The last piece -(all the direct_bits for distances 4 to 127 or the last 4 bits for -distances >= 128) is context-coded in reverse order (from LSB to MSB). -For distances >= 128, the @samp{direct_bits - 4} part is coded with -fixed 0.5 probability. +The 6 bits representing this "slot number" are then context-coded. If +the distance is @w{>= 4}, the remaining bits are coded as follows. +@samp{direct_bits} is the amount of remaining bits (from 0 to 30) needed +to form a complete distance, and is calculated as @w{(slot >> 1) - 1}. +If a distance needs 6 or more direct_bits, the last 4 bits are coded +separately. The last piece (all the direct_bits for distances 4 to 127 +or the last 4 bits for distances @w{>= 128}) is context-coded in reverse +order (from LSB to MSB). For distances @w{>= 128}, the +@w{@samp{direct_bits - 4}} part is coded with fixed 0.5 probability. @multitable @columnfractions .5 .5 @headitem Bit sequence @tab Description @@ -816,8 +838,8 @@ decoded data. Value of the 3 most significant bits of the latest byte decoded. @item len_state -Coded value of length (length - 2), with a maximum of 3. The resulting -value is in the range 0 to 3. +Coded value of length @w{(length - 2)}, with a maximum of 3. The +resulting value is in the range 0 to 3. @end table @@ -903,7 +925,7 @@ with their context. (See @samp{decode_bit} in the source). The range decoder state consists of two unsigned 32-bit variables; @code{range} (representing the most significant part of the range size not yet decoded), and @code{code} (representing the current point within -@code{range}). @code{range} is initialized to (2^32 - 1), and +@code{range}). @code{range} is initialized to @w{(2^32 - 1)}, and @code{code} is initialized to 0. The range encoder produces a first 0 byte that must be ignored by the @@ -926,20 +948,24 @@ Of Stream" marker is decoded. @chapter Extra data appended to the file @cindex trailing data -Sometimes extra data is found appended to a lzip file after the last +Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: @itemize @bullet @item Padding added to make the file size a multiple of some block size, for -example when writing to a tape. +example when writing to a tape. It is safe to append any amount of +padding zero bytes to a lzip file. @item -Garbage added by some not totally successful copy operation. +Useful data added by the user; a cryptographically secure hash, a +description of file contents, etc. It is safe to append any amount of +text to a lzip file as long as the text does not begin with the string +"LZIP", and does not contain any zero bytes (null characters). Nonzero +bytes and zero bytes can't be safely mixed in trailing data. @item -Useful data added by the user; a cryptographically secure hash, a -description of file contents, etc. +Garbage added by some not totally successful copy operation. @item Malicious data added to the file in order to make its total size and @@ -954,8 +980,12 @@ integrity information itself. Therefore it can be considered to be below the noise level. @end itemize +Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + Trailing data can be safely ignored in most cases. In some cases, like -that of user-added data, it is expected to be ignored. In those cases +that of user-added data, they are expected to be ignored. In those cases where a file containing trailing data must be rejected, the option @samp{--trailing-error} can be used. @xref{--trailing-error}. @@ -1020,8 +1050,8 @@ lzip -c /dev/sdc > file.lz @sp 1 @anchor{concat-example} @noindent -Example 6: The right way of concatenating compressed files. -@xref{Trailing data}. +Example 6: The right way of concatenating the decompressed output of two +or more compressed files. @xref{Trailing data}. @example Don't do this @@ -1097,7 +1127,7 @@ find by running @w{@code{lzip --version}}. @verbatim /* Lzd - Educational decompressor for the lzip format - Copyright (C) 2013-2016 Antonio Diaz Diaz. + Copyright (C) 2013-2017 Antonio Diaz Diaz. This program is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -1248,10 +1278,10 @@ public: uint8_t get_byte() { return std::getc( stdin ); } - int decode( const int num_bits ) + unsigned decode( const int num_bits ) { - int symbol = 0; - for( int i = 0; i < num_bits; ++i ) + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) { range >>= 1; symbol <<= 1; @@ -1262,9 +1292,9 @@ public: return symbol; } - int decode_bit( Bit_model & bm ) + unsigned decode_bit( Bit_model & bm ) { - int symbol; + unsigned symbol; const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; if( code < bound ) { @@ -1284,18 +1314,18 @@ public: return symbol; } - int decode_tree( Bit_model bm[], const int num_bits ) + unsigned decode_tree( Bit_model bm[], const int num_bits ) { - int symbol = 1; + unsigned symbol = 1; for( int i = 0; i < num_bits; ++i ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol - (1 << num_bits); } - int decode_tree_reversed( Bit_model bm[], const int num_bits ) + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) { - int symbol = decode_tree( bm, num_bits ); - int reversed_symbol = 0; + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; for( int i = 0; i < num_bits; ++i ) { reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); @@ -1304,14 +1334,13 @@ public: return reversed_symbol; } - int decode_matched( Bit_model bm[], const int match_byte ) + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) { - Bit_model * const bm1 = bm + 0x100; - int symbol = 1; + unsigned symbol = 1; for( int i = 7; i >= 0; --i ) { - const int match_bit = ( match_byte >> i ) & 1; - const int bit = decode_bit( bm1[(match_bit<<8)+symbol] ); + const unsigned match_bit = ( match_byte >> i ) & 1; + const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); symbol = ( symbol << 1 ) | bit; if( match_bit != bit ) { @@ -1323,7 +1352,7 @@ public: return symbol & 0xFF; } - int decode_len( Len_model & lm, const int pos_state ) + unsigned decode_len( Len_model & lm, const int pos_state ) { if( decode_bit( lm.choice1 ) == 0 ) return decode_tree( lm.bm_low[pos_state], len_low_bits ); @@ -1351,9 +1380,9 @@ class LZ_decoder uint8_t peek( const unsigned distance ) const { - unsigned i = pos - distance - 1; - if( pos <= distance ) i += dictionary_size; - return buffer[i]; + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte } void put_byte( const uint8_t b ) @@ -1372,7 +1401,7 @@ public: stream_pos( 0 ), crc_( 0xFFFFFFFFU ), pos_wrapped( false ) - { buffer[dictionary_size-1] = 0; } // prev_byte of first byte + {} ~LZ_decoder() { delete[] buffer; } @@ -1410,7 +1439,7 @@ bool LZ_decoder::decode_member() // Returns false if error Bit_model bm_rep2[State::states]; Bit_model bm_len[State::states][pos_states]; Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; - Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; Len_model match_len_model; Len_model rep_len_model; @@ -1439,7 +1468,12 @@ bool LZ_decoder::decode_member() // Returns false if error int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit { - if( rdec.decode_bit( bm_rep0[state()] ) != 0 ) // 3rd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else { unsigned distance; if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit @@ -1455,11 +1489,6 @@ bool LZ_decoder::decode_member() // Returns false if error rep1 = rep0; rep0 = distance; } - else - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } @@ -1468,15 +1497,14 @@ bool LZ_decoder::decode_member() // Returns false if error rep3 = rep2; rep2 = rep1; rep1 = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); const int len_state = std::min( len - min_match_len, len_states - 1 ); - const int dis_slot = - rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); - if( dis_slot < start_dis_model ) rep0 = dis_slot; - else + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) { + const unsigned dis_slot = rep0; const int direct_bits = ( dis_slot >> 1 ) - 1; rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), direct_bits ); else { @@ -1512,7 +1540,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2016 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2017 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1527,7 +1555,7 @@ int main( const int argc, const char * const argv[] ) for( bool first_member = true; ; first_member = false ) { - File_header header; + File_header header; // verify header for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { @@ -1542,11 +1570,11 @@ int main( const int argc, const char * const argv[] ) { std::fputs( "Invalid dictionary size in member header.\n", stderr ); return 2; } - LZ_decoder decoder( dict_size ); + LZ_decoder decoder( dict_size ); // decode LZMA stream if( !decoder.decode_member() ) { std::fputs( "Data error\n", stderr ); return 2; } - File_trailer trailer; + File_trailer trailer; // verify trailer for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); unsigned crc = 0; for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -85,7 +85,7 @@ int LZ_encoder::get_match_pairs( Pair * pairs ) prev_positions[key2] = pos1; prev_positions[key3] = pos1; - int newpos = prev_positions[key4]; + int newpos1 = prev_positions[key4]; prev_positions[key4] = pos1; int32_t * ptr0 = pos_array + ( cyclic_pos << 1 ); @@ -94,9 +94,9 @@ int LZ_encoder::get_match_pairs( Pair * pairs ) for( int count = cycles; ; ) { - if( newpos <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; } + if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; } - const int delta = pos1 - newpos; + const int delta = pos1 - newpos1; int32_t * const newptr = pos_array + ( ( cyclic_pos - delta + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ) << 1 ); @@ -118,16 +118,16 @@ int LZ_encoder::get_match_pairs( Pair * pairs ) } if( data[len-delta] < data[len] ) { - *ptr0 = newpos; + *ptr0 = newpos1; ptr0 = newptr + 1; - newpos = *ptr0; + newpos1 = *ptr0; len0 = len; if( len1 < len ) len = len1; } else { - *ptr1 = newpos; + *ptr1 = newpos1; ptr1 = newptr; - newpos = *ptr1; + newpos1 = *ptr1; len1 = len; if( len0 < len ) len = len0; } } @@ -142,7 +142,7 @@ void LZ_encoder::update_distance_prices() const int dis_slot = dis_slots[dis]; const int direct_bits = ( dis_slot >> 1 ) - 1; const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - const int price = price_symbol_reversed( bm_dis + base - dis_slot - 1, + const int price = price_symbol_reversed( bm_dis + ( base - dis_slot ), dis - base, direct_bits ); for( int len_state = 0; len_state < len_states; ++len_state ) dis_prices[len_state][dis] = price; @@ -171,7 +171,7 @@ void LZ_encoder::update_distance_prices() /* Returns the number of bytes advanced (ahead). trials[0]..trials[ahead-1] contain the steps to encode. - ( trials[0].dis == -1 ) means literal. + ( trials[0].dis4 == -1 ) means literal. A match/rep longer or equal than match_len_limit finishes the sequence. */ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], @@ -192,13 +192,13 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], int rep_index = 0; for( int i = 0; i < num_rep_distances; ++i ) { - replens[i] = true_match_len( 0, reps[i] + 1, max_match_len ); + replens[i] = true_match_len( 0, reps[i] + 1 ); if( replens[i] > replens[rep_index] ) rep_index = i; } if( replens[rep_index] >= match_len_limit ) { trials[0].price = replens[rep_index]; - trials[0].dis = rep_index; + trials[0].dis4 = rep_index; move_and_update( replens[rep_index] ); return replens[rep_index]; } @@ -206,7 +206,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( main_len >= match_len_limit ) { trials[0].price = main_len; - trials[0].dis = pairs[num_pairs-1].dis + num_rep_distances; + trials[0].dis4 = pairs[num_pairs-1].dis + num_rep_distances; move_and_update( main_len ); return main_len; } @@ -221,7 +221,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[1].price += price_literal( prev_byte, cur_byte ); else trials[1].price += price_matched( prev_byte, cur_byte, match_byte ); - trials[1].dis = -1; // literal + trials[1].dis4 = -1; // literal const int match_price = price1( bm_match[state()][pos_state] ); const int rep_match_price = match_price + price1( bm_rep[state()] ); @@ -234,7 +234,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( num_trials < min_match_len ) { trials[0].price = 1; - trials[0].dis = trials[1].dis; + trials[0].dis4 = trials[1].dis4; move_pos(); return 1; } @@ -292,7 +292,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], Trial & cur_trial = trials[cur]; State cur_state; { - int dis = cur_trial.dis; + const int dis4 = cur_trial.dis4; int prev_index = cur_trial.prev_index; const int prev_index2 = cur_trial.prev_index2; @@ -301,32 +301,24 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], cur_state = trials[prev_index].state; if( prev_index + 1 == cur ) // len == 1 { - if( dis == 0 ) cur_state.set_short_rep(); + if( dis4 == 0 ) cur_state.set_short_rep(); else cur_state.set_char(); // literal } - else if( dis < num_rep_distances ) cur_state.set_rep(); + else if( dis4 < num_rep_distances ) cur_state.set_rep(); else cur_state.set_match(); } - else if( prev_index2 == dual_step_trial ) // dis == 0 + else { - --prev_index; - cur_state = trials[prev_index].state; - cur_state.set_char(); - cur_state.set_rep(); - } - else // if( prev_index2 >= 0 ) - { - prev_index = prev_index2; - cur_state = trials[prev_index].state; - if( dis < num_rep_distances ) cur_state.set_rep(); - else cur_state.set_match(); - cur_state.set_char(); - cur_state.set_rep(); + if( prev_index2 == dual_step_trial ) // dis4 == 0 (rep0) + --prev_index; + else // prev_index2 >= 0 + prev_index = prev_index2; + cur_state.set_char_rep(); } cur_trial.state = cur_state; for( int i = 0; i < num_rep_distances; ++i ) cur_trial.reps[i] = trials[prev_index].reps[i]; - mtf_reps( dis, cur_trial.reps ); + mtf_reps( dis4, cur_trial.reps ); // literal is ignored } const int pos_state = data_position() & pos_state_mask; @@ -349,14 +341,14 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], const int match_price = cur_trial.price + price1( bm_match[cur_state()][pos_state] ); const int rep_match_price = match_price + price1( bm_rep[cur_state()] ); - if( match_byte == cur_byte && next_trial.dis != 0 && + if( match_byte == cur_byte && next_trial.dis4 != 0 && next_trial.prev_index2 == single_step_trial ) { const int price = rep_match_price + price_shortrep( cur_state, pos_state ); if( price <= next_trial.price ) { next_trial.price = price; - next_trial.dis = 0; + next_trial.dis4 = 0; // rep0 next_trial.prev_index = cur; } } @@ -380,9 +372,9 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], const int pos_state2 = ( pos_state + 1 ) & pos_state_mask; State state2 = cur_state; state2.set_char(); const int price = next_price + - price1( bm_match[state2()][pos_state2] ) + - price1( bm_rep[state2()] ) + - price_rep0_len( len, state2, pos_state2 ); + price1( bm_match[state2()][pos_state2] ) + + price1( bm_rep[state2()] ) + + price_rep0_len( len, state2, pos_state2 ); while( num_trials < cur + 1 + len ) trials[++num_trials].price = infinite_price; trials[cur+1+len].update2( price, cur + 1 ); @@ -395,8 +387,8 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], for( int rep = 0; rep < num_rep_distances; ++rep ) { const uint8_t * const data = ptr_to_current_pos(); - int len; const int dis = cur_trial.reps[rep] + 1; + int len; if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue; for( len = min_match_len; len < len_limit; ++len ) @@ -447,7 +439,6 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], for( int len = start_len; ; ++len ) { int price = normal_match_price + price_pair( dis, len, pos_state ); - trials[cur+len].update( price, dis + num_rep_distances, cur ); // try match + literal + rep0 @@ -493,7 +484,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) const int dis_price_count = best ? 1 : 512; const int align_price_count = best ? 1 : dis_align_size; const int price_count = ( match_len_limit > 36 ) ? 1013 : 4093; - int price_counter = 0; + int price_counter = 0; // counters may decrement below 0 int dis_price_counter = 0; int align_price_counter = 0; int reps[num_rep_distances]; @@ -532,14 +523,13 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) } int ahead = sequence_optimizer( reps, state ); - if( ahead <= 0 ) return false; // can't happen price_counter -= ahead; for( int i = 0; ahead > 0; ) { const int pos_state = ( data_position() - ahead ) & pos_state_mask; const int len = trials[i].price; - const int dis = trials[i].dis; + int dis = trials[i].dis4; bool bit = ( dis < 0 ); renc.encode_bit( bm_match[state()][pos_state], !bit ); @@ -548,14 +538,13 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) const uint8_t prev_byte = peek( ahead + 1 ); const uint8_t cur_byte = peek( ahead ); crc32.update_byte( crc_, cur_byte ); - if( state.is_char() ) + if( state.is_char_set_char() ) encode_literal( prev_byte, cur_byte ); else { const uint8_t match_byte = peek( ahead + reps[0] + 1 ); encode_matched( prev_byte, cur_byte, match_byte ); } - state.set_char(); } else // match or repeated match { @@ -585,9 +574,9 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) } else // match { - encode_pair( dis - num_rep_distances, len, pos_state ); - if( get_slot( dis - num_rep_distances ) >= end_dis_model ) - --align_price_counter; + dis -= num_rep_distances; + encode_pair( dis, len, pos_state ); + if( dis >= modeled_distances ) --align_price_counter; --dis_price_counter; match_len_prices.decrement_counter( pos_state ); state.set_match(); @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,11 +21,10 @@ class Len_prices const int len_symbols; const int count; int prices[pos_states][max_len_symbols]; - int counters[pos_states]; + int counters[pos_states]; // may decrement below 0 void update_low_mid_prices( const int pos_state ) { - counters[pos_state] = count; int * const pps = prices[pos_state]; int tmp = price0( lm.choice1 ); int len = 0; @@ -64,13 +63,14 @@ public: bool high_pending = false; for( int pos_state = 0; pos_state < pos_states; ++pos_state ) if( counters[pos_state] <= 0 ) - { update_low_mid_prices( pos_state ); high_pending = true; } + { counters[pos_state] = count; + update_low_mid_prices( pos_state ); high_pending = true; } if( high_pending && len_symbols > len_low_symbols + len_mid_symbols ) update_high_prices(); } - int price( const int symbol, const int pos_state ) const - { return prices[pos_state][symbol - min_match_len]; } + int price( const int len, const int pos_state ) const + { return prices[pos_state][len - min_match_len]; } }; @@ -91,32 +91,33 @@ class LZ_encoder : public LZ_encoder_base { State state; int price; // dual use var; cumulative price, match length - int dis; // rep index or match distance. (-1 for literal) + int dis4; // -1 for literal, or rep, or match distance + 4 int prev_index; // index of prev trial in trials[] int prev_index2; // -2 trial is single step // -1 literal + rep0 // >= 0 ( rep or match ) + literal + rep0 int reps[num_rep_distances]; - void update( const int pr, const int distance, const int p_i ) + void update( const int pr, const int distance4, const int p_i ) { if( pr < price ) - { price = pr; dis = distance; prev_index = p_i; + { price = pr; dis4 = distance4; prev_index = p_i; prev_index2 = single_step_trial; } } void update2( const int pr, const int p_i ) { if( pr < price ) - { price = pr; dis = 0; prev_index = p_i; + { price = pr; dis4 = 0; prev_index = p_i; prev_index2 = dual_step_trial; } } - void update3( const int pr, const int distance, const int p_i, + void update3( const int pr, const int distance4, const int p_i, const int p_i2 ) { if( pr < price ) - { price = pr; dis = distance; prev_index = p_i; prev_index2 = p_i2; } + { price = pr; dis4 = distance4; prev_index = p_i; + prev_index2 = p_i2; } } }; @@ -137,26 +138,26 @@ class LZ_encoder : public LZ_encoder_base { if( ahead < 0 || pos < ahead ) return false; pos -= ahead; + if( cyclic_pos < ahead ) cyclic_pos += dictionary_size + 1; cyclic_pos -= ahead; - if( cyclic_pos < 0 ) cyclic_pos += dictionary_size + 1; return true; } int get_match_pairs( Pair * pairs = 0 ); void update_distance_prices(); - // move-to-front dis in/into reps if( dis > 0 ) - static void mtf_reps( const int dis, int reps[num_rep_distances] ) + // move-to-front dis in/into reps; do nothing if( dis4 <= 0 ) + static void mtf_reps( const int dis4, int reps[num_rep_distances] ) { - if( dis >= num_rep_distances ) + if( dis4 >= num_rep_distances ) // match { - for( int i = num_rep_distances - 1; i > 0; --i ) reps[i] = reps[i-1]; - reps[0] = dis - num_rep_distances; + reps[3] = reps[2]; reps[2] = reps[1]; reps[1] = reps[0]; + reps[0] = dis4 - num_rep_distances; } - else if( dis > 0 ) + else if( dis4 > 0 ) // repeated match { - const int distance = reps[dis]; - for( int i = dis; i > 0; --i ) reps[i] = reps[i-1]; + const int distance = reps[dis4]; + for( int i = dis4; i > 0; --i ) reps[i] = reps[i-1]; reps[0] = distance; } } @@ -203,13 +204,10 @@ class LZ_encoder : public LZ_encoder_base const int num_pairs = get_match_pairs( pairs ); if( num_pairs > 0 ) { - int len = pairs[num_pairs-1].len; + const int len = pairs[num_pairs-1].len; if( len == match_len_limit && len < max_match_len ) - { - len += true_match_len( len, pairs[num_pairs-1].dis + 1, - max_match_len - len ); - pairs[num_pairs-1].len = len; - } + pairs[num_pairs-1].len = + true_match_len( len, pairs[num_pairs-1].dis + 1 ); } return num_pairs; } @@ -226,7 +224,7 @@ class LZ_encoder : public LZ_encoder_base void backward( int cur ) { - int & dis = trials[cur].dis; + int dis4 = trials[cur].dis4; while( cur > 0 ) { const int prev_index = trials[cur].prev_index; @@ -234,19 +232,19 @@ class LZ_encoder : public LZ_encoder_base if( trials[cur].prev_index2 != single_step_trial ) { - prev_trial.dis = -1; + prev_trial.dis4 = -1; // literal prev_trial.prev_index = prev_index - 1; prev_trial.prev_index2 = single_step_trial; if( trials[cur].prev_index2 >= 0 ) { Trial & prev_trial2 = trials[prev_index-1]; - prev_trial2.dis = dis; dis = 0; + prev_trial2.dis4 = dis4; dis4 = 0; // rep0 prev_trial2.prev_index = trials[cur].prev_index2; prev_trial2.prev_index2 = single_step_trial; } } prev_trial.price = cur - prev_index; // len - cur = dis; dis = prev_trial.dis; prev_trial.dis = cur; + cur = dis4; dis4 = prev_trial.dis4; prev_trial.dis4 = cur; cur = prev_index; } } @@ -254,7 +252,7 @@ class LZ_encoder : public LZ_encoder_base int sequence_optimizer( const int reps[num_rep_distances], const State state ); - enum { before = max_num_trials + 1, + enum { before = max_num_trials, // bytes to keep in buffer after pos after_size = ( 2 * max_match_len ) + 1, dict_factor = 2, diff --git a/encoder_base.cc b/encoder_base.cc index cfc058e..55ce376 100644 --- a/encoder_base.cc +++ b/encoder_base.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -53,11 +53,11 @@ void Matchfinder_base::normalize_pos() internal_error( "pos > stream_pos in Matchfinder_base::normalize_pos." ); if( !at_stream_end ) { - const int offset = pos - dictionary_size - before_size; + const int offset = pos - before_size - dictionary_size; const int size = stream_pos - offset; std::memmove( buffer, buffer + offset, size ); partial_data_pos += offset; - pos -= offset; + pos -= offset; // pos = before_size + dictionary_size stream_pos -= offset; for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] -= std::min( prev_positions[i], offset ); @@ -109,7 +109,8 @@ Matchfinder_base::Matchfinder_base( const int before, const int dict_size, num_prev_positions = size; pos_array_size = pos_array_factor * ( dictionary_size + 1 ); size += pos_array_size; - prev_positions = new( std::nothrow ) int32_t[size]; + if( size * sizeof prev_positions[0] <= size ) prev_positions = 0; + else prev_positions = new( std::nothrow ) int32_t[size]; if( !prev_positions ) { std::free( buffer ); throw std::bad_alloc(); } pos_array = prev_positions + num_prev_positions; for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = 0; @@ -172,7 +173,7 @@ void LZ_encoder_base::reset() bm_rep2[0].reset( State::states ); bm_len[0][0].reset( State::states * pos_states ); bm_dis_slot[0][0].reset( len_states * (1 << dis_slot_bits ) ); - bm_dis[0].reset( modeled_distances - end_dis_model ); + bm_dis[0].reset( modeled_distances - end_dis_model + 1 ); bm_align[0].reset( dis_align_size ); match_len_model.reset(); rep_len_model.reset(); diff --git a/encoder_base.h b/encoder_base.h index 9ce622c..e186c2d 100644 --- a/encoder_base.h +++ b/encoder_base.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -85,13 +85,13 @@ inline int price0( const Bit_model bm ) inline int price1( const Bit_model bm ) { return prob_prices[bit_model_total - bm.probability]; } -inline int price_bit( const Bit_model bm, const int bit ) - { if( bit ) return price1( bm ); else return price0( bm ); } +inline int price_bit( const Bit_model bm, const bool bit ) + { return ( bit ? price1( bm ) : price0( bm ) ); } inline int price_symbol3( const Bit_model bm[], int symbol ) { - int bit = symbol & 1; + bool bit = symbol & 1; symbol |= 8; symbol >>= 1; int price = price_bit( bm[symbol], bit ); bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); @@ -99,9 +99,9 @@ inline int price_symbol3( const Bit_model bm[], int symbol ) } -inline int price_symbol6( const Bit_model bm[], int symbol ) +inline int price_symbol6( const Bit_model bm[], unsigned symbol ) { - int bit = symbol & 1; + bool bit = symbol & 1; symbol |= 64; symbol >>= 1; int price = price_bit( bm[symbol], bit ); bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); @@ -114,7 +114,7 @@ inline int price_symbol6( const Bit_model bm[], int symbol ) inline int price_symbol8( const Bit_model bm[], int symbol ) { - int bit = symbol & 1; + bool bit = symbol & 1; symbol |= 0x100; symbol >>= 1; int price = price_bit( bm[symbol], bit ); bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); @@ -134,31 +134,29 @@ inline int price_symbol_reversed( const Bit_model bm[], int symbol, int model = 1; for( int i = num_bits; i > 0; --i ) { - const int bit = symbol & 1; + const bool bit = symbol & 1; + symbol >>= 1; price += price_bit( bm[model], bit ); model = ( model << 1 ) | bit; - symbol >>= 1; } return price; } -inline int price_matched( const Bit_model bm[], int symbol, int match_byte ) +inline int price_matched( const Bit_model bm[], unsigned symbol, + unsigned match_byte ) { int price = 0; - int mask = 0x100; + unsigned mask = 0x100; symbol |= mask; - - do { - match_byte <<= 1; - const int match_bit = match_byte & mask; - symbol <<= 1; - const int bit = symbol & 0x100; - price += price_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; + while( true ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & mask; + const bool bit = ( symbol <<= 1 ) & 0x100; + price += price_bit( bm[(symbol>>9)+match_bit+mask], bit ); + if( symbol >= 0x10000 ) return price; + mask &= ~(match_bit ^ symbol); // if( match_bit != bit ) mask = 0; } - while( symbol < 0x10000 ); - return price; } @@ -203,12 +201,11 @@ public: bool data_finished() const { return at_stream_end && pos >= stream_pos; } const uint8_t * ptr_to_current_pos() const { return buffer + pos; } - int true_match_len( const int index, const int distance, int len_limit ) const + int true_match_len( const int index, const int distance ) const { - const uint8_t * const data = buffer + pos + index; - int i = 0; - if( index + len_limit > available_bytes() ) - len_limit = available_bytes() - index; + const uint8_t * const data = buffer + pos; + int i = index; + const int len_limit = std::min( available_bytes(), (int)max_match_len ); while( i < len_limit && data[i-distance] == data[i] ) ++i; return i; } @@ -238,9 +235,9 @@ class Range_encoder void shift_low() { - const bool carry = ( low > 0xFFFFFFFFU ); - if( carry || low < 0xFF000000U ) + if( low >> 24 != 0xFF ) { + const bool carry = ( low > 0xFFFFFFFFU ); put_byte( cache + carry ); for( ; ff_count > 0; --ff_count ) put_byte( 0xFF + carry ); cache = low >> 24; @@ -291,15 +288,15 @@ public: void encode( const int symbol, const int num_bits ) { - for( int i = num_bits - 1; i >= 0; --i ) + for( unsigned mask = 1 << ( num_bits - 1 ); mask > 0; mask >>= 1 ) { range >>= 1; - if( (symbol >> i) & 1 ) low += range; + if( symbol & mask ) low += range; if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); } } } - void encode_bit( Bit_model & bm, const int bit ) + void encode_bit( Bit_model & bm, const bool bit ) { const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; if( !bit ) @@ -319,17 +316,17 @@ public: void encode_tree3( Bit_model bm[], const int symbol ) { int model = 1; - int bit = ( symbol >> 2 ) & 1; + bool bit = ( symbol >> 2 ) & 1; encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; bit = ( symbol >> 1 ) & 1; encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; encode_bit( bm[model], symbol & 1 ); } - void encode_tree6( Bit_model bm[], const int symbol ) + void encode_tree6( Bit_model bm[], const unsigned symbol ) { int model = 1; - int bit = ( symbol >> 5 ) & 1; + bool bit = ( symbol >> 5 ) & 1; encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; bit = ( symbol >> 4 ) & 1; encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; @@ -345,13 +342,12 @@ public: void encode_tree8( Bit_model bm[], const int symbol ) { int model = 1; - int mask = ( 1 << 7 ); - do { - const int bit = ( symbol & mask ); + for( int i = 7; i >= 0; --i ) + { + const bool bit = ( symbol >> i ) & 1; encode_bit( bm[model], bit ); - model <<= 1; if( bit ) ++model; + model = ( model << 1 ) | bit; } - while( mask >>= 1 ); } void encode_tree_reversed( Bit_model bm[], int symbol, const int num_bits ) @@ -359,44 +355,41 @@ public: int model = 1; for( int i = num_bits; i > 0; --i ) { - const int bit = symbol & 1; + const bool bit = symbol & 1; + symbol >>= 1; encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - symbol >>= 1; } } - void encode_matched( Bit_model bm[], int symbol, int match_byte ) + void encode_matched( Bit_model bm[], unsigned symbol, unsigned match_byte ) { - int mask = 0x100; + unsigned mask = 0x100; symbol |= mask; - - do { - match_byte <<= 1; - const int match_bit = match_byte & mask; - symbol <<= 1; - const int bit = symbol & 0x100; - encode_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; + while( true ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & mask; + const bool bit = ( symbol <<= 1 ) & 0x100; + encode_bit( bm[(symbol>>9)+match_bit+mask], bit ); + if( symbol >= 0x10000 ) break; + mask &= ~(match_bit ^ symbol); // if( match_bit != bit ) mask = 0; } - while( symbol < 0x10000 ); } void encode_len( Len_model & lm, int symbol, const int pos_state ) { - symbol -= min_match_len; - bool bit = ( symbol >= len_low_symbols ); + bool bit = ( ( symbol -= min_match_len ) >= len_low_symbols ); encode_bit( lm.choice1, bit ); if( !bit ) encode_tree3( lm.bm_low[pos_state], symbol ); else { - bit = ( symbol >= len_low_symbols + len_mid_symbols ); + bit = ( ( symbol -= len_low_symbols ) >= len_mid_symbols ); encode_bit( lm.choice2, bit ); if( !bit ) - encode_tree3( lm.bm_mid[pos_state], symbol - len_low_symbols ); + encode_tree3( lm.bm_mid[pos_state], symbol ); else - encode_tree8( lm.bm_high, symbol - len_low_symbols - len_mid_symbols ); + encode_tree8( lm.bm_high, symbol - len_mid_symbols ); } } }; @@ -418,15 +411,17 @@ protected: Bit_model bm_rep2[State::states]; Bit_model bm_len[State::states][pos_states]; Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; - Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; Len_model match_len_model; Len_model rep_len_model; Range_encoder renc; - LZ_encoder_base( const int before, const int dict_size, const int after_size, - const int dict_factor, const int num_prev_positions23, - const int pos_array_factor, const int ifd, const int outfd ) + LZ_encoder_base( const int before, const int dict_size, + const int after_size, const int dict_factor, + const int num_prev_positions23, + const int pos_array_factor, + const int ifd, const int outfd ) : Matchfinder_base( before, dict_size, after_size, dict_factor, num_prev_positions23, pos_array_factor, ifd ), @@ -455,7 +450,7 @@ protected: void encode_pair( const unsigned dis, const int len, const int pos_state ) { renc.encode_len( match_len_model, len, pos_state ); - const int dis_slot = get_slot( dis ); + const unsigned dis_slot = get_slot( dis ); renc.encode_tree6( bm_dis_slot[get_len_state(len)], dis_slot ); if( dis_slot >= start_dis_model ) @@ -465,7 +460,7 @@ protected: const unsigned direct_dis = dis - base; if( dis_slot < end_dis_model ) - renc.encode_tree_reversed( bm_dis + base - dis_slot - 1, direct_dis, + renc.encode_tree_reversed( bm_dis + ( base - dis_slot ), direct_dis, direct_bits ); else { diff --git a/fast_encoder.cc b/fast_encoder.cc index 939259f..57b21ef 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,23 +33,22 @@ int FLZ_encoder::longest_match_len( int * const distance ) { enum { len_limit = 16 }; - if( len_limit > available_bytes() ) return 0; + const int available = std::min( available_bytes(), (int)max_match_len ); + if( available < len_limit ) return 0; const uint8_t * const data = ptr_to_current_pos(); key4 = ( ( key4 << 4 ) ^ data[3] ) & key4_mask; - const int pos1 = pos + 1; - int newpos = prev_positions[key4]; + int newpos1 = prev_positions[key4]; prev_positions[key4] = pos1; - int32_t * ptr0 = pos_array + cyclic_pos; int maxlen = 0; for( int count = 4; ; ) { - if( --count < 0 || newpos <= 0 ) { *ptr0 = 0; break; } - const int delta = pos1 - newpos; - if( delta > dictionary_size ) { *ptr0 = 0; break; } + int delta; + if( newpos1 <= 0 || --count < 0 || + ( delta = pos1 - newpos1 ) > dictionary_size ) { *ptr0 = 0; break; } int32_t * const newptr = pos_array + ( cyclic_pos - delta + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ); @@ -57,22 +56,15 @@ int FLZ_encoder::longest_match_len( int * const distance ) if( data[maxlen-delta] == data[maxlen] ) { int len = 0; - while( len < len_limit && data[len-delta] == data[len] ) ++len; - if( maxlen < len ) { maxlen = len; *distance = delta - 1; } + while( len < available && data[len-delta] == data[len] ) ++len; + if( maxlen < len ) + { maxlen = len; *distance = delta - 1; + if( maxlen >= len_limit ) { *ptr0 = *newptr; break; } } } - if( maxlen < len_limit ) - { - *ptr0 = newpos; - ptr0 = newptr; - newpos = *ptr0; - } - else - { - *ptr0 = *newptr; - maxlen += true_match_len( maxlen, *distance + 1, max_match_len - maxlen ); - break; - } + *ptr0 = newpos1; + ptr0 = newptr; + newpos1 = *ptr0; } return maxlen; } @@ -110,7 +102,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) for( int i = 0; i < num_rep_distances; ++i ) { - const int tlen = true_match_len( 0, reps[i] + 1, max_match_len ); + const int tlen = true_match_len( 0, reps[i] + 1 ); if( tlen > len ) { len = tlen; rep = i; } } if( len > min_match_len && len + 3 > main_len ) @@ -181,11 +173,10 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) // literal byte renc.encode_bit( bm_match[state()][pos_state], 0 ); - if( state.is_char() ) + if( state.is_char_set_char() ) encode_literal( prev_byte, cur_byte ); else encode_matched( prev_byte, cur_byte, match_byte ); - state.set_char(); } full_flush( state ); diff --git a/fast_encoder.h b/fast_encoder.h index 2e0bd50..37ab8ef 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,7 +17,7 @@ class FLZ_encoder : public LZ_encoder_base { - int key4; // key made from latest 4 bytes + unsigned key4; // key made from latest 4 bytes void reset_key4() { @@ -35,9 +35,8 @@ class FLZ_encoder : public LZ_encoder_base if( available_bytes() >= 4 ) { key4 = ( ( key4 << 4 ) ^ buffer[pos+3] ) & key4_mask; - const int newpos = prev_positions[key4]; + pos_array[cyclic_pos] = prev_positions[key4]; prev_positions[key4] = pos + 1; - pos_array[cyclic_pos] = newpos; } move_pos(); } diff --git a/file_index.cc b/file_index.cc new file mode 100644 index 0000000..4a1759c --- /dev/null +++ b/file_index.cc @@ -0,0 +1,192 @@ +/* Lzip - LZMA lossless data compressor + Copyright (C) 2008-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> + +#include "lzip.h" +#include "file_index.h" + + +namespace { + +int seek_read( const int fd, uint8_t * const buf, const int size, + const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return readblock( fd, buf, size ); + return 0; + } + +} // end namespace + + +void File_index::set_errno_error( const char * const msg ) + { + error_ = msg; error_ += std::strerror( errno ); + retval_ = 1; + } + +void File_index::set_num_error( const char * const msg, unsigned long long num ) + { + char buf[80]; + snprintf( buf, sizeof buf, "%s%llu", msg, num ); + error_ = buf; + retval_ = 2; + } + + +// If successful, push last member and set pos to member header. +bool File_index::skip_trailing_data( const int fd, long long & pos ) + { + enum { block_size = 16384, + buffer_size = block_size + File_trailer::size - 1 + File_header::size }; + uint8_t buffer[buffer_size]; + if( pos < min_member_size ) return false; + int bsize = pos % block_size; // total bytes in buffer + if( bsize <= buffer_size - block_size ) bsize += block_size; + int search_size = bsize; // bytes to search for trailer + int rd_size = bsize; // bytes to read from file + unsigned long long ipos = pos - rd_size; // aligned to block_size + + while( true ) + { + if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) + { set_errno_error( "Error seeking member trailer: " ); return false; } + const uint8_t max_msb = ( ipos + search_size ) >> 56; + for( int i = search_size; i >= File_trailer::size; --i ) + if( buffer[i-1] <= max_msb ) // most significant byte of member_size + { + File_trailer & trailer = + *(File_trailer *)( buffer + i - File_trailer::size ); + const unsigned long long member_size = trailer.member_size(); + if( member_size == 0 ) + { while( i > File_trailer::size && buffer[i-9] == 0 ) --i; continue; } + if( member_size < min_member_size || member_size > ipos + i ) + continue; + File_header header; + if( seek_read( fd, header.data, File_header::size, + ipos + i - member_size ) != File_header::size ) + { set_errno_error( "Error reading member header: " ); return false; } + const unsigned dictionary_size = header.dictionary_size(); + if( !header.verify_magic() || !header.verify_version() || + !isvalid_ds( dictionary_size ) ) continue; + if( (*(File_header *)( buffer + i )).verify_prefix( bsize - i ) ) + { + error_ = "Last member in input file is truncated or corrupt."; + retval_ = 2; return false; + } + pos = ipos + i - member_size; + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + return true; + } + if( ipos <= 0 ) + { set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); + return false; } + bsize = buffer_size; + search_size = bsize - File_header::size; + rd_size = block_size; + ipos -= rd_size; + std::memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); + } + } + + +File_index::File_index( const int infd, const bool ignore_trailing ) + : isize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) + { + if( isize < 0 ) + { set_errno_error( "Input file is not seekable: " ); return; } + if( isize < min_member_size ) + { error_ = "Input file is too short."; retval_ = 2; return; } + if( isize > INT64_MAX ) + { error_ = "Input file is too long (2^63 bytes or more)."; + retval_ = 2; return; } + + File_header header; + if( seek_read( infd, header.data, File_header::size, 0 ) != File_header::size ) + { set_errno_error( "Error reading member header: " ); return; } + if( !header.verify_magic() ) + { error_ = bad_magic_msg; retval_ = 2; return; } + if( !header.verify_version() ) + { error_ = bad_version( header.version() ); retval_ = 2; return; } + if( !isvalid_ds( header.dictionary_size() ) ) + { error_ = bad_dict_msg; retval_ = 2; return; } + + long long pos = isize; // always points to a header or to EOF + while( pos >= min_member_size ) + { + File_trailer trailer; + if( seek_read( infd, trailer.data, File_trailer::size, + pos - File_trailer::size ) != File_trailer::size ) + { set_errno_error( "Error reading member trailer: " ); break; } + const unsigned long long member_size = trailer.member_size(); + if( member_size < min_member_size || member_size > (unsigned long long)pos ) + { + if( !member_vector.empty() ) + set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); + else if( skip_trailing_data( infd, pos ) ) + { if( ignore_trailing ) continue; + error_ = trailing_msg; retval_ = 2; return; } + break; + } + if( seek_read( infd, header.data, File_header::size, + pos - member_size ) != File_header::size ) + { set_errno_error( "Error reading member header: " ); break; } + const unsigned dictionary_size = header.dictionary_size(); + if( !header.verify_magic() || !header.verify_version() || + !isvalid_ds( dictionary_size ) ) + { + if( !member_vector.empty() ) + set_num_error( "Bad header at pos ", pos - member_size ); + else if( skip_trailing_data( infd, pos ) ) + { if( ignore_trailing ) continue; + error_ = trailing_msg; retval_ = 2; return; } + break; + } + pos -= member_size; + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + } + if( pos != 0 || member_vector.empty() ) + { + member_vector.clear(); + if( retval_ == 0 ) { error_ = "Can't create file index."; retval_ = 2; } + return; + } + std::reverse( member_vector.begin(), member_vector.end() ); + for( unsigned long i = 0; i < member_vector.size() - 1; ++i ) + { + const long long end = member_vector[i].dblock.end(); + if( end < 0 || end > INT64_MAX ) + { + member_vector.clear(); + error_ = "Data in input file is too long (2^63 bytes or more)."; + retval_ = 2; return; + } + member_vector[i+1].dblock.pos( end ); + } + } diff --git a/file_index.h b/file_index.h new file mode 100644 index 0000000..fbfbef1 --- /dev/null +++ b/file_index.h @@ -0,0 +1,85 @@ +/* Lzip - LZMA lossless data compressor + Copyright (C) 2008-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef INT64_MAX +#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL +#endif + + +class Block + { + long long pos_, size_; // pos + size <= INT64_MAX + +public: + Block( const long long p, const long long s ) : pos_( p ), size_( s ) {} + + long long pos() const { return pos_; } + long long size() const { return size_; } + long long end() const { return pos_ + size_; } + + void pos( const long long p ) { pos_ = p; } + void size( const long long s ) { size_ = s; } + }; + + +class File_index + { + struct Member + { + Block dblock, mblock; // data block, member block + unsigned dictionary_size; + + Member( const long long dp, const long long ds, + const long long mp, const long long ms, const unsigned dict_size ) + : dblock( dp, ds ), mblock( mp, ms ), dictionary_size( dict_size ) {} + }; + + std::vector< Member > member_vector; + std::string error_; + const long long isize; + int retval_; + + void set_errno_error( const char * const msg ); + void set_num_error( const char * const msg, unsigned long long num ); + bool skip_trailing_data( const int fd, long long & pos ); + +public: + File_index( const int infd, const bool ignore_trailing ); + + long members() const { return member_vector.size(); } + const std::string & error() const { return error_; } + int retval() const { return retval_; } + + long long udata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().dblock.end(); } + + long long cdata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().mblock.end(); } + + // total size including trailing data (if any) + long long file_size() const + { if( isize >= 0 ) return isize; else return 0; } + + const Block & dblock( const long i ) const + { return member_vector[i].dblock; } + const Block & mblock( const long i ) const + { return member_vector[i].mblock; } + unsigned dictionary_size( const long i ) const + { return member_vector[i].dictionary_size; } + }; @@ -0,0 +1,121 @@ +/* Lzip - LZMA lossless data compressor + Copyright (C) 2008-2017 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "file_index.h" + + +namespace { + +void list_line( const unsigned long long uncomp_size, + const unsigned long long comp_size, + const char * const input_filename ) + { + if( uncomp_size > 0 ) + std::printf( "%15llu %15llu %6.2f%% %s\n", uncomp_size, comp_size, + 100.0 * ( 1.0 - ( (double)comp_size / uncomp_size ) ), + input_filename ); + else + std::printf( "%15llu %15llu -INF%% %s\n", uncomp_size, comp_size, + input_filename ); + } + +} // end namespace + + +int list_files( const std::vector< std::string > & filenames, + const bool ignore_trailing ) + { + unsigned long long total_comp = 0, total_uncomp = 0; + int files = 0, retval = 0; + bool first_post = true; + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? STDIN_FILENO : + open_instream( input_filename, &in_stats, true, true ); + if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } + + const File_index file_index( infd, ignore_trailing ); + close( infd ); + if( file_index.retval() != 0 ) + { + show_file_error( input_filename, file_index.error().c_str() ); + if( retval < file_index.retval() ) retval = file_index.retval(); + continue; + } + if( verbosity >= 0 ) + { + const unsigned long long udata_size = file_index.udata_size(); + const unsigned long long cdata_size = file_index.cdata_size(); + total_comp += cdata_size; total_uncomp += udata_size; ++files; + if( first_post ) + { + first_post = false; + if( verbosity >= 1 ) std::fputs( " dict memb trail ", stdout ); + std::fputs( " uncompressed compressed saved name\n", stdout ); + } + if( verbosity >= 1 ) + { + unsigned dictionary_size = 0; + for( long i = 0; i < file_index.members(); ++i ) + dictionary_size = + std::max( dictionary_size, file_index.dictionary_size( i ) ); + const long long trailing_size = file_index.file_size() - cdata_size; + std::printf( "%s %5ld %6lld ", format_ds( dictionary_size ), + file_index.members(), trailing_size ); + } + list_line( udata_size, cdata_size, input_filename ); + + if( verbosity >= 2 && file_index.members() > 1 ) + { + std::fputs( " member data_pos data_size member_pos member_size\n", stdout ); + for( long i = 0; i < file_index.members(); ++i ) + { + const Block & db = file_index.dblock( i ); + const Block & mb = file_index.mblock( i ); + std::printf( "%5ld %15llu %15llu %15llu %15llu\n", + i + 1, db.pos(), db.size(), mb.pos(), mb.size() ); + } + first_post = true; // reprint heading after list of members + } + std::fflush( stdout ); + } + } + if( verbosity >= 0 && files > 1 ) + { + if( verbosity >= 1 ) std::fputs( " ", stdout ); + list_line( total_uncomp, total_comp, "(totals)" ); + std::fflush( stdout ); + } + return retval; + } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,8 +30,12 @@ public: static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; st = next[st]; } - void set_char1() { st -= ( st < 4 ) ? st : 3; } // for st < 7 - void set_char2() { st -= ( st < 10 ) ? 3 : 6; } // for st >= 7 + bool is_char_set_char() + { + if( st < 7 ) { st -= ( st < 4 ) ? st : 3; return true; } + else { st -= ( st < 10 ) ? 3 : 6; return false; } + } + void set_char_rep() { st = 8; } void set_match() { st = ( st < 7 ) ? 7 : 10; } void set_rep() { st = ( st < 7 ) ? 8 : 11; } void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } @@ -43,6 +47,7 @@ enum { min_dictionary_size = 1 << min_dictionary_bits, // >= modeled_distances max_dictionary_bits = 29, max_dictionary_size = 1 << max_dictionary_bits, + min_member_size = 36, literal_context_bits = 3, literal_pos_state_bits = 0, // not used pos_state_bits = 2, @@ -168,8 +173,10 @@ public: void update_buf( uint32_t & crc, const uint8_t * const buffer, const int size ) const { + uint32_t c = crc; for( int i = 0; i < size; ++i ) - crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + c = data[(c^buffer[i])&0xFF] ^ ( c >> 8 ); + crc = c; } }; @@ -227,7 +234,7 @@ struct File_header { const unsigned base_size = 1 << data[5]; const unsigned fraction = base_size / 16; - for( int i = 7; i >= 1; --i ) + for( unsigned i = 7; i >= 1; --i ) if( base_size - ( i * fraction ) >= sz ) { data[5] |= ( i << 5 ); break; } } @@ -283,14 +290,29 @@ struct Error }; +const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; +const char * const bad_dict_msg = "Invalid dictionary size in member header."; +const char * const trailing_msg = "Trailing data not allowed."; + // defined in decoder.cc int readblock( const int fd, uint8_t * const buf, const int size ); int writeblock( const int fd, const uint8_t * const buf, const int size ); +// defined in list.cc +int list_files( const std::vector< std::string > & filenames, + const bool ignore_trailing ); + // defined in main.cc extern int verbosity; +struct stat; +const char * bad_version( const unsigned version ); +const char * format_ds( const unsigned dictionary_size ); +int open_instream( const char * const name, struct stat * const in_statsp, + const bool no_ofile, const bool reg_only = false ); void show_error( const char * const msg, const int errcode = 0, const bool help = false ); +void show_file_error( const char * const filename, const char * const msg, + const int errcode = 0 ); void internal_error( const char * const msg ); class Matchfinder_base; void show_progress( const unsigned long long partial_size = 0, @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2016 Antonio Diaz Diaz. + Copyright (C) 2008-2017 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -75,10 +75,10 @@ namespace { const char * const Program_name = "Lzip"; const char * const program_name = "lzip"; -const char * const program_year = "2016"; +const char * const program_year = "2017"; const char * invocation_name = 0; -struct { const char * from; const char * to; } const known_extensions[] = { +const struct { const char * from; const char * to; } known_extensions[] = { { ".lz", "" }, { ".tlz", ".tar" }, { 0, 0 } }; @@ -89,7 +89,7 @@ struct Lzma_options int match_len_limit; // 5 .. 273 }; -enum Mode { m_compress, m_decompress, m_test }; +enum Mode { m_compress, m_decompress, m_list, m_test }; std::string output_filename; int outfd = -1; @@ -110,6 +110,7 @@ void show_help() " -f, --force overwrite existing output files\n" " -F, --recompress force re-compression of compressed files\n" " -k, --keep keep (don't delete) input files\n" + " -l, --list print (un)compressed file sizes\n" " -m, --match-length=<bytes> set match length limit in bytes [36]\n" " -o, --output=<file> if reading standard input, write to <file>\n" " -q, --quiet suppress all messages\n" @@ -148,24 +149,41 @@ void show_version() "There is NO WARRANTY, to the extent permitted by law.\n" ); } +} // end namespace + +const char * bad_version( const unsigned version ) + { + static char buf[80]; + snprintf( buf, sizeof buf, "Version %u member format not supported.", + version ); + return buf; + } + + +const char * format_ds( const unsigned dictionary_size ) + { + enum { bufsize = 16, factor = 1024 }; + static char buf[bufsize]; + const char * const prefix[8] = + { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; + const char * p = ""; + const char * np = " "; + unsigned num = dictionary_size; + bool exact = ( num % factor == 0 ); + + for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) + { num /= factor; if( num % factor != 0 ) exact = false; + p = prefix[i]; np = ""; } + snprintf( buf, bufsize, "%s%4u %sB", np, num, p ); + return buf; + } + +namespace { void show_header( const unsigned dictionary_size ) { if( verbosity >= 3 ) - { - const char * const prefix[8] = - { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; - enum { factor = 1024 }; - const char * p = ""; - const char * np = " "; - unsigned num = dictionary_size; - bool exact = ( num % factor == 0 ); - - for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) - { num /= factor; if( num % factor != 0 ) exact = false; - p = prefix[i]; np = ""; } - std::fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p ); - } + std::fprintf( stderr, "dictionary %s. ", format_ds( dictionary_size ) ); } @@ -184,7 +202,7 @@ unsigned long long getnum( const char * const ptr, if( !errno && tail[0] ) { - const int factor = ( tail[1] == 'i' ) ? 1024 : 1000; + const unsigned factor = ( tail[1] == 'i' ) ? 1024 : 1000; int exponent = 0; // 0 = bad multiplier switch( tail[0] ) { @@ -222,7 +240,7 @@ unsigned long long getnum( const char * const ptr, int get_dict_size( const char * const arg ) { char * tail; - const int bits = std::strtol( arg, &tail, 0 ); + const long bits = std::strtol( arg, &tail, 0 ); if( bits >= min_dictionary_bits && bits <= max_dictionary_bits && *tail == 0 ) return ( 1 << bits ); @@ -230,62 +248,75 @@ int get_dict_size( const char * const arg ) } +void set_mode( Mode & program_mode, const Mode new_mode ) + { + if( program_mode != m_compress && program_mode != new_mode ) + { + show_error( "Only one operation can be specified.", 0, true ); + std::exit( 1 ); + } + program_mode = new_mode; + } + + int extension_index( const std::string & name ) { - for( int i = 0; known_extensions[i].from; ++i ) + for( int eindex = 0; known_extensions[eindex].from; ++eindex ) { - const std::string ext( known_extensions[i].from ); + const std::string ext( known_extensions[eindex].from ); if( name.size() > ext.size() && name.compare( name.size() - ext.size(), ext.size(), ext ) == 0 ) - return i; + return eindex; } return -1; } +} // end namespace int open_instream( const char * const name, struct stat * const in_statsp, - const Mode program_mode, const int eindex, - const bool recompress, const bool to_stdout ) + const bool no_ofile, const bool reg_only ) { - int infd = -1; - if( program_mode == m_compress && !recompress && eindex >= 0 ) - { - if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n", - program_name, name, known_extensions[eindex].from ); - } + int infd = open( name, O_RDONLY | O_BINARY ); + if( infd < 0 ) + show_file_error( name, "Can't open input file", errno ); else { - infd = open( name, O_RDONLY | O_BINARY ); - if( infd < 0 ) + const int i = fstat( infd, in_statsp ); + const mode_t mode = in_statsp->st_mode; + const bool can_read = ( i == 0 && !reg_only && + ( S_ISBLK( mode ) || S_ISCHR( mode ) || + S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); + if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) ) { if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Can't open input file '%s': %s\n", - program_name, name, std::strerror( errno ) ); - } - else - { - const int i = fstat( infd, in_statsp ); - const mode_t mode = in_statsp->st_mode; - const bool can_read = ( i == 0 && - ( S_ISBLK( mode ) || S_ISCHR( mode ) || - S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); - const bool no_ofile = ( to_stdout || program_mode == m_test ); - if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) ) - { - if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", - program_name, name, - ( can_read && !no_ofile ) ? - ",\n and '--stdout' was not specified" : "" ); - close( infd ); - infd = -1; - } + std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", + program_name, name, + ( can_read && !no_ofile ) ? + ",\n and '--stdout' was not specified" : "" ); + close( infd ); + infd = -1; } } return infd; } +namespace { + +int open_instream2( const char * const name, struct stat * const in_statsp, + const Mode program_mode, const int eindex, + const bool recompress, const bool to_stdout ) + { + if( program_mode == m_compress && !recompress && eindex >= 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n", + program_name, name, known_extensions[eindex].from ); + return -1; + } + const bool no_ofile = ( to_stdout || program_mode == m_test ); + return open_instream( name, in_statsp, no_ofile, false ); + } + void set_c_outname( const std::string & name, const bool multifile ) { @@ -295,15 +326,15 @@ void set_c_outname( const std::string & name, const bool multifile ) } -void set_d_outname( const std::string & name, const int i ) +void set_d_outname( const std::string & name, const int eindex ) { - if( i >= 0 ) + if( eindex >= 0 ) { - const std::string from( known_extensions[i].from ); + const std::string from( known_extensions[eindex].from ); if( name.size() > from.size() ) { output_filename.assign( name, 0, name.size() - from.size() ); - output_filename += known_extensions[i].to; + output_filename += known_extensions[eindex].to; return; } } @@ -337,7 +368,8 @@ bool open_outstream( const bool force, const bool from_stdin ) } -bool check_tty( const int infd, const Mode program_mode ) +bool check_tty( const char * const input_filename, const int infd, + const Mode program_mode ) { if( program_mode == m_compress && isatty( outfd ) ) { @@ -347,7 +379,8 @@ bool check_tty( const int infd, const Mode program_mode ) if( ( program_mode == m_decompress || program_mode == m_test ) && isatty( infd ) ) { - show_error( "I won't read compressed data from a terminal.", 0, true ); + show_file_error( input_filename, + "I won't read compressed data from a terminal." ); return false; } return true; @@ -406,9 +439,10 @@ void close_and_set_permissions( const struct stat * const in_statsp ) bool next_filename() { + const unsigned name_len = output_filename.size(); const unsigned ext_len = std::strlen( known_extensions[0].from ); - if( output_filename.size() >= ext_len + 5 ) // "*00001.lz" - for( int i = output_filename.size() - ext_len - 1, j = 0; j < 5; --i, ++j ) + if( name_len >= ext_len + 5 ) // "*00001.lz" + for( int i = name_len - ext_len - 1, j = 0; j < 5; --i, ++j ) { if( output_filename[i] < '9' ) { ++output_filename[i]; return true; } else output_filename[i] = '0'; @@ -495,10 +529,10 @@ int compress( const unsigned long long member_size, } -unsigned char xdigit( const int value ) +unsigned char xdigit( const unsigned value ) { - if( value >= 0 && value <= 9 ) return '0' + value; - if( value >= 10 && value <= 15 ) return 'A' + value - 10; + if( value <= 9 ) return '0' + value; + if( value <= 15 ) return 'A' + value - 10; return 0; } @@ -512,26 +546,18 @@ bool show_trailing_data( const uint8_t * const data, const int size, std::string msg; if( !all ) msg = "first bytes of "; msg += "trailing data = "; - bool text = true; for( int i = 0; i < size; ++i ) - if( !std::isprint( data[i] ) ) { text = false; break; } - if( text ) { - msg += '\''; - msg.append( (const char *)data, size ); - msg += '\''; - } - else - { - for( int i = 0; i < size; ++i ) - { - if( i > 0 ) msg += ' '; - msg += xdigit( data[i] >> 4 ); - msg += xdigit( data[i] & 0x0F ); - } + msg += xdigit( data[i] >> 4 ); + msg += xdigit( data[i] & 0x0F ); + msg += ' '; } + msg += '\''; + for( int i = 0; i < size; ++i ) + { if( std::isprint( data[i] ) ) msg += data[i]; else msg += '.'; } + msg += '\''; pp( msg.c_str() ); - if( !ignore_trailing ) show_error( "Trailing data not allowed." ); + if( !ignore_trailing ) show_file_error( pp.name(), trailing_msg ); } return ignore_trailing; } @@ -562,22 +588,16 @@ int decompress( const int infd, const Pretty_print & pp, if( !header.verify_magic() ) { if( first_member ) - { pp( "Bad magic number (file not in lzip format)." ); retval = 2; } + { show_file_error( pp.name(), bad_magic_msg ); retval = 2; } else if( !show_trailing_data( header.data, size, pp, false, ignore_trailing ) ) retval = 2; break; } if( !header.verify_version() ) - { - if( verbosity >= 0 ) - { pp(); - std::fprintf( stderr, "Version %d member format not supported.\n", - header.version() ); } - retval = 2; break; - } + { pp( bad_version( header.version() ) ); retval = 2; break; } const unsigned dictionary_size = header.dictionary_size(); if( !isvalid_ds( dictionary_size ) ) - { pp( "Invalid dictionary size in member header." ); retval = 2; break; } + { pp( bad_dict_msg ); retval = 2; break; } if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) { pp(); show_header( dictionary_size ); } @@ -640,6 +660,16 @@ void show_error( const char * const msg, const int errcode, const bool help ) } +void show_file_error( const char * const filename, const char * const msg, + const int errcode ) + { + if( verbosity < 0 ) return; + std::fprintf( stderr, "%s: %s: %s", program_name, filename, msg ); + if( errcode > 0 ) std::fprintf( stderr, ": %s", std::strerror( errcode ) ); + std::fputc( '\n', stderr ); + } + + void internal_error( const char * const msg ) { if( verbosity >= 0 ) @@ -693,7 +723,6 @@ int main( const int argc, const char * const argv[] ) const unsigned long long max_volume_size = 0x4000000000000000ULL; unsigned long long member_size = max_member_size; unsigned long long volume_size = 0; - std::string input_filename; std::string default_output_filename; std::vector< std::string > filenames; int infd = -1; @@ -726,6 +755,7 @@ int main( const int argc, const char * const argv[] ) { 'F', "recompress", Arg_parser::no }, { 'h', "help", Arg_parser::no }, { 'k', "keep", Arg_parser::no }, + { 'l', "list", Arg_parser::no }, { 'm', "match-length", Arg_parser::yes }, { 'n', "threads", Arg_parser::yes }, { 'o', "output", Arg_parser::yes }, @@ -746,8 +776,8 @@ int main( const int argc, const char * const argv[] ) { const int code = parser.code( argind ); if( !code ) break; // no more options - const std::string & arg = parser.argument( argind ); - const char * const ptr = arg.c_str(); + const std::string & sarg = parser.argument( argind ); + const char * const arg = sarg.c_str(); switch( code ) { case '0': case '1': case '2': case '3': case '4': @@ -755,23 +785,24 @@ int main( const int argc, const char * const argv[] ) zero = ( code == '0' ); encoder_options = option_mapping[code-'0']; break; case 'a': ignore_trailing = false; break; - case 'b': member_size = getnum( ptr, 100000, max_member_size ); break; + case 'b': member_size = getnum( arg, 100000, max_member_size ); break; case 'c': to_stdout = true; break; - case 'd': program_mode = m_decompress; break; + case 'd': set_mode( program_mode, m_decompress ); break; case 'f': force = true; break; case 'F': recompress = true; break; case 'h': show_help(); return 0; case 'k': keep_input_files = true; break; + case 'l': set_mode( program_mode, m_list ); break; case 'm': encoder_options.match_len_limit = - getnum( ptr, min_match_len_limit, max_match_len ); + getnum( arg, min_match_len_limit, max_match_len ); zero = false; break; case 'n': break; - case 'o': default_output_filename = arg; break; + case 'o': default_output_filename = sarg; break; case 'q': verbosity = -1; break; - case 's': encoder_options.dictionary_size = get_dict_size( ptr ); + case 's': encoder_options.dictionary_size = get_dict_size( arg ); zero = false; break; - case 'S': volume_size = getnum( ptr, 100000, max_volume_size ); break; - case 't': program_mode = m_test; break; + case 'S': volume_size = getnum( arg, 100000, max_volume_size ); break; + case 't': set_mode( program_mode, m_test ); break; case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; default : internal_error( "uncaught option." ); @@ -783,6 +814,17 @@ int main( const int argc, const char * const argv[] ) setmode( STDOUT_FILENO, O_BINARY ); #endif + bool filenames_given = false; + for( ; argind < parser.arguments(); ++argind ) + { + filenames.push_back( parser.argument( argind ) ); + if( filenames.back() != "-" ) filenames_given = true; + } + if( filenames.empty() ) filenames.push_back("-"); + + if( program_mode == m_list ) + return list_files( filenames, ignore_trailing ); + if( program_mode == m_test ) outfd = -1; else if( program_mode == m_compress ) @@ -791,14 +833,6 @@ int main( const int argc, const char * const argv[] ) prob_prices.init(); } - bool filenames_given = false; - for( ; argind < parser.arguments(); ++argind ) - { - filenames.push_back( parser.argument( argind ) ); - if( filenames.back() != "-" ) filenames_given = true; - } - - if( filenames.empty() ) filenames.push_back("-"); if( !to_stdout && program_mode != m_test && ( filenames_given || default_output_filename.size() ) ) set_signals(); @@ -809,13 +843,13 @@ int main( const int argc, const char * const argv[] ) bool stdin_used = false; for( unsigned i = 0; i < filenames.size(); ++i ) { + std::string input_filename; struct stat in_stats; output_filename.clear(); if( filenames[i].empty() || filenames[i] == "-" ) { if( stdin_used ) continue; else stdin_used = true; - input_filename.clear(); infd = STDIN_FILENO; if( program_mode != m_test ) { @@ -837,10 +871,9 @@ int main( const int argc, const char * const argv[] ) } else { - input_filename = filenames[i]; - const int eindex = extension_index( input_filename ); - infd = open_instream( input_filename.c_str(), &in_stats, program_mode, - eindex, recompress, to_stdout ); + const int eindex = extension_index( input_filename = filenames[i] ); + infd = open_instream2( input_filename.c_str(), &in_stats, program_mode, + eindex, recompress, to_stdout ); if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } if( program_mode != m_test ) { @@ -860,14 +893,15 @@ int main( const int argc, const char * const argv[] ) } } - if( !check_tty( infd, program_mode ) ) + pp.set_name( input_filename ); + if( !check_tty( pp.name(), infd, program_mode ) ) { if( retval < 1 ) retval = 1; + if( program_mode == m_test ) { close( infd ); infd = -1; continue; } cleanup_and_fail( retval ); } const struct stat * const in_statsp = input_filename.size() ? &in_stats : 0; - pp.set_name( input_filename ); int tmp; if( program_mode == m_compress ) tmp = compress( member_size, volume_size, infd, encoder_options, pp, diff --git a/testsuite/check.sh b/testsuite/check.sh index d94e8f3..79e22eb 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh # check script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2016 Antonio Diaz Diaz. +# Copyright (C) 2008-2017 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -17,12 +17,12 @@ if [ ! -f "${LZIP}" ] || [ ! -x "${LZIP}" ] ; then exit 1 fi -if [ -e "${LZIP}" ] 2> /dev/null ; then true -else +[ -e "${LZIP}" ] 2> /dev/null || + { echo "$0: a POSIX shell is required to run the tests" echo "Try bash -c \"$0 $1 $2\"" exit 1 -fi + } if [ -d tmp ] ; then rm -rf tmp ; fi mkdir tmp @@ -31,143 +31,229 @@ cd "${objdir}"/tmp || framework_failure cat "${testdir}"/test.txt > in || framework_failure in_lz="${testdir}"/test.txt.lz fail=0 +test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } printf "testing lzip-%s..." "$2" "${LZIP}" -fkqm4 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO "${LZIP}" -fkqm274 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs-1 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs0 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs4095 in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -fkqs513MiB in -if [ $? = 1 ] && [ ! -e in.lz ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +for i in bad_size -1 0 4095 513MiB 1G 1T 1P 1E 1Z 1Y 10KB ; do + "${LZIP}" -fkqs $i in + { [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO $i +done +"${LZIP}" -lq in +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -tq in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -tq < in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -cdq in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -cdq < in -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -dd if="${in_lz}" bs=1 count=6 2> /dev/null | "${LZIP}" -tq -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -dd if="${in_lz}" bs=1 count=20 2> /dev/null | "${LZIP}" -tq -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO +# these are for code coverage +"${LZIP}" -lt "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdl "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdt "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -t -- nx_file 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --help > /dev/null || test_failed $LINENO +"${LZIP}" -n1 -V > /dev/null || test_failed $LINENO +"${LZIP}" -m 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -z 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --bad_option 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --t 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --test=2 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output= 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +printf "LZIP\001-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\002-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null printf "\ntesting decompression..." -"${LZIP}" -t "${in_lz}" -if [ $? = 0 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cd "${in_lz}" > copy || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -lq "${in_lz}" || test_failed $LINENO +"${LZIP}" -t "${in_lz}" || test_failed $LINENO +"${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO rm -f copy cat "${in_lz}" > copy.lz || framework_failure -"${LZIP}" -dk copy.lz || fail=1 -cmp in copy || fail=1 +"${LZIP}" -dk copy.lz || test_failed $LINENO +cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure -"${LZIP}" -dq copy.lz -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi +"${LZIP}" -d copy.lz 2> /dev/null +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -df copy.lz -if [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; then - printf . ; else printf - ; fail=1 ; fi +{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO printf "to be overwritten" > copy || framework_failure -"${LZIP}" -df -o copy < "${in_lz}" || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO +cmp in copy || test_failed $LINENO rm -f copy -"${LZIP}" < in > anyothername || fail=1 -"${LZIP}" -d -o copy - anyothername - < "${in_lz}" -if [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; then - printf . ; else printf - ; fail=1 ; fi +"${LZIP}" < in > anyothername || test_failed $LINENO +"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null +{ [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; } || + test_failed $LINENO rm -f copy anyothername.out +"${LZIP}" -lq in "${in_lz}" +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -lq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -tq in "${in_lz}" -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -tq foo.lz "${in_lz}" -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO "${LZIP}" -cdq in "${in_lz}" > copy -if [ $? = 2 ] && cat copy in | cmp in - ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cdq foo.lz "${in_lz}" > copy -if [ $? = 1 ] && cmp in copy ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && cat copy in | cmp in - ; } || test_failed $LINENO +"${LZIP}" -cdq nx_file.lz "${in_lz}" > copy +{ [ $? = 1 ] && cmp in copy ; } || test_failed $LINENO rm -f copy cat "${in_lz}" > copy.lz || framework_failure +for i in 1 2 3 4 5 6 7 ; do + printf "g" >> copy.lz || framework_failure + "${LZIP}" -alvv copy.lz "${in_lz}" > /dev/null 2>&1 + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -atvvvv copy.lz "${in_lz}" 2> /dev/null + [ $? = 2 ] || test_failed $LINENO $i +done "${LZIP}" -dq in copy.lz -if [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; then - printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -dq foo.lz copy.lz -if [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e foo ] && cmp in copy ; then - printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; } || + test_failed $LINENO +"${LZIP}" -dq nx_file.lz copy.lz +{ [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e nx_file ] && cmp in copy ; } || + test_failed $LINENO cat in in > in2 || framework_failure -"${LZIP}" -o copy2 < in2 || fail=1 -"${LZIP}" -t copy2.lz || fail=1 -"${LZIP}" -cd copy2.lz > copy2 || fail=1 -cmp in2 copy2 || fail=1 -printf . +cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure +"${LZIP}" -lq in2.lz || test_failed $LINENO +"${LZIP}" -t in2.lz || test_failed $LINENO +"${LZIP}" -cd in2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO + +"${LZIP}" --output=copy2 < in2 || test_failed $LINENO +"${LZIP}" -lq copy2.lz || test_failed $LINENO +"${LZIP}" -t copy2.lz || test_failed $LINENO +"${LZIP}" -cd copy2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO -printf "garbage" >> copy2.lz || framework_failure +printf "\ngarbage" >> copy2.lz || framework_failure +"${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO rm -f copy2 +"${LZIP}" -alq copy2.lz +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq copy2.lz -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq < copy2.lz -if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi +[ $? = 2 ] || test_failed $LINENO "${LZIP}" -adkq copy2.lz -if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO "${LZIP}" -adkq -o copy2 < copy2.lz -if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi +{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO printf "to be overwritten" > copy2 || framework_failure -"${LZIP}" -df copy2.lz || fail=1 -cmp in2 copy2 || fail=1 -printf . +"${LZIP}" -df copy2.lz || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO printf "\ntesting compression..." -"${LZIP}" -cfq "${in_lz}" > out # /dev/null is a tty on OS/2 -if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi -"${LZIP}" -cF "${in_lz}" > out || fail=1 -"${LZIP}" -cd out | "${LZIP}" -d > copy || fail=1 -cmp in copy || fail=1 -printf . +"${LZIP}" -cf "${in_lz}" > out 2> /dev/null # /dev/null is a tty on OS/2 +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cFvvm36 "${in_lz}" > out 2> /dev/null || test_failed $LINENO +"${LZIP}" -cd out | "${LZIP}" -d > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -k -$i in || fail=1 - mv -f in.lz copy.lz || fail=1 - printf "garbage" >> copy.lz || fail=1 - "${LZIP}" -df copy.lz || fail=1 - cmp in copy || fail=1 + "${LZIP}" -k -$i in || test_failed $LINENO $i + mv -f in.lz copy.lz || test_failed $LINENO $i + printf "garbage" >> copy.lz || framework_failure + "${LZIP}" -df copy.lz || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -c -$i in > out || fail=1 - printf "g" >> out || fail=1 - "${LZIP}" -cd out > copy || fail=1 - cmp in copy || fail=1 + "${LZIP}" -c -$i in > out || test_failed $LINENO $i + printf "g" >> out || framework_failure + "${LZIP}" -cd out > copy || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -$i < in > out || fail=1 - "${LZIP}" -d < out > copy || fail=1 - cmp in copy || fail=1 + "${LZIP}" -$i < in > out || test_failed $LINENO $i + "${LZIP}" -d < out > copy || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do - "${LZIP}" -f -$i -o out < in || fail=1 - "${LZIP}" -df -o copy < out.lz || fail=1 - cmp in copy || fail=1 + "${LZIP}" -f -$i -o out < in || test_failed $LINENO $i + "${LZIP}" -df -o copy < out.lz || test_failed $LINENO $i + cmp in copy || test_failed $LINENO $i done -printf . + +cat in in in in in in in in > in8 || framework_failure +"${LZIP}" -1s12 -S100k -o out < in8 || test_failed $LINENO +"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO +"${LZIP}" -cd out00001.lz out00002.lz | cmp in8 - || test_failed $LINENO +rm -f out00001.lz +"${LZIP}" -1ks4Ki -b100000 in8 || test_failed $LINENO +"${LZIP}" -t in8.lz || test_failed $LINENO +"${LZIP}" -cd in8.lz | cmp in8 - || test_failed $LINENO +rm -f in8 +"${LZIP}" -0 -S100k -o out < in8.lz || test_failed $LINENO +"${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO +"${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO +rm -f out00001.lz out00002.lz +"${LZIP}" -0kF -b100k in8.lz || test_failed $LINENO +"${LZIP}" -t in8.lz.lz || test_failed $LINENO +"${LZIP}" -cd in8.lz.lz | cmp in8.lz - || test_failed $LINENO +rm -f in8.lz in8.lz.lz + +printf "\ntesting bad input..." + +cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure +if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && + [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then + for i in 6 20 14734 14753 14754 14755 14756 14757 14758 ; do + dd if=in3.lz of=trunc.lz bs=$i count=1 2> /dev/null + "${LZIP}" -lq trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -t trunc.lz 2> /dev/null + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -tq < trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -cdq trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -dq < trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + done +else + printf "\nwarning: skipping truncation test: 'dd' does not work on your system." +fi + +cat "${in_lz}" > ingin.lz || framework_failure +printf "g" >> ingin.lz || framework_failure +cat "${in_lz}" >> ingin.lz || framework_failure +"${LZIP}" -lq ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -t ingin.lz || test_failed $LINENO +"${LZIP}" -cd ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO +"${LZIP}" -t < ingin.lz || test_failed $LINENO +"${LZIP}" -d < ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO echo if [ ${fail} = 0 ] ; then diff --git a/testsuite/test.txt.lz b/testsuite/test.txt.lz Binary files differindex 41d2e39..22cea6e 100644 --- a/testsuite/test.txt.lz +++ b/testsuite/test.txt.lz |