From 1bd12f8e72751a6431189a7088f01477a987a212 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 6 Jan 2019 11:08:51 +0100 Subject: Merging upstream version 1.8. Signed-off-by: Daniel Baumann --- ChangeLog | 19 +++++- INSTALL | 26 +++++-- Makefile.in | 13 ++-- NEWS | 47 ++++++------- README | 66 ++++++++++-------- arg_parser.cc | 2 +- arg_parser.h | 2 +- compress.cc | 28 ++++---- configure | 25 ++++--- dec_stdout.cc | 31 ++++----- dec_stream.cc | 26 +++---- decompress.cc | 99 +++++++++++++++++++-------- doc/plzip.1 | 30 ++++++-- doc/plzip.info | 166 ++++++++++++++++++++++++++------------------ doc/plzip.texi | 134 ++++++++++++++++++++++-------------- file_index.cc | 196 ---------------------------------------------------- file_index.h | 87 ----------------------- list.cc | 34 ++++----- lzip.h | 47 ++++++++----- lzip_index.cc | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++ lzip_index.h | 87 +++++++++++++++++++++++ main.cc | 137 ++++++++++++++++++++++++++----------- testsuite/check.sh | 170 +++++++++++++++++++++++++++------------------ 23 files changed, 969 insertions(+), 700 deletions(-) delete mode 100644 file_index.cc delete mode 100644 file_index.h create mode 100644 lzip_index.cc create mode 100644 lzip_index.h diff --git a/ChangeLog b/ChangeLog index 20d8605..cbbf413 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2019-01-05 Antonio Diaz Diaz + + * Version 1.8 released. + * File_* renamed to Lzip_*. + * main.cc: Added new options '--in-slots' and '--out-slots'. + * main.cc: Increased default in_slots per worker from 2 to 4. + * main.cc: Increased default out_slots per worker from 32 to 64. + * lzip.h (Lzip_trailer): New function 'verify_consistency'. + * lzip_index.cc: Detect some kinds of corrupt trailers. + * main.cc (main): Check return value of close( infd ). + * plzip.texi: Improved description of '-0..-9', '-m' and '-s'. + * configure: Added new option '--with-mingw'. 
+ * configure: Accept appending to CXXFLAGS, 'CXXFLAGS+=OPTIONS'. + * INSTALL: Document use of CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO'. + 2018-02-07 Antonio Diaz Diaz * Version 1.7 released. @@ -25,7 +40,7 @@ * The option '-l, --list' has been ported from lziprecover. * Don't allow mixing different operations (-d, -l or -t). * main.cc: Continue testing if any input file is a terminal. - * file_index.cc: Improve detection of bad dict and trailing data. + * lzip_index.cc: Improve detection of bad dict and trailing data. * lzip.h: Unified messages for bad magic, trailing data, etc. 2016-05-14 Antonio Diaz Diaz @@ -169,7 +184,7 @@ until something better appears on the net. -Copyright (C) 2009-2018 Antonio Diaz Diaz. +Copyright (C) 2009-2019 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, but just in case, you have unlimited permission to copy, distribute and diff --git a/INSTALL b/INSTALL index 6f359ca..fb86f55 100644 --- a/INSTALL +++ b/INSTALL @@ -1,15 +1,18 @@ Requirements ------------ You will need a C++ compiler and the lzlib compression library installed. -I use gcc 5.3.0 and 4.1.2, but the code should compile with any -standards compliant compiler. -Lzlib must be version 1.0 or newer, but the fast encoder is only -available in lzlib 1.7 or newer, and the HD = 3 detection of corrupt -headers on non-seekable multimember files is only available in lzlib -1.10 or newer. +I use gcc 5.3.0 and 4.1.2, but the code should compile with any standards +compliant compiler. +Lzlib must be version 1.0 or newer, but the fast encoder is only available +in lzlib 1.7 or newer, and the HD = 3 detection of corrupt headers on +non-seekable multimember files is only available in lzlib 1.10 or newer. Gcc is available at http://gcc.gnu.org. Lzlib is available at http://www.nongnu.org/lzip/lzlib.html. 
+The operating system must allow signal handlers read access to objects with +static storage duration so that the cleanup handler for Control-C can delete +the partial output file. + Procedure --------- @@ -28,6 +31,15 @@ the main archive. cd plzip[version] ./configure + To link against a lzlib not installed in a standard place, use: + + ./configure CPPFLAGS='-I<dir_of_lzlib.h>' LDFLAGS='-L<dir_of_liblz.a>' + + If you are compiling on MinGW, use --with-mingw (note that the Windows + I/O functions used with MinGW are not guaranteed to be thread safe): + + ./configure --with-mingw CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' + 3. Run make. make @@ -67,7 +79,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2009-2018 Antonio Diaz Diaz. +Copyright (C) 2009-2019 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index a9ead0a..6d40fd1 100644 --- a/Makefile.in +++ b/Makefile.in @@ -8,7 +8,7 @@ LIBS = -llz -lpthread SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 -objs = arg_parser.o file_index.o list.o compress.o dec_stdout.o \ +objs = arg_parser.o lzip_index.o list.o compress.o dec_stdout.o \ dec_stream.o decompress.o main.o @@ -24,6 +24,9 @@ all : $(progname) $(progname) : $(objs) $(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ $(objs) $(LIBS) +decompress.o : decompress.cc + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(with_mingw) -c -o $@ $< + main.o : main.cc $(CXX) $(CPPFLAGS) $(CXXFLAGS) -DPROGVERSION=\"$(pkgversion)\" -c -o $@ $< @@ -33,11 +36,11 @@ main.o : main.cc $(objs) : Makefile arg_parser.o : arg_parser.h compress.o : lzip.h -dec_stdout.o : lzip.h file_index.h +dec_stdout.o : lzip.h lzip_index.h dec_stream.o : lzip.h -decompress.o : lzip.h file_index.h -file_index.o : lzip.h file_index.h -list.o : lzip.h file_index.h +decompress.o : lzip.h lzip_index.h +list.o : lzip.h lzip_index.h +lzip_index.o : lzip.h lzip_index.h 
main.o : arg_parser.h lzip.h diff --git a/NEWS b/NEWS index 1916e07..bfe26e6 100644 --- a/NEWS +++ b/NEWS @@ -1,34 +1,31 @@ -Changes in version 1.7: +Changes in version 1.8: -When compressing on a 32 bit system, plzip now tries to limit the memory -use to under 2.22 GiB (4 worker threads at level -9) by reducing the -number of threads below the system's default. +The new options '--in-slots' and '--out-slots', setting the number of input +and output packets buffered during streamed decompression, have been added. +Increasing the number of packets may increase decompression speed, but +requires more memory. -The option '--loose-trailing', has been added. +The default number of input packets buffered per worker thread when +decompressing from non-seekable input has been increased from 2 to 4. -The test used by plzip to discriminate trailing data from a corrupt -header in multimember regular (seekable) files has been improved to a -Hamming distance (HD) of 3, and the 3 bit flips must happen in different -magic bytes for the test to fail. As a consequence some kinds of files -no longer can be appended to a lzip file as trailing data unless the -'--loose-trailing' option is used when decompressing. -Lzlib 1.10 or newer is required for this test to work on non-seekable -files. -Lziprecover can be used to remove conflicting trailing data from a file. +The default number of output packets buffered per worker thread when +decompressing to non-seekable output has been increased from 32 to 64. -The 'bits/byte' ratio has been replaced with the inverse compression -ratio in the output. +Detection of forbidden combinations of characters in trailing data has been +improved. -The progress of decompression is now shown at verbosity level 2 (-vv) or -higher. +Errors are now also checked when closing the input file. -Progress of (de)compression is only shown if stderr is a terminal. +The descriptions of '-0..-9', '-m' and '-s' in the manual have been +improved. 
-A second '.lz' extension is no longer added to the argument of '-o' if -it already ends in '.lz' or '.tlz'. +The configure script now accepts the option '--with-mingw' to enable the +compilation of plzip under MS Windows (with the MinGW compiler). Use with +care. The Windows I/O functions used are not guaranteed to be thread safe. +(Code based on a patch by Hannes Domani). -The dictionary size is now shown at verbosity level 4 (-vvvv) when -decompressing or testing. +The configure script now accepts appending options to CXXFLAGS using the +syntax 'CXXFLAGS+=OPTIONS'. -The new chapter "Meaning of plzip's output", and a block diagram of -plzip have been added to the manual. +The use of CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' when compiling on MinGW +has been documented in INSTALL. diff --git a/README b/README index d6f4d4a..695aa6c 100644 --- a/README +++ b/README @@ -1,8 +1,15 @@ Description -Plzip is a massively parallel (multi-threaded) lossless data compressor -based on the lzlib compression library, with a user interface similar to -the one of lzip, bzip2 or gzip. +Plzip is a massively parallel (multi-threaded) implementation of lzip, fully +compatible with lzip 1.4 or newer. Plzip uses the lzlib compression library. + +Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0) +or compress most files more than bzip2 (lzip -9). Decompression speed is +intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 +from a data recovery perspective. Lzip has been designed, written and +tested with great care to replace gzip and bzip2 as the standard +general-purpose compressed format for unix-like systems. 
Plzip can compress/decompress large files on multiprocessor machines much faster than lzip, at the cost of a slightly reduced compression @@ -21,25 +28,21 @@ be decompressed faster than using lzip (unless the '-b' option was used) because lzip usually produces single-member files, which can't be decompressed in parallel. -Plzip uses the lzip file format; the files produced by plzip are fully -compatible with lzip-1.4 or newer, and can be rescued with lziprecover. - -The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: * The lzip format provides very safe integrity checking and some data - recovery means. The lziprecover program can repair bit-flip errors + recovery means. The lziprecover program can repair bit flip errors (one of the most common forms of data corruption) in lzip files, and provides data recovery capabilities, including error-checked merging of damaged copies of a file. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the source code of a simple decompressor along - with a detailed explanation of how it works, so that with the only - help of the lzip manual it would be possible for a digital - archaeologist to extract the data from a lzip file long after - quantum computers eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor + along with a detailed explanation of how it works, so that with + the only help of the lzip manual it would be possible for a + digital archaeologist to extract the data from a lzip file long + after quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. 
@@ -49,15 +52,14 @@ repair the nearer it is from the beginning of the file. Therefore, with the help of lziprecover, losing an entire archive just because of a corrupt byte near the beginning is a thing of the past. -Plzip uses the same well-defined exit status values used by lzip and -bzip2, which makes it safer than compressors returning ambiguous warning -values (like gzip) when it is used as a back end for other programs like -tar or zutils. +Plzip uses the same well-defined exit status values used by lzip, which +makes it safer than compressors returning ambiguous warning values (like +gzip) when it is used as a back end for other programs like tar or zutils. -Plzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. +Plzip will automatically use for each file the largest dictionary size +that exceeds neither the file size nor the limit given. Keep in +mind that the decompression memory requirement is affected at +compression time by the choice of dictionary size limit. When compressing, plzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". @@ -70,25 +72,29 @@ anyothername becomes anyothername.out (De)compressing a file is much like copying or moving it; therefore plzip preserves the access and modification dates, permissions, and, when -possible, ownership of the file just as "cp -p" does. (If the user ID or +possible, ownership of the file just as 'cp -p' does. (If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). Plzip is able to read from some types of non regular files if the -"--stdout" option is specified. +'--stdout' option is specified. 
If no file names are specified, plzip compresses (or decompresses) from standard input to standard output. In this case, plzip will decline to write compressed output to a terminal, as this would be entirely incomprehensible and therefore pointless. -Plzip will correctly decompress a file which is the concatenation of two -or more compressed files. The result is the concatenation of the -corresponding decompressed files. Integrity testing of concatenated -compressed files is also supported. +Plzip will correctly decompress a file which is the concatenation of two or +more compressed files. The result is the concatenation of the corresponding +decompressed files. Integrity testing of concatenated compressed files is +also supported. + +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. -Copyright (C) 2009-2018 Antonio Diaz Diaz. +Copyright (C) 2009-2019 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/arg_parser.cc b/arg_parser.cc index 008ebc8..ea32fde 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2018 Antonio Diaz Diaz. + Copyright (C) 2006-2019 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided diff --git a/arg_parser.h b/arg_parser.h index f015881..ceb9933 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2018 Antonio Diaz Diaz. + Copyright (C) 2006-2019 Antonio Diaz Diaz. This library is free software. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided diff --git a/compress.cc b/compress.cc index beae59e..af36f95 100644 --- a/compress.cc +++ b/compress.cc @@ -1,6 +1,6 @@ -/* Plzip - Parallel compressor compatible with lzip +/* Plzip - Massively parallel implementation of lzip Copyright (C) 2009 Laszlo Ersek. - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -196,7 +195,7 @@ public: ocheck_counter( 0 ), owait_counter( 0 ), receive_id( 0 ), distrib_id( 0 ), deliver_id( 0 ), slot_tally( slots ), circular_ibuffer( slots ), - circular_obuffer( slots, (Packet *) 0 ), + circular_obuffer( slots, (const Packet *) 0 ), num_working( workers ), num_slots( slots ), eof( false ) { xinit_mutex( &imutex ); xinit_cond( &iav_or_eof ); @@ -318,7 +317,7 @@ struct Splitter_arg // courier for packaging and distribution to workers. extern "C" void * csplitter( void * arg ) { - const Splitter_arg & tmp = *(Splitter_arg *)arg; + const Splitter_arg & tmp = *(const Splitter_arg *)arg; Packet_courier & courier = *tmp.courier; const Pretty_print & pp = *tmp.pp; const int infd = tmp.infd; @@ -364,7 +363,7 @@ struct Worker_arg // them to courier. 
extern "C" void * cworker( void * arg ) { - const Worker_arg & tmp = *(Worker_arg *)arg; + const Worker_arg & tmp = *(const Worker_arg *)arg; Packet_courier & courier = *tmp.courier; const Pretty_print & pp = *tmp.pp; const int dictionary_size = tmp.dictionary_size; @@ -401,18 +400,15 @@ extern "C" void * cworker( void * arg ) int new_pos = 0; while( true ) { - if( LZ_compress_write_size( encoder ) > 0 ) + if( written < packet->size ) { - if( written < packet->size ) - { - const int wr = LZ_compress_write( encoder, - packet->data + offset + written, - packet->size - written ); - if( wr < 0 ) internal_error( "library error (LZ_compress_write)." ); - written += wr; - } - if( written >= packet->size ) LZ_compress_finish( encoder ); + const int wr = LZ_compress_write( encoder, + packet->data + offset + written, + packet->size - written ); + if( wr < 0 ) internal_error( "library error (LZ_compress_write)." ); + written += wr; } + if( written >= packet->size ) LZ_compress_finish( encoder ); const int rd = LZ_compress_read( encoder, packet->data + new_pos, offset + written - new_pos ); if( rd < 0 ) diff --git a/configure b/configure index e31f675..c26658b 100755 --- a/configure +++ b/configure @@ -1,13 +1,14 @@ #! /bin/sh -# configure script for Plzip - Parallel compressor compatible with lzip -# Copyright (C) 2009-2018 Antonio Diaz Diaz. +# configure script for Plzip - Massively parallel implementation of lzip +# Copyright (C) 2009-2019 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. pkgname=plzip -pkgversion=1.7 +pkgversion=1.8 progname=plzip +with_mingw= srctrigger=doc/${pkgname}.texi # clear some things potentially inherited from environment. 
@@ -67,9 +68,11 @@ while [ $# != 0 ] ; do echo " --datarootdir=DIR base directory for doc and data [${datarootdir}]" echo " --infodir=DIR info files directory [${infodir}]" echo " --mandir=DIR man pages directory [${mandir}]" + echo " --with-mingw use included pread/pwrite functions missing in MinGW" echo " CXX=COMPILER C++ compiler to use [${CXX}]" echo " CPPFLAGS=OPTIONS command line options for the preprocessor [${CPPFLAGS}]" echo " CXXFLAGS=OPTIONS command line options for the C++ compiler [${CXXFLAGS}]" + echo " CXXFLAGS+=OPTIONS append options to the current value of CXXFLAGS" echo " LDFLAGS=OPTIONS command line options for the linker [${LDFLAGS}]" echo exit 0 ;; @@ -92,11 +95,13 @@ while [ $# != 0 ] ; do --infodir=*) infodir=${optarg} ;; --mandir=*) mandir=${optarg} ;; --no-create) no_create=yes ;; + --with-mingw) with_mingw="-DWITH_MINGW" ;; - CXX=*) CXX=${optarg} ;; - CPPFLAGS=*) CPPFLAGS=${optarg} ;; - CXXFLAGS=*) CXXFLAGS=${optarg} ;; - LDFLAGS=*) LDFLAGS=${optarg} ;; + CXX=*) CXX=${optarg} ;; + CPPFLAGS=*) CPPFLAGS=${optarg} ;; + CXXFLAGS=*) CXXFLAGS=${optarg} ;; + CXXFLAGS+=*) CXXFLAGS="${CXXFLAGS} ${optarg}" ;; + LDFLAGS=*) LDFLAGS=${optarg} ;; --*) echo "configure: WARNING: unrecognized option: '${option}'" 1>&2 ;; @@ -154,6 +159,7 @@ EOF fi echo "creating Makefile" +if [ -n "${with_mingw}" ] ; then echo "WITH_MINGW = yes" ; fi echo "VPATH = ${srcdir}" echo "prefix = ${prefix}" echo "exec_prefix = ${exec_prefix}" @@ -167,8 +173,8 @@ echo "CXXFLAGS = ${CXXFLAGS}" echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF -# Makefile for Plzip - Parallel compressor compatible with lzip -# Copyright (C) 2009-2018 Antonio Diaz Diaz. +# Makefile for Plzip - Massively parallel implementation of lzip +# Copyright (C) 2009-2019 Antonio Diaz Diaz. # This file was generated automatically by configure. Don't edit. 
# # This Makefile is free software: you have unlimited permission @@ -177,6 +183,7 @@ cat > Makefile << EOF pkgname = ${pkgname} pkgversion = ${pkgversion} progname = ${progname} +with_mingw = ${with_mingw} VPATH = ${srcdir} prefix = ${prefix} exec_prefix = ${exec_prefix} diff --git a/dec_stdout.cc b/dec_stdout.cc index 27b9f31..2a85009 100644 --- a/dec_stdout.cc +++ b/dec_stdout.cc @@ -1,6 +1,6 @@ -/* Plzip - Parallel compressor compatible with lzip +/* Plzip - Massively parallel implementation of lzip Copyright (C) 2009 Laszlo Ersek. - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,7 +34,7 @@ #include #include "lzip.h" -#include "file_index.h" +#include "lzip_index.h" namespace { @@ -147,7 +147,7 @@ public: struct Worker_arg { - const File_index * file_index; + const Lzip_index * lzip_index; Packet_courier * courier; const Pretty_print * pp; int worker_id; @@ -160,8 +160,8 @@ struct Worker_arg // give the produced packets to courier. extern "C" void * dworker_o( void * arg ) { - const Worker_arg & tmp = *(Worker_arg *)arg; - const File_index & file_index = *tmp.file_index; + const Worker_arg & tmp = *(const Worker_arg *)arg; + const Lzip_index & lzip_index = *tmp.lzip_index; Packet_courier & courier = *tmp.courier; const Pretty_print & pp = *tmp.pp; const int worker_id = tmp.worker_id; @@ -177,10 +177,10 @@ extern "C" void * dworker_o( void * arg ) { pp( "Not enough memory." 
); cleanup_and_fail(); } int new_pos = 0; - for( long i = worker_id; i < file_index.members(); i += num_workers ) + for( long i = worker_id; i < lzip_index.members(); i += num_workers ) { - long long member_pos = file_index.mblock( i ).pos(); - long long member_rest = file_index.mblock( i ).size(); + long long member_pos = lzip_index.mblock( i ).pos(); + long long member_rest = lzip_index.mblock( i ).size(); while( member_rest > 0 ) { @@ -229,7 +229,7 @@ extern "C" void * dworker_o( void * arg ) if( rd == 0 ) break; } } - show_progress( file_index.mblock( i ).size() ); + show_progress( lzip_index.mblock( i ).size() ); } delete[] ibuffer; delete[] new_data; @@ -265,9 +265,8 @@ void muxer( Packet_courier & courier, const Pretty_print & pp, const int outfd ) // init the courier, then start the workers and call the muxer. int dec_stdout( const int num_workers, const int infd, const int outfd, const Pretty_print & pp, const int debug_level, - const File_index & file_index ) + const int out_slots, const Lzip_index & lzip_index ) { - const int out_slots = 32; Packet_courier courier( num_workers, out_slots ); Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers]; @@ -276,7 +275,7 @@ int dec_stdout( const int num_workers, const int infd, const int outfd, { pp( "Not enough memory." 
); cleanup_and_fail(); } for( int i = 0; i < num_workers; ++i ) { - worker_args[i].file_index = &file_index; + worker_args[i].lzip_index = &lzip_index; worker_args[i].courier = &courier; worker_args[i].pp = &pp; worker_args[i].worker_id = i; @@ -301,9 +300,9 @@ int dec_stdout( const int num_workers, const int infd, const int outfd, if( verbosity >= 2 ) { - if( verbosity >= 4 ) show_header( file_index.dictionary_size( 0 ) ); - const unsigned long long in_size = file_index.cdata_size(); - const unsigned long long out_size = file_index.udata_size(); + if( verbosity >= 4 ) show_header( lzip_index.dictionary_size( 0 ) ); + const unsigned long long in_size = lzip_index.cdata_size(); + const unsigned long long out_size = lzip_index.udata_size(); if( out_size == 0 || in_size == 0 ) std::fputs( "no data compressed. ", stderr ); else diff --git a/dec_stream.cc b/dec_stream.cc index 36a0ec0..2e1f752 100644 --- a/dec_stream.cc +++ b/dec_stream.cc @@ -1,6 +1,6 @@ -/* Plzip - Parallel compressor compatible with lzip +/* Plzip - Massively parallel implementation of lzip Copyright (C) 2009 Laszlo Ersek. - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -253,8 +253,8 @@ extern "C" void * dsplitter_s( void * arg ) Packet_courier & courier = *tmp.courier; const Pretty_print & pp = *tmp.pp; const int infd = tmp.infd; - const int hsize = File_header::size; - const int tsize = File_trailer::size; + const int hsize = Lzip_header::size; + const int tsize = Lzip_trailer::size; const int buffer_size = max_packet_size; const int base_buffer_size = tsize + buffer_size + hsize; uint8_t * const base_buffer = new( std::nothrow ) uint8_t[base_buffer_size]; @@ -268,7 +268,7 @@ extern "C" void * dsplitter_s( void * arg ) if( size + hsize < min_member_size ) { show_file_error( pp.name(), "Input file is too short." 
); cleanup_and_fail( 2 ); } - const File_header & header = *(File_header *)buffer; + const Lzip_header & header = *(const Lzip_header *)buffer; if( !header.verify_magic() ) { show_file_error( pp.name(), bad_magic_msg ); cleanup_and_fail( 2 ); } if( !header.verify_version() ) @@ -288,11 +288,12 @@ extern "C" void * dsplitter_s( void * arg ) newpos = find_magic( buffer, newpos, size + 4 - newpos ); if( newpos <= size ) { - const File_trailer & trailer = *(File_trailer *)(buffer + newpos - tsize); + const Lzip_trailer & trailer = + *(const Lzip_trailer *)(buffer + newpos - tsize); const unsigned long long member_size = trailer.member_size(); if( partial_member_size + newpos - pos == member_size ) { // header found - const File_header & header = *(File_header *)(buffer + newpos); + const Lzip_header & header = *(const Lzip_header *)(buffer + newpos); if( !header.verify_version() ) { pp( bad_version( header.version() ) ); cleanup_and_fail( 2 ); } const unsigned dictionary_size = header.dictionary_size(); @@ -354,7 +355,7 @@ struct Worker_arg // if not testing, give the produced packets to courier. extern "C" void * dworker_s( void * arg ) { - const Worker_arg & tmp = *(Worker_arg *)arg; + const Worker_arg & tmp = *(const Worker_arg *)arg; Packet_courier & courier = *tmp.courier; const Pretty_print & pp = *tmp.pp; const int worker_id = tmp.worker_id; @@ -479,15 +480,14 @@ void muxer( Packet_courier & courier, const Pretty_print & pp, const int outfd ) int dec_stream( const unsigned long long cfile_size, const int num_workers, const int infd, const int outfd, const Pretty_print & pp, const int debug_level, + const int in_slots, const int out_slots, const bool ignore_trailing, const bool loose_trailing ) { - const int in_slots_per_worker = 2; - const int out_slots = 32; - const int in_slots = ( INT_MAX / num_workers >= in_slots_per_worker ) ? - num_workers * in_slots_per_worker : INT_MAX; + const int total_in_slots = ( INT_MAX / num_workers >= in_slots ) ? 
+ num_workers * in_slots : INT_MAX; in_size = 0; out_size = 0; - Packet_courier courier( num_workers, in_slots, out_slots ); + Packet_courier courier( num_workers, total_in_slots, out_slots ); Splitter_arg splitter_arg; splitter_arg.cfile_size = cfile_size; diff --git a/decompress.cc b/decompress.cc index ed1ac21..19cb1df 100644 --- a/decompress.cc +++ b/decompress.cc @@ -1,6 +1,6 @@ -/* Plzip - Parallel compressor compatible with lzip +/* Plzip - Massively parallel implementation of lzip Copyright (C) 2009 Laszlo Ersek. - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,7 +34,46 @@ #include #include "lzip.h" -#include "file_index.h" +#include "lzip_index.h" + + +// This code is based on a patch by Hannes Domani, ssbssa@yahoo.de +// to be able to compile plzip under MS Windows (with MINGW compiler). +#if defined(__MSVCRT__) && defined(WITH_MINGW) +#include +#warning "Parallel I/O is not guaranteed to work on Windows." 
+ +ssize_t pread( int fd, void *buf, size_t count, uint64_t offset ) + { + OVERLAPPED o = {0,0,0,0,0}; + HANDLE fh = (HANDLE)_get_osfhandle(fd); + DWORD bytes; + BOOL ret; + + if( fh == INVALID_HANDLE_VALUE ) { errno = EBADF; return -1; } + o.Offset = offset & 0xffffffff; + o.OffsetHigh = (offset >> 32) & 0xffffffff; + ret = ReadFile( fh, buf, (DWORD)count, &bytes, &o ); + if( !ret ) { errno = EIO; return -1; } + return (ssize_t)bytes; + } + +ssize_t pwrite( int fd, const void *buf, size_t count, uint64_t offset ) + { + OVERLAPPED o = {0,0,0,0,0}; + HANDLE fh = (HANDLE)_get_osfhandle(fd); + DWORD bytes; + BOOL ret; + + if( fh == INVALID_HANDLE_VALUE ) { errno = EBADF; return -1; } + o.Offset = offset & 0xffffffff; + o.OffsetHigh = (offset >> 32) & 0xffffffff; + ret = WriteFile(fh, buf, (DWORD)count, &bytes, &o); + if( !ret ) { errno = EIO; return -1; } + return (ssize_t)bytes; + } + +#endif // __MSVCRT__ // Returns the number of bytes really read. @@ -95,7 +134,7 @@ namespace { struct Worker_arg { - const File_index * file_index; + const Lzip_index * lzip_index; const Pretty_print * pp; int worker_id; int num_workers; @@ -108,8 +147,8 @@ struct Worker_arg // write the produced data to file. extern "C" void * dworker( void * arg ) { - const Worker_arg & tmp = *(Worker_arg *)arg; - const File_index & file_index = *tmp.file_index; + const Worker_arg & tmp = *(const Worker_arg *)arg; + const Lzip_index & lzip_index = *tmp.lzip_index; const Pretty_print & pp = *tmp.pp; const int worker_id = tmp.worker_id; const int num_workers = tmp.num_workers; @@ -124,12 +163,12 @@ extern "C" void * dworker( void * arg ) LZ_decompress_errno( decoder ) != LZ_ok ) { pp( "Not enough memory." 
); cleanup_and_fail(); } - for( long i = worker_id; i < file_index.members(); i += num_workers ) + for( long i = worker_id; i < lzip_index.members(); i += num_workers ) { - long long data_pos = file_index.dblock( i ).pos(); - long long data_rest = file_index.dblock( i ).size(); - long long member_pos = file_index.mblock( i ).pos(); - long long member_rest = file_index.mblock( i ).size(); + long long data_pos = lzip_index.dblock( i ).pos(); + long long data_rest = lzip_index.dblock( i ).size(); + long long member_pos = lzip_index.mblock( i ).pos(); + long long member_rest = lzip_index.mblock( i ).size(); while( member_rest > 0 ) { @@ -180,7 +219,7 @@ extern "C" void * dworker( void * arg ) if( rd == 0 ) break; } } - show_progress( file_index.mblock( i ).size() ); + show_progress( lzip_index.mblock( i ).size() ); } delete[] obuffer; delete[] ibuffer; @@ -197,26 +236,27 @@ extern "C" void * dworker( void * arg ) // start the workers and wait for them to finish. int decompress( const unsigned long long cfile_size, int num_workers, const int infd, const int outfd, const Pretty_print & pp, - const int debug_level, const bool ignore_trailing, + const int debug_level, const int in_slots, + const int out_slots, const bool ignore_trailing, const bool loose_trailing, const bool infd_isreg ) { if( !infd_isreg ) - return dec_stream( cfile_size, num_workers, infd, outfd, pp, - debug_level, ignore_trailing, loose_trailing ); + return dec_stream( cfile_size, num_workers, infd, outfd, pp, debug_level, + in_slots, out_slots, ignore_trailing, loose_trailing ); - const File_index file_index( infd, ignore_trailing, loose_trailing ); - if( file_index.retval() == 1 ) + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing ); + if( lzip_index.retval() == 1 ) { lseek( infd, 0, SEEK_SET ); - return dec_stream( cfile_size, num_workers, infd, outfd, pp, - debug_level, ignore_trailing, loose_trailing ); + return dec_stream( cfile_size, num_workers, infd, outfd, pp, debug_level, + 
in_slots, out_slots, ignore_trailing, loose_trailing ); } - if( file_index.retval() != 0 ) - { show_file_error( pp.name(), file_index.error().c_str() ); - return file_index.retval(); } + if( lzip_index.retval() != 0 ) + { show_file_error( pp.name(), lzip_index.error().c_str() ); + return lzip_index.retval(); } - if( num_workers > file_index.members() ) - num_workers = file_index.members(); + if( num_workers > lzip_index.members() ) + num_workers = lzip_index.members(); if( verbosity >= 1 ) pp(); show_progress( 0, cfile_size, &pp ); // init @@ -225,7 +265,8 @@ int decompress( const unsigned long long cfile_size, int num_workers, struct stat st; if( fstat( outfd, &st ) != 0 || !S_ISREG( st.st_mode ) || lseek( outfd, 0, SEEK_CUR ) < 0 ) - return dec_stdout( num_workers, infd, outfd, pp, debug_level, file_index ); + return dec_stdout( num_workers, infd, outfd, pp, debug_level, out_slots, + lzip_index ); } Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers]; @@ -234,7 +275,7 @@ int decompress( const unsigned long long cfile_size, int num_workers, { pp( "Not enough memory." ); cleanup_and_fail(); } for( int i = 0; i < num_workers; ++i ) { - worker_args[i].file_index = &file_index; + worker_args[i].lzip_index = &lzip_index; worker_args[i].pp = &pp; worker_args[i].worker_id = i; worker_args[i].num_workers = num_workers; @@ -257,9 +298,9 @@ int decompress( const unsigned long long cfile_size, int num_workers, if( verbosity >= 2 ) { - if( verbosity >= 4 ) show_header( file_index.dictionary_size( 0 ) ); - const unsigned long long in_size = file_index.cdata_size(); - const unsigned long long out_size = file_index.udata_size(); + if( verbosity >= 4 ) show_header( lzip_index.dictionary_size( 0 ) ); + const unsigned long long in_size = lzip_index.cdata_size(); + const unsigned long long out_size = lzip_index.udata_size(); if( out_size == 0 || in_size == 0 ) std::fputs( "no data compressed. 
", stderr ); else diff --git a/doc/plzip.1 b/doc/plzip.1 index 99dfd8b..694a99d 100644 --- a/doc/plzip.1 +++ b/doc/plzip.1 @@ -1,12 +1,28 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH PLZIP "1" "February 2018" "plzip 1.7" "User Commands" +.TH PLZIP "1" "January 2019" "plzip 1.8" "User Commands" .SH NAME plzip \- reduces the size of files .SH SYNOPSIS .B plzip [\fI\,options\/\fR] [\fI\,files\/\fR] .SH DESCRIPTION -Plzip \- Parallel compressor compatible with lzip. +Plzip is a massively parallel (multi\-threaded) implementation of lzip, fully +compatible with lzip 1.4 or newer. Plzip uses the lzlib compression library. +.PP +Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip \fB\-0\fR) +or compress most files more than bzip2 (lzip \fB\-9\fR). Decompression speed is +intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 +from a data recovery perspective. Lzip has been designed, written and +tested with great care to replace gzip and bzip2 as the standard +general\-purpose compressed format for unix\-like systems. +.PP +Plzip can compress/decompress large files on multiprocessor machines +much faster than lzip, at the cost of a slightly reduced compression +ratio (0.4 to 2 percent larger compressed files). Note that the number +of usable threads is limited by file size; on files larger than a few GB +plzip can use hundreds of processors, but on files of only a few MB +plzip is no faster than lzip. .SH OPTIONS .TP \fB\-h\fR, \fB\-\-help\fR @@ -71,6 +87,12 @@ alias for \fB\-9\fR .TP \fB\-\-loose\-trailing\fR allow trailing data seeming corrupt header +.TP +\fB\-\-in\-slots=\fR +number of 1 MiB input packets buffered [4] +.TP +\fB\-\-out\-slots=\fR +number of 1 MiB output packets buffered [64] .PP If no file names are given, or if a file is '\-', plzip compresses or decompresses from standard input to standard output. 
@@ -95,8 +117,8 @@ Plzip home page: http://www.nongnu.org/lzip/plzip.html .SH COPYRIGHT Copyright \(co 2009 Laszlo Ersek. .br -Copyright \(co 2018 Antonio Diaz Diaz. -Using lzlib 1.10 +Copyright \(co 2019 Antonio Diaz Diaz. +Using lzlib 1.11 License GPLv2+: GNU GPL version 2 or later .br This is free software: you are free to change and redistribute it. diff --git a/doc/plzip.info b/doc/plzip.info index c8d7387..2b7aa52 100644 --- a/doc/plzip.info +++ b/doc/plzip.info @@ -2,7 +2,7 @@ This is plzip.info, produced by makeinfo version 4.13+ from plzip.texi. INFO-DIR-SECTION Data Compression START-INFO-DIR-ENTRY -* Plzip: (plzip). Parallel compressor compatible with lzip +* Plzip: (plzip). Massively parallel implementation of lzip END-INFO-DIR-ENTRY  @@ -11,7 +11,7 @@ File: plzip.info, Node: Top, Next: Introduction, Up: (dir) Plzip Manual ************ -This manual is for Plzip (version 1.7, 7 February 2018). +This manual is for Plzip (version 1.8, 5 January 2019). * Menu: @@ -28,7 +28,7 @@ This manual is for Plzip (version 1.7, 7 February 2018). * Concept index:: Index of concepts - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -39,20 +39,25 @@ File: plzip.info, Node: Introduction, Next: Output, Prev: Top, Up: Top 1 Introduction ************** -Plzip is a massively parallel (multi-threaded) lossless data compressor -based on the lzlib compression library, with a user interface similar to -the one of lzip, bzip2 or gzip. +Plzip is a massively parallel (multi-threaded) implementation of lzip, +fully compatible with lzip 1.4 or newer. Plzip uses the lzlib +compression library. + + Lzip is a lossless data compressor with a user interface similar to +the one of gzip or bzip2. Lzip can compress about as fast as gzip +(lzip -0) or compress most files more than bzip2 (lzip -9). 
+Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. Lzip has +been designed, written and tested with great care to replace gzip and +bzip2 as the standard general-purpose compressed format for unix-like +systems. Plzip can compress/decompress large files on multiprocessor machines much faster than lzip, at the cost of a slightly reduced compression ratio (0.4 to 2 percent larger compressed files). Note that the number of usable threads is limited by file size; on files larger than a few GB plzip can use hundreds of processors, but on files of only a few MB -plzip is no faster than lzip (*note Minimum file sizes::). - - Plzip uses the lzip file format; the files produced by plzip are -fully compatible with lzip-1.4 or newer, and can be rescued with -lziprecover. +plzip is no faster than lzip. *Note Minimum file sizes::. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -80,15 +85,16 @@ repair the nearer it is from the beginning of the file. Therefore, with the help of lziprecover, losing an entire archive just because of a corrupt byte near the beginning is a thing of the past. - Plzip uses the same well-defined exit status values used by lzip and -bzip2, which makes it safer than compressors returning ambiguous warning -values (like gzip) when it is used as a back end for other programs like -tar or zutils. + Plzip uses the same well-defined exit status values used by lzip, +which makes it safer than compressors returning ambiguous warning +values (like gzip) when it is used as a back end for other programs +like tar or zutils. - Plzip will automatically use the smallest possible dictionary size -for each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit (*note Memory requirements::). 
+ Plzip will automatically use for each file the largest dictionary +size that does not exceed neither the file size nor the limit given. +Keep in mind that the decompression memory requirement is affected at +compression time by the choice of dictionary size limit. *Note Memory +requirements::. When compressing, plzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". @@ -101,7 +107,7 @@ anyothername becomes anyothername.out (De)compressing a file is much like copying or moving it; therefore plzip preserves the access and modification dates, permissions, and, -when possible, ownership of the file just as "cp -p" does. (If the user +when possible, ownership of the file just as 'cp -p' does. (If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). @@ -188,6 +194,7 @@ command line. '-V' '--version' Print the version number of plzip on the standard output and exit. + This version number should be included in all bug reports. '-a' '--trailing-error' @@ -286,12 +293,14 @@ command line. '-s BYTES' '--dictionary-size=BYTES' When compressing, set the dictionary size limit in bytes. Plzip - will use the smallest possible dictionary size for each file - without exceeding this limit. Valid values range from 4 KiB to - 512 MiB. Values 12 to 29 are interpreted as powers of two, meaning - 2^12 to 2^29 bytes. Note that dictionary sizes are quantized. If - the specified size does not match one of the valid sizes, it will - be rounded upwards by adding up to (BYTES / 8) to it. + will use for each file the largest dictionary size that does not + exceed neither the file size nor this limit. Valid values range + from 4 KiB to 512 MiB. Values 12 to 29 are interpreted as powers + of two, meaning 2^12 to 2^29 bytes. Dictionary sizes are quantized + so that they can be coded in just one byte (*note + coded-dict-size::). 
If the specified size does not match one of + the valid sizes, it will be rounded upwards by adding up to + (BYTES / 8) to it. For maximum compression you should use a dictionary size limit as large as possible, but keep in mind that the decompression memory @@ -320,27 +329,32 @@ command line. except for single-member files. '-0 .. -9' - Set the compression parameters (dictionary size and match length - limit) as shown in the table below. The default compression level - is '-6'. Note that '-9' can be much slower than '-0'. These - options have no effect when decompressing, testing or listing. + Compression level. Set the compression parameters (dictionary size + and match length limit) as shown in the table below. The default + compression level is '-6', equivalent to '-s8MiB -m36'. Note that + '-9' can be much slower than '-0'. These options have no effect + when decompressing, testing or listing. The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the '--dictionary-size' and '--match-length' options directly to achieve optimal performance. - Level Dictionary size Match length limit - -0 64 KiB 16 bytes - -1 1 MiB 5 bytes - -2 1.5 MiB 6 bytes - -3 2 MiB 8 bytes - -4 3 MiB 12 bytes - -5 4 MiB 20 bytes - -6 8 MiB 36 bytes - -7 16 MiB 68 bytes - -8 24 MiB 132 bytes - -9 32 MiB 273 bytes + If several compression levels or '-s' or '-m' options are given, + the last setting is used. For example '-9 -s64MiB' is equivalent + to '-s64MiB -m273' + + Level Dictionary size (-s) Match length limit (-m) + -0 64 KiB 16 bytes + -1 1 MiB 5 bytes + -2 1.5 MiB 6 bytes + -3 2 MiB 8 bytes + -4 3 MiB 12 bytes + -5 4 MiB 20 bytes + -6 8 MiB 36 bytes + -7 16 MiB 68 bytes + -8 24 MiB 132 bytes + -9 32 MiB 273 bytes '--fast' '--best' @@ -353,6 +367,18 @@ command line. if a file triggers a "corrupt header" error and the cause is not indeed a corrupt header. 
+'--in-slots=N' + Number of 1 MiB input packets buffered per worker thread when + decompressing from non-seekable input. Increasing the number of + packets may increase decompression speed, but requires more + memory. Valid values range from 1 to 64. The default value is 4. + +'--out-slots=N' + Number of 1 MiB output packets buffered per worker thread when + decompressing to non-seekable output. Increasing the number of + packets may increase decompression speed, but requires more + memory. Valid values range from 1 to 1024. The default value is 64. + Numbers given as arguments to options may be followed by a multiplier and an optional 'B' for "byte". @@ -465,11 +491,11 @@ additional information before, between, or after them. 'DS (coded dictionary size, 1 byte)' The dictionary size is calculated by taking a power of 2 (the base - size) and substracting from it a fraction between 0/16 and 7/16 of + size) and subtracting from it a fraction between 0/16 and 7/16 of the base size. Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). - Bits 7-5 contain the numerator of the fraction (0 to 7) to - substract from the base size to obtain the dictionary size. + Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract + from the base size to obtain the dictionary size. Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB Valid values for dictionary size range from 4 KiB to 512 MiB. @@ -497,22 +523,25 @@ File: plzip.info, Node: Memory requirements, Next: Minimum file sizes, Prev: 6 Memory required to compress and decompress ******************************************** -The amount of memory required *per thread* for decompression or testing -is approximately the following: +The amount of memory required *per worker thread* for decompression or +testing is approximately the following: * For decompression of a regular (seekable) file to another regular file, or for testing of a regular file; the dictionary size. 
* For testing of a non-seekable file or of standard input; the - dictionary size plus up to 5 MiB. + dictionary size plus 1 MiB plus up to the number of 1 MiB input + packets buffered (4 by default). * For decompression of a regular file to a non-seekable file or to - standard output; the dictionary size plus up to 32 MiB. + standard output; the dictionary size plus up to the number of 1 MiB + output packets buffered (64 by default). * For decompression of a non-seekable file or of standard input; the - dictionary size plus up to 35 MiB. + dictionary size plus 1 MiB plus up to the number of 1 MiB input + and output packets buffered (68 by default). -The amount of memory required *per thread* for compression is +The amount of memory required *per worker thread* for compression is approximately the following: * For compression at level -0; 1.5 MiB plus 3.375 times the data size @@ -561,7 +590,7 @@ for full use of N processors at a given compression level, using the default data size for each level: Processors 2 4 8 16 64 256 -------------------------------------------------------------------------- +------------------------------------------------------------------ Level -0 2 MiB 4 MiB 8 MiB 16 MiB 64 MiB 256 MiB -1 4 MiB 8 MiB 16 MiB 32 MiB 128 MiB 512 MiB @@ -633,7 +662,11 @@ compressed file (bugs in the system libraries, memory errors, etc). Therefore, if the data you are going to compress are important, give the '--keep' option to plzip and don't remove the original file until you verify the compressed file with a command like -'plzip -cd file.lz | cmp file -'. +'plzip -cd file.lz | cmp file -'. Most RAM errors happening during +compression can only be detected by comparing the compressed file with +the original because the corruption happens before plzip compresses the +RAM contents, resulting in a valid compressed file containing wrong +data. 
Example 1: Replace a regular file with its compressed version 'file.lz' @@ -728,21 +761,22 @@ Concept index  Tag Table: -Node: Top221 +Node: Top222 Node: Introduction1158 -Node: Output5134 -Node: Invoking plzip6614 -Ref: --trailing-error7177 -Ref: --data-size7420 -Node: Program design14938 -Node: File format17090 -Node: Memory requirements19522 -Node: Minimum file sizes20985 -Node: Trailing data23002 -Node: Examples25285 -Ref: concat-example26450 -Node: Problems27025 -Node: Concept index27553 +Node: Output5456 +Node: Invoking plzip6936 +Ref: --trailing-error7563 +Ref: --data-size7806 +Node: Program design16267 +Node: File format18419 +Ref: coded-dict-size19719 +Node: Memory requirements20849 +Node: Minimum file sizes22531 +Node: Trailing data24540 +Node: Examples26823 +Ref: concat-example28238 +Node: Problems28813 +Node: Concept index29341  End Tag Table diff --git a/doc/plzip.texi b/doc/plzip.texi index 44cff75..b5469b9 100644 --- a/doc/plzip.texi +++ b/doc/plzip.texi @@ -6,19 +6,19 @@ @finalout @c %**end of header -@set UPDATED 7 February 2018 -@set VERSION 1.7 +@set UPDATED 5 January 2019 +@set VERSION 1.8 @dircategory Data Compression @direntry -* Plzip: (plzip). Parallel compressor compatible with lzip +* Plzip: (plzip). Massively parallel implementation of lzip @end direntry @ifnothtml @titlepage @title Plzip -@subtitle Parallel compressor compatible with lzip +@subtitle Massively parallel implementation of lzip @subtitle for Plzip version @value{VERSION}, @value{UPDATED} @author by Antonio Diaz Diaz @@ -49,7 +49,7 @@ This manual is for Plzip (version @value{VERSION}, @value{UPDATED}). @end menu @sp 1 -Copyright @copyright{} 2009-2018 Antonio Diaz Diaz. +Copyright @copyright{} 2009-2019 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -59,23 +59,28 @@ to copy, distribute and modify it. 
@chapter Introduction @cindex introduction -Plzip is a massively parallel (multi-threaded) lossless data compressor -based on the lzlib compression library, with a user interface similar to -the one of lzip, bzip2 or gzip. +@uref{http://www.nongnu.org/lzip/plzip.html,,Plzip} is a massively parallel +(multi-threaded) implementation of lzip, fully compatible with lzip 1.4 or +newer. Plzip uses the lzlib compression library. + +@uref{http://www.nongnu.org/lzip/lzip.html,,Lzip} is a lossless data +compressor with a user interface similar to the one of gzip or bzip2. Lzip +can compress about as fast as gzip @w{(lzip -0)} or compress most files more +than bzip2 @w{(lzip -9)}. Decompression speed is intermediate between gzip +and bzip2. Lzip is better than gzip and bzip2 from a data recovery +perspective. Lzip has been designed, written and tested with great care to +replace gzip and bzip2 as the standard general-purpose compressed format for +unix-like systems. Plzip can compress/decompress large files on multiprocessor machines much faster than lzip, at the cost of a slightly reduced compression ratio (0.4 to 2 percent larger compressed files). Note that the number of usable threads is limited by file size; on files larger than a few GB plzip can use hundreds of processors, but on files of only a few MB -plzip is no faster than lzip (@pxref{Minimum file sizes}). - -Plzip uses the lzip file format; the files produced by plzip are fully -compatible with lzip-1.4 or newer, and can be rescued with lziprecover. +plzip is no faster than lzip. @xref{Minimum file sizes}. -The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: @itemize @bullet @item @@ -107,15 +112,14 @@ repair the nearer it is from the beginning of the file. 
Therefore, with the help of lziprecover, losing an entire archive just because of a corrupt byte near the beginning is a thing of the past. -Plzip uses the same well-defined exit status values used by lzip and -bzip2, which makes it safer than compressors returning ambiguous warning -values (like gzip) when it is used as a back end for other programs like -tar or zutils. +Plzip uses the same well-defined exit status values used by lzip, which +makes it safer than compressors returning ambiguous warning values (like +gzip) when it is used as a back end for other programs like tar or zutils. -Plzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit (@pxref{Memory requirements}). +Plzip will automatically use for each file the largest dictionary size that +exceeds neither the file size nor the limit given. Keep in mind that +the decompression memory requirement is affected at compression time by the +choice of dictionary size limit. @xref{Memory requirements}. When compressing, plzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". @@ -130,7 +134,7 @@ file from that of the compressed file as follows: (De)compressing a file is much like copying or moving it; therefore plzip preserves the access and modification dates, permissions, and, when -possible, ownership of the file just as "cp -p" does. (If the user ID or +possible, ownership of the file just as @samp{cp -p} does. (If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). @@ -142,10 +146,10 @@ standard input to standard output. In this case, plzip will decline to write compressed output to a terminal, as this would be entirely incomprehensible and therefore pointless.
-Plzip will correctly decompress a file which is the concatenation of two -or more compressed files. The result is the concatenation of the -corresponding decompressed files. Integrity testing of concatenated -compressed files is also supported. +Plzip will correctly decompress a file which is the concatenation of two or +more compressed files. The result is the concatenation of the corresponding +decompressed files. Integrity testing of concatenated compressed files is +also supported. @node Output @@ -225,6 +229,7 @@ Print an informative help message describing the options and exit. @item -V @itemx --version Print the version number of plzip on the standard output and exit. +This version number should be included in all bug reports. @anchor{--trailing-error} @item -a @@ -322,12 +327,13 @@ Quiet operation. Suppress all messages. @item -s @var{bytes} @itemx --dictionary-size=@var{bytes} When compressing, set the dictionary size limit in bytes. Plzip will use -the smallest possible dictionary size for each file without exceeding -this limit. Valid values range from @w{4 KiB} to @w{512 MiB}. Values 12 -to 29 are interpreted as powers of two, meaning 2^12 to 2^29 bytes. Note -that dictionary sizes are quantized. If the specified size does not -match one of the valid sizes, it will be rounded upwards by adding up to -@w{(@var{bytes} / 8)} to it. +for each file the largest dictionary size that exceeds neither +the file size nor this limit. Valid values range from @w{4 KiB} to +@w{512 MiB}. Values 12 to 29 are interpreted as powers of two, meaning +2^12 to 2^29 bytes. Dictionary sizes are quantized so that they can be +coded in just one byte (@pxref{coded-dict-size}). If the specified size +does not match one of the valid sizes, it will be rounded upwards by +adding up to @w{(@var{bytes} / 8)} to it.
For maximum compression you should use a dictionary size limit as large as possible, but keep in mind that the decompression memory requirement @@ -354,18 +360,23 @@ Two or more @samp{-v} options show the progress of (de)compression, except for single-member files. @item -0 .. -9 -Set the compression parameters (dictionary size and match length limit) -as shown in the table below. The default compression level is @samp{-6}. -Note that @samp{-9} can be much slower than @samp{-0}. These options -have no effect when decompressing, testing or listing. +Compression level. Set the compression parameters (dictionary size and +match length limit) as shown in the table below. The default compression +level is @samp{-6}, equivalent to @w{@samp{-s8MiB -m36}}. Note that +@samp{-9} can be much slower than @samp{-0}. These options have no +effect when decompressing, testing or listing. The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the @samp{--dictionary-size} and @samp{--match-length} options directly to achieve optimal performance. -@multitable {Level} {Dictionary size} {Match length limit} -@item Level @tab Dictionary size @tab Match length limit +If several compression levels or @samp{-s} or @samp{-m} options are +given, the last setting is used. For example @w{@samp{-9 -s64MiB}} is +equivalent to @w{@samp{-s64MiB -m273}}. + +@multitable {Level} {Dictionary size (-s)} {Match length limit (-m)} +@item Level @tab Dictionary size (-s) @tab Match length limit (-m) @item -0 @tab 64 KiB @tab 16 bytes @item -1 @tab 1 MiB @tab 5 bytes @item -2 @tab 1.5 MiB @tab 6 bytes @@ -388,6 +399,18 @@ bytes are so similar to the magic bytes of a lzip header that they can be confused with a corrupt header. Use this option if a file triggers a "corrupt header" error and the cause is not indeed a corrupt header.
+@item --in-slots=@var{n} +Number of @w{1 MiB} input packets buffered per worker thread when +decompressing from non-seekable input. Increasing the number of packets +may increase decompression speed, but requires more memory. Valid values +range from 1 to 64. The default value is 4. + +@item --out-slots=@var{n} +Number of @w{1 MiB} output packets buffered per worker thread when +decompressing to non-seekable output. Increasing the number of packets +may increase decompression speed, but requires more memory. Valid values +range from 1 to 1024. The default value is 64. + @end table Numbers given as arguments to options may be followed by a multiplier @@ -506,12 +529,13 @@ A four byte string, identifying the lzip format, with the value "LZIP" @item VN (version number, 1 byte) Just in case something needs to be modified in the future. 1 for now. +@anchor{coded-dict-size} @item DS (coded dictionary size, 1 byte) The dictionary size is calculated by taking a power of 2 (the base size) -and substracting from it a fraction between 0/16 and 7/16 of the base +and subtracting from it a fraction between 0/16 and 7/16 of the base size.@* Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* -Bits 7-5 contain the numerator of the fraction (0 to 7) to substract +Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract from the base size to obtain the dictionary size.@* Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* Valid values for dictionary size range from 4 KiB to 512 MiB. @@ -546,8 +570,8 @@ facilitates safe recovery of undamaged members from multimember files. 
@chapter Memory required to compress and decompress @cindex memory requirements -The amount of memory required @strong{per thread} for decompression or -testing is approximately the following: +The amount of memory required @strong{per worker thread} for +decompression or testing is approximately the following: @itemize @bullet @item @@ -556,20 +580,23 @@ or for testing of a regular file; the dictionary size. @item For testing of a non-seekable file or of standard input; the dictionary -size plus up to @w{5 MiB}. +size plus @w{1 MiB} plus up to the number of @w{1 MiB} input packets +buffered (4 by default). @item For decompression of a regular file to a non-seekable file or to -standard output; the dictionary size plus up to @w{32 MiB}. +standard output; the dictionary size plus up to the number of @w{1 MiB} +output packets buffered (64 by default). @item For decompression of a non-seekable file or of standard input; the -dictionary size plus up to @w{35 MiB}. +dictionary size plus @w{1 MiB} plus up to the number of @w{1 MiB} input +and output packets buffered (68 by default). @end itemize @noindent -The amount of memory required @strong{per thread} for compression is -approximately the following: +The amount of memory required @strong{per worker thread} for compression +is approximately the following: @itemize @bullet @item @@ -696,9 +723,12 @@ where a file containing trailing data must be rejected, the option WARNING! Even if plzip is bug-free, other causes may result in a corrupt compressed file (bugs in the system libraries, memory errors, etc). Therefore, if the data you are going to compress are important, give the -@samp{--keep} option to plzip and don't remove the original file until -you verify the compressed file with a command like -@w{@samp{plzip -cd file.lz | cmp file -}}. +@samp{--keep} option to plzip and don't remove the original file until you +verify the compressed file with a command like +@w{@samp{plzip -cd file.lz | cmp file -}}. 
Most RAM errors happening during +compression can only be detected by comparing the compressed file with the +original because the corruption happens before plzip compresses the RAM +contents, resulting in a valid compressed file containing wrong data. @sp 1 @noindent diff --git a/file_index.cc b/file_index.cc deleted file mode 100644 index 8238054..0000000 --- a/file_index.cc +++ /dev/null @@ -1,196 +0,0 @@ -/* Plzip - Parallel compressor compatible with lzip - Copyright (C) 2009-2018 Antonio Diaz Diaz. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#define _FILE_OFFSET_BITS 64 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lzip.h" -#include "file_index.h" - - -namespace { - -int seek_read( const int fd, uint8_t * const buf, const int size, - const long long pos ) - { - if( lseek( fd, pos, SEEK_SET ) == pos ) - return readblock( fd, buf, size ); - return 0; - } - -} // end namespace - - -void File_index::set_errno_error( const char * const msg ) - { - error_ = msg; error_ += std::strerror( errno ); - retval_ = 1; - } - -void File_index::set_num_error( const char * const msg, unsigned long long num ) - { - char buf[80]; - snprintf( buf, sizeof buf, "%s%llu", msg, num ); - error_ = buf; - retval_ = 2; - } - - -// If successful, push last member and set pos to member header. 
-bool File_index::skip_trailing_data( const int fd, long long & pos, - const bool ignore_trailing, const bool loose_trailing ) - { - enum { block_size = 16384, - buffer_size = block_size + File_trailer::size - 1 + File_header::size }; - uint8_t buffer[buffer_size]; - if( pos < min_member_size ) return false; - int bsize = pos % block_size; // total bytes in buffer - if( bsize <= buffer_size - block_size ) bsize += block_size; - int search_size = bsize; // bytes to search for trailer - int rd_size = bsize; // bytes to read from file - unsigned long long ipos = pos - rd_size; // aligned to block_size - - while( true ) - { - if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) - { set_errno_error( "Error seeking member trailer: " ); return false; } - const uint8_t max_msb = ( ipos + search_size ) >> 56; - for( int i = search_size; i >= File_trailer::size; --i ) - if( buffer[i-1] <= max_msb ) // most significant byte of member_size - { - File_trailer & trailer = - *(File_trailer *)( buffer + i - File_trailer::size ); - const unsigned long long member_size = trailer.member_size(); - if( member_size == 0 ) - { while( i > File_trailer::size && buffer[i-9] == 0 ) --i; continue; } - if( member_size < min_member_size || member_size > ipos + i ) - continue; - File_header header; - if( seek_read( fd, header.data, File_header::size, - ipos + i - member_size ) != File_header::size ) - { set_errno_error( "Error reading member header: " ); return false; } - const unsigned dictionary_size = header.dictionary_size(); - if( !header.verify_magic() || !header.verify_version() || - !isvalid_ds( dictionary_size ) ) continue; - if( (*(File_header *)( buffer + i )).verify_prefix( bsize - i ) ) - { error_ = "Last member in input file is truncated or corrupt."; - retval_ = 2; return false; } - if( !loose_trailing && bsize - i >= File_header::size && - (*(File_header *)( buffer + i )).verify_corrupt() ) - { error_ = corrupt_mm_msg; retval_ = 2; return false; } - if( !ignore_trailing ) - { 
error_ = trailing_msg; retval_ = 2; return false; } - pos = ipos + i - member_size; - member_vector.push_back( Member( 0, trailer.data_size(), pos, - member_size, dictionary_size ) ); - return true; - } - if( ipos <= 0 ) - { set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); - return false; } - bsize = buffer_size; - search_size = bsize - File_header::size; - rd_size = block_size; - ipos -= rd_size; - std::memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); - } - } - - -File_index::File_index( const int infd, const bool ignore_trailing, - const bool loose_trailing ) - : isize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) - { - if( isize < 0 ) - { set_errno_error( "Input file is not seekable: " ); return; } - if( isize < min_member_size ) - { error_ = "Input file is too short."; retval_ = 2; return; } - if( isize > INT64_MAX ) - { error_ = "Input file is too long (2^63 bytes or more)."; - retval_ = 2; return; } - - File_header header; - if( seek_read( infd, header.data, File_header::size, 0 ) != File_header::size ) - { set_errno_error( "Error reading member header: " ); return; } - if( !header.verify_magic() ) - { error_ = bad_magic_msg; retval_ = 2; return; } - if( !header.verify_version() ) - { error_ = bad_version( header.version() ); retval_ = 2; return; } - if( !isvalid_ds( header.dictionary_size() ) ) - { error_ = bad_dict_msg; retval_ = 2; return; } - - long long pos = isize; // always points to a header or to EOF - while( pos >= min_member_size ) - { - File_trailer trailer; - if( seek_read( infd, trailer.data, File_trailer::size, - pos - File_trailer::size ) != File_trailer::size ) - { set_errno_error( "Error reading member trailer: " ); break; } - const unsigned long long member_size = trailer.member_size(); - if( member_size < min_member_size || member_size > (unsigned long long)pos ) - { - if( member_vector.empty() ) - { if( skip_trailing_data( infd, pos, ignore_trailing, loose_trailing ) ) - continue; else return; } - 
set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); - break; - } - if( seek_read( infd, header.data, File_header::size, - pos - member_size ) != File_header::size ) - { set_errno_error( "Error reading member header: " ); break; } - const unsigned dictionary_size = header.dictionary_size(); - if( !header.verify_magic() || !header.verify_version() || - !isvalid_ds( dictionary_size ) ) - { - if( member_vector.empty() ) - { if( skip_trailing_data( infd, pos, ignore_trailing, loose_trailing ) ) - continue; else return; } - set_num_error( "Bad header at pos ", pos - member_size ); - break; - } - pos -= member_size; - member_vector.push_back( Member( 0, trailer.data_size(), pos, - member_size, dictionary_size ) ); - } - if( pos != 0 || member_vector.empty() ) - { - member_vector.clear(); - if( retval_ == 0 ) { error_ = "Can't create file index."; retval_ = 2; } - return; - } - std::reverse( member_vector.begin(), member_vector.end() ); - for( unsigned long i = 0; i < member_vector.size() - 1; ++i ) - { - const long long end = member_vector[i].dblock.end(); - if( end < 0 || end > INT64_MAX ) - { - member_vector.clear(); - error_ = "Data in input file is too long (2^63 bytes or more)."; - retval_ = 2; return; - } - member_vector[i+1].dblock.pos( end ); - } - } diff --git a/file_index.h b/file_index.h deleted file mode 100644 index 7962b99..0000000 --- a/file_index.h +++ /dev/null @@ -1,87 +0,0 @@ -/* Plzip - Parallel compressor compatible with lzip - Copyright (C) 2009-2018 Antonio Diaz Diaz. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef INT64_MAX -#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL -#endif - - -class Block - { - long long pos_, size_; // pos + size <= INT64_MAX - -public: - Block( const long long p, const long long s ) : pos_( p ), size_( s ) {} - - long long pos() const { return pos_; } - long long size() const { return size_; } - long long end() const { return pos_ + size_; } - - void pos( const long long p ) { pos_ = p; } - void size( const long long s ) { size_ = s; } - }; - - -class File_index - { - struct Member - { - Block dblock, mblock; // data block, member block - unsigned dictionary_size; - - Member( const long long dp, const long long ds, - const long long mp, const long long ms, const unsigned dict_size ) - : dblock( dp, ds ), mblock( mp, ms ), dictionary_size( dict_size ) {} - }; - - std::vector< Member > member_vector; - std::string error_; - const long long isize; - int retval_; - - void set_errno_error( const char * const msg ); - void set_num_error( const char * const msg, unsigned long long num ); - bool skip_trailing_data( const int fd, long long & pos, - const bool ignore_trailing, const bool loose_trailing ); - -public: - File_index( const int infd, const bool ignore_trailing, - const bool loose_trailing ); - - long members() const { return member_vector.size(); } - const std::string & error() const { return error_; } - int retval() const { return retval_; } - - long long udata_size() const - { if( member_vector.empty() ) return 0; - return member_vector.back().dblock.end(); } - - long long cdata_size() const - { if( member_vector.empty() ) return 0; - return member_vector.back().mblock.end(); } - - // total size including trailing data (if any) - long long file_size() const - { if( isize >= 0 ) return isize; else return 0; } - - const Block & dblock( const long i ) const - { return 
member_vector[i].dblock; } - const Block & mblock( const long i ) const - { return member_vector[i].mblock; } - unsigned dictionary_size( const long i ) const - { return member_vector[i].dictionary_size; } - }; diff --git a/list.cc b/list.cc index eeef1c3..d3e4908 100644 --- a/list.cc +++ b/list.cc @@ -1,5 +1,5 @@ -/* Plzip - Parallel compressor compatible with lzip - Copyright (C) 2009-2018 Antonio Diaz Diaz. +/* Plzip - Massively parallel implementation of lzip + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ #include #include "lzip.h" -#include "file_index.h" +#include "lzip_index.h" namespace { @@ -66,18 +66,18 @@ int list_files( const std::vector< std::string > & filenames, open_instream( input_filename, &in_stats, true, true ); if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } - const File_index file_index( infd, ignore_trailing, loose_trailing ); + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing ); close( infd ); - if( file_index.retval() != 0 ) + if( lzip_index.retval() != 0 ) { - show_file_error( input_filename, file_index.error().c_str() ); - if( retval < file_index.retval() ) retval = file_index.retval(); + show_file_error( input_filename, lzip_index.error().c_str() ); + if( retval < lzip_index.retval() ) retval = lzip_index.retval(); continue; } if( verbosity >= 0 ) { - const unsigned long long udata_size = file_index.udata_size(); - const unsigned long long cdata_size = file_index.cdata_size(); + const unsigned long long udata_size = lzip_index.udata_size(); + const unsigned long long cdata_size = lzip_index.cdata_size(); total_comp += cdata_size; total_uncomp += udata_size; ++files; if( first_post ) { @@ -88,22 +88,22 @@ int list_files( const std::vector< std::string > & filenames, if( verbosity >= 1 ) { unsigned dictionary_size = 0; - for( long i = 0; i < 
file_index.members(); ++i ) + for( long i = 0; i < lzip_index.members(); ++i ) dictionary_size = - std::max( dictionary_size, file_index.dictionary_size( i ) ); - const long long trailing_size = file_index.file_size() - cdata_size; + std::max( dictionary_size, lzip_index.dictionary_size( i ) ); + const long long trailing_size = lzip_index.file_size() - cdata_size; std::printf( "%s %5ld %6lld ", format_ds( dictionary_size ), - file_index.members(), trailing_size ); + lzip_index.members(), trailing_size ); } list_line( udata_size, cdata_size, input_filename ); - if( verbosity >= 2 && file_index.members() > 1 ) + if( verbosity >= 2 && lzip_index.members() > 1 ) { std::fputs( " member data_pos data_size member_pos member_size\n", stdout ); - for( long i = 0; i < file_index.members(); ++i ) + for( long i = 0; i < lzip_index.members(); ++i ) { - const Block & db = file_index.dblock( i ); - const Block & mb = file_index.mblock( i ); + const Block & db = lzip_index.dblock( i ); + const Block & mb = lzip_index.mblock( i ); std::printf( "%5ld %15llu %15llu %15llu %15llu\n", i + 1, db.pos(), db.size(), mb.pos(), mb.size() ); } diff --git a/lzip.h b/lzip.h index 3587a8f..dfbf4f7 100644 --- a/lzip.h +++ b/lzip.h @@ -1,5 +1,5 @@ -/* Plzip - Parallel compressor compatible with lzip - Copyright (C) 2009-2018 Antonio Diaz Diaz. +/* Plzip - Massively parallel implementation of lzip + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -48,7 +48,7 @@ public: { const std::string & s = filenames[i]; const unsigned len = ( s == "-" ) ? 
stdin_name_len : s.size(); - if( len > longest_name ) longest_name = len; + if( longest_name < len ) longest_name = len; } if( longest_name == 0 ) longest_name = stdin_name_len; } @@ -58,7 +58,7 @@ public: if( filename.size() && filename != "-" ) name_ = filename; else name_ = stdin_name; padded_name = " "; padded_name += name_; padded_name += ": "; - if( name_.size() < longest_name ) + if( longest_name > name_.size() ) padded_name.append( longest_name - name_.size(), ' ' ); first_post = true; } @@ -82,30 +82,30 @@ inline int real_bits( unsigned value ) } -const uint8_t magic_string[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" +const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" -struct File_header +struct Lzip_header { uint8_t data[6]; // 0-3 magic bytes // 4 version // 5 coded_dict_size enum { size = 6 }; - void set_magic() { std::memcpy( data, magic_string, 4 ); data[4] = 1; } + void set_magic() { std::memcpy( data, lzip_magic, 4 ); data[4] = 1; } bool verify_magic() const - { return ( std::memcmp( data, magic_string, 4 ) == 0 ); } + { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); } bool verify_prefix( const int sz ) const // detect (truncated) header { for( int i = 0; i < sz && i < 4; ++i ) - if( data[i] != magic_string[i] ) return false; + if( data[i] != lzip_magic[i] ) return false; return ( sz > 0 ); } bool verify_corrupt() const // detect corrupt header { int matches = 0; for( int i = 0; i < 4; ++i ) - if( data[i] == magic_string[i] ) ++matches; + if( data[i] == lzip_magic[i] ) ++matches; return ( matches > 1 && matches < 4 ); } @@ -137,12 +137,11 @@ struct File_header }; -struct File_trailer +struct Lzip_trailer { uint8_t data[20]; // 0-3 CRC32 of the uncompressed data // 4-11 size of the uncompressed data // 12-19 member size including header and trailer - enum { size = 20 }; unsigned data_crc() const @@ -174,6 +173,20 @@ struct File_trailer void member_size( unsigned long long sz ) { for( int i = 12; i <= 19; ++i ) { data[i] = 
(uint8_t)sz; sz >>= 8; } } + + bool verify_consistency() const // check internal consistency + { + const unsigned crc = data_crc(); + const unsigned long long dsize = data_size(); + if( ( crc == 0 ) != ( dsize == 0 ) ) return false; + const unsigned long long msize = member_size(); + if( msize < min_member_size ) return false; + const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size; + if( mlimit > dsize && msize > mlimit ) return false; + const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1; + if( dlimit > msize && dsize > dlimit ) return false; + return true; + } }; @@ -200,18 +213,19 @@ int compress( const unsigned long long cfile_size, const int infd, const int outfd, const Pretty_print & pp, const int debug_level ); -// defined in file_index.cc -class File_index; +// defined in lzip_index.cc +class Lzip_index; // defined in dec_stdout.cc int dec_stdout( const int num_workers, const int infd, const int outfd, const Pretty_print & pp, const int debug_level, - const File_index & file_index ); + const int out_slots, const Lzip_index & lzip_index ); // defined in dec_stream.cc int dec_stream( const unsigned long long cfile_size, const int num_workers, const int infd, const int outfd, const Pretty_print & pp, const int debug_level, + const int in_slots, const int out_slots, const bool ignore_trailing, const bool loose_trailing ); // defined in decompress.cc @@ -221,7 +235,8 @@ int decompress_read_error( struct LZ_Decoder * const decoder, const Pretty_print & pp, const int worker_id ); int decompress( const unsigned long long cfile_size, int num_workers, const int infd, const int outfd, const Pretty_print & pp, - const int debug_level, const bool ignore_trailing, + const int debug_level, const int in_slots, + const int out_slots, const bool ignore_trailing, const bool loose_trailing, const bool infd_isreg ); // defined in list.cc diff --git a/lzip_index.cc b/lzip_index.cc new file mode 100644 index 0000000..d9c810c --- /dev/null +++ 
b/lzip_index.cc @@ -0,0 +1,197 @@ +/* Plzip - Massively parallel implementation of lzip + Copyright (C) 2009-2019 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lzip.h" +#include "lzip_index.h" + + +namespace { + +int seek_read( const int fd, uint8_t * const buf, const int size, + const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return readblock( fd, buf, size ); + return 0; + } + +} // end namespace + + +void Lzip_index::set_errno_error( const char * const msg ) + { + error_ = msg; error_ += std::strerror( errno ); + retval_ = 1; + } + +void Lzip_index::set_num_error( const char * const msg, unsigned long long num ) + { + char buf[80]; + snprintf( buf, sizeof buf, "%s%llu", msg, num ); + error_ = buf; + retval_ = 2; + } + + +// If successful, push last member and set pos to member header. 
+bool Lzip_index::skip_trailing_data( const int fd, long long & pos, + const bool ignore_trailing, const bool loose_trailing ) + { + enum { block_size = 16384, + buffer_size = block_size + Lzip_trailer::size - 1 + Lzip_header::size }; + uint8_t buffer[buffer_size]; + if( pos < min_member_size ) return false; + int bsize = pos % block_size; // total bytes in buffer + if( bsize <= buffer_size - block_size ) bsize += block_size; + int search_size = bsize; // bytes to search for trailer + int rd_size = bsize; // bytes to read from file + unsigned long long ipos = pos - rd_size; // aligned to block_size + + while( true ) + { + if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) + { set_errno_error( "Error seeking member trailer: " ); return false; } + const uint8_t max_msb = ( ipos + search_size ) >> 56; + for( int i = search_size; i >= Lzip_trailer::size; --i ) + if( buffer[i-1] <= max_msb ) // most significant byte of member_size + { + const Lzip_trailer & trailer = + *(const Lzip_trailer *)( buffer + i - Lzip_trailer::size ); + const unsigned long long member_size = trailer.member_size(); + if( member_size == 0 ) // skip trailing zeros + { while( i > Lzip_trailer::size && buffer[i-9] == 0 ) --i; continue; } + if( member_size > ipos + i || !trailer.verify_consistency() ) + continue; + Lzip_header header; + if( seek_read( fd, header.data, Lzip_header::size, + ipos + i - member_size ) != Lzip_header::size ) + { set_errno_error( "Error reading member header: " ); return false; } + const unsigned dictionary_size = header.dictionary_size(); + if( !header.verify_magic() || !header.verify_version() || + !isvalid_ds( dictionary_size ) ) continue; + if( (*(const Lzip_header *)( buffer + i )).verify_prefix( bsize - i ) ) + { error_ = "Last member in input file is truncated or corrupt."; + retval_ = 2; return false; } + if( !loose_trailing && bsize - i >= Lzip_header::size && + (*(const Lzip_header *)( buffer + i )).verify_corrupt() ) + { error_ = corrupt_mm_msg; retval_ = 
2; return false; } + if( !ignore_trailing ) + { error_ = trailing_msg; retval_ = 2; return false; } + pos = ipos + i - member_size; + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + return true; + } + if( ipos <= 0 ) + { set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); + return false; } + bsize = buffer_size; + search_size = bsize - Lzip_header::size; + rd_size = block_size; + ipos -= rd_size; + std::memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); + } + } + + +Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, + const bool loose_trailing ) + : insize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) + { + if( insize < 0 ) + { set_errno_error( "Input file is not seekable: " ); return; } + if( insize < min_member_size ) + { error_ = "Input file is too short."; retval_ = 2; return; } + if( insize > INT64_MAX ) + { error_ = "Input file is too long (2^63 bytes or more)."; + retval_ = 2; return; } + + Lzip_header header; + if( seek_read( infd, header.data, Lzip_header::size, 0 ) != Lzip_header::size ) + { set_errno_error( "Error reading member header: " ); return; } + if( !header.verify_magic() ) + { error_ = bad_magic_msg; retval_ = 2; return; } + if( !header.verify_version() ) + { error_ = bad_version( header.version() ); retval_ = 2; return; } + if( !isvalid_ds( header.dictionary_size() ) ) + { error_ = bad_dict_msg; retval_ = 2; return; } + + long long pos = insize; // always points to a header or to EOF + while( pos >= min_member_size ) + { + Lzip_trailer trailer; + if( seek_read( infd, trailer.data, Lzip_trailer::size, + pos - Lzip_trailer::size ) != Lzip_trailer::size ) + { set_errno_error( "Error reading member trailer: " ); break; } + const unsigned long long member_size = trailer.member_size(); + if( member_size > (unsigned long long)pos || !trailer.verify_consistency() ) + { + if( member_vector.empty() ) + { if( skip_trailing_data( infd, pos, ignore_trailing, 
loose_trailing ) ) + continue; else return; } + set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); + break; + } + if( seek_read( infd, header.data, Lzip_header::size, + pos - member_size ) != Lzip_header::size ) + { set_errno_error( "Error reading member header: " ); break; } + const unsigned dictionary_size = header.dictionary_size(); + if( !header.verify_magic() || !header.verify_version() || + !isvalid_ds( dictionary_size ) ) + { + if( member_vector.empty() ) + { if( skip_trailing_data( infd, pos, ignore_trailing, loose_trailing ) ) + continue; else return; } + set_num_error( "Bad header at pos ", pos - member_size ); + break; + } + pos -= member_size; + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + } + if( pos != 0 || member_vector.empty() ) + { + member_vector.clear(); + if( retval_ == 0 ) { error_ = "Can't create file index."; retval_ = 2; } + return; + } + std::reverse( member_vector.begin(), member_vector.end() ); + for( unsigned long i = 0; ; ++i ) + { + const long long end = member_vector[i].dblock.end(); + if( end < 0 || end > INT64_MAX ) + { + member_vector.clear(); + error_ = "Data in input file is too long (2^63 bytes or more)."; + retval_ = 2; return; + } + if( i + 1 >= member_vector.size() ) break; + member_vector[i+1].dblock.pos( end ); + } + } diff --git a/lzip_index.h b/lzip_index.h new file mode 100644 index 0000000..3775446 --- /dev/null +++ b/lzip_index.h @@ -0,0 +1,87 @@ +/* Plzip - Massively parallel implementation of lzip + Copyright (C) 2009-2019 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef INT64_MAX +#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL +#endif + + +class Block + { + long long pos_, size_; // pos + size <= INT64_MAX + +public: + Block( const long long p, const long long s ) : pos_( p ), size_( s ) {} + + long long pos() const { return pos_; } + long long size() const { return size_; } + long long end() const { return pos_ + size_; } + + void pos( const long long p ) { pos_ = p; } + void size( const long long s ) { size_ = s; } + }; + + +class Lzip_index + { + struct Member + { + Block dblock, mblock; // data block, member block + unsigned dictionary_size; + + Member( const long long dp, const long long ds, + const long long mp, const long long ms, const unsigned dict_size ) + : dblock( dp, ds ), mblock( mp, ms ), dictionary_size( dict_size ) {} + }; + + std::vector< Member > member_vector; + std::string error_; + const long long insize; + int retval_; + + void set_errno_error( const char * const msg ); + void set_num_error( const char * const msg, unsigned long long num ); + bool skip_trailing_data( const int fd, long long & pos, + const bool ignore_trailing, const bool loose_trailing ); + +public: + Lzip_index( const int infd, const bool ignore_trailing, + const bool loose_trailing ); + + long members() const { return member_vector.size(); } + const std::string & error() const { return error_; } + int retval() const { return retval_; } + + long long udata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().dblock.end(); } + + long long cdata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().mblock.end(); } + + // total 
size including trailing data (if any) + long long file_size() const + { if( insize >= 0 ) return insize; else return 0; } + + const Block & dblock( const long i ) const + { return member_vector[i].dblock; } + const Block & mblock( const long i ) const + { return member_vector[i].mblock; } + unsigned dictionary_size( const long i ) const + { return member_vector[i].dictionary_size; } + }; diff --git a/main.cc b/main.cc index d1f76bc..5eab9f9 100644 --- a/main.cc +++ b/main.cc @@ -1,6 +1,6 @@ -/* Plzip - Parallel compressor compatible with lzip +/* Plzip - Massively parallel implementation of lzip Copyright (C) 2009 Laszlo Ersek. - Copyright (C) 2009-2018 Antonio Diaz Diaz. + Copyright (C) 2009-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,20 +40,21 @@ #include #include #include -#if defined(__MSVCRT__) +#if defined(__MSVCRT__) || defined(__OS2__) #include +#if defined(__MSVCRT__) #define fchmod(x,y) 0 #define fchown(x,y,z) 0 #define strtoull std::strtoul #define SIGHUP SIGTERM #define S_ISSOCK(x) 0 +#ifndef S_IRGRP #define S_IRGRP 0 #define S_IWGRP 0 #define S_IROTH 0 #define S_IWOTH 0 #endif -#if defined(__OS2__) -#include +#endif #endif #include "arg_parser.h" @@ -71,9 +72,8 @@ int verbosity = 0; namespace { -const char * const Program_name = "Plzip"; const char * const program_name = "plzip"; -const char * const program_year = "2018"; +const char * const program_year = "2019"; const char * invocation_name = 0; const struct { const char * from; const char * to; } known_extensions[] = { @@ -89,6 +89,8 @@ struct Lzma_options enum Mode { m_compress, m_decompress, m_list, m_test }; +/* Variables used in signal handler context. + They are not declared volatile because the handler never returns. 
*/ std::string output_filename; int outfd = -1; bool delete_output_on_interrupt = false; @@ -96,8 +98,22 @@ bool delete_output_on_interrupt = false; void show_help( const long num_online ) { - std::printf( "%s - Parallel compressor compatible with lzip.\n", Program_name ); - std::printf( "\nUsage: %s [options] [files]\n", invocation_name ); + std::printf( "Plzip is a massively parallel (multi-threaded) implementation of lzip, fully\n" + "compatible with lzip 1.4 or newer. Plzip uses the lzlib compression library.\n" + "\nLzip is a lossless data compressor with a user interface similar to the\n" + "one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0)\n" + "or compress most files more than bzip2 (lzip -9). Decompression speed is\n" + "intermediate between gzip and bzip2. Lzip is better than gzip and bzip2\n" + "from a data recovery perspective. Lzip has been designed, written and\n" + "tested with great care to replace gzip and bzip2 as the standard\n" + "general-purpose compressed format for unix-like systems.\n" + "\nPlzip can compress/decompress large files on multiprocessor machines\n" + "much faster than lzip, at the cost of a slightly reduced compression\n" + "ratio (0.4 to 2 percent larger compressed files). 
Note that the number\n" + "of usable threads is limited by file size; on files larger than a few GB\n" + "plzip can use hundreds of processors, but on files of only a few MB\n" + "plzip is no faster than lzip.\n" + "\nUsage: %s [options] [files]\n", invocation_name ); std::printf( "\nOptions:\n" " -h, --help display this help and exit\n" " -V, --version output version information and exit\n" @@ -120,6 +136,8 @@ void show_help( const long num_online ) " --fast alias for -0\n" " --best alias for -9\n" " --loose-trailing allow trailing data seeming corrupt header\n" + " --in-slots= number of 1 MiB input packets buffered [4]\n" + " --out-slots= number of 1 MiB output packets buffered [64]\n" , num_online ); if( verbosity >= 1 ) { @@ -263,7 +281,7 @@ int get_dict_size( const char * const arg ) const long bits = std::strtol( arg, &tail, 0 ); if( bits >= LZ_min_dictionary_bits() && bits <= LZ_max_dictionary_bits() && *tail == 0 ) - return ( 1 << bits ); + return 1 << bits; int dictionary_size = getnum( arg, LZ_min_dictionary_size(), LZ_max_dictionary_size() ); if( dictionary_size == 65535 ) ++dictionary_size; // no fast encoder @@ -409,6 +427,14 @@ bool check_tty( const char * const input_filename, const int infd, return true; } + +void set_signals( void (*action)(int) ) + { + std::signal( SIGHUP, action ); + std::signal( SIGINT, action ); + std::signal( SIGTERM, action ); + } + } // end namespace // This can be called from any thread, main thread or sub-threads alike, @@ -420,6 +446,7 @@ void cleanup_and_fail( const int retval ) // only one thread can delete and exit static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + set_signals( SIG_IGN ); // ignore signals pthread_mutex_lock( &mutex ); // ignore errors to avoid loop const int saved_verbosity = verbosity; verbosity = -1; // suppress messages from other threads @@ -440,6 +467,13 @@ void cleanup_and_fail( const int retval ) namespace { +extern "C" void signal_handler( int ) + { + show_error( "Control-C or similar 
caught, quitting." ); + cleanup_and_fail( 1 ); + } + + // Set permissions, owner and times. void close_and_set_permissions( const struct stat * const in_statsp ) { @@ -473,21 +507,6 @@ void close_and_set_permissions( const struct stat * const in_statsp ) show_error( "Can't change output file attributes." ); } - -extern "C" void signal_handler( int ) - { - show_error( "Control-C or similar caught, quitting." ); - cleanup_and_fail( 1 ); - } - - -void set_signals() - { - std::signal( SIGHUP, signal_handler ); - std::signal( SIGINT, signal_handler ); - std::signal( SIGTERM, signal_handler ); - } - } // end namespace @@ -495,11 +514,9 @@ void show_error( const char * const msg, const int errcode, const bool help ) { if( verbosity < 0 ) return; if( msg && msg[0] ) - { - std::fprintf( stderr, "%s: %s", program_name, msg ); - if( errcode > 0 ) std::fprintf( stderr, ": %s", std::strerror( errcode ) ); - std::fputc( '\n', stderr ); - } + std::fprintf( stderr, "%s: %s%s%s\n", program_name, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? std::strerror( errcode ) : "" ); if( help ) std::fprintf( stderr, "Try '%s --help' for more information.\n", invocation_name ); @@ -509,10 +526,10 @@ void show_error( const char * const msg, const int errcode, const bool help ) void show_file_error( const char * const filename, const char * const msg, const int errcode ) { - if( verbosity < 0 ) return; - std::fprintf( stderr, "%s: %s: %s", program_name, filename, msg ); - if( errcode > 0 ) std::fprintf( stderr, ": %s", std::strerror( errcode ) ); - std::fputc( '\n', stderr ); + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s: %s%s%s\n", program_name, filename, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? 
std::strerror( errcode ) : "" ); } @@ -554,6 +571,26 @@ void show_progress( const unsigned long long packet_size, } +#if defined(__MSVCRT__) +#include +#define _SC_NPROCESSORS_ONLN 1 +#define _SC_THREAD_THREADS_MAX 2 + +long sysconf( int flag ) + { + if( flag == _SC_NPROCESSORS_ONLN ) + { + SYSTEM_INFO si; + GetSystemInfo( &si ); + return si.dwNumberOfProcessors; + } + if( flag != _SC_THREAD_THREADS_MAX ) errno = EINVAL; + return -1; // unlimited threads or error + } + +#endif // __MSVCRT__ + + int main( const int argc, const char * const argv[] ) { /* Mapping from gzip/bzip2 style 1..9 compression modes @@ -576,6 +613,8 @@ int main( const int argc, const char * const argv[] ) int data_size = 0; int debug_level = 0; int num_workers = 0; // start this many worker threads + int in_slots = 4; + int out_slots = 64; Mode program_mode = m_compress; bool force = false; bool ignore_trailing = true; @@ -589,7 +628,7 @@ int main( const int argc, const char * const argv[] ) { show_error( "Bad library version. At least lzlib 1.0 is required." 
); return 1; } - enum { opt_dbg = 256, opt_lt }; + enum { opt_dbg = 256, opt_in, opt_lt, opt_out }; const Arg_parser::Option options[] = { { '0', "fast", Arg_parser::no }, @@ -622,7 +661,9 @@ int main( const int argc, const char * const argv[] ) { 'v', "verbose", Arg_parser::no }, { 'V', "version", Arg_parser::no }, { opt_dbg, "debug", Arg_parser::yes }, + { opt_in, "in-slots", Arg_parser::yes }, { opt_lt, "loose-trailing", Arg_parser::no }, + { opt_out, "out-slots", Arg_parser::yes }, { 0 , 0, Arg_parser::no } }; const Arg_parser parser( argc, argv, options ); @@ -670,7 +711,9 @@ int main( const int argc, const char * const argv[] ) case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; case opt_dbg: debug_level = getnum( arg, 0, 3 ); break; + case opt_in: in_slots = getnum( arg, 1, 64 ); break; case opt_lt: loose_trailing = true; break; + case opt_out: out_slots = getnum( arg, 1, 1024 ); break; default : internal_error( "uncaught option." ); } } // end process options @@ -707,8 +750,9 @@ int main( const int argc, const char * const argv[] ) if( num_workers <= 0 ) { - if( sizeof (void *) <= 4 ) // use less than 2.22 GiB on 32 bit systems + if( program_mode == m_compress && sizeof (void *) <= 4 ) { + // use less than 2.22 GiB on 32 bit systems const long long limit = ( 27LL << 25 ) + ( 11LL << 27 ); // 4 * 568 MiB const long long mem = ( 27LL * data_size ) / 8 + ( fast ? 
3LL << 19 : 11LL * encoder_options.dictionary_size ); @@ -720,10 +764,11 @@ int main( const int argc, const char * const argv[] ) if( !to_stdout && program_mode != m_test && ( filenames_given || default_output_filename.size() ) ) - set_signals(); + set_signals( signal_handler ); Pretty_print pp( filenames ); + int failed_tests = 0; int retval = 0; bool stdin_used = false; for( unsigned i = 0; i < filenames.size(); ++i ) @@ -798,15 +843,23 @@ int main( const int argc, const char * const argv[] ) num_workers, infd, outfd, pp, debug_level ); else tmp = decompress( cfile_size, num_workers, infd, outfd, pp, debug_level, - ignore_trailing, loose_trailing, infd_isreg ); + in_slots, out_slots, ignore_trailing, loose_trailing, + infd_isreg ); + if( close( infd ) != 0 ) + { + show_error( input_filename.size() ? "Error closing input file" : + "Error closing stdin", errno ); + if( tmp < 1 ) tmp = 1; + } if( tmp > retval ) retval = tmp; - if( tmp && program_mode != m_test ) cleanup_and_fail( retval ); + if( tmp ) + { if( program_mode != m_test ) cleanup_and_fail( retval ); + else ++failed_tests; } if( delete_output_on_interrupt ) close_and_set_permissions( in_statsp ); if( input_filename.size() ) { - close( infd ); if( !keep_input_files && !to_stdout && program_mode != m_test ) std::remove( input_filename.c_str() ); } @@ -816,5 +869,9 @@ int main( const int argc, const char * const argv[] ) show_error( "Error closing stdout", errno ); if( retval < 1 ) retval = 1; } + if( failed_tests > 0 && verbosity >= 1 && filenames.size() > 1 ) + std::fprintf( stderr, "%s: warning: %d %s failed the test.\n", + program_name, failed_tests, + ( failed_tests == 1 ) ? "file" : "files" ); return retval; } diff --git a/testsuite/check.sh b/testsuite/check.sh index a4113c3..59fe3f5 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh -# check script for Plzip - Parallel compressor compatible with lzip -# Copyright (C) 2009-2018 Antonio Diaz Diaz. 
+# check script for Plzip - Massively parallel implementation of lzip +# Copyright (C) 2009-2019 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -31,17 +31,28 @@ cd "${objdir}"/tmp || framework_failure cat "${testdir}"/test.txt > in || framework_failure in_lz="${testdir}"/test.txt.lz fail=0 +lwarn8=0 +lwarn10=0 test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } +lzlib_1_8() { [ ${lwarn8} = 0 ] && + printf "\nwarning: header truncation detection requires lzlib 1.8 or newer" + lwarn8=1 ; } +lzlib_1_10() { [ ${lwarn10} = 0 ] && + printf "\nwarning: header HD=3 detection requires lzlib 1.10 or newer" + lwarn10=1 ; } printf "testing plzip-%s..." "$2" "${LZIP}" -fkqm4 in -{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! -e in.lz ] || test_failed $LINENO "${LZIP}" -fkqm274 in -{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! -e in.lz ] || test_failed $LINENO for i in bad_size -1 0 4095 513MiB 1G 1T 1P 1E 1Z 1Y 10KB ; do "${LZIP}" -fkqs $i in - { [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO $i + [ $? = 1 ] || test_failed $LINENO $i + [ ! -e in.lz ] || test_failed $LINENO $i done "${LZIP}" -lq in [ $? = 2 ] || test_failed $LINENO @@ -91,26 +102,34 @@ printf "\ntesting decompression..." "${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f copy +rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure "${LZIP}" -dk copy.lz || test_failed $LINENO cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure "${LZIP}" -d copy.lz 2> /dev/null [ $? = 1 ] || test_failed $LINENO -"${LZIP}" -df copy.lz -{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO +"${LZIP}" -df copy.lz || test_failed $LINENO +[ ! 
-e copy.lz ] || test_failed $LINENO +cmp in copy || test_failed $LINENO + +rm -f copy || framework_failure +cat "${in_lz}" > copy.lz || framework_failure +"${LZIP}" -d -S100k copy.lz || test_failed $LINENO # ignore -S +[ ! -e copy.lz ] || test_failed $LINENO +cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure "${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f copy +rm -f copy || framework_failure "${LZIP}" < in > anyothername || test_failed $LINENO -"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null -{ [ $? = 0 ] && cmp in copy && cmp in anyothername.out ; } || +"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null || test_failed $LINENO -rm -f copy anyothername.out +cmp in copy || test_failed $LINENO +cmp in anyothername.out || test_failed $LINENO +rm -f copy anyothername.out || framework_failure "${LZIP}" -lq in "${in_lz}" [ $? = 2 ] || test_failed $LINENO @@ -121,10 +140,12 @@ rm -f copy anyothername.out "${LZIP}" -tq nx_file.lz "${in_lz}" [ $? = 1 ] || test_failed $LINENO "${LZIP}" -cdq in "${in_lz}" > copy -{ [ $? = 2 ] && cat copy in | cmp in - ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +cat copy in | cmp in - || test_failed $LINENO "${LZIP}" -cdq nx_file.lz "${in_lz}" > copy -{ [ $? = 1 ] && cmp in copy ; } || test_failed $LINENO -rm -f copy +[ $? = 1 ] || test_failed $LINENO +cmp in copy || test_failed $LINENO +rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure for i in 1 2 3 4 5 6 7 ; do printf "g" >> copy.lz || framework_failure @@ -134,11 +155,15 @@ for i in 1 2 3 4 5 6 7 ; do [ $? = 2 ] || test_failed $LINENO $i done "${LZIP}" -dq in copy.lz -{ [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; } || - test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ -e copy.lz ] || test_failed $LINENO +[ ! -e copy ] || test_failed $LINENO +[ ! 
-e in.out ] || test_failed $LINENO "${LZIP}" -dq nx_file.lz copy.lz -{ [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e nx_file ] && cmp in copy ; } || - test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +[ ! -e nx_file ] || test_failed $LINENO +cmp in copy || test_failed $LINENO cat in in > in2 || framework_failure cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure @@ -155,7 +180,7 @@ cmp in2 copy2 || test_failed $LINENO printf "\ngarbage" >> copy2.lz || framework_failure "${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO -rm -f copy2 +rm -f copy2 || framework_failure "${LZIP}" -alq copy2.lz [ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq copy2.lz @@ -163,12 +188,15 @@ rm -f copy2 "${LZIP}" -atq < copy2.lz [ $? = 2 ] || test_failed $LINENO "${LZIP}" -adkq copy2.lz -{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO "${LZIP}" -adkq -o copy2 < copy2.lz -{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO printf "to be overwritten" > copy2 || framework_failure "${LZIP}" -df copy2.lz || test_failed $LINENO cmp in2 copy2 || test_failed $LINENO +rm -f in2 copy2 || framework_failure printf "\ntesting compression..." 
@@ -204,24 +232,30 @@ for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do "${LZIP}" -df -o copy < out.lz || test_failed $LINENO $i cmp in copy || test_failed $LINENO $i done +rm -f out.lz || framework_failure cat in in in in > in4 || framework_failure for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ; do + "${LZIP}" -s4Ki -B8Ki -n$i < in4 > out4.lz || test_failed $LINENO $i + printf "g" >> out4.lz || framework_failure + "${LZIP}" -d -n$i < out4.lz > out4 || test_failed $LINENO $i + cmp in4 out4 || test_failed $LINENO $i + "${LZIP}" -d --in-slots=$i < out4.lz > out4 || test_failed $LINENO $i + cmp in4 out4 || test_failed $LINENO $i + "${LZIP}" -d --out-slots=$i < out4.lz > out4 || test_failed $LINENO $i + cmp in4 out4 || test_failed $LINENO $i + "${LZIP}" -c -s4Ki -B8Ki -n$i in4 > out4.lz || test_failed $LINENO $i printf "g" >> out4.lz || framework_failure - "${LZIP}" -cd -n$i out4.lz > copy4 || test_failed $LINENO $i - cmp in4 copy4 || test_failed $LINENO $i + "${LZIP}" -cd -n$i out4.lz > out4 || test_failed $LINENO $i + cmp in4 out4 || test_failed $LINENO $i + "${LZIP}" -cd --out-slots=$i out4.lz > out4 || test_failed $LINENO $i + cmp in4 out4 || test_failed $LINENO $i + rm -f out4 || framework_failure "${LZIP}" -d -n$i out4.lz || test_failed $LINENO $i cmp in4 out4 || test_failed $LINENO $i - rm -f out4 -done - -for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ; do - "${LZIP}" -s4Ki -B8Ki -n$i < in4 > out4 || test_failed $LINENO $i - printf "g" >> out4 || framework_failure - "${LZIP}" -d -n$i < out4 > copy4 || test_failed $LINENO $i - cmp in4 copy4 || test_failed $LINENO $i done +rm -f out4 || framework_failure cat in in in in in in in in | "${LZIP}" -1s4Ki | "${LZIP}" -t || test_failed $LINENO @@ -230,58 +264,58 @@ printf "\ntesting bad input..." 
headers='LZIp LZiP LZip LzIP LzIp LziP lZIP lZIp lZiP lzIP' body='\001\014\000\203\377\373\377\377\300\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000$\000\000\000\000\000\000\000' -cat "${in_lz}" > in0.lz -printf "LZIP${body}" >> in0.lz -if "${LZIP}" -tq in0.lz ; then +cat "${in_lz}" > int.lz +printf "LZIP${body}" >> int.lz +if "${LZIP}" -tq int.lz ; then for header in ${headers} ; do - printf "${header}${body}" > in0.lz # first member - "${LZIP}" -lq in0.lz + printf "${header}${body}" > int.lz # first member + "${LZIP}" -lq int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq in0.lz + "${LZIP}" -tq int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq < in0.lz + "${LZIP}" -tq < int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -cdq in0.lz > /dev/null + "${LZIP}" -cdq int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing in0.lz + "${LZIP}" -lq --loose-trailing int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing in0.lz + "${LZIP}" -tq --loose-trailing int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing < in0.lz + "${LZIP}" -tq --loose-trailing < int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -cdq --loose-trailing in0.lz > /dev/null + "${LZIP}" -cdq --loose-trailing int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} - cat "${in_lz}" > in0.lz - printf "${header}${body}" >> in0.lz # trailing data - "${LZIP}" -lq in0.lz + cat "${in_lz}" > int.lz + printf "${header}${body}" >> int.lz # trailing data + "${LZIP}" -lq int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq in0.lz + "${LZIP}" -tq int.lz [ $? = 2 ] || test_failed $LINENO ${header} -# "${LZIP}" -tq < in0.lz # requires lzlib-1.10 -# [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -cdq in0.lz > /dev/null + "${LZIP}" -tq < int.lz + [ $? 
= 2 ] || lzlib_1_10 # requires lzlib 1.10 + "${LZIP}" -cdq int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing in0.lz - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -t --loose-trailing in0.lz - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -t --loose-trailing < in0.lz - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -cd --loose-trailing in0.lz > /dev/null - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing --trailing-error in0.lz + "${LZIP}" -lq --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing < int.lz || + test_failed $LINENO ${header} + "${LZIP}" -cd --loose-trailing int.lz > /dev/null || + test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing --trailing-error int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing --trailing-error in0.lz + "${LZIP}" -tq --loose-trailing --trailing-error int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing --trailing-error < in0.lz + "${LZIP}" -tq --loose-trailing --trailing-error < int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -cdq --loose-trailing --trailing-error in0.lz > /dev/null + "${LZIP}" -cdq --loose-trailing --trailing-error int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} done else printf "\nwarning: skipping header test: 'printf' does not work on your system." fi -rm -f in0.lz +rm -f int.lz || framework_failure cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && @@ -293,16 +327,16 @@ if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && "${LZIP}" -tq trunc.lz [ $? = 2 ] || test_failed $LINENO $i "${LZIP}" -tq < trunc.lz - [ $? = 2 ] || test_failed $LINENO $i + [ $? 
= 2 ] || lzlib_1_8 # requires lzlib 1.8 "${LZIP}" -cdq trunc.lz > out [ $? = 2 ] || test_failed $LINENO $i "${LZIP}" -dq < trunc.lz > out - [ $? = 2 ] || test_failed $LINENO $i + [ $? = 2 ] || lzlib_1_8 # requires lzlib 1.8 done else printf "\nwarning: skipping truncation test: 'dd' does not work on your system." fi -rm -f in3.lz trunc.lz +rm -f in2.lz in3.lz trunc.lz out || framework_failure cat "${in_lz}" > ingin.lz || framework_failure printf "g" >> ingin.lz || framework_failure @@ -316,7 +350,7 @@ cat "${in_lz}" >> ingin.lz || framework_failure "${LZIP}" -t < ingin.lz || test_failed $LINENO "${LZIP}" -d < ingin.lz > copy || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f ingin.lz +rm -f copy ingin.lz || framework_failure echo if [ ${fail} = 0 ] ; then -- cgit v1.2.3