-rw-r--r--  ChangeLog              |   7
-rw-r--r--  Makefile.in            |  14
-rw-r--r--  NEWS                   |  25
-rwxr-xr-x  configure              |   2
-rw-r--r--  create.cc              | 140
-rw-r--r--  doc/tarlz.1            |   8
-rw-r--r--  doc/tarlz.info         |  79
-rw-r--r--  doc/tarlz.texi         |  82
-rw-r--r--  extended.cc            | 156
-rw-r--r--  extract.cc             | 133
-rw-r--r--  list_lz.cc             |  35
-rw-r--r--  lzip.h                 | 146
-rw-r--r--  lzip_index.cc          |   2
-rw-r--r--  main.cc                |  11
-rw-r--r--  tarlz.h                | 216
-rwxr-xr-x  testsuite/check.sh     |  19
-rw-r--r--  testsuite/rbar         |   1
-rw-r--r--  testsuite/rbaz         |   1
-rw-r--r--  testsuite/rfoo         |   1
-rw-r--r--  testsuite/t155.tar     | bin 6144 -> 9216 bytes
-rw-r--r--  testsuite/t155.tar.lz  | bin 579 -> 906 bytes
21 files changed, 634 insertions, 444 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2019-01-31 Antonio Diaz Diaz <antonio@gnu.org>
+
+ * Version 0.10 released.
+ * Added new option '--bsolid'.
+ * Added new option '-B, --data-size'.
+ * create.cc: Set ustar name to zero if extended header is used.
+
 2019-01-22 Antonio Diaz Diaz <antonio@gnu.org>
 
 * Version 0.9 released.
diff --git a/Makefile.in b/Makefile.in
index 8e41edb..289818f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -8,7 +8,7 @@ LIBS = -llz -lpthread
 SHELL = /bin/sh
 CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1
 
-objs = arg_parser.o lzip_index.o create.o extract.o list_lz.o main.o
+objs = arg_parser.o lzip_index.o create.o extended.o extract.o list_lz.o main.o
 
 
 .PHONY : all install install-bin install-info install-man \
@@ -30,10 +30,11 @@ main.o : main.cc
 $(objs) : Makefile
 arg_parser.o : arg_parser.h
-create.o : arg_parser.h lzip.h tarlz.h
-extract.o : arg_parser.h lzip.h lzip_index.h tarlz.h
-list_lz.o : arg_parser.h lzip.h lzip_index.h tarlz.h
-lzip_index.o : lzip.h lzip_index.h
+create.o : arg_parser.h tarlz.h
+extended.o : tarlz.h
+extract.o : arg_parser.h lzip_index.h tarlz.h
+list_lz.o : arg_parser.h lzip_index.h tarlz.h
+lzip_index.o : lzip_index.h tarlz.h
 main.o : arg_parser.h tarlz.h
@@ -123,6 +124,9 @@ dist : doc
 $(DISTNAME)/testsuite/test_bad1.txt.tar \
 $(DISTNAME)/testsuite/test_bad[12].txt \
 $(DISTNAME)/testsuite/t155.tar \
+ $(DISTNAME)/testsuite/rfoo \
+ $(DISTNAME)/testsuite/rbar \
+ $(DISTNAME)/testsuite/rbaz \
 $(DISTNAME)/testsuite/test3.tar \
 $(DISTNAME)/testsuite/test3_bad[1-5].tar \
 $(DISTNAME)/testsuite/test.txt.lz \
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -1,16 +1,15 @@
-Changes in version 0.9:
+Changes in version 0.10:
 
-Multi-threaded '-t, --list' has been implemented. See chapter 'Limitations
-of parallel tar decoding' in the manual for details.
+The new option '--bsolid', which selects per-data-block compression of the
+archive, has been added. This option improves compression efficiency for
+archives with lots of small files.
 
-The new option '-n, --threads', which sets the number of decompression
-threads, has been added.
+The new option '-B, --data-size', which sets the size of the input data
+blocks for '--bsolid', has been added.
 
-Tarlz now recognizes global pax headers, but for now ignores them.
-
-Tarlz now decodes numerical fields in headers using length-safe parsers
-instead of strtoul to prevent the parser from exceeding the end of the field
-if it does not contain a terminating character.
-
-The new chapter 'Limitations of parallel tar decoding' has been added to the
-manual.
+If an extended header is required for any reason (for example a file size
+larger than 8 GiB or a link name longer than 100 bytes), tarlz now moves the
+filename also to the extended header to prevent an ustar tool from trying to
+extract the file or link. This also makes easier during parallel extraction
+or listing the detection of a tar member split between two lzip members at
+the boundary between the extended header and the ustar header.
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -6,7 +6,7 @@
 # to copy, distribute and modify it.
pkgname=tarlz -pkgversion=0.9 +pkgversion=0.10 progname=tarlz srctrigger=doc/${pkgname}.texi @@ -38,20 +38,21 @@ #include <lzlib.h> #include "arg_parser.h" -#include "lzip.h" #include "tarlz.h" -const CRC32C crc32c; +const CRC32 crc32c( true ); int cl_owner = -1; // global vars needed by add_member int cl_group = -1; +int cl_data_size = 0; Solidity solidity = no_solid; namespace { LZ_Encoder * encoder = 0; // local vars needed by add_member const char * archive_namep = 0; +unsigned long long partial_data_size = 0; // current block size int outfd = -1; int gretval = 0; @@ -150,17 +151,18 @@ bool check_appendable( const int fd, const bool remove_eof ) } -class File_is_archive +class File_is_the_archive { dev_t archive_dev; ino_t archive_ino; bool initialized; + public: - File_is_archive() : initialized( false ) {} - bool init() + File_is_the_archive() : initialized( false ) {} + bool init( const int fd ) { struct stat st; - if( fstat( outfd, &st ) != 0 ) return false; + if( fstat( fd, &st ) != 0 ) return false; if( S_ISREG( st.st_mode ) ) { archive_dev = st.st_dev; archive_ino = st.st_ino; initialized = true; } return true; @@ -169,7 +171,7 @@ public: { return initialized && archive_dev == st.st_dev && archive_ino == st.st_ino; } - } file_is_archive; + } file_is_the_archive; bool archive_write( const uint8_t * const buf, const int size ) @@ -223,50 +225,32 @@ void print_octal( uint8_t * const buf, int size, unsigned long long num ) while( --size >= 0 ) { buf[size] = '0' + ( num % 8 ); num /= 8; } } -unsigned decimal_digits( unsigned long long value ) - { - unsigned digits = 1; - while( value >= 10 ) { value /= 10; ++digits; } - return digits; - } - -int record_size( const unsigned keyword_size, const unsigned long value_size ) - { - // size = ' ' + keyword + '=' + value + '\n' - unsigned long long size = 1 + keyword_size + 1 + value_size + 1; - const unsigned d1 = decimal_digits( size ); - size += decimal_digits( d1 + size ); - if( size >= INT_MAX ) size = 0; // overflows snprintf size - return size; - } - bool write_extended( const Extended & extended ) { - const int path_rec = extended.path.size() ? - record_size( 4, extended.path.size() ) : 0; - const int lpath_rec = extended.linkpath.size() ? - record_size( 8, extended.linkpath.size() ) : 0; - const int size_rec = ( extended.size > 0 ) ? 
- record_size( 4, decimal_digits( extended.size ) ) : 0; - const unsigned long long edsize = path_rec + lpath_rec + size_rec + 22; - const unsigned long long bufsize = round_up( edsize ); + const int path_rec = extended.recsize_path(); + const int lpath_rec = extended.recsize_linkpath(); + const int size_rec = extended.recsize_file_size(); + const unsigned long long edsize = extended.edsize(); + const unsigned long long bufsize = extended.edsize_pad(); if( edsize >= 1ULL << 33 ) return false; // too much extended data if( bufsize == 0 ) return edsize == 0; // overflow or no extended data char * const buf = new char[bufsize+1]; // extended records buffer - unsigned long long pos = path_rec; // goto can't cross this + unsigned long long pos = path_rec; // goto can't cross these + const unsigned crc_size = Extended::crc_record.size(); + if( path_rec && snprintf( buf, path_rec + 1, "%d path=%s\n", - path_rec, extended.path.c_str() ) != path_rec ) + path_rec, extended.path().c_str() ) != path_rec ) goto error; if( lpath_rec && snprintf( buf + pos, lpath_rec + 1, "%d linkpath=%s\n", - lpath_rec, extended.linkpath.c_str() ) != lpath_rec ) + lpath_rec, extended.linkpath().c_str() ) != lpath_rec ) goto error; pos += lpath_rec; if( size_rec && snprintf( buf + pos, size_rec + 1, "%d size=%llu\n", - size_rec, extended.size ) != size_rec ) + size_rec, extended.file_size() ) != size_rec ) goto error; pos += size_rec; - if( snprintf( buf + pos, 23, "22 GNU.crc32=00000000\n" ) != 22 ) goto error; - pos += 22; + std::memcpy( buf + pos, Extended::crc_record.c_str(), crc_size ); + pos += crc_size; if( pos != edsize ) goto error; print_hex( buf + edsize - 9, 8, crc32c.windowed_crc( (const uint8_t *)buf, edsize - 9, edsize ) ); @@ -316,27 +300,29 @@ const char * remove_leading_dotdot( const char * const filename ) } -// Return true if filename fits in the ustar header. +// Return true if it stores filename in the ustar header. 
bool store_name( const char * const filename, Extended & extended, - Tar_header header ) + Tar_header header, const bool force_extended_name ) { const char * const stored_name = remove_leading_dotdot( filename ); - const int len = std::strlen( stored_name ); - enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name - - // first try storing filename in the ustar header - if( len <= name_l ) // stored_name fits in name - { std::memcpy( header + name_o, stored_name, len ); return true; } - if( len <= max_len ) // find shortest prefix - for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i ) - if( stored_name[i] == '/' ) // stored_name can be split - { - std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 ); - std::memcpy( header + prefix_o, stored_name, i ); - return true; - } + + if( !force_extended_name ) // try storing filename in the ustar header + { + const int len = std::strlen( stored_name ); + enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name + if( len <= name_l ) // stored_name fits in name + { std::memcpy( header + name_o, stored_name, len ); return true; } + if( len <= max_len ) // find shortest prefix + for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i ) + if( stored_name[i] == '/' ) // stored_name can be split + { + std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 ); + std::memcpy( header + prefix_o, stored_name, i ); + return true; + } + } // store filename in extended record, leave name zeroed in ustar header - extended.path = stored_name; + extended.path( stored_name ); return false; } @@ -348,13 +334,13 @@ int add_member( const char * const filename, const struct stat *, if( lstat( filename, &st ) != 0 ) { show_file_error( filename, "Can't stat input file", errno ); gretval = 1; return 0; } - if( file_is_archive( st ) ) + if( file_is_the_archive( st ) ) { show_file_error( archive_namep, "File is the archive; not dumped." ); return 0; } Extended extended; // metadata for extended records Tar_header header; init_tar_header( header ); - store_name( filename, extended, header ); + bool force_extended_name = false; const mode_t mode = st.st_mode; print_octal( header + mode_o, mode_l - 1, @@ -392,7 +378,8 @@ int add_member( const char * const filename, const struct stat *, { char * const buf = new char[st.st_size+1]; len = readlink( filename, buf, st.st_size ); - if( len == st.st_size ) { buf[len] = 0; extended.linkpath = buf; } + if( len == st.st_size ) + { buf[len] = 0; extended.linkpath( buf ); force_extended_name = true; } delete[] buf; } if( len != st.st_size ) @@ -418,12 +405,30 @@ int add_member( const char * const filename, const struct stat *, const struct group * const gr = getgrgid( gid ); if( gr && gr->gr_name ) std::strncpy( (char *)header + gname_o, gr->gr_name, gname_l - 1 ); - if( file_size >= 1ULL << 33 ) extended.size = file_size; + if( file_size >= 1ULL << 33 ) + { extended.file_size( file_size ); force_extended_name = true; } else print_octal( header + size_o, size_l - 1, file_size ); + store_name( filename, extended, header, force_extended_name ); print_octal( header + chksum_o, chksum_l - 1, ustar_chksum( header ) ); const int infd = file_size ? 
open_instream( filename ) : -1; if( file_size && infd < 0 ) { gretval = 1; return 0; } + if( encoder && solidity == bsolid ) + { + const unsigned long long member_size = + header_size + extended.full_size() + round_up( file_size ); + const unsigned long long target_size = cl_data_size; + if( partial_data_size >= target_size || + ( partial_data_size >= min_data_size && + partial_data_size + member_size / 2 > target_size ) ) + { + partial_data_size = member_size; + if( !archive_write( 0, 0 ) ) + { show_error( "Error flushing encoder", errno ); return 1; } + } + else partial_data_size += member_size; + } + if( !extended.empty() && !write_extended( extended ) ) { show_error( "Error writing extended header", errno ); return 1; } if( !archive_write( header, header_size ) ) @@ -491,7 +496,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser, { show_error( "'--concatenate' is incompatible with '-f -'.", 0, true ); return 1; } if( ( outfd = open_outstream( archive_name, false ) ) < 0 ) return 1; - if( !file_is_archive.init() ) + if( !file_is_the_archive.init( outfd ) ) { show_file_error( archive_name.c_str(), "Can't stat", errno ); return 1; } int retval = 0; @@ -507,7 +512,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser, { show_file_error( filename, "Not an appendable tar.lz archive." ); close( infd ); retval = 2; break; } struct stat st; - if( fstat( infd, &st ) == 0 && file_is_archive( st ) ) + if( fstat( infd, &st ) == 0 && file_is_the_archive( st ) ) { show_file_error( filename, "File is the archive; not concatenated." ); close( infd ); continue; } if( !check_appendable( outfd, true ) ) @@ -572,12 +577,18 @@ int encode( const std::string & archive_name, const Arg_parser & parser, } archive_namep = archive_name.size() ? archive_name.c_str() : "(stdout)"; - if( !file_is_archive.init() ) + if( !file_is_the_archive.init( outfd ) ) { show_file_error( archive_namep, "Can't stat", errno ); return 1; } if( compressed ) { - encoder = LZ_compress_open( option_mapping[level].dictionary_size, + const int dictionary_size = option_mapping[level].dictionary_size; + if( cl_data_size <= 0 ) + { + if( level == 0 ) cl_data_size = 1 << 20; + else cl_data_size = 2 * dictionary_size; + } + encoder = LZ_compress_open( dictionary_size, option_mapping[level].match_len_limit, LLONG_MAX ); if( !encoder || LZ_compress_errno( encoder ) != LZ_ok ) { @@ -619,7 +630,8 @@ int encode( const std::string & archive_name, const Arg_parser & parser, enum { bufsize = 2 * header_size }; uint8_t buf[bufsize]; std::memset( buf, 0, bufsize ); - if( encoder && solidity == asolid && !archive_write( 0, 0 ) ) + if( encoder && ( solidity == asolid || solidity == bsolid ) && + !archive_write( 0, 0 ) ) { show_error( "Error flushing encoder", errno ); retval = 1; } else if( !archive_write( buf, bufsize ) || ( encoder && !archive_write( 0, 0 ) ) ) // flush encoder diff --git a/doc/tarlz.1 b/doc/tarlz.1 index b83a7e6..9450c57 100644 --- a/doc/tarlz.1 +++ b/doc/tarlz.1 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. 
-.TH TARLZ "1" "January 2019" "tarlz 0.9" "User Commands" +.TH TARLZ "1" "January 2019" "tarlz 0.10" "User Commands" .SH NAME tarlz \- creates tar archives with multimember lzip compression .SH SYNOPSIS @@ -33,6 +33,9 @@ output version information and exit \fB\-A\fR, \fB\-\-concatenate\fR append tar.lz archives to the end of an archive .TP +\fB\-B\fR, \fB\-\-data\-size=\fR<bytes> +set target size of input data blocks [2x8=16 MiB] +.TP \fB\-c\fR, \fB\-\-create\fR create a new archive .TP @@ -66,6 +69,9 @@ set compression level [default 6] \fB\-\-asolid\fR create solidly compressed appendable archive .TP +\fB\-\-bsolid\fR +create per\-data\-block compressed archive +.TP \fB\-\-dsolid\fR create per\-directory compressed archive .TP diff --git a/doc/tarlz.info b/doc/tarlz.info index 7f90766..bf1e1f5 100644 --- a/doc/tarlz.info +++ b/doc/tarlz.info @@ -11,7 +11,7 @@ File: tarlz.info, Node: Top, Next: Introduction, Up: (dir) Tarlz Manual ************ -This manual is for Tarlz (version 0.9, 22 January 2019). +This manual is for Tarlz (version 0.10, 31 January 2019). * Menu: @@ -120,6 +120,13 @@ archive 'foo'. the archive if no FILES have been specified. Tarlz can't concatenate uncompressed tar archives. +'-B BYTES' +'--data-size=BYTES' + Set target size of input data blocks for the '--bsolid' option. + Valid values range from 8 KiB to 1 GiB. Default value is two times + the dictionary size, except for option '-0' where it defaults to + 1 MiB. + '-c' '--create' Create a new archive from FILES. @@ -190,6 +197,18 @@ archive 'foo'. members it creates, reducing the amount of memory required for decompression. + Level Dictionary size Match length limit + -0 64 KiB 16 bytes + -1 1 MiB 5 bytes + -2 1.5 MiB 6 bytes + -3 2 MiB 8 bytes + -4 3 MiB 12 bytes + -5 4 MiB 20 bytes + -6 8 MiB 36 bytes + -7 16 MiB 68 bytes + -8 24 MiB 132 bytes + -9 32 MiB 273 bytes + '--asolid' When creating or appending to a compressed archive, use appendable solid compression. All the files being added to the archive are @@ -197,6 +216,15 @@ archive 'foo'. are compressed into a separate lzip member. This creates a solidly compressed appendable archive. +'--bsolid' + When creating or appending to a compressed archive, compress tar + members together in a lzip member until they approximate a target + uncompressed size. The size can't be exact because each solidly + compressed data block must contain an integer number of tar + members. This option improves compression efficiency for archives + with lots of small files. *Note --data-size::, to set the target + block size. + '--dsolid' When creating or appending to a compressed archive, use solid compression for each directory especified in the command line. The @@ -560,13 +588,13 @@ old tar programs from extracting the extended records as a file in the wrong place. Tarlz also sets to zero those fields of the ustar header overridden by extended records. - If the extended header is needed because of a file size larger than -8 GiB, the size field will be unable to contain the full size of the -file. Therefore the file may be partially extracted, and the tool will -issue a spurious warning about a corrupt header at the point where it -thinks the file ends. Setting to zero the overridden size in the ustar -header at least prevents the partial extraction and makes obvious that -the file has been truncated. 
+ If an extended header is required for any reason (for example a file +size larger than 8 GiB or a link name longer than 100 bytes), tarlz +moves the filename also to the extended header to prevent an ustar tool +from trying to extract the file or link. This also makes easier during +parallel extraction or listing the detection of a tar member split +between two lzip members at the boundary between the extended header +and the ustar header. 4.3 As simple as possible (but not simpler) @@ -626,10 +654,10 @@ to single-threaded mode and continues decoding the archive. Currently only the '--list' option is able to do multi-threaded decoding. If the files in the archive are large, multi-threaded '--list' on a -regular tar.lz archive can be hundreds of times faster than sequential -'--list' because, in addition to using several processors, it only -needs to decompress part of each lzip member. See the following example -listing the Silesia corpus on a dual core machine: +regular (seekable) tar.lz archive can be hundreds of times faster than +sequential '--list' because, in addition to using several processors, +it only needs to decompress part of each lzip member. See the following +example listing the Silesia corpus on a dual core machine: tarlz -9 -cf silesia.tar.lz silesia time lzip -cd silesia.tar.lz | tar -tf - (5.032s) @@ -690,9 +718,9 @@ Example 7: Extract files 'a' and 'c' from archive 'archive.tar.lz'. Example 8: Copy the contents of directory 'sourcedir' to the directory -'targetdir'. +'destdir'. - tarlz -C sourcedir -c . | tarlz -C targetdir -x + tarlz -C sourcedir -c . | tarlz -C destdir -x File: tarlz.info, Node: Problems, Next: Concept index, Prev: Examples, Up: Top @@ -734,17 +762,18 @@ Concept index Tag Table: Node: Top223 -Node: Introduction1012 -Node: Invoking tarlz3124 -Node: File format10384 -Ref: key_crc3215169 -Node: Amendments to pax format20586 -Ref: crc3221110 -Ref: flawed-compat22135 -Node: Multi-threaded tar24508 -Node: Examples27012 -Node: Problems28682 -Node: Concept index29208 +Node: Introduction1013 +Node: Invoking tarlz3125 +Ref: --data-size4717 +Node: File format11536 +Ref: key_crc3216321 +Node: Amendments to pax format21738 +Ref: crc3222262 +Ref: flawed-compat23287 +Node: Multi-threaded tar25649 +Node: Examples28164 +Node: Problems29830 +Node: Concept index30356 End Tag Table diff --git a/doc/tarlz.texi b/doc/tarlz.texi index d9bdc14..2ab37fb 100644 --- a/doc/tarlz.texi +++ b/doc/tarlz.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 22 January 2019 -@set VERSION 0.9 +@set UPDATED 31 January 2019 +@set VERSION 0.10 @dircategory Data Compression @direntry @@ -89,7 +89,7 @@ member) just like to an uncompressed tar archive. It is a safe posix-style backup format. In case of corruption, tarlz can extract all the undamaged members from the tar.lz archive, skipping over the damaged members, just like the standard -(uncompressed) tar. Moreover, the option @code{--keep-damaged} can be +(uncompressed) tar. Moreover, the option @samp{--keep-damaged} can be used to recover as much data as possible from each damaged member, and lziprecover can be used to recover some of the damaged members. @@ -154,6 +154,13 @@ end-of-file blocks are removed as each new archive is concatenated. Exit with status 0 without modifying the archive if no @var{files} have been specified. Tarlz can't concatenate uncompressed tar archives. +@anchor{--data-size} +@item -B @var{bytes} +@itemx --data-size=@var{bytes} +Set target size of input data blocks for the @samp{--bsolid} option. 
Valid +values range from @w{8 KiB} to @w{1 GiB}. Default value is two times the +dictionary size, except for option @samp{-0} where it defaults to @w{1 MiB}. + @item -c @itemx --create Create a new archive from @var{files}. @@ -161,13 +168,13 @@ Create a new archive from @var{files}. @item -C @var{dir} @itemx --directory=@var{dir} Change to directory @var{dir}. When creating or appending, the position -of each @code{-C} option in the command line is significant; it will +of each @samp{-C} option in the command line is significant; it will change the current working directory for the following @var{files} until -a new @code{-C} option appears in the command line. When extracting, all -the @code{-C} options are executed in sequence before starting the -extraction. Listing ignores any @code{-C} options specified. @var{dir} +a new @samp{-C} option appears in the command line. When extracting, all +the @samp{-C} options are executed in sequence before starting the +extraction. Listing ignores any @samp{-C} options specified. @var{dir} is relative to the then current working directory, perhaps changed by a -previous @code{-C} option. +previous @samp{-C} option. @item -f @var{archive} @itemx --file=@var{archive} @@ -222,6 +229,20 @@ Set the compression level. The default compression level is @samp{-6}. Like lzip, tarlz also minimizes the dictionary size of the lzip members it creates, reducing the amount of memory required for decompression. +@multitable {Level} {Dictionary size} {Match length limit} +@item Level @tab Dictionary size @tab Match length limit +@item -0 @tab 64 KiB @tab 16 bytes +@item -1 @tab 1 MiB @tab 5 bytes +@item -2 @tab 1.5 MiB @tab 6 bytes +@item -3 @tab 2 MiB @tab 8 bytes +@item -4 @tab 3 MiB @tab 12 bytes +@item -5 @tab 4 MiB @tab 20 bytes +@item -6 @tab 8 MiB @tab 36 bytes +@item -7 @tab 16 MiB @tab 68 bytes +@item -8 @tab 24 MiB @tab 132 bytes +@item -9 @tab 32 MiB @tab 273 bytes +@end multitable + @item --asolid When creating or appending to a compressed archive, use appendable solid compression. All the files being added to the archive are compressed @@ -229,6 +250,14 @@ into a single lzip member, but the end-of-file blocks are compressed into a separate lzip member. This creates a solidly compressed appendable archive. +@item --bsolid +When creating or appending to a compressed archive, compress tar members +together in a lzip member until they approximate a target uncompressed size. +The size can't be exact because each solidly compressed data block must +contain an integer number of tar members. This option improves compression +efficiency for archives with lots of small files. @xref{--data-size}, to set +the target block size. + @item --dsolid When creating or appending to a compressed archive, use solid compression for each directory especified in the command line. The @@ -252,7 +281,7 @@ resulting archive is not appendable. No more files can be later appended to the archive. @item --anonymous -Equivalent to @code{--owner=root --group=root}. +Equivalent to @samp{--owner=root --group=root}. @item --owner=@var{owner} When creating or appending, use @var{owner} for files added to the @@ -287,7 +316,7 @@ keyword appearing in the same block of extended records. @end ignore @item --uncompressed -With @code{--create}, don't compress the created tar archive. Create an +With @samp{--create}, don't compress the created tar archive. Create an uncompressed tar archive instead. @end table @@ -350,7 +379,7 @@ Zero or more blocks that contain the contents of the file. 
@end itemize Each tar member must be contiguously stored in a lzip member for the -parallel decoding operations like @code{--list} to work. If any tar member +parallel decoding operations like @samp{--list} to work. If any tar member is split over two or more lzip members, the archive must be decoded sequentially. @xref{Multi-threaded tar}. @@ -381,7 +410,7 @@ tar.lz @end verbatim @ignore -When @code{--permissive} is used, the following violations of the +When @samp{--permissive} is used, the following violations of the archive format are allowed:@* If several extended headers precede an ustar header, only the last extended header takes effect. The other extended headers are ignored. @@ -623,13 +652,12 @@ programs from extracting the extended records as a file in the wrong place. Tarlz also sets to zero those fields of the ustar header overridden by extended records. -If the extended header is needed because of a file size larger than -@w{8 GiB}, the size field will be unable to contain the full size of the -file. Therefore the file may be partially extracted, and the tool will issue -a spurious warning about a corrupt header at the point where it thinks the -file ends. Setting to zero the overridden size in the ustar header at least -prevents the partial extraction and makes obvious that the file has been -truncated. +If an extended header is required for any reason (for example a file size +larger than @w{8 GiB} or a link name longer than 100 bytes), tarlz moves the +filename also to the extended header to prevent an ustar tool from trying to +extract the file or link. This also makes easier during parallel extraction +or listing the detection of a tar member split between two lzip members at +the boundary between the extended header and the ustar header. @sp 1 @section As simple as possible (but not simpler) @@ -679,14 +707,14 @@ decoding it safely in parallel. Tarlz is able to automatically decode aligned and unaligned multimember tar.lz archives, keeping backwards compatibility. If tarlz finds a member misalignment during multi-threaded decoding, it switches to single-threaded -mode and continues decoding the archive. Currently only the @code{--list} +mode and continues decoding the archive. Currently only the @samp{--list} option is able to do multi-threaded decoding. -If the files in the archive are large, multi-threaded @code{--list} on a -regular tar.lz archive can be hundreds of times faster than sequential -@code{--list} because, in addition to using several processors, it only -needs to decompress part of each lzip member. See the following example -listing the Silesia corpus on a dual core machine: +If the files in the archive are large, multi-threaded @samp{--list} on a +regular (seekable) tar.lz archive can be hundreds of times faster than +sequential @samp{--list} because, in addition to using several processors, +it only needs to decompress part of each lzip member. See the following +example listing the Silesia corpus on a dual core machine: @example tarlz -9 -cf silesia.tar.lz silesia @@ -772,10 +800,10 @@ tarlz -xf archive.tar.lz a c @sp 1 @noindent Example 8: Copy the contents of directory @samp{sourcedir} to the -directory @samp{targetdir}. +directory @samp{destdir}. @example -tarlz -C sourcedir -c . | tarlz -C targetdir -x +tarlz -C sourcedir -c . 
| tarlz -C destdir -x @end example diff --git a/extended.cc b/extended.cc new file mode 100644 index 0000000..7e0cb30 --- /dev/null +++ b/extended.cc @@ -0,0 +1,156 @@ +/* Tarlz - Archiver with multimember lzip compression + Copyright (C) 2013-2019 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cctype> +#include <climits> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> + +#include "tarlz.h" + + +namespace { + +unsigned decimal_digits( unsigned long long value ) + { + unsigned digits = 1; + while( value >= 10 ) { value /= 10; ++digits; } + return digits; + } + + +int record_size( const unsigned keyword_size, const unsigned long value_size ) + { + // size = ' ' + keyword + '=' + value + '\n' + unsigned long long size = 1 + keyword_size + 1 + value_size + 1; + const unsigned d1 = decimal_digits( size ); + size += decimal_digits( d1 + size ); + if( size >= INT_MAX ) size = 0; // overflows snprintf size + return size; + } + + +unsigned long long parse_decimal( const char * const ptr, + const char ** const tailp, + const unsigned long long size ) + { + unsigned long long result = 0; + unsigned long long i = 0; + while( i < size && std::isspace( ptr[i] ) ) ++i; + if( !std::isdigit( (unsigned char)ptr[i] ) ) + { if( tailp ) *tailp = ptr; return 0; } + for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i ) + { + const unsigned long long prev = result; + result *= 10; result += ptr[i] - '0'; + if( result < prev || result > LLONG_MAX ) // overflow + { if( tailp ) *tailp = ptr; return 0; } + } + if( tailp ) *tailp = ptr + i; + return result; + } + + +uint32_t parse_record_crc( const char * const ptr ) + { + uint32_t crc = 0; + for( int i = 0; i < 8; ++i ) + { + crc <<= 4; + if( ptr[i] >= '0' && ptr[i] <= '9' ) crc += ptr[i] - '0'; + else if( ptr[i] >= 'A' && ptr[i] <= 'F' ) crc += ptr[i] + 10 - 'A'; + else if( ptr[i] >= 'a' && ptr[i] <= 'f' ) crc += ptr[i] + 10 - 'a'; + else { crc = 0; break; } // invalid digit in crc string + } + return crc; + } + +} // end namespace + + +const std::string Extended::crc_record( "22 GNU.crc32=00000000\n" ); + +int Extended::recsize_linkpath() const + { + if( recsize_linkpath_ < 0 ) recsize_linkpath_ = + linkpath_.size() ? record_size( 8, linkpath_.size() ) : 0; + return recsize_linkpath_; + } + +int Extended::recsize_path() const + { + if( recsize_path_ < 0 ) + recsize_path_ = path_.size() ? record_size( 4, path_.size() ) : 0; + return recsize_path_; + } + +int Extended::recsize_file_size() const + { + if( recsize_file_size_ < 0 ) recsize_file_size_ = + ( file_size_ > 0 ) ? 
record_size( 4, file_size_ ) : 0; + return recsize_file_size_; + } + + +bool Extended::parse( const char * const buf, const unsigned long long edsize, + const bool permissive ) + { + reset(); + for( unsigned long long pos = 0; pos < edsize; ) // parse records + { + const char * tail; + const unsigned long long rsize = + parse_decimal( buf + pos, &tail, edsize - pos ); + if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' || + buf[pos+rsize-1] != '\n' ) return false; + ++tail; // point to keyword + // rest = length of (keyword + '=' + value) without the final newline + const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail; + if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 ) + { if( path_.size() && !permissive ) return false; + path_.assign( tail + 5, rest - 5 ); } + else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 ) + { if( linkpath_.size() && !permissive ) return false; + linkpath_.assign( tail + 9, rest - 9 ); } + else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 ) + { + if( file_size_ != 0 && !permissive ) return false; + file_size_ = parse_decimal( tail + 5, &tail, rest - 5 ); + // parse error or size fits in ustar header + if( file_size_ < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) ) + return false; + } + else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 ) + { + if( crc_present_ && !permissive ) return false; + if( rsize != crc_record.size() ) return false; + const uint32_t stored_crc = parse_record_crc( tail + 10 ); + const uint32_t computed_crc = + crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize ); + crc_present_ = true; + if( stored_crc != computed_crc ) return false; + } + pos += rsize; + } + full_size_ = header_size + round_up( edsize ); + return true; + } @@ -37,7 +37,6 @@ #include <lzlib.h> #include "arg_parser.h" -#include "lzip.h" #include "lzip_index.h" #include "tarlz.h" @@ -268,19 +267,19 @@ void format_member_name( const Extended & extended, const Tar_header header, for( int i = 0; i < 2; ++i ) { const int len = snprintf( rbuf() + offset, rbuf.size() - offset, - " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n", - extended.size, 1900 + tm->tm_year, 1 + tm->tm_mon, - tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path.c_str(), - link_string, !islink ? "" : extended.linkpath.c_str() ); + " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n", + extended.file_size(), 1900 + tm->tm_year, 1 + tm->tm_mon, + tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path().c_str(), + link_string, !islink ? "" : extended.linkpath().c_str() ); if( (int)rbuf.size() > len + offset ) break; else rbuf.resize( len + offset + 1 ); } } else { - if( rbuf.size() < extended.path.size() + 2 ) - rbuf.resize( extended.path.size() + 2 ); - snprintf( rbuf(), rbuf.size(), "%s\n", extended.path.c_str() ); + if( rbuf.size() < extended.path().size() + 2 ) + rbuf.resize( extended.path().size() + 2 ); + snprintf( rbuf(), rbuf.size(), "%s\n", extended.path().c_str() ); } } @@ -303,8 +302,8 @@ int list_member( const int infd, const Extended & extended, const unsigned bufsize = 32 * header_size; uint8_t buf[bufsize]; - unsigned long long rest = extended.size; - const int rem = extended.size % header_size; + unsigned long long rest = extended.file_size(); + const int rem = rest % header_size; const int padding = rem ? 
header_size - rem : 0; while( rest > 0 ) { @@ -331,7 +330,7 @@ bool contains_dotdot( const char * const filename ) int extract_member( const int infd, const Extended & extended, const Tar_header header, const bool keep_damaged ) { - const char * const filename = extended.path.c_str(); + const char * const filename = extended.path().c_str(); if( contains_dotdot( filename ) ) { show_file_error( filename, "Contains a '..' component, skipping." ); @@ -357,7 +356,7 @@ int extract_member( const int infd, const Extended & extended, case tf_link: case tf_symlink: { - const char * const linkname = extended.linkpath.c_str(); + const char * const linkname = extended.linkpath().c_str(); /* if( contains_dotdot( linkname ) ) { show_file_error( filename, @@ -421,8 +420,8 @@ int extract_member( const int infd, const Extended & extended, const unsigned bufsize = 32 * header_size; uint8_t buf[bufsize]; - unsigned long long rest = extended.size; - const int rem = extended.size % header_size; + unsigned long long rest = extended.file_size(); + const int rem = rest % header_size; const int padding = rem ? header_size - rem : 0; while( rest > 0 ) { @@ -501,42 +500,6 @@ bool compare_tslash( const char * const name1, const char * const name2 ) namespace { -unsigned long long parse_decimal( const char * const ptr, - const char ** const tailp, - const unsigned long long size ) - { - unsigned long long result = 0; - unsigned long long i = 0; - while( i < size && std::isspace( ptr[i] ) ) ++i; - if( !std::isdigit( (unsigned char)ptr[i] ) ) - { if( tailp ) *tailp = ptr; return 0; } - for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i ) - { - const unsigned long long prev = result; - result *= 10; result += ptr[i] - '0'; - if( result < prev || result > LLONG_MAX ) // overflow - { if( tailp ) *tailp = ptr; return 0; } - } - if( tailp ) *tailp = ptr + i; - return result; - } - - -uint32_t parse_record_crc( const char * const ptr ) - { - uint32_t crc = 0; - for( int i = 0; i < 8; ++i ) - { - crc <<= 4; - if( ptr[i] >= '0' && ptr[i] <= '9' ) crc += ptr[i] - '0'; - else if( ptr[i] >= 'A' && ptr[i] <= 'F' ) crc += ptr[i] + 10 - 'A'; - else if( ptr[i] >= 'a' && ptr[i] <= 'f' ) crc += ptr[i] + 10 - 'a'; - else { crc = 0; break; } // invalid digit in crc string - } - return crc; - } - - bool parse_records( const int infd, Extended & extended, const Tar_header header, const bool permissive ) { @@ -602,48 +565,6 @@ unsigned long long parse_octal( const uint8_t * const ptr, const int size ) } -bool Extended::parse( const char * const buf, const unsigned long long edsize, - const bool permissive ) - { - for( unsigned long long pos = 0; pos < edsize; ) // parse records - { - const char * tail; - const unsigned long long rsize = - parse_decimal( buf + pos, &tail, edsize - pos ); - if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' || - buf[pos+rsize-1] != '\n' ) return false; - ++tail; // point to keyword - // rest = length of (keyword + '=' + value) without the final newline - const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail; - if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 ) - { if( path.size() && !permissive ) return false; - path.assign( tail + 5, rest - 5 ); } - else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 ) - { if( linkpath.size() && !permissive ) return false; - linkpath.assign( tail + 9, rest - 9 ); } - else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 ) - { - if( size != 0 && !permissive ) return false; - size = parse_decimal( tail + 5, &tail, rest - 5 ); - 
// parse error or size fits in ustar header - if( size < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) ) return false; - } - else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 ) - { - if( crc_present && !permissive ) return false; - if( rsize != 22 ) return false; - const uint32_t stored_crc = parse_record_crc( tail + 10 ); - const uint32_t computed_crc = - crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize ); - crc_present = true; - if( stored_crc != computed_crc ) return false; - } - pos += rsize; - } - return true; - } - - int decode( const std::string & archive_name, const Arg_parser & parser, const int filenames, const int num_workers, const int debug_level, const bool keep_damaged, const bool listing, const bool missing_crc, @@ -722,23 +643,27 @@ int decode( const std::string & archive_name, const Arg_parser & parser, if( !parse_records( infd, extended, header, permissive ) ) { show_error( "Error in extended records. Skipping to next header." ); extended.reset(); gretval = 2; } - else if( !extended.crc_present && missing_crc ) + else if( !extended.crc_present() && missing_crc ) { show_error( "Missing CRC in extended records.", 0, true ); return 2; } prev_extended = true; continue; } prev_extended = false; - if( extended.linkpath.empty() ) // copy linkpath from ustar header + if( extended.linkpath().empty() ) // copy linkpath from ustar header { - for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i ) - extended.linkpath += header[linkname_o+i]; - while( extended.linkpath.size() > 1 && // trailing '/' - extended.linkpath[extended.linkpath.size()-1] == '/' ) - extended.linkpath.resize( extended.linkpath.size() - 1 ); + int len = 0; + while( len < linkname_l && header[linkname_o+len] ) ++len; + while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/' + if( len > 0 ) + { + const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0; + extended.linkpath( (const char *)header + linkname_o ); + header[linkname_o+len] = c; + } } - if( extended.path.empty() ) // copy path from ustar header + if( extended.path().empty() ) // copy path from ustar header { char stored_name[prefix_l+1+name_l+1]; int len = 0; @@ -749,9 +674,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser, { stored_name[len] = header[name_o+i]; ++len; } while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/' stored_name[len] = 0; - extended.path = remove_leading_slash( stored_name ); + extended.path( remove_leading_slash( stored_name ) ); } - const char * const filename = extended.path.c_str(); + const char * const filename = extended.path().c_str(); bool skip = filenames > 0; if( skip ) @@ -765,9 +690,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser, { skip = false; name_pending[i] = false; break; } } - if( extended.size == 0 && + if( extended.file_size() == 0 && ( typeflag == tf_regular || typeflag == tf_hiperf ) ) - extended.size = parse_octal( header + size_o, size_l ); + extended.file_size( parse_octal( header + size_o, size_l ) ); if( listing || skip ) retval = list_member( infd, extended, header, skip ); @@ -32,7 +32,6 @@ #include <lzlib.h> #include "arg_parser.h" -#include "lzip.h" #include "lzip_index.h" #include "tarlz.h" @@ -355,8 +354,8 @@ int list_member_lz( LZ_Decoder * const decoder, const int infd, Resizable_buffer & rbuf, const long member_id, const int worker_id, const char ** msg, const bool skip ) { - unsigned long long rest = extended.size; - const int rem = extended.size % 
header_size; + unsigned long long rest = extended.file_size(); + const int rem = rest % header_size; const int padding = rem ? header_size - rem : 0; const long long data_rest = mdata_end - ( data_pos + rest + padding ); bool master = false; @@ -527,7 +526,7 @@ extern "C" void * dworker_l( void * arg ) ret = 2; } else ret = parse_records_lz( decoder, infd, file_pos, member_end, cdata_size, data_pos, extended, header, &msg, permissive ); - if( ret == 0 && !extended.crc_present && missing_crc ) + if( ret == 0 && !extended.crc_present() && missing_crc ) { msg = "Missing CRC in extended records."; ret = 2; } if( ret != 0 ) { @@ -549,16 +548,20 @@ extern "C" void * dworker_l( void * arg ) } prev_extended = false; - if( extended.linkpath.empty() ) // copy linkpath from ustar header + if( extended.linkpath().empty() ) // copy linkpath from ustar header { - for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i ) - extended.linkpath += header[linkname_o+i]; - while( extended.linkpath.size() > 1 && // trailing '/' - extended.linkpath[extended.linkpath.size()-1] == '/' ) - extended.linkpath.resize( extended.linkpath.size() - 1 ); + int len = 0; + while( len < linkname_l && header[linkname_o+len] ) ++len; + while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/' + if( len > 0 ) + { + const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0; + extended.linkpath( (const char *)header + linkname_o ); + header[linkname_o+len] = c; + } } - if( extended.path.empty() ) // copy path from ustar header + if( extended.path().empty() ) // copy path from ustar header { char stored_name[prefix_l+1+name_l+1]; int len = 0; @@ -569,9 +572,9 @@ extern "C" void * dworker_l( void * arg ) { stored_name[len] = header[name_o+i]; ++len; } while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/' stored_name[len] = 0; - extended.path = remove_leading_slash( stored_name ); + extended.path( remove_leading_slash( stored_name ) ); } - const char * const filename = extended.path.c_str(); + const char * const filename = extended.path().c_str(); bool skip = filenames > 0; if( skip ) @@ -585,9 +588,9 @@ extern "C" void * dworker_l( void * arg ) { skip = false; name_pending[i] = false; break; } } - if( extended.size == 0 && + if( extended.file_size() == 0 && ( typeflag == tf_regular || typeflag == tf_hiperf ) ) - extended.size = parse_octal( header + size_o, size_l ); + extended.file_size( parse_octal( header + size_o, size_l ) ); retval = list_member_lz( decoder, infd, file_pos, member_end, cdata_size, data_pos, mdata_end, courier, @@ -643,7 +646,7 @@ int list_lz( const Arg_parser & parser, std::vector< char > & name_pending, const int debug_level, const int infd, const int num_workers, const bool missing_crc, const bool permissive ) { - const int out_slots = 100; + const int out_slots = 65536; // max small files (<=512B) in 64 MiB Packet_courier courier( num_workers, out_slots ); Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers]; @@ -1,146 +0,0 @@ -/* Tarlz - Archiver with multimember lzip compression - Copyright (C) 2013-2019 Antonio Diaz Diaz. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef LZ_API_VERSION -#define LZ_API_VERSION 1 -#endif - -enum { - min_dictionary_bits = 12, - min_dictionary_size = 1 << min_dictionary_bits, - max_dictionary_bits = 29, - max_dictionary_size = 1 << max_dictionary_bits, - min_member_size = 36 }; - - -class CRC32 - { - uint32_t data[256]; // Table of CRCs of all 8-bit messages. - -public: - CRC32() - { - for( unsigned n = 0; n < 256; ++n ) - { - unsigned c = n; - for( int k = 0; k < 8; ++k ) - { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } - data[n] = c; - } - } - - void update_byte( uint32_t & crc, const uint8_t byte ) const - { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); } - }; - - -inline bool isvalid_ds( const unsigned dictionary_size ) - { return ( dictionary_size >= min_dictionary_size && - dictionary_size <= max_dictionary_size ); } - - -const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" - -struct Lzip_header - { - uint8_t data[6]; // 0-3 magic bytes - // 4 version - // 5 coded_dict_size - enum { size = 6 }; - - bool verify_magic() const - { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); } - - bool verify_prefix( const int sz ) const // detect (truncated) header - { - for( int i = 0; i < sz && i < 4; ++i ) - if( data[i] != lzip_magic[i] ) return false; - return ( sz > 0 ); - } - bool verify_corrupt() const // detect corrupt header - { - int matches = 0; - for( int i = 0; i < 4; ++i ) - if( data[i] == lzip_magic[i] ) ++matches; - return ( matches > 1 && matches < 4 ); - } - - uint8_t version() const { return data[4]; } - bool verify_version() const { return ( data[4] == 1 ); } - - unsigned dictionary_size() const - { - unsigned sz = ( 1 << ( data[5] & 0x1F ) ); - if( sz > min_dictionary_size ) - sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 ); - return sz; - } - }; - - -struct Lzip_trailer - { - uint8_t data[20]; // 0-3 CRC32 of the uncompressed data - // 4-11 size of the uncompressed data - // 12-19 member size including header and trailer - enum { size = 20 }; - - unsigned data_crc() const - { - unsigned tmp = 0; - for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; } - return tmp; - } - - unsigned long long data_size() const - { - unsigned long long tmp = 0; - for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; } - return tmp; - } - - unsigned long long member_size() const - { - unsigned long long tmp = 0; - for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; } - return tmp; - } - - bool verify_consistency() const // check internal consistency - { - const unsigned crc = data_crc(); - const unsigned long long dsize = data_size(); - if( ( crc == 0 ) != ( dsize == 0 ) ) return false; - const unsigned long long msize = member_size(); - if( msize < min_member_size ) return false; - const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size; - if( mlimit > dsize && msize > mlimit ) return false; - const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1; - if( dlimit > msize && dsize > dlimit ) return false; - return true; - } - }; - - -const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; -const char * const bad_dict_msg = "Invalid dictionary size in 
member header."; -const char * const corrupt_mm_msg = "Corrupt header in multimember file."; -const char * const trailing_msg = "Trailing data not allowed."; - -// defined in extract.cc -int readblock( const int fd, uint8_t * const buf, const int size ); -int writeblock( const int fd, const uint8_t * const buf, const int size ); diff --git a/lzip_index.cc b/lzip_index.cc index cb4e9b1..1b7e576 100644 --- a/lzip_index.cc +++ b/lzip_index.cc @@ -26,8 +26,8 @@ #include <stdint.h> #include <unistd.h> -#include "lzip.h" #include "lzip_index.h" +#include "tarlz.h" namespace { @@ -87,6 +87,7 @@ void show_help( const long num_online ) " -h, --help display this help and exit\n" " -V, --version output version information and exit\n" " -A, --concatenate append tar.lz archives to the end of an archive\n" + " -B, --data-size=<bytes> set target size of input data blocks [2x8=16 MiB]\n" " -c, --create create a new archive\n" " -C, --directory=<dir> change to directory <dir>\n" " -f, --file=<archive> use archive file <archive>\n" @@ -98,6 +99,7 @@ void show_help( const long num_online ) " -x, --extract extract files from an archive\n" " -0 .. -9 set compression level [default 6]\n" " --asolid create solidly compressed appendable archive\n" + " --bsolid create per-data-block compressed archive\n" " --dsolid create per-directory compressed archive\n" " --no-solid create per-file compressed archive (default)\n" " --solid create solidly compressed archive\n" @@ -284,8 +286,8 @@ int main( const int argc, const char * const argv[] ) { show_error( "Bad library version. At least lzlib 1.0 is required." ); return 1; } - enum { opt_ano = 256, opt_aso, opt_crc, opt_dbg, opt_dso, opt_grp, opt_kd, - opt_nso, opt_own, opt_per, opt_sol, opt_un }; + enum { opt_ano = 256, opt_aso, opt_bso, opt_crc, opt_dbg, opt_dso, opt_grp, + opt_kd, opt_nso, opt_own, opt_per, opt_sol, opt_un }; const Arg_parser::Option options[] = { { '0', 0, Arg_parser::no }, @@ -299,6 +301,7 @@ int main( const int argc, const char * const argv[] ) { '8', 0, Arg_parser::no }, { '9', 0, Arg_parser::no }, { 'A', "concatenate", Arg_parser::no }, + { 'B', "data-size", Arg_parser::yes }, { 'c', "create", Arg_parser::no }, { 'C', "directory", Arg_parser::yes }, { 'f', "file", Arg_parser::yes }, @@ -313,6 +316,7 @@ int main( const int argc, const char * const argv[] ) { 'x', "extract", Arg_parser::no }, { opt_ano, "anonymous", Arg_parser::no }, { opt_aso, "asolid", Arg_parser::no }, + { opt_bso, "bsolid", Arg_parser::no }, { opt_dbg, "debug", Arg_parser::yes }, { opt_dso, "dsolid", Arg_parser::no }, { opt_grp, "group", Arg_parser::yes }, @@ -347,6 +351,8 @@ int main( const int argc, const char * const argv[] ) case '5': case '6': case '7': case '8': case '9': level = code - '0'; break; case 'A': set_mode( program_mode, m_concatenate ); break; + case 'B': cl_data_size = getnum( arg, min_data_size, max_data_size ); + break; case 'c': set_mode( program_mode, m_create ); break; case 'C': break; // skip chdir case 'f': if( sarg != "-" ) archive_name = sarg; break; @@ -361,6 +367,7 @@ int main( const int argc, const char * const argv[] ) case 'x': set_mode( program_mode, m_extract ); break; case opt_ano: set_owner( "root" ); set_group( "root" ); break; case opt_aso: solidity = asolid; break; + case opt_bso: solidity = bsolid; break; case opt_crc: missing_crc = true; break; case opt_dbg: debug_level = getnum( arg, 0, 3 ); break; case opt_dso: solidity = dsolid; break; @@ -42,22 +42,195 @@ inline bool verify_ustar_magic( const uint8_t * const header ) { return 
std::memcmp( header + magic_o, ustar_magic, magic_l ) == 0; } -class CRC32C // Uses CRC32-C (Castagnoli) polynomial. +// Round "size" to the next multiple of header size (512). +// +inline unsigned long long round_up( const unsigned long long size ) + { + const int rem = size % header_size; + const int padding = rem ? header_size - rem : 0; + return size + padding; + } + + +class Extended // stores metadata from/for extended records + { + std::string linkpath_; + std::string path_; + unsigned long long file_size_; + + mutable long long full_size_; // cached sizes + mutable int recsize_linkpath_; + mutable int recsize_path_; + mutable int recsize_file_size_; + + bool crc_present_; // true if CRC present in parsed records + +public: + static const std::string crc_record; + + Extended() + : file_size_( 0 ), full_size_( -1 ), recsize_linkpath_( -1 ), + recsize_path_( -1 ), recsize_file_size_( -1 ), crc_present_( false ) {} + + void reset() + { linkpath_.clear(); path_.clear(); file_size_ = 0; full_size_ = -1; + recsize_linkpath_ = -1; recsize_path_ = -1; recsize_file_size_ = -1; + crc_present_ = false; } + + bool empty() const + { return linkpath_.empty() && path_.empty() && file_size_ == 0; } + + const std::string & linkpath() const { return linkpath_; } + const std::string & path() const { return path_; } + unsigned long long file_size() const { return file_size_; } + + void linkpath( const char * const lp ) + { linkpath_ = lp; full_size_ = -1; recsize_linkpath_ = -1; } + void path( const char * const p ) + { path_ = p; full_size_ = -1; recsize_path_ = -1; } + void file_size( const unsigned long long fs ) + { file_size_ = fs; full_size_ = -1; recsize_file_size_ = -1; } + + int recsize_linkpath() const; + int recsize_path() const; + int recsize_file_size() const; + unsigned long long edsize() const // extended data size + { return empty() ? 0 : recsize_linkpath() + recsize_path() + + recsize_file_size() + crc_record.size(); } + unsigned long long edsize_pad() const // edsize rounded up + { return round_up( edsize() ); } + unsigned long long full_size() const + { if( full_size_ < 0 ) + full_size_ = ( empty() ? 
0 : header_size + edsize_pad() ); + return full_size_; } + + bool crc_present() const { return crc_present_; } + bool parse( const char * const buf, const unsigned long long edsize, + const bool permissive ); + }; + + +enum { + min_dictionary_bits = 12, + min_dictionary_size = 1 << min_dictionary_bits, + max_dictionary_bits = 29, + max_dictionary_size = 1 << max_dictionary_bits, + min_member_size = 36, + min_data_size = 2 * min_dictionary_size, + max_data_size = 2 * max_dictionary_size }; + + +inline bool isvalid_ds( const unsigned dictionary_size ) + { return ( dictionary_size >= min_dictionary_size && + dictionary_size <= max_dictionary_size ); } + + +const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" + +struct Lzip_header + { + uint8_t data[6]; // 0-3 magic bytes + // 4 version + // 5 coded_dict_size + enum { size = 6 }; + + bool verify_magic() const + { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); } + + bool verify_prefix( const int sz ) const // detect (truncated) header + { + for( int i = 0; i < sz && i < 4; ++i ) + if( data[i] != lzip_magic[i] ) return false; + return ( sz > 0 ); + } + bool verify_corrupt() const // detect corrupt header + { + int matches = 0; + for( int i = 0; i < 4; ++i ) + if( data[i] == lzip_magic[i] ) ++matches; + return ( matches > 1 && matches < 4 ); + } + + uint8_t version() const { return data[4]; } + bool verify_version() const { return ( data[4] == 1 ); } + + unsigned dictionary_size() const + { + unsigned sz = ( 1 << ( data[5] & 0x1F ) ); + if( sz > min_dictionary_size ) + sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 ); + return sz; + } + }; + + +struct Lzip_trailer + { + uint8_t data[20]; // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + enum { size = 20 }; + + unsigned data_crc() const + { + unsigned tmp = 0; + for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + unsigned long long data_size() const + { + unsigned long long tmp = 0; + for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + unsigned long long member_size() const + { + unsigned long long tmp = 0; + for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + bool verify_consistency() const // check internal consistency + { + const unsigned crc = data_crc(); + const unsigned long long dsize = data_size(); + if( ( crc == 0 ) != ( dsize == 0 ) ) return false; + const unsigned long long msize = member_size(); + if( msize < min_member_size ) return false; + const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size; + if( mlimit > dsize && msize > mlimit ) return false; + const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1; + if( dlimit > msize && dsize > dlimit ) return false; + return true; + } + }; + + +class CRC32 { uint32_t data[256]; // Table of CRCs of all 8-bit messages. public: - CRC32C() + CRC32( const bool castagnoli = false ) { + const unsigned cpol = 0x82F63B78U; // CRC32-C Castagnoli polynomial. + const unsigned ipol = 0xEDB88320U; // IEEE 802.3 Ethernet polynomial. + const unsigned poly = castagnoli ? 
cpol : ipol; + for( unsigned n = 0; n < 256; ++n ) { unsigned c = n; for( int k = 0; k < 8; ++k ) - { if( c & 1 ) c = 0x82F63B78U ^ ( c >> 1 ); else c >>= 1; } + { if( c & 1 ) c = poly ^ ( c >> 1 ); else c >>= 1; } data[n] = c; } } + void update_byte( uint32_t & crc, const uint8_t byte ) const + { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); } + void update_buf( uint32_t & crc, const uint8_t * const buffer, const int size ) const { @@ -78,32 +251,7 @@ public: } }; -extern const CRC32C crc32c; - - -// Round "size" to the next multiple of header size (512). -// -inline unsigned long long round_up( unsigned long long size ) - { - const int rem = size % header_size; - const int padding = rem ? header_size - rem : 0; - return size + padding; - } - - -struct Extended // stores metadata from/for extended records - { - std::string linkpath; - std::string path; - unsigned long long size; - bool crc_present; - Extended() : size( 0 ), crc_present( false ) {} - void reset() - { linkpath.clear(); path.clear(); size = 0; crc_present = false; } - bool empty() { return linkpath.empty() && path.empty() && size == 0; } - bool parse( const char * const buf, const unsigned long long edsize, - const bool permissive ); - }; +extern const CRC32 crc32c; enum { initial_line_length = 1000 }; // must be >= 77 @@ -132,10 +280,16 @@ public: unsigned size() const { return size_; } }; +const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; +const char * const bad_dict_msg = "Invalid dictionary size in member header."; +const char * const corrupt_mm_msg = "Corrupt header in multimember file."; +const char * const trailing_msg = "Trailing data not allowed."; + // defined in create.cc -enum Solidity { no_solid, dsolid, asolid, solid }; +enum Solidity { no_solid, bsolid, dsolid, asolid, solid }; extern int cl_owner; extern int cl_group; +extern int cl_data_size; extern Solidity solidity; unsigned ustar_chksum( const uint8_t * const header ); bool verify_ustar_chksum( const uint8_t * const header ); @@ -152,6 +306,8 @@ void format_member_name( const Extended & extended, const Tar_header header, const char * remove_leading_slash( const char * const filename ); bool compare_prefix_dir( const char * const dir, const char * const name ); bool compare_tslash( const char * const name1, const char * const name2 ); +int readblock( const int fd, uint8_t * const buf, const int size ); +int writeblock( const int fd, const uint8_t * const buf, const int size ); unsigned long long parse_octal( const uint8_t * const ptr, const int size ); int decode( const std::string & archive_name, const Arg_parser & parser, const int filenames, const int num_workers, const int debug_level, diff --git a/testsuite/check.sh b/testsuite/check.sh index f6f989f..e1e3f60 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -65,7 +65,7 @@ lzlib_1_11() { [ ${lwarn} = 0 ] && # Description of test files for tarlz: # test.txt.tar.lz: 1 member (test.txt). 
-# t155.tar[.lz]: directory + file + link + eof, all with 155 char names +# t155.tar[.lz]: directory + links + file + eof, all with 155 char names # tar_in_tlz1.tar.lz 2 members (test.txt.tar test3.tar) 3 lzip members # tar_in_tlz2.tar.lz 2 members (test.txt.tar test3.tar) 5 lzip members # test_bad1.tar.lz: truncated at offset 6000 (of 7495) @@ -163,10 +163,11 @@ rm -f test.txt || framework_failure "${TARLZ}" -xf "${in_tar}" --missing-crc || test_failed $LINENO cmp "${in}" test.txt || test_failed $LINENO rm -f test.txt || framework_failure -# -printf "foo\n" > cfoo || framework_failure -printf "bar\n" > cbar || framework_failure -printf "baz\n" > cbaz || framework_failure + +# reference files for cmp +cat "${testdir}"/rfoo > cfoo || framework_failure +cat "${testdir}"/rbar > cbar || framework_failure +cat "${testdir}"/rbaz > cbaz || framework_failure rm -f foo bar baz || framework_failure "${TARLZ}" -xf "${test3_lz}" --missing-crc || test_failed $LINENO cmp cfoo foo || test_failed $LINENO @@ -261,7 +262,7 @@ for i in "${tarint1_lz}" "${tarint2_lz}" ; do cmp out0 out6 || test_failed $LINENO cmp out2 out6 || test_failed $LINENO cmp outv0 outv2 || test_failed $LINENO - cmp outv0 outv2 || test_failed $LINENO + cmp outv0 outv6 || test_failed $LINENO cmp outv2 outv6 || test_failed $LINENO rm -f out0 out2 out6 outv0 outv2 outv6 || framework_failure "${TARLZ}" -xf "$i" || test_failed $LINENO @@ -409,14 +410,14 @@ cat cbar > bar || framework_failure cat cbaz > baz || framework_failure "${TARLZ}" --solid -0 -cf out.tar.lz foo || test_failed $LINENO cat out.tar.lz > aout.tar.lz || framework_failure -for i in --asolid --dsolid --solid -0 ; do +for i in --asolid --bsolid --dsolid --solid -0 ; do "${TARLZ}" $i -q -rf out.tar.lz bar baz [ $? = 2 ] || test_failed $LINENO $i cmp out.tar.lz aout.tar.lz || test_failed $LINENO $i done rm -f out.tar.lz aout.tar.lz || framework_failure -for i in --asolid --dsolid -0 ; do - for j in --asolid --dsolid --solid -0 ; do +for i in --asolid --bsolid --dsolid -0 ; do + for j in --asolid --bsolid --dsolid --solid -0 ; do "${TARLZ}" $i -0 -cf out.tar.lz foo || test_failed $LINENO "$i $j" "${TARLZ}" $j -0 -rf out.tar.lz bar baz || diff --git a/testsuite/rbar b/testsuite/rbar new file mode 100644 index 0000000..5716ca5 --- /dev/null +++ b/testsuite/rbar @@ -0,0 +1 @@ +bar diff --git a/testsuite/rbaz b/testsuite/rbaz new file mode 100644 index 0000000..7601807 --- /dev/null +++ b/testsuite/rbaz @@ -0,0 +1 @@ +baz diff --git a/testsuite/rfoo b/testsuite/rfoo new file mode 100644 index 0000000..257cc56 --- /dev/null +++ b/testsuite/rfoo @@ -0,0 +1 @@ +foo diff --git a/testsuite/t155.tar b/testsuite/t155.tar Binary files differindex 4a0f37b..f2b8a4e 100644 --- a/testsuite/t155.tar +++ b/testsuite/t155.tar diff --git a/testsuite/t155.tar.lz b/testsuite/t155.tar.lz Binary files differindex 3219071..edc7f04 100644 --- a/testsuite/t155.tar.lz +++ b/testsuite/t155.tar.lz |
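As a usage sketch of the features this patch documents ('--bsolid' and '-B, --data-size'), the commands below show how per-data-block compression might be invoked. The directory 'srcdir' and the archive names are illustrative, and the block size is written in plain bytes rather than assuming any particular size-suffix syntax.

  # Group the tar members of many small files into blocks of roughly
  # 4 MiB of uncompressed data, compressing each block into its own
  # lzip member ('-B' accepts 8 KiB to 1 GiB; 4194304 bytes = 4 MiB).
  tarlz --bsolid --data-size=4194304 -cf archive.tar.lz srcdir

  # Without '-B', '--bsolid' uses twice the dictionary size of the
  # chosen compression level as the block size (1 MiB for '-0').
  tarlz --bsolid -0 -cf small.tar.lz srcdir

  # Each tar member stays contiguous inside a lzip member, so the
  # archive remains decodable by the multi-threaded '--list'.
  tarlz -tf archive.tar.lz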