summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2019-01-31 23:06:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2019-01-31 23:06:40 +0000
commit01348f04bc92f307f5f61dd9f9f4c8d7746336f5 (patch)
tree90ea2c52891816e8d5c095fd79f599e23659e092
parentAdding upstream version 0.9. (diff)
downloadtarlz-01348f04bc92f307f5f61dd9f9f4c8d7746336f5.tar.xz
tarlz-01348f04bc92f307f5f61dd9f9f4c8d7746336f5.zip
Adding upstream version 0.10.upstream/0.10
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
-rw-r--r--ChangeLog7
-rw-r--r--Makefile.in14
-rw-r--r--NEWS25
-rwxr-xr-xconfigure2
-rw-r--r--create.cc140
-rw-r--r--doc/tarlz.18
-rw-r--r--doc/tarlz.info79
-rw-r--r--doc/tarlz.texi82
-rw-r--r--extended.cc156
-rw-r--r--extract.cc133
-rw-r--r--list_lz.cc35
-rw-r--r--lzip.h146
-rw-r--r--lzip_index.cc2
-rw-r--r--main.cc11
-rw-r--r--tarlz.h216
-rwxr-xr-xtestsuite/check.sh19
-rw-r--r--testsuite/rbar1
-rw-r--r--testsuite/rbaz1
-rw-r--r--testsuite/rfoo1
-rw-r--r--testsuite/t155.tarbin6144 -> 9216 bytes
-rw-r--r--testsuite/t155.tar.lzbin579 -> 906 bytes
21 files changed, 634 insertions, 444 deletions
diff --git a/ChangeLog b/ChangeLog
index 133419a..ccb672e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2019-01-31 Antonio Diaz Diaz <antonio@gnu.org>
+
+ * Version 0.10 released.
+ * Added new option '--bsolid'.
+ * Added new option '-B, --data-size'.
+ * create.cc: Set ustar name to zero if extended header is used.
+
2019-01-22 Antonio Diaz Diaz <antonio@gnu.org>
* Version 0.9 released.
diff --git a/Makefile.in b/Makefile.in
index 8e41edb..289818f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -8,7 +8,7 @@ LIBS = -llz -lpthread
SHELL = /bin/sh
CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1
-objs = arg_parser.o lzip_index.o create.o extract.o list_lz.o main.o
+objs = arg_parser.o lzip_index.o create.o extended.o extract.o list_lz.o main.o
.PHONY : all install install-bin install-info install-man \
@@ -30,10 +30,11 @@ main.o : main.cc
$(objs) : Makefile
arg_parser.o : arg_parser.h
-create.o : arg_parser.h lzip.h tarlz.h
-extract.o : arg_parser.h lzip.h lzip_index.h tarlz.h
-list_lz.o : arg_parser.h lzip.h lzip_index.h tarlz.h
-lzip_index.o : lzip.h lzip_index.h
+create.o : arg_parser.h tarlz.h
+extended.o : tarlz.h
+extract.o : arg_parser.h lzip_index.h tarlz.h
+list_lz.o : arg_parser.h lzip_index.h tarlz.h
+lzip_index.o : lzip_index.h tarlz.h
main.o : arg_parser.h tarlz.h
@@ -123,6 +124,9 @@ dist : doc
$(DISTNAME)/testsuite/test_bad1.txt.tar \
$(DISTNAME)/testsuite/test_bad[12].txt \
$(DISTNAME)/testsuite/t155.tar \
+ $(DISTNAME)/testsuite/rfoo \
+ $(DISTNAME)/testsuite/rbar \
+ $(DISTNAME)/testsuite/rbaz \
$(DISTNAME)/testsuite/test3.tar \
$(DISTNAME)/testsuite/test3_bad[1-5].tar \
$(DISTNAME)/testsuite/test.txt.lz \
diff --git a/NEWS b/NEWS
index cc2b1db..223e417 100644
--- a/NEWS
+++ b/NEWS
@@ -1,16 +1,15 @@
-Changes in version 0.9:
+Changes in version 0.10:
-Multi-threaded '-t, --list' has been implemented. See chapter 'Limitations
-of parallel tar decoding' in the manual for details.
+The new option '--bsolid', which selects per-data-block compression of the
+archive, has been added. This option improves compression efficiency for
+archives with lots of small files.
-The new option '-n, --threads', which sets the number of decompression
-threads, has been added.
+The new option '-B, --data-size', which sets the size of the input data
+blocks for '--bsolid', has been added.
-Tarlz now recognizes global pax headers, but for now ignores them.
-
-Tarlz now decodes numerical fields in headers using length-safe parsers
-instead of strtoul to prevent the parser from exceeding the end of the field
-if it does not contain a terminating character.
-
-The new chapter 'Limitations of parallel tar decoding' has been added to the
-manual.
+If an extended header is required for any reason (for example a file size
+larger than 8 GiB or a link name longer than 100 bytes), tarlz now moves the
+filename also to the extended header to prevent an ustar tool from trying to
+extract the file or link. This also makes it easier, during parallel
+extraction or listing, to detect a tar member split between two lzip members
+at the boundary between the extended header and the ustar header.
diff --git a/configure b/configure
index a0a9493..9d645ba 100755
--- a/configure
+++ b/configure
@@ -6,7 +6,7 @@
# to copy, distribute and modify it.
pkgname=tarlz
-pkgversion=0.9
+pkgversion=0.10
progname=tarlz
srctrigger=doc/${pkgname}.texi
diff --git a/create.cc b/create.cc
index 7310aee..813923a 100644
--- a/create.cc
+++ b/create.cc
@@ -38,20 +38,21 @@
#include <lzlib.h>
#include "arg_parser.h"
-#include "lzip.h"
#include "tarlz.h"
-const CRC32C crc32c;
+const CRC32 crc32c( true );
int cl_owner = -1; // global vars needed by add_member
int cl_group = -1;
+int cl_data_size = 0;
Solidity solidity = no_solid;
namespace {
LZ_Encoder * encoder = 0; // local vars needed by add_member
const char * archive_namep = 0;
+unsigned long long partial_data_size = 0; // current block size
int outfd = -1;
int gretval = 0;
@@ -150,17 +151,18 @@ bool check_appendable( const int fd, const bool remove_eof )
}
-class File_is_archive
+class File_is_the_archive
{
dev_t archive_dev;
ino_t archive_ino;
bool initialized;
+
public:
- File_is_archive() : initialized( false ) {}
- bool init()
+ File_is_the_archive() : initialized( false ) {}
+ bool init( const int fd )
{
struct stat st;
- if( fstat( outfd, &st ) != 0 ) return false;
+ if( fstat( fd, &st ) != 0 ) return false;
if( S_ISREG( st.st_mode ) )
{ archive_dev = st.st_dev; archive_ino = st.st_ino; initialized = true; }
return true;
@@ -169,7 +171,7 @@ public:
{
return initialized && archive_dev == st.st_dev && archive_ino == st.st_ino;
}
- } file_is_archive;
+ } file_is_the_archive;
bool archive_write( const uint8_t * const buf, const int size )
@@ -223,50 +225,32 @@ void print_octal( uint8_t * const buf, int size, unsigned long long num )
while( --size >= 0 ) { buf[size] = '0' + ( num % 8 ); num /= 8; }
}
-unsigned decimal_digits( unsigned long long value )
- {
- unsigned digits = 1;
- while( value >= 10 ) { value /= 10; ++digits; }
- return digits;
- }
-
-int record_size( const unsigned keyword_size, const unsigned long value_size )
- {
- // size = ' ' + keyword + '=' + value + '\n'
- unsigned long long size = 1 + keyword_size + 1 + value_size + 1;
- const unsigned d1 = decimal_digits( size );
- size += decimal_digits( d1 + size );
- if( size >= INT_MAX ) size = 0; // overflows snprintf size
- return size;
- }
-
bool write_extended( const Extended & extended )
{
- const int path_rec = extended.path.size() ?
- record_size( 4, extended.path.size() ) : 0;
- const int lpath_rec = extended.linkpath.size() ?
- record_size( 8, extended.linkpath.size() ) : 0;
- const int size_rec = ( extended.size > 0 ) ?
- record_size( 4, decimal_digits( extended.size ) ) : 0;
- const unsigned long long edsize = path_rec + lpath_rec + size_rec + 22;
- const unsigned long long bufsize = round_up( edsize );
+ const int path_rec = extended.recsize_path();
+ const int lpath_rec = extended.recsize_linkpath();
+ const int size_rec = extended.recsize_file_size();
+ const unsigned long long edsize = extended.edsize();
+ const unsigned long long bufsize = extended.edsize_pad();
if( edsize >= 1ULL << 33 ) return false; // too much extended data
if( bufsize == 0 ) return edsize == 0; // overflow or no extended data
char * const buf = new char[bufsize+1]; // extended records buffer
- unsigned long long pos = path_rec; // goto can't cross this
+ unsigned long long pos = path_rec; // goto can't cross these
+ const unsigned crc_size = Extended::crc_record.size();
+
if( path_rec && snprintf( buf, path_rec + 1, "%d path=%s\n",
- path_rec, extended.path.c_str() ) != path_rec )
+ path_rec, extended.path().c_str() ) != path_rec )
goto error;
if( lpath_rec && snprintf( buf + pos, lpath_rec + 1, "%d linkpath=%s\n",
- lpath_rec, extended.linkpath.c_str() ) != lpath_rec )
+ lpath_rec, extended.linkpath().c_str() ) != lpath_rec )
goto error;
pos += lpath_rec;
if( size_rec && snprintf( buf + pos, size_rec + 1, "%d size=%llu\n",
- size_rec, extended.size ) != size_rec )
+ size_rec, extended.file_size() ) != size_rec )
goto error;
pos += size_rec;
- if( snprintf( buf + pos, 23, "22 GNU.crc32=00000000\n" ) != 22 ) goto error;
- pos += 22;
+ std::memcpy( buf + pos, Extended::crc_record.c_str(), crc_size );
+ pos += crc_size;
if( pos != edsize ) goto error;
print_hex( buf + edsize - 9, 8,
crc32c.windowed_crc( (const uint8_t *)buf, edsize - 9, edsize ) );
@@ -316,27 +300,29 @@ const char * remove_leading_dotdot( const char * const filename )
}
-// Return true if filename fits in the ustar header.
+// Return true if it stores filename in the ustar header.
bool store_name( const char * const filename, Extended & extended,
- Tar_header header )
+ Tar_header header, const bool force_extended_name )
{
const char * const stored_name = remove_leading_dotdot( filename );
- const int len = std::strlen( stored_name );
- enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name
-
- // first try storing filename in the ustar header
- if( len <= name_l ) // stored_name fits in name
- { std::memcpy( header + name_o, stored_name, len ); return true; }
- if( len <= max_len ) // find shortest prefix
- for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
- if( stored_name[i] == '/' ) // stored_name can be split
- {
- std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
- std::memcpy( header + prefix_o, stored_name, i );
- return true;
- }
+
+ if( !force_extended_name ) // try storing filename in the ustar header
+ {
+ const int len = std::strlen( stored_name );
+ enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name
+ if( len <= name_l ) // stored_name fits in name
+ { std::memcpy( header + name_o, stored_name, len ); return true; }
+ if( len <= max_len ) // find shortest prefix
+ for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
+ if( stored_name[i] == '/' ) // stored_name can be split
+ {
+ std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
+ std::memcpy( header + prefix_o, stored_name, i );
+ return true;
+ }
+ }
// store filename in extended record, leave name zeroed in ustar header
- extended.path = stored_name;
+ extended.path( stored_name );
return false;
}
@@ -348,13 +334,13 @@ int add_member( const char * const filename, const struct stat *,
if( lstat( filename, &st ) != 0 )
{ show_file_error( filename, "Can't stat input file", errno );
gretval = 1; return 0; }
- if( file_is_archive( st ) )
+ if( file_is_the_archive( st ) )
{ show_file_error( archive_namep, "File is the archive; not dumped." );
return 0; }
Extended extended; // metadata for extended records
Tar_header header;
init_tar_header( header );
- store_name( filename, extended, header );
+ bool force_extended_name = false;
const mode_t mode = st.st_mode;
print_octal( header + mode_o, mode_l - 1,
@@ -392,7 +378,8 @@ int add_member( const char * const filename, const struct stat *,
{
char * const buf = new char[st.st_size+1];
len = readlink( filename, buf, st.st_size );
- if( len == st.st_size ) { buf[len] = 0; extended.linkpath = buf; }
+ if( len == st.st_size )
+ { buf[len] = 0; extended.linkpath( buf ); force_extended_name = true; }
delete[] buf;
}
if( len != st.st_size )
@@ -418,12 +405,30 @@ int add_member( const char * const filename, const struct stat *,
const struct group * const gr = getgrgid( gid );
if( gr && gr->gr_name )
std::strncpy( (char *)header + gname_o, gr->gr_name, gname_l - 1 );
- if( file_size >= 1ULL << 33 ) extended.size = file_size;
+ if( file_size >= 1ULL << 33 )
+ { extended.file_size( file_size ); force_extended_name = true; }
else print_octal( header + size_o, size_l - 1, file_size );
+ store_name( filename, extended, header, force_extended_name );
print_octal( header + chksum_o, chksum_l - 1, ustar_chksum( header ) );
const int infd = file_size ? open_instream( filename ) : -1;
if( file_size && infd < 0 ) { gretval = 1; return 0; }
+ if( encoder && solidity == bsolid )
+ {
+ const unsigned long long member_size =
+ header_size + extended.full_size() + round_up( file_size );
+ const unsigned long long target_size = cl_data_size;
+ if( partial_data_size >= target_size ||
+ ( partial_data_size >= min_data_size &&
+ partial_data_size + member_size / 2 > target_size ) )
+ {
+ partial_data_size = member_size;
+ if( !archive_write( 0, 0 ) )
+ { show_error( "Error flushing encoder", errno ); return 1; }
+ }
+ else partial_data_size += member_size;
+ }
+
if( !extended.empty() && !write_extended( extended ) )
{ show_error( "Error writing extended header", errno ); return 1; }
if( !archive_write( header, header_size ) )
@@ -491,7 +496,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
{ show_error( "'--concatenate' is incompatible with '-f -'.", 0, true );
return 1; }
if( ( outfd = open_outstream( archive_name, false ) ) < 0 ) return 1;
- if( !file_is_archive.init() )
+ if( !file_is_the_archive.init( outfd ) )
{ show_file_error( archive_name.c_str(), "Can't stat", errno ); return 1; }
int retval = 0;
@@ -507,7 +512,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
{ show_file_error( filename, "Not an appendable tar.lz archive." );
close( infd ); retval = 2; break; }
struct stat st;
- if( fstat( infd, &st ) == 0 && file_is_archive( st ) )
+ if( fstat( infd, &st ) == 0 && file_is_the_archive( st ) )
{ show_file_error( filename, "File is the archive; not concatenated." );
close( infd ); continue; }
if( !check_appendable( outfd, true ) )
@@ -572,12 +577,18 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
}
archive_namep = archive_name.size() ? archive_name.c_str() : "(stdout)";
- if( !file_is_archive.init() )
+ if( !file_is_the_archive.init( outfd ) )
{ show_file_error( archive_namep, "Can't stat", errno ); return 1; }
if( compressed )
{
- encoder = LZ_compress_open( option_mapping[level].dictionary_size,
+ const int dictionary_size = option_mapping[level].dictionary_size;
+ if( cl_data_size <= 0 )
+ {
+ if( level == 0 ) cl_data_size = 1 << 20;
+ else cl_data_size = 2 * dictionary_size;
+ }
+ encoder = LZ_compress_open( dictionary_size,
option_mapping[level].match_len_limit, LLONG_MAX );
if( !encoder || LZ_compress_errno( encoder ) != LZ_ok )
{
@@ -619,7 +630,8 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
enum { bufsize = 2 * header_size };
uint8_t buf[bufsize];
std::memset( buf, 0, bufsize );
- if( encoder && solidity == asolid && !archive_write( 0, 0 ) )
+ if( encoder && ( solidity == asolid || solidity == bsolid ) &&
+ !archive_write( 0, 0 ) )
{ show_error( "Error flushing encoder", errno ); retval = 1; }
else if( !archive_write( buf, bufsize ) ||
( encoder && !archive_write( 0, 0 ) ) ) // flush encoder
diff --git a/doc/tarlz.1 b/doc/tarlz.1
index b83a7e6..9450c57 100644
--- a/doc/tarlz.1
+++ b/doc/tarlz.1
@@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1.
-.TH TARLZ "1" "January 2019" "tarlz 0.9" "User Commands"
+.TH TARLZ "1" "January 2019" "tarlz 0.10" "User Commands"
.SH NAME
tarlz \- creates tar archives with multimember lzip compression
.SH SYNOPSIS
@@ -33,6 +33,9 @@ output version information and exit
\fB\-A\fR, \fB\-\-concatenate\fR
append tar.lz archives to the end of an archive
.TP
+\fB\-B\fR, \fB\-\-data\-size=\fR<bytes>
+set target size of input data blocks [2x8=16 MiB]
+.TP
\fB\-c\fR, \fB\-\-create\fR
create a new archive
.TP
@@ -66,6 +69,9 @@ set compression level [default 6]
\fB\-\-asolid\fR
create solidly compressed appendable archive
.TP
+\fB\-\-bsolid\fR
+create per\-data\-block compressed archive
+.TP
\fB\-\-dsolid\fR
create per\-directory compressed archive
.TP
diff --git a/doc/tarlz.info b/doc/tarlz.info
index 7f90766..bf1e1f5 100644
--- a/doc/tarlz.info
+++ b/doc/tarlz.info
@@ -11,7 +11,7 @@ File: tarlz.info, Node: Top, Next: Introduction, Up: (dir)
Tarlz Manual
************
-This manual is for Tarlz (version 0.9, 22 January 2019).
+This manual is for Tarlz (version 0.10, 31 January 2019).
* Menu:
@@ -120,6 +120,13 @@ archive 'foo'.
the archive if no FILES have been specified. Tarlz can't
concatenate uncompressed tar archives.
+'-B BYTES'
+'--data-size=BYTES'
+ Set target size of input data blocks for the '--bsolid' option.
+ Valid values range from 8 KiB to 1 GiB. Default value is two times
+ the dictionary size, except for option '-0' where it defaults to
+ 1 MiB.
+
'-c'
'--create'
Create a new archive from FILES.
@@ -190,6 +197,18 @@ archive 'foo'.
members it creates, reducing the amount of memory required for
decompression.
+ Level Dictionary size Match length limit
+ -0 64 KiB 16 bytes
+ -1 1 MiB 5 bytes
+ -2 1.5 MiB 6 bytes
+ -3 2 MiB 8 bytes
+ -4 3 MiB 12 bytes
+ -5 4 MiB 20 bytes
+ -6 8 MiB 36 bytes
+ -7 16 MiB 68 bytes
+ -8 24 MiB 132 bytes
+ -9 32 MiB 273 bytes
+
'--asolid'
When creating or appending to a compressed archive, use appendable
solid compression. All the files being added to the archive are
@@ -197,6 +216,15 @@ archive 'foo'.
are compressed into a separate lzip member. This creates a solidly
compressed appendable archive.
+'--bsolid'
+ When creating or appending to a compressed archive, compress tar
+ members together in a lzip member until they approximate a target
+ uncompressed size. The size can't be exact because each solidly
+ compressed data block must contain an integer number of tar
+ members. This option improves compression efficiency for archives
+ with lots of small files. *Note --data-size::, to set the target
+ block size.
+
'--dsolid'
When creating or appending to a compressed archive, use solid
compression for each directory especified in the command line. The
@@ -560,13 +588,13 @@ old tar programs from extracting the extended records as a file in the
wrong place. Tarlz also sets to zero those fields of the ustar header
overridden by extended records.
- If the extended header is needed because of a file size larger than
-8 GiB, the size field will be unable to contain the full size of the
-file. Therefore the file may be partially extracted, and the tool will
-issue a spurious warning about a corrupt header at the point where it
-thinks the file ends. Setting to zero the overridden size in the ustar
-header at least prevents the partial extraction and makes obvious that
-the file has been truncated.
+ If an extended header is required for any reason (for example a file
+size larger than 8 GiB or a link name longer than 100 bytes), tarlz
+moves the filename also to the extended header to prevent an ustar tool
+from trying to extract the file or link. This also makes it easier,
+during parallel extraction or listing, to detect a tar member split
+between two lzip members at the boundary between the extended header
+and the ustar header.
4.3 As simple as possible (but not simpler)
@@ -626,10 +654,10 @@ to single-threaded mode and continues decoding the archive. Currently
only the '--list' option is able to do multi-threaded decoding.
If the files in the archive are large, multi-threaded '--list' on a
-regular tar.lz archive can be hundreds of times faster than sequential
-'--list' because, in addition to using several processors, it only
-needs to decompress part of each lzip member. See the following example
-listing the Silesia corpus on a dual core machine:
+regular (seekable) tar.lz archive can be hundreds of times faster than
+sequential '--list' because, in addition to using several processors,
+it only needs to decompress part of each lzip member. See the following
+example listing the Silesia corpus on a dual core machine:
tarlz -9 -cf silesia.tar.lz silesia
time lzip -cd silesia.tar.lz | tar -tf - (5.032s)
@@ -690,9 +718,9 @@ Example 7: Extract files 'a' and 'c' from archive 'archive.tar.lz'.
Example 8: Copy the contents of directory 'sourcedir' to the directory
-'targetdir'.
+'destdir'.
- tarlz -C sourcedir -c . | tarlz -C targetdir -x
+ tarlz -C sourcedir -c . | tarlz -C destdir -x

File: tarlz.info, Node: Problems, Next: Concept index, Prev: Examples, Up: Top
@@ -734,17 +762,18 @@ Concept index

Tag Table:
Node: Top223
-Node: Introduction1012
-Node: Invoking tarlz3124
-Node: File format10384
-Ref: key_crc3215169
-Node: Amendments to pax format20586
-Ref: crc3221110
-Ref: flawed-compat22135
-Node: Multi-threaded tar24508
-Node: Examples27012
-Node: Problems28682
-Node: Concept index29208
+Node: Introduction1013
+Node: Invoking tarlz3125
+Ref: --data-size4717
+Node: File format11536
+Ref: key_crc3216321
+Node: Amendments to pax format21738
+Ref: crc3222262
+Ref: flawed-compat23287
+Node: Multi-threaded tar25649
+Node: Examples28164
+Node: Problems29830
+Node: Concept index30356

End Tag Table
diff --git a/doc/tarlz.texi b/doc/tarlz.texi
index d9bdc14..2ab37fb 100644
--- a/doc/tarlz.texi
+++ b/doc/tarlz.texi
@@ -6,8 +6,8 @@
@finalout
@c %**end of header
-@set UPDATED 22 January 2019
-@set VERSION 0.9
+@set UPDATED 31 January 2019
+@set VERSION 0.10
@dircategory Data Compression
@direntry
@@ -89,7 +89,7 @@ member) just like to an uncompressed tar archive.
It is a safe posix-style backup format. In case of corruption,
tarlz can extract all the undamaged members from the tar.lz
archive, skipping over the damaged members, just like the standard
-(uncompressed) tar. Moreover, the option @code{--keep-damaged} can be
+(uncompressed) tar. Moreover, the option @samp{--keep-damaged} can be
used to recover as much data as possible from each damaged member,
and lziprecover can be used to recover some of the damaged members.
@@ -154,6 +154,13 @@ end-of-file blocks are removed as each new archive is concatenated. Exit
with status 0 without modifying the archive if no @var{files} have been
specified. Tarlz can't concatenate uncompressed tar archives.
+@anchor{--data-size}
+@item -B @var{bytes}
+@itemx --data-size=@var{bytes}
+Set target size of input data blocks for the @samp{--bsolid} option. Valid
+values range from @w{8 KiB} to @w{1 GiB}. Default value is two times the
+dictionary size, except for option @samp{-0} where it defaults to @w{1 MiB}.
+
@item -c
@itemx --create
Create a new archive from @var{files}.
@@ -161,13 +168,13 @@ Create a new archive from @var{files}.
@item -C @var{dir}
@itemx --directory=@var{dir}
Change to directory @var{dir}. When creating or appending, the position
-of each @code{-C} option in the command line is significant; it will
+of each @samp{-C} option in the command line is significant; it will
change the current working directory for the following @var{files} until
-a new @code{-C} option appears in the command line. When extracting, all
-the @code{-C} options are executed in sequence before starting the
-extraction. Listing ignores any @code{-C} options specified. @var{dir}
+a new @samp{-C} option appears in the command line. When extracting, all
+the @samp{-C} options are executed in sequence before starting the
+extraction. Listing ignores any @samp{-C} options specified. @var{dir}
is relative to the then current working directory, perhaps changed by a
-previous @code{-C} option.
+previous @samp{-C} option.
@item -f @var{archive}
@itemx --file=@var{archive}
@@ -222,6 +229,20 @@ Set the compression level. The default compression level is @samp{-6}.
Like lzip, tarlz also minimizes the dictionary size of the lzip members
it creates, reducing the amount of memory required for decompression.
+@multitable {Level} {Dictionary size} {Match length limit}
+@item Level @tab Dictionary size @tab Match length limit
+@item -0 @tab 64 KiB @tab 16 bytes
+@item -1 @tab 1 MiB @tab 5 bytes
+@item -2 @tab 1.5 MiB @tab 6 bytes
+@item -3 @tab 2 MiB @tab 8 bytes
+@item -4 @tab 3 MiB @tab 12 bytes
+@item -5 @tab 4 MiB @tab 20 bytes
+@item -6 @tab 8 MiB @tab 36 bytes
+@item -7 @tab 16 MiB @tab 68 bytes
+@item -8 @tab 24 MiB @tab 132 bytes
+@item -9 @tab 32 MiB @tab 273 bytes
+@end multitable
+
@item --asolid
When creating or appending to a compressed archive, use appendable solid
compression. All the files being added to the archive are compressed
@@ -229,6 +250,14 @@ into a single lzip member, but the end-of-file blocks are compressed
into a separate lzip member. This creates a solidly compressed
appendable archive.
+@item --bsolid
+When creating or appending to a compressed archive, compress tar members
+together in a lzip member until they approximate a target uncompressed size.
+The size can't be exact because each solidly compressed data block must
+contain an integer number of tar members. This option improves compression
+efficiency for archives with lots of small files. @xref{--data-size}, to set
+the target block size.
+
@item --dsolid
When creating or appending to a compressed archive, use solid
compression for each directory specified in the command line. The
@@ -252,7 +281,7 @@ resulting archive is not appendable. No more files can be later appended
to the archive.
@item --anonymous
-Equivalent to @code{--owner=root --group=root}.
+Equivalent to @samp{--owner=root --group=root}.
@item --owner=@var{owner}
When creating or appending, use @var{owner} for files added to the
@@ -287,7 +316,7 @@ keyword appearing in the same block of extended records.
@end ignore
@item --uncompressed
-With @code{--create}, don't compress the created tar archive. Create an
+With @samp{--create}, don't compress the created tar archive. Create an
uncompressed tar archive instead.
@end table
@@ -350,7 +379,7 @@ Zero or more blocks that contain the contents of the file.
@end itemize
Each tar member must be contiguously stored in a lzip member for the
-parallel decoding operations like @code{--list} to work. If any tar member
+parallel decoding operations like @samp{--list} to work. If any tar member
is split over two or more lzip members, the archive must be decoded
sequentially. @xref{Multi-threaded tar}.
@@ -381,7 +410,7 @@ tar.lz
@end verbatim
@ignore
-When @code{--permissive} is used, the following violations of the
+When @samp{--permissive} is used, the following violations of the
archive format are allowed:@*
If several extended headers precede an ustar header, only the last
extended header takes effect. The other extended headers are ignored.
@@ -623,13 +652,12 @@ programs from extracting the extended records as a file in the wrong place.
Tarlz also sets to zero those fields of the ustar header overridden by
extended records.
-If the extended header is needed because of a file size larger than
-@w{8 GiB}, the size field will be unable to contain the full size of the
-file. Therefore the file may be partially extracted, and the tool will issue
-a spurious warning about a corrupt header at the point where it thinks the
-file ends. Setting to zero the overridden size in the ustar header at least
-prevents the partial extraction and makes obvious that the file has been
-truncated.
+If an extended header is required for any reason (for example a file size
+larger than @w{8 GiB} or a link name longer than 100 bytes), tarlz moves the
+filename also to the extended header to prevent an ustar tool from trying to
+extract the file or link. This also makes it easier, during parallel
+extraction or listing, to detect a tar member split between two lzip members
+at the boundary between the extended header and the ustar header.
@sp 1
@section As simple as possible (but not simpler)
@@ -679,14 +707,14 @@ decoding it safely in parallel.
Tarlz is able to automatically decode aligned and unaligned multimember
tar.lz archives, keeping backwards compatibility. If tarlz finds a member
misalignment during multi-threaded decoding, it switches to single-threaded
-mode and continues decoding the archive. Currently only the @code{--list}
+mode and continues decoding the archive. Currently only the @samp{--list}
option is able to do multi-threaded decoding.
-If the files in the archive are large, multi-threaded @code{--list} on a
-regular tar.lz archive can be hundreds of times faster than sequential
-@code{--list} because, in addition to using several processors, it only
-needs to decompress part of each lzip member. See the following example
-listing the Silesia corpus on a dual core machine:
+If the files in the archive are large, multi-threaded @samp{--list} on a
+regular (seekable) tar.lz archive can be hundreds of times faster than
+sequential @samp{--list} because, in addition to using several processors,
+it only needs to decompress part of each lzip member. See the following
+example listing the Silesia corpus on a dual core machine:
@example
tarlz -9 -cf silesia.tar.lz silesia
@@ -772,10 +800,10 @@ tarlz -xf archive.tar.lz a c
@sp 1
@noindent
Example 8: Copy the contents of directory @samp{sourcedir} to the
-directory @samp{targetdir}.
+directory @samp{destdir}.
@example
-tarlz -C sourcedir -c . | tarlz -C targetdir -x
+tarlz -C sourcedir -c . | tarlz -C destdir -x
@end example
diff --git a/extended.cc b/extended.cc
new file mode 100644
index 0000000..7e0cb30
--- /dev/null
+++ b/extended.cc
@@ -0,0 +1,156 @@
+/* Tarlz - Archiver with multimember lzip compression
+ Copyright (C) 2013-2019 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <cctype>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#include "tarlz.h"
+
+
+namespace {
+
+// Return the number of decimal digits needed to print 'value' (at least 1).
+unsigned decimal_digits( unsigned long long value )
+  {
+  unsigned count = 0;
+  do { value /= 10; ++count; } while( value != 0 );
+  return count;
+  }
+
+
+// Return the total size of an extended record "<size> <keyword>=<value>\n",
+// where <size> also counts its own decimal digits. Return 0 if the size
+// reaches INT_MAX and therefore can't be used as a snprintf size.
+int record_size( const unsigned keyword_size, const unsigned long value_size )
+  {
+  // bytes following the size field: ' ' + keyword + '=' + value + '\n'
+  const unsigned long long payload = 1 + keyword_size + 1 + value_size + 1;
+  // adding the digit count may itself add one more digit to the total
+  const unsigned long long total =
+    payload + decimal_digits( decimal_digits( payload ) + payload );
+  return ( total < INT_MAX ) ? total : 0;
+  }
+
+
+/* Parse an unsigned decimal number from the first 'size' bytes of 'ptr',
+   skipping leading whitespace. Never reads beyond ptr[size-1].
+   Returns the parsed value and sets '*tailp' (if not null) to the first
+   byte not consumed. On error (no digits found, or a value larger than
+   LLONG_MAX) returns 0 and sets '*tailp' to 'ptr'. */
+unsigned long long parse_decimal( const char * const ptr,
+                                  const char ** const tailp,
+                                  const unsigned long long size )
+  {
+  const unsigned long long limit = LLONG_MAX;
+  unsigned long long result = 0;
+  unsigned long long i = 0;
+  // cast to unsigned char; passing a negative char to <cctype> is undefined
+  while( i < size && std::isspace( (unsigned char)ptr[i] ) ) ++i;
+  if( i >= size || !std::isdigit( (unsigned char)ptr[i] ) )
+    { if( tailp ) *tailp = ptr; return 0; }
+  for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
+    {
+    const unsigned digit = ptr[i] - '0';
+    // test before multiplying; a wrapped product can look like a valid value
+    if( result > ( limit - digit ) / 10 )		// overflow
+      { if( tailp ) *tailp = ptr; return 0; }
+    result = result * 10 + digit;
+    }
+  if( tailp ) *tailp = ptr + i;
+  return result;
+  }
+
+
+// Decode the 8 hexadecimal digits at 'ptr' into a 32-bit value.
+// Return 0 if any of the 8 characters is not a hexadecimal digit.
+uint32_t parse_record_crc( const char * const ptr )
+  {
+  uint32_t value = 0;
+  for( const char * p = ptr; p < ptr + 8; ++p )
+    {
+    const char ch = *p;
+    unsigned nibble;
+    if( ch >= '0' && ch <= '9' ) nibble = ch - '0';
+    else if( ch >= 'A' && ch <= 'F' ) nibble = ch - 'A' + 10;
+    else if( ch >= 'a' && ch <= 'f' ) nibble = ch - 'a' + 10;
+    else return 0;			// invalid digit in crc string
+    value = ( value << 4 ) + nibble;
+    }
+  return value;
+  }
+
+} // end namespace
+
+
+const std::string Extended::crc_record( "22 GNU.crc32=00000000\n" ); // CRC record template; the leading "22" is the record's own length in bytes
+
+// Return the size of the "linkpath" extended record, or 0 if linkpath_ is
+// empty. The value is computed on first use and kept in recsize_linkpath_.
+int Extended::recsize_linkpath() const
+  {
+  if( recsize_linkpath_ >= 0 ) return recsize_linkpath_;
+  // 8 == length of the keyword "linkpath"
+  recsize_linkpath_ =
+    linkpath_.empty() ? 0 : record_size( 8, linkpath_.size() );
+  return recsize_linkpath_;
+  }
+
+// Return the size of the "path" extended record, or 0 if path_ is empty.
+// The value is computed on first use and kept in recsize_path_.
+int Extended::recsize_path() const
+  {
+  if( recsize_path_ >= 0 ) return recsize_path_;
+  // 4 == length of the keyword "path"
+  recsize_path_ = path_.empty() ? 0 : record_size( 4, path_.size() );
+  return recsize_path_;
+  }
+
+// Return the size of the "size" extended record, or 0 if file_size_ is 0.
+// The record value is file_size_ printed in decimal, so the value length
+// passed to record_size is the number of decimal digits of file_size_, not
+// file_size_ itself (which would exceed INT_MAX and yield a record size of 0
+// for every file large enough to need this record).
+// The value is computed on first use and kept in recsize_file_size_.
+int Extended::recsize_file_size() const
+  {
+  if( recsize_file_size_ < 0 ) recsize_file_size_ = ( file_size_ > 0 ) ?
+    record_size( 4, decimal_digits( file_size_ ) ) : 0;
+  return recsize_file_size_;
+  }
+
+
+bool Extended::parse( const char * const buf, const unsigned long long edsize,
+ const bool permissive ) /* Parses the 'edsize' bytes of extended records
+    at 'buf' into this object. Returns false on a malformed record, on a
+    repeated path/linkpath/size/GNU.crc32 record (unless 'permissive'), on a
+    size value below 8 GiB, or on a CRC mismatch. Records with any other
+    keyword are skipped. */
+ {
+ reset(); // NOTE(review): presumably clears values from a previous parse - confirm in tarlz.h
+ for( unsigned long long pos = 0; pos < edsize; ) // parse records
+ {
+ const char * tail;
+ const unsigned long long rsize = // declared length of this record
+ parse_decimal( buf + pos, &tail, edsize - pos );
+ if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' || // NOTE(review): tail may equal buf + edsize here if only digits remain - confirm buf has a byte of slack
+ buf[pos+rsize-1] != '\n' ) return false; // record must fit in the data and end with a newline
+ ++tail; // point to keyword
+ // rest = length of (keyword + '=' + value) without the final newline
+ const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail;
+ if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 ) // '>' requires a non-empty value
+ { if( path_.size() && !permissive ) return false; // repeated record
+ path_.assign( tail + 5, rest - 5 ); }
+ else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
+ { if( linkpath_.size() && !permissive ) return false; // repeated record
+ linkpath_.assign( tail + 9, rest - 9 ); }
+ else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
+ {
+ if( file_size_ != 0 && !permissive ) return false; // repeated record
+ file_size_ = parse_decimal( tail + 5, &tail, rest - 5 );
+ // parse error or size fits in ustar header
+ if( file_size_ < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) ) // digits must extend up to the newline
+ return false;
+ }
+ else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
+ {
+ if( crc_present_ && !permissive ) return false; // repeated record
+ if( rsize != crc_record.size() ) return false; // must have exactly the template's length
+ const uint32_t stored_crc = parse_record_crc( tail + 10 );
+ const uint32_t computed_crc = // pos + rsize - 9 is the offset of the 8 CRC digits; NOTE(review): windowed_crc presumably skips that window - confirm
+ crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
+ crc_present_ = true;
+ if( stored_crc != computed_crc ) return false;
+ }
+ pos += rsize; // advance to the next record
+ }
+ full_size_ = header_size + round_up( edsize ); // one header block plus the rounded-up record data
+ return true;
+ }
diff --git a/extract.cc b/extract.cc
index e25f5b6..f85cf67 100644
--- a/extract.cc
+++ b/extract.cc
@@ -37,7 +37,6 @@
#include <lzlib.h>
#include "arg_parser.h"
-#include "lzip.h"
#include "lzip_index.h"
#include "tarlz.h"
@@ -268,19 +267,19 @@ void format_member_name( const Extended & extended, const Tar_header header,
for( int i = 0; i < 2; ++i )
{
const int len = snprintf( rbuf() + offset, rbuf.size() - offset,
- " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
- extended.size, 1900 + tm->tm_year, 1 + tm->tm_mon,
- tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path.c_str(),
- link_string, !islink ? "" : extended.linkpath.c_str() );
+ " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
+ extended.file_size(), 1900 + tm->tm_year, 1 + tm->tm_mon,
+ tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path().c_str(),
+ link_string, !islink ? "" : extended.linkpath().c_str() );
if( (int)rbuf.size() > len + offset ) break;
else rbuf.resize( len + offset + 1 );
}
}
else
{
- if( rbuf.size() < extended.path.size() + 2 )
- rbuf.resize( extended.path.size() + 2 );
- snprintf( rbuf(), rbuf.size(), "%s\n", extended.path.c_str() );
+ if( rbuf.size() < extended.path().size() + 2 )
+ rbuf.resize( extended.path().size() + 2 );
+ snprintf( rbuf(), rbuf.size(), "%s\n", extended.path().c_str() );
}
}
@@ -303,8 +302,8 @@ int list_member( const int infd, const Extended & extended,
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
- unsigned long long rest = extended.size;
- const int rem = extended.size % header_size;
+ unsigned long long rest = extended.file_size();
+ const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
@@ -331,7 +330,7 @@ bool contains_dotdot( const char * const filename )
int extract_member( const int infd, const Extended & extended,
const Tar_header header, const bool keep_damaged )
{
- const char * const filename = extended.path.c_str();
+ const char * const filename = extended.path().c_str();
if( contains_dotdot( filename ) )
{
show_file_error( filename, "Contains a '..' component, skipping." );
@@ -357,7 +356,7 @@ int extract_member( const int infd, const Extended & extended,
case tf_link:
case tf_symlink:
{
- const char * const linkname = extended.linkpath.c_str();
+ const char * const linkname = extended.linkpath().c_str();
/* if( contains_dotdot( linkname ) )
{
show_file_error( filename,
@@ -421,8 +420,8 @@ int extract_member( const int infd, const Extended & extended,
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
- unsigned long long rest = extended.size;
- const int rem = extended.size % header_size;
+ unsigned long long rest = extended.file_size();
+ const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
@@ -501,42 +500,6 @@ bool compare_tslash( const char * const name1, const char * const name2 )
namespace {
-unsigned long long parse_decimal( const char * const ptr,
- const char ** const tailp,
- const unsigned long long size )
- {
- unsigned long long result = 0;
- unsigned long long i = 0;
- while( i < size && std::isspace( ptr[i] ) ) ++i;
- if( !std::isdigit( (unsigned char)ptr[i] ) )
- { if( tailp ) *tailp = ptr; return 0; }
- for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
- {
- const unsigned long long prev = result;
- result *= 10; result += ptr[i] - '0';
- if( result < prev || result > LLONG_MAX ) // overflow
- { if( tailp ) *tailp = ptr; return 0; }
- }
- if( tailp ) *tailp = ptr + i;
- return result;
- }
-
-
-uint32_t parse_record_crc( const char * const ptr )
- {
- uint32_t crc = 0;
- for( int i = 0; i < 8; ++i )
- {
- crc <<= 4;
- if( ptr[i] >= '0' && ptr[i] <= '9' ) crc += ptr[i] - '0';
- else if( ptr[i] >= 'A' && ptr[i] <= 'F' ) crc += ptr[i] + 10 - 'A';
- else if( ptr[i] >= 'a' && ptr[i] <= 'f' ) crc += ptr[i] + 10 - 'a';
- else { crc = 0; break; } // invalid digit in crc string
- }
- return crc;
- }
-
-
bool parse_records( const int infd, Extended & extended,
const Tar_header header, const bool permissive )
{
@@ -602,48 +565,6 @@ unsigned long long parse_octal( const uint8_t * const ptr, const int size )
}
-bool Extended::parse( const char * const buf, const unsigned long long edsize,
- const bool permissive )
- {
- for( unsigned long long pos = 0; pos < edsize; ) // parse records
- {
- const char * tail;
- const unsigned long long rsize =
- parse_decimal( buf + pos, &tail, edsize - pos );
- if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' ||
- buf[pos+rsize-1] != '\n' ) return false;
- ++tail; // point to keyword
- // rest = length of (keyword + '=' + value) without the final newline
- const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail;
- if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
- { if( path.size() && !permissive ) return false;
- path.assign( tail + 5, rest - 5 ); }
- else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
- { if( linkpath.size() && !permissive ) return false;
- linkpath.assign( tail + 9, rest - 9 ); }
- else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
- {
- if( size != 0 && !permissive ) return false;
- size = parse_decimal( tail + 5, &tail, rest - 5 );
- // parse error or size fits in ustar header
- if( size < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) ) return false;
- }
- else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
- {
- if( crc_present && !permissive ) return false;
- if( rsize != 22 ) return false;
- const uint32_t stored_crc = parse_record_crc( tail + 10 );
- const uint32_t computed_crc =
- crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
- crc_present = true;
- if( stored_crc != computed_crc ) return false;
- }
- pos += rsize;
- }
- return true;
- }
-
-
int decode( const std::string & archive_name, const Arg_parser & parser,
const int filenames, const int num_workers, const int debug_level,
const bool keep_damaged, const bool listing, const bool missing_crc,
@@ -722,23 +643,27 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
if( !parse_records( infd, extended, header, permissive ) )
{ show_error( "Error in extended records. Skipping to next header." );
extended.reset(); gretval = 2; }
- else if( !extended.crc_present && missing_crc )
+ else if( !extended.crc_present() && missing_crc )
{ show_error( "Missing CRC in extended records.", 0, true ); return 2; }
prev_extended = true;
continue;
}
prev_extended = false;
- if( extended.linkpath.empty() ) // copy linkpath from ustar header
+ if( extended.linkpath().empty() ) // copy linkpath from ustar header
{
- for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
- extended.linkpath += header[linkname_o+i];
- while( extended.linkpath.size() > 1 && // trailing '/'
- extended.linkpath[extended.linkpath.size()-1] == '/' )
- extended.linkpath.resize( extended.linkpath.size() - 1 );
+ int len = 0;
+ while( len < linkname_l && header[linkname_o+len] ) ++len;
+ while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
+ if( len > 0 )
+ {
+ const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
+ extended.linkpath( (const char *)header + linkname_o );
+ header[linkname_o+len] = c;
+ }
}
- if( extended.path.empty() ) // copy path from ustar header
+ if( extended.path().empty() ) // copy path from ustar header
{
char stored_name[prefix_l+1+name_l+1];
int len = 0;
@@ -749,9 +674,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
{ stored_name[len] = header[name_o+i]; ++len; }
while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
stored_name[len] = 0;
- extended.path = remove_leading_slash( stored_name );
+ extended.path( remove_leading_slash( stored_name ) );
}
- const char * const filename = extended.path.c_str();
+ const char * const filename = extended.path().c_str();
bool skip = filenames > 0;
if( skip )
@@ -765,9 +690,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
{ skip = false; name_pending[i] = false; break; }
}
- if( extended.size == 0 &&
+ if( extended.file_size() == 0 &&
( typeflag == tf_regular || typeflag == tf_hiperf ) )
- extended.size = parse_octal( header + size_o, size_l );
+ extended.file_size( parse_octal( header + size_o, size_l ) );
if( listing || skip )
retval = list_member( infd, extended, header, skip );
diff --git a/list_lz.cc b/list_lz.cc
index d41d2b7..79d500c 100644
--- a/list_lz.cc
+++ b/list_lz.cc
@@ -32,7 +32,6 @@
#include <lzlib.h>
#include "arg_parser.h"
-#include "lzip.h"
#include "lzip_index.h"
#include "tarlz.h"
@@ -355,8 +354,8 @@ int list_member_lz( LZ_Decoder * const decoder, const int infd,
Resizable_buffer & rbuf, const long member_id,
const int worker_id, const char ** msg, const bool skip )
{
- unsigned long long rest = extended.size;
- const int rem = extended.size % header_size;
+ unsigned long long rest = extended.file_size();
+ const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
const long long data_rest = mdata_end - ( data_pos + rest + padding );
bool master = false;
@@ -527,7 +526,7 @@ extern "C" void * dworker_l( void * arg )
ret = 2; }
else ret = parse_records_lz( decoder, infd, file_pos, member_end,
cdata_size, data_pos, extended, header, &msg, permissive );
- if( ret == 0 && !extended.crc_present && missing_crc )
+ if( ret == 0 && !extended.crc_present() && missing_crc )
{ msg = "Missing CRC in extended records."; ret = 2; }
if( ret != 0 )
{
@@ -549,16 +548,20 @@ extern "C" void * dworker_l( void * arg )
}
prev_extended = false;
- if( extended.linkpath.empty() ) // copy linkpath from ustar header
+ if( extended.linkpath().empty() ) // copy linkpath from ustar header
{
- for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
- extended.linkpath += header[linkname_o+i];
- while( extended.linkpath.size() > 1 && // trailing '/'
- extended.linkpath[extended.linkpath.size()-1] == '/' )
- extended.linkpath.resize( extended.linkpath.size() - 1 );
+ int len = 0;
+ while( len < linkname_l && header[linkname_o+len] ) ++len;
+ while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
+ if( len > 0 )
+ {
+ const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
+ extended.linkpath( (const char *)header + linkname_o );
+ header[linkname_o+len] = c;
+ }
}
- if( extended.path.empty() ) // copy path from ustar header
+ if( extended.path().empty() ) // copy path from ustar header
{
char stored_name[prefix_l+1+name_l+1];
int len = 0;
@@ -569,9 +572,9 @@ extern "C" void * dworker_l( void * arg )
{ stored_name[len] = header[name_o+i]; ++len; }
while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
stored_name[len] = 0;
- extended.path = remove_leading_slash( stored_name );
+ extended.path( remove_leading_slash( stored_name ) );
}
- const char * const filename = extended.path.c_str();
+ const char * const filename = extended.path().c_str();
bool skip = filenames > 0;
if( skip )
@@ -585,9 +588,9 @@ extern "C" void * dworker_l( void * arg )
{ skip = false; name_pending[i] = false; break; }
}
- if( extended.size == 0 &&
+ if( extended.file_size() == 0 &&
( typeflag == tf_regular || typeflag == tf_hiperf ) )
- extended.size = parse_octal( header + size_o, size_l );
+ extended.file_size( parse_octal( header + size_o, size_l ) );
retval = list_member_lz( decoder, infd, file_pos, member_end,
cdata_size, data_pos, mdata_end, courier,
@@ -643,7 +646,7 @@ int list_lz( const Arg_parser & parser, std::vector< char > & name_pending,
const int debug_level, const int infd, const int num_workers,
const bool missing_crc, const bool permissive )
{
- const int out_slots = 100;
+ const int out_slots = 65536; // max small files (<=512B) in 64 MiB
Packet_courier courier( num_workers, out_slots );
Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers];
diff --git a/lzip.h b/lzip.h
deleted file mode 100644
index d88e9c7..0000000
--- a/lzip.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Tarlz - Archiver with multimember lzip compression
- Copyright (C) 2013-2019 Antonio Diaz Diaz.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef LZ_API_VERSION
-#define LZ_API_VERSION 1
-#endif
-
-enum {
- min_dictionary_bits = 12,
- min_dictionary_size = 1 << min_dictionary_bits,
- max_dictionary_bits = 29,
- max_dictionary_size = 1 << max_dictionary_bits,
- min_member_size = 36 };
-
-
-class CRC32
- {
- uint32_t data[256]; // Table of CRCs of all 8-bit messages.
-
-public:
- CRC32()
- {
- for( unsigned n = 0; n < 256; ++n )
- {
- unsigned c = n;
- for( int k = 0; k < 8; ++k )
- { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
- data[n] = c;
- }
- }
-
- void update_byte( uint32_t & crc, const uint8_t byte ) const
- { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }
- };
-
-
-inline bool isvalid_ds( const unsigned dictionary_size )
- { return ( dictionary_size >= min_dictionary_size &&
- dictionary_size <= max_dictionary_size ); }
-
-
-const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP"
-
-struct Lzip_header
- {
- uint8_t data[6]; // 0-3 magic bytes
- // 4 version
- // 5 coded_dict_size
- enum { size = 6 };
-
- bool verify_magic() const
- { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }
-
- bool verify_prefix( const int sz ) const // detect (truncated) header
- {
- for( int i = 0; i < sz && i < 4; ++i )
- if( data[i] != lzip_magic[i] ) return false;
- return ( sz > 0 );
- }
- bool verify_corrupt() const // detect corrupt header
- {
- int matches = 0;
- for( int i = 0; i < 4; ++i )
- if( data[i] == lzip_magic[i] ) ++matches;
- return ( matches > 1 && matches < 4 );
- }
-
- uint8_t version() const { return data[4]; }
- bool verify_version() const { return ( data[4] == 1 ); }
-
- unsigned dictionary_size() const
- {
- unsigned sz = ( 1 << ( data[5] & 0x1F ) );
- if( sz > min_dictionary_size )
- sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
- return sz;
- }
- };
-
-
-struct Lzip_trailer
- {
- uint8_t data[20]; // 0-3 CRC32 of the uncompressed data
- // 4-11 size of the uncompressed data
- // 12-19 member size including header and trailer
- enum { size = 20 };
-
- unsigned data_crc() const
- {
- unsigned tmp = 0;
- for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; }
- return tmp;
- }
-
- unsigned long long data_size() const
- {
- unsigned long long tmp = 0;
- for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; }
- return tmp;
- }
-
- unsigned long long member_size() const
- {
- unsigned long long tmp = 0;
- for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; }
- return tmp;
- }
-
- bool verify_consistency() const // check internal consistency
- {
- const unsigned crc = data_crc();
- const unsigned long long dsize = data_size();
- if( ( crc == 0 ) != ( dsize == 0 ) ) return false;
- const unsigned long long msize = member_size();
- if( msize < min_member_size ) return false;
- const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
- if( mlimit > dsize && msize > mlimit ) return false;
- const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
- if( dlimit > msize && dsize > dlimit ) return false;
- return true;
- }
- };
-
-
-const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
-const char * const bad_dict_msg = "Invalid dictionary size in member header.";
-const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
-const char * const trailing_msg = "Trailing data not allowed.";
-
-// defined in extract.cc
-int readblock( const int fd, uint8_t * const buf, const int size );
-int writeblock( const int fd, const uint8_t * const buf, const int size );
diff --git a/lzip_index.cc b/lzip_index.cc
index cb4e9b1..1b7e576 100644
--- a/lzip_index.cc
+++ b/lzip_index.cc
@@ -26,8 +26,8 @@
#include <stdint.h>
#include <unistd.h>
-#include "lzip.h"
#include "lzip_index.h"
+#include "tarlz.h"
namespace {
diff --git a/main.cc b/main.cc
index 86734c1..25ff394 100644
--- a/main.cc
+++ b/main.cc
@@ -87,6 +87,7 @@ void show_help( const long num_online )
" -h, --help display this help and exit\n"
" -V, --version output version information and exit\n"
" -A, --concatenate append tar.lz archives to the end of an archive\n"
+ " -B, --data-size=<bytes> set target size of input data blocks [2x8=16 MiB]\n"
" -c, --create create a new archive\n"
" -C, --directory=<dir> change to directory <dir>\n"
" -f, --file=<archive> use archive file <archive>\n"
@@ -98,6 +99,7 @@ void show_help( const long num_online )
" -x, --extract extract files from an archive\n"
" -0 .. -9 set compression level [default 6]\n"
" --asolid create solidly compressed appendable archive\n"
+ " --bsolid create per-data-block compressed archive\n"
" --dsolid create per-directory compressed archive\n"
" --no-solid create per-file compressed archive (default)\n"
" --solid create solidly compressed archive\n"
@@ -284,8 +286,8 @@ int main( const int argc, const char * const argv[] )
{ show_error( "Bad library version. At least lzlib 1.0 is required." );
return 1; }
- enum { opt_ano = 256, opt_aso, opt_crc, opt_dbg, opt_dso, opt_grp, opt_kd,
- opt_nso, opt_own, opt_per, opt_sol, opt_un };
+ enum { opt_ano = 256, opt_aso, opt_bso, opt_crc, opt_dbg, opt_dso, opt_grp,
+ opt_kd, opt_nso, opt_own, opt_per, opt_sol, opt_un };
const Arg_parser::Option options[] =
{
{ '0', 0, Arg_parser::no },
@@ -299,6 +301,7 @@ int main( const int argc, const char * const argv[] )
{ '8', 0, Arg_parser::no },
{ '9', 0, Arg_parser::no },
{ 'A', "concatenate", Arg_parser::no },
+ { 'B', "data-size", Arg_parser::yes },
{ 'c', "create", Arg_parser::no },
{ 'C', "directory", Arg_parser::yes },
{ 'f', "file", Arg_parser::yes },
@@ -313,6 +316,7 @@ int main( const int argc, const char * const argv[] )
{ 'x', "extract", Arg_parser::no },
{ opt_ano, "anonymous", Arg_parser::no },
{ opt_aso, "asolid", Arg_parser::no },
+ { opt_bso, "bsolid", Arg_parser::no },
{ opt_dbg, "debug", Arg_parser::yes },
{ opt_dso, "dsolid", Arg_parser::no },
{ opt_grp, "group", Arg_parser::yes },
@@ -347,6 +351,8 @@ int main( const int argc, const char * const argv[] )
case '5': case '6': case '7': case '8': case '9':
level = code - '0'; break;
case 'A': set_mode( program_mode, m_concatenate ); break;
+ case 'B': cl_data_size = getnum( arg, min_data_size, max_data_size );
+ break;
case 'c': set_mode( program_mode, m_create ); break;
case 'C': break; // skip chdir
case 'f': if( sarg != "-" ) archive_name = sarg; break;
@@ -361,6 +367,7 @@ int main( const int argc, const char * const argv[] )
case 'x': set_mode( program_mode, m_extract ); break;
case opt_ano: set_owner( "root" ); set_group( "root" ); break;
case opt_aso: solidity = asolid; break;
+ case opt_bso: solidity = bsolid; break;
case opt_crc: missing_crc = true; break;
case opt_dbg: debug_level = getnum( arg, 0, 3 ); break;
case opt_dso: solidity = dsolid; break;
diff --git a/tarlz.h b/tarlz.h
index 09baaf2..d34374a 100644
--- a/tarlz.h
+++ b/tarlz.h
@@ -42,22 +42,195 @@ inline bool verify_ustar_magic( const uint8_t * const header )
{ return std::memcmp( header + magic_o, ustar_magic, magic_l ) == 0; }
-class CRC32C // Uses CRC32-C (Castagnoli) polynomial.
+// Round "size" to the next multiple of header size (512).
+//
+inline unsigned long long round_up( const unsigned long long size )
+ {
+ const int rem = size % header_size;
+ const int padding = rem ? header_size - rem : 0;
+ return size + padding;
+ }
+
+
+class Extended // stores metadata from/for extended records
+ {
+ std::string linkpath_;
+ std::string path_;
+ unsigned long long file_size_;
+
+ mutable long long full_size_; // cached sizes
+ mutable int recsize_linkpath_;
+ mutable int recsize_path_;
+ mutable int recsize_file_size_;
+
+ bool crc_present_; // true if CRC present in parsed records
+
+public:
+ static const std::string crc_record;
+
+ Extended()
+ : file_size_( 0 ), full_size_( -1 ), recsize_linkpath_( -1 ),
+ recsize_path_( -1 ), recsize_file_size_( -1 ), crc_present_( false ) {}
+
+ void reset()
+ { linkpath_.clear(); path_.clear(); file_size_ = 0; full_size_ = -1;
+ recsize_linkpath_ = -1; recsize_path_ = -1; recsize_file_size_ = -1;
+ crc_present_ = false; }
+
+ bool empty() const
+ { return linkpath_.empty() && path_.empty() && file_size_ == 0; }
+
+ const std::string & linkpath() const { return linkpath_; }
+ const std::string & path() const { return path_; }
+ unsigned long long file_size() const { return file_size_; }
+
+ void linkpath( const char * const lp )
+ { linkpath_ = lp; full_size_ = -1; recsize_linkpath_ = -1; }
+ void path( const char * const p )
+ { path_ = p; full_size_ = -1; recsize_path_ = -1; }
+ void file_size( const unsigned long long fs )
+ { file_size_ = fs; full_size_ = -1; recsize_file_size_ = -1; }
+
+ int recsize_linkpath() const;
+ int recsize_path() const;
+ int recsize_file_size() const;
+ unsigned long long edsize() const // extended data size
+ { return empty() ? 0 : recsize_linkpath() + recsize_path() +
+ recsize_file_size() + crc_record.size(); }
+ unsigned long long edsize_pad() const // edsize rounded up
+ { return round_up( edsize() ); }
+ unsigned long long full_size() const
+ { if( full_size_ < 0 )
+ full_size_ = ( empty() ? 0 : header_size + edsize_pad() );
+ return full_size_; }
+
+ bool crc_present() const { return crc_present_; }
+ bool parse( const char * const buf, const unsigned long long edsize,
+ const bool permissive );
+ };
+
+
+enum {
+ min_dictionary_bits = 12,
+ min_dictionary_size = 1 << min_dictionary_bits,
+ max_dictionary_bits = 29,
+ max_dictionary_size = 1 << max_dictionary_bits,
+ min_member_size = 36,
+ min_data_size = 2 * min_dictionary_size,
+ max_data_size = 2 * max_dictionary_size };
+
+
+inline bool isvalid_ds( const unsigned dictionary_size )
+ { return ( dictionary_size >= min_dictionary_size &&
+ dictionary_size <= max_dictionary_size ); }
+
+
+const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP"
+
+struct Lzip_header
+ {
+ uint8_t data[6]; // 0-3 magic bytes
+ // 4 version
+ // 5 coded_dict_size
+ enum { size = 6 };
+
+ bool verify_magic() const
+ { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }
+
+ bool verify_prefix( const int sz ) const // detect (truncated) header
+ {
+ for( int i = 0; i < sz && i < 4; ++i )
+ if( data[i] != lzip_magic[i] ) return false;
+ return ( sz > 0 );
+ }
+ bool verify_corrupt() const // detect corrupt header
+ {
+ int matches = 0;
+ for( int i = 0; i < 4; ++i )
+ if( data[i] == lzip_magic[i] ) ++matches;
+ return ( matches > 1 && matches < 4 );
+ }
+
+ uint8_t version() const { return data[4]; }
+ bool verify_version() const { return ( data[4] == 1 ); }
+
+ unsigned dictionary_size() const
+ {
+ unsigned sz = ( 1 << ( data[5] & 0x1F ) );
+ if( sz > min_dictionary_size )
+ sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
+ return sz;
+ }
+ };
+
+
+struct Lzip_trailer
+ {
+ uint8_t data[20]; // 0-3 CRC32 of the uncompressed data
+ // 4-11 size of the uncompressed data
+ // 12-19 member size including header and trailer
+ enum { size = 20 };
+
+ unsigned data_crc() const
+ {
+ unsigned tmp = 0;
+ for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; }
+ return tmp;
+ }
+
+ unsigned long long data_size() const
+ {
+ unsigned long long tmp = 0;
+ for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; }
+ return tmp;
+ }
+
+ unsigned long long member_size() const
+ {
+ unsigned long long tmp = 0;
+ for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; }
+ return tmp;
+ }
+
+ bool verify_consistency() const // check internal consistency
+ {
+ const unsigned crc = data_crc();
+ const unsigned long long dsize = data_size();
+ if( ( crc == 0 ) != ( dsize == 0 ) ) return false;
+ const unsigned long long msize = member_size();
+ if( msize < min_member_size ) return false;
+ const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
+ if( mlimit > dsize && msize > mlimit ) return false;
+ const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
+ if( dlimit > msize && dsize > dlimit ) return false;
+ return true;
+ }
+ };
+
+
+class CRC32
{
uint32_t data[256]; // Table of CRCs of all 8-bit messages.
public:
- CRC32C()
+ CRC32( const bool castagnoli = false )
{
+ const unsigned cpol = 0x82F63B78U; // CRC32-C Castagnoli polynomial.
+ const unsigned ipol = 0xEDB88320U; // IEEE 802.3 Ethernet polynomial.
+ const unsigned poly = castagnoli ? cpol : ipol;
+
for( unsigned n = 0; n < 256; ++n )
{
unsigned c = n;
for( int k = 0; k < 8; ++k )
- { if( c & 1 ) c = 0x82F63B78U ^ ( c >> 1 ); else c >>= 1; }
+ { if( c & 1 ) c = poly ^ ( c >> 1 ); else c >>= 1; }
data[n] = c;
}
}
+ void update_byte( uint32_t & crc, const uint8_t byte ) const
+ { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }
+
void update_buf( uint32_t & crc, const uint8_t * const buffer,
const int size ) const
{
@@ -78,32 +251,7 @@ public:
}
};
-extern const CRC32C crc32c;
-
-
-// Round "size" to the next multiple of header size (512).
-//
-inline unsigned long long round_up( unsigned long long size )
- {
- const int rem = size % header_size;
- const int padding = rem ? header_size - rem : 0;
- return size + padding;
- }
-
-
-struct Extended // stores metadata from/for extended records
- {
- std::string linkpath;
- std::string path;
- unsigned long long size;
- bool crc_present;
- Extended() : size( 0 ), crc_present( false ) {}
- void reset()
- { linkpath.clear(); path.clear(); size = 0; crc_present = false; }
- bool empty() { return linkpath.empty() && path.empty() && size == 0; }
- bool parse( const char * const buf, const unsigned long long edsize,
- const bool permissive );
- };
+extern const CRC32 crc32c;
enum { initial_line_length = 1000 }; // must be >= 77
@@ -132,10 +280,16 @@ public:
unsigned size() const { return size_; }
};
+const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
+const char * const bad_dict_msg = "Invalid dictionary size in member header.";
+const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
+const char * const trailing_msg = "Trailing data not allowed.";
+
// defined in create.cc
-enum Solidity { no_solid, dsolid, asolid, solid };
+enum Solidity { no_solid, bsolid, dsolid, asolid, solid };
extern int cl_owner;
extern int cl_group;
+extern int cl_data_size;
extern Solidity solidity;
unsigned ustar_chksum( const uint8_t * const header );
bool verify_ustar_chksum( const uint8_t * const header );
@@ -152,6 +306,8 @@ void format_member_name( const Extended & extended, const Tar_header header,
const char * remove_leading_slash( const char * const filename );
bool compare_prefix_dir( const char * const dir, const char * const name );
bool compare_tslash( const char * const name1, const char * const name2 );
+int readblock( const int fd, uint8_t * const buf, const int size );
+int writeblock( const int fd, const uint8_t * const buf, const int size );
unsigned long long parse_octal( const uint8_t * const ptr, const int size );
int decode( const std::string & archive_name, const Arg_parser & parser,
const int filenames, const int num_workers, const int debug_level,
diff --git a/testsuite/check.sh b/testsuite/check.sh
index f6f989f..e1e3f60 100755
--- a/testsuite/check.sh
+++ b/testsuite/check.sh
@@ -65,7 +65,7 @@ lzlib_1_11() { [ ${lwarn} = 0 ] &&
# Description of test files for tarlz:
# test.txt.tar.lz: 1 member (test.txt).
-# t155.tar[.lz]: directory + file + link + eof, all with 155 char names
+# t155.tar[.lz]: directory + links + file + eof, all with 155 char names
# tar_in_tlz1.tar.lz 2 members (test.txt.tar test3.tar) 3 lzip members
# tar_in_tlz2.tar.lz 2 members (test.txt.tar test3.tar) 5 lzip members
# test_bad1.tar.lz: truncated at offset 6000 (of 7495)
@@ -163,10 +163,11 @@ rm -f test.txt || framework_failure
"${TARLZ}" -xf "${in_tar}" --missing-crc || test_failed $LINENO
cmp "${in}" test.txt || test_failed $LINENO
rm -f test.txt || framework_failure
-#
-printf "foo\n" > cfoo || framework_failure
-printf "bar\n" > cbar || framework_failure
-printf "baz\n" > cbaz || framework_failure
+
+# reference files for cmp
+cat "${testdir}"/rfoo > cfoo || framework_failure
+cat "${testdir}"/rbar > cbar || framework_failure
+cat "${testdir}"/rbaz > cbaz || framework_failure
rm -f foo bar baz || framework_failure
"${TARLZ}" -xf "${test3_lz}" --missing-crc || test_failed $LINENO
cmp cfoo foo || test_failed $LINENO
@@ -261,7 +262,7 @@ for i in "${tarint1_lz}" "${tarint2_lz}" ; do
cmp out0 out6 || test_failed $LINENO
cmp out2 out6 || test_failed $LINENO
cmp outv0 outv2 || test_failed $LINENO
- cmp outv0 outv2 || test_failed $LINENO
+ cmp outv0 outv6 || test_failed $LINENO
cmp outv2 outv6 || test_failed $LINENO
rm -f out0 out2 out6 outv0 outv2 outv6 || framework_failure
"${TARLZ}" -xf "$i" || test_failed $LINENO
@@ -409,14 +410,14 @@ cat cbar > bar || framework_failure
cat cbaz > baz || framework_failure
"${TARLZ}" --solid -0 -cf out.tar.lz foo || test_failed $LINENO
cat out.tar.lz > aout.tar.lz || framework_failure
-for i in --asolid --dsolid --solid -0 ; do
+for i in --asolid --bsolid --dsolid --solid -0 ; do
"${TARLZ}" $i -q -rf out.tar.lz bar baz
[ $? = 2 ] || test_failed $LINENO $i
cmp out.tar.lz aout.tar.lz || test_failed $LINENO $i
done
rm -f out.tar.lz aout.tar.lz || framework_failure
-for i in --asolid --dsolid -0 ; do
- for j in --asolid --dsolid --solid -0 ; do
+for i in --asolid --bsolid --dsolid -0 ; do
+ for j in --asolid --bsolid --dsolid --solid -0 ; do
"${TARLZ}" $i -0 -cf out.tar.lz foo ||
test_failed $LINENO "$i $j"
"${TARLZ}" $j -0 -rf out.tar.lz bar baz ||
diff --git a/testsuite/rbar b/testsuite/rbar
new file mode 100644
index 0000000..5716ca5
--- /dev/null
+++ b/testsuite/rbar
@@ -0,0 +1 @@
+bar
diff --git a/testsuite/rbaz b/testsuite/rbaz
new file mode 100644
index 0000000..7601807
--- /dev/null
+++ b/testsuite/rbaz
@@ -0,0 +1 @@
+baz
diff --git a/testsuite/rfoo b/testsuite/rfoo
new file mode 100644
index 0000000..257cc56
--- /dev/null
+++ b/testsuite/rfoo
@@ -0,0 +1 @@
+foo
diff --git a/testsuite/t155.tar b/testsuite/t155.tar
index 4a0f37b..f2b8a4e 100644
--- a/testsuite/t155.tar
+++ b/testsuite/t155.tar
Binary files differ
diff --git a/testsuite/t155.tar.lz b/testsuite/t155.tar.lz
index 3219071..edc7f04 100644
--- a/testsuite/t155.tar.lz
+++ b/testsuite/t155.tar.lz
Binary files differ