summaryrefslogtreecommitdiffstats
path: root/extract.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--extract.cc398
1 files changed, 243 insertions, 155 deletions
diff --git a/extract.cc b/extract.cc
index 67f4a20..58cda61 100644
--- a/extract.cc
+++ b/extract.cc
@@ -28,6 +28,10 @@
#include <unistd.h>
#include <utime.h>
#include <sys/stat.h>
+#include <sys/types.h>
+#if defined(__GNU_LIBRARY__)
+#include <sys/sysmacros.h> // for makedev
+#endif
#include <lzlib.h>
#include "arg_parser.h"
@@ -38,6 +42,17 @@
namespace {
int gretval = 0;
+bool has_lz_ext; // global var for archive_read
+
+void skip_warn( const bool reset = false ) // avoid duplicate warnings: shows the message once until reset
+ {
+ static bool skipping = false; // true after the warning has been shown; cleared by reset
+
+ if( reset ) skipping = false; // re-arm the warning without printing anything
+ else if( !skipping ) // first call since start or since the last reset
+ { skipping = true; show_error( "Skipping to next header." ); }
+ }
+
bool make_path( const std::string & name )
{
@@ -68,42 +83,6 @@ bool make_path( const std::string & name )
}
-// Returns in buf the first rd bytes of the second lzip member or
-// the first 512 bytes of the second tar member, and sets islz if lzip member
-bool skip_first_member( const int infd, uint8_t * const buf,
- int & rd, bool & islz )
- {
- while( true )
- {
- for( int i = 0; i < rd; ++i )
- if( buf[i] == 'L' && (*(Lzip_header *)( buf + i )).verify_prefix( rd - i ) )
- {
- const int ts = rd - i; // tail size
- std::memmove( buf, buf + i, ts );
- if( ts >= (int)sizeof lzip_magic )
- { rd = ts; islz = true; return true; }
- int rd2 = readblock( infd, buf + ts, header_size - ts );
- if( rd2 != header_size - ts && errno )
- { show_error( "Error reading archive", errno ); return false; }
- if( ts + rd2 >= min_member_size &&
- (*(Lzip_header *)buf).verify_magic() )
- { rd = ts + rd2; islz = true; return true; }
- std::memmove( buf, buf + ts, rd2 );
- int rd3 = readblock( infd, buf + rd2, header_size - rd2 );
- if( rd3 != header_size - rd2 && errno )
- { show_error( "Error reading archive", errno ); return false; }
- rd = rd2 + rd3; i = -1;
- }
- if( rd < header_size ) return false; // eof
- if( rd == header_size && verify_ustar_chksum( buf ) )
- { islz = false; return true; }
- rd = readblock( infd, buf, header_size );
- if( rd != header_size && errno )
- { show_error( "Error reading archive", errno ); return false; }
- }
- }
-
-
inline bool block_is_zero( const uint8_t * const buf, const int size )
{
for( int i = 0; i < size; ++i ) if( buf[i] != 0 ) return false;
@@ -111,58 +90,83 @@ inline bool block_is_zero( const uint8_t * const buf, const int size )
}
-bool archive_read( const int infd, uint8_t * const buf, const int size )
+// Return value: 0 = OK, 1 = damaged member, 2 = fatal error.
+// If sizep is not null and an error occurs, *sizep is set to the number of bytes read.
+// The first 6 bytes of the archive must be intact for islz to be meaningful.
+int archive_read( const int infd, uint8_t * const buf, const int size,
+ int * const sizep = 0 )
{
static LZ_Decoder * decoder = 0;
- static bool first_call = true;
static bool at_eof = false;
+ static bool fatal = false;
+ static bool first_call = true;
+ if( sizep ) *sizep = 0;
+ if( fatal ) return 2;
if( first_call ) // check format
{
first_call = false;
if( size != header_size )
internal_error( "size != header_size on first call." );
- int rd = readblock( infd, buf, size );
+ const int rd = readblock( infd, buf, size );
+ if( sizep ) *sizep = rd;
if( rd != size && errno )
- { show_error( "Error reading archive", errno ); return false; }
- bool islz =
- ( rd >= min_member_size && (*(Lzip_header *)buf).verify_magic() );
+ { show_error( "Error reading archive", errno ); fatal = true; return 2; }
+ const Lzip_header & header = (*(const Lzip_header *)buf);
+ bool islz = ( rd >= min_member_size && header.verify_magic() &&
+ isvalid_ds( header.dictionary_size() ) );
const bool istar = ( rd == size && verify_ustar_chksum( buf ) );
const bool iseof =
( !islz && !istar && rd == size && block_is_zero( buf, size ) );
- if( !islz && !istar && !iseof )
+ if( !islz && !istar && !iseof ) // corrupt or invalid format
{
- show_error( "This does not look like a tar archive." );
- show_error( "Skipping to next header." );
-// std::fprintf( stderr, "%07o\n", ustar_chksum( buf ) );
- gretval = 2;
- if( !skip_first_member( infd, buf, rd, islz ) ) return false;
+ show_error( "This does not look like a POSIX tar archive." );
+ if( has_lz_ext ) islz = true;
+ if( verbosity >= 2 && !islz && rd == size )
+ std::fprintf( stderr, "ustar chksum = %07o\n", ustar_chksum( buf ) );
+ if( !islz ) return 1;
}
- if( !islz ) return true; // uncompressed
+ if( !islz ) // uncompressed
+ { if( rd == size ) return 0; fatal = true; return 2; }
decoder = LZ_decompress_open(); // compressed
if( !decoder || LZ_decompress_errno( decoder ) != LZ_ok )
{ show_error( "Not enough memory." );
- LZ_decompress_close( decoder ); return false; }
+ LZ_decompress_close( decoder ); fatal = true; return 2; }
if( LZ_decompress_write( decoder, buf, rd ) != rd )
internal_error( "library error (LZ_decompress_write)." );
- if( !archive_read( infd, buf, size ) ) return false;
- if( verify_ustar_chksum( buf ) || block_is_zero( buf, size ) ) return true;
- show_error( "This does not look like a tar archive." );
- show_error( "Skipping to next header." );
- gretval = 2;
- if( LZ_decompress_sync_to_member( decoder ) < 0 )
- internal_error( "library error (LZ_decompress_sync_to_member)." );
+ const int res = archive_read( infd, buf, size, sizep );
+ if( res != 0 ) { if( res == 2 ) fatal = true; return res; }
+ if( verify_ustar_chksum( buf ) || block_is_zero( buf, size ) ) return 0;
+ show_error( "This does not look like a POSIX tar.lz archive." );
+ fatal = true; return 2;
}
if( !decoder ) // uncompressed
- { if( readblock( infd, buf, size ) == size ) return true;
- show_error( "Archive ends unexpectedly." ); return false; }
+ {
+ const int rd = readblock( infd, buf, size ); if( rd == size ) return 0;
+ if( sizep ) *sizep = rd;
+ show_error( "Archive ends unexpectedly." ); fatal = true; return 2;
+ }
const int ibuf_size = 16384;
uint8_t ibuf[ibuf_size];
int sz = 0;
while( sz < size )
{
- if( !at_eof && LZ_decompress_write_size( decoder ) > 0 )
+ const int rd = LZ_decompress_read( decoder, buf + sz, size - sz );
+ if( rd < 0 )
+ {
+ if( LZ_decompress_sync_to_member( decoder ) < 0 )
+ internal_error( "library error (LZ_decompress_sync_to_member)." );
+ skip_warn(); gretval = 2; return 1;
+ }
+ if( rd == 0 && LZ_decompress_finished( decoder ) == 1 )
+ { LZ_decompress_close( decoder );
+ show_error( "Archive ends unexpectedly." ); fatal = true; return 2; }
+ sz += rd; if( sizep ) *sizep = sz;
+ if( sz == size && LZ_decompress_finished( decoder ) == 1 &&
+ LZ_decompress_close( decoder ) < 0 )
+ { show_error( "LZ_decompress_close failed." ); fatal = true; return 2; }
+ if( sz < size && !at_eof && LZ_decompress_write_size( decoder ) > 0 )
{
const int rsize = std::min( ibuf_size, LZ_decompress_write_size( decoder ) );
const int rd = readblock( infd, ibuf, rsize );
@@ -172,27 +176,12 @@ bool archive_read( const int infd, uint8_t * const buf, const int size )
{
at_eof = true; LZ_decompress_finish( decoder );
if( errno )
- { show_error( "Error reading archive", errno ); return false; }
+ { show_error( "Error reading archive", errno ); fatal = true;
+ return 2; }
}
}
- const int rd = LZ_decompress_read( decoder, buf + sz, size - sz );
- if( rd < 0 )
- {
- show_error( "Skipping to next header." );
- gretval = 2;
- if( LZ_decompress_sync_to_member( decoder ) < 0 )
- internal_error( "library error (LZ_decompress_sync_to_member)." );
- continue;
- }
- if( rd == 0 && LZ_decompress_finished( decoder ) == 1 )
- { LZ_decompress_close( decoder );
- show_error( "Archive ends unexpectedly." ); return false; }
- sz += rd;
- if( sz == size && LZ_decompress_finished( decoder ) == 1 &&
- LZ_decompress_close( decoder ) < 0 )
- { show_error( "LZ_decompress_close failed." ); return false; }
}
- return true;
+ return 0;
}
@@ -251,55 +240,44 @@ const char * user_group_string( const Tar_header header )
}
-const char * link_string( const Tar_header header )
- {
- enum { bufsize = 9 + linkname_l + 1 };
- static char buf[bufsize];
- const Typeflag typeflag = (Typeflag)header[typeflag_o];
-
- if( typeflag == tf_link )
- snprintf( buf, bufsize, " link to %.100s", header + linkname_o );
- else if( typeflag == tf_symlink )
- snprintf( buf, bufsize, " -> %.100s", header + linkname_o );
- else buf[0] = 0;
- return buf;
- }
-
-
-void show_member_name( const char * const filename, const Tar_header header,
+void show_member_name( const Extended & extended, const Tar_header header,
const int vlevel )
{
if( verbosity < vlevel ) return;
if( verbosity > vlevel )
{
const time_t mtime = strtoull( header + mtime_o, 0, 8 ); // 33 bits
- struct tm * tm = localtime( &mtime );
- std::printf( "%s %s %9llu %4d-%02u-%02u %02u:%02u %s%s\n",
+ const struct tm * const tm = localtime( &mtime );
+ const Typeflag typeflag = (Typeflag)header[typeflag_o];
+ const bool islink = ( typeflag == tf_link || typeflag == tf_symlink );
+ const char * const link_string = !islink ? "" :
+ ( ( typeflag == tf_link ) ? " link to " : " -> " );
+ std::printf( "%s %s %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
mode_string( header ), user_group_string( header ),
- strtoull( header + size_o, 0, 8 ), 1900 + tm->tm_year,
- 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min,
- filename, link_string( header ) );
+ extended.size, 1900 + tm->tm_year, 1 + tm->tm_mon,
+ tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path.c_str(),
+ link_string, !islink ? "" : extended.linkpath.c_str() );
}
- else std::printf( "%s\n", filename );
+ else std::printf( "%s\n", extended.path.c_str() );
std::fflush( stdout );
}
-int list_member( const int infd, const char * const filename,
- const unsigned long long file_size, const Tar_header header,
- const bool skip )
+int list_member( const int infd, const Extended & extended,
+ const Tar_header header, const bool skip )
{
- if( !skip ) show_member_name( filename, header, 0 );
+ if( !skip ) show_member_name( extended, header, 0 );
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
- unsigned long long rest = file_size;
- const int rem = file_size % header_size;
+ unsigned long long rest = extended.size;
+ const int rem = extended.size % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
const int rsize = ( rest >= bufsize ) ? bufsize : rest + padding;
- if( !archive_read( infd, buf, rsize ) ) return 2;
+ const int ret = archive_read( infd, buf, rsize );
+ if( ret != 0 ) { if( ret == 2 ) return 2; else break; }
if( rest < bufsize ) break;
rest -= rsize;
}
@@ -317,13 +295,14 @@ bool contains_dotdot( const char * const filename )
}
-int extract_member( const int infd, const char * const filename,
- const unsigned long long file_size, const Tar_header header )
+int extract_member( const int infd, const Extended & extended,
+ const Tar_header header, const bool keep_damaged )
{
+ const char * const filename = extended.path.c_str();
if( contains_dotdot( filename ) )
{
show_file_error( filename, "Contains a '..' component, skipping." );
- return list_member( infd, filename, file_size, header, true );
+ return list_member( infd, extended, header, true );
}
const mode_t mode = strtoul( header + mode_o, 0, 8 ); // 12 bits
const time_t mtime = strtoull( header + mtime_o, 0, 8 ); // 33 bits
@@ -331,7 +310,7 @@ int extract_member( const int infd, const char * const filename,
const bool islink = ( typeflag == tf_link || typeflag == tf_symlink );
int outfd = -1;
- show_member_name( filename, header, 1 );
+ show_member_name( extended, header, 1 );
std::remove( filename );
make_path( filename );
switch( typeflag )
@@ -345,14 +324,12 @@ int extract_member( const int infd, const char * const filename,
case tf_link:
case tf_symlink:
{
- char linkname[linkname_l+1];
- std::memcpy( linkname, header + linkname_o, linkname_l );
- linkname[linkname_l] = 0;
+ const char * const linkname = extended.linkpath.c_str();
/* if( contains_dotdot( linkname ) )
{
show_file_error( filename,
"Link destination contains a '..' component, skipping." );
- return list_member( infd, filename, file_size, header, false );
+ return list_member( infd, extended, header, false );
}*/
const bool hard = typeflag == tf_link;
if( ( hard && link( linkname, filename ) != 0 ) ||
@@ -410,15 +387,25 @@ int extract_member( const int infd, const char * const filename,
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
- unsigned long long rest = file_size;
- const int rem = file_size % header_size;
+ unsigned long long rest = extended.size;
+ const int rem = extended.size % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
const int rsize = ( rest >= bufsize ) ? bufsize : rest + padding;
- if( !archive_read( infd, buf, rsize ) )
- { if( outfd >= 0 ) { close( outfd ); std::remove( filename ); }
- return 2; }
+ int rd;
+ const int ret = archive_read( infd, buf, rsize, &rd );
+ if( ret != 0 )
+ {
+ if( outfd >= 0 )
+ {
+ if( keep_damaged )
+ { writeblock( outfd, buf, std::min( rest, (unsigned long long)rd ) );
+ close( outfd ); }
+ else { close( outfd ); std::remove( filename ); }
+ }
+ if( ret == 2 ) return 2; else return 0;
+ }
const int wsize = ( rest >= bufsize ) ? bufsize : rest;
if( outfd >= 0 && writeblock( outfd, buf, wsize ) != wsize )
{ show_file_error( filename, "Error writing file", errno ); return 2; }
@@ -437,6 +424,7 @@ int extract_member( const int infd, const char * const filename,
}
+// Removes any number of leading "./" and "/" prefixes.
const char * remove_leading_slash( const char * const filename )
{
static bool first_post = true;
@@ -479,73 +467,173 @@ bool compare_tslash( const char * const name1, const char * const name2 )
} // end namespace
+// Reads the extended records announced by the extended header 'header' from
+// 'infd' (through archive_read), stores the path, linkpath and size records
+// found and verifies the GNU.crc32 record. Other records are skipped.
+// Unless 'permissive', a repeated record is an error. Returns false if there is
+// no extended data, it is too large, can't be read, or a record is invalid.
+bool Extended::parse( const int infd, const Tar_header header, const bool permissive )
+  {
+  const unsigned long long edsize = strtoull( header + size_o, 0, 8 );
+  const unsigned long long bufsize = round_up( edsize );
+  if( bufsize == 0 || edsize == 0 || bufsize >= 1ULL << 31 )
+    return false;  // no extended data, or too large for the int size of archive_read
+  char * const buf = new char[bufsize+1];  // extended records buffer
+  buf[bufsize] = 0;  // terminator: keeps strtoull/strtoul from running past the data
+  if( archive_read( infd, (uint8_t *)buf, bufsize ) != 0 ) goto error;
+  for( unsigned long long pos = 0; pos < edsize; )  // parse records
+    {
+    char * tail;  // end of the length field; must lie inside the current record
+    const unsigned long long rsize = strtoull( buf + pos, &tail, 10 );
+    if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' ||
+        tail >= buf + pos + rsize - 1 || buf[pos+rsize-1] != '\n' ) goto error;
+    ++tail;  // point to keyword
+    // length of (keyword + '=' + value) without the final newline
+    const unsigned long long rest = ( buf + pos + rsize - 1 ) - tail;
+    if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
+      { if( path.size() && !permissive ) goto error;
+        path.assign( tail + 5, rest - 5 ); }
+    else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
+      { if( linkpath.size() && !permissive ) goto error;
+        linkpath.assign( tail + 9, rest - 9 ); }
+    else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
+      {
+      if( size != 0 && !permissive ) goto error;
+      size = 0;
+      for( unsigned long long i = 5; i < rest; ++i )
+        {
+        if( tail[i] < '0' || tail[i] > '9' ) goto error;
+        const unsigned long long digit = tail[i] - '0';
+        if( size > ( ~0ULL - digit ) / 10 ) goto error;  // would overflow
+        size = size * 10 + digit;
+        }
+      if( size < 1ULL << 33 ) goto error;  // size fits in ustar header
+      }
+    else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
+      {
+      if( ( crc_present && !permissive ) || rsize != 22 ) goto error;
+      char * t;
+      const uint32_t stored_crc = strtoul( tail + 10, &t, 16 );
+      if( t - tail - 10 != 8 || t[0] != '\n' ) goto error;  // exactly 8 hex digits
+      const uint32_t computed_crc =
+        crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
+      crc_present = true; if( stored_crc != computed_crc ) goto error;
+      }
+    pos += rsize;
+    }
+  delete[] buf; return true;
+error: delete[] buf; return false;
+  }
+
+
int decode( const std::string & archive_name, const Arg_parser & parser,
- const int filenames, const bool listing )
+ const int filenames, const bool keep_damaged, const bool listing,
+ const bool missing_crc, const bool permissive )
{
const int infd = archive_name.size() ?
open_instream( archive_name ) : STDIN_FILENO;
if( infd < 0 ) return 1;
+ // execute -C options and mark filenames to be extracted or listed
std::vector< bool > name_pending( parser.arguments(), false );
for( int i = 0; i < parser.arguments(); ++i )
{
const int code = parser.code( i );
if( code == 'C' && !listing )
{
- const char * const filename = parser.argument( i ).c_str();
- if( chdir( filename ) != 0 )
- { show_file_error( filename, "Error changing working directory", errno );
+ const char * const dir = parser.argument( i ).c_str();
+ if( chdir( dir ) != 0 )
+ { show_file_error( dir, "Error changing working directory", errno );
return 1; }
}
if( !code ) name_pending[i] = true;
}
+ has_lz_ext =
+ ( archive_name.size() > 3 &&
+ archive_name.compare( archive_name.size() - 3, 3, ".lz" ) == 0 ) ||
+ ( archive_name.size() > 4 &&
+ archive_name.compare( archive_name.size() - 4, 4, ".tlz" ) == 0 );
+ Extended extended; // metadata from extended records
int retval = 0;
- bool skipping = false;
+ bool prev_extended = false; // prev header was extended
while( true ) // process one member per iteration
{
uint8_t buf[header_size];
- if( !archive_read( infd, buf, header_size ) ) return 2;
- if( !verify_ustar_chksum( buf ) )
+ const int ret = archive_read( infd, buf, header_size );
+ if( ret == 2 ) return 2;
+ if( ret != 0 || !verify_ustar_chksum( buf ) )
{
- if( block_is_zero( buf, header_size ) ) break;
- gretval = 2;
- if( !skipping )
- { skipping = true; show_error( "Skipping to next header." ); }
- continue;
+ if( ret == 0 && block_is_zero( buf, header_size ) ) break; // EOF
+ skip_warn(); gretval = 2; continue;
}
- skipping = false;
+ skip_warn( true ); // reset warning
const char * const header = (const char *)buf;
- enum { max_filename_size = prefix_l + 1 + name_l + 1 };
- char stored_name[max_filename_size];
- int len = 0;
- while( len < prefix_l && header[prefix_o+len] )
- { stored_name[len] = header[prefix_o+len]; ++len; }
- if( len && header[name_o] ) stored_name[len++] = '/';
- for( int i = 0; i < name_l && header[name_o+i]; ++i )
- { stored_name[len] = header[name_o+i]; ++len; }
- while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
- stored_name[len] = 0;
- const char * const filename = remove_leading_slash( stored_name );
+ const Typeflag typeflag = (Typeflag)header[typeflag_o];
+ if( typeflag == tf_extended )
+ {
+ if( prev_extended && !permissive )
+ { show_error( "Format violation: consecutive extended headers found."
+ /*" Use --permissive."*/, 0, true ); return 2; }
+ if( !extended.parse( infd, header, permissive ) )
+ { show_error( "Error in extended records. Skipping to next header." );
+ extended.reset(); gretval = 2; }
+ else if( !extended.crc_present && missing_crc )
+ { show_error( "Missing CRC in extended records.", 0, true ); return 2; }
+ prev_extended = true;
+ continue;
+ }
+ prev_extended = false;
+
+ if( extended.linkpath.empty() )
+ {
+ for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
+ extended.linkpath += header[linkname_o+i];
+ while( extended.linkpath.size() > 1 && // trailing '/'
+ extended.linkpath[extended.linkpath.size()-1] == '/' )
+ extended.linkpath.resize( extended.linkpath.size() - 1 );
+ }
+
+ if( extended.path.empty() )
+ {
+ char stored_name[prefix_l+1+name_l+1];
+ int len = 0;
+ while( len < prefix_l && header[prefix_o+len] )
+ { stored_name[len] = header[prefix_o+len]; ++len; }
+ if( len && header[name_o] ) stored_name[len++] = '/';
+ for( int i = 0; i < name_l && header[name_o+i]; ++i )
+ { stored_name[len] = header[name_o+i]; ++len; }
+ while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
+ stored_name[len] = 0;
+ extended.path = remove_leading_slash( stored_name );
+ }
+ const char * const filename = extended.path.c_str();
bool skip = filenames > 0;
if( skip )
for( int i = 0; i < parser.arguments(); ++i )
- if( parser.code( i ) == 0 &&
- ( compare_prefix_dir( parser.argument( i ).c_str(), filename ) ||
- compare_tslash( filename, parser.argument( i ).c_str() ) ) )
- { skip = false; name_pending[i] = false; break; }
+ if( parser.code( i ) == 0 )
+ {
+ const char * const name =
+ remove_leading_slash( parser.argument( i ).c_str() );
+ if( compare_prefix_dir( name, filename ) ||
+ compare_tslash( name, filename ) )
+ { skip = false; name_pending[i] = false; break; }
+ }
+
+ if( extended.size == 0 &&
+ ( typeflag == tf_regular || typeflag == tf_hiperf ) )
+ extended.size = strtoull( header + size_o, 0, 8 );
- const Typeflag typeflag = (Typeflag)header[typeflag_o];
- const unsigned long long file_size =
- ( typeflag == tf_regular || typeflag == tf_hiperf ) ?
- strtoull( header + size_o, 0, 8 ) : 0;
if( listing || skip )
- retval = list_member( infd, filename, file_size, header, skip );
+ retval = list_member( infd, extended, header, skip );
else
- retval = extract_member( infd, filename, file_size, header );
- if( retval ) return retval;
+ retval = extract_member( infd, extended, header, keep_damaged );
+ extended.reset();
+ if( retval )
+ { show_error( "Error is not recoverable: exiting now." );
+ return retval; }
}
for( int i = 0; i < parser.arguments(); ++i )