summaryrefslogtreecommitdiffstats
path: root/tarlz.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--tarlz.h608
1 files changed, 608 insertions, 0 deletions
diff --git a/tarlz.h b/tarlz.h
new file mode 100644
index 0000000..16ae6e0
--- /dev/null
+++ b/tarlz.h
@@ -0,0 +1,608 @@
+/* Tarlz - Archiver with multimember lzip compression
+ Copyright (C) 2013-2024 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#define max_file_size ( LLONG_MAX - header_size ) // a macro; header_size is only declared on the next line
+enum { header_size = 512, // size in bytes of a Tar_header
+ max_edata_size = ( INT_MAX / header_size - 2 ) * header_size }; // a multiple of header_size
+typedef uint8_t Tar_header[header_size]; // raw header block, one byte per element
+
+enum Offsets { // byte offset of each field from the start of a Tar_header
+ name_o = 0, mode_o = 100, uid_o = 108, gid_o = 116, size_o = 124,
+ mtime_o = 136, chksum_o = 148, typeflag_o = 156, linkname_o = 157,
+ magic_o = 257, version_o = 263, uname_o = 265, gname_o = 297,
+ devmajor_o = 329, devminor_o = 337, prefix_o = 345 };
+
+enum Lengths { // size in bytes of each field; every field starts where the previous one ends
+ name_l = 100, mode_l = 8, uid_l = 8, gid_l = 8, size_l = 12,
+ mtime_l = 12, chksum_l = 8, typeflag_l = 1, linkname_l = 100,
+ magic_l = 6, version_l = 2, uname_l = 32, gname_l = 32,
+ devmajor_l = 8, devminor_l = 8, prefix_l = 155 }; // prefix_o + prefix_l == 500 < header_size
+
+enum Typeflag { // character codes for the one-byte field at typeflag_o
+ tf_regular = '0', tf_link = '1', tf_symlink = '2', tf_chardev = '3',
+ tf_blockdev = '4', tf_directory = '5', tf_fifo = '6', tf_hiperf = '7',
+ tf_global = 'g', tf_extended = 'x' };
+
+const uint8_t ustar_magic[magic_l] =
+ { 0x75, 0x73, 0x74, 0x61, 0x72, 0 }; // "ustar\0", the expected contents of the field at magic_o
+
+inline bool check_ustar_magic( const Tar_header header )  // true if the magic field is "ustar\0"
+  { return !std::memcmp( ustar_magic, header + magic_o, magic_l ); }
+
+inline void init_tar_header( Tar_header header )  // zero the header, then set magic and version
+  {
+  std::memset( header, 0, header_size );
+  std::memcpy( header + magic_o, ustar_magic, magic_l - 1 );  // last magic byte is already 0
+  header[version_o] = '0'; header[version_o+1] = '0';
+  }
+
+inline void print_octal( uint8_t * const buf, int size, unsigned long long num )  // fills buf[0..size-1], least significant digit last
+  { for( int i = size - 1; i >= 0; --i, num >>= 3 ) buf[i] = '0' + ( num & 7 ); }
+
+
+// Return "size" rounded up to a multiple of header size (512).
+// A value that is already a multiple is returned unchanged.
+inline unsigned long long round_up( const unsigned long long size )
+  {
+  const int excess = size % header_size;  // bytes past the last full block
+  if( excess == 0 ) return size;
+  return size + ( header_size - excess );
+  }
+
+
+inline int decimal_digits( unsigned long long value )  // number of digits of value in base 10
+  {
+  int digits = 0;
+  do { value /= 10; ++digits; } while( value != 0 );  // 0 still counts as one digit
+  return digits;
+  }
+
+
+inline bool dotdot_at_i( const char * const filename, const int i )
+  {  // true if ".." at filename[i] is delimited by '/' or by the ends of the string
+  if( filename[i] != '.' || filename[i+1] != '.' ) return false;
+  if( i != 0 && filename[i-1] != '/' ) return false;
+  return filename[i+2] == 0 || filename[i+2] == '/';
+  }
+
+
+inline bool contains_dotdot( const char * const filename )
+  {  // test every position of filename with dotdot_at_i
+  int i = 0;
+  while( filename[i] != 0 && !dotdot_at_i( filename, i ) ) ++i;
+  return filename[i] != 0;  // stopped before the terminator means a match
+  }
+
+
+class Resizable_buffer  // owns a malloc'ed buffer that can grow but never shrinks
+  {
+  char * p;  // null if the allocation in the constructor failed
+  unsigned long size_;  // size_ < LONG_MAX
+  Resizable_buffer( const Resizable_buffer & );  // not defined: a copy would free p twice
+  Resizable_buffer & operator=( const Resizable_buffer & );  // not defined, same reason
+public:
+  // must be >= 87 for format_member_name
+  enum { default_initial_size = 2 * header_size };
+
+  explicit Resizable_buffer( const unsigned long initial_size =
+                             default_initial_size )
+    : p( (char *)std::malloc( initial_size ) ), size_( p ? initial_size : 0 ) {}
+  ~Resizable_buffer() { if( p ) std::free( p ); p = 0; size_ = 0; }
+
+  // Grow to at least new_size bytes; false (buffer unchanged) if too large or out of memory.
+  bool resize( const unsigned long long new_size )
+    {
+    if( new_size >= LONG_MAX ) return false;
+    if( size_ >= new_size ) return true;  // already large enough
+    char * const tmp = (char *)std::realloc( p, new_size );
+    if( !tmp ) return false;  // realloc failed; p is still valid
+    p = tmp; size_ = new_size;
+    return true;
+    }
+  char * operator()() { return p; }
+  const char * operator()() const { return p; }
+  uint8_t * u8() { return (uint8_t *)p; }
+  const uint8_t * u8() const { return (const uint8_t *)p; }
+  unsigned long size() const { return size_; }  // 0 if the initial allocation failed
+  };
+
+
+inline bool uid_in_ustar_range( const long long uid )  // also for gid; range [0, 2^21)
+  { return !( uid < 0 || uid >= ( 1LL << 21 ) ); }
+
+inline bool time_in_ustar_range( const long long seconds )  // range [0, 2^33)
+  { return !( seconds < 0 || seconds >= ( 1LL << 33 ) ); }
+
+
+/* The sign of the seconds field applies to the whole time value.
+   A nanoseconds value out of range means an invalid time. */
+class Etime  // time since (or before) the epoch
+  {
+  long long sec_;
+  int nsec_;  // range [0, 999_999_999]; reset() stores -1 here
+
+public:
+  Etime() { reset(); }
+  void reset() { nsec_ = -1; sec_ = 0; }  // makes isvalid() return false
+  void set( const long long s ) { nsec_ = 0; sec_ = s; }  // whole seconds, no fraction
+  long long sec() const { return sec_; }
+  int nsec() const { return nsec_; }
+  bool isvalid() const { return !( nsec_ < 0 || nsec_ > 999999999 ); }
+  bool out_of_ustar_range() const  // false for an invalid time
+    { return isvalid() ? !time_in_ustar_range( sec_ ) : false; }
+
+  unsigned decimal_size() const;
+  unsigned print( char * const buf ) const;
+  bool parse( const char * const ptr, const char ** const tailp,
+              const int size );
+  };
+
+
+class Extended  // stores metadata from/for extended records
+  {
+  static std::vector< std::string > unknown_keywords;  // already diagnosed
+  std::string linkpath_;  // these are the real metadata
+  std::string path_;
+  long long file_size_;  // >= 0 && <= max_file_size
+  long long uid_, gid_;  // may not fit in unsigned int; -1 until set
+  Etime atime_, mtime_;
+
+  // cached sizes; if full_size_ <= -4 they must be recalculated
+  mutable int edsize_;  // extended data size
+  mutable int padded_edsize_;  // edsize rounded up
+  mutable int full_size_;  // header + padded edsize
+  mutable int linkpath_recsize_;
+  mutable int path_recsize_;
+  mutable int file_size_recsize_;
+  mutable int uid_recsize_;
+  mutable int gid_recsize_;
+  mutable int atime_recsize_;
+  mutable int mtime_recsize_;
+
+  // true if CRC present in parsed or formatted records
+  mutable bool crc_present_;
+
+  void calculate_sizes() const;
+  void unknown_keyword( const char * const buf, const int size ) const;
+
+public:
+  static const std::string crc_record;
+  std::string removed_prefix;
+
+  Extended()
+    : file_size_( 0 ), uid_( -1 ), gid_( -1 ), edsize_( 0 ),
+      padded_edsize_( 0 ), full_size_( 0 ), linkpath_recsize_( 0 ),
+      path_recsize_( 0 ), file_size_recsize_( 0 ), uid_recsize_( 0 ),
+      gid_recsize_( 0 ), atime_recsize_( 0 ), mtime_recsize_( 0 ),
+      crc_present_( false ) {}
+
+  void reset()  // return every member to the value the constructor gives it
+    { removed_prefix.clear(); linkpath_.clear(); path_.clear();
+      file_size_ = 0; uid_ = gid_ = -1; atime_.reset(); mtime_.reset();
+      edsize_ = padded_edsize_ = full_size_ = 0;
+      linkpath_recsize_ = path_recsize_ = file_size_recsize_ = 0;
+      uid_recsize_ = gid_recsize_ = atime_recsize_ = mtime_recsize_ = 0;
+      crc_present_ = false; }
+
+  const std::string & linkpath() const { return linkpath_; }
+  const std::string & path() const { return path_; }
+  long long file_size() const { return file_size_; }
+  long long get_file_size_and_reset( const Tar_header header );
+  long long get_uid() const { return uid_; }
+  long long get_gid() const { return gid_; }
+  const Etime & atime() const { return atime_; }
+  const Etime & mtime() const { return mtime_; }
+
+  void linkpath( const char * const lp ) { linkpath_.assign( lp ); full_size_ = -4; }
+  void path( const char * const p ) { path_.assign( p ); full_size_ = -4; }
+  void file_size( const long long fs ) { full_size_ = -4;  // out-of-range sizes are stored as 0
+    file_size_ = ( fs < 0 || fs > max_file_size ) ? 0 : fs; }
+  bool set_uid( const long long id )  // negative ids are rejected
+    { if( id < 0 ) return false; uid_ = id; full_size_ = -4; return true; }
+  bool set_gid( const long long id )  // negative ids are rejected
+    { if( id < 0 ) return false; gid_ = id; full_size_ = -4; return true; }
+  void set_atime( const long long s ) { atime_.set( s ); full_size_ = -4; }
+  void set_mtime( const long long s ) { mtime_.set( s ); full_size_ = -4; }
+
+  /* Return the size of the extended block, or 0 if empty.
+     Return -1 if error, -2 if out of memory, -3 if block too long. */
+  int full_size() const  // every setter stores -4 to request a recalculation here
+    { if( full_size_ < -3 ) { calculate_sizes(); } return full_size_; }
+  int format_block( Resizable_buffer & rbuf ) const;
+  const char * full_size_error() const;
+
+  bool crc_present() const { return crc_present_; }
+  bool parse( const char * const buf, const int edsize,
+              const bool permissive );
+  void fill_from_ustar( const Tar_header header );
+  };
+
+
+class CRC32  // byte-at-a-time CRC using a 256-entry table
+  {
+  uint32_t data[256];  // Table of CRCs of all 8-bit messages.
+
+public:
+  CRC32( const bool castagnoli = false )  // choose the polynomial and fill the table
+    {
+    const unsigned cpol = 0x82F63B78U;  // CRC32-C  Castagnoli polynomial.
+    const unsigned ipol = 0xEDB88320U;  // IEEE 802.3 Ethernet polynomial.
+    const unsigned poly = castagnoli ? cpol : ipol;
+
+    for( unsigned n = 0; n < 256; ++n )
+      {
+      unsigned c = n;
+      for( int bit = 8; bit > 0; --bit )
+        c = ( c >> 1 ) ^ ( ( c & 1 ) ? poly : 0 );
+      data[n] = c;
+      }
+    }
+
+  void update_byte( uint32_t & crc, const uint8_t byte ) const
+    { const uint32_t idx = ( crc ^ byte ) & 0xFF; crc = ( crc >> 8 ) ^ data[idx]; }
+
+  // about as fast as it is possible without messing with endianness
+  void update_buf( uint32_t & crc, const uint8_t * const buffer,
+                   const int size ) const
+    {
+    uint32_t c = crc;  // work on a local copy, store the result once
+    int i = 0;
+    while( i < size ) { c = ( c >> 8 ) ^ data[(c^buffer[i])&0xFF]; ++i; }
+    crc = c;
+    }
+
+  uint32_t compute_crc( const uint8_t * const buffer, const int size ) const
+    {
+    // update_buf bracketed by the initial and final inversion
+    uint32_t crc = 0xFFFFFFFFU;
+    update_buf( crc, buffer, size );
+    return crc ^ 0xFFFFFFFFU;
+    }
+
+  // Calculates the crc of size bytes except a window of 8 bytes at pos
+  uint32_t windowed_crc( const uint8_t * const buffer, const int pos,
+                         const int size ) const
+    {
+    uint32_t crc = 0xFFFFFFFFU;
+    update_buf( crc, buffer, pos );  // bytes before the window
+    update_buf( crc, buffer + pos + 8, size - pos - 8 );  // bytes after the window
+    return crc ^ 0xFFFFFFFFU;
+    }
+  };
+
+
+struct Lzma_options // element type of the option_mapping table
+ {
+ int dictionary_size; // 4 KiB .. 512 MiB
+ int match_len_limit; // 5 .. 273
+ };
+const Lzma_options option_mapping[] = // { dictionary_size, match_len_limit } for each of the ten levels
+ {
+ { 65535, 16 }, // -0
+ { 1 << 20, 5 }, // -1
+ { 3 << 19, 6 }, // -2
+ { 1 << 21, 8 }, // -3
+ { 3 << 20, 12 }, // -4
+ { 1 << 22, 20 }, // -5
+ { 1 << 23, 36 }, // -6
+ { 1 << 24, 68 }, // -7
+ { 3 << 23, 132 }, // -8
+ { 1 << 25, 273 } }; // -9
+
+
+enum { // size limits checked by isvalid_ds and Lzip_trailer
+ min_dictionary_bits = 12,
+ min_dictionary_size = 1 << min_dictionary_bits, // 4 KiB
+ max_dictionary_bits = 29,
+ max_dictionary_size = 1 << max_dictionary_bits, // 512 MiB
+ min_member_size = 36, // smallest member size accepted by Lzip_trailer::check_consistency
+ min_data_size = 2 * min_dictionary_size, // 8 KiB
+ max_data_size = 2 * max_dictionary_size }; // 1 GiB
+
+
+inline bool isvalid_ds( const unsigned dictionary_size )  // true if within [min, max]_dictionary_size
+  { return !( dictionary_size < min_dictionary_size ||
+              dictionary_size > max_dictionary_size ); }
+
+
+const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP", compared with bytes 0-3 of Lzip_header
+
+struct Lzip_header
+  {
+  enum { size = 6 };
+  uint8_t data[size];  // 0-3 magic bytes
+                       //   4 version
+                       //   5 coded dictionary size
+
+  bool check_magic() const
+    { return !std::memcmp( lzip_magic, data, 4 ); }
+
+  bool check_prefix( const int sz ) const  // detect (truncated) header
+    {
+    const int len = ( sz < 4 ) ? sz : 4;  // magic bytes available in sz bytes
+    if( len <= 0 ) return false;
+    return std::memcmp( data, lzip_magic, len ) == 0;
+    }
+
+  bool check_corrupt() const  // detect corrupt header
+    {
+    int matches = 0;  // magic bytes that are still intact
+    for( int i = 3; i >= 0; --i )
+      matches += ( data[i] == lzip_magic[i] );
+    return ( matches >= 2 && matches <= 3 );
+    }
+
+  uint8_t version() const { return data[4]; }
+  bool check_version() const { return ( version() == 1 ); }
+
+  unsigned dictionary_size() const  // decode byte 5
+    {
+    const unsigned base = ( 1 << ( data[5] & 0x1F ) );  // power of 2 from the low 5 bits
+    if( base <= min_dictionary_size ) return base;
+    const unsigned sixteenths = ( data[5] >> 5 ) & 7;  // high 3 bits: sixteenths to subtract
+    return base - ( base / 16 ) * sixteenths;
+    }
+
+  bool check() const  // magic, version and dictionary size are all valid
+    { return check_magic() && check_version() &&
+             isvalid_ds( dictionary_size() ); }
+  };
+
+
+struct Lzip_trailer
+  {
+  enum { size = 20 };
+  uint8_t data[size];  //  0-3  CRC32 of the uncompressed data
+                       //  4-11 size of the uncompressed data
+                       // 12-19 member size including header and trailer
+
+  unsigned data_crc() const  // bytes 0-3, least significant byte first
+    {
+    unsigned crc = 0;
+    for( int i = 0; i < 4; ++i ) crc |= (unsigned)data[i] << ( 8 * i );
+    return crc;
+    }
+
+  unsigned long long data_size() const  // bytes 4-11, least significant byte first
+    {
+    unsigned long long sz = 0;
+    for( int i = 0; i < 8; ++i ) sz |= (unsigned long long)data[4+i] << ( 8 * i );
+    return sz;
+    }
+
+  unsigned long long member_size() const  // bytes 12-19, least significant byte first
+    {
+    unsigned long long sz = 0;
+    for( int i = 0; i < 8; ++i ) sz |= (unsigned long long)data[12+i] << ( 8 * i );
+    return sz;
+    }
+
+  bool check_consistency() const  // check internal consistency
+    {
+    const unsigned long long dsize = data_size();
+    const unsigned long long msize = member_size();
+    const bool zero_crc = ( data_crc() == 0 );
+    const bool zero_size = ( dsize == 0 );
+    if( zero_crc != zero_size || msize < min_member_size ) return false;
+    // NOTE(review): the first comparison of each test presumably skips a limit that wrapped around
+    const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
+    if( mlimit > dsize && msize > mlimit ) return false;
+    const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
+    return !( dlimit > msize && dsize > dlimit );
+    }
+  };
+
+
+enum Program_mode { m_none, m_append, m_compress, m_concatenate, m_create, // Cl_options starts as m_none
+ m_delete, m_diff, m_extract, m_list };
+enum Solidity { no_solid, bsolid, dsolid, asolid, solid }; // Cl_options starts as bsolid
+class Arg_parser; // forward declaration; this header only uses references to it
+
+struct Cl_options  // command-line options
+  {
+  const Arg_parser & parser;
+  std::string archive_name;
+  std::string output_filename;  // "-" selects standard output, see to_stdout
+  long long mtime;
+  long long uid;  // -1 by default
+  long long gid;  // -1 by default
+  Program_mode program_mode;
+  Solidity solidity;
+  int data_size;
+  int debug_level;
+  int level;  // compression level, < 0 means uncompressed; 6 by default
+  int num_files;
+  int num_workers;  // start this many worker threads; -1 by default
+  int out_slots;  // 64 by default
+  bool dereference;
+  bool filenames_given;
+  bool ignore_ids;
+  bool ignore_metadata;
+  bool ignore_overflow;
+  bool keep_damaged;
+  bool level_set;  // compression level set in command line
+  bool missing_crc;
+  bool mtime_set;
+  bool permissive;
+  bool preserve_permissions;
+  bool warn_newer;
+
+  Cl_options( const Arg_parser & ap )  // every bool option starts as false
+    : parser( ap ), mtime( 0 ), uid( -1 ), gid( -1 ), program_mode( m_none ),
+      solidity( bsolid ), data_size( 0 ), debug_level( 0 ), level( 6 ),
+      num_files( 0 ), num_workers( -1 ), out_slots( 64 ), dereference( false ),
+      filenames_given( false ), ignore_ids( false ), ignore_metadata( false ),
+      ignore_overflow( false ), keep_damaged( false ), level_set( false ),
+      missing_crc( false ), mtime_set( false ), permissive( false ),
+      preserve_permissions( false ), warn_newer( false ) {}
+
+  void set_level( const int l ) { level_set = true; level = l; }
+
+  int compressed() const;  // tri-state bool with error (-2)
+  bool uncompressed() const { return !( level >= 0 && level <= 9 ); }
+  bool to_stdout() const { return output_filename.compare( "-" ) == 0; }
+  };
+
+inline void set_retval( int & retval, const int new_val )  // retval only ever increases
+  { retval = ( new_val > retval ) ? new_val : retval; }
+
+const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; // first of the diagnostic message texts
+const char * const bad_dict_msg = "Invalid dictionary size in member header.";
+const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
+const char * const bad_hdr_msg = "Corrupt or invalid tar header.";
+const char * const gblrec_msg = "Error in global extended records.";
+const char * const extrec_msg = "Error in extended records.";
+const char * const miscrc_msg = "Missing CRC in extended records.";
+const char * const misrec_msg = "Missing extended records.";
+const char * const longrec_msg = "Extended records are too long.";
+const char * const end_msg = "Archive ends unexpectedly.";
+const char * const mem_msg = "Not enough memory.";
+const char * const mem_msg2 = "Not enough memory. Try a lower compression level.";
+const char * const fv_msg1 = "Format violation: extended header followed by EOA blocks.";
+const char * const fv_msg2 = "Format violation: extended header followed by global header.";
+const char * const fv_msg3 = "Format violation: consecutive extended headers found.";
+const char * const posix_msg = "This does not look like a POSIX tar archive.";
+const char * const posix_lz_msg = "This does not look like a POSIX tar.lz archive.";
+const char * const eclosa_msg = "Error closing archive"; // NOTE(review): no final period from here on; presumably an errno text is appended — confirm
+const char * const eclosf_msg = "Error closing file";
+const char * const nfound_msg = "Not found in archive.";
+const char * const seek_msg = "Seek error";
+const char * const werr_msg = "Write error";
+const char * const chdir_msg = "Error changing working directory";
+const char * const intdir_msg = "Failed to create intermediate directory";
+// defined in common.cc
+unsigned long long parse_octal( const uint8_t * const ptr, const int size );
+int readblock( const int fd, uint8_t * const buf, const int size );
+int writeblock( const int fd, const uint8_t * const buf, const int size );
+
+// defined in common_decode.cc
+bool block_is_zero( const uint8_t * const buf, const int size );
+bool format_member_name( const Extended & extended, const Tar_header header,
+ Resizable_buffer & rbuf, const bool long_format );
+bool show_member_name( const Extended & extended, const Tar_header header,
+ const int vlevel, Resizable_buffer & rbuf );
+bool check_skip_filename( const Cl_options & cl_opts,
+ std::vector< char > & name_pending,
+ const char * const filename, const int chdir_fd = -1 );
+bool make_dirs( const std::string & name );
+
+// defined in common_mutex.cc
+void exit_fail_mt( const int retval = 1 ); // terminate the program
+bool print_removed_prefix( const std::string & prefix,
+ std::string * const msgp = 0 );
+void set_error_status( const int retval );
+int final_exit_status( int retval, const bool show_msg = true );
+
+// defined in compress.cc
+void show_atpos_error( const char * const filename, const long long pos,
+ const bool isarchive );
+int compress( const Cl_options & cl_opts );
+
+// defined in create.cc
+bool copy_file( const int infd, const int outfd, const long long max_size = -1 );
+bool writeblock_wrapper( const int outfd, const uint8_t * const buffer,
+ const int size );
+bool write_eoa_records( const int outfd, const bool compressed );
+const char * remove_leading_dotslash( const char * const filename,
+ std::string * const removed_prefixp, const bool dotdot = false );
+bool fill_headers( const char * const filename, Extended & extended,
+ Tar_header header, long long & file_size, const int flag );
+bool block_is_full( const int extended_size,
+ const unsigned long long file_size,
+ const unsigned long long target_size,
+ unsigned long long & partial_data_size );
+unsigned ustar_chksum( const Tar_header header );
+bool check_ustar_chksum( const Tar_header header );
+bool has_lz_ext( const std::string & name );
+int concatenate( const Cl_options & cl_opts );
+int encode( const Cl_options & cl_opts );
+
+// defined in create_lz.cc
+int encode_lz( const Cl_options & cl_opts, const char * const archive_namep,
+ const int outfd );
+
+// defined in decode.cc
+bool compare_file_type( std::string & estr, std::string & ostr,
+ const Cl_options & cl_opts,
+ const Extended & extended, const Tar_header header );
+class Archive_reader_base;
+bool compare_file_contents( std::string & estr, std::string & ostr,
+ Archive_reader_base & ar, const long long file_size,
+ const char * const filename, const int infd2 );
+int decode( const Cl_options & cl_opts );
+
+// defined in decode_lz.cc
+struct Archive_descriptor;
+int decode_lz( const Cl_options & cl_opts, const Archive_descriptor & ad,
+ std::vector< char > & name_pending );
+
+// defined in delete.cc
+bool safe_seek( const int fd, const long long pos );
+int tail_copy( const Arg_parser & parser, const Archive_descriptor & ad,
+ std::vector< char > & name_pending, const long long istream_pos,
+ const int outfd, int retval );
+int delete_members( const Cl_options & cl_opts );
+
+// defined in delete_lz.cc
+int delete_members_lz( const Cl_options & cl_opts,
+ const Archive_descriptor & ad,
+ std::vector< char > & name_pending, const int outfd );
+
+// defined in exclude.cc
+namespace Exclude {
+void add_pattern( const std::string & arg );
+void clear();
+bool excluded( const char * const filename );
+} // end namespace Exclude
+
+// defined in extended.cc
+extern const CRC32 crc32c;
+
+// defined in lzip_index.cc
+int seek_read( const int fd, uint8_t * const buf, const int size,
+ const long long pos );
+
+// defined in main.cc
+extern int verbosity;
+extern const char * const program_name;
+struct stat;
+int hstat( const char * const filename, struct stat * const st,
+ const bool dereference );
+bool nonempty_arg( const Arg_parser & parser, const int i );
+int open_instream( const std::string & name );
+int open_outstream( const std::string & name, const bool create = true,
+ Resizable_buffer * const rbufp = 0, const bool force = true );
+void show_error( const char * const msg, const int errcode = 0,
+ const bool help = false );
+bool format_error( Resizable_buffer & rbuf, const int errcode,
+ const char * const format, ... );
+void print_error( const int errcode, const char * const format, ... );
+void format_file_error( std::string & estr, const char * const filename,
+ const char * const msg, const int errcode = 0 );
+bool format_file_error( Resizable_buffer & rbuf, const char * const filename,
+ const char * const msg, const int errcode = 0 );
+void show_file_error( const char * const filename, const char * const msg,
+ const int errcode = 0 );
+void internal_error( const char * const msg );