diff options
Diffstat (limited to 'tarlz.h')
-rw-r--r-- | tarlz.h | 608 |
1 files changed, 608 insertions, 0 deletions
/* Tarlz - Archiver with multimember lzip compression
   Copyright (C) 2013-2024 Antonio Diaz Diaz.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

// Include guard: this header defines enums, classes and constants, so a
// second inclusion in the same translation unit would be a redefinition.
#ifndef TARLZ_H
#define TARLZ_H

#include <climits>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <stdint.h>

// Kept as a macro because the value is of type long long.
#define max_file_size ( LLONG_MAX - header_size )
enum { header_size = 512,
       max_edata_size = ( INT_MAX / header_size - 2 ) * header_size };
typedef uint8_t Tar_header[header_size];

// Byte offset of each field inside a Tar_header.
enum Offsets {
  name_o = 0, mode_o = 100, uid_o = 108, gid_o = 116, size_o = 124,
  mtime_o = 136, chksum_o = 148, typeflag_o = 156, linkname_o = 157,
  magic_o = 257, version_o = 263, uname_o = 265, gname_o = 297,
  devmajor_o = 329, devminor_o = 337, prefix_o = 345 };

// Length in bytes of each field inside a Tar_header.
enum Lengths {
  name_l = 100, mode_l = 8, uid_l = 8, gid_l = 8, size_l = 12,
  mtime_l = 12, chksum_l = 8, typeflag_l = 1, linkname_l = 100,
  magic_l = 6, version_l = 2, uname_l = 32, gname_l = 32,
  devmajor_l = 8, devminor_l = 8, prefix_l = 155 };

// Values of the typeflag field.
enum Typeflag {
  tf_regular = '0', tf_link = '1', tf_symlink = '2', tf_chardev = '3',
  tf_blockdev = '4', tf_directory = '5', tf_fifo = '6', tf_hiperf = '7',
  tf_global = 'g', tf_extended = 'x' };

const uint8_t ustar_magic[magic_l] =
  { 0x75, 0x73, 0x74, 0x61, 0x72, 0 };			// "ustar\0"

// Return true if the magic field of "header" is exactly "ustar\0".
inline bool check_ustar_magic( const Tar_header header )
  { return std::memcmp( header + magic_o, ustar_magic, magic_l ) == 0; }

// Zero the whole header, then set magic and version.
inline void init_tar_header( Tar_header header )
  {
  std::memset( header, 0, header_size );
  // the terminating 0 of the magic is already there after the memset
  std::memcpy( header + magic_o, ustar_magic, magic_l - 1 );
  header[version_o] = header[version_o+1] = '0';
  }

// Write "num" into buf[0..size-1] as zero-padded octal digits, least
// significant digit last. Digits that do not fit in "size" are dropped.
// No terminating 0 is written.
inline void print_octal( uint8_t * const buf, int size, unsigned long long num )
  { while( --size >= 0 ) { buf[size] = num % 8 + '0'; num /= 8; } }


// Round "size" to the next multiple of header size (512).
//
inline unsigned long long round_up( const unsigned long long size )
  {
  const int rem = size % header_size;
  const int padding = rem ? header_size - rem : 0;
  return size + padding;
  }


// Return the number of decimal digits needed to print "value" (1 for 0).
inline int decimal_digits( unsigned long long value )
  {
  int digits = 1;
  while( value >= 10 ) { value /= 10; ++digits; }
  return digits;
  }


// Return true if a ".." path component starts at filename[i].
// Short-circuit evaluation keeps every read at or before the terminating 0.
inline bool dotdot_at_i( const char * const filename, const int i )
  {
  return ( filename[i] == '.' && filename[i+1] == '.' &&
           ( i == 0 || filename[i-1] == '/' ) &&
           ( filename[i+2] == 0 || filename[i+2] == '/' ) );
  }


// Return true if any component of "filename" is "..".
inline bool contains_dotdot( const char * const filename )
  {
  for( int i = 0; filename[i]; ++i )
    if( dotdot_at_i( filename, i ) ) return true;
  return false;
  }


// Growable malloc-based byte buffer. It never shrinks.
class Resizable_buffer
  {
  char * p;
  unsigned long size_;			// size_ < LONG_MAX

  // Copying is disabled (declared, never defined): an implicit copy would
  // leave two objects freeing the same block in their destructors.
  Resizable_buffer( const Resizable_buffer & );
  Resizable_buffer & operator=( const Resizable_buffer & );

public:
  // must be >= 87 for format_member_name
  enum { default_initial_size = 2 * header_size };

  // If the allocation fails the buffer is left empty (size() == 0).
  explicit Resizable_buffer( const unsigned long initial_size =
                             default_initial_size )
    : p( (char *)std::malloc( initial_size ) ), size_( p ? initial_size : 0 ) {}
  ~Resizable_buffer() { if( p ) std::free( p ); p = 0; size_ = 0; }

  // Make sure the buffer holds at least "new_size" bytes.
  // Return false if new_size >= LONG_MAX or if out of memory; in both
  // cases the buffer and its contents are left unchanged.
  bool resize( const unsigned long long new_size )
    {
    if( new_size >= LONG_MAX ) return false;
    if( size_ < new_size )
      {
      char * const tmp = (char *)std::realloc( p, new_size );
      if( !tmp ) return false;
      p = tmp; size_ = new_size;
      }
    return true;
    }
  char * operator()() { return p; }
  const char * operator()() const { return p; }
  uint8_t * u8() { return (uint8_t *)p; }
  const uint8_t * u8() const { return (const uint8_t *)p; }
  unsigned long size() const { return size_; }
  };


// Return true if "uid" is in the range [0, 2^21).
inline bool uid_in_ustar_range( const long long uid )	// also for gid
  { return uid >= 0 && uid < ( 1 << 21 ); }

// Return true if "seconds" is in the range [0, 2^33).
inline bool time_in_ustar_range( const long long seconds )
  { return seconds >= 0 && seconds < 1LL << 33; }


/* The sign of the seconds field applies to the whole time value.
   A nanoseconds value out of range means an invalid time. */
class Etime			// time since (or before) the epoch
  {
  long long sec_;
  int nsec_;			// range [0, 999_999_999]

public:
  Etime() : sec_( 0 ), nsec_( -1 ) {}		// starts as invalid
  void reset() { sec_ = 0; nsec_ = -1; }	// back to invalid
  void set( const long long s ) { sec_ = s; nsec_ = 0; }
  long long sec() const { return sec_; }
  int nsec() const { return nsec_; }
  bool isvalid() const { return nsec_ >= 0 && nsec_ <= 999999999; }
  // an invalid time is never reported as out of range
  bool out_of_ustar_range() const
    { return isvalid() && !time_in_ustar_range( sec_ ); }

  unsigned decimal_size() const;
  unsigned print( char * const buf ) const;
  bool parse( const char * const ptr, const char ** const tailp,
              const int size );
  };


class Extended			// stores metadata from/for extended records
  {
  static std::vector< std::string > unknown_keywords;	// already diagnosed
  std::string linkpath_;		// these are the real metadata
  std::string path_;
  long long file_size_;			// >= 0 && <= max_file_size
  long long uid_, gid_;			// may not fit in unsigned int
  Etime atime_, mtime_;

  // cached sizes; if full_size_ <= -4 they must be recalculated
  mutable int edsize_;			// extended data size
  mutable int padded_edsize_;		// edsize rounded up
  mutable int full_size_;		// header + padded edsize
  mutable int linkpath_recsize_;
  mutable int path_recsize_;
  mutable int file_size_recsize_;
  mutable int uid_recsize_;
  mutable int gid_recsize_;
  mutable int atime_recsize_;
  mutable int mtime_recsize_;

  // true if CRC present in parsed or formatted records
  mutable bool crc_present_;

  void calculate_sizes() const;
  void unknown_keyword( const char * const buf, const int size ) const;

public:
  static const std::string crc_record;
  std::string removed_prefix;

  Extended()
    : file_size_( 0 ), uid_( -1 ), gid_( -1 ), edsize_( 0 ),
      padded_edsize_( 0 ), full_size_( 0 ), linkpath_recsize_( 0 ),
      path_recsize_( 0 ), file_size_recsize_( 0 ), uid_recsize_( 0 ),
      gid_recsize_( 0 ), atime_recsize_( 0 ), mtime_recsize_( 0 ),
      crc_present_( false ) {}

  // Return every member to the state set by the constructor.
  void reset()
    { linkpath_.clear(); path_.clear(); file_size_ = 0; uid_ = -1; gid_ = -1;
      atime_.reset(); mtime_.reset(); edsize_ = 0; padded_edsize_ = 0;
      full_size_ = 0; linkpath_recsize_ = 0; path_recsize_ = 0;
      file_size_recsize_ = 0; uid_recsize_ = 0; gid_recsize_ = 0;
      atime_recsize_ = 0; mtime_recsize_ = 0; crc_present_ = false;
      removed_prefix.clear(); }

  const std::string & linkpath() const { return linkpath_; }
  const std::string & path() const { return path_; }
  long long file_size() const { return file_size_; }
  long long get_file_size_and_reset( const Tar_header header );
  long long get_uid() const { return uid_; }
  long long get_gid() const { return gid_; }
  const Etime & atime() const { return atime_; }
  const Etime & mtime() const { return mtime_; }

  // Every setter stores -4 in full_size_ to invalidate the cached sizes.
  void linkpath( const char * const lp ) { linkpath_ = lp; full_size_ = -4; }
  void path( const char * const p ) { path_ = p; full_size_ = -4; }
  // a size outside [0, max_file_size] is stored as 0
  void file_size( const long long fs ) { full_size_ = -4;
    file_size_ = ( fs >= 0 && fs <= max_file_size ) ? fs : 0; }
  // negative ids are rejected (return false) and leave the object unchanged
  bool set_uid( const long long id )
    { if( id >= 0 ) { uid_ = id; full_size_ = -4; } return id >= 0; }
  bool set_gid( const long long id )
    { if( id >= 0 ) { gid_ = id; full_size_ = -4; } return id >= 0; }
  void set_atime( const long long s ) { atime_.set( s ); full_size_ = -4; }
  void set_mtime( const long long s ) { mtime_.set( s ); full_size_ = -4; }

  /* Return the size of the extended block, or 0 if empty.
     Return -1 if error, -2 if out of memory, -3 if block too long. */
  int full_size() const
    { if( full_size_ <= -4 ) calculate_sizes(); return full_size_; }
  int format_block( Resizable_buffer & rbuf ) const;
  const char * full_size_error() const;

  bool crc_present() const { return crc_present_; }
  bool parse( const char * const buf, const int edsize,
              const bool permissive );
  void fill_from_ustar( const Tar_header header );
  };


class CRC32
  {
  uint32_t data[256];		// Table of CRCs of all 8-bit messages.

public:
  // Build the table for the Castagnoli polynomial if "castagnoli" is true,
  // else for the IEEE 802.3 polynomial.
  CRC32( const bool castagnoli = false )
    {
    const unsigned cpol = 0x82F63B78U;	// CRC32-C  Castagnoli polynomial.
    const unsigned ipol = 0xEDB88320U;	// IEEE 802.3 Ethernet polynomial.
    const unsigned poly = castagnoli ? cpol : ipol;

    for( unsigned n = 0; n < 256; ++n )
      {
      unsigned c = n;
      for( int k = 0; k < 8; ++k )
        { if( c & 1 ) c = poly ^ ( c >> 1 ); else c >>= 1; }
      data[n] = c;
      }
    }

  // Fold one byte into the running value "crc".
  void update_byte( uint32_t & crc, const uint8_t byte ) const
    { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }

  // about as fast as it is possible without messing with endianness
  // (a size <= 0 leaves "crc" unchanged)
  void update_buf( uint32_t & crc, const uint8_t * const buffer,
                   const int size ) const
    {
    uint32_t c = crc;
    for( int i = 0; i < size; ++i )
      c = data[(c^buffer[i])&0xFF] ^ ( c >> 8 );
    crc = c;
    }

  // Return the finished CRC (initial value and final xor 0xFFFFFFFF) of
  // the first "size" bytes of "buffer".
  uint32_t compute_crc( const uint8_t * const buffer, const int size ) const
    {
    uint32_t crc = 0xFFFFFFFFU;
    for( int i = 0; i < size; ++i )
      crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
    return crc ^ 0xFFFFFFFFU;
    }

  // Calculates the crc of size bytes except a window of 8 bytes at pos
  uint32_t windowed_crc( const uint8_t * const buffer, const int pos,
                         const int size ) const
    {
    uint32_t crc = 0xFFFFFFFFU;
    update_buf( crc, buffer, pos );
    update_buf( crc, buffer + pos + 8, size - pos - 8 );
    return crc ^ 0xFFFFFFFFU;
    }
  };


struct Lzma_options
  {
  int dictionary_size;		// 4 KiB .. 512 MiB
  int match_len_limit;		// 5 .. 273
  };

// Encoder options for each compression level, indexed by level.
const Lzma_options option_mapping[] =
  {
  {   65535,  16 },		// -0
  { 1 << 20,   5 },		// -1
  { 3 << 19,   6 },		// -2
  { 1 << 21,   8 },		// -3
  { 3 << 20,  12 },		// -4
  { 1 << 22,  20 },		// -5
  { 1 << 23,  36 },		// -6
  { 1 << 24,  68 },		// -7
  { 3 << 23, 132 },		// -8
  { 1 << 25, 273 } };		// -9


enum {
  min_dictionary_bits = 12,
  min_dictionary_size = 1 << min_dictionary_bits,
  max_dictionary_bits = 29,
  max_dictionary_size = 1 << max_dictionary_bits,
  min_member_size = 36,
  min_data_size = 2 * min_dictionary_size,
  max_data_size = 2 * max_dictionary_size };


// Return true if "dictionary_size" is in the range accepted in a header.
inline bool isvalid_ds( const unsigned dictionary_size )
  { return ( dictionary_size >= min_dictionary_size &&
             dictionary_size <= max_dictionary_size ); }


const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };	// "LZIP"

struct Lzip_header
  {
  enum { size = 6 };
  uint8_t data[size];			// 0-3 magic bytes
					//   4 version
					//   5 coded dictionary size

  bool check_magic() const
    { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }

  // Return true if sz > 0 and the first min( sz, 4 ) bytes match the magic.
  bool check_prefix( const int sz ) const	// detect (truncated) header
    {
    for( int i = 0; i < sz && i < 4; ++i )
      if( data[i] != lzip_magic[i] ) return false;
    return ( sz > 0 );
    }

  // Return true if 2 or 3 of the 4 magic bytes match.
  bool check_corrupt() const			// detect corrupt header
    {
    int matches = 0;
    for( int i = 0; i < 4; ++i )
      if( data[i] == lzip_magic[i] ) ++matches;
    return ( matches > 1 && matches < 4 );
    }

  uint8_t version() const { return data[4]; }
  bool check_version() const { return ( data[4] == 1 ); }

  // Decode byte 5: bits 4-0 give the base size as a power of 2; bits 7-5
  // give the number of sixteenths of the base size to subtract from it.
  // The result is not range-checked here; see isvalid_ds.
  unsigned dictionary_size() const
    {
    // Unsigned shift: the 5-bit exponent comes from the file and may be 31,
    // which would shift a signed 1 into the sign bit.
    unsigned sz = ( 1U << ( data[5] & 0x1F ) );
    if( sz > min_dictionary_size )
      sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
    return sz;
    }

  bool check() const
    { return check_magic() && check_version() &&
             isvalid_ds( dictionary_size() ); }
  };


struct Lzip_trailer
  {
  enum { size = 20 };
  uint8_t data[size];	//  0-3  CRC32 of the uncompressed data
			//  4-11 size of the uncompressed data
			// 12-19 member size including header and trailer

  // The three fields are stored in little endian order.
  unsigned data_crc() const
    {
    unsigned tmp = 0;
    for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; }
    return tmp;
    }

  unsigned long long data_size() const
    {
    unsigned long long tmp = 0;
    for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; }
    return tmp;
    }

  unsigned long long member_size() const
    {
    unsigned long long tmp = 0;
    for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; }
    return tmp;
    }

  bool check_consistency() const	// check internal consistency
    {
    const unsigned crc = data_crc();
    const unsigned long long dsize = data_size();
    // the CRC must be 0 if and only if the data size is 0
    if( ( crc == 0 ) != ( dsize == 0 ) ) return false;
    const unsigned long long msize = member_size();
    if( msize < min_member_size ) return false;
    // largest member size accepted for dsize bytes of data; the
    // "mlimit > dsize" test presumably skips the check if mlimit wrapped
    const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
    if( mlimit > dsize && msize > mlimit ) return false;
    // largest data size accepted for a member of msize bytes; the
    // "dlimit > msize" test presumably skips the check if dlimit wrapped
    const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
    if( dlimit > msize && dsize > dlimit ) return false;
    return true;
    }
  };


enum Program_mode { m_none, m_append, m_compress, m_concatenate, m_create,
                    m_delete, m_diff, m_extract, m_list };
enum Solidity { no_solid, bsolid, dsolid, asolid, solid };
class Arg_parser;

struct Cl_options		// command-line options
  {
  const Arg_parser & parser;
  std::string archive_name;
  std::string output_filename;
  long long mtime;
  long long uid;
  long long gid;
  Program_mode program_mode;
  Solidity solidity;
  int data_size;
  int debug_level;
  int level;		// compression level, < 0 means uncompressed
  int num_files;
  int num_workers;	// start this many worker threads
  int out_slots;
  bool dereference;
  bool filenames_given;
  bool ignore_ids;
  bool ignore_metadata;
  bool ignore_overflow;
  bool keep_damaged;
  bool level_set;	// compression level set in command line
  bool missing_crc;
  bool mtime_set;
  bool permissive;
  bool preserve_permissions;
  bool warn_newer;

  Cl_options( const Arg_parser & ap )
    : parser( ap ), mtime( 0 ), uid( -1 ), gid( -1 ), program_mode( m_none ),
      solidity( bsolid ), data_size( 0 ), debug_level( 0 ), level( 6 ),
      num_files( 0 ), num_workers( -1 ), out_slots( 64 ), dereference( false ),
      filenames_given( false ), ignore_ids( false ), ignore_metadata( false ),
      ignore_overflow( false ), keep_damaged( false ), level_set( false ),
      missing_crc( false ), mtime_set( false ), permissive( false ),
      preserve_permissions( false ), warn_newer( false ) {}

  void set_level( const int l ) { level = l; level_set = true; }

  int compressed() const;		// tri-state bool with error (-2)
  bool uncompressed() const { return level < 0 || level > 9; }
  bool to_stdout() const { return output_filename == "-"; }
  };

// Raise "retval" to "new_val"; never lowers it.
inline void set_retval( int & retval, const int new_val )
  { if( retval < new_val ) retval = new_val; }

const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
const char * const bad_dict_msg = "Invalid dictionary size in member header.";
const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
const char * const bad_hdr_msg = "Corrupt or invalid tar header.";
const char * const gblrec_msg = "Error in global extended records.";
const char * const extrec_msg = "Error in extended records.";
const char * const miscrc_msg = "Missing CRC in extended records.";
const char * const misrec_msg = "Missing extended records.";
const char * const longrec_msg = "Extended records are too long.";
const char * const end_msg = "Archive ends unexpectedly.";
const char * const mem_msg = "Not enough memory.";
const char * const mem_msg2 = "Not enough memory. Try a lower compression level.";
const char * const fv_msg1 = "Format violation: extended header followed by EOA blocks.";
const char * const fv_msg2 = "Format violation: extended header followed by global header.";
const char * const fv_msg3 = "Format violation: consecutive extended headers found.";
const char * const posix_msg = "This does not look like a POSIX tar archive.";
const char * const posix_lz_msg = "This does not look like a POSIX tar.lz archive.";
const char * const eclosa_msg = "Error closing archive";
const char * const eclosf_msg = "Error closing file";
const char * const nfound_msg = "Not found in archive.";
const char * const seek_msg = "Seek error";
const char * const werr_msg = "Write error";
const char * const chdir_msg = "Error changing working directory";
const char * const intdir_msg = "Failed to create intermediate directory";

// defined in common.cc
unsigned long long parse_octal( const uint8_t * const ptr, const int size );
int readblock( const int fd, uint8_t * const buf, const int size );
int writeblock( const int fd, const uint8_t * const buf, const int size );

// defined in common_decode.cc
bool block_is_zero( const uint8_t * const buf, const int size );
bool format_member_name( const Extended & extended, const Tar_header header,
                         Resizable_buffer & rbuf, const bool long_format );
bool show_member_name( const Extended & extended, const Tar_header header,
                       const int vlevel, Resizable_buffer & rbuf );
bool check_skip_filename( const Cl_options & cl_opts,
                          std::vector< char > & name_pending,
                          const char * const filename, const int chdir_fd = -1 );
bool make_dirs( const std::string & name );

// defined in common_mutex.cc
void exit_fail_mt( const int retval = 1 );	// terminate the program
bool print_removed_prefix( const std::string & prefix,
                           std::string * const msgp = 0 );
void set_error_status( const int retval );
int final_exit_status( int retval, const bool show_msg = true );

// defined in compress.cc
void show_atpos_error( const char * const filename, const long long pos,
                       const bool isarchive );
int compress( const Cl_options & cl_opts );

// defined in create.cc
bool copy_file( const int infd, const int outfd, const long long max_size = -1 );
bool writeblock_wrapper( const int outfd, const uint8_t * const buffer,
                         const int size );
bool write_eoa_records( const int outfd, const bool compressed );
const char * remove_leading_dotslash( const char * const filename,
                std::string * const removed_prefixp, const bool dotdot = false );
bool fill_headers( const char * const filename, Extended & extended,
                   Tar_header header, long long & file_size, const int flag );
bool block_is_full( const int extended_size,
                    const unsigned long long file_size,
                    const unsigned long long target_size,
                    unsigned long long & partial_data_size );
unsigned ustar_chksum( const Tar_header header );
bool check_ustar_chksum( const Tar_header header );
bool has_lz_ext( const std::string & name );
int concatenate( const Cl_options & cl_opts );
int encode( const Cl_options & cl_opts );

// defined in create_lz.cc
int encode_lz( const Cl_options & cl_opts, const char * const archive_namep,
               const int outfd );

// defined in decode.cc
bool compare_file_type( std::string & estr, std::string & ostr,
                        const Cl_options & cl_opts,
                        const Extended & extended, const Tar_header header );
class Archive_reader_base;
bool compare_file_contents( std::string & estr, std::string & ostr,
                            Archive_reader_base & ar, const long long file_size,
                            const char * const filename, const int infd2 );
int decode( const Cl_options & cl_opts );

// defined in decode_lz.cc
struct Archive_descriptor;
int decode_lz( const Cl_options & cl_opts, const Archive_descriptor & ad,
               std::vector< char > & name_pending );

// defined in delete.cc
bool safe_seek( const int fd, const long long pos );
int tail_copy( const Arg_parser & parser, const Archive_descriptor & ad,
               std::vector< char > & name_pending, const long long istream_pos,
               const int outfd, int retval );
int delete_members( const Cl_options & cl_opts );

// defined in delete_lz.cc
int delete_members_lz( const Cl_options & cl_opts,
                       const Archive_descriptor & ad,
                       std::vector< char > & name_pending, const int outfd );

// defined in exclude.cc
namespace Exclude {
void add_pattern( const std::string & arg );
void clear();
bool excluded( const char * const filename );
} // end namespace Exclude

// defined in extended.cc
extern const CRC32 crc32c;

// defined in lzip_index.cc
int seek_read( const int fd, uint8_t * const buf, const int size,
               const long long pos );

// defined in main.cc
extern int verbosity;
extern const char * const program_name;
struct stat;
int hstat( const char * const filename, struct stat * const st,
           const bool dereference );
bool nonempty_arg( const Arg_parser & parser, const int i );
int open_instream( const std::string & name );
int open_outstream( const std::string & name, const bool create = true,
                    Resizable_buffer * const rbufp = 0, const bool force = true );
void show_error( const char * const msg, const int errcode = 0,
                 const bool help = false );
bool format_error( Resizable_buffer & rbuf, const int errcode,
                   const char * const format, ... );
void print_error( const int errcode, const char * const format, ... );
void format_file_error( std::string & estr, const char * const filename,
                        const char * const msg, const int errcode = 0 );
bool format_file_error( Resizable_buffer & rbuf, const char * const filename,
                        const char * const msg, const int errcode = 0 );
void show_file_error( const char * const filename, const char * const msg,
                      const int errcode = 0 );
void internal_error( const char * const msg );

#endif // TARLZ_H