summaryrefslogtreecommitdiffstats
path: root/create_lz.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--create_lz.cc594
1 files changed, 594 insertions, 0 deletions
diff --git a/create_lz.cc b/create_lz.cc
new file mode 100644
index 0000000..5436bf5
--- /dev/null
+++ b/create_lz.cc
@@ -0,0 +1,594 @@
+/* Tarlz - Archiver with multimember lzip compression
+ Copyright (C) 2013-2024 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <queue>
+#include <pthread.h>
+#include <stdint.h> // for lzlib.h
+#include <unistd.h>
+#include <sys/stat.h>
+#include <ftw.h>
+#include <lzlib.h>
+
+#include "tarlz.h"
+#include "arg_parser.h"
+#include "common_mutex.h"
+#include "create.h"
+
+
+namespace {
+
+const Cl_options * gcl_opts = 0; // local vars needed by add_member_lz
+enum { max_packet_size = 1 << 20 };
+class Packet_courier;
+Packet_courier * courierp = 0;
+unsigned long long partial_data_size = 0; // size of current block
+
+
+class Slot_tally
+ {
+ const int num_slots; // total slots
+ int num_free; // remaining free slots
+ pthread_mutex_t mutex;
+ pthread_cond_t slot_av; // slot available
+
+ Slot_tally( const Slot_tally & ); // declared as private
+ void operator=( const Slot_tally & ); // declared as private
+
+public:
+ explicit Slot_tally( const int slots )
+ : num_slots( slots ), num_free( slots )
+ { xinit_mutex( &mutex ); xinit_cond( &slot_av ); }
+
+ ~Slot_tally() { xdestroy_cond( &slot_av ); xdestroy_mutex( &mutex ); }
+
+ bool all_free() { return ( num_free == num_slots ); }
+
+ void get_slot() // wait for a free slot
+ {
+ xlock( &mutex );
+ while( num_free <= 0 ) xwait( &slot_av, &mutex );
+ --num_free;
+ xunlock( &mutex );
+ }
+
+ void leave_slot() // return a slot to the tally
+ {
+ xlock( &mutex );
+ if( ++num_free == 1 ) xsignal( &slot_av ); // num_free was 0
+ xunlock( &mutex );
+ }
+ };
+
+
+struct Ipacket // filename, file size and headers
+ {
+ const long long file_size;
+ const std::string filename; // filename.empty() means end of lzip member
+ const Extended * const extended;
+ const uint8_t * const header;
+
+ Ipacket() : file_size( 0 ), extended( 0 ), header( 0 ) {}
+ Ipacket( const char * const name, const long long fs,
+ const Extended * const ext, const uint8_t * const head )
+ : file_size( fs ), filename( name ), extended( ext ), header( head ) {}
+ };
+
+struct Opacket // compressed data to be written to the archive
+ {
+ const uint8_t * const data; // data == 0 means end of lzip member
+ const int size; // number of bytes in data (if any)
+
+ Opacket() : data( 0 ), size( 0 ) {}
+ Opacket( uint8_t * const d, const int s ) : data( d ), size( s ) {}
+ };
+
+
+class Packet_courier // moves packets around
+ {
+public:
+ unsigned icheck_counter;
+ unsigned iwait_counter;
+ unsigned ocheck_counter;
+ unsigned owait_counter;
+private:
+ int receive_worker_id; // worker queue currently receiving packets
+ int deliver_worker_id; // worker queue currently delivering packets
+ Slot_tally slot_tally; // limits the number of input packets
+ std::vector< std::queue< const Ipacket * > > ipacket_queues;
+ std::vector< std::queue< const Opacket * > > opacket_queues;
+ int num_working; // number of workers still running
+ const int num_workers; // number of workers
+ const unsigned out_slots; // max output packets per queue
+ pthread_mutex_t imutex;
+ pthread_cond_t iav_or_eof; // input packet available or grouper done
+ pthread_mutex_t omutex;
+ pthread_cond_t oav_or_exit; // output packet available or all workers exited
+ std::vector< pthread_cond_t > slot_av; // output slot available
+ bool eof; // grouper done
+
+ Packet_courier( const Packet_courier & ); // declared as private
+ void operator=( const Packet_courier & ); // declared as private
+
+public:
+ Packet_courier( const int workers, const int in_slots, const int oslots )
+ : icheck_counter( 0 ), iwait_counter( 0 ),
+ ocheck_counter( 0 ), owait_counter( 0 ),
+ receive_worker_id( 0 ), deliver_worker_id( 0 ),
+ slot_tally( in_slots ), ipacket_queues( workers ),
+ opacket_queues( workers ), num_working( workers ),
+ num_workers( workers ), out_slots( oslots ), slot_av( workers ),
+ eof( false )
+ {
+ xinit_mutex( &imutex ); xinit_cond( &iav_or_eof );
+ xinit_mutex( &omutex ); xinit_cond( &oav_or_exit );
+ for( unsigned i = 0; i < slot_av.size(); ++i ) xinit_cond( &slot_av[i] );
+ }
+
+ ~Packet_courier()
+ {
+ for( unsigned i = 0; i < slot_av.size(); ++i ) xdestroy_cond( &slot_av[i] );
+ xdestroy_cond( &oav_or_exit ); xdestroy_mutex( &omutex );
+ xdestroy_cond( &iav_or_eof ); xdestroy_mutex( &imutex );
+ }
+
+ /* Receive an ipacket from grouper.
+ If filename.empty() (end of lzip member token), move to next queue. */
+ void receive_packet( const Ipacket * const ipacket )
+ {
+ if( !ipacket->filename.empty() )
+ slot_tally.get_slot(); // wait for a free slot
+ xlock( &imutex );
+ ipacket_queues[receive_worker_id].push( ipacket );
+ if( ipacket->filename.empty() && ++receive_worker_id >= num_workers )
+ receive_worker_id = 0;
+ xbroadcast( &iav_or_eof );
+ xunlock( &imutex );
+ }
+
+ // distribute an ipacket to a worker
+ const Ipacket * distribute_packet( const int worker_id )
+ {
+ const Ipacket * ipacket = 0;
+ xlock( &imutex );
+ ++icheck_counter;
+ while( ipacket_queues[worker_id].empty() && !eof )
+ {
+ ++iwait_counter;
+ xwait( &iav_or_eof, &imutex );
+ }
+ if( !ipacket_queues[worker_id].empty() )
+ {
+ ipacket = ipacket_queues[worker_id].front();
+ ipacket_queues[worker_id].pop();
+ }
+ xunlock( &imutex );
+ if( ipacket )
+ { if( !ipacket->filename.empty() ) slot_tally.leave_slot(); }
+ else
+ {
+ // notify muxer when last worker exits
+ xlock( &omutex );
+ if( --num_working == 0 ) xsignal( &oav_or_exit );
+ xunlock( &omutex );
+ }
+ return ipacket;
+ }
+
+ // collect an opacket from a worker
+ void collect_packet( const Opacket * const opacket, const int worker_id )
+ {
+ xlock( &omutex );
+ if( opacket->data )
+ {
+ while( opacket_queues[worker_id].size() >= out_slots )
+ xwait( &slot_av[worker_id], &omutex );
+ }
+ opacket_queues[worker_id].push( opacket );
+ if( worker_id == deliver_worker_id ) xsignal( &oav_or_exit );
+ xunlock( &omutex );
+ }
+
+ /* Deliver an opacket to muxer.
+ If opacket data == 0, move to next queue and wait again. */
+ const Opacket * deliver_packet()
+ {
+ const Opacket * opacket = 0;
+ xlock( &omutex );
+ ++ocheck_counter;
+ while( true )
+ {
+ while( opacket_queues[deliver_worker_id].empty() && num_working > 0 )
+ {
+ ++owait_counter;
+ xwait( &oav_or_exit, &omutex );
+ }
+ if( opacket_queues[deliver_worker_id].empty() ) break;
+ opacket = opacket_queues[deliver_worker_id].front();
+ opacket_queues[deliver_worker_id].pop();
+ if( opacket_queues[deliver_worker_id].size() + 1 == out_slots )
+ xsignal( &slot_av[deliver_worker_id] );
+ if( opacket->data ) break;
+ if( ++deliver_worker_id >= num_workers ) deliver_worker_id = 0;
+ delete opacket; opacket = 0;
+ }
+ xunlock( &omutex );
+ return opacket;
+ }
+
+ void finish() // grouper has no more packets to send
+ {
+ xlock( &imutex );
+ eof = true;
+ xbroadcast( &iav_or_eof );
+ xunlock( &imutex );
+ }
+
+ bool finished() // all packets delivered to muxer
+ {
+ if( !slot_tally.all_free() || !eof || num_working != 0 ) return false;
+ for( int i = 0; i < num_workers; ++i )
+ if( !ipacket_queues[i].empty() ) return false;
+ for( int i = 0; i < num_workers; ++i )
+ if( !opacket_queues[i].empty() ) return false;
+ return true;
+ }
+ };
+
+
+// send one ipacket with tar member metadata to courier
+int add_member_lz( const char * const filename, const struct stat *,
+ const int flag, struct FTW * )
+ {
+ if( Exclude::excluded( filename ) ) return 0; // skip excluded files
+ long long file_size;
+ // metadata for extended records
+ Extended * const extended = new( std::nothrow ) Extended;
+ uint8_t * const header = extended ? new( std::nothrow ) Tar_header : 0;
+ if( !header )
+ { show_error( mem_msg ); if( extended ) delete extended; return 1; }
+ if( !fill_headers( filename, *extended, header, file_size, flag ) )
+ { delete[] header; delete extended; return 0; }
+ print_removed_prefix( extended->removed_prefix );
+
+ if( gcl_opts->solidity == bsolid )
+ {
+ const int ebsize = extended->full_size();
+ if( ebsize < 0 ) { show_error( extended->full_size_error() ); return 1; }
+ if( block_is_full( ebsize, file_size, gcl_opts->data_size,
+ partial_data_size ) )
+ courierp->receive_packet( new Ipacket ); // end of group
+ }
+ courierp->receive_packet( new Ipacket( filename, file_size, extended, header ) );
+
+ if( gcl_opts->solidity == no_solid ) // one tar member per group
+ courierp->receive_packet( new Ipacket );
+ if( verbosity >= 1 ) std::fprintf( stderr, "%s\n", filename );
+ return 0;
+ }
+
+
+struct Grouper_arg
+ {
+ const Cl_options * cl_opts;
+ Packet_courier * courier;
+ };
+
+
+/* Package metadata of the files to be archived and pass them to the
+ courier for distribution to workers.
+*/
+extern "C" void * grouper( void * arg )
+ {
+ const Grouper_arg & tmp = *(const Grouper_arg *)arg;
+ const Cl_options & cl_opts = *tmp.cl_opts;
+ Packet_courier & courier = *tmp.courier;
+
+ for( int i = 0; i < cl_opts.parser.arguments(); ++i ) // parse command line
+ {
+ const int code = cl_opts.parser.code( i );
+ const std::string & arg = cl_opts.parser.argument( i );
+ const char * filename = arg.c_str();
+ if( code == 'C' && chdir( filename ) != 0 )
+ { show_file_error( filename, chdir_msg, errno ); exit_fail_mt(); }
+ if( code ) continue; // skip options
+ if( cl_opts.parser.argument( i ).empty() ) continue; // skip empty names
+ std::string deslashed; // arg without trailing slashes
+ unsigned len = arg.size();
+ while( len > 1 && arg[len-1] == '/' ) --len;
+ if( len < arg.size() )
+ { deslashed.assign( arg, 0, len ); filename = deslashed.c_str(); }
+ if( Exclude::excluded( filename ) ) continue; // skip excluded files
+ struct stat st;
+ if( lstat( filename, &st ) != 0 ) // filename from command line
+ { show_file_error( filename, cant_stat, errno ); set_error_status( 1 ); }
+ else if( nftw( filename, add_member_lz, 16,
+ cl_opts.dereference ? 0 : FTW_PHYS ) != 0 )
+ exit_fail_mt(); // write error or OOM
+ else if( cl_opts.solidity == dsolid ) // end of group
+ courier.receive_packet( new Ipacket );
+ }
+
+ if( cl_opts.solidity == bsolid && partial_data_size ) // finish last block
+ { partial_data_size = 0; courierp->receive_packet( new Ipacket ); }
+ courier.finish(); // no more packets to send
+ return 0;
+ }
+
+
+/* Write ibuf to encoder. To minimize dictionary size, do not read from
+ encoder until encoder's input buffer is full or finish is true.
+ Send opacket to courier and allocate new obuf each time obuf is full.
+*/
+void loop_encode( const uint8_t * const ibuf, const int isize,
+ uint8_t * & obuf, int & opos, Packet_courier & courier,
+ LZ_Encoder * const encoder, const int worker_id,
+ const bool finish = false )
+ {
+ int ipos = 0;
+ if( opos < 0 || opos > max_packet_size )
+ internal_error( "bad buffer index in loop_encode." );
+ while( true )
+ {
+ if( ipos < isize )
+ {
+ const int wr = LZ_compress_write( encoder, ibuf + ipos, isize - ipos );
+ if( wr < 0 ) internal_error( "library error (LZ_compress_write)." );
+ ipos += wr;
+ }
+ if( ipos >= isize ) // ibuf is empty
+ { if( finish ) LZ_compress_finish( encoder ); else break; }
+ const int rd =
+ LZ_compress_read( encoder, obuf + opos, max_packet_size - opos );
+ if( rd < 0 )
+ {
+ if( verbosity >= 0 )
+ std::fprintf( stderr, "LZ_compress_read error: %s\n",
+ LZ_strerror( LZ_compress_errno( encoder ) ) );
+ exit_fail_mt();
+ }
+ opos += rd;
+ // obuf is full or last opacket in lzip member
+ if( opos >= max_packet_size || LZ_compress_finished( encoder ) == 1 )
+ {
+ if( opos > max_packet_size )
+ internal_error( "opacket size exceeded in worker." );
+ courier.collect_packet( new Opacket( obuf, opos ), worker_id );
+ opos = 0; obuf = new( std::nothrow ) uint8_t[max_packet_size];
+ if( !obuf ) { show_error( mem_msg2 ); exit_fail_mt(); }
+ if( LZ_compress_finished( encoder ) == 1 )
+ {
+ if( LZ_compress_restart_member( encoder, LLONG_MAX ) >= 0 ) break;
+ show_error( "LZ_compress_restart_member failed." ); exit_fail_mt();
+ }
+ }
+ }
+ if( ipos > isize ) internal_error( "ipacket size exceeded in worker." );
+ if( ipos < isize ) internal_error( "input not fully consumed in worker." );
+ }
+
+
+struct Worker_arg
+ {
+ Packet_courier * courier;
+ int dictionary_size;
+ int match_len_limit;
+ int worker_id;
+ };
+
+
+/* Get ipackets from courier, compress headers and file data, and give the
+ opackets produced to courier.
+*/
+extern "C" void * cworker( void * arg )
+ {
+ const Worker_arg & tmp = *(const Worker_arg *)arg;
+ Packet_courier & courier = *tmp.courier;
+ const int dictionary_size = tmp.dictionary_size;
+ const int match_len_limit = tmp.match_len_limit;
+ const int worker_id = tmp.worker_id;
+
+ LZ_Encoder * encoder = 0;
+ uint8_t * data = 0;
+ Resizable_buffer rbuf; // extended header + data
+ if( !rbuf.size() ) { show_error( mem_msg2 ); exit_fail_mt(); }
+
+ int opos = 0;
+ bool flushed = true; // avoid producing empty lzip members
+ while( true )
+ {
+ const Ipacket * const ipacket = courier.distribute_packet( worker_id );
+ if( !ipacket ) break; // no more packets to process
+ if( ipacket->filename.empty() ) // end of group
+ {
+ if( !flushed ) // this lzip member is not empty
+ loop_encode( 0, 0, data, opos, courier, encoder, worker_id, true );
+ courier.collect_packet( new Opacket, worker_id ); // end of member token
+ flushed = true; delete ipacket; continue;
+ }
+
+ const char * const filename = ipacket->filename.c_str();
+ const int infd = ipacket->file_size ? open_instream( filename ) : -1;
+ if( ipacket->file_size && infd < 0 ) // can't read file data
+ { delete[] ipacket->header; delete ipacket->extended; delete ipacket;
+ set_error_status( 1 ); continue; } // skip file
+
+ flushed = false;
+ if( !encoder ) // init encoder just before using it
+ {
+ data = new( std::nothrow ) uint8_t[max_packet_size];
+ encoder = LZ_compress_open( dictionary_size, match_len_limit, LLONG_MAX );
+ if( !data || !encoder || LZ_compress_errno( encoder ) != LZ_ok )
+ {
+ if( !data || !encoder || LZ_compress_errno( encoder ) == LZ_mem_error )
+ show_error( mem_msg2 );
+ else
+ internal_error( "invalid argument to encoder." );
+ exit_fail_mt();
+ }
+ }
+
+ const int ebsize = ipacket->extended->format_block( rbuf ); // may be 0
+ if( ebsize < 0 )
+ { show_error( ipacket->extended->full_size_error() ); exit_fail_mt(); }
+ if( ebsize > 0 ) // compress extended block
+ loop_encode( rbuf.u8(), ebsize, data, opos, courier, encoder, worker_id );
+ // compress ustar header
+ loop_encode( ipacket->header, header_size, data, opos, courier,
+ encoder, worker_id );
+ delete[] ipacket->header; delete ipacket->extended;
+
+ if( ipacket->file_size )
+ {
+ const long long bufsize = 32 * header_size;
+ uint8_t buf[bufsize];
+ long long rest = ipacket->file_size;
+ while( rest > 0 )
+ {
+ int size = std::min( rest, bufsize );
+ const int rd = readblock( infd, buf, size );
+ rest -= rd;
+ if( rd != size )
+ {
+ show_atpos_error( filename, ipacket->file_size - rest, false );
+ close( infd ); exit_fail_mt();
+ }
+ if( rest == 0 ) // last read
+ {
+ const int rem = ipacket->file_size % header_size;
+ if( rem > 0 )
+ { const int padding = header_size - rem;
+ std::memset( buf + size, 0, padding ); size += padding; }
+ }
+ // compress size bytes of file
+ loop_encode( buf, size, data, opos, courier, encoder, worker_id );
+ }
+ if( close( infd ) != 0 )
+ { show_file_error( filename, eclosf_msg, errno ); exit_fail_mt(); }
+ }
+ if( gcl_opts->warn_newer && archive_attrs.is_newer( filename ) )
+ { show_file_error( filename, "File is newer than the archive." );
+ set_error_status( 1 ); }
+ delete ipacket;
+ }
+ if( data ) delete[] data;
+ if( encoder && LZ_compress_close( encoder ) < 0 )
+ { show_error( "LZ_compress_close failed." ); exit_fail_mt(); }
+ return 0;
+ }
+
+
+/* Get from courier the processed and sorted packets, and write
+ their contents to the output archive.
+*/
+void muxer( Packet_courier & courier, const int outfd )
+ {
+ while( true )
+ {
+ const Opacket * const opacket = courier.deliver_packet();
+ if( !opacket ) break; // queue is empty. all workers exited
+
+ if( !writeblock_wrapper( outfd, opacket->data, opacket->size ) )
+ exit_fail_mt();
+ delete[] opacket->data;
+ delete opacket;
+ }
+ }
+
+} // end namespace
+
+
+// init the courier, then start the grouper and the workers and call the muxer
+int encode_lz( const Cl_options & cl_opts, const char * const archive_namep,
+ const int outfd )
+ {
+ const int in_slots = 65536; // max small files (<=512B) in 64 MiB
+ const int num_workers = cl_opts.num_workers;
+ const int total_in_slots = ( INT_MAX / num_workers >= in_slots ) ?
+ num_workers * in_slots : INT_MAX;
+ const int dictionary_size = option_mapping[cl_opts.level].dictionary_size;
+ const int match_len_limit = option_mapping[cl_opts.level].match_len_limit;
+ gcl_opts = &cl_opts;
+
+ /* If an error happens after any threads have been started, exit must be
+ called before courier goes out of scope. */
+ Packet_courier courier( num_workers, total_in_slots, cl_opts.out_slots );
+ courierp = &courier; // needed by add_member_lz
+
+ Grouper_arg grouper_arg;
+ grouper_arg.cl_opts = &cl_opts;
+ grouper_arg.courier = &courier;
+
+ pthread_t grouper_thread;
+ int errcode = pthread_create( &grouper_thread, 0, grouper, &grouper_arg );
+ if( errcode )
+ { show_error( "Can't create grouper thread", errcode ); return 1; }
+
+ Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers];
+ pthread_t * worker_threads = new( std::nothrow ) pthread_t[num_workers];
+ if( !worker_args || !worker_threads )
+ { show_error( mem_msg ); exit_fail_mt(); }
+ for( int i = 0; i < num_workers; ++i )
+ {
+ worker_args[i].courier = &courier;
+ worker_args[i].dictionary_size = dictionary_size;
+ worker_args[i].match_len_limit = match_len_limit;
+ worker_args[i].worker_id = i;
+ errcode = pthread_create( &worker_threads[i], 0, cworker, &worker_args[i] );
+ if( errcode )
+ { show_error( "Can't create worker threads", errcode ); exit_fail_mt(); }
+ }
+
+ muxer( courier, outfd );
+
+ for( int i = num_workers - 1; i >= 0; --i )
+ {
+ errcode = pthread_join( worker_threads[i], 0 );
+ if( errcode )
+ { show_error( "Can't join worker threads", errcode ); exit_fail_mt(); }
+ }
+ delete[] worker_threads;
+ delete[] worker_args;
+
+ errcode = pthread_join( grouper_thread, 0 );
+ if( errcode )
+ { show_error( "Can't join grouper thread", errcode ); exit_fail_mt(); }
+
+ // write End-Of-Archive records
+ int retval = !write_eoa_records( outfd, true );
+
+ if( close( outfd ) != 0 && retval == 0 )
+ { show_file_error( archive_namep, eclosa_msg, errno ); retval = 1; }
+
+ if( cl_opts.debug_level & 1 )
+ std::fprintf( stderr,
+ "any worker tried to consume from grouper %8u times\n"
+ "any worker had to wait %8u times\n"
+ "muxer tried to consume from workers %8u times\n"
+ "muxer had to wait %8u times\n",
+ courier.icheck_counter,
+ courier.iwait_counter,
+ courier.ocheck_counter,
+ courier.owait_counter );
+
+ if( !courier.finished() ) internal_error( conofin_msg );
+ return final_exit_status( retval );
+ }