summaryrefslogtreecommitdiffstats
path: root/merge.cc
diff options
context:
space:
mode:
Diffstat (limited to 'merge.cc')
-rw-r--r--merge.cc378
1 files changed, 211 insertions, 167 deletions
diff --git a/merge.cc b/merge.cc
index c254a59..c216870 100644
--- a/merge.cc
+++ b/merge.cc
@@ -35,70 +35,107 @@
namespace {
-bool copy_and_diff_file( const std::vector< int > & infd_vector,
- const int outfd, std::vector< Block > & block_vector )
+// Add 'bv' to 'block_vector' splitting blocks as needed to keep all the
+// edges (pos and end of every block).
+// 'block_vector' contains the result. 'bv' is destroyed.
+void combine( std::vector< Block > & block_vector, std::vector< Block > & bv )
{
+ if( block_vector.empty() ) { block_vector.swap( bv ); return; }
+ unsigned i1 = 0, i2 = 0;
+ while( i1 < block_vector.size() && i2 < bv.size() )
+ {
+ Block & b1 = block_vector[i1];
+ Block & b2 = bv[i2];
+ if( b1.overlaps( b2 ) )
+ {
+ if( b1 < b2 )
+ {
+ Block b = b1.split( b2.pos() );
+ block_vector.insert( block_vector.begin() + i1, b ); ++i1;
+ }
+ else if( b2 < b1 )
+ {
+ Block b( b2.pos(), b1.pos() - b2.pos() );
+ b2.split( b1.pos() );
+ block_vector.insert( block_vector.begin() + i1, b ); ++i1;
+ }
+ else if( b1.end() < b2.end() ) { b2.split( b1.end() ); ++i1; }
+ else if( b2.end() < b1.end() )
+ {
+ Block b = b1.split( b2.end() );
+ block_vector.insert( block_vector.begin() + i1, b ); ++i1; ++i2;
+ }
+ else { ++i1; ++i2; } // blocks are identical
+ }
+ else if( b1 < b2 ) ++i1;
+ else { block_vector.insert( block_vector.begin() + i1, b2 ); ++i1; ++i2; }
+ }
+ if( i2 < bv.size() ) // tail copy
+ block_vector.insert( block_vector.end(), bv.begin() + i2, bv.end() );
+ }
+
+
+bool diff_member( const long long mpos, const long long msize,
+ const std::vector< int > & infd_vector,
+ std::vector< Block > & block_vector )
+ {
+ const int files = infd_vector.size();
const int buffer_size = 65536;
- std::vector< uint8_t * > buffer_vector( infd_vector.size() );
- for( unsigned i = 0; i < infd_vector.size(); ++i )
- buffer_vector[i] = new uint8_t[buffer_size];
- Block b( 0, 0 );
- long long partial_pos = 0;
- int equal_bytes = 0;
- bool error = false;
+ uint8_t * const buffer1 = new uint8_t[buffer_size];
+ uint8_t * const buffer2 = new uint8_t[buffer_size];
- while( true )
+ bool error = false;
+ for( int i1 = 0; i1 + 1 < files && !error; ++i1 )
{
- const int rd = readblock( infd_vector[0], buffer_vector[0], buffer_size );
- if( rd != buffer_size && errno )
- { show_error( "Error reading input file", errno ); error = true; break; }
- if( rd > 0 )
+ for( int i2 = i1 + 1; i2 < files && !error; ++i2 )
{
- for( unsigned i = 1; i < infd_vector.size(); ++i )
- if( readblock( infd_vector[i], buffer_vector[i], rd ) != rd )
- { show_error( "Error reading input file", errno );
- error = true; break; }
- if( error ) break;
- const int wr = writeblock( outfd, buffer_vector[0], rd );
- if( wr != rd )
- { show_error( "Error writing output file", errno );
- error = true; break; }
- for( int i = 0; i < rd; ++i )
+ std::vector< Block > bv;
+ long long partial_pos = 0;
+ const int fd1 = infd_vector[i1], fd2 = infd_vector[i2];
+ int begin = -1; // begin of block. -1 means no block
+ bool prev_equal = true;
+ if( !safe_seek( fd1, mpos ) || !safe_seek( fd2, mpos ) )
+ { error = true; break; }
+
+ while( msize > partial_pos )
{
- while( i < rd && b.pos() == 0 )
- {
- for( unsigned j = 1; j < infd_vector.size(); ++j )
- if( buffer_vector[0][i] != buffer_vector[j][i] )
- { b.pos( partial_pos + i ); break; } // begin block
- ++i;
- }
- while( i < rd && b.pos() > 0 )
+ const int size = std::min( (long long)buffer_size, msize - partial_pos );
+ const int rd = readblock( fd1, buffer1, size );
+ if( rd != size && errno )
+ { show_error( "Error reading input file", errno ); error = true; break; }
+ if( rd > 0 )
{
- ++equal_bytes;
- for( unsigned j = 1; j < infd_vector.size(); ++j )
- if( buffer_vector[0][i] != buffer_vector[j][i] )
- { equal_bytes = 0; break; }
- if( equal_bytes >= 2 ) // end block
+ if( readblock( fd2, buffer2, rd ) != rd )
+ { show_error( "Error reading input file", errno );
+ error = true; break; }
+ for( int i = 0; i < rd; ++i )
{
- b.size( partial_pos + i - ( equal_bytes - 1 ) - b.pos() );
- block_vector.push_back( b );
- b.pos( 0 );
- equal_bytes = 0;
+ if( buffer1[i] != buffer2[i] )
+ {
+ prev_equal = false;
+ if( begin < 0 ) begin = partial_pos + i; // begin block
+ }
+ else if( !prev_equal ) prev_equal = true;
+ else if( begin >= 0 ) // end block
+ {
+ Block b( mpos + begin, partial_pos + i - 1 - begin );
+ begin = -1;
+ bv.push_back( b );
+ }
}
- ++i;
+ partial_pos += rd;
}
+ if( rd < buffer_size ) break; // EOF
}
- partial_pos += rd;
+ if( begin >= 0 ) // finish last block
+ {
+ Block b( mpos + begin, partial_pos - prev_equal - begin );
+ bv.push_back( b );
+ }
+ combine( block_vector, bv );
}
- if( rd < buffer_size ) break; // EOF
- }
- if( b.pos() > 0 ) // finish last block
- {
- b.size( partial_pos - b.pos() );
- block_vector.push_back( b );
}
- for( unsigned i = 0; i < infd_vector.size(); ++i )
- delete[] buffer_vector[i];
+ delete[] buffer2; delete[] buffer1;
return !error;
}
@@ -116,15 +153,16 @@ int ipow( const unsigned base, const unsigned exponent )
int open_input_files( const std::vector< std::string > & filenames,
- std::vector< int > & infd_vector, long long & isize,
- const int verbosity )
+ std::vector< int > & infd_vector,
+ File_index & file_index, const int verbosity )
{
+ const int files = filenames.size();
bool identical = false;
- for( unsigned i = 1; i < filenames.size(); ++i )
+ for( int i = 1; i < files; ++i )
if( filenames[0] == filenames[i] )
{ identical = true; break; }
if( !identical )
- for( unsigned i = 0; i < filenames.size(); ++i )
+ for( int i = 0; i < files; ++i )
{
struct stat in_stats;
ino_t st_ino0 = 0;
@@ -137,15 +175,27 @@ int open_input_files( const std::vector< std::string > & filenames,
}
if( identical ) { show_error( "Two input files are the same." ); return 2; }
- isize = 0;
- for( unsigned i = 0; i < filenames.size(); ++i )
+ long long isize = 0;
+ for( int i = 0; i < files; ++i )
{
- const long long tmp = lseek( infd_vector[i], 0, SEEK_END );
- if( tmp < 0 )
+ long long tmp;
+ const File_index fi( infd_vector[i] );
+ if( fi.retval() == 0 ) // file format is intact
+ {
+ if( file_index.retval() != 0 ) file_index = fi;
+ else if( file_index != fi )
+ { show_error( "Input files are different." ); return 2; }
+ tmp = file_index.file_size();
+ }
+ else // file format is damaged
{
- if( verbosity >= 0 )
- std::fprintf( stderr, "File '%s' is not seekable.\n", filenames[i].c_str() );
- return 1;
+ tmp = lseek( infd_vector[i], 0, SEEK_END );
+ if( tmp < 0 )
+ {
+ if( verbosity >= 0 )
+ std::fprintf( stderr, "File '%s' is not seekable.\n", filenames[i].c_str() );
+ return 1;
+ }
}
if( i == 0 )
{
@@ -157,23 +207,33 @@ int open_input_files( const std::vector< std::string > & filenames,
{ show_error( "Sizes of input files are different." ); return 2; }
}
- for( unsigned i = 0; i < filenames.size(); ++i )
- if( !verify_single_member( infd_vector[i], isize, verbosity ) )
- return 2;
+ if( file_index.retval() != 0 )
+ {
+ const File_index fi( infd_vector, isize );
+ if( fi.retval() == 0 ) // file format could be recovered
+ file_index = fi;
+ else
+ { show_error( "Format damaged in all input files." ); return 2; }
+ }
- for( unsigned i = 0; i < filenames.size(); ++i )
+ for( int i = 0; i < files; ++i )
{
- if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 )
- { show_error( "Seek error in input file", errno ); return 1; }
- if( try_decompress( infd_vector[i], isize ) )
+ const int infd = infd_vector[i];
+ bool error = false;
+ for( int j = 0; j < file_index.members(); ++j )
+ {
+ const long long mpos = file_index.mblock( j ).pos();
+ const long long msize = file_index.mblock( j ).size();
+ if( !safe_seek( infd, mpos ) ) return 1;
+ if( !try_decompress_member( infd, msize ) ) { error = true; break; }
+ }
+ if( !error )
{
if( verbosity >= 1 )
std::printf( "File '%s' has no errors. Recovery is not needed.\n",
filenames[i].c_str() );
return 0;
}
- if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 )
- { show_error( "Seek error in input file", errno ); return 1; }
}
return -1;
}
@@ -221,16 +281,15 @@ bool copy_file( const int infd, const int outfd, const long long max_size )
}
-bool try_decompress( const int fd, const unsigned long long file_size,
- long long * failure_posp )
+bool try_decompress_member( const int fd, const unsigned long long msize,
+ long long * failure_posp )
{
try {
Range_decoder rdec( fd );
File_header header;
rdec.read_data( header.data, File_header::size );
if( !rdec.finished() && // End Of File
- header.verify_magic() &&
- header.version() == 1 &&
+ header.verify_magic() && header.verify_version() &&
header.dictionary_size() >= min_dictionary_size &&
header.dictionary_size() <= max_dictionary_size )
{
@@ -238,7 +297,7 @@ bool try_decompress( const int fd, const unsigned long long file_size,
Pretty_print dummy( "", -1 );
if( decoder.decode_member( dummy ) == 0 &&
- rdec.member_position() == file_size ) return true;
+ rdec.member_position() == msize ) return true;
if( failure_posp ) *failure_posp = rdec.member_position();
}
}
@@ -259,12 +318,7 @@ bool verify_header( const File_header & header, const int verbosity )
show_error( "Bad magic number (file not in lzip format)." );
return false;
}
- if( header.version() == 0 )
- {
- show_error( "Version 0 member format can't be recovered." );
- return false;
- }
- if( header.version() != 1 )
+ if( !header.verify_version() )
{
if( verbosity >= 0 )
std::fprintf( stderr, "Version %d member format not supported.\n",
@@ -275,116 +329,106 @@ bool verify_header( const File_header & header, const int verbosity )
}
-bool verify_single_member( const int fd, const long long file_size,
- const int verbosity )
- {
- File_header header;
- if( lseek( fd, 0, SEEK_SET ) < 0 ||
- readblock( fd, header.data, File_header::size ) != File_header::size )
- { show_error( "Error reading member header", errno ); return false; }
- if( !verify_header( header, verbosity ) ) return false;
-
- File_trailer trailer;
- if( lseek( fd, -File_trailer::size(), SEEK_END ) < 0 ||
- readblock( fd, trailer.data, File_trailer::size() ) != File_trailer::size() )
- { show_error( "Error reading member trailer", errno ); return false; }
- const long long member_size = trailer.member_size();
- if( member_size != file_size )
- {
- if( member_size < file_size &&
- lseek( fd, -member_size, SEEK_END ) > 0 &&
- readblock( fd, header.data, File_header::size ) == File_header::size &&
- verify_header( header, verbosity ) )
- show_error( "Input file has more than 1 member. Split it first." );
- else
- show_error( "Member size in input file trailer is corrupt." );
- return false;
- }
- return true;
- }
-
-
int merge_files( const std::vector< std::string > & filenames,
const std::string & output_filename, const int verbosity,
const bool force )
{
- std::vector< int > infd_vector( filenames.size() );
- long long isize = 0;
- const int retval = open_input_files( filenames, infd_vector, isize, verbosity );
+ const int files = filenames.size();
+ std::vector< int > infd_vector( files );
+ File_index file_index;
+ const int retval =
+ open_input_files( filenames, infd_vector, file_index, verbosity );
if( retval >= 0 ) return retval;
+ if( !safe_seek( infd_vector[0], 0 ) ) return 1;
const int outfd = open_outstream_rw( output_filename, force );
if( outfd < 0 ) return 1;
-
- // vector of data blocks differing among the copies of the input file.
- std::vector< Block > block_vector;
- if( !copy_and_diff_file( infd_vector, outfd, block_vector ) )
+ if( !copy_file( infd_vector[0], outfd ) ) // copy whole file
cleanup_and_fail( output_filename, outfd, 1 );
- if( block_vector.size() == 0 )
- { show_error( "Input files are identical. Recovery is not possible." );
- cleanup_and_fail( output_filename, outfd, 2 ); }
-
- const bool single_block = ( block_vector.size() == 1 );
- if( single_block && block_vector[0].size() < 2 )
- { show_error( "Input files have the same byte damaged."
- " Try repairing one of them." );
- cleanup_and_fail( output_filename, outfd, 2 ); }
+ for( int j = 0; j < file_index.members(); ++j )
+ {
+ const long long mpos = file_index.mblock( j ).pos();
+ const long long msize = file_index.mblock( j ).size();
+ // vector of data blocks differing among the copies of the current member
+ std::vector< Block > block_vector;
+ if( !diff_member( mpos, msize, infd_vector, block_vector ) ||
+ !safe_seek( outfd, mpos ) )
+ cleanup_and_fail( output_filename, outfd, 1 );
+
+ if( block_vector.size() == 0 )
+ {
+ if( file_index.members() > 1 && try_decompress_member( outfd, msize ) )
+ continue;
+ show_error( "Input files are (partially) identical. Recovery is not possible." );
+ cleanup_and_fail( output_filename, outfd, 2 );
+ }
- if( ipow( filenames.size(), block_vector.size() ) >= INT_MAX ||
- ( single_block &&
- ipow( filenames.size(), 2 ) >= INT_MAX / block_vector[0].size() ) )
- { show_error( "Input files are too damaged. Recovery is not possible." );
- cleanup_and_fail( output_filename, outfd, 2 ); }
+ const int size0 = block_vector[0].size();
+ const bool single_block = ( block_vector.size() == 1 );
+ if( ipow( files, block_vector.size() ) >= INT_MAX ||
+ ( single_block && ipow( files, 2 ) >= INT_MAX / size0 ) )
+ { show_error( "Input files are too damaged. Recovery is not possible." );
+ cleanup_and_fail( output_filename, outfd, 2 ); }
- const int shifts = ( single_block ? block_vector[0].size() - 1 : 1 );
- if( single_block )
- {
- Block b( block_vector[0].pos() + 1, block_vector[0].size() - 1 );
- block_vector[0].size( 1 );
- block_vector.push_back( b );
- }
+ const int shifts = ( single_block && size0 > 1 ) ? size0 - 1 : 1;
+ if( single_block && size0 > 1 )
+ {
+ Block b( block_vector[0].pos() + 1, size0 - 1 );
+ block_vector[0].size( 1 );
+ block_vector.push_back( b );
+ }
- const int base_variations = ipow( filenames.size(), block_vector.size() );
- const int variations = ( base_variations * shifts ) - 2;
- bool done = false;
- for( int var = 1; var <= variations; ++var )
- {
- if( verbosity >= 1 )
+ if( verbosity >= 1 && file_index.members() > 1 )
{
- std::printf( "Trying variation %d of %d \r", var, variations );
+ std::printf( "Merging member %d\n", j + 1 );
std::fflush( stdout );
}
- int tmp = var;
- for( unsigned i = 0; i < block_vector.size(); ++i )
+ const int base_variations = ipow( files, block_vector.size() );
+ const int variations = base_variations * shifts;
+ bool done = false;
+ for( int var = 0; var < variations; ++var )
{
- const int infd = infd_vector[tmp % filenames.size()];
- tmp /= filenames.size();
- if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 ||
- lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 ||
- !copy_file( infd, outfd, block_vector[i].size() ) )
- { show_error( "Error reading output file", errno );
- cleanup_and_fail( output_filename, outfd, 1 ); }
+ if( verbosity >= 1 )
+ {
+ std::printf( "Trying variation %d of %d \r", var + 1, variations );
+ std::fflush( stdout );
+ }
+ int tmp = var;
+ for( unsigned i = 0; i < block_vector.size(); ++i )
+ {
+ const int infd = infd_vector[tmp % files];
+ tmp /= files;
+ if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 ||
+ lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 ||
+ !copy_file( infd, outfd, block_vector[i].size() ) )
+ { show_error( "Error reading output file", errno );
+ cleanup_and_fail( output_filename, outfd, 1 ); }
+ }
+ if( !safe_seek( outfd, mpos ) )
+ cleanup_and_fail( output_filename, outfd, 1 );
+ if( try_decompress_member( outfd, msize ) )
+ { done = true; break; }
+ if( var > 0 && var % base_variations == 0 )
+ block_vector[0].shift( block_vector[1] );
+ }
+ if( verbosity >= 1 ) std::printf( "\n" );
+ if( !done )
+ {
+ if( verbosity >= 2 )
+ for( unsigned i = 0; i < block_vector.size(); ++i )
+ std::fprintf( stderr, "area %2d from offset %6lld to %6lld\n", i + 1,
+ block_vector[i].pos(), block_vector[i].end() - 1 );
+ show_error( "Some error areas overlap. Can't recover input file." );
+ cleanup_and_fail( output_filename, outfd, 2 );
}
- if( lseek( outfd, 0, SEEK_SET ) < 0 )
- { show_error( "Seek error in output file", errno );
- cleanup_and_fail( output_filename, outfd, 1 ); }
- if( try_decompress( outfd, isize ) )
- { done = true; break; }
- if( var % base_variations == 0 ) block_vector[0].shift( block_vector[1] );
}
- if( verbosity >= 1 ) std::printf( "\n" );
if( close( outfd ) != 0 )
{
show_error( "Error closing output file", errno );
cleanup_and_fail( output_filename, -1, 1 );
}
- if( !done )
- {
- show_error( "Some error areas overlap. Can't recover input file." );
- cleanup_and_fail( output_filename, -1, 2 );
- }
if( verbosity >= 1 )
std::printf( "Input files merged successfully.\n" );
return 0;