diff options
Diffstat (limited to 'merge.cc')
-rw-r--r-- | merge.cc | 378 |
1 files changed, 211 insertions, 167 deletions
@@ -35,70 +35,107 @@ namespace { -bool copy_and_diff_file( const std::vector< int > & infd_vector, - const int outfd, std::vector< Block > & block_vector ) +// Add 'bv' to 'block_vector' splitting blocks as needed to keep all the +// edges (pos and end of every block). +// 'block_vector' contains the result. 'bv' is destroyed. +void combine( std::vector< Block > & block_vector, std::vector< Block > & bv ) { + if( block_vector.empty() ) { block_vector.swap( bv ); return; } + unsigned i1 = 0, i2 = 0; + while( i1 < block_vector.size() && i2 < bv.size() ) + { + Block & b1 = block_vector[i1]; + Block & b2 = bv[i2]; + if( b1.overlaps( b2 ) ) + { + if( b1 < b2 ) + { + Block b = b1.split( b2.pos() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; + } + else if( b2 < b1 ) + { + Block b( b2.pos(), b1.pos() - b2.pos() ); + b2.split( b1.pos() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; + } + else if( b1.end() < b2.end() ) { b2.split( b1.end() ); ++i1; } + else if( b2.end() < b1.end() ) + { + Block b = b1.split( b2.end() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; ++i2; + } + else { ++i1; ++i2; } // blocks are identical + } + else if( b1 < b2 ) ++i1; + else { block_vector.insert( block_vector.begin() + i1, b2 ); ++i1; ++i2; } + } + if( i2 < bv.size() ) // tail copy + block_vector.insert( block_vector.end(), bv.begin() + i2, bv.end() ); + } + + +bool diff_member( const long long mpos, const long long msize, + const std::vector< int > & infd_vector, + std::vector< Block > & block_vector ) + { + const int files = infd_vector.size(); const int buffer_size = 65536; - std::vector< uint8_t * > buffer_vector( infd_vector.size() ); - for( unsigned i = 0; i < infd_vector.size(); ++i ) - buffer_vector[i] = new uint8_t[buffer_size]; - Block b( 0, 0 ); - long long partial_pos = 0; - int equal_bytes = 0; - bool error = false; + uint8_t * const buffer1 = new uint8_t[buffer_size]; + uint8_t * const buffer2 = new uint8_t[buffer_size]; - while( true ) + bool error = false; + for( int i1 = 0; i1 + 1 < files && !error; ++i1 ) { - const int rd = readblock( infd_vector[0], buffer_vector[0], buffer_size ); - if( rd != buffer_size && errno ) - { show_error( "Error reading input file", errno ); error = true; break; } - if( rd > 0 ) + for( int i2 = i1 + 1; i2 < files && !error; ++i2 ) { - for( unsigned i = 1; i < infd_vector.size(); ++i ) - if( readblock( infd_vector[i], buffer_vector[i], rd ) != rd ) - { show_error( "Error reading input file", errno ); - error = true; break; } - if( error ) break; - const int wr = writeblock( outfd, buffer_vector[0], rd ); - if( wr != rd ) - { show_error( "Error writing output file", errno ); - error = true; break; } - for( int i = 0; i < rd; ++i ) + std::vector< Block > bv; + long long partial_pos = 0; + const int fd1 = infd_vector[i1], fd2 = infd_vector[i2]; + int begin = -1; // begin of block. -1 means no block + bool prev_equal = true; + if( !safe_seek( fd1, mpos ) || !safe_seek( fd2, mpos ) ) + { error = true; break; } + + while( msize > partial_pos ) { - while( i < rd && b.pos() == 0 ) - { - for( unsigned j = 1; j < infd_vector.size(); ++j ) - if( buffer_vector[0][i] != buffer_vector[j][i] ) - { b.pos( partial_pos + i ); break; } // begin block - ++i; - } - while( i < rd && b.pos() > 0 ) + const int size = std::min( (long long)buffer_size, msize - partial_pos ); + const int rd = readblock( fd1, buffer1, size ); + if( rd != size && errno ) + { show_error( "Error reading input file", errno ); error = true; break; } + if( rd > 0 ) { - ++equal_bytes; - for( unsigned j = 1; j < infd_vector.size(); ++j ) - if( buffer_vector[0][i] != buffer_vector[j][i] ) - { equal_bytes = 0; break; } - if( equal_bytes >= 2 ) // end block + if( readblock( fd2, buffer2, rd ) != rd ) + { show_error( "Error reading input file", errno ); + error = true; break; } + for( int i = 0; i < rd; ++i ) { - b.size( partial_pos + i - ( equal_bytes - 1 ) - b.pos() ); - block_vector.push_back( b ); - b.pos( 0 ); - equal_bytes = 0; + if( buffer1[i] != buffer2[i] ) + { + prev_equal = false; + if( begin < 0 ) begin = partial_pos + i; // begin block + } + else if( !prev_equal ) prev_equal = true; + else if( begin >= 0 ) // end block + { + Block b( mpos + begin, partial_pos + i - 1 - begin ); + begin = -1; + bv.push_back( b ); + } } - ++i; + partial_pos += rd; } + if( rd < buffer_size ) break; // EOF } - partial_pos += rd; + if( begin >= 0 ) // finish last block + { + Block b( mpos + begin, partial_pos - prev_equal - begin ); + bv.push_back( b ); + } + combine( block_vector, bv ); } - if( rd < buffer_size ) break; // EOF - } - if( b.pos() > 0 ) // finish last block - { - b.size( partial_pos - b.pos() ); - block_vector.push_back( b ); } - for( unsigned i = 0; i < infd_vector.size(); ++i ) - delete[] buffer_vector[i]; + delete[] buffer2; delete[] buffer1; return !error; } @@ -116,15 +153,16 @@ int ipow( const unsigned base, const unsigned exponent ) int open_input_files( const std::vector< std::string > & filenames, - std::vector< int > & infd_vector, long long & isize, - const int verbosity ) + std::vector< int > & infd_vector, + File_index & file_index, const int verbosity ) { + const int files = filenames.size(); bool identical = false; - for( unsigned i = 1; i < filenames.size(); ++i ) + for( int i = 1; i < files; ++i ) if( filenames[0] == filenames[i] ) { identical = true; break; } if( !identical ) - for( unsigned i = 0; i < filenames.size(); ++i ) + for( int i = 0; i < files; ++i ) { struct stat in_stats; ino_t st_ino0 = 0; @@ -137,15 +175,27 @@ int open_input_files( const std::vector< std::string > & filenames, } if( identical ) { show_error( "Two input files are the same." ); return 2; } - isize = 0; - for( unsigned i = 0; i < filenames.size(); ++i ) + long long isize = 0; + for( int i = 0; i < files; ++i ) { - const long long tmp = lseek( infd_vector[i], 0, SEEK_END ); - if( tmp < 0 ) + long long tmp; + const File_index fi( infd_vector[i] ); + if( fi.retval() == 0 ) // file format is intact + { + if( file_index.retval() != 0 ) file_index = fi; + else if( file_index != fi ) + { show_error( "Input files are different." ); return 2; } + tmp = file_index.file_size(); + } + else // file format is damaged { - if( verbosity >= 0 ) - std::fprintf( stderr, "File '%s' is not seekable.\n", filenames[i].c_str() ); - return 1; + tmp = lseek( infd_vector[i], 0, SEEK_END ); + if( tmp < 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "File '%s' is not seekable.\n", filenames[i].c_str() ); + return 1; + } } if( i == 0 ) { @@ -157,23 +207,33 @@ int open_input_files( const std::vector< std::string > & filenames, { show_error( "Sizes of input files are different." ); return 2; } } - for( unsigned i = 0; i < filenames.size(); ++i ) - if( !verify_single_member( infd_vector[i], isize, verbosity ) ) - return 2; + if( file_index.retval() != 0 ) + { + const File_index fi( infd_vector, isize ); + if( fi.retval() == 0 ) // file format could be recovered + file_index = fi; + else + { show_error( "Format damaged in all input files." ); return 2; } + } - for( unsigned i = 0; i < filenames.size(); ++i ) + for( int i = 0; i < files; ++i ) { - if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 ) - { show_error( "Seek error in input file", errno ); return 1; } - if( try_decompress( infd_vector[i], isize ) ) + const int infd = infd_vector[i]; + bool error = false; + for( int j = 0; j < file_index.members(); ++j ) + { + const long long mpos = file_index.mblock( j ).pos(); + const long long msize = file_index.mblock( j ).size(); + if( !safe_seek( infd, mpos ) ) return 1; + if( !try_decompress_member( infd, msize ) ) { error = true; break; } + } + if( !error ) { if( verbosity >= 1 ) std::printf( "File '%s' has no errors. Recovery is not needed.\n", filenames[i].c_str() ); return 0; } - if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 ) - { show_error( "Seek error in input file", errno ); return 1; } } return -1; } @@ -221,16 +281,15 @@ bool copy_file( const int infd, const int outfd, const long long max_size ) } -bool try_decompress( const int fd, const unsigned long long file_size, - long long * failure_posp ) +bool try_decompress_member( const int fd, const unsigned long long msize, + long long * failure_posp ) { try { Range_decoder rdec( fd ); File_header header; rdec.read_data( header.data, File_header::size ); if( !rdec.finished() && // End Of File - header.verify_magic() && - header.version() == 1 && + header.verify_magic() && header.verify_version() && header.dictionary_size() >= min_dictionary_size && header.dictionary_size() <= max_dictionary_size ) { @@ -238,7 +297,7 @@ bool try_decompress( const int fd, const unsigned long long file_size, Pretty_print dummy( "", -1 ); if( decoder.decode_member( dummy ) == 0 && - rdec.member_position() == file_size ) return true; + rdec.member_position() == msize ) return true; if( failure_posp ) *failure_posp = rdec.member_position(); } } @@ -259,12 +318,7 @@ bool verify_header( const File_header & header, const int verbosity ) show_error( "Bad magic number (file not in lzip format)." ); return false; } - if( header.version() == 0 ) - { - show_error( "Version 0 member format can't be recovered." ); - return false; - } - if( header.version() != 1 ) + if( !header.verify_version() ) { if( verbosity >= 0 ) std::fprintf( stderr, "Version %d member format not supported.\n", @@ -275,116 +329,106 @@ bool verify_header( const File_header & header, const int verbosity ) } -bool verify_single_member( const int fd, const long long file_size, - const int verbosity ) - { - File_header header; - if( lseek( fd, 0, SEEK_SET ) < 0 || - readblock( fd, header.data, File_header::size ) != File_header::size ) - { show_error( "Error reading member header", errno ); return false; } - if( !verify_header( header, verbosity ) ) return false; - - File_trailer trailer; - if( lseek( fd, -File_trailer::size(), SEEK_END ) < 0 || - readblock( fd, trailer.data, File_trailer::size() ) != File_trailer::size() ) - { show_error( "Error reading member trailer", errno ); return false; } - const long long member_size = trailer.member_size(); - if( member_size != file_size ) - { - if( member_size < file_size && - lseek( fd, -member_size, SEEK_END ) > 0 && - readblock( fd, header.data, File_header::size ) == File_header::size && - verify_header( header, verbosity ) ) - show_error( "Input file has more than 1 member. Split it first." ); - else - show_error( "Member size in input file trailer is corrupt." ); - return false; - } - return true; - } - - int merge_files( const std::vector< std::string > & filenames, const std::string & output_filename, const int verbosity, const bool force ) { - std::vector< int > infd_vector( filenames.size() ); - long long isize = 0; - const int retval = open_input_files( filenames, infd_vector, isize, verbosity ); + const int files = filenames.size(); + std::vector< int > infd_vector( files ); + File_index file_index; + const int retval = + open_input_files( filenames, infd_vector, file_index, verbosity ); if( retval >= 0 ) return retval; + if( !safe_seek( infd_vector[0], 0 ) ) return 1; const int outfd = open_outstream_rw( output_filename, force ); if( outfd < 0 ) return 1; - - // vector of data blocks differing among the copies of the input file. - std::vector< Block > block_vector; - if( !copy_and_diff_file( infd_vector, outfd, block_vector ) ) + if( !copy_file( infd_vector[0], outfd ) ) // copy whole file cleanup_and_fail( output_filename, outfd, 1 ); - if( block_vector.size() == 0 ) - { show_error( "Input files are identical. Recovery is not possible." ); - cleanup_and_fail( output_filename, outfd, 2 ); } - - const bool single_block = ( block_vector.size() == 1 ); - if( single_block && block_vector[0].size() < 2 ) - { show_error( "Input files have the same byte damaged." - " Try repairing one of them." ); - cleanup_and_fail( output_filename, outfd, 2 ); } + for( int j = 0; j < file_index.members(); ++j ) + { + const long long mpos = file_index.mblock( j ).pos(); + const long long msize = file_index.mblock( j ).size(); + // vector of data blocks differing among the copies of the current member + std::vector< Block > block_vector; + if( !diff_member( mpos, msize, infd_vector, block_vector ) || + !safe_seek( outfd, mpos ) ) + cleanup_and_fail( output_filename, outfd, 1 ); + + if( block_vector.size() == 0 ) + { + if( file_index.members() > 1 && try_decompress_member( outfd, msize ) ) + continue; + show_error( "Input files are (partially) identical. Recovery is not possible." ); + cleanup_and_fail( output_filename, outfd, 2 ); + } - if( ipow( filenames.size(), block_vector.size() ) >= INT_MAX || - ( single_block && - ipow( filenames.size(), 2 ) >= INT_MAX / block_vector[0].size() ) ) - { show_error( "Input files are too damaged. Recovery is not possible." ); - cleanup_and_fail( output_filename, outfd, 2 ); } + const int size0 = block_vector[0].size(); + const bool single_block = ( block_vector.size() == 1 ); + if( ipow( files, block_vector.size() ) >= INT_MAX || + ( single_block && ipow( files, 2 ) >= INT_MAX / size0 ) ) + { show_error( "Input files are too damaged. Recovery is not possible." ); + cleanup_and_fail( output_filename, outfd, 2 ); } - const int shifts = ( single_block ? block_vector[0].size() - 1 : 1 ); - if( single_block ) - { - Block b( block_vector[0].pos() + 1, block_vector[0].size() - 1 ); - block_vector[0].size( 1 ); - block_vector.push_back( b ); - } + const int shifts = ( single_block && size0 > 1 ) ? size0 - 1 : 1; + if( single_block && size0 > 1 ) + { + Block b( block_vector[0].pos() + 1, size0 - 1 ); + block_vector[0].size( 1 ); + block_vector.push_back( b ); + } - const int base_variations = ipow( filenames.size(), block_vector.size() ); - const int variations = ( base_variations * shifts ) - 2; - bool done = false; - for( int var = 1; var <= variations; ++var ) - { - if( verbosity >= 1 ) + if( verbosity >= 1 && file_index.members() > 1 ) { - std::printf( "Trying variation %d of %d \r", var, variations ); + std::printf( "Merging member %d\n", j + 1 ); std::fflush( stdout ); } - int tmp = var; - for( unsigned i = 0; i < block_vector.size(); ++i ) + const int base_variations = ipow( files, block_vector.size() ); + const int variations = base_variations * shifts; + bool done = false; + for( int var = 0; var < variations; ++var ) { - const int infd = infd_vector[tmp % filenames.size()]; - tmp /= filenames.size(); - if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 || - lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 || - !copy_file( infd, outfd, block_vector[i].size() ) ) - { show_error( "Error reading output file", errno ); - cleanup_and_fail( output_filename, outfd, 1 ); } + if( verbosity >= 1 ) + { + std::printf( "Trying variation %d of %d \r", var + 1, variations ); + std::fflush( stdout ); + } + int tmp = var; + for( unsigned i = 0; i < block_vector.size(); ++i ) + { + const int infd = infd_vector[tmp % files]; + tmp /= files; + if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 || + lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 || + !copy_file( infd, outfd, block_vector[i].size() ) ) + { show_error( "Error reading output file", errno ); + cleanup_and_fail( output_filename, outfd, 1 ); } + } + if( !safe_seek( outfd, mpos ) ) + cleanup_and_fail( output_filename, outfd, 1 ); + if( try_decompress_member( outfd, msize ) ) + { done = true; break; } + if( var > 0 && var % base_variations == 0 ) + block_vector[0].shift( block_vector[1] ); + } + if( verbosity >= 1 ) std::printf( "\n" ); + if( !done ) + { + if( verbosity >= 2 ) + for( unsigned i = 0; i < block_vector.size(); ++i ) + std::fprintf( stderr, "area %2d from offset %6lld to %6lld\n", i + 1, + block_vector[i].pos(), block_vector[i].end() - 1 ); + show_error( "Some error areas overlap. Can't recover input file." ); + cleanup_and_fail( output_filename, outfd, 2 ); } - if( lseek( outfd, 0, SEEK_SET ) < 0 ) - { show_error( "Seek error in output file", errno ); - cleanup_and_fail( output_filename, outfd, 1 ); } - if( try_decompress( outfd, isize ) ) - { done = true; break; } - if( var % base_variations == 0 ) block_vector[0].shift( block_vector[1] ); } - if( verbosity >= 1 ) std::printf( "\n" ); if( close( outfd ) != 0 ) { show_error( "Error closing output file", errno ); cleanup_and_fail( output_filename, -1, 1 ); } - if( !done ) - { - show_error( "Some error areas overlap. Can't recover input file." ); - cleanup_and_fail( output_filename, -1, 2 ); - } if( verbosity >= 1 ) std::printf( "Input files merged successfully.\n" ); return 0; |