diff options
Diffstat (limited to 'merge.cc')
-rw-r--r-- | merge.cc | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/merge.cc b/merge.cc new file mode 100644 index 0000000..298d90b --- /dev/null +++ b/merge.cc @@ -0,0 +1,406 @@ +/* Lziprecover - Data recovery tool for lzipped files + Copyright (C) 2009, 2010, 2011 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "decoder.h" + + +namespace { + +class Block + { + long long pos_, size_; // pos + size <= LLONG_MAX + +public: + Block( const long long p, const long long s ) throw() + : pos_( p ), size_( s ) {} + + long long pos() const throw() { return pos_; } + long long size() const throw() { return size_; } + long long end() const throw() { return pos_ + size_; } + + void pos( const long long p ) throw() { pos_ = p; } + void size( const long long s ) throw() { size_ = s; } + void shift( Block & b ) throw() { ++size_; ++b.pos_; --b.size_; } + }; + + +bool copy_and_diff_file( const std::vector< int > & infd_vector, + const int outfd, std::vector< Block > & block_vector ) + { + const int buffer_size = 65536; + std::vector< uint8_t * > buffer_vector( infd_vector.size() ); + for( unsigned int i = 0; i < infd_vector.size(); ++i ) + buffer_vector[i] = new uint8_t[buffer_size]; + Block b( 0, 0 ); + long long partial_pos = 0; + int equal_bytes = 0; + bool error = false; + + while( true ) + { + const int rd = readblock( infd_vector[0], buffer_vector[0], buffer_size ); + if( rd != buffer_size && errno ) + { show_error( "Error reading input file", errno ); error = true; break; } + if( rd > 0 ) + { + for( unsigned int i = 1; i < infd_vector.size(); ++i ) + if( readblock( infd_vector[i], buffer_vector[i], rd ) != rd ) + { show_error( "Error reading input file", errno ); + error = true; break; } + if( error ) break; + const int wr = writeblock( outfd, buffer_vector[0], rd ); + if( wr != rd ) + { show_error( "Error writing output file", errno ); + error = true; break; } + for( int i = 0; i < rd; ++i ) + { + while( i < rd && b.pos() == 0 ) + { + for( unsigned int j = 1; j < infd_vector.size(); ++j ) + if( buffer_vector[0][i] != buffer_vector[j][i] ) + { b.pos( partial_pos + i ); break; } // begin block + ++i; + } + while( i < rd && b.pos() > 0 ) + { + ++equal_bytes; + for( unsigned int j = 1; j < infd_vector.size(); ++j ) + if( buffer_vector[0][i] != buffer_vector[j][i] ) + { equal_bytes = 0; break; } + if( equal_bytes >= 2 ) // end block + { + b.size( partial_pos + i - ( equal_bytes - 1 ) - b.pos() ); + block_vector.push_back( b ); + b.pos( 0 ); + equal_bytes = 0; + } + ++i; + } + } + partial_pos += rd; + } + if( rd < buffer_size ) break; // EOF + } + if( b.pos() > 0 ) // finish last block + { + b.size( partial_pos - b.pos() ); + block_vector.push_back( b ); + } + for( unsigned int i = 0; i < infd_vector.size(); ++i ) + delete[] buffer_vector[i]; + return !error; + } + + +int ipow( const unsigned int base, const unsigned int exponent ) throw() + { + int result = 1; + for( unsigned int i = 0; i < exponent; ++i ) + { + if( INT_MAX / base >= (unsigned int)result ) result *= base; + else { result = INT_MAX; break; } + } + return result; + } + + +int open_input_files( const std::vector< std::string > & filenames, + std::vector< int > & infd_vector, long long & isize ) + { + bool identical = false; + for( unsigned int i = 1; i < filenames.size(); ++i ) + if( filenames[0] == filenames[i] ) + { identical = true; break; } + if( !identical ) + for( unsigned int i = 0; i < filenames.size(); ++i ) + { + struct stat in_stats; + ino_t st_ino0 = 0; + dev_t st_dev0 = 0; + infd_vector[i] = open_instream( filenames[i], &in_stats, true, true ); + if( infd_vector[i] < 0 ) return 1; + if( i == 0 ) { st_ino0 = in_stats.st_ino; st_dev0 = in_stats.st_dev; } + else if( st_ino0 == in_stats.st_ino && st_dev0 == in_stats.st_dev ) + { identical = true; break; } + } + if( identical ) { show_error( "Two input files are the same." ); return 1; } + + isize = 0; + for( unsigned int i = 0; i < filenames.size(); ++i ) + { + const long long tmp = lseek( infd_vector[i], 0, SEEK_END ); + if( tmp < 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "File `%s' is not seekable.\n", filenames[i].c_str() ); + return 1; + } + if( i == 0 ) + { + isize = tmp; + if( isize < 36 ) { show_error( "Input file is too short." ); return 2; } + } + else if( isize != tmp ) + { show_error( "Sizes of input files are different." ); return 1; } + } + + for( unsigned int i = 0; i < filenames.size(); ++i ) + if( !verify_single_member( infd_vector[i], isize ) ) + return 2; + + for( unsigned int i = 0; i < filenames.size(); ++i ) + { + if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 ) + { show_error( "Seek error in input file", errno ); return 1; } + if( try_decompress( infd_vector[i], isize ) ) + { + if( verbosity >= 1 ) + std::printf( "File `%s' has no errors. Recovery is not needed.\n", + filenames[i].c_str() ); + return 0; + } + if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 ) + { show_error( "Seek error in input file", errno ); return 1; } + } + return -1; + } + +} // end namespace + + +void cleanup_and_fail( const std::string & output_filename, + const int outfd, const int retval ) throw() + { + if( outfd >= 0 ) close( outfd ); + if( std::remove( output_filename.c_str() ) != 0 && errno != ENOENT ) + show_error( "WARNING: deletion of output file (apparently) failed." ); + std::exit( retval ); + } + + +bool copy_file( const int infd, const int outfd, const long long size ) + { + long long rest = size; + const int buffer_size = 65536; + uint8_t * const buffer = new uint8_t[buffer_size]; + bool error = false; + + while( true ) + { + const int block_size = std::min( (long long)buffer_size, rest ); + if( block_size <= 0 ) break; + const int rd = readblock( infd, buffer, block_size ); + if( rd != block_size && errno ) + { show_error( "Error reading input file", errno ); error = true; break; } + if( rd > 0 ) + { + const int wr = writeblock( outfd, buffer, rd ); + if( wr != rd ) + { show_error( "Error writing output file", errno ); + error = true; break; } + rest -= rd; + } + if( rd < block_size ) break; // EOF + } + delete[] buffer; + return !error; + } + + +bool try_decompress( const int fd, const long long file_size, + long long * failure_posp ) + { + try { + Range_decoder rdec( fd ); + File_header header; + rdec.reset_member_position(); + for( int i = 0; i < File_header::size; ++i ) + header.data[i] = rdec.get_byte(); + if( !rdec.finished() && // End Of File + header.verify_magic() && + header.version() == 1 && + header.dictionary_size() >= min_dictionary_size && + header.dictionary_size() <= max_dictionary_size ) + { + LZ_decoder decoder( header, rdec, -1 ); + std::vector< std::string > dummy_filenames; + Pretty_print dummy( dummy_filenames, -1 ); + + if( decoder.decode_member( dummy ) == 0 && + rdec.member_position() == file_size ) return true; + if( failure_posp ) *failure_posp = rdec.member_position(); + } + } + catch( std::bad_alloc ) + { + show_error( "Not enough memory. Find a machine with more memory." ); + std::exit( 1 ); + } + catch( Error e ) {} + return false; + } + + +bool verify_header( const File_header & header ) + { + if( !header.verify_magic() ) + { + show_error( "Bad magic number (file not in lzip format)." ); + return false; + } + if( header.version() == 0 ) + { + show_error( "Version 0 member format can't be recovered." ); + return false; + } + if( header.version() != 1 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "Version %d member format not supported.\n", + header.version() ); + return false; + } + return true; + } + + +bool verify_single_member( const int fd, const long long file_size ) + { + File_header header; + if( lseek( fd, 0, SEEK_SET ) < 0 || + readblock( fd, header.data, File_header::size ) != File_header::size ) + { show_error( "Error reading member header", errno ); return false; } + if( !verify_header( header ) ) return false; + + File_trailer trailer; + if( lseek( fd, -File_trailer::size(), SEEK_END ) < 0 || + readblock( fd, trailer.data, File_trailer::size() ) != File_trailer::size() ) + { show_error( "Error reading member trailer", errno ); return false; } + const long long member_size = trailer.member_size(); + if( member_size != file_size ) + { + if( member_size < file_size && + lseek( fd, -member_size, SEEK_END ) > 0 && + readblock( fd, header.data, File_header::size ) == File_header::size && + verify_header( header ) ) + show_error( "Input file has more than 1 member. Split it first." ); + else + show_error( "Member size in input file trailer is corrupt." ); + return false; + } + return true; + } + + +int merge_files( const std::vector< std::string > & filenames, + const std::string & output_filename, const bool force ) + { + std::vector< int > infd_vector( filenames.size() ); + long long isize = 0; + const int retval = open_input_files( filenames, infd_vector, isize ); + if( retval >= 0 ) return retval; + + const int outfd = open_outstream_rw( output_filename, force ); + if( outfd < 0 ) return 1; + + // vector of data blocks differing among the copies of the input file. + std::vector< Block > block_vector; + if( !copy_and_diff_file( infd_vector, outfd, block_vector ) ) + cleanup_and_fail( output_filename, outfd, 1 ); + + if( !block_vector.size() ) + { show_error( "Input files are identical. Recovery is not possible." ); + cleanup_and_fail( output_filename, outfd, 2 ); } + + const bool single_block = ( block_vector.size() == 1 ); + if( single_block && block_vector[0].size() < 2 ) + { show_error( "Input files have the same byte damaged." + " Try repairing one of them." ); + cleanup_and_fail( output_filename, outfd, 2 ); } + + if( ipow( filenames.size(), block_vector.size() ) >= INT_MAX || + ( single_block && + ipow( filenames.size(), 2 ) >= INT_MAX / block_vector[0].size() ) ) + { show_error( "Input files are too damaged. Recovery is not possible." ); + cleanup_and_fail( output_filename, outfd, 2 ); } + + const int shifts = ( single_block ? block_vector[0].size() - 1 : 1 ); + if( single_block ) + { + Block b( block_vector[0].pos() + 1, block_vector[0].size() - 1 ); + block_vector[0].size( 1 ); + block_vector.push_back( b ); + } + + const int base_variations = ipow( filenames.size(), block_vector.size() ); + const int variations = ( base_variations * shifts ) - 2; + bool done = false; + for( int var = 1; var <= variations; ++var ) + { + if( verbosity >= 1 ) + { + std::printf( "Trying variation %d of %d \r", var, variations ); + std::fflush( stdout ); + } + int tmp = var; + for( unsigned int i = 0; i < block_vector.size(); ++i ) + { + const int infd = infd_vector[tmp % filenames.size()]; + tmp /= filenames.size(); + if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 || + lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 || + !copy_file( infd, outfd, block_vector[i].size() ) ) + { show_error( "Error reading output file", errno ); + cleanup_and_fail( output_filename, outfd, 1 ); } + } + if( lseek( outfd, 0, SEEK_SET ) < 0 ) + { show_error( "Seek error in output file", errno ); + cleanup_and_fail( output_filename, outfd, 1 ); } + if( try_decompress( outfd, isize ) ) + { done = true; break; } + if( var % base_variations == 0 ) block_vector[0].shift( block_vector[1] ); + } + if( verbosity >= 1 ) std::printf( "\n" ); + + if( close( outfd ) != 0 ) + { + show_error( "Error closing output file", errno ); + cleanup_and_fail( output_filename, -1, 1 ); + } + if( !done ) + { + show_error( "Some error areas overlap. Can't recover input file." ); + cleanup_and_fail( output_filename, -1, 2 ); + } + if( verbosity >= 1 ) + std::printf( "Input files merged successfully.\n" ); + return 0; + } |