summary refs log tree commit diff stats
path: root/merge.cc
diff options
context:
space:
mode:
Diffstat (limited to 'merge.cc')
-rw-r--r-- merge.cc 406
1 files changed, 406 insertions, 0 deletions
diff --git a/merge.cc b/merge.cc
new file mode 100644
index 0000000..298d90b
--- /dev/null
+++ b/merge.cc
@@ -0,0 +1,406 @@
+/* Lziprecover - Data recovery tool for lzipped files
+ Copyright (C) 2009, 2010, 2011 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <algorithm>
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <new>
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "lzip.h"
+#include "decoder.h"
+
+
+namespace {
+
+// A run of bytes in a file, described by its starting offset and length.
+// The class performs no validation; users are expected to keep
+// pos + size <= LLONG_MAX so that end() does not overflow.
+class Block
+  {
+public:
+  Block( const long long p, const long long s ) throw()
+    : pos_( p ), size_( s ) {}
+
+  // Read access.
+  long long pos() const throw() { return pos_; }
+  long long size() const throw() { return size_; }
+  // Offset of the first byte past the block.
+  long long end() const throw() { return pos_ + size_; }
+
+  // Write access.
+  void pos( const long long p ) throw() { pos_ = p; }
+  void size( const long long s ) throw() { size_ = s; }
+
+  // Grows this block by one byte at its end and removes one byte from the
+  // front of 'b' (its start advances by one, its size shrinks by one).
+  void shift( Block & b ) throw()
+    {
+    size_ += 1;
+    b.pos_ += 1;
+    b.size_ -= 1;
+    }
+
+private:
+  long long pos_;
+  long long size_;
+  };
+
+
+// Returns true if, at offset 'i', any buffer other than the first holds a
+// byte different from the one in the first buffer.
+bool bytes_differ( const std::vector< std::vector< uint8_t > > & buffers,
+                   const int i ) throw()
+  {
+  for( unsigned int j = 1; j < buffers.size(); ++j )
+    if( buffers[0][i] != buffers[j][i] ) return true;
+  return false;
+  }
+
+
+// Copies the first input file to 'outfd' while comparing it byte by byte
+// against all the other input files, which are read in lock step.
+//
+// Every run of bytes where the inputs disagree is appended to
+// 'block_vector' as a Block (absolute offset + length). A run is closed
+// once two consecutive positions are equal in all the files; those two
+// equal bytes are not part of the block. A run still open at end of file
+// is closed at the end of the data read.
+//
+// Returns false (after reporting through show_error) on a read or write
+// error, true otherwise. Blocks found before an error are still appended.
+//
+// 'infd_vector' must not be empty. The errno test after the first read
+// assumes readblock leaves errno at 0 on a clean short read -- TODO confirm
+// against its definition.
+bool copy_and_diff_file( const std::vector< int > & infd_vector,
+                         const int outfd, std::vector< Block > & block_vector )
+  {
+  const int buffer_size = 65536;
+  // One buffer per input file. Owned by the vector so that they are
+  // released on every exit path, including a failed allocation.
+  std::vector< std::vector< uint8_t > >
+    buffers( infd_vector.size(), std::vector< uint8_t >( buffer_size ) );
+  Block b( 0, 0 );
+  bool in_block = false;		// explicit flag; offset 0 is a valid start
+  long long partial_pos = 0;		// file offset of buffers[.][0]
+  int equal_bytes = 0;			// consecutive equal bytes inside a block
+  bool error = false;
+
+  while( true )
+    {
+    const int rd = readblock( infd_vector[0], &buffers[0][0], buffer_size );
+    if( rd != buffer_size && errno )
+      { show_error( "Error reading input file", errno ); error = true; break; }
+    if( rd > 0 )
+      {
+      // the other files must provide exactly as many bytes as the first
+      for( unsigned int i = 1; i < infd_vector.size(); ++i )
+        if( readblock( infd_vector[i], &buffers[i][0], rd ) != rd )
+          { show_error( "Error reading input file", errno );
+            error = true; break; }
+      if( error ) break;
+      const int wr = writeblock( outfd, &buffers[0][0], rd );
+      if( wr != rd )
+        { show_error( "Error writing output file", errno );
+          error = true; break; }
+      // Every byte is examined exactly once. (Advancing the index both in
+      // inner loops and in the outer loop would skip the byte that follows
+      // the end of each block.)
+      for( int i = 0; i < rd; ++i )
+        {
+        const bool differ = bytes_differ( buffers, i );
+        if( !in_block )
+          {
+          if( differ )				// begin block
+            { b.pos( partial_pos + i ); in_block = true; }
+          }
+        else
+          {
+          if( differ ) equal_bytes = 0; else ++equal_bytes;
+          if( equal_bytes >= 2 )		// end block
+            {
+            // the block ends just before the first of the equal bytes
+            b.size( partial_pos + i - ( equal_bytes - 1 ) - b.pos() );
+            block_vector.push_back( b );
+            in_block = false;
+            equal_bytes = 0;
+            }
+          }
+        }
+      partial_pos += rd;
+      }
+    if( rd < buffer_size ) break;			// EOF
+    }
+  if( in_block )				// finish last block
+    {
+    b.size( partial_pos - b.pos() );
+    block_vector.push_back( b );
+    }
+  return !error;
+  }
+
+
+// Returns 'base' raised to 'exponent', saturating at INT_MAX when the
+// exact result does not fit in an int. Anything to the power 0 is 1
+// (including 0^0); 0 to any positive power is 0.
+int ipow( const unsigned int base, const unsigned int exponent ) throw()
+  {
+  if( exponent == 0 ) return 1;
+  // 0 and 1 are fixed points. Handling them here avoids a division by
+  // zero below and a pointless loop of 'exponent' iterations.
+  if( base <= 1 ) return base;
+  int result = 1;
+  for( unsigned int i = 0; i < exponent; ++i )
+    {
+    // result * base fits in an int iff result <= INT_MAX / base
+    if( INT_MAX / base >= (unsigned int)result ) result *= base;
+    else { result = INT_MAX; break; }
+    }
+  return result;
+  }
+
+
+// Opens and validates all the input files for merging.
+//
+// On return 'infd_vector[i]' holds the descriptor opened for
+// 'filenames[i]' and 'isize' the common size of the files. 'infd_vector'
+// must already have at least filenames.size() elements.
+//
+// Return value:
+//   -1  all files opened and checked, none decompresses cleanly; the
+//       descriptors are positioned at offset 0 and merging can proceed.
+//    0  one of the files decompresses without errors; nothing to recover.
+//    1  a file could not be opened or is not seekable, two files are the
+//       same, or the sizes differ.
+//    2  the first file is shorter than 36 bytes or a file failed
+//       verify_single_member.
+// Descriptors already opened are left open on the non-negative returns.
+int open_input_files( const std::vector< std::string > & filenames,
+                      std::vector< int > & infd_vector, long long & isize )
+  {
+  // First a cheap test by name, then a test by inode/device against the
+  // first file as the files are opened.
+  bool identical = false;
+  for( unsigned int i = 1; i < filenames.size(); ++i )
+    if( filenames[0] == filenames[i] )
+      { identical = true; break; }
+  if( !identical )
+    {
+    // Identity of the first file. Declared outside the loop so that the
+    // values recorded at i == 0 are still there when the later files are
+    // compared against them.
+    ino_t st_ino0 = 0;
+    dev_t st_dev0 = 0;
+    for( unsigned int i = 0; i < filenames.size(); ++i )
+      {
+      struct stat in_stats;
+      infd_vector[i] = open_instream( filenames[i], &in_stats, true, true );
+      if( infd_vector[i] < 0 ) return 1;
+      if( i == 0 ) { st_ino0 = in_stats.st_ino; st_dev0 = in_stats.st_dev; }
+      else if( st_ino0 == in_stats.st_ino && st_dev0 == in_stats.st_dev )
+        { identical = true; break; }
+      }
+    }
+  if( identical ) { show_error( "Two input files are the same." ); return 1; }
+
+  // All the files must be seekable and of the same size.
+  isize = 0;
+  for( unsigned int i = 0; i < filenames.size(); ++i )
+    {
+    const long long tmp = lseek( infd_vector[i], 0, SEEK_END );
+    if( tmp < 0 )
+      {
+      if( verbosity >= 0 )
+        std::fprintf( stderr, "File `%s' is not seekable.\n", filenames[i].c_str() );
+      return 1;
+      }
+    if( i == 0 )
+      {
+      isize = tmp;
+      if( isize < 36 ) { show_error( "Input file is too short." ); return 2; }
+      }
+    else if( isize != tmp )
+      { show_error( "Sizes of input files are different." ); return 1; }
+    }
+
+  for( unsigned int i = 0; i < filenames.size(); ++i )
+    if( !verify_single_member( infd_vector[i], isize ) )
+      return 2;
+
+  // If any copy is already intact there is nothing to merge. Each file is
+  // rewound both before and after the trial decompression.
+  for( unsigned int i = 0; i < filenames.size(); ++i )
+    {
+    if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 )
+      { show_error( "Seek error in input file", errno ); return 1; }
+    if( try_decompress( infd_vector[i], isize ) )
+      {
+      if( verbosity >= 1 )
+        std::printf( "File `%s' has no errors. Recovery is not needed.\n",
+                     filenames[i].c_str() );
+      return 0;
+      }
+    if( lseek( infd_vector[i], 0, SEEK_SET ) < 0 )
+      { show_error( "Seek error in input file", errno ); return 1; }
+    }
+  return -1;
+  }
+
+} // end namespace
+
+
+// Abandons the output file and terminates the program with status
+// 'retval'. Closes 'outfd' when it is a valid descriptor (>= 0), removes
+// 'output_filename', and warns if the removal fails for any reason other
+// than the file not existing. Does not return.
+void cleanup_and_fail( const std::string & output_filename,
+                       const int outfd, const int retval ) throw()
+  {
+  if( outfd >= 0 )
+    close( outfd );
+  const bool removed = ( std::remove( output_filename.c_str() ) == 0 );
+  if( !removed && errno != ENOENT )
+    show_error( "WARNING: deletion of output file (apparently) failed." );
+  std::exit( retval );
+  }
+
+
+// Copies up to 'size' bytes from 'infd' to 'outfd', starting at the
+// current position of each descriptor. Copying stops early, without
+// error, if end of file is reached on 'infd' first. A 'size' <= 0 copies
+// nothing.
+//
+// Returns false (after reporting through show_error) on a read or write
+// error, true otherwise. The errno test after the read assumes readblock
+// leaves errno at 0 on a clean short read -- TODO confirm against its
+// definition.
+bool copy_file( const int infd, const int outfd, const long long size )
+  {
+  const int buffer_size = 65536;
+  // owned by the vector, so it is released on every return path
+  std::vector< uint8_t > buffer( buffer_size );
+  long long rest = size;			// bytes still to be copied
+
+  while( rest > 0 )
+    {
+    const int block_size = (int)std::min( (long long)buffer_size, rest );
+    const int rd = readblock( infd, &buffer[0], block_size );
+    if( rd != block_size && errno )
+      { show_error( "Error reading input file", errno ); return false; }
+    if( rd > 0 )
+      {
+      const int wr = writeblock( outfd, &buffer[0], rd );
+      if( wr != rd )
+        { show_error( "Error writing output file", errno ); return false; }
+      rest -= rd;
+      }
+    if( rd < block_size ) break;			// EOF
+    }
+  return true;
+  }
+
+
+// Trial-decompresses the member read from 'fd', discarding the output
+// (the decoder is given -1 as output descriptor).
+//
+// Returns true only if the header is acceptable (magic, version 1,
+// dictionary size within limits), decode_member reports success and the
+// decoder consumed exactly 'file_size' bytes.
+//
+// If 'failure_posp' is not null and the header was accepted but the
+// trial failed, the member position reached by the decoder is stored in
+// '*failure_posp'. It is left untouched when the header is rejected or
+// when an Error exception ends the trial.
+//
+// Running out of memory is fatal: a message is printed and the program
+// exits with status 1.
+bool try_decompress( const int fd, const long long file_size,
+                     long long * failure_posp )
+  {
+  try {
+    Range_decoder rdec( fd );
+    File_header header;
+    rdec.reset_member_position();
+    for( int i = 0; i < File_header::size; ++i )
+      header.data[i] = rdec.get_byte();
+    if( !rdec.finished() &&			// End Of File
+        header.verify_magic() &&
+        header.version() == 1 &&
+        header.dictionary_size() >= min_dictionary_size &&
+        header.dictionary_size() <= max_dictionary_size )
+      {
+      LZ_decoder decoder( header, rdec, -1 );
+      std::vector< std::string > dummy_filenames;
+      Pretty_print dummy( dummy_filenames, -1 );
+
+      if( decoder.decode_member( dummy ) == 0 &&
+          rdec.member_position() == file_size ) return true;
+      if( failure_posp ) *failure_posp = rdec.member_position();
+      }
+    }
+  catch( const std::bad_alloc & )
+    {
+    show_error( "Not enough memory. Find a machine with more memory." );
+    std::exit( 1 );
+    }
+  catch( const Error & ) {}	// any decoder error just means "not valid"
+  return false;
+  }
+
+
+// Checks that 'header' describes a member this tool can work on: correct
+// magic bytes and format version 1. Prints a diagnostic and returns false
+// otherwise. Version 0 gets its own message; for any other version the
+// message is printed only when verbosity >= 0.
+bool verify_header( const File_header & header )
+  {
+  if( !header.verify_magic() )
+    {
+    show_error( "Bad magic number (file not in lzip format)." );
+    return false;
+    }
+  switch( header.version() )
+    {
+    case 1:
+      return true;
+    case 0:
+      show_error( "Version 0 member format can't be recovered." );
+      return false;
+    default:
+      if( verbosity >= 0 )
+        std::fprintf( stderr, "Version %d member format not supported.\n",
+                      header.version() );
+      return false;
+    }
+  }
+
+
+// Checks that the file open on 'fd' consists of exactly one member of
+// 'file_size' bytes: the header at offset 0 must pass verify_header and
+// the member size stored in the trailer must equal 'file_size'.
+//
+// When the sizes differ, a second header is looked for 'member_size'
+// bytes before the end of the file to tell a multi-member file from a
+// corrupt trailer. Prints a diagnostic and returns false on any failure.
+// The file position of 'fd' is changed.
+bool verify_single_member( const int fd, const long long file_size )
+  {
+  File_header header;
+  const bool header_read =
+    lseek( fd, 0, SEEK_SET ) >= 0 &&
+    readblock( fd, header.data, File_header::size ) == File_header::size;
+  if( !header_read )
+    { show_error( "Error reading member header", errno ); return false; }
+  if( !verify_header( header ) ) return false;
+
+  File_trailer trailer;
+  const bool trailer_read =
+    lseek( fd, -File_trailer::size(), SEEK_END ) >= 0 &&
+    readblock( fd, trailer.data, File_trailer::size() ) == File_trailer::size();
+  if( !trailer_read )
+    { show_error( "Error reading member trailer", errno ); return false; }
+
+  const long long member_size = trailer.member_size();
+  if( member_size == file_size ) return true;
+
+  // A valid header found where the last member would begin means that
+  // the file holds more than one member.
+  const bool last_member_found =
+    member_size < file_size &&
+    lseek( fd, -member_size, SEEK_END ) > 0 &&
+    readblock( fd, header.data, File_header::size ) == File_header::size &&
+    verify_header( header );
+  if( last_member_found )
+    show_error( "Input file has more than 1 member. Split it first." );
+  else
+    show_error( "Member size in input file trailer is corrupt." );
+  return false;
+  }
+
+
+// Tries to produce a correct file in 'output_filename' by combining
+// several damaged copies ('filenames') of the same single-member file.
+//
+// The first copy is written to the output while the areas where the
+// copies differ are collected as blocks. Then, for each variation, every
+// block of the output is overwritten with the bytes of one of the copies
+// and the result is trial-decompressed, until a combination decompresses
+// correctly or the variations are exhausted.
+//
+// When there is just one differing area it is split in two blocks, and
+// the split point is moved one byte forward every 'base_variations'
+// variations, so that two errors close to each other can be told apart.
+//
+// If 'force' is true... it is simply passed to open_outstream_rw
+// (presumably it allows overwriting an existing output file -- confirm
+// against that function).
+//
+// Returns 0 on success (or if one of the copies is already intact), or
+// the non-zero status of open_input_files / 1 if the output can't be
+// created. Any later failure removes the output file and exits the
+// program through cleanup_and_fail with status 1 (I/O error) or 2 (files
+// can't be merged).
+int merge_files( const std::vector< std::string > & filenames,
+                 const std::string & output_filename, const bool force )
+  {
+  std::vector< int > infd_vector( filenames.size() );
+  long long isize = 0;
+  const int retval = open_input_files( filenames, infd_vector, isize );
+  if( retval >= 0 ) return retval;
+
+  const int outfd = open_outstream_rw( output_filename, force );
+  if( outfd < 0 ) return 1;
+
+  // vector of data blocks differing among the copies of the input file.
+  std::vector< Block > block_vector;
+  if( !copy_and_diff_file( infd_vector, outfd, block_vector ) )
+    cleanup_and_fail( output_filename, outfd, 1 );
+
+  if( !block_vector.size() )
+    { show_error( "Input files are identical. Recovery is not possible." );
+      cleanup_and_fail( output_filename, outfd, 2 ); }
+
+  const bool single_block = ( block_vector.size() == 1 );
+  if( single_block && block_vector[0].size() < 2 )
+    { show_error( "Input files have the same byte damaged."
+                  " Try repairing one of them." );
+      cleanup_and_fail( output_filename, outfd, 2 ); }
+
+  // Refuse to start if the number of variations does not fit in an int
+  // (ipow saturates at INT_MAX).
+  if( ipow( filenames.size(), block_vector.size() ) >= INT_MAX ||
+      ( single_block &&
+        ipow( filenames.size(), 2 ) >= INT_MAX / block_vector[0].size() ) )
+    { show_error( "Input files are too damaged. Recovery is not possible." );
+      cleanup_and_fail( output_filename, outfd, 2 ); }
+
+  // Number of positions of the split point inside a single block.
+  const int shifts = ( single_block ? block_vector[0].size() - 1 : 1 );
+  if( single_block )
+    {
+    // split the block in a first block of 1 byte and a second with the rest
+    Block b( block_vector[0].pos() + 1, block_vector[0].size() - 1 );
+    block_vector[0].size( 1 );
+    block_vector.push_back( b );
+    }
+
+  const int base_variations = ipow( filenames.size(), block_vector.size() );
+  const int variations = ( base_variations * shifts ) - 2;
+  bool done = false;
+  for( int var = 1; var <= variations; ++var )
+    {
+    if( verbosity >= 1 )
+      {
+      std::printf( "Trying variation %d of %d \r", var, variations );
+      std::fflush( stdout );
+      }
+    // 'var' written in base filenames.size() has one digit per block; each
+    // digit selects the input file providing the data for that block.
+    int tmp = var;
+    for( unsigned int i = 0; i < block_vector.size(); ++i )
+      {
+      const int infd = infd_vector[tmp % filenames.size()];
+      tmp /= filenames.size();
+      if( lseek( infd, block_vector[i].pos(), SEEK_SET ) < 0 )
+        { show_error( "Seek error in input file", errno );
+          cleanup_and_fail( output_filename, outfd, 1 ); }
+      if( lseek( outfd, block_vector[i].pos(), SEEK_SET ) < 0 )
+        { show_error( "Seek error in output file", errno );
+          cleanup_and_fail( output_filename, outfd, 1 ); }
+      // copy_file reports its own read/write errors
+      if( !copy_file( infd, outfd, block_vector[i].size() ) )
+        cleanup_and_fail( output_filename, outfd, 1 );
+      }
+    if( lseek( outfd, 0, SEEK_SET ) < 0 )
+      { show_error( "Seek error in output file", errno );
+        cleanup_and_fail( output_filename, outfd, 1 ); }
+    if( try_decompress( outfd, isize ) )
+      { done = true; break; }
+    // Only reached with var == k * base_variations when shifts > 1, i.e.
+    // in the single block case, where block_vector has exactly 2 blocks.
+    if( var % base_variations == 0 ) block_vector[0].shift( block_vector[1] );
+    }
+  if( verbosity >= 1 ) std::printf( "\n" );
+
+  if( close( outfd ) != 0 )
+    {
+    show_error( "Error closing output file", errno );
+    cleanup_and_fail( output_filename, -1, 1 );
+    }
+  if( !done )
+    {
+    show_error( "Some error areas overlap. Can't recover input file." );
+    cleanup_and_fail( output_filename, -1, 2 );
+    }
+  // All the input descriptors are valid here (open_input_files returned
+  // -1). They were only read from, so the result of close is not checked.
+  for( unsigned int i = 0; i < infd_vector.size(); ++i )
+    close( infd_vector[i] );
+  if( verbosity >= 1 )
+    std::printf( "Input files merged successfully.\n" );
+  return 0;
+  }