diff options
-rw-r--r-- | ChangeLog | 153 | ||||
-rw-r--r-- | INSTALL | 28 | ||||
-rw-r--r-- | Makefile.in | 18 | ||||
-rw-r--r-- | NEWS | 95 | ||||
-rw-r--r-- | README | 96 | ||||
-rw-r--r-- | alone_to_lz.cc | 144 | ||||
-rw-r--r-- | arg_parser.cc | 30 | ||||
-rw-r--r-- | arg_parser.h | 69 | ||||
-rwxr-xr-x | configure | 25 | ||||
-rw-r--r-- | decoder.cc | 44 | ||||
-rw-r--r-- | decoder.h | 90 | ||||
-rw-r--r-- | doc/lziprecover.1 | 28 | ||||
-rw-r--r-- | doc/lziprecover.info | 1484 | ||||
-rw-r--r-- | doc/lziprecover.texi | 1141 | ||||
-rw-r--r-- | dump_remove.cc | 62 | ||||
-rw-r--r-- | list.cc | 61 | ||||
-rw-r--r-- | lunzcrash.cc | 250 | ||||
-rw-r--r-- | lzip.h | 94 | ||||
-rw-r--r-- | lzip_index.cc | 180 | ||||
-rw-r--r-- | lzip_index.h | 42 | ||||
-rw-r--r-- | main.cc | 485 | ||||
-rw-r--r-- | main_common.cc | 27 | ||||
-rw-r--r-- | md5.cc | 206 | ||||
-rw-r--r-- | md5.h | 49 | ||||
-rw-r--r-- | merge.cc | 64 | ||||
-rw-r--r-- | mtester.cc | 105 | ||||
-rw-r--r-- | mtester.h | 121 | ||||
-rw-r--r-- | nrep_stats.cc | 117 | ||||
-rw-r--r-- | range_dec.cc | 111 | ||||
-rw-r--r-- | repair.cc | 144 | ||||
-rw-r--r-- | reproduce.cc | 785 | ||||
-rw-r--r-- | split.cc | 36 | ||||
-rwxr-xr-x | testsuite/check.sh | 242 | ||||
-rw-r--r-- | testsuite/fox_bcrc.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_crc0.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_das46.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_de20.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_mes81.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_s11.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_v2.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/test_bad1.lz | bin | 7376 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad6.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad6.txt | 26 | ||||
-rw-r--r-- | testsuite/test_bad7.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad7.txt | 215 | ||||
-rw-r--r-- | testsuite/test_bad8.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad8.txt | 3 | ||||
-rw-r--r-- | testsuite/test_bad9.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad9.txt | 5 | ||||
-rw-r--r-- | testsuite/test_em.txt.lz | bin | 0 -> 14024 bytes | |||
-rw-r--r-- | unzcrash.cc | 106 |
51 files changed, 4981 insertions, 2000 deletions
@@ -1,9 +1,36 @@ +2021-01-02 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.22 released. + * New options '-e, --reproduce', '--lzip-level', '--lzip-name', + '--reference-file', and '-E, --debug-reproduce'. + * Remove '--dump-tdata', '--remove-tdata', and '--strip-tdata'. + * main.cc (main): Report an error if a file name is empty. + Make '-o' behave like '-c', but writing to file. + Make '-c' and '-o' check whether the output is a terminal only once. + Do not open output if input is a terminal. + * main.cc (decompress): With '-i', ignore data errors, keep files. + * range_dec.cc: '-i -D' now decompresses a truncated last member. + * '-i -D' now returns 0 if only ignored errors are found. + * '-i' now considers any block > 36 with header a member, not a gap. + * Replace 'decompressed', 'compressed' with 'out', 'in' in output. + * Fix several compiler warnings. (Reported by Nissanka Gooneratne). + * lzip_index.cc: Improve messages for corruption in last header. + * New debug options '-M, --md5sum' and '-U, --unzcrash'. + * main.cc: Set a valid invocation_name even if argc == 0. + * Document extraction from tar.lz in manual, '--help', and man page. + * New files lunzcrash.cc, md5.h, md5.cc, nrep_stats.cc, reproduce.cc. + * lziprecover.texi: New chapter 'Reproducing one sector'. + New sections 'Merging with a backup' and 'Reproducing a mailbox'. + Document the debug options for experts. + * check.sh: Lzip 1.16 or newer is required to run the tests. + * testsuite: Add 9 new test files. + 2019-01-04 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.21 released. - * File_* renamed to Lzip_*. - * Added new options '--dump', '--remove' and '--strip'. They - replace '--dump-tdata', '--remove-tdata' and '--strip-tdata', + * Rename File_* to Lzip_*. + * New options '--dump', '--remove', and '--strip'. They + replace '--dump-tdata', '--remove-tdata', and '--strip-tdata', which are now aliases and will be removed in version 1.22. 
* lzip.h (Lzip_trailer): New function 'verify_consistency'. * lzip_index.cc: Lzip_index now detects gaps between members, @@ -17,22 +44,20 @@ * Improve and add new diagnostic messages. * Print '\n' instead of '\r' if !isatty( 1 ) in merge, repair. * main.cc: Compile on DOS with DJGPP. - * lziprecover.texi: Added chapter 'Tarlz'. + * lziprecover.texi: New chapter 'Tarlz'. * configure: Accept appending to CXXFLAGS, 'CXXFLAGS+=OPTIONS'. * INSTALL: Document use of CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO'. - * Added new test files fox.lz fox6_sc[1-6].lz. + * New test files fox.lz, fox6_sc[1-6].lz. 2018-02-12 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.20 released. - * split.cc: Fixed splitting of files > 64 KiB broken since 1.16. - * Added new option '--dump-tdata'. - * Added new option '--remove-tdata'. - * Added new option '--strip-tdata'. - * Added new option '--loose-trailing'. - * Improved corrupt header detection to HD=3. + * split.cc: Fix splitting of files > 64 KiB broken since 1.16. + * New options '--dump-tdata', '--remove-tdata', '--strip-tdata', and + '--loose-trailing'. + * Improve corrupt header detection to HD=3. * main.cc: Show corrupt or truncated header in multimember file. - * Replaced 'bits/byte' with inverse compression ratio in output. + * Replace 'bits/byte' with inverse compression ratio in output. * Show progress of decompression at verbosity level 2 (-vv). * Show progress of decompression only if stderr is a terminal. * main.cc: Show final diagnostic when testing multiple files. @@ -49,14 +74,14 @@ * main.cc: Show trailing data in both hexadecimal and ASCII. * lzip_index.cc: Improve detection of bad dict and trailing data. * lzip_index.cc: Skip trailing data more efficiently. - * lzip.h: Unified messages for bad magic, trailing data, etc. + * lzip.h: Unify messages for bad magic, trailing data, etc. * New struct Bad_byte allows delta and flip modes for bad_value. - * unzcrash.cc: Added new option '-e, --set-byte'. 
+ * unzcrash.cc: New option '-e, --set-byte'. 2016-05-12 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.18 released. - * Added new option '-a, --trailing-error'. + * New option '-a, --trailing-error'. * merge.cc (open_input_files): Use CRC to test identical files. * repair.cc (repair_file): Detect gross damage before repairing. * repair.cc: Repair a damaged dictionary size in the header. @@ -64,25 +89,22 @@ * Decompression time has been reduced by 2%. * main.cc (decompress): Print up to 6 bytes of trailing data when '-tvvvv' is specified. - * decoder.cc (verify_trailer): Removed test of final code. + * decoder.cc (verify_trailer): Remove test of final code. * main.cc (main): Delete '--output' file if infd is a terminal. * main.cc (main): Don't use stdin more than once. * Use 'close_and_set_permissions' and 'set_signals' in all modes. * range_dec.cc (list_file): Show dictionary size and size of trailing data (if any) with '-lv'. - * Added new option '-A, --alone-to-lz'. - * Added new option '-W, --debug-decompress'. - * Added new option '-X, --show-packets'. - * Changed short name of option '--debug-delay' to '-Y'. - * Changed short name of option '--debug-repair' to '-Z'. - * unzcrash.cc: Added new option '-B, --block'. - * unzcrash.cc: Added new option '-d, --delta'. - * unzcrash.cc: Added new option '-t, --truncate'. - * unzcrash.cc: Added new option '-z, --zcmp'. + * New options '-A, --alone-to-lz', '-W, --debug-decompress', and + '-X, --show-packets'. + * Change short name of option '--debug-delay' to '-Y'. + * Change short name of option '--debug-repair' to '-Z'. + * unzcrash.cc: New options '-B, --block', '-d, --delta', + '-t, --truncate', and '-z, --zcmp'. * unzcrash.cc: Read files as large as RAM allows. * unzcrash.cc: Compare output using zcmp if decompressor returns 0. * unzcrash.cc: Accept negative position and size. - * lziprecover.texi: Added chapter 'Trailing data'. + * lziprecover.texi: New chapter 'Trailing data'. 
* configure: Avoid warning on some shells when testing for g++. * Makefile.in: Detect the existence of install-info. * check.sh: Don't check error messages. @@ -93,19 +115,18 @@ * Version 1.17 released. * New block selection algorithm makes merge up to 100 times faster. * repair.cc: Repair time has been reduced by 15%. - * Added new option '-y, --debug-delay'. - * Added new option '-z, --debug-repair'. - * Makefile.in: Added new targets 'install*-compress'. - * testsuite/unzcrash.cc: Moved to top directory. - * lziprecover.texi: Added chapter 'File names'. + * New options '-y, --debug-delay' and '-z, --debug-repair'. + * Makefile.in: New targets 'install*-compress'. + * testsuite/unzcrash.cc: Move to top directory. + * lziprecover.texi: New chapter 'File names'. 2014-08-29 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.16 released. * New class LZ_mtester makes repair up to 10 times faster. * main.cc (close_and_set_permissions): Behave like 'cp -p'. - * lziprecover.texinfo: Renamed to lziprecover.texi. - * License changed to GPL version 2 or later. + * lziprecover.texinfo: Rename to lziprecover.texi. + * Change license to GPL version 2 or later. 2013-09-14 Antonio Diaz Diaz <antonio@gnu.org> @@ -114,20 +135,20 @@ per member. * merge.cc: Merge multimember files. * main.cc (show_header): Don't show header version. - * lziprecover.texinfo: Added chapters 'Repairing files', - 'Merging files' and 'Unzcrash'. + * lziprecover.texinfo: New chapters 'Repairing files', + 'Merging files', and 'Unzcrash'. 2013-05-31 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.14 released. - * Added new option '-i, --ignore-errors'. + * New option '-i, --ignore-errors'. * Option '-l, --list' now accepts more than one file. * Decompression time has been reduced by 12%. * split.cc: Use as few digits as possible in file names. * split.cc: In verbose mode show names of files being created. * main.cc (show_header): Show header version if verbosity >= 4. 
* configure: Options now accept a separate argument. - * Makefile.in: Added new targets 'install-as-lzip', 'install-bin'. + * Makefile.in: New targets 'install-as-lzip' and 'install-bin'. * main.cc: Use 'setmode' instead of '_setmode' on Windows and OS/2. 2012-02-24 Antonio Diaz Diaz <ant_diaz@teleline.es> @@ -136,64 +157,60 @@ * Lziprecover is now distributed in its own package. Until version 1.12 it was included in the lzip package. Previous entries in this file are taken from there. - * lziprecover.cc: Renamed to main.cc. + * lziprecover.cc: Rename to main.cc. * New files merge.cc, repair.cc, split.cc, and range_dec.cc. - * main.cc: Added decompressor options (-c, -d, -k, -t) so that - a external decompressor is not needed for recovery nor for + * main.cc: Add decompressor options (-c, -d, -k, -t) so that + an external decompressor is not needed for recovery nor for "make check". - * Added new option '-D, --range-decompress' which extracts a - range of bytes decompressing only the members containing the - desired data. - * Added new option '-l, --list' which prints correct total file - sizes even for multimember files. - * merge.cc repair.cc: Remove output file if recovery fails. - * Changed quote characters in messages as advised by GNU Standards. + * New option '-D, --range-decompress', which extracts a range of + bytes decompressing only the members containing the desired data. + * New option '-l, --list', which prints correct total file sizes + even for multimember files. + * merge.cc, repair.cc: Remove output file if recovery fails. + * Change quote characters in messages as advised by GNU Standards. * split.cc: Use Boyer-Moore algorithm to search for headers. - * configure: 'datadir' renamed to 'datarootdir'. + * configure: Rename 'datadir' to 'datarootdir'. 2011-04-30 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.12 released. * lziprecover.cc: If '-v' is not specified show errors only. * unzcrash.cc: Use Arg_parser. 
- * unzcrash.cc: Added new option '-b, --bits'. - * unzcrash.cc: Added new option '-p, --position'. - * unzcrash.cc: Added new option '-s, --size'. + * unzcrash.cc: New options '-b, --bits', '-p, --position', and + '-s, --size'. 2010-09-16 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.11 released. - * lziprecover.cc: Added new option '-m, --merge' which tries to - produce a correct file merging the good parts of two or more - damaged copies. - * lziprecover.cc: Added new option '-R, --repair' for repairing - a 1-byte error in single-member files. - * decoder.cc (decode_member): Detect file errors earlier to - improve efficiency of lziprecover's new repair capability. + * lziprecover.cc: New option '-m, --merge', which tries to produce a + correct file by merging the good parts of two or more damaged copies. + * lziprecover.cc: New option '-R, --repair' for repairing a + 1-byte error in single-member files. + * decoder.cc (decode_member): Detect file errors earlier to improve + efficiency of lziprecover's new repair capability. This change also prevents (harmless) access to uninitialized memory when decompressing a corrupt file. - * lziprecover.cc: Added new option '-f, --force'. - * lziprecover.cc: Added new option '-o, --output'. - * lziprecover.cc: Added new option '-s, --split' to select the - until now only operation of splitting multimember files. - * lziprecover.cc: If no operation is specified, warn the user - and do nothing. + * lziprecover.cc: New options '-f, --force' and '-o, --output'. + * lziprecover.cc: New option '-s, --split' to select the until + now only operation of splitting multimember files. + * lziprecover.cc: If no operation is specified, warn the user and do + nothing. 2009-06-22 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.6 released. - * Added man page for lziprecover. + * lziprecover.1: New man page. * check.sh: Test lziprecover. 2009-01-24 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.4 released. 
- * Added 'lziprecover', a member recoverer program. + * Add 'lziprecover', a member recoverer program. * unzcrash.cc: Test all 1-byte errors. -Copyright (C) 2009-2019 Antonio Diaz Diaz. +Copyright (C) 2009-2021 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, -but just in case, you have unlimited permission to copy, distribute and +but just in case, you have unlimited permission to copy, distribute, and modify it. @@ -1,12 +1,14 @@ Requirements ------------ -You will need a C++ compiler. -I use gcc 5.3.0 and 4.1.2, but the code should compile with any standards +You will need a C++11 compiler. (gcc 3.3.6 or newer is recommended). +I use gcc 6.1.0 and 4.1.2, but the code should compile with any standards compliant compiler. Gcc is available at http://gcc.gnu.org. -Unzcrash needs a zcmp program able to understand the format being tested. -For example the zcmp program provided by zutils. +Lzip 1.16 (or clzip 1.6) or newer is required to run the tests. + +Unzcrash needs a 'zcmp' program able to understand the format being tested. +For example the zcmp provided by zutils. Zutils is available at http://www.nongnu.org/zutils/zutils.html The operating system must allow signal handlers read access to objects with @@ -45,11 +47,11 @@ the main archive. documentation. Or type 'make install-compress', which additionally compresses the - info manual and the man page after installation. (Installing - compressed docs may become the default in the future). + info manual and the man page after installation. + (Installing compressed docs may become the default in the future). - You can install only the program, the info manual or the man page by - typing 'make install-bin', 'make install-info' or 'make install-man' + You can install only the program, the info manual, or the man page by + typing 'make install-bin', 'make install-info', or 'make install-man' respectively. 
Instead of 'make install', you can type 'make install-as-lzip' to @@ -60,10 +62,10 @@ the main archive. Another way ----------- You can also compile lziprecover into a separate directory. -To do this, you must use a version of 'make' that supports the 'VPATH' -variable, such as GNU 'make'. 'cd' to the directory where you want the +To do this, you must use a version of 'make' that supports the variable +'VPATH', such as GNU 'make'. 'cd' to the directory where you want the object files and executables to go and run the 'configure' script. -'configure' automatically checks for the source code in '.', in '..' and +'configure' automatically checks for the source code in '.', in '..', and in the directory that 'configure' is in. 'configure' recognizes the option '--srcdir=DIR' to control where to @@ -74,7 +76,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2009-2019 Antonio Diaz Diaz. +Copyright (C) 2009-2021 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, -distribute and modify it. +distribute, and modify it. 
diff --git a/Makefile.in b/Makefile.in index 48c8448..ee413f0 100644 --- a/Makefile.in +++ b/Makefile.in @@ -8,7 +8,8 @@ SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 objs = arg_parser.o alone_to_lz.o lzip_index.o list.o dump_remove.o \ - merge.o mtester.o range_dec.o repair.o split.o decoder.o main.o + lunzcrash.o md5.o merge.o mtester.o nrep_stats.o range_dec.o \ + repair.o reproduce.o split.o decoder.o main.o unzobjs = arg_parser.o unzcrash.o @@ -42,12 +43,16 @@ arg_parser.o : arg_parser.h decoder.o : lzip.h decoder.h dump_remove.o : lzip.h lzip_index.h list.o : lzip.h lzip_index.h +lunzcrash.o : lzip.h md5.h mtester.h lzip_index.h lzip_index.o : lzip.h lzip_index.h main.o : arg_parser.h lzip.h decoder.h main_common.cc +md5.o : md5.h merge.o : lzip.h decoder.h lzip_index.h -mtester.o : lzip.h mtester.h +mtester.o : lzip.h md5.h mtester.h +nrep_stats.o : lzip.h lzip_index.h range_dec.o : lzip.h decoder.h lzip_index.h repair.o : lzip.h mtester.h lzip_index.h +reproduce.o : lzip.h md5.h mtester.h lzip_index.h split.o : lzip.h lzip_index.h unzcrash.o : Makefile arg_parser.h main_common.cc @@ -87,7 +92,7 @@ install-info : -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* $(INSTALL_DATA) $(VPATH)/doc/$(pkgname).info "$(DESTDIR)$(infodir)/$(pkgname).info" -if $(CAN_RUN_INSTALLINFO) ; then \ - install-info --info-dir="$(DESTDIR)$(infodir)" "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ + install-info --info-dir="$(DESTDIR)$(infodir)" "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ fi install-info-compress : install-info @@ -112,7 +117,7 @@ uninstall-bin : uninstall-info : -if $(CAN_RUN_INSTALLINFO) ; then \ - install-info --info-dir="$(DESTDIR)$(infodir)" --remove "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ + install-info --info-dir="$(DESTDIR)$(infodir)" --remove "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ fi -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* @@ -139,7 +144,9 @@ dist : doc $(DISTNAME)/testsuite/fox6_bad1.txt \ 
$(DISTNAME)/testsuite/test.txt \ $(DISTNAME)/testsuite/test21723.txt \ + $(DISTNAME)/testsuite/test_bad[6-9].txt \ $(DISTNAME)/testsuite/fox.lz \ + $(DISTNAME)/testsuite/fox_*.lz \ $(DISTNAME)/testsuite/fox6.lz \ $(DISTNAME)/testsuite/fox6_sc[1-6].lz \ $(DISTNAME)/testsuite/fox6_bad[1-6].lz \ @@ -147,7 +154,8 @@ dist : doc $(DISTNAME)/testsuite/numbersbt.lz \ $(DISTNAME)/testsuite/test.txt.lz \ $(DISTNAME)/testsuite/test.txt.lzma \ - $(DISTNAME)/testsuite/test_bad[1-5].lz + $(DISTNAME)/testsuite/test_bad[1-9].lz \ + $(DISTNAME)/testsuite/test_em.txt.lz rm -f $(DISTNAME) lzip -v -9 $(DISTNAME).tar @@ -1,52 +1,73 @@ -Changes in version 1.21: +Changes in version 1.22: -The options '--dump', '--remove' and '--strip' have been added, mainly as -support for the tarlz archive format: http://www.nongnu.org/lzip/tarlz.html -These options replace '--dump-tdata', '--remove-tdata' and '--strip-tdata', -which are now aliases and will be removed in version 1.22. +The option '-e, --reproduce', which can recover a missing (zeroed) sector in +a lzip file, has been added. For it to work, two things are required: + - The same version of the lzip tool that created the file. + - A reference file containing the uncompressed data corresponding to the + missing compressed data of the zeroed sector, plus some context data + before and after them. +Thanks to Nissanka Gooneratne for his help in testing the reproduce mode. - '--dump=[<member_list>][:damaged][:tdata]' dumps the members listed, the - damaged members (if any), or the trailing data (if any) of one or more - regular multimember files to standard output. +The options '--lzip-level', '--lzip-name', and '--reference-file', auxiliary +to '-e, --reproduce', have been added. - '--remove=[<member_list>][:damaged][:tdata]' removes the members listed, - the damaged members (if any), or the trailing data (if any) from regular - multimember files in place. 
+Option aliases '--dump-tdata', '--remove-tdata', and '--strip-tdata' have +been removed. - '--strip=[<member_list>][:damaged][:tdata]' copies one or more regular - multimember files to standard output, stripping the members listed, the - damaged members (if any), or the trailing data (if any) from each file. +When decompressing or testing, lziprecover now reports an error if a file +name is empty (lziprecover -t ""). -Detection of forbidden combinations of characters in trailing data has been -improved. +Option '-o, --output' now behaves like '-c, --stdout', but sending the +output unconditionally to a file instead of to standard output. See the new +description of '-o' in the manual. This change is backwards compatible only +when decompressing from standard input alone. Therefore commands like: + lziprecover -d -o foo - bar.lz < foo.lz +must now be split into: + lziprecover -d -o foo - < foo.lz + lziprecover -d bar.lz +or rewritten as: + lziprecover -d - bar.lz < foo.lz > foo -'--split' can now detect trailing data and gaps between members, and save -each gap in its own file. Trailing data (if any) are saved alone in the last -file. (Gaps may contain garbage or may be members with corrupt headers or -trailers). +When using '-c' or '-o', lziprecover now checks whether the output is a +terminal only once. -'--ignore-errors' now makes '--list' show gaps between members, ignoring -format errors. +Lziprecover now does not even open the output file if the input file is a +terminal. -'--ignore-errors' now makes '--range-decompress' ignore a truncated last -member. +'--ignore-errors' now makes '--decompress' and '--test' ignore data errors +and continue decompressing the remaining members in the file, keeping input +files unchanged. -Errors are now also checked when closing the input file in decompression -mode. +'--ignore-errors --range-decompress' now decompresses a truncated last +member. 
It also returns 0 if only ignored errors (format errors or data +errors) are found. -Some diagnostic messages have been improved. +'--ignore-errors' now considers that any fragment of file starting with a +valid header and large enough to be a member is a (corrupt) member, not a +gap, even if it lacks a valid trailer. -'\n' is now printed instead of '\r' when showing progress of merge or repair -if stdout is not a terminal. +The words 'decompressed' and 'compressed' have been replaced with the +shorter 'out' and 'in' in the verbose output when decompressing or testing. -Lziprecover now compiles on DOS with DJGPP. (Patch from Robert Riebisch). +Several compiler warnings have been fixed. (Reported by Nissanka Gooneratne). -The new chapter 'Tarlz', explaining the ways in which lziprecover can -recover and process multimember tar.lz archives, has been added to the -manual. +Option '--list' now reports corruption or truncation of the last header in a +multimember file specifically instead of showing the generic message "Last +member in input file is truncated or corrupt." -The configure script now accepts appending options to CXXFLAGS using the -syntax 'CXXFLAGS+=OPTIONS'. +The debug options '-E, --debug-reproduce', '-M, --md5sum', and +'-U, --unzcrash' have been added. -It has been documented in INSTALL the use of -CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' when compiling on MinGW. +The commands needed to extract files from a tar.lz archive have been +documented in the manual, in the output of '--help', and in the man page. + +The new chapter 'Reproducing one sector' has been added to the manual. + +The new sections 'Merging with a backup' and 'Reproducing a mailbox' have +been added to the manual. + +The debug options for experts have been documented in the manual. + +Lzip 1.16 (or clzip 1.6) or newer is required to run the tests. + +9 new test files have been added to the testsuite. 
@@ -3,8 +3,9 @@ Description Lziprecover is a data recovery tool and decompressor for files in the lzip compressed data format (.lz). Lziprecover is able to repair slightly damaged files, produce a correct file by merging the good parts of two or more -damaged copies, extract data from damaged files, decompress files and test -integrity of files. +damaged copies, reproduce a missing (zeroed) sector using a reference file, +extract data from damaged files, decompress files, and test integrity of +files. Lziprecover can remove the damaged members from multimember files, for example multimember tar.lz archives. @@ -22,68 +23,71 @@ The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder availability: * The lzip format provides very safe integrity checking and some data - recovery means. The lziprecover program can repair bit flip errors - (one of the most common forms of data corruption) in lzip files, - and provides data recovery capabilities, including error-checked - merging of damaged copies of a file. - - * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the source code of a simple decompressor - along with a detailed explanation of how it works, so that with - the only help of the lzip manual it would be possible for a - digital archaeologist to extract the data from a lzip file long - after quantum computers eventually render LZMA obsolete. + recovery means. The program lziprecover can repair bit flip errors + (one of the most common forms of data corruption) in lzip files, and + provides data recovery capabilities, including error-checked merging + of damaged copies of a file. + + * The lzip format is as simple as possible (but not simpler). 
The lzip + manual provides the source code of a simple decompressor along with a + detailed explanation of how it works, so that with the only help of the + lzip manual it would be possible for a digital archaeologist to extract + the data from a lzip file long after quantum computers eventually + render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. -A nice feature of the lzip format is that a corrupt byte is easier to -repair the nearer it is from the beginning of the file. Therefore, with -the help of lziprecover, losing an entire archive just because of a -corrupt byte near the beginning is a thing of the past. +A nice feature of the lzip format is that a corrupt byte is easier to repair +the nearer it is from the beginning of the file. Therefore, with the help of +lziprecover, losing an entire archive just because of a corrupt byte near +the beginning is a thing of the past. -For compressible data, multiple lzip-compressed copies have a better -chance of surviving intact than one uncompressed copy using the same -amount of storage space. +Compression may be good for long-term archiving. For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. This is especially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. -Lziprecover is able to recover or decompress files produced by any of -the compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip -and pdlzip. +Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. 
-If the cause of file corruption is damaged media, the combination -GNU ddrescue + lziprecover is the best option for recovering data from -multiple damaged copies. +If the cause of file corruption is a damaged medium, the combination +GNU ddrescue + lziprecover is the recommended option for recovering data +from damaged lzip files. -If a file is too damaged for lziprecover to repair it, all the -recoverable data in all members of the file can be extracted in one step -with the '-D' option. +If a file is too damaged for lziprecover to repair it, all the recoverable +data in all members of the file can be extracted in one step with the +command 'lziprecover -cd -i file.lz > file'. When recovering data, lziprecover takes as arguments the names of the damaged files and writes zero or more recovered files depending on the -operation selected and whether the recovery succeeded or not. The -damaged files themselves are kept unchanged. +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. -When decompressing or testing file integrity, lziprecover behaves like -lzip or lunzip. +When decompressing or testing file integrity, lziprecover behaves like lzip +or lunzip. -To give you an idea of its possibilities, when merging two copies, each -of them with one damaged area affecting 1 percent of the copy, the -probability of obtaining a correct file is about 98 percent. With three -such copies the probability rises to 99.97 percent. For large files (a -few MB) with small errors (one sector damaged per copy), the probability -approaches 100 percent even with only two copies. (Supposing that the -errors are randomly located inside each copy). +To give you an idea of its possibilities, when merging two copies, each of +them with one damaged area affecting 1 percent of the copy, the probability +of obtaining a correct file is about 98 percent. With three such copies the +probability rises to 99.97 percent. 
For large files (a few MB) with small +errors (one sector damaged per copy), the probability approaches 100 percent +even with only two copies. (Supposing that the errors are randomly located +inside each copy). -The lziprecover package also includes unzcrash, a program written to -test robustness to decompression of corrupted data, inspired by -unzcrash.c from Julian Seward's bzip2. Type 'make unzcrash' in the -lziprecover source directory to build it. Then try 'unzcrash --help'. +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type 'make unzcrash' in the lziprecover source +directory to build it. Then try 'unzcrash --help'. -Copyright (C) 2009-2019 Antonio Diaz Diaz. +Copyright (C) 2009-2021 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, -distribute and modify it. +distribute, and modify it. The file Makefile.in is a data file used by configure to produce the Makefile. It has the same copyright owner and permissions that configure diff --git a/alone_to_lz.cc b/alone_to_lz.cc index dd39e34..1f65dfe 100644 --- a/alone_to_lz.cc +++ b/alone_to_lz.cc @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
- This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 @@ -23,6 +23,7 @@ #include <cstdio> #include <cstdlib> #include <cstring> +#include <new> #include <string> #include <vector> #include <stdint.h> @@ -83,71 +84,66 @@ bool validate_ds( unsigned * const dictionary_size ) int alone_to_lz( const int infd, const Pretty_print & pp ) { enum { lzma_header_size = 13, offset = lzma_header_size - Lzip_header::size }; - - try { - long file_size = 0; - uint8_t * const buffer = read_file( infd, &file_size, pp.name() ); - if( !buffer ) return 1; - if( file_size < lzma_header_size ) - { show_file_error( pp.name(), "file is too short" ); - std::free( buffer ); return 2; } - - if( buffer[0] != 93 ) // (45 * 2) + (9 * 0) + 3 - { - const Lzip_header & header = *(const Lzip_header *)buffer; - if( header.verify_magic() && header.verify_version() && - isvalid_ds( header.dictionary_size() ) ) - show_file_error( pp.name(), "file is already in lzip format" ); - else - show_file_error( pp.name(), "file has non-default LZMA properties" ); - std::free( buffer ); return 2; - } - for( int i = 5; i < 13; ++i ) if( buffer[i] != 0xFF ) - { show_file_error( pp.name(), "file is non-streamed" ); - std::free( buffer ); return 2; } - - if( verbosity >= 1 ) pp(); - unsigned 
dictionary_size = 0; - for( int i = 4; i > 0; --i ) - { dictionary_size <<= 8; dictionary_size += buffer[i]; } - const unsigned orig_dictionary_size = dictionary_size; - validate_ds( &dictionary_size ); - Lzip_header & header = *(Lzip_header *)( buffer + offset ); - header.set_magic(); - header.dictionary_size( dictionary_size ); - for( int i = 0; i < Lzip_trailer::size; ++i ) buffer[file_size++] = 0; + long file_size = 0; + uint8_t * const buffer = read_file( infd, &file_size, pp.name() ); + if( !buffer ) return 1; + if( file_size < lzma_header_size ) + { show_file_error( pp.name(), "file is too short" ); + std::free( buffer ); return 2; } + + if( buffer[0] != 93 ) // (45 * 2) + (9 * 0) + 3 + { + const Lzip_header & header = *(const Lzip_header *)buffer; + if( header.verify_magic() && header.verify_version() && + isvalid_ds( header.dictionary_size() ) ) + show_file_error( pp.name(), "file is already in lzip format" ); + else + show_file_error( pp.name(), "file has non-default LZMA properties" ); + std::free( buffer ); return 2; + } + for( int i = 5; i < 13; ++i ) if( buffer[i] != 0xFF ) + { show_file_error( pp.name(), "file is non-streamed" ); + std::free( buffer ); return 2; } + + if( verbosity >= 1 ) pp(); + unsigned dictionary_size = 0; + for( int i = 4; i > 0; --i ) + { dictionary_size <<= 8; dictionary_size += buffer[i]; } + const unsigned orig_dictionary_size = dictionary_size; + validate_ds( &dictionary_size ); + Lzip_header & header = *(Lzip_header *)( buffer + offset ); + header.set_magic(); + header.dictionary_size( dictionary_size ); + for( int i = 0; i < Lzip_trailer::size; ++i ) buffer[file_size++] = 0; + { + LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); + const int result = mtester.test_member(); + if( result == 1 && orig_dictionary_size > max_dictionary_size ) + { pp( "dictionary size is too large" ); std::free( buffer ); return 2; } + if( result != 3 || !mtester.finished() ) + { pp( "file is corrupt" ); std::free( 
buffer ); return 2; } + if( mtester.max_distance() < dictionary_size && + dictionary_size > min_dictionary_size ) { - LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); - const int result = mtester.test_member(); - if( result == 1 && orig_dictionary_size > max_dictionary_size ) - { pp( "dictionary size is too large" ); std::free( buffer ); return 2; } - if( result != 3 || !mtester.finished() ) - { pp( "file is corrupt" ); std::free( buffer ); return 2; } - if( mtester.max_distance() < dictionary_size && - dictionary_size > min_dictionary_size ) - { - dictionary_size = - std::max( mtester.max_distance(), (unsigned)min_dictionary_size ); - header.dictionary_size( dictionary_size ); - } - Lzip_trailer & trailer = - *(Lzip_trailer *)( buffer + file_size - Lzip_trailer::size ); - trailer.data_crc( mtester.crc() ); - trailer.data_size( mtester.data_position() ); - trailer.member_size( mtester.member_position() ); + dictionary_size = + std::max( mtester.max_distance(), (unsigned)min_dictionary_size ); + header.dictionary_size( dictionary_size ); } - LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); - if( mtester.test_member() != 0 || !mtester.finished() ) - { pp( "conversion failed" ); std::free( buffer ); return 2; } - if( writeblock( outfd, buffer + offset, file_size - offset ) != file_size - offset ) - { - show_error( "Error writing output file", errno ); - std::free( buffer ); return 1; - } - std::free( buffer ); + Lzip_trailer & trailer = + *(Lzip_trailer *)( buffer + file_size - Lzip_trailer::size ); + trailer.data_crc( mtester.crc() ); + trailer.data_size( mtester.data_position() ); + trailer.member_size( mtester.member_position() ); + } + LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { pp( "conversion failed" ); std::free( buffer ); return 2; } + if( writeblock( outfd, buffer + offset, file_size - offset ) != file_size - 
offset ) + { + show_error( "Error writing output file", errno ); + std::free( buffer ); return 1; } - catch( std::bad_alloc & ) { pp( "Not enough memory." ); return 1; } - catch( Error & e ) { pp(); show_error( e.msg, errno ); return 1; } + std::free( buffer ); if( verbosity >= 1 ) std::fputs( "done\n", stderr ); return 0; } diff --git a/arg_parser.cc b/arg_parser.cc index ea32fde..2e40a13 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,20 +1,20 @@ -/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2019 Antonio Diaz Diaz. +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006-2021 Antonio Diaz Diaz. - This library is free software. Redistribution and use in source and - binary forms, with or without modification, are permitted provided - that the following conditions are met: + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include <cstring> @@ -167,7 +167,7 @@ Arg_parser::Arg_parser( const int argc, const char * const argv[], else non_options.push_back( argv[argind++] ); } } - if( error_.size() ) data.clear(); + if( !error_.empty() ) data.clear(); else { for( unsigned i = 0; i < non_options.size(); ++i ) @@ -190,7 +190,7 @@ Arg_parser::Arg_parser( const char * const opt, const char * const arg, { if( opt[2] ) parse_long_option( opt, arg, options, argind ); } else parse_short_option( opt, arg, options, argind ); - if( error_.size() ) data.clear(); + if( !error_.empty() ) data.clear(); } else data.push_back( Record( opt ) ); } diff --git a/arg_parser.h b/arg_parser.h index ceb9933..5629b90 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,43 +1,43 @@ -/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2019 Antonio Diaz Diaz. +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006-2021 Antonio Diaz Diaz. - This library is free software. Redistribution and use in source and - binary forms, with or without modification, are permitted provided - that the following conditions are met: + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
+ 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ -/* Arg_parser reads the arguments in 'argv' and creates a number of - option codes, option arguments and non-option arguments. +/* Arg_parser reads the arguments in 'argv' and creates a number of + option codes, option arguments, and non-option arguments. - In case of error, 'error' returns a non-empty error message. + In case of error, 'error' returns a non-empty error message. - 'options' is an array of 'struct Option' terminated by an element - containing a code which is zero. A null name means a short-only - option. A code value outside the unsigned char range means a - long-only option. + 'options' is an array of 'struct Option' terminated by an element + containing a code which is zero. A null name means a short-only + option. A code value outside the unsigned char range means a + long-only option. - Arg_parser normally makes it appear as if all the option arguments - were specified before all the non-option arguments for the purposes - of parsing, even if the user of your program intermixed option and - non-option arguments. If you want the arguments in the exact order - the user typed them, call 'Arg_parser' with 'in_order' = true. + Arg_parser normally makes it appear as if all the option arguments + were specified before all the non-option arguments for the purposes + of parsing, even if the user of your program intermixed option and + non-option arguments. 
If you want the arguments in the exact order + the user typed them, call 'Arg_parser' with 'in_order' = true. - The argument '--' terminates all options; any following arguments are - treated as non-option arguments, even if they begin with a hyphen. + The argument '--' terminates all options; any following arguments are + treated as non-option arguments, even if they begin with a hyphen. - The syntax for optional option arguments is '-<short_option><argument>' - (without whitespace), or '--<long_option>=<argument>'. + The syntax for optional option arguments is '-<short_option><argument>' + (without whitespace), or '--<long_option>=<argument>'. */ class Arg_parser @@ -61,6 +61,7 @@ private: explicit Record( const char * const arg ) : code( 0 ), argument( arg ) {} }; + const std::string empty_arg; std::string error_; std::vector< Record > data; @@ -73,17 +74,17 @@ public: Arg_parser( const int argc, const char * const argv[], const Option options[], const bool in_order = false ); - // Restricted constructor. Parses a single token and argument (if any) + // Restricted constructor. Parses a single token and argument (if any). Arg_parser( const char * const opt, const char * const arg, const Option options[] ); const std::string & error() const { return error_; } - // The number of arguments parsed (may be different from argc) + // The number of arguments parsed. May be different from argc. int arguments() const { return data.size(); } - // If code( i ) is 0, argument( i ) is a non-option. - // Else argument( i ) is the option's argument (or empty). + /* If code( i ) is 0, argument( i ) is a non-option. + Else argument( i ) is the option's argument (or empty). */ int code( const int i ) const { if( i >= 0 && i < arguments() ) return data[i].code; @@ -93,6 +94,6 @@ public: const std::string & argument( const int i ) const { if( i >= 0 && i < arguments() ) return data[i].argument; - else return error_; + else return empty_arg; } }; @@ -1,12 +1,12 @@ #! 
/bin/sh # configure script for Lziprecover - Data recovery tool for the lzip format -# Copyright (C) 2009-2019 Antonio Diaz Diaz. +# Copyright (C) 2009-2021 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission -# to copy, distribute and modify it. +# to copy, distribute, and modify it. pkgname=lziprecover -pkgversion=1.21 +pkgversion=1.22 progname=lziprecover srctrigger=doc/${pkgname}.texi @@ -26,11 +26,7 @@ CXXFLAGS='-Wall -W -O2' LDFLAGS= # checking whether we are using GNU C++. -/bin/sh -c "${CXX} --version" > /dev/null 2>&1 || - { - CXX=c++ - CXXFLAGS=-O2 - } +/bin/sh -c "${CXX} --version" > /dev/null 2>&1 || { CXX=c++ ; CXXFLAGS=-O2 ; } # Loop over all args args= @@ -42,11 +38,12 @@ while [ $# != 0 ] ; do shift # Add the argument quoted to args - args="${args} \"${option}\"" + if [ -z "${args}" ] ; then args="\"${option}\"" + else args="${args} \"${option}\"" ; fi # Split out the argument for options that take them case ${option} in - *=*) optarg=`echo ${option} | sed -e 's,^[^=]*=,,;s,/$,,'` ;; + *=*) optarg=`echo "${option}" | sed -e 's,^[^=]*=,,;s,/$,,'` ;; esac # Process the options @@ -125,7 +122,7 @@ if [ -z "${srcdir}" ] ; then if [ ! -r "${srcdir}/${srctrigger}" ] ; then srcdir=.. ; fi if [ ! -r "${srcdir}/${srctrigger}" ] ; then ## the sed command below emulates the dirname command - srcdir=`echo $0 | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + srcdir=`echo "$0" | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` fi fi @@ -148,7 +145,7 @@ if [ -z "${no_create}" ] ; then # Run this file to recreate the current configuration. # # This script is free software: you have unlimited permission -# to copy, distribute and modify it. +# to copy, distribute, and modify it. exec /bin/sh $0 ${args} --no-create EOF @@ -170,11 +167,11 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Lziprecover - Data recovery tool for the lzip format -# Copyright (C) 2009-2019 Antonio Diaz Diaz. 
+# Copyright (C) 2009-2021 Antonio Diaz Diaz. # This file was generated automatically by configure. Don't edit. # # This Makefile is free software: you have unlimited permission -# to copy, distribute and modify it. +# to copy, distribute, and modify it. pkgname = ${pkgname} pkgversion = ${pkgversion} @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 @@ -37,13 +37,13 @@ const CRC32 crc32; /* Returns the number of bytes really read. If (returned value < size) and (errno == 0), means EOF was reached. 
*/ -long readblock( const int fd, uint8_t * const buf, const long size ) +long long readblock( const int fd, uint8_t * const buf, const long long size ) { - long sz = 0; + long long sz = 0; errno = 0; while( sz < size ) { - const int n = read( fd, buf + sz, std::min( 1L << 20, size - sz ) ); + const int n = read( fd, buf + sz, std::min( 1LL << 20, size - sz ) ); if( n > 0 ) sz += n; else if( n == 0 ) break; // EOF else if( errno != EINTR ) break; @@ -56,13 +56,14 @@ long readblock( const int fd, uint8_t * const buf, const long size ) /* Returns the number of bytes really written. If (returned value < size), it is always an error. */ -long writeblock( const int fd, const uint8_t * const buf, const long size ) +long long writeblock( const int fd, const uint8_t * const buf, + const long long size ) { - long sz = 0; + long long sz = 0; errno = 0; while( sz < size ) { - const int n = write( fd, buf + sz, std::min( 1L << 20, size - sz ) ); + const int n = write( fd, buf + sz, std::min( 1LL << 20, size - sz ) ); if( n > 0 ) sz += n; else if( n < 0 && errno != EINTR ) break; errno = 0; @@ -166,16 +167,15 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const { if( verbosity >= 4 ) show_header( dictionary_size ); if( data_size == 0 || member_size == 0 ) - std::fputs( "no data compressed. ", stderr ); + std::fputs( "no data compressed. ", stderr ); else - std::fprintf( stderr, "%6.3f:1, %5.2f%% ratio, %5.2f%% saved. ", + std::fprintf( stderr, "%6.3f:1, %5.2f%% ratio, %5.2f%% saved. ", (double)data_size / member_size, ( 100.0 * member_size ) / data_size, 100.0 - ( ( 100.0 * member_size ) / data_size ) ); if( verbosity >= 4 ) std::fprintf( stderr, "CRC %08X, ", td_crc ); if( verbosity >= 3 ) - std::fprintf( stderr, "decompressed %9llu, compressed %8llu. ", - data_size, member_size ); + std::fprintf( stderr, "%9llu out, %8llu in. 
", data_size, member_size ); } if( rdec.get_code() != 0 && verbosity >= 1 ) { // corruption in the last 4 bytes of the EOS marker @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ class Range_decoder @@ -49,7 +49,9 @@ public: unsigned get_code() const { return code; } bool finished() { return pos >= stream_pos && !read_block(); } - unsigned long long member_position() const { return partial_member_pos + pos; } + + unsigned long long member_position() const + { return partial_member_pos + pos; } void reset_member_position() { partial_member_pos = 0; partial_member_pos -= pos; } @@ -74,10 +76,40 @@ public: return sz; } + /* if ignore_errors, stop reading before the first wrong byte, so that + unreading the header is not required to sync to next member */ + int read_header_carefully( Lzip_header & header, const bool ignore_errors ) + { + int sz = 0; + while( sz < Lzip_header::size && !finished() ) + { + header.data[sz] = buffer[pos]; + if( ignore_errors && + ( ( sz < 4 && header.data[sz] != lzip_magic[sz] ) || + ( sz == 4 && !header.verify_version() ) || + ( sz == 5 && !isvalid_ds( header.dictionary_size() ) ) ) ) break; + ++pos; ++sz; + } + return sz; + } + + bool find_header( Lzip_header & header ) + { + while( !finished() ) + { + if( buffer[pos] != lzip_magic[0] ) { ++pos; continue; } + reset_member_position(); + Lzip_header h; + if( read_header_carefully( h, true ) == Lzip_header::size ) + { header = h; return true; } + } + return false; + } + void load() { code = 0; - for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); + for( int i = 0; i < 5; ++i ) code = ( code << 8 ) | get_byte(); range = 0xFFFFFFFFU; code &= range; // make sure that first byte is discarded } @@ -85,7 +117,7 @@ public: void normalize() { if( range <= 0x00FFFFFFU ) - { range <<= 8; code = (code << 8) | get_byte(); } + { range <<= 8; code = ( code << 8 ) | get_byte(); } } unsigned decode( const int num_bits ) @@ -98,7 +130,7 @@ public: // symbol <<= 1; // if( code >= range ) { code -= range; symbol |= 1; } const bool bit = ( code >= range ); - symbol = ( symbol << 1 ) + bit; + symbol <<= 1; symbol += bit; code -= range & ( 0U - bit ); } return symbol; @@ 
-111,7 +143,8 @@ public: if( code < bound ) { range = bound; - bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; return 0; } else @@ -125,8 +158,7 @@ public: unsigned decode_tree3( Bit_model bm[] ) { - unsigned symbol = 1; - symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + unsigned symbol = 2 | decode_bit( bm[1] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol & 7; @@ -134,8 +166,7 @@ public: unsigned decode_tree6( Bit_model bm[] ) { - unsigned symbol = 1; - symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + unsigned symbol = 2 | decode_bit( bm[1] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); @@ -159,7 +190,7 @@ public: for( int i = 0; i < num_bits; ++i ) { const unsigned bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; + model <<= 1; model += bit; symbol |= ( bit << i ); } return symbol; @@ -168,12 +199,9 @@ public: unsigned decode_tree_reversed4( Bit_model bm[] ) { unsigned symbol = decode_bit( bm[1] ); - unsigned model = 2 + symbol; - unsigned bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; symbol |= ( bit << 1 ); - bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; symbol |= ( bit << 2 ); - symbol |= ( decode_bit( bm[model] ) << 3 ); + symbol += decode_bit( bm[2+symbol] ) << 1; + symbol += decode_bit( bm[4+symbol] ) << 2; + symbol += decode_bit( bm[8+symbol] ) << 3; return symbol; } @@ -184,9 +212,9 @@ public: while( symbol < 0x100 ) { const unsigned match_bit = ( match_byte <<= 1 ) & 0x100; - const unsigned bit = decode_bit( bm1[match_bit+symbol] ); - symbol = ( symbol << 1 ) | bit; - if( match_bit != bit << 8 ) + const bool bit = decode_bit( bm1[symbol+match_bit] ); + symbol <<= 1; symbol |= bit; + if( match_bit >> 
8 != bit ) { while( symbol < 0x100 ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); diff --git a/doc/lziprecover.1 b/doc/lziprecover.1 index 29df1e0..eefa0b9 100644 --- a/doc/lziprecover.1 +++ b/doc/lziprecover.1 @@ -1,5 +1,5 @@ -.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH LZIPRECOVER "1" "January 2019" "lziprecover 1.21" "User Commands" +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.16. +.TH LZIPRECOVER "1" "January 2021" "lziprecover 1.22" "User Commands" .SH NAME lziprecover \- recovers data from damaged lzip files .SH SYNOPSIS @@ -9,8 +9,9 @@ lziprecover \- recovers data from damaged lzip files Lziprecover is a data recovery tool and decompressor for files in the lzip compressed data format (.lz). Lziprecover is able to repair slightly damaged files, produce a correct file by merging the good parts of two or more -damaged copies, extract data from damaged files, decompress files and test -integrity of files. +damaged copies, reproduce a missing (zeroed) sector using a reference file, +extract data from damaged files, decompress files, and test integrity of +files. 
.PP Lziprecover can repair perfectly most files with small errors (up to one single\-byte error per member), without the need of any extra redundance @@ -51,11 +52,23 @@ decompress \fB\-D\fR, \fB\-\-range\-decompress=\fR<n\-m> decompress a range of bytes to stdout .TP +\fB\-e\fR, \fB\-\-reproduce\fR +try to reproduce a zeroed sector in file +.TP +\fB\-\-lzip\-level\fR=\fI\,N\/\fR|a|m[N] +reproduce one level, all, or match length +.TP +\fB\-\-lzip\-name=\fR<name> +name of lzip executable for \fB\-\-reproduce\fR +.TP +\fB\-\-reference\-file=\fR<file> +reference file for \fB\-\-reproduce\fR +.TP \fB\-f\fR, \fB\-\-force\fR overwrite existing output files .TP \fB\-i\fR, \fB\-\-ignore\-errors\fR -all errors in \fB\-D\fR, format errors in \fB\-l\fR, \fB\-\-dump\fR +ignore some errors in \fB\-d\fR, \fB\-D\fR, \fB\-l\fR, \fB\-t\fR, \fB\-\-dump\fR .TP \fB\-k\fR, \fB\-\-keep\fR keep (don't delete) input files @@ -101,6 +114,9 @@ from standard input to standard output. Numbers may be followed by a multiplier: k = kB = 10^3 = 1000, Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc... .PP +To extract all the files from archive 'foo.tar.lz', use the commands +\&'tar \fB\-xf\fR foo.tar.lz' or 'lziprecover \fB\-cd\fR foo.tar.lz | tar \fB\-xf\fR \-'. +.PP Exit status: 0 for a normal exit, 1 for environmental problems (file not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid input file, 3 for an internal consistency error (eg, bug) which @@ -110,7 +126,7 @@ Report bugs to lzip\-bug@nongnu.org .br Lziprecover home page: http://www.nongnu.org/lzip/lziprecover.html .SH COPYRIGHT -Copyright \(co 2019 Antonio Diaz Diaz. +Copyright \(co 2021 Antonio Diaz Diaz. License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html> .br This is free software: you are free to change and redistribute it. 
diff --git a/doc/lziprecover.info b/doc/lziprecover.info index 0339c15..49170df 100644 --- a/doc/lziprecover.info +++ b/doc/lziprecover.info @@ -12,29 +12,30 @@ File: lziprecover.info, Node: Top, Next: Introduction, Up: (dir) Lziprecover Manual ****************** -This manual is for Lziprecover (version 1.21, 4 January 2019). +This manual is for Lziprecover (version 1.22, 2 January 2021). * Menu: -* Introduction:: Purpose and features of lziprecover -* Invoking lziprecover:: Command line interface -* Data safety:: Protecting data from accidental loss -* Repairing files:: Fixing bit flips and similar errors -* Merging files:: Fixing several damaged copies -* Tarlz:: Options supporting the tar.lz format -* File names:: Names of the files produced by lziprecover -* File format:: Detailed format of the compressed file -* Trailing data:: Extra data appended to the file -* Examples:: A small tutorial with examples -* Unzcrash:: Testing the robustness of decompressors -* Problems:: Reporting bugs -* Concept index:: Index of concepts +* Introduction:: Purpose and features of lziprecover +* Invoking lziprecover:: Command line interface +* Data safety:: Protecting data from accidental loss +* Repairing one byte:: Fixing bit flips and similar errors +* Merging files:: Fixing several damaged copies +* Reproducing one sector:: Fixing a missing (zeroed) sector +* Tarlz:: Options supporting the tar.lz format +* File names:: Names of the files produced by lziprecover +* File format:: Detailed format of the compressed file +* Trailing data:: Extra data appended to the file +* Examples:: A small tutorial with examples +* Unzcrash:: Testing the robustness of decompressors +* Problems:: Reporting bugs +* Concept index:: Index of concepts - Copyright (C) 2009-2019 Antonio Diaz Diaz. + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This manual is free documentation: you have unlimited permission to -copy, distribute and modify it. 
+ This manual is free documentation: you have unlimited permission to copy, +distribute, and modify it. File: lziprecover.info, Node: Introduction, Next: Invoking lziprecover, Prev: Top, Up: Top @@ -42,79 +43,83 @@ File: lziprecover.info, Node: Introduction, Next: Invoking lziprecover, Prev: 1 Introduction ************** -Lziprecover is a data recovery tool and decompressor for files in the -lzip compressed data format (.lz). Lziprecover is able to repair -slightly damaged files, produce a correct file by merging the good -parts of two or more damaged copies, extract data from damaged files, -decompress files and test integrity of files. +Lziprecover is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files, produce a correct file by merging the good parts of two or more +damaged copies, reproduce a missing (zeroed) sector using a reference file, +extract data from damaged files, decompress files, and test integrity of +files. - Lziprecover can remove the damaged members from multimember files, -for example multimember tar.lz archives. + Lziprecover can remove the damaged members from multimember files, for +example multimember tar.lz archives. - Lziprecover provides random access to the data in multimember files; -it only decompresses the members containing the desired data. + Lziprecover provides random access to the data in multimember files; it +only decompresses the members containing the desired data. Lziprecover facilitates the management of metadata stored as trailing data in lzip files. - Lziprecover is not a replacement for regular backups, but a last -line of defense for the case where the backups are also damaged. + Lziprecover is not a replacement for regular backups, but a last line of +defense for the case where the backups are also damaged. 
The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +archiving, taking into account both data integrity and decoder availability: * The lzip format provides very safe integrity checking and some data - recovery means. The lziprecover program can repair bit flip errors - (one of the most common forms of data corruption) in lzip files, - and provides data recovery capabilities, including error-checked - merging of damaged copies of a file. *Note Data safety::. - - * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the source code of a simple decompressor - along with a detailed explanation of how it works, so that with - the only help of the lzip manual it would be possible for a - digital archaeologist to extract the data from a lzip file long - after quantum computers eventually render LZMA obsolete. + recovery means. The program lziprecover can repair bit flip errors + (one of the most common forms of data corruption) in lzip files, and + provides data recovery capabilities, including error-checked merging + of damaged copies of a file. *Note Data safety::. + + * The lzip format is as simple as possible (but not simpler). The lzip + manual provides the source code of a simple decompressor along with a + detailed explanation of how it works, so that with the only help of the + lzip manual it would be possible for a digital archaeologist to extract + the data from a lzip file long after quantum computers eventually + render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. A nice feature of the lzip format is that a corrupt byte is easier to -repair the nearer it is from the beginning of the file. Therefore, with -the help of lziprecover, losing an entire archive just because of a -corrupt byte near the beginning is a thing of the past. 
- - For compressible data, multiple lzip-compressed copies have a better -chance of surviving intact than one uncompressed copy using the same -amount of storage space. - - Lziprecover is able to recover or decompress files produced by any of -the compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip -and pdlzip. - - If the cause of file corruption is damaged media, the combination -GNU ddrescue + lziprecover is the best option for recovering data from -multiple damaged copies. *Note ddrescue-example::, for an example. +repair the nearer it is from the beginning of the file. Therefore, with the +help of lziprecover, losing an entire archive just because of a corrupt +byte near the beginning is a thing of the past. + + Compression may be good for long-term archiving. For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. This is especially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. + + Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. + + If the cause of file corruption is a damaged medium, the combination +GNU ddrescue + lziprecover is the recommended option for recovering data +from damaged lzip files. *Note ddrescue-example::, and *note +ddrescue-example2::, for examples. 
If a file is too damaged for lziprecover to repair it, all the recoverable data in all members of the file can be extracted with the -following command (the resulting file may contain errors and some -garbage data may be produced at the end of each member): +following command (the resulting file may contain errors and some garbage +data may be produced at the end of each member): - lziprecover -D0 -i -o file -q file.lz + lziprecover -cd -i file.lz > file When recovering data, lziprecover takes as arguments the names of the damaged files and writes zero or more recovered files depending on the -operation selected and whether the recovery succeeded or not. The -damaged files themselves are kept unchanged. +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. - When decompressing or testing file integrity, lziprecover behaves -like lzip or lunzip. + When decompressing or testing file integrity, lziprecover behaves like +lzip or lunzip. - LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may -never have been compressed. Decompressed is used to refer to data which -have undergone the process of decompression. + LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. File: lziprecover.info, Node: Invoking lziprecover, Next: Data safety, Prev: Introduction, Up: Top @@ -126,11 +131,13 @@ The format for running lziprecover is: lziprecover [OPTIONS] [FILES] -When decompressing or testing, '-' used as a FILE argument means -standard input. It can be mixed with other FILES and is read just once, -the first time it appears in the command line. +When decompressing or testing, a hyphen '-' used as a FILE argument means +standard input. It can be mixed with other FILES and is read just once, the +first time it appears in the command line. 
If no file names are specified, +lziprecover decompresses from standard input to standard output. - lziprecover supports the following options: + lziprecover supports the following options: *Note Argument syntax: +(arg_parser)Argument syntax. '-h' '--help' @@ -139,23 +146,22 @@ the first time it appears in the command line. '-V' '--version' Print the version number of lziprecover on the standard output and - exit. This version number should be included in all bug reports. + exit. This version number should be included in all bug reports. '-a' '--trailing-error' Exit with error status 2 if any remaining input is detected after - decompressing the last member. Such remaining input is usually - trailing garbage that can be safely ignored. *Note - concat-example::. + decompressing the last member. Such remaining input is usually trailing + garbage that can be safely ignored. *Note concat-example::. '-A' '--alone-to-lz' Convert lzma-alone files to lzip format without recompressing, just adding a lzip header and trailer. The conversion minimizes the dictionary size of the resulting file (and therefore the amount of - memory required to decompress it). Only streamed files with - default LZMA properties can be converted; non-streamed lzma-alone - files lack the end of stream marker required in lzip files. + memory required to decompress it). Only streamed files with default + LZMA properties can be converted; non-streamed lzma-alone files lack + the end of stream marker required in lzip files. The name of the converted lzip file is derived from that of the original lzma-alone file as follows: @@ -167,33 +173,60 @@ the first time it appears in the command line. '-c' '--stdout' Write decompressed data to standard output; keep input files - unchanged. This option is needed when reading from a named pipe - (fifo) or from a device. Use it also to recover as much of the - decompressed data as possible when decompressing a corrupt file. + unchanged. 
This option (or '-o') is needed when reading from a named + pipe (fifo) or from a device. Use it also to recover as much of the + decompressed data as possible when decompressing a corrupt file. '-c' + overrides '-o', but '-c' has no effect when merging, removing members, + repairing, reproducing, splitting, testing or listing. '-d' '--decompress' - Decompress the specified files. If a file does not exist or can't - be opened, lziprecover continues decompressing the rest of the - files. If a file fails to decompress, or is a terminal, - lziprecover exits immediately without decompressing the rest of - the files. + Decompress the files specified. If a file does not exist or can't be + opened, lziprecover continues decompressing the rest of the files. If + a file fails to decompress, or is a terminal, lziprecover exits + immediately without decompressing the rest of the files. '-D RANGE' '--range-decompress=RANGE' - Decompress only a range of bytes starting at decompressed byte - position 'BEGIN' and up to byte position 'END - 1'. Byte - positions start at 0. This option provides random access to the - data in multimember files; it only decompresses the members - containing the desired data. In order to guarantee the correctness - of the data produced, all members containing any part of the - desired data are decompressed and their integrity is verified. + Decompress only a range of bytes starting at decompressed byte position + BEGIN and up to byte position END - 1. Byte positions start at 0. This + option provides random access to the data in multimember files; it + only decompresses the members containing the desired data. In order to + guarantee the correctness of the data produced, all members containing + any part of the desired data are decompressed and their integrity is + verified. Four formats of RANGE are recognized, 'BEGIN', 'BEGIN-END', - 'BEGIN,SIZE', and ',SIZE'. If only BEGIN is specified, END is taken - as the end of the file. 
If only SIZE is specified, BEGIN is taken - as the beginning of the file. The produced bytes are sent to - standard output unless the '--output' option is used. + 'BEGIN,SIZE', and ',SIZE'. If only BEGIN is specified, END is taken as + the end of the file. If only SIZE is specified, BEGIN is taken as the + beginning of the file. The bytes produced are sent to standard output + unless the option '--output' is used. + +'-e' +'--reproduce' + Try to recover a missing (zeroed) sector in FILE using a reference + file and the same version of lzip that created FILE. If successful, a + repaired copy is written to the file 'FILE_fixed.lz'. FILE is not + modified at all. The exit status is 0 if the member containing the + zeroed sector could be repaired, 2 otherwise. Note that + 'FILE_fixed.lz' may still contain errors in the members following the + one repaired. *Note Reproducing one sector::, for a complete + description of the reproduce mode. + +'--lzip-level=DIGIT|a|m[LENGTH]' + Try only the given compression level or match length limit when + reproducing a zeroed sector. '--lzip-level=a' tries all the + compression levels (0 to 9), while '--lzip-level=m' tries all the + match length limits (5 to 273). + +'--lzip-name=NAME' + Set the name of the lzip executable used by '--reproduce'. If + '--lzip-name' is not specified, 'lzip' is used. + +'--reference-file=FILE' + Set the reference file used by '--reproduce'. It must contain the + uncompressed data corresponding to the missing compressed data of the + zeroed sector, plus some context data before and after them. '-f' '--force' @@ -201,56 +234,63 @@ the first time it appears in the command line. '-i' '--ignore-errors' - Make '--range-decompress' ignore errors and continue decompressing - the remaining members in the file. For example, - 'lziprecover -D0 -i file.lz > file' decompresses all the - recoverable data in all members of 'file.lz' without having to - split it first. 
- - Make '--list', '--dump', '--remove' and '--strip' ignore format + Make '--decompress', '--test', and '--range-decompress' ignore format + and data errors and continue decompressing the remaining members in + the file; keep input files unchanged. For example, the commands + 'lziprecover -cd -i file.lz > file' or + 'lziprecover -D0 -i file.lz > file' decompress all the recoverable + data in all members of 'file.lz' without having to split it first. The + '-cd -i' method resyncs to the next member header after each error, + and is immune to some format errors that make '-D0 -i' fail. The range + decompressed may be smaller than the range requested, because of the errors. + Make '--list', '--dump', '--remove', and '--strip' ignore format + errors. The sizes of the members with errors (especially the last) may + be wrong. The exit status is set to 0 unless other errors are found + (I/O errors, for example). + '-k' '--keep' Keep (don't delete) input files during decompression. '-l' '--list' - Print the uncompressed size, compressed size and percentage saved - of the specified files. Trailing data are ignored. The values - produced are correct even for multimember files. If more than one - file is given, a final line containing the cumulative sizes is - printed. With '-v', the dictionary size, the number of members in - the file, and the amount of trailing data (if any) are also - printed. With '-vv', the positions and sizes of each member in - multimember files are also printed. With '-i', format errors are - ignored, and with '-ivv', gaps between members are shown. The - member numbers shown coincide with the file numbers produced by - '--split'. + Print the uncompressed size, compressed size, and percentage saved of + the files specified. Trailing data are ignored. The values produced + are correct even for multimember files. If more than one file is + given, a final line containing the cumulative sizes is printed. 
With + '-v', the dictionary size, the number of members in the file, and the + amount of trailing data (if any) are also printed. With '-vv', the + positions and sizes of each member in multimember files are also + printed. With '-i', format errors are ignored, and with '-ivv', gaps + between members are shown. The member numbers shown coincide with the + file numbers produced by '--split'. '-lq' can be used to verify quickly (without decompressing) the - structural integrity of the specified files. (Use '--test' to - verify the data integrity). '-alq' additionally verifies that none - of the specified files contain trailing data. + structural integrity of the files specified. (Use '--test' to verify + the data integrity). '-alq' additionally verifies that none of the + files specified contain trailing data. '-m' '--merge' - Try to produce a correct file by merging the good parts of two or - more damaged copies. If successful, a repaired copy is written to - the file 'FILE_fixed.lz'. The exit status is 0 if a correct file - could be produced, 2 otherwise. *Note Merging files::, for a - complete description of the merge mode. + Try to produce a correct file by merging the good parts of two or more + damaged copies. If successful, a repaired copy is written to the file + 'FILE_fixed.lz'. The exit status is 0 if a correct file could be + produced, 2 otherwise. *Note Merging files::, for a complete + description of the merge mode. '-o FILE' '--output=FILE' - Place the output into 'FILE' instead of into 'FILE_fixed.lz'. If + Place the output into FILE instead of into 'FILE_fixed.lz'. If splitting, the names of the files produced are in the form - 'rec01FILE', 'rec02FILE', etc. If decompressing from standard - input and '--stdout' has not been specified, use 'FILE' as the - name of the decompressed file. If converting a lzma-alone file - from standard input and '--stdout' has not been specified, use - 'FILE.lz' as the name of the converted file. 
(Or plain 'FILE' if - it already ends in '.lz' or '.tlz'). + 'rec01FILE', 'rec02FILE', etc. + + If decompressing, or converting lzma-alone files, and '-c' has not been + also specified, write the decompressed or converted output to FILE; + keep input files unchanged. This option (or '-c') is needed when + reading from a named pipe (fifo) or from a device. '-o -' is + equivalent to '-c'. '-o' has no effect when testing or listing. '-q' '--quiet' @@ -258,89 +298,87 @@ the first time it appears in the command line. '-R' '--repair' - Try to repair a file with small errors (up to one single-byte - error per member). If successful, a repaired copy is written to - the file 'FILE_fixed.lz'. 'FILE' is not modified at all. The exit - status is 0 if the file could be repaired, 2 otherwise. *Note - Repairing files::, for a complete description of the repair mode. + Try to repair a FILE with small errors (up to one single-byte error + per member). If successful, a repaired copy is written to the file + 'FILE_fixed.lz'. FILE is not modified at all. The exit status is 0 if + the file could be repaired, 2 otherwise. *Note Repairing one byte::, + for a complete description of the repair mode. '-s' '--split' - Search for members in 'FILE' and write each member in its own - file. Gaps between members are detected and each gap is saved in - its own file. Trailing data (if any) are saved alone in the last - file. You can then use 'lziprecover -t' to test the integrity of - the resulting files, decompress those which are undamaged, and try - to repair or partially decompress those which are damaged. Gaps - may contain garbage or may be members with corrupt headers or - trailers. If other lziprecover functions fail to work on a - multimember FILE because of damage in headers or trailers, try to - split FILE and then work on each member individually. + Search for members in FILE and write each member in its own file. 
Gaps + between members are detected and each gap is saved in its own file. + Trailing data (if any) are saved alone in the last file. You can then + use 'lziprecover -t' to test the integrity of the resulting files, + decompress those which are undamaged, and try to repair or partially + decompress those which are damaged. Gaps may contain garbage or may be + members with corrupt headers or trailers. If other lziprecover + functions fail to work on a multimember FILE because of damage in + headers or trailers, try to split FILE and then work on each member + individually. The names of the files produced are in the form 'rec01FILE', 'rec02FILE', etc, and are designed so that the use of wildcards in subsequent processing, for example, - 'lziprecover -cd rec*FILE > recovered_data', processes the files - in the correct order. The number of digits used in the names - varies depending on the number of members in 'FILE'. + 'lziprecover -cd rec*FILE > recovered_data', processes the files in + the correct order. The number of digits used in the names varies + depending on the number of members in FILE. '-t' '--test' - Check integrity of the specified files, but don't decompress them. - This really performs a trial decompression and throws away the - result. Use it together with '-v' to see information about the - files. If a file fails the test, does not exist, can't be opened, - or is a terminal, lziprecover continues checking the rest of the - files. A final diagnostic is shown at verbosity level 1 or higher - if any file fails the test when testing multiple files. + Check integrity of the files specified, but don't decompress them. This + really performs a trial decompression and throws away the result. Use + it together with '-v' to see information about the files. If a file + fails the test, does not exist, can't be opened, or is a terminal, + lziprecover continues checking the rest of the files. 
A final + diagnostic is shown at verbosity level 1 or higher if any file fails + the test when testing multiple files. '-v' '--verbose' Verbose mode. When decompressing or testing, further -v's (up to 4) increase the - verbosity level, showing status, compression ratio, dictionary - size, trailer contents (CRC, data size, member size), and up to 6 - bytes of trailing data (if any) both in hexadecimal and as a - string of printable ASCII characters. + verbosity level, showing status, compression ratio, dictionary size, + trailer contents (CRC, data size, member size), and up to 6 bytes of + trailing data (if any) both in hexadecimal and as a string of printable + ASCII characters. Two or more '-v' options show the progress of decompression. - In other modes, increasing verbosity levels show final status, - progress of operations, and extra information (for example, the - failed areas). + In other modes, increasing verbosity levels show final status, progress + of operations, and extra information (for example, the failed areas). '--loose-trailing' - When decompressing, testing or listing, allow trailing data whose - first bytes are so similar to the magic bytes of a lzip header - that they can be confused with a corrupt header. Use this option - if a file triggers a "corrupt header" error and the cause is not - indeed a corrupt header. + When decompressing, testing, or listing, allow trailing data whose + first bytes are so similar to the magic bytes of a lzip header that + they can be confused with a corrupt header. Use this option if a file + triggers a "corrupt header" error and the cause is not indeed a + corrupt header. '--dump=[MEMBER_LIST][:damaged][:tdata]' - Dump the members listed, the damaged members (if any), or the - trailing data (if any) of one or more regular multimember files to - standard output, or to a file if the '--output' option is used. If - more than one file is given, the elements dumped from all files - are concatenated. 
If a file does not exist, can't be opened, or - is not regular, lziprecover continues processing the rest of the - files. If the dump fails in one file, lziprecover exits - immediately without processing the rest of the files. + Dump the members listed, the damaged members (if any), or the trailing + data (if any) of one or more regular multimember files to standard + output, or to a file if the option '--output' is used. If more than + one file is given, the elements dumped from all files are concatenated. + If a file does not exist, can't be opened, or is not regular, + lziprecover continues processing the rest of the files. If the dump + fails in one file, lziprecover exits immediately without processing the + rest of the files. The argument to '--dump' is a colon-separated list of the following element specifiers; a member list (1,3-6), a reverse member list - (r1,3-6), and the strings "damaged" and "tdata" (which may be - shortened to 'd' and 't' respectively). A member list selects the - members (or gaps) listed, whose numbers coincide with those shown - by '--list'. A reverse member list selects the members listed - counting from the last member in the file (r1). Negated versions - of both kinds of lists exist (^1,3-6:r^1,3-6) which selects all - the members except those in the list. The strings "damaged" and - "tdata" select the damaged members and the trailing data - respectively. If the same member is selected more than once, for - example by '1:r1' in a single-member file, it is dumped just once. - See the following examples: + (r1,3-6), and the strings "damaged" and "tdata" (which may be shortened + to 'd' and 't' respectively). A member list selects the members (or + gaps) listed, whose numbers coincide with those shown by '--list'. A + reverse member list selects the members listed counting from the last + member in the file (r1). Negated versions of both kinds of lists exist + (^1,3-6:r^1,3-6) which selects all the members except those in the + list. 
The strings "damaged" and "tdata" select the damaged members and + the trailing data respectively. If the same member is selected more + than once, for example by '1:r1' in a single-member file, it is dumped + just once. See the following examples: '--dump' argument Elements dumped --------------------------------------------------------------------- - '1,3-6' members 1, 3, 4, 5 and 6 + '1,3-6' members 1, 3, 4, 5, 6 'r1-3' last 3 members in file '^13,15' all but 13th and 15th members in file 'r^1' all but last member in file @@ -352,37 +390,113 @@ the first time it appears in the command line. '--remove=[MEMBER_LIST][:damaged][:tdata]' Remove the members listed, the damaged members (if any), or the - trailing data (if any) from regular multimember files in place. - The date of each file is preserved if possible. If all members in - a file are selected to be removed, the file is left unchanged and - the exit status is set to 2. If a file does not exist, can't be - opened, is not regular, or is left unchanged, lziprecover - continues processing the rest of the files. In case of I/O error, - lziprecover exits immediately without processing the rest of the - files. See '--dump' above for a description of the argument. - - This option may be dangerous even if only the trailing data is - being removed because the file may be corrupt or the trailing data - may contain a forbidden combination of characters. *Note Trailing - data::. It is advisable to make a backup before attempting the - removal. At least verify that 'lzip -cd file.lz | wc -c' and the - uncompressed size shown by 'lzip -l file.lz' match before - attempting the removal of trailing data. + trailing data (if any) from regular multimember files in place. The + date of each file is preserved if possible. If all members in a file + are selected to be removed, the file is left unchanged and the exit + status is set to 2. 
If a file does not exist, can't be opened, is not + regular, or is left unchanged, lziprecover continues processing the + rest of the files. In case of I/O error, lziprecover exits immediately + without processing the rest of the files. See '--dump' above for a + description of the argument. + + This option may be dangerous even if only the trailing data is being + removed because the file may be corrupt or the trailing data may + contain a forbidden combination of characters. *Note Trailing data::. + It is advisable to make a backup before attempting the removal. At + least verify that 'lzip -cd file.lz | wc -c' and the uncompressed size + shown by 'lzip -l file.lz' match before attempting the removal of + trailing data. '--strip=[MEMBER_LIST][:damaged][:tdata]' - Copy one or more regular multimember files to standard output (or - to a file if the '--output' option is used), stripping the members - listed, the damaged members (if any), or the trailing data (if - any) from each file. If all members in a file are selected to be - stripped, the trailing data (if any) are also stripped even if - 'tdata' is not specified. If more than one file is given, the - files are concatenated. In this case the trailing data are also - stripped from all but the last file even if 'tdata' is not - specified. If a file does not exist, can't be opened, or is not - regular, lziprecover continues processing the rest of the files. - If a file fails to copy, lziprecover exits immediately without - processing the rest of the files. See '--dump' above for a - description of the argument. + Copy one or more regular multimember files to standard output (or to a + file if the option '--output' is used), stripping the members listed, + the damaged members (if any), or the trailing data (if any) from each + file. If all members in a file are selected to be stripped, the + trailing data (if any) are also stripped even if 'tdata' is not + specified. 
If more than one file is given, the files are concatenated. + In this case the trailing data are also stripped from all but the last + file even if 'tdata' is not specified. If a file does not exist, can't + be opened, or is not regular, lziprecover continues processing the + rest of the files. If a file fails to copy, lziprecover exits + immediately without processing the rest of the files. See '--dump' + above for a description of the argument. + + + Lziprecover also supports the following debug options (for experts): + +'-E RANGE[,SECTOR_SIZE]' +'--debug-reproduce=RANGE[,SECTOR_SIZE]' + Load the compressed FILE into memory, set all bytes in the positions + specified by RANGE to 0, and try to reproduce a correct compressed + file. *Note --reproduce::. *Note range-format::, for a description of + RANGE. If a SECTOR_SIZE is specified, set each sector to 0 in sequence + and try to reproduce the file, printing to standard output final + statistics of the number of sectors reproduced successfully. Exit with + nonzero status only in case of fatal error. + +'-M' +'--md5sum' + Print to standard output the MD5 digests of the input FILES one per + line in the same format produced by the 'md5sum' tool. Lziprecover + uses MD5 digests to verify the result of some operations. This option + allows the verification of lziprecover's implementation of the MD5 + algorithm. + +'-S[VALUE]' +'--nrep-stats[=VALUE]' + Compare the frequency of sequences of N repeated bytes of a given + VALUE in the compressed LZMA streams of the input FILES with the + frequency expected for random data (1 / 2^(8N)). If VALUE is not + specified, print the frequency of repeated sequences of all possible + byte values. Print cumulative data for all files followed by the name + of the first file with the longest sequence. + +'-U' +'--unzcrash' + Test 1-bit errors in the LZMA stream of the input FILE like the + command 'unzcrash -b1 -p7 -s-20 'lzip -t' FILE' but in memory, and + therefore much faster. 
*Note Unzcrash::. This option tests all the + members independently in a multimember file, skipping headers and + trailers. If a decompression succeeds, the decompressed output is + compared with the original decompressed output of FILE using MD5 + digests. The compressed FILE must not contain errors and must + decompress correctly for the comparisons to work. + + By default '--unzcrash' only prints the interesting cases; CRC + mismatches, size mismatches, unsupported marker codes, unexpected EOFs, + apparently successful decompressions, and decoder errors detected + 50_000 or more bytes beyond the byte being tested. At verbosity level + 1 (-v) it also prints decoder errors detected 10_000 or more bytes + beyond the byte being tested. At verbosity level 2 (-vv) it prints all + cases. + +'-W POSITION,VALUE' +'--debug-decompress=POSITION,VALUE' + Load the compressed FILE into memory, set the byte at POSITION to + VALUE, and decompress the modified compressed data to standard output. + +'-X[POSITION,VALUE]' +'--show-packets[=POSITION,VALUE]' + Load the compressed FILE into memory, optionally set the byte at + POSITION to VALUE, decompress the modified compressed data (discarding + the output), and print to standard output descriptions of the LZMA + packets being decoded. + +'-Y RANGE' +'--debug-delay=RANGE' + Load the compressed FILE into memory and then repeatedly decompress + it, increasing 256 times each byte of the subset of the compressed data + positions specified by RANGE, so as to test all possible one-byte + errors. For each decompression error find the error detection delay and + print to standard output the maximum delay. The error detection delay + is the difference between the position of the error and the position + where the decoder realized that the data contains an error. *Note + range-format::, for a description of RANGE. 
+ +'-Z POSITION,VALUE' +'--debug-repair=POSITION,VALUE' + Load the compressed FILE into memory, set the byte at POSITION to + VALUE, and then try to repair the error. *Note --repair::. Numbers given as arguments to options may be followed by a multiplier @@ -401,102 +515,174 @@ Z zettabyte (10^21) | Zi zebibyte (2^70) Y yottabyte (10^24) | Yi yobibyte (2^80) - Exit status: 0 for a normal exit, 1 for environmental problems (file -not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or -invalid input file, 3 for an internal consistency error (eg, bug) which -caused lziprecover to panic. + Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid +input file, 3 for an internal consistency error (eg, bug) which caused +lziprecover to panic. -File: lziprecover.info, Node: Data safety, Next: Repairing files, Prev: Invoking lziprecover, Up: Top +File: lziprecover.info, Node: Data safety, Next: Repairing one byte, Prev: Invoking lziprecover, Up: Top 3 Protecting data from accidental loss ************************************** -There are 3 main types of data corruption that may cause data loss: -single-byte errors, multibyte errors (generally affecting a whole sector -in a block device), and total device failure. +It is a fact of life that sometimes data will become corrupt. Software has +errors. Hardware may misbehave or fail. RAM may be struck by a cosmic ray. +This is why a safe enough integrity checking is needed in compressed +formats, and the reason why a data recovery tool is sometimes needed. - Lziprecover protects natively against single-byte errors (*note -Repairing files::), as long as file integrity is checked frequently -enough that a second single-byte error does not develop in the same -member before the first one is repaired. 
+ There are 3 main types of data corruption that may cause data loss: +single-byte errors, multibyte errors (generally affecting a whole sector in +a block device), and total device failure. - Lziprecover also protects against multibyte errors (*note Merging -files::), if at least one backup copy of the file is made. + Lziprecover protects natively against single-byte errors as long as file +integrity is checked frequently enough that a second single-byte error does +not develop in the same member before the first one is repaired. *Note +Repairing one byte::. - The only remedy for total device failure is storing backup copies in -separate media. + Lziprecover also protects against multibyte errors if at least one backup +copy of the file is made (*note Merging files::), or if the error is a +zeroed sector and the uncompressed data corresponding to the zeroed sector +are available (*note Reproducing one sector::). If you can choose between +merging and reproducing, try merging first because it is usually faster, +easier to use, and has a high probability of success. - How does lzip compare with gzip and bzip2 with respect to data -safety? Let's suppose that you made a backup of your valuable -scientific data, compressed it, and stored two copies on separate -media. Years later you notice that both copies are corrupt. + Lziprecover can't help in case of device failure. The only remedy for +total device failure is storing backup copies in separate media. + + The extraordinary safety of the lzip format allows lziprecover to exploit +the redundance that occurs naturally when making compressed backups. +Lziprecover can recover data that would not be recoverable from files +compressed in other formats. 
Let's see two examples of how much better is +lzip compared with gzip and bzip2 with respect to data safety: + +* Menu: - If you compressed with gzip and both copies suffer any damage in the -data stream, even if it is just one altered bit, the original data can +* Merging with a backup:: Recovering a file using a damaged backup +* Reproducing a mailbox:: Recovering new messages using an old backup + + +File: lziprecover.info, Node: Merging with a backup, Next: Reproducing a mailbox, Up: Data safety + +3.1 Recovering a file using a damaged backup +============================================ + +Let's suppose that you made a compressed backup of your valuable scientific +data and stored two copies on separate media. Years later you notice that +both copies are corrupt. + + If you compressed the data with gzip and both copies suffer any damage in +the data stream, even if it is just one altered bit, the original data can only be recovered by an expert, if at all. - If you used bzip2, and if the file is large enough to contain more -than one compressed data block (usually larger than 900 kB -uncompressed), and if no block is damaged in both files, then the data -can be manually recovered by splitting the files with bzip2recover, -verifying every block and then copying the right blocks in the right -order into another file. + If you used bzip2, and if the file is large enough to contain more than +one compressed data block (usually larger than 900 kB uncompressed), and if +no block is damaged in both files, then the data can be manually recovered +by splitting the files with bzip2recover, verifying every block, and then +copying the right blocks in the right order into another file. - But if you used lzip, the data can be automatically recovered as -long as the damaged areas don't overlap. + But if you used lzip, the data can be automatically recovered with +'lziprecover --merge' as long as the damaged areas don't overlap. 
- Note that each error in a bzip2 file makes a whole block unusable, -but each error in a lzip file only affects the damaged bytes, making it + Note that each error in a bzip2 file makes a whole block unusable, but +each error in a lzip file only affects the damaged bytes, making it possible to recover a file with thousands of errors. -File: lziprecover.info, Node: Repairing files, Next: Merging files, Prev: Data safety, Up: Top +File: lziprecover.info, Node: Reproducing a mailbox, Prev: Merging with a backup, Up: Data safety + +3.2 Recovering new messages using an old backup +=============================================== + +Let's suppose that you make periodic backups of your email messages stored +in one or more mailboxes. (A mailbox is a file containing a possibly large +number of email messages). New messages are appended to the end of each +mailbox, therefore the initial part of two consecutive backups is identical +unless some messages have been changed or deleted in the meantime. The new +messages added to each backup are usually a small part of the whole mailbox. + ++========================================================+ +| Older backup containing some messages | ++========================================================+ ++========================================================+================+ +| Newer backup containing the messages above plus some | new messages | ++========================================================+================+ + + One day you discover that your mailbox has disappeared because you +deleted it inadvertently or because of a bug in your email reader. Not only +that. You need to recover a recent message, but the last backup you made of +the mailbox (the newer backup above) has lost the data corresponding to a +whole sector because of an I/O error in the part containing the old +messages. 
+ + If you compressed the mailbox with gzip, usually none of the new messages +can be recovered even if they are intact because all the data beyond the +missing sector can't be decoded. + + If you used bzip2, and if the newer backup is large enough that the new +messages are in a different compressed data block than the one damaged +(usually larger than 900 kB uncompressed), then you can recover the new +messages manually with bzip2recover. If the backups are identical except for +the new messages appended, you may even recover the whole newer backup by +combining the good blocks from both backups. + + But if you used lzip, the whole newer backup can be automatically +recovered with 'lziprecover --reproduce' as long as the missing bytes can be +recovered from the older backup, even if other messages in the common part +have been changed or deleted. Mailboxes seem to be specially easy to +reproduce. The probability of reproducing a mailbox (*note +performance-of-reproduce::) is almost as high as that of merging two +identical backups (*note performance-of-merge::). -4 Repairing files -***************** + +File: lziprecover.info, Node: Repairing one byte, Next: Merging files, Prev: Data safety, Up: Top + +4 Repairing one byte +******************** Lziprecover can repair perfectly most files with small errors (up to one -single-byte error per member), without the need of any extra redundance -at all. If the reparation is successful, the repaired file will be -identical bit for bit to the original. This makes lzip files resistant -to bit flip, one of the most common forms of data corruption. +single-byte error per member), without the need of any extra redundance at +all. If the reparation is successful, the repaired file will be identical +bit for bit to the original. This makes lzip files resistant to bit flip, +one of the most common forms of data corruption. + + The file is repaired in memory. 
Therefore, enough virtual memory +(RAM + swap) to contain the largest damaged member is required. The error may be located anywhere in the file except in the first 5 -bytes of each member header or in the 'Member size' field of the -trailer (last 8 bytes of each member). If the error is in the header it -can be easily repaired with a text editor like GNU Moe (*note File -format::). If the error is in the member size, it is enough to ignore -the message about 'bad member size' when decompressing. +bytes of each member header or in the 'Member size' field of the trailer +(last 8 bytes of each member). If the error is in the header it can be +easily repaired with a text editor like GNU Moe (*note File format::). If +the error is in the member size, it is enough to ignore the message about +'bad member size' when decompressing. - Bit flip happens when one bit in the file is changed from 0 to 1 or -vice versa. It may be caused by bad RAM or even by natural radiation. I -have seen a case of bit flip in a file stored on an USB flash drive. + Bit flip happens when one bit in the file is changed from 0 to 1 or vice +versa. It may be caused by bad RAM or even by natural radiation. I have +seen a case of bit flip in a file stored on an USB flash drive. One byte may seem small, but most file corruptions not produced by -transmission errors or I/O errors just affect one byte, or even one bit, -of the file. Also, unlike magnetic media, where errors usually affect a -whole sector, solid-state storage devices tend to produce single-byte -errors, making of lzip the perfect format for data stored on such -devices. +transmission errors or I/O errors just affect one byte, or even one bit, of +the file. Also, unlike magnetic media, where errors usually affect a whole +sector, solid-state storage devices tend to produce single-byte errors, +making of lzip the perfect format for data stored on such devices. - Repairing a file can take some time. 
Small files or files with the -error located near the beginning can be repaired in a few seconds. But -repairing a large file compressed with a large dictionary size and with -the error located far from the beginning, can take hours. + Repairing a file can take some time. Small files or files with the error +located near the beginning can be repaired in a few seconds. But repairing +a large file compressed with a large dictionary size and with the error +located far from the beginning, may take hours. - On the other hand, errors located near the beginning of the file -cause much more loss of data than errors located near the end. So -lziprecover repairs more efficiently the worst errors. + On the other hand, errors located near the beginning of the file cause +much more loss of data than errors located near the end. So lziprecover +repairs more efficiently the worst errors. -File: lziprecover.info, Node: Merging files, Next: Tarlz, Prev: Repairing files, Up: Top +File: lziprecover.info, Node: Merging files, Next: Reproducing one sector, Prev: Repairing one byte, Up: Top 5 Merging files *************** If you have several copies of a file but all of them are too damaged to -repair them (*note Repairing files::), lziprecover can try to produce a +repair them (*note Repairing one byte::), lziprecover can try to produce a correct file by merging the good parts of the damaged copies. The merge may succeed even if some copies of the file have all the @@ -504,91 +690,321 @@ headers and trailers damaged, as long as there is at least one copy of every header and trailer intact, even if they are in different copies of the file. - The merge will fail if the damaged areas overlap (at least one byte -is damaged in all copies), or are adjacent and the boundary can't be + The merge will fail if the damaged areas overlap (at least one byte is +damaged in all copies), or are adjacent and the boundary can't be determined, or if the copies have too many damaged areas. 
- All the copies to be merged must have the same size. If any of them -is larger or smaller than it should, either because it has been -truncated or because it got some garbage data appended at the end, it -can be brought to the correct size with the following command before -merging it with the other copies: + All the copies to be merged must have the same size. If any of them is +larger or smaller than it should, either because it has been truncated or +because it got some garbage data appended at the end, it can be brought to +the correct size with the following command before merging it with the +other copies: ddrescue -s<correct_size> -x<correct_size> file.lz correct_size_file.lz - To give you an idea of its possibilities, when merging two copies, -each of them with one damaged area affecting 1 percent of the copy, the + To give you an idea of its possibilities, when merging two copies, each +of them with one damaged area affecting 1 percent of the copy, the probability of obtaining a correct file is about 98 percent. With three -such copies the probability rises to 99.97 percent. For large files (a -few MB) with small errors (one sector damaged per copy), the probability +such copies the probability rises to 99.97 percent. For large files (a few +MB) with small errors (one sector damaged per copy), the probability approaches 100 percent even with only two copies. (Supposing that the errors are randomly located inside each copy). - Some types of solid-state device (NAND flash, for example) can -produce bursts of scattered single-bit errors. Lziprecover is able to -merge files with thousands of such scattered errors by grouping the -errors into clusters and then merging the files as if each cluster were -a single error. + Some types of solid-state device (NAND flash, for example) can produce +bursts of scattered single-bit errors. 
Lziprecover is able to merge files +with thousands of such scattered errors by grouping the errors into +clusters and then merging the files as if each cluster were a single error. Here is a real case of successful merging. Two copies of the file 'icecat-3.5.3-x86.tar.lz' (compressed size 9 MB) became corrupt while -stored on the same NAND flash device. One of the copies had 76 -single-bit errors scattered in an area of 1020 bytes, and the other had -3028 such errors in an area of 31729 bytes. Lziprecover produced a -correct file, identical to the original, in just 5 seconds: +stored on the same NAND flash device. One of the copies had 76 single-bit +errors scattered in an area of 1020 bytes, and the other had 3028 such +errors in an area of 31729 bytes. Lziprecover produced a correct file, +identical to the original, in just 5 seconds: - $ lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz + lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz Merging member 1 of 1 (2552 errors) 2552 errors have been grouped in 16 clusters. Trying variation 2 of 2, block 2 Input files merged successfully. - Note that the number of errors reported by lziprecover (2552) is -lower than the number of corrupt bytes (3104) because contiguous -corrupt bytes are counted as a single multibyte error. + Note that the number of errors reported by lziprecover (2552) is lower +than the number of corrupt bytes (3104) because contiguous corrupt bytes +are counted as a single multibyte error. + + +Example 1: Recover a compressed backup from two copies on CD-ROM with +error-checked merging of copies. *Note GNU ddrescue manual: (ddrescue)Top, +for details about ddrescue. 
+ + ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 + mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage + cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz + umount /mnt/cdimage + (insert second copy in the CD drive) + ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 + mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage + cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz + umount /mnt/cdimage + lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz + Input files merged successfully. + lziprecover -tv backup.tar.lz + backup.tar.lz: ok + + +Example 2: Recover the first volume of those created with the command +'lzip -b 32MiB -S 650MB big_db' from two copies, 'big_db1_00001.lz' and +'big_db2_00001.lz', with member 07 damaged in the first copy, member 18 +damaged in the second copy, and member 12 damaged in both copies. The +correct file produced is saved in 'big_db_00001.lz'. + + lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz + Input files merged successfully. + lziprecover -tv big_db_00001.lz + big_db_00001.lz: ok -File: lziprecover.info, Node: Tarlz, Next: File names, Prev: Merging files, Up: Top +File: lziprecover.info, Node: Reproducing one sector, Next: Tarlz, Prev: Merging files, Up: Top + +6 Reproducing one sector +************************ + +Lziprecover can recover a zeroed sector in a lzip file by concatenating the +decompressed contents of the file up to the beginning of the zeroed sector +and the uncompressed data corresponding to the zeroed sector, and then +feeding the concatenated data to the same version of lzip that created the +file. For this to work, a reference file is required containing the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. It is possible to +recover a large file using just a few KB of reference data. + + The difficult part is finding a suitable reference file. 
It must contain +the exact data required (possibly mixed with other data). Containing similar +data is not enough. + + A zeroed sector may be caused by the incomplete recovery of a damaged +storage device (with I/O errors) using, for example, ddrescue. The +reproduction can't be done if the zeroed sector overlaps with the first 15 +bytes of a member, or if the zeroed sector is smaller than 8 bytes. + + The file is reproduced in memory. Therefore, enough virtual memory +(RAM + swap) to contain the damaged member is required. + + To understand how it works, take any lzipped file, say 'foo.lz', +decompress it (keeping the original), and try to reproduce an artificially +zeroed sector in it by running the following commands: + + lzip -kd foo.lz + lziprecover -vv --debug-reproduce=65536,512 --reference-file=foo foo.lz + +which should produce an output like the following: + + Reproducing: foo.lz + Reference file: foo + Testing sectors of size 512 at file positions 65536 to 66047 + (master mpos = 65536, dpos = 296892) + foo: Match found at offset 296892 + Reproduction succeeded at pos 65536 + + 1 sectors tested + 1 reproductions returned with zero status + all comparisons passed + + Using 'foo' as reference file guarantees that any zeroed sector in +'foo.lz' can be reproduced because both files contain the same data. In +real use, the reference file needs to contain the data corresponding to the +zeroed sector, but the rest of the data (if any) may differ between both +files. The reference data may be obtained from the partial decompression of +the damaged file itself if it contains repeated data. For example if the +damaged file is a compressed tarball containing several partially modified +versions of the same file. + + The offset reported by lziprecover is the position in the reference file +of the first byte that could not be decompressed. This is the first byte +that will be compressed to reproduce the zeroed sector. 
+ + The reproduce mode tries to reproduce the missing compressed data +originally present in the zeroed sector. It is based on the perfect +reproducibility of lzip files (lzip produces identical compressed output +from identical input). Therefore, the same version of lzip that created the +file to be reproduced should be used to reproduce the zeroed sector. Near +versions may also work because the output of lzip changes infrequently. If +reproducing a tar.lz archive created with tarlz, the version of lzip, +clzip, or minilzip corresponding to the version of the lzlib library used +by tarlz to create the archive should be used. + + When recovering a tar.lz archive and using as reference a file from the +filesystem, if the zeroed sector encodes (part of) a tar header, the archive +can't be reproduced. Therefore, the less overhead (smaller headers) a tar +archive has, the more probable it is that the zeroed sector does not include a +header, and that the archive can be reproduced. The tarlz format has minimum +overhead. It uses basic ustar headers, and only adds extended pax headers +when they are required. + +6.1 Performance of '--reproduce' +================================ + +Reproduce mode is especially useful when recovering a corrupt backup (or a +corrupt source tarball) that is part of a series. Usually only a small +fraction of the data changes from one backup to the next or from one version +of a source tarball to the next. This sometimes makes it possible to reproduce +a given corrupted version using reference data from a near version. The +following two tables show the fraction of reproducible sectors (reproducible +sectors divided by total sectors in archive) for some archives, using sector +sizes of 512 and 4096 bytes. 'mailbox-aug.tar.lz' is a backup of some of my +mailboxes. 
'backup-feb.tar.lz' and 'backup-apr.tar.lz' are real backups of +my own working directory: + +Reference file File Reproducible (512) +--------------------------------------------------------- +backup-feb.tar backup-apr.tar.lz 3273 / 4342 = 75.38% +backup-apr.tar backup-feb.tar.lz 3259 / 4161 = 78.32% +gawk-5.0.0.tar gawk-5.0.1.tar.lz 4369 / 5844 = 74.76% +gawk-5.0.1.tar gawk-5.0.0.tar.lz 4379 / 5603 = 78.15% +gmp-6.1.1.tar gmp-6.1.2.tar.lz 2454 / 3787 = 64.8% +gmp-6.1.2.tar gmp-6.1.1.tar.lz 2461 / 3782 = 65.07% + +Reference file File Reproducible (4096) +----------------------------------------------------------- +mailbox-mar.tar mailbox-aug.tar.lz 4036 / 4252 = 94.92% +backup-feb.tar backup-apr.tar.lz 264 / 542 = 48.71% +backup-apr.tar backup-feb.tar.lz 264 / 520 = 50.77% +gawk-5.0.0.tar gawk-5.0.1.tar.lz 327 / 730 = 44.79% +gawk-5.0.1.tar gawk-5.0.0.tar.lz 326 / 700 = 46.57% +gmp-6.1.1.tar gmp-6.1.2.tar.lz 175 / 473 = 37% +gmp-6.1.2.tar gmp-6.1.1.tar.lz 181 / 472 = 38.35% + + Note that the "performance of reproduce" is a probability, not a partial +recovery. The data is either fully recovered (with the probability X shown +in the last column of the tables above) or not recovered at all (with +probability 1 - X). + + Example 1: Recover a damaged source tarball with a zeroed sector of 512 +bytes at file position 1019904, using as reference another source tarball +for a different version of the software. + + lziprecover -vv -e --reference-file=gmp-6.1.1.tar gmp-6.1.2.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 512, value = 0x00) + (master mpos = 1019904, dpos = 6292134) + warning: gmp-6.1.1.tar: Partial match found at offset 6277798, len 8716. + Reference data may be mixed with other data. + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. 
+ + +Example 2: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a previous backup. The damaged +backup comes from a damaged partition copied with ddrescue. + + ddrescue -b4096 -r10 /dev/sdc1 hdimage mapfile + mount -o loop,ro hdimage /mnt/hdimage + cp /mnt/hdimage/backup.tar.lz backup.tar.lz + umount /mnt/hdimage + lzip -t backup.tar.lz + backup.tar.lz: Decoder error at pos 1020530 + lziprecover -vv -e --reference-file=old_backup.tar backup.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) + warning: old_backup.tar: Partial match found at offset 5743778, len 9546. + Reference data may be mixed with other data. + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. + + +Example 3: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a file from the filesystem. (If +the zeroed sector encodes (part of) a tar header, the tarball can't be +reproduced). + + # List the contents of the backup tarball to locate the damaged member. + tarlz -n0 -tvf backup.tar.lz + [...] + example.txt + tarlz: Skipping to next header. + tarlz: backup.tar.lz: Archive ends unexpectedly. + # Find in the filesystem the last file listed and use it as reference. + lziprecover -vv -e --reference-file=/somedir/example.txt backup.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) + /somedir/example.txt: Match found at offset 9378 + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. + + If 'backup.tar.lz' is a multimember file with more than one member +damaged and lziprecover shows the message 'One member reproduced. 
Copy of +input file still contains errors.', the procedure shown in the example +above can be repeated until all the members have been reproduced. + + 'tarlz --keep-damaged -n0 -xf backup.tar.lz example.txt' produces a +partial copy of the reference file 'example.txt' that may help locate a +complete copy in the filesystem or in another backup, even if 'example.txt' +has been renamed. -6 Options supporting the tar.lz format + +File: lziprecover.info, Node: Tarlz, Next: File names, Prev: Reproducing one sector, Up: Top + +7 Options supporting the tar.lz format ************************************** -Tarlz is an implementation of the tar archiver which by default creates -archives compressed with lzip on a per file basis. Tarlz can append -files to the end of such compressed archives because each tar member is -compressed in its own lzip member, as well as the end-of-file blocks. -Thus tarlz archives are multimember lzip files, which has some safety -advantages over solidly compressed tar.lz archives. For example, in -case of corruption, tarlz can extract all the undamaged members from -the tar.lz archive, skipping over the damaged members, just like the -standard (uncompressed) tar. In this chapter we'll explain the ways in -which lziprecover can recover and process multimember tar.lz archives. -*Note tarlz manual: (tarlz)Top. +Tarlz is a massively parallel (multi-threaded) combined implementation of +the tar archiver and the lzip compressor. + + Tarlz creates tar archives using a simplified and safer variant of the +POSIX pax format compressed in lzip format, keeping the alignment between +tar members and lzip members. The resulting multimember tar.lz archive is +fully backward compatible with standard tar tools like GNU tar, which treat +it like any other tar.lz archive. *Note tarlz manual: (tarlz)Top, and *note +lzip manual: (lzip)Top. + Multimember tar.lz archives have some safety advantages over solidly +compressed tar.lz archives. 
For example, in case of corruption, tarlz can +extract all the undamaged members from the tar.lz archive, skipping over the +damaged members, just like the standard (uncompressed) tar. Keeping the +alignment between tar members and lzip members minimizes the amount of data +lost in case of corruption. In this chapter we'll explain the ways in which +lziprecover can recover and process multimember tar.lz archives. -6.1 Recovering damaged multimember tar.lz archives + +7.1 Recovering damaged multimember tar.lz archives ================================================== -If you have several copies of the damaged archive, try merging them -first because merging has a high probability of success. If the command -below prints something like 'Input files merged successfully.' you are -done and 'archive.tar.lz' now contains the recovered archive: +If you have several copies of the damaged archive, try merging them first +because merging has a high probability of success. *Note Merging files::. If +the command below prints something like 'Input files merged successfully.' +you are done and 'archive.tar.lz' now contains the recovered archive: lziprecover -m -v -o archive.tar.lz a/archive.tar.lz b/archive.tar.lz - If you only have one copy of the damaged archive, you may try to -repair the archive, but this has a lower probability of success. If the -command below prints something like + If you only have one copy of the damaged archive with a zeroed block of +data caused by an I/O error, you may try to reproduce the archive. *Note +Reproducing one sector::. If the command below prints something like +'Copy of input file reproduced successfully.' you are done and +'archive_fixed.tar.lz' now contains the recovered archive: + + lziprecover -vv -e --reference-file=old_archive.tar archive.tar.lz + + If you only have one copy of the damaged archive, you may try to repair +the archive, but this has a lower probability of success. *Note Repairing +one byte::. 
If the command below prints something like 'Copy of input file repaired successfully.' you are done and 'archive_fixed.tar.lz' now contains the recovered archive: lziprecover -v -R archive.tar.lz - If all the above fails, you may save the damaged members for later -and then copy the good members to another archive. If the two commands -below succeed, 'bad_members.tar.lz' will contain all the damaged members -and 'archive_cleaned.tar.lz' will contain a good archive with the -damaged members removed: + If all the above fails, and the archive was created with tarlz, you may +save the damaged members for later and then copy the good members to another +archive. If the two commands below succeed, 'bad_members.tar.lz' will +contain all the damaged members and 'archive_cleaned.tar.lz' will contain a +good archive with the damaged members removed: lziprecover -v --dump=damaged -o bad_members.tar.lz archive.tar.lz lziprecover -v --strip=damaged -o archive_cleaned.tar.lz archive.tar.lz @@ -601,32 +1017,32 @@ possible from each damaged member in 'bad_members.tar.lz': tarlz --keep-damaged -xvf ../bad_members.tar.lz -6.2 Processing multimember tar.lz archives +7.2 Processing multimember tar.lz archives ========================================== -Lziprecover is able to copy a list of members from a file to another. -For example the command -'lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz' -creates a subset archive containing the first ten members, the -end-of-file blocks, and the trailing data (if any) of 'archive.tar.lz'. -The 'r1' part selects the last member, which in an appendable tar.lz -archive contains the end-of-file blocks. +Lziprecover is able to copy a list of members from a file to another. For +example the command +'lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz' creates +a subset archive containing the first ten members, the end-of-file blocks, +and the trailing data (if any) of 'archive.tar.lz'. 
The 'r1' part selects +the last member, which in an appendable tar.lz archive contains the +end-of-file blocks. File: lziprecover.info, Node: File names, Next: File format, Prev: Tarlz, Up: Top -7 Names of the files produced by lziprecover +8 Names of the files produced by lziprecover ******************************************** -The name of the fixed file produced by '--merge' and '--repair' is made -by appending the string '_fixed.lz' to the original file name. If the -original file name ends with one of the extensions '.tar.lz', '.lz' or -'.tlz', the string '_fixed' is inserted before the extension. +The name of the fixed file produced by '--merge' and '--repair' is made by +appending the string '_fixed.lz' to the original file name. If the original +file name ends with one of the extensions '.tar.lz', '.lz', or '.tlz', the +string '_fixed' is inserted before the extension. File: lziprecover.info, Node: File format, Next: Trailing data, Prev: File names, Up: Top -8 File format +9 File format ************* Perfection is reached, not when there is no longer anything to add, but @@ -635,11 +1051,13 @@ when there is no longer anything to take away. In the diagram below, a box like this: + +---+ | | <-- the vertical bars might be missing +---+ represents one byte; a box like this: + +==============+ | | +==============+ @@ -648,10 +1066,11 @@ when there is no longer anything to take away. A lzip file consists of a series of "members" (compressed data sets). -The members simply appear one after another in the file, with no -additional information before, between, or after them. +The members simply appear one after another in the file, with no additional +information before, between, or after them. 
Each member has the following structure: + +--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ID string | VN | DS | LZMA stream | CRC32 | Data size | Member size | +--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -659,17 +1078,16 @@ additional information before, between, or after them. All multibyte values are stored in little endian order. 'ID string (the "magic" bytes)' - A four byte string, identifying the lzip format, with the value - "LZIP" (0x4C, 0x5A, 0x49, 0x50). + A four byte string, identifying the lzip format, with the value "LZIP" + (0x4C, 0x5A, 0x49, 0x50). 'VN (version number, 1 byte)' - Just in case something needs to be modified in the future. 1 for - now. + Just in case something needs to be modified in the future. 1 for now. 'DS (coded dictionary size, 1 byte)' The dictionary size is calculated by taking a power of 2 (the base - size) and subtracting from it a fraction between 0/16 and 7/16 of - the base size. + size) and subtracting from it a fraction between 0/16 and 7/16 of the + base size. Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract from the base size to obtain the dictionary size. @@ -678,68 +1096,65 @@ additional information before, between, or after them. 'LZMA stream' The LZMA stream, finished by an end of stream marker. Uses default - values for encoder properties. *Note Stream format: (lzip)Stream + values for encoder properties. *Note Stream format: (lzip)Stream format, for a complete description. 'CRC32 (4 bytes)' - CRC of the uncompressed original data. + Cyclic Redundancy Check (CRC) of the uncompressed original data. 'Data size (8 bytes)' Size of the uncompressed original data. 'Member size (8 bytes)' - Total size of the member, including header and trailer. 
This field - acts as a distributed index, allows the verification of stream - integrity, and facilitates safe recovery of undamaged members from - multimember files. + Total size of the member, including header and trailer. This field acts + as a distributed index, allows the verification of stream integrity, + and facilitates safe recovery of undamaged members from multimember + files. File: lziprecover.info, Node: Trailing data, Next: Examples, Prev: File format, Up: Top -9 Extra data appended to the file -********************************* +10 Extra data appended to the file +********************************** Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: - * Padding added to make the file size a multiple of some block size, - for example when writing to a tape. It is safe to append any - amount of padding zero bytes to a lzip file. + * Padding added to make the file size a multiple of some block size, for + example when writing to a tape. It is safe to append any amount of + padding zero bytes to a lzip file. * Useful data added by the user; a cryptographically secure hash, a - description of file contents, etc. It is safe to append any amount - of text to a lzip file as long as none of the first four bytes of - the text match the corresponding byte in the string "LZIP", and - the text does not contain any zero bytes (null characters). - Nonzero bytes and zero bytes can't be safely mixed in trailing - data. + description of file contents, etc. It is safe to append any amount of + text to a lzip file as long as none of the first four bytes of the text + match the corresponding byte in the string "LZIP", and the text does + not contain any zero bytes (null characters). Nonzero bytes and zero + bytes can't be safely mixed in trailing data. * Garbage added by some not totally successful copy operation. 
- * Malicious data added to the file in order to make its total size - and hash value (for a chosen hash) coincide with those of another - file. + * Malicious data added to the file in order to make its total size and + hash value (for a chosen hash) coincide with those of another file. * In rare cases, trailing data could be the corrupt header of another member. In multimember or concatenated files the probability of corruption happening in the magic bytes is 5 times smaller than the - probability of getting a false positive caused by the corruption - of the integrity information itself. Therefore it can be - considered to be below the noise level. Additionally, the test - used by lziprecover to discriminate trailing data from a corrupt - header has a Hamming distance (HD) of 3, and the 3 bit flips must - happen in different magic bytes for the test to fail. In any case, - the option '--trailing-error' guarantees that any corrupt header - will be detected. + probability of getting a false positive caused by the corruption of the + integrity information itself. Therefore it can be considered to be + below the noise level. Additionally, the test used by lziprecover to + discriminate trailing data from a corrupt header has a Hamming + distance (HD) of 3, and the 3 bit flips must happen in different magic + bytes for the test to fail. In any case, the option '--trailing-error' + guarantees that any corrupt header will be detected. Trailing data are in no way part of the lzip file format, but tools reading lzip files are expected to behave as correctly and usefully as possible in the presence of trailing data. - Trailing data can be safely ignored in most cases. In some cases, -like that of user-added data, they are expected to be ignored. In those -cases where a file containing trailing data must be rejected, the option + Trailing data can be safely ignored in most cases. In some cases, like +that of user-added data, they are expected to be ignored. 
In those cases +where a file containing trailing data must be rejected, the option '--trailing-error' can be used. *Note --trailing-error::. Lziprecover facilitates the management of metadata stored as trailing @@ -758,9 +1173,8 @@ Example 1: Add a comment or description to a compressed file. Example 2: Add and verify a cryptographically secure hash. (This may be -convenient, but a separate copy of the hash must be kept in a safe place -to guarantee that both file and hash have not been maliciously -replaced). +convenient, but a separate copy of the hash must be kept in a safe place to +guarantee that both file and hash have not been maliciously replaced). sha256sum < file.lz >> file.lz lziprecover --strip=tdata file.lz | sha256sum -c \ @@ -769,23 +1183,30 @@ replaced). File: lziprecover.info, Node: Examples, Next: Unzcrash, Prev: Trailing data, Up: Top -10 A small tutorial with examples +11 A small tutorial with examples ********************************* -Example 1: Restore a regular file from its compressed version -'file.lz'. If the operation is successful, 'file.lz' is removed. +Example 1: Extract all the files from archive 'foo.tar.lz'. + + tar -xf foo.tar.lz + or + lziprecover -cd foo.tar.lz | tar -xf - + + +Example 2: Restore a regular file from its compressed version 'file.lz'. If +the operation is successful, 'file.lz' is removed. lziprecover -d file.lz -Example 2: Verify the integrity of the compressed file 'file.lz' and -show status. +Example 3: Verify the integrity of the compressed file 'file.lz' and show +status. lziprecover -tv file.lz -Example 3: The right way of concatenating the decompressed output of two -or more compressed files. *Note Trailing data::. +Example 4: The right way of concatenating the decompressed output of two or +more compressed files. *Note Trailing data::. Don't do this cat file1.lz file2.lz file3.lz | lziprecover -d @@ -797,20 +1218,20 @@ or more compressed files. *Note Trailing data::. 
lziprecover --strip=damaged file1.lz file2.lz file3.lz > file123.lz -Example 4: Decompress 'file.lz' partially until 10 KiB of decompressed -data are produced. +Example 5: Decompress 'file.lz' partially until 10 KiB of decompressed data +are produced. lziprecover -D 0,10KiB file.lz -Example 5: Decompress 'file.lz' partially from decompressed byte 10000 -to decompressed byte 15000 (5000 bytes are produced). +Example 6: Decompress 'file.lz' partially from decompressed byte at offset +10000 to decompressed byte at offset 14999 (5000 bytes are produced). lziprecover -D 10000-15000 file.lz -Example 6: Repair small errors in the file 'file.lz'. (Indented lines -are abridged diagnostic messages from lziprecover). +Example 7: Repair small errors in the file 'file.lz'. (Indented lines are +abridged diagnostic messages from lziprecover). lziprecover -v -R file.lz Copy of input file repaired successfully. @@ -819,100 +1240,66 @@ are abridged diagnostic messages from lziprecover). mv file_fixed.lz file.lz -Example 7: Split the multimember file 'file.lz' and write each member -in its own 'recXXXfile.lz' file. Then use 'lziprecover -t' to test the +Example 8: Split the multimember file 'file.lz' and write each member in +its own 'recXXXfile.lz' file. Then use 'lziprecover -t' to test the integrity of the resulting files. lziprecover -s file.lz lziprecover -tv rec*file.lz - -Example 8: Recover a compressed backup from two copies on CD-ROM with -error-checked merging of copies. *Note GNU ddrescue manual: -(ddrescue)Top, for details about ddrescue. 
- - ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 - mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage - cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz - umount /mnt/cdimage - (insert second copy in the CD drive) - ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 - mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage - cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz - umount /mnt/cdimage - lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz - Input files merged successfully. - lziprecover -tv backup.tar.lz - backup.tar.lz: ok - - -Example 9: Recover the first volume of those created with the command -'lzip -b 32MiB -S 650MB big_db' from two copies, 'big_db1_00001.lz' and -'big_db2_00001.lz', with member 07 damaged in the first copy, member 18 -damaged in the second copy, and member 12 damaged in both copies. The -correct file produced is saved in 'big_db_00001.lz'. - - lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz - Input files merged successfully. - lziprecover -tv big_db_00001.lz - big_db_00001.lz: ok - File: lziprecover.info, Node: Unzcrash, Next: Problems, Prev: Examples, Up: Top -11 Testing the robustness of decompressors +12 Testing the robustness of decompressors ****************************************** -The lziprecover package also includes unzcrash, a program written to -test robustness to decompression of corrupted data, inspired by -unzcrash.c from Julian Seward's bzip2. Type 'make unzcrash' in the -lziprecover source directory to build it. +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type 'make unzcrash' in the lziprecover source +directory to build it. - By default, unzcrash reads the specified file and then repeatedly -decompresses it, increasing 256 times each byte of the compressed data, -so as to test all possible one-byte errors. 
Note that it may take years -or even centuries to test all possible one-byte errors in a large file -(tens of MB). + By default, unzcrash reads the file specified and then repeatedly +decompresses it, increasing 256 times each byte of the compressed data, so +as to test all possible one-byte errors. Note that it may take years or even +centuries to test all possible one-byte errors in a large file (tens of MB). - If the '--block' option is given, unzcrash reads the specified file -and then repeatedly decompresses it, setting all bytes in each -successive block to the value given, so as to test all possible full -sector errors. + If the option '--block' is given, unzcrash reads the file specified and +then repeatedly decompresses it, setting all bytes in each successive block +to the value given, so as to test all possible full sector errors. - If the '--truncate' option is given, unzcrash reads the specified -file and then repeatedly decompresses it, truncating the file to -increasing lengths, so as to test all possible truncation points. + If the option '--truncate' is given, unzcrash reads the file specified +and then repeatedly decompresses it, truncating the file to increasing +lengths, so as to test all possible truncation points. None of the three test modes described above should cause any invalid memory accesses. If any of them does, please, report it as a bug to the maintainers of the decompressor being tested. - Unzcrash really executes as a subprocess the shell command specified -in the first non-option argument, and then writes the file specified in -the second non-option argument to the standard input of the subprocess, -modifying the corresponding byte each time. Therefore unzcrash can be -used to test any decompressor (not only lzip), or even other decoder -programs having a suitable command line syntax. 
+ Unzcrash really executes as a subprocess the shell command specified in +the first non-option argument, and then writes the file specified in the +second non-option argument to the standard input of the subprocess, +modifying the corresponding byte each time. Therefore unzcrash can be used +to test any decompressor (not only lzip), or even other decoder programs +having a suitable command line syntax. If the decompressor returns with zero status, unzcrash compares the output of the decompressor for the original and corrupt files. If the -outputs differ, it means that the decompressor returned a false -negative; it failed to recognize the corruption and produced garbage -output. The only exception is when a multimember file is truncated just -after the last byte of a member, producing a shorter but valid -compressed file. Except in this latter case, please, report any false -negative as a bug. +outputs differ, it means that the decompressor returned a false negative; +it failed to recognize the corruption and produced garbage output. The only +exception is when a multimember file is truncated just after the last byte +of a member, producing a shorter but valid compressed file. Except in this +latter case, please, report any false negative as a bug. - In order to compare the outputs, unzcrash needs a 'zcmp' program able -to understand the format being tested. For example the 'zcmp' provided -by 'zutils'. *Note Zcmp: (zutils)Zcmp. + In order to compare the outputs, unzcrash needs a 'zcmp' program able to +understand the format being tested. For example the 'zcmp' provided by +zutils. Use '--zcmp=false' to disable comparisons. *Note Zcmp: (zutils)Zcmp. The format for running unzcrash is: - unzcrash [OPTIONS] 'lzip -t' FILE.lz + unzcrash [OPTIONS] 'lzip -t' FILE -FILE.lz must not contain errors and must be correctly decompressed by -the decompressor being tested for the comparisons to work. 
+The compressed FILE must not contain errors and the decompressor being +tested must decompress it correctly for the comparisons to work. unzcrash supports the following options: @@ -922,60 +1309,59 @@ the decompressor being tested for the comparisons to work. '-V' '--version' - Print the version number of unzcrash on the standard output and - exit. This version number should be included in all bug reports. + Print the version number of unzcrash on the standard output and exit. + This version number should be included in all bug reports. '-b RANGE' '--bits=RANGE' - Test N-bit errors only, instead of testing all the 255 wrong - values for each byte. 'N-bit error' means any value differing from - the original value in N bit positions, not a value differing from - the original value in the bit position N. + Test N-bit errors only, instead of testing all the 255 wrong values for + each byte. 'N-bit error' means any value differing from the original + value in N bit positions, not a value differing from the original + value in the bit position N. The number of N-bit errors per byte (N = 1 to 8) is: 8 28 56 70 56 28 8 1 - Examples of RANGE Tests errors of N-bit + Examples of RANGE Tests errors of N-bits 1 1 - 1,2,3 1, 2 and 3 - 2-4 2, 3 and 4 - 1,3-5,8 1, 3, 4, 5 and 8 - 1-3,5-8 1, 2, 3, 5, 6, 7 and 8 + 1,2,3 1, 2, 3 + 2-4 2, 3, 4 + 1,3-5,8 1, 3, 4, 5, 8 + 1-3,5-8 1, 2, 3, 5, 6, 7, 8 '-B[SIZE][,VALUE]' '--block[=SIZE][,VALUE]' - Test block errors of given SIZE, simulating a whole sector I/O - error. Block SIZE defaults to 512 bytes. VALUE defaults to 0. By - default, only blocks aligned to a SIZE-byte boundary are tested, - but this may be changed with the '--delta' option. + Test block errors of given SIZE, simulating a whole sector I/O error. + SIZE defaults to 512 bytes. VALUE defaults to 0. By default, only + contiguous, non-overlapping blocks are tested, but this may be changed + with the option '--delta'. 
'-d N' '--delta=N' - Test only one byte, block, or truncation size every N bytes, - instead of all of them. If the '--block' option is given, N - defaults to the block size. Else N defaults to 1. Values of N - smaller than the block size will result in overlappinng blocks. - (Which is convenient for testing because there are usually too few - non-overlappinng blocks in a file). + Test one byte, block, or truncation size every N bytes. If '--delta' + is not specified, unzcrash tests all the bytes, non-overlapping + blocks, or truncation sizes. Values of N smaller than the block size + will result in overlapping blocks. (Which is convenient for testing + because there are usually too few non-overlapping blocks in a file). '-e POSITION,VALUE' '--set-byte=POSITION,VALUE' - Set byte at POSITION to VALUE in the internal buffer after reading - and testing FILE.lz but before the first test call to the - decompressor. If VALUE is preceded by '+', it is added to the - original value of the byte at POSITION. If VALUE is preceded by - 'f' (flip), it is XORed with the original value of the byte at - POSITION. This option can be used to run tests with a changed - dictionary size, for example. + Set byte at POSITION to VALUE in the internal buffer after reading and + testing FILE but before the first test call to the decompressor. Byte + positions start at 0. If VALUE is preceded by '+', it is added to the + original value of the byte at POSITION. If VALUE is preceded by 'f' + (flip), it is XORed with the original value of the byte at POSITION. + This option can be used to run tests with a changed dictionary size, + for example. '-n' '--no-verify' - Skip initial verification of FILE.lz and 'zcmp'. May speed up - things a lot when testing many (or large) known good files. + Skip initial verification of FILE and 'zcmp'. May speed up things a + lot when testing many (or large) known good files. '-p BYTES' '--position=BYTES' - First byte position to test in the file. Defaults to 0. 
Negative - values are relative to the end of the file. + First byte position to test in the file. Defaults to 0. Negative values + are relative to the end of the file. '-q' '--quiet' @@ -983,9 +1369,9 @@ the decompressor being tested for the comparisons to work. '-s BYTES' '--size=BYTES' - Number of byte positions to test. If not specified, the rest of - the file is tested (from '--position' to end of file). Negative - values are relative to the rest of the file. + Number of byte positions to test. If not specified, the rest of the + file is tested (from '--position' to end of file). Negative values are + relative to the rest of the file. '-t' '--truncate' @@ -1000,30 +1386,30 @@ the decompressor being tested for the comparisons to work. '--zcmp=<command>' Set zcmp command name and options. Defaults to 'zcmp'. Use '--zcmp=false' to disable comparisons. If testing a decompressor - different from the one used by default by zcmp, it is needed to - force unzcrash and zcmp to use the same decompressor with a - command like 'unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' FILE.lz' + different from the one used by default by zcmp, it is needed to force + unzcrash and zcmp to use the same decompressor with a command like + 'unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' FILE' - Exit status: 0 for a normal exit, 1 for environmental problems (file -not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or -invalid input file, 3 for an internal consistency error (eg, bug) which -caused unzcrash to panic. + Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid +input file, 3 for an internal consistency error (eg, bug) which caused +unzcrash to panic. File: lziprecover.info, Node: Problems, Next: Concept index, Prev: Unzcrash, Up: Top -12 Reporting bugs +13 Reporting bugs ***************** There are probably bugs in lziprecover. 
There are certainly errors and -omissions in this manual. If you report them, they will get fixed. If -you don't, no one will ever know about them and they will remain unfixed -for all eternity, if not longer. +omissions in this manual. If you report them, they will get fixed. If you +don't, no one will ever know about them and they will remain unfixed for +all eternity, if not longer. If you find a bug in lziprecover, please send electronic mail to -<lzip-bug@nongnu.org>. Include the version number, which you can find -by running 'lziprecover --version'. +<lzip-bug@nongnu.org>. Include the version number, which you can find by +running 'lziprecover --version'. File: lziprecover.info, Node: Concept index, Prev: Problems, Up: Top @@ -1034,43 +1420,55 @@ Concept index * Menu: -* bugs: Problems. (line 6) -* data safety: Data safety. (line 6) -* examples: Examples. (line 6) -* file format: File format. (line 6) -* file names: File names. (line 6) -* getting help: Problems. (line 6) -* introduction: Introduction. (line 6) -* invoking: Invoking lziprecover. (line 6) -* merging files: Merging files. (line 6) -* options: Invoking lziprecover. (line 6) -* repairing files: Repairing files. (line 6) -* tarlz: Tarlz. (line 6) -* trailing data: Trailing data. (line 6) -* unzcrash: Unzcrash. (line 6) -* usage: Invoking lziprecover. (line 6) -* version: Invoking lziprecover. (line 6) +* bugs: Problems. (line 6) +* data safety: Data safety. (line 6) +* examples: Examples. (line 6) +* file format: File format. (line 6) +* file names: File names. (line 6) +* getting help: Problems. (line 6) +* introduction: Introduction. (line 6) +* invoking: Invoking lziprecover. (line 6) +* merging files: Merging files. (line 6) +* merging with a backup: Merging with a backup. (line 6) +* options: Invoking lziprecover. (line 6) +* repairing one byte: Repairing one byte. (line 6) +* reproducing a mailbox: Reproducing a mailbox. (line 6) +* reproducing one sector: Reproducing one sector. 
(line 6) +* tarlz: Tarlz. (line 6) +* trailing data: Trailing data. (line 6) +* unzcrash: Unzcrash. (line 6) +* usage: Invoking lziprecover. (line 6) +* version: Invoking lziprecover. (line 6) Tag Table: Node: Top231 -Node: Introduction1335 -Node: Invoking lziprecover4918 -Ref: --trailing-error5628 -Node: Data safety18371 -Node: Repairing files20322 -Node: Merging files22245 -Node: Tarlz25002 -Node: File names27857 -Node: File format28313 -Node: Trailing data30739 -Node: Examples33974 -Ref: concat-example34407 -Ref: ddrescue-example35778 -Node: Unzcrash37066 -Node: Problems43130 -Node: Concept index43682 +Node: Introduction1410 +Node: Invoking lziprecover5353 +Ref: --trailing-error6220 +Ref: range-format8391 +Ref: --reproduce8726 +Ref: --repair12904 +Node: Data safety24532 +Node: Merging with a backup26520 +Node: Reproducing a mailbox27784 +Node: Repairing one byte30285 +Node: Merging files32350 +Ref: performance-of-merge33520 +Ref: ddrescue-example35129 +Node: Reproducing one sector36416 +Ref: performance-of-reproduce40299 +Ref: ddrescue-example242974 +Node: Tarlz45394 +Node: File names49058 +Node: File format49515 +Node: Trailing data51964 +Node: Examples55186 +Ref: concat-example55762 +Node: Unzcrash57152 +Node: Problems63240 +Node: Concept index63792 End Tag Table diff --git a/doc/lziprecover.texi b/doc/lziprecover.texi index 731515b..6766403 100644 --- a/doc/lziprecover.texi +++ b/doc/lziprecover.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 4 January 2019 -@set VERSION 1.21 +@set UPDATED 2 January 2021 +@set VERSION 1.22 @dircategory Data Compression @direntry @@ -29,68 +29,71 @@ @contents @end ifnothtml +@ifnottex @node Top @top This manual is for Lziprecover (version @value{VERSION}, @value{UPDATED}). 
@menu -* Introduction:: Purpose and features of lziprecover -* Invoking lziprecover:: Command line interface -* Data safety:: Protecting data from accidental loss -* Repairing files:: Fixing bit flips and similar errors -* Merging files:: Fixing several damaged copies -* Tarlz:: Options supporting the tar.lz format -* File names:: Names of the files produced by lziprecover -* File format:: Detailed format of the compressed file -* Trailing data:: Extra data appended to the file -* Examples:: A small tutorial with examples -* Unzcrash:: Testing the robustness of decompressors -* Problems:: Reporting bugs -* Concept index:: Index of concepts +* Introduction:: Purpose and features of lziprecover +* Invoking lziprecover:: Command line interface +* Data safety:: Protecting data from accidental loss +* Repairing one byte:: Fixing bit flips and similar errors +* Merging files:: Fixing several damaged copies +* Reproducing one sector:: Fixing a missing (zeroed) sector +* Tarlz:: Options supporting the tar.lz format +* File names:: Names of the files produced by lziprecover +* File format:: Detailed format of the compressed file +* Trailing data:: Extra data appended to the file +* Examples:: A small tutorial with examples +* Unzcrash:: Testing the robustness of decompressors +* Problems:: Reporting bugs +* Concept index:: Index of concepts @end menu @sp 1 -Copyright @copyright{} 2009-2019 Antonio Diaz Diaz. +Copyright @copyright{} 2009-2021 Antonio Diaz Diaz. -This manual is free documentation: you have unlimited permission -to copy, distribute and modify it. +This manual is free documentation: you have unlimited permission to copy, +distribute, and modify it. +@end ifnottex @node Introduction @chapter Introduction @cindex introduction -@uref{http://www.nongnu.org/lzip/lziprecover.html,,Lziprecover} is a -data recovery tool and decompressor for files in the lzip compressed -data format (.lz). 
Lziprecover is able to repair slightly damaged files, -produce a correct file by merging the good parts of two or more damaged -copies, extract data from damaged files, decompress files and test -integrity of files. +@uref{http://www.nongnu.org/lzip/lziprecover.html,,Lziprecover} +is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files, produce a correct file by merging the good parts of two or more +damaged copies, reproduce a missing (zeroed) sector using a reference file, +extract data from damaged files, decompress files, and test integrity of +files. Lziprecover can remove the damaged members from multimember files, for example multimember tar.lz archives. -Lziprecover provides random access to the data in multimember files; it -only decompresses the members containing the desired data. +Lziprecover provides random access to the data in multimember files; it only +decompresses the members containing the desired data. -Lziprecover facilitates the management of metadata stored as trailing -data in lzip files. +Lziprecover facilitates the management of metadata stored as trailing data +in lzip files. Lziprecover is not a replacement for regular backups, but a last line of defense for the case where the backups are also damaged. -The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: @itemize @bullet @item The lzip format provides very safe integrity checking and some data -recovery means. The lziprecover program can repair bit flip errors (one -of the most common forms of data corruption) in lzip files, and provides -data recovery capabilities, including error-checked merging of damaged -copies of a file. @xref{Data safety}. +recovery means. 
The program lziprecover can repair bit flip errors +(one of the most common forms of data corruption) in lzip files, and +provides data recovery capabilities, including error-checked merging +of damaged copies of a file. @xref{Data safety}. @item The lzip format is as simple as possible (but not simpler). The lzip @@ -105,43 +108,47 @@ Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @end itemize -A nice feature of the lzip format is that a corrupt byte is easier to -repair the nearer it is from the beginning of the file. Therefore, with -the help of lziprecover, losing an entire archive just because of a -corrupt byte near the beginning is a thing of the past. +A nice feature of the lzip format is that a corrupt byte is easier to repair +the nearer it is from the beginning of the file. Therefore, with the help of +lziprecover, losing an entire archive just because of a corrupt byte near +the beginning is a thing of the past. -For compressible data, multiple lzip-compressed copies have a better -chance of surviving intact than one uncompressed copy using the same -amount of storage space. +Compression may be good for long-term archiving. For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. This is specially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. -Lziprecover is able to recover or decompress files produced by any of -the compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip -and pdlzip. +Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family; lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. 
-If the cause of file corruption is damaged media, the combination -@w{GNU ddrescue + lziprecover} is the best option for recovering data from -multiple damaged copies. @xref{ddrescue-example}, for an example. +If the cause of file corruption is a damaged medium, the combination +@w{GNU ddrescue + lziprecover} is the recommended option for recovering data +from damaged lzip files. @xref{ddrescue-example}, and +@ref{ddrescue-example2}, for examples. -If a file is too damaged for lziprecover to repair it, all the -recoverable data in all members of the file can be extracted with the -following command (the resulting file may contain errors and some -garbage data may be produced at the end of each member): +If a file is too damaged for lziprecover to repair it, all the recoverable +data in all members of the file can be extracted with the following command +(the resulting file may contain errors and some garbage data may be produced +at the end of each member): @example -lziprecover -D0 -i -o file -q file.lz +lziprecover -cd -i file.lz > file @end example When recovering data, lziprecover takes as arguments the names of the damaged files and writes zero or more recovered files depending on the -operation selected and whether the recovery succeeded or not. The -damaged files themselves are kept unchanged. +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. -When decompressing or testing file integrity, lziprecover behaves like -lzip or lunzip. +When decompressing or testing file integrity, lziprecover behaves like lzip +or lunzip. -LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never -have been compressed. Decompressed is used to refer to data which have -undergone the process of decompression. +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never have +been compressed. Decompressed is used to refer to data which have undergone +the process of decompression. 
@node Invoking lziprecover @@ -158,11 +165,17 @@ lziprecover [@var{options}] [@var{files}] @end example @noindent -When decompressing or testing, @samp{-} used as a @var{file} argument -means standard input. It can be mixed with other @var{files} and is read -just once, the first time it appears in the command line. - -lziprecover supports the following options: +When decompressing or testing, a hyphen @samp{-} used as a @var{file} +argument means standard input. It can be mixed with other @var{files} and is +read just once, the first time it appears in the command line. If no file +names are specified, lziprecover decompresses from standard input to +standard output. + +lziprecover supports the following +@uref{http://www.nongnu.org/arg-parser/manual/arg_parser_manual.html#Argument-syntax,,options}: +@ifnothtml +@xref{Argument syntax,,,arg_parser}. +@end ifnothtml @table @code @item -h @@ -201,14 +214,16 @@ lzma-alone file as follows: @item -c @itemx --stdout -Write decompressed data to standard output; keep input files unchanged. -This option is needed when reading from a named pipe (fifo) or from a -device. Use it also to recover as much of the decompressed data as -possible when decompressing a corrupt file. +Write decompressed data to standard output; keep input files unchanged. This +option (or @samp{-o}) is needed when reading from a named pipe (fifo) or +from a device. Use it also to recover as much of the decompressed data as +possible when decompressing a corrupt file. @samp{-c} overrides @samp{-o}, +but @samp{-c} has no effect when merging, removing members, repairing, +reproducing, splitting, testing or listing. @item -d @itemx --decompress -Decompress the specified files. If a file does not exist or can't be +Decompress the files specified. If a file does not exist or can't be opened, lziprecover continues decompressing the rest of the files. 
If a file fails to decompress, or is a terminal, lziprecover exits immediately without decompressing the rest of the files. @@ -216,19 +231,46 @@ decompressing the rest of the files. @item -D @var{range} @itemx --range-decompress=@var{range} Decompress only a range of bytes starting at decompressed byte position -@samp{@var{begin}} and up to byte position @w{@samp{@var{end} - 1}}. -Byte positions start at 0. This option provides random access to the -data in multimember files; it only decompresses the members containing -the desired data. In order to guarantee the correctness of the data -produced, all members containing any part of the desired data are -decompressed and their integrity is verified. +@var{begin} and up to byte position @w{@var{end} - 1}. Byte positions start +at 0. This option provides random access to the data in multimember files; +it only decompresses the members containing the desired data. In order to +guarantee the correctness of the data produced, all members containing any +part of the desired data are decompressed and their integrity is verified. +@anchor{range-format} Four formats of @var{range} are recognized, @samp{@var{begin}}, @samp{@var{begin}-@var{end}}, @samp{@var{begin},@var{size}}, and -@samp{,@var{size}}. If only @var{begin} is specified, @var{end} is taken -as the end of the file. If only @var{size} is specified, @var{begin} is -taken as the beginning of the file. The produced bytes are sent to -standard output unless the @samp{--output} option is used. +@samp{,@var{size}}. If only @var{begin} is specified, @var{end} is taken as +the end of the file. If only @var{size} is specified, @var{begin} is taken +as the beginning of the file. The bytes produced are sent to standard output +unless the option @samp{--output} is used. + +@anchor{--reproduce} +@item -e +@itemx --reproduce +Try to recover a missing (zeroed) sector in @var{file} using a reference +file and the same version of lzip that created @var{file}. 
If successful, a +repaired copy is written to the file @samp{@var{file}_fixed.lz}. @var{file} +is not modified at all. The exit status is 0 if the member containing the +zeroed sector could be repaired, 2 otherwise. Note that +@samp{@var{file}_fixed.lz} may still contain errors in the members following +the one repaired. @xref{Reproducing one sector}, for a complete description +of the reproduce mode. + +@item --lzip-level=@var{digit}|a|m[@var{length}] +Try only the given compression level or match length limit when reproducing +a zeroed sector. @samp{--lzip-level=a} tries all the compression levels +@w{(0 to 9)}, while @samp{--lzip-level=m} tries all the match length limits +@w{(5 to 273)}. + +@item --lzip-name=@var{name} +Set the name of the lzip executable used by @samp{--reproduce}. If +@samp{--lzip-name} is not specified, @samp{lzip} is used. + +@item --reference-file=@var{file} +Set the reference file used by @samp{--reproduce}. It must contain the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. @item -f @itemx --force @@ -236,14 +278,21 @@ Force overwrite of output files. @item -i @itemx --ignore-errors -Make @samp{--range-decompress} ignore errors and continue decompressing -the remaining members in the file. For example, -@w{@samp{lziprecover -D0 -i file.lz > file}} decompresses all the -recoverable data in all members of @samp{file.lz} without having to -split it first. - -Make @samp{--list}, @samp{--dump}, @samp{--remove} and @samp{--strip} -ignore format errors. +Make @samp{--decompress}, @samp{--test}, and @samp{--range-decompress} +ignore format and data errors and continue decompressing the remaining +members in the file; keep input files unchanged. 
For example, the commands +@w{@samp{lziprecover -cd -i file.lz > file}} or +@w{@samp{lziprecover -D0 -i file.lz > file}} decompress all the recoverable +data in all members of @samp{file.lz} without having to split it first. The +@w{@samp{-cd -i}} method resyncs to the next member header after each error, +and is immune to some format errors that make @w{@samp{-D0 -i}} fail. The +range decompressed may be smaller than the range requested, because of the +errors. + +Make @samp{--list}, @samp{--dump}, @samp{--remove}, and @samp{--strip} +ignore format errors. The sizes of the members with errors (specially the +last) may be wrong. The exit status is set to 0 unless other errors are +found (I/O errors, for example). @item -k @itemx --keep @@ -251,21 +300,21 @@ Keep (don't delete) input files during decompression. @item -l @itemx --list -Print the uncompressed size, compressed size and percentage saved of the -specified files. Trailing data are ignored. The values produced are -correct even for multimember files. If more than one file is given, a -final line containing the cumulative sizes is printed. With @samp{-v}, -the dictionary size, the number of members in the file, and the amount -of trailing data (if any) are also printed. With @samp{-vv}, the -positions and sizes of each member in multimember files are also -printed. With @samp{-i}, format errors are ignored, and with -@samp{-ivv}, gaps between members are shown. The member numbers shown -coincide with the file numbers produced by @samp{--split}. +Print the uncompressed size, compressed size, and percentage saved of the +files specified. Trailing data are ignored. The values produced are correct +even for multimember files. If more than one file is given, a final line +containing the cumulative sizes is printed. With @samp{-v}, the dictionary +size, the number of members in the file, and the amount of trailing data (if +any) are also printed. 
With @samp{-vv}, the positions and sizes of each +member in multimember files are also printed. With @samp{-i}, format errors +are ignored, and with @samp{-ivv}, gaps between members are shown. The +member numbers shown coincide with the file numbers produced by +@samp{--split}. @samp{-lq} can be used to verify quickly (without decompressing) the -structural integrity of the specified files. (Use @samp{--test} to -verify the data integrity). @samp{-alq} additionally verifies that none -of the specified files contain trailing data. +structural integrity of the files specified. (Use @samp{--test} to verify +the data integrity). @samp{-alq} additionally verifies that none of the +files specified contain trailing data. @item -m @itemx --merge @@ -277,51 +326,51 @@ description of the merge mode. @item -o @var{file} @itemx --output=@var{file} -Place the output into @samp{@var{file}} instead of into -@samp{@var{file}_fixed.lz}. If splitting, the names of the files -produced are in the form @samp{rec01@var{file}}, @samp{rec02@var{file}}, -etc. If decompressing from standard input and @samp{--stdout} has not -been specified, use @samp{@var{file}} as the name of the decompressed -file. If converting a lzma-alone file from standard input and -@samp{--stdout} has not been specified, use @samp{@var{file}.lz} as the -name of the converted file. (Or plain @samp{@var{file}} if it already -ends in @samp{.lz} or @samp{.tlz}). +Place the output into @var{file} instead of into @samp{@var{file}_fixed.lz}. +If splitting, the names of the files produced are in the form +@samp{rec01@var{file}}, @samp{rec02@var{file}}, etc. + +If decompressing, or converting lzma-alone files, and @samp{-c} has not been +also specified, write the decompressed or converted output to @var{file}; +keep input files unchanged. This option (or @samp{-c}) is needed when +reading from a named pipe (fifo) or from a device. @w{@samp{-o -}} is +equivalent to @samp{-c}. @samp{-o} has no effect when testing or listing. 
@item -q @itemx --quiet Quiet operation. Suppress all messages. +@anchor{--repair} @item -R @itemx --repair -Try to repair a file with small errors (up to one single-byte error per -member). If successful, a repaired copy is written to the file -@samp{@var{file}_fixed.lz}. @samp{@var{file}} is not modified at all. -The exit status is 0 if the file could be repaired, 2 otherwise. -@xref{Repairing files}, for a complete description of the repair mode. +Try to repair a @var{file} with small errors (up to one single-byte error +per member). If successful, a repaired copy is written to the file +@samp{@var{file}_fixed.lz}. @var{file} is not modified at all. The exit +status is 0 if the file could be repaired, 2 otherwise. @xref{Repairing one +byte}, for a complete description of the repair mode. @item -s @itemx --split -Search for members in @samp{@var{file}} and write each member in its own -file. Gaps between members are detected and each gap is saved in its own -file. Trailing data (if any) are saved alone in the last file. You can -then use @samp{lziprecover -t} to test the integrity of the resulting -files, decompress those which are undamaged, and try to repair or -partially decompress those which are damaged. Gaps may contain garbage -or may be members with corrupt headers or trailers. If other lziprecover -functions fail to work on a multimember @var{file} because of damage in -headers or trailers, try to split @var{file} and then work on each -member individually. +Search for members in @var{file} and write each member in its own file. Gaps +between members are detected and each gap is saved in its own file. Trailing +data (if any) are saved alone in the last file. You can then use +@w{@samp{lziprecover -t}} to test the integrity of the resulting files, +decompress those which are undamaged, and try to repair or partially +decompress those which are damaged. Gaps may contain garbage or may be +members with corrupt headers or trailers. 
If other lziprecover functions +fail to work on a multimember @var{file} because of damage in headers or +trailers, try to split @var{file} and then work on each member individually. The names of the files produced are in the form @samp{rec01@var{file}}, -@samp{rec02@var{file}}, etc, and are designed so that the use of -wildcards in subsequent processing, for example, @w{@samp{lziprecover --cd rec*@var{file} > recovered_data}}, processes the files in the -correct order. The number of digits used in the names varies depending -on the number of members in @samp{@var{file}}. +@samp{rec02@var{file}}, etc, and are designed so that the use of wildcards +in subsequent processing, for example, +@w{@samp{lziprecover -cd rec*@var{file} > recovered_data}}, processes the +files in the correct order. The number of digits used in the names varies +depending on the number of members in @var{file}. @item -t @itemx --test -Check integrity of the specified files, but don't decompress them. This +Check integrity of the files specified, but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with @samp{-v} to see information about the files. If a file fails the test, does not exist, can't be opened, or is a terminal, lziprecover @@ -342,7 +391,7 @@ In other modes, increasing verbosity levels show final status, progress of operations, and extra information (for example, the failed areas). @item --loose-trailing -When decompressing, testing or listing, allow trailing data whose first +When decompressing, testing, or listing, allow trailing data whose first bytes are so similar to the magic bytes of a lzip header that they can be confused with a corrupt header. Use this option if a file triggers a "corrupt header" error and the cause is not indeed a corrupt header. @@ -350,7 +399,7 @@ be confused with a corrupt header. 
Use this option if a file triggers a @item --dump=[@var{member_list}][:damaged][:tdata] Dump the members listed, the damaged members (if any), or the trailing data (if any) of one or more regular multimember files to standard -output, or to a file if the @samp{--output} option is used. If more than +output, or to a file if the option @samp{--output} is used. If more than one file is given, the elements dumped from all files are concatenated. If a file does not exist, can't be opened, or is not regular, lziprecover continues processing the rest of the files. If the dump @@ -367,12 +416,12 @@ member in the file (r1). Negated versions of both kinds of lists exist (^1,3-6:r^1,3-6) which selects all the members except those in the list. The strings "damaged" and "tdata" select the damaged members and the trailing data respectively. If the same member is selected more than -once, for example by @code{1:r1} in a single-member file, it is dumped +once, for example by @samp{1:r1} in a single-member file, it is dumped just once. See the following examples: @multitable {@code{3,12:damaged:tdata}} {members 3, 12, damaged members, trailing data} @headitem @code{--dump} argument @tab Elements dumped -@item @code{1,3-6} @tab members 1, 3, 4, 5 and 6 +@item @code{1,3-6} @tab members 1, 3, 4, 5, 6 @item @code{r1-3} @tab last 3 members in file @item @code{^13,15} @tab all but 13th and 15th members in file @item @code{r^1} @tab all but last member in file @@ -389,10 +438,9 @@ data (if any) from regular multimember files in place. The date of each file is preserved if possible. If all members in a file are selected to be removed, the file is left unchanged and the exit status is set to 2. If a file does not exist, can't be opened, is not regular, or is left -unchanged, lziprecover continues processing the rest of the files. In -case of I/O error, lziprecover exits immediately without processing the -rest of the files. See @samp{--dump} above for a description of the -argument. 
+unchanged, lziprecover continues processing the rest of the files. In case +of I/O error, lziprecover exits immediately without processing the rest of +the files. See @samp{--dump} above for a description of the argument. This option may be dangerous even if only the trailing data is being removed because the file may be corrupt or the trailing data may contain @@ -404,7 +452,7 @@ removal of trailing data. @item --strip=[@var{member_list}][:damaged][:tdata] Copy one or more regular multimember files to standard output (or to a -file if the @samp{--output} option is used), stripping the members +file if the option @samp{--output} is used), stripping the members listed, the damaged members (if any), or the trailing data (if any) from each file. If all members in a file are selected to be stripped, the trailing data (if any) are also stripped even if @samp{tdata} is not @@ -418,6 +466,84 @@ above for a description of the argument. @end table +Lziprecover also supports the following debug options (for experts): + +@table @code +@item -E @var{range}[,@var{sector_size}] +@itemx --debug-reproduce=@var{range}[,@var{sector_size}] +Load the compressed @var{file} into memory, set all bytes in the positions +specified by @var{range} to 0, and try to reproduce a correct compressed +file. @xref{--reproduce}. @xref{range-format}, for a description of +@var{range}. If a @var{sector_size} is specified, set each sector to 0 in +sequence and try to reproduce the file, printing to standard output final +statistics of the number of sectors reproduced successfully. Exit with +nonzero status only in case of fatal error. + +@item -M +@itemx --md5sum +Print to standard output the MD5 digests of the input @var{files} one per +line in the same format produced by the @command{md5sum} tool. Lziprecover +uses MD5 digests to verify the result of some operations. This option allows +the verification of lziprecover's implementation of the MD5 algorithm. 
+ +@item -S[@var{value}] +@itemx --nrep-stats[=@var{value}] +Compare the frequency of sequences of N repeated bytes of a given +@var{value} in the compressed LZMA streams of the input @var{files} with the +frequency expected for random data (1 / 2^(8N)). If @var{value} is not +specified, print the frequency of repeated sequences of all possible byte +values. Print cumulative data for all files followed by the name of the +first file with the longest sequence. + +@item -U +@itemx --unzcrash +Test 1-bit errors in the LZMA stream of the input @var{file} like the +command @w{@samp{unzcrash -b1 -p7 -s-20 'lzip -t' @var{file}}} but in +memory, and therefore much faster. @xref{Unzcrash}. This option tests all +the members independently in a multimember file, skipping headers and +trailers. If a decompression succeeds, the decompressed output is compared +with the original decompressed output of @var{file} using MD5 digests. The +compressed @var{file} must not contain errors and must decompress correctly +for the comparisons to work. + +By default @samp{--unzcrash} only prints the interesting cases; CRC +mismatches, size mismatches, unsupported marker codes, unexpected EOFs, +apparently successful decompressions, and decoder errors detected 50_000 or +more bytes beyond the byte being tested. At verbosity level 1 (-v) it also +prints decoder errors detected 10_000 or more bytes beyond the byte being +tested. At verbosity level 2 (-vv) it prints all cases. + +@item -W @var{position},@var{value} +@itemx --debug-decompress=@var{position},@var{value} +Load the compressed @var{file} into memory, set the byte at @var{position} +to @var{value}, and decompress the modified compressed data to standard +output. 
+ +@item -X[@var{position},@var{value}] +@itemx --show-packets[=@var{position},@var{value}] +Load the compressed @var{file} into memory, optionally set the byte at +@var{position} to @var{value}, decompress the modified compressed data +(discarding the output), and print to standard output descriptions of the +LZMA packets being decoded. + +@item -Y @var{range} +@itemx --debug-delay=@var{range} +Load the compressed @var{file} into memory and then repeatedly decompress +it, increasing 256 times each byte of the subset of the compressed data +positions specified by @var{range}, so as to test all possible one-byte +errors. For each decompression error find the error detection delay and +print to standard output the maximum delay. The error detection delay is the +difference between the position of the error and the position where the +decoder realized that the data contains an error. @xref{range-format}, for a +description of @var{range}. + +@item -Z @var{position},@var{value} +@itemx --debug-repair=@var{position},@var{value} +Load the compressed @var{file} into memory, set the byte at @var{position} +to @var{value}, and then try to repair the error. @xref{--repair}. + +@end table + Numbers given as arguments to options may be followed by a multiplier and an optional @samp{B} for "byte". @@ -446,48 +572,117 @@ caused lziprecover to panic. @chapter Protecting data from accidental loss @cindex data safety +It is a fact of life that sometimes data will become corrupt. Software has +errors. Hardware may misbehave or fail. RAM may be struck by a cosmic ray. +This is why a safe enough integrity checking is needed in compressed +formats, and the reason why a data recovery tool is sometimes needed. + There are 3 main types of data corruption that may cause data loss: single-byte errors, multibyte errors (generally affecting a whole sector in a block device), and total device failure. 
-Lziprecover protects natively against single-byte errors -(@pxref{Repairing files}), as long as file integrity is checked -frequently enough that a second single-byte error does not develop in -the same member before the first one is repaired. +Lziprecover protects natively against single-byte errors as long as file +integrity is checked frequently enough that a second single-byte error does +not develop in the same member before the first one is repaired. +@xref{Repairing one byte}. + +Lziprecover also protects against multibyte errors if at least one backup +copy of the file is made (@pxref{Merging files}), or if the error is a +zeroed sector and the uncompressed data corresponding to the zeroed sector +are available (@pxref{Reproducing one sector}). If you can choose between +merging and reproducing, try merging first because it is usually faster, +easier to use, and has a high probability of success. + +Lziprecover can't help in case of device failure. The only remedy for total +device failure is storing backup copies in separate media. -Lziprecover also protects against multibyte errors (@pxref{Merging -files}), if at least one backup copy of the file is made. +The extraordinary safety of the lzip format allows lziprecover to exploit +the redundance that occurs naturally when making compressed backups. +Lziprecover can recover data that would not be recoverable from files +compressed in other formats. Let's see two examples of how much better is +lzip compared with gzip and bzip2 with respect to data safety: + +@menu +* Merging with a backup:: Recovering a file using a damaged backup +* Reproducing a mailbox:: Recovering new messages using an old backup +@end menu -The only remedy for total device failure is storing backup copies in -separate media. -How does lzip compare with gzip and bzip2 with respect to data safety? -Let's suppose that you made a backup of your valuable scientific data, -compressed it, and stored two copies on separate media. 
Years later you -notice that both copies are corrupt. +@node Merging with a backup +@section Recovering a file using a damaged backup +@cindex merging with a backup -If you compressed with gzip and both copies suffer any damage in the -data stream, even if it is just one altered bit, the original data can +Let's suppose that you made a compressed backup of your valuable scientific +data and stored two copies on separate media. Years later you notice that +both copies are corrupt. + +If you compressed the data with gzip and both copies suffer any damage in +the data stream, even if it is just one altered bit, the original data can only be recovered by an expert, if at all. -If you used bzip2, and if the file is large enough to contain more than -one compressed data block (usually larger than @w{900 kB} uncompressed), -and if no block is damaged in both files, then the data can be manually -recovered by splitting the files with bzip2recover, verifying every -block and then copying the right blocks in the right order into another -file. +If you used bzip2, and if the file is large enough to contain more than one +compressed data block (usually larger than @w{900 kB} uncompressed), and if +no block is damaged in both files, then the data can be manually recovered +by splitting the files with bzip2recover, verifying every block, and then +copying the right blocks in the right order into another file. + +But if you used lzip, the data can be automatically recovered with +@w{@samp{lziprecover --merge}} as long as the damaged areas don't overlap. + +Note that each error in a bzip2 file makes a whole block unusable, but each +error in a lzip file only affects the damaged bytes, making it possible to +recover a file with thousands of errors. + + +@node Reproducing a mailbox +@section Recovering new messages using an old backup +@cindex reproducing a mailbox -But if you used lzip, the data can be automatically recovered as long as -the damaged areas don't overlap. 
+Let's suppose that you make periodic backups of your email messages stored +in one or more mailboxes. (A mailbox is a file containing a possibly large +number of email messages). New messages are appended to the end of each +mailbox, therefore the initial part of two consecutive backups is identical +unless some messages have been changed or deleted in the meantime. The new +messages added to each backup are usually a small part of the whole mailbox. -Note that each error in a bzip2 file makes a whole block unusable, but -each error in a lzip file only affects the damaged bytes, making it -possible to recover a file with thousands of errors. +@verbatim ++========================================================+ +| Older backup containing some messages | ++========================================================+ ++========================================================+================+ +| Newer backup containing the messages above plus some | new messages | ++========================================================+================+ +@end verbatim + +One day you discover that your mailbox has disappeared because you deleted +it inadvertently or because of a bug in your email reader. Not only that. +You need to recover a recent message, but the last backup you made of the +mailbox (the newer backup above) has lost the data corresponding to a whole +sector because of an I/O error in the part containing the old messages. + +If you compressed the mailbox with gzip, usually none of the new messages +can be recovered even if they are intact because all the data beyond the +missing sector can't be decoded. + +If you used bzip2, and if the newer backup is large enough that the new +messages are in a different compressed data block than the one damaged +(usually larger than @w{900 kB} uncompressed), then you can recover the new +messages manually with bzip2recover. 
If the backups are identical except for +the new messages appended, you may even recover the whole newer backup by +combining the good blocks from both backups. + +But if you used lzip, the whole newer backup can be automatically recovered +with @w{@samp{lziprecover --reproduce}} as long as the missing bytes can be +recovered from the older backup, even if other messages in the common part +have been changed or deleted. Mailboxes seem to be especially easy to +reproduce. The probability of reproducing a mailbox +(@pxref{performance-of-reproduce}) is almost as high as that of merging two +identical backups (@pxref{performance-of-merge}). -@node Repairing files -@chapter Repairing files -@cindex repairing files +@node Repairing one byte +@chapter Repairing one byte +@cindex repairing one byte Lziprecover can repair perfectly most files with small errors (up to one single-byte error per member), without the need of any extra redundance @@ -495,6 +690,9 @@ at all. If the reparation is successful, the repaired file will be identical bit for bit to the original. This makes lzip files resistant to bit flip, one of the most common forms of data corruption. +The file is repaired in memory. Therefore, enough virtual memory +@w{(RAM + swap)} to contain the largest damaged member is required. + The error may be located anywhere in the file except in the first 5 bytes of each member header or in the @samp{Member size} field of the trailer (last 8 bytes of each member). If the error is in the header it @@ -510,13 +708,12 @@ One byte may seem small, but most file corruptions not produced by transmission errors or I/O errors just affect one byte, or even one bit, of the file. Also, unlike magnetic media, where errors usually affect a whole sector, solid-state storage devices tend to produce single-byte -errors, making of lzip the perfect format for data stored on such -devices. +errors, making of lzip the perfect format for data stored on such devices. 
Repairing a file can take some time. Small files or files with the error located near the beginning can be repaired in a few seconds. But repairing a large file compressed with a large dictionary size and with -the error located far from the beginning, can take hours. +the error located far from the beginning, may take hours. On the other hand, errors located near the beginning of the file cause much more loss of data than errors located near the end. So lziprecover @@ -528,7 +725,7 @@ repairs more efficiently the worst errors. @cindex merging files If you have several copies of a file but all of them are too damaged to -repair them (@pxref{Repairing files}), lziprecover can try to produce a +repair them (@pxref{Repairing one byte}), lziprecover can try to produce a correct file by merging the good parts of the damaged copies. The merge may succeed even if some copies of the file have all the @@ -550,13 +747,14 @@ with the other copies: ddrescue -s<correct_size> -x<correct_size> file.lz correct_size_file.lz @end example -To give you an idea of its possibilities, when merging two copies, each -of them with one damaged area affecting 1 percent of the copy, the -probability of obtaining a correct file is about 98 percent. With three -such copies the probability rises to 99.97 percent. For large files (a -few MB) with small errors (one sector damaged per copy), the probability -approaches 100 percent even with only two copies. (Supposing that the -errors are randomly located inside each copy). +@anchor{performance-of-merge} +To give you an idea of its possibilities, when merging two copies, each of +them with one damaged area affecting 1 percent of the copy, the probability +of obtaining a correct file is about 98 percent. With three such copies the +probability rises to 99.97 percent. For large files (a few MB) with small +errors (one sector damaged per copy), the probability approaches 100 percent +even with only two copies. 
(Supposing that the errors are randomly located +inside each copy). Some types of solid-state device (NAND flash, for example) can produce bursts of scattered single-bit errors. Lziprecover is able to merge @@ -572,7 +770,7 @@ single-bit errors scattered in an area of 1020 bytes, and the other had correct file, identical to the original, in just 5 seconds: @example -$ lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz +lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz Merging member 1 of 1 (2552 errors) 2552 errors have been grouped in 16 clusters. Trying variation 2 of 2, block 2 @@ -583,61 +781,324 @@ Note that the number of errors reported by lziprecover (2552) is lower than the number of corrupt bytes (3104) because contiguous corrupt bytes are counted as a single multibyte error. +@sp 1 +@anchor{ddrescue-example} +@noindent +Example 1: Recover a compressed backup from two copies on CD-ROM with +error-checked merging of copies. +@ifnothtml +@xref{Top,GNU ddrescue manual,,ddrescue}, +@end ifnothtml +@ifhtml +See the +@uref{http://www.gnu.org/software/ddrescue/manual/ddrescue_manual.html,,ddrescue manual} +@end ifhtml +for details about ddrescue. + +@example +ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 +mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage +cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz +umount /mnt/cdimage + (insert second copy in the CD drive) +ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 +mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage +cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz +umount /mnt/cdimage +lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz + Input files merged successfully. 
+lziprecover -tv backup.tar.lz + backup.tar.lz: ok +@end example + +@sp 1 +@noindent +Example 2: Recover the first volume of those created with the command +@w{@samp{lzip -b 32MiB -S 650MB big_db}} from two copies, +@samp{big_db1_00001.lz} and @samp{big_db2_00001.lz}, with member 07 +damaged in the first copy, member 18 damaged in the second copy, and +member 12 damaged in both copies. The correct file produced is saved in +@samp{big_db_00001.lz}. + +@example +lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz + Input files merged successfully. +lziprecover -tv big_db_00001.lz + big_db_00001.lz: ok +@end example + + +@node Reproducing one sector +@chapter Reproducing one sector +@cindex reproducing one sector + +Lziprecover can recover a zeroed sector in a lzip file by concatenating the +decompressed contents of the file up to the beginning of the zeroed sector +and the uncompressed data corresponding to the zeroed sector, and then +feeding the concatenated data to the same version of lzip that created the +file. For this to work, a reference file is required containing the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. It is possible to +recover a large file using just a few KB of reference data. + +The difficult part is finding a suitable reference file. It must contain the +exact data required (possibly mixed with other data). Containing similar +data is not enough. + +A zeroed sector may be caused by the incomplete recovery of a damaged +storage device (with I/O errors) using, for example, ddrescue. The +reproduction can't be done if the zeroed sector overlaps with the first 15 +bytes of a member, or if the zeroed sector is smaller than 8 bytes. + +The file is reproduced in memory. Therefore, enough virtual memory +@w{(RAM + swap)} to contain the damaged member is required. 
+ +To understand how it works, take any lzipped file, say @samp{foo.lz}, +decompress it (keeping the original), and try to reproduce an artificially +zeroed sector in it by running the following commands: + +@example +lzip -kd foo.lz +lziprecover -vv --debug-reproduce=65536,512 --reference-file=foo foo.lz +@end example + +@noindent +which should produce an output like the following: + +@example +Reproducing: foo.lz +Reference file: foo +Testing sectors of size 512 at file positions 65536 to 66047 + (master mpos = 65536, dpos = 296892) +foo: Match found at offset 296892 +Reproduction succeeded at pos 65536 + + 1 sectors tested + 1 reproductions returned with zero status + all comparisons passed +@end example + +Using @samp{foo} as reference file guarantees that any zeroed sector in +@samp{foo.lz} can be reproduced because both files contain the same data. In +real use, the reference file needs to contain the data corresponding to the +zeroed sector, but the rest of the data (if any) may differ between both +files. The reference data may be obtained from the partial decompression of +the damaged file itself if it contains repeated data. For example if the +damaged file is a compressed tarball containing several partially modified +versions of the same file. + +The offset reported by lziprecover is the position in the reference file of +the first byte that could not be decompressed. This is the first byte that +will be compressed to reproduce the zeroed sector. + +The reproduce mode tries to reproduce the missing compressed data originally +present in the zeroed sector. It is based on the perfect reproducibility of +lzip files (lzip produces identical compressed output from identical input). +Therefore, the same version of lzip that created the file to be reproduced +should be used to reproduce the zeroed sector. Near versions may also work +because the output of lzip changes infrequently. 
If reproducing a tar.lz +archive created with tarlz, the version of lzip, clzip, or minilzip +corresponding to the version of the lzlib library used by tarlz to create +the archive should be used. + +When recovering a tar.lz archive and using as reference a file from the +filesystem, if the zeroed sector encodes (part of) a tar header, the archive +can't be reproduced. Therefore, the less overhead (smaller headers) a tar +archive has, the more probable it is that the zeroed sector does not include a +header, and that the archive can be reproduced. The tarlz format has minimum +overhead. It uses basic ustar headers, and only adds extended pax headers +when they are required. + +@anchor{performance-of-reproduce} +@section Performance of @samp{--reproduce} +Reproduce mode is especially useful when recovering a corrupt backup (or a +corrupt source tarball) that is part of a series. Usually only a small +fraction of the data changes from one backup to the next or from one version +of a source tarball to the next. This sometimes makes it possible to reproduce +a given corrupted version using reference data from a near version. The +following two tables show the fraction of reproducible sectors (reproducible +sectors divided by total sectors in archive) for some archives, using sector +sizes of 512 and 4096 bytes. @samp{mailbox-aug.tar.lz} is a backup of some +of my mailboxes. 
@samp{backup-feb.tar.lz} and @samp{backup-apr.tar.lz} are +real backups of my own working directory: + +@multitable {Reference file} {gawk-5.0.1.tar.lz} {4369 / 5844 = 74.76%} +@headitem Reference file @tab File @tab Reproducible (512) +@item backup-feb.tar @tab backup-apr.tar.lz @tab 3273 / 4342 = 75.38% +@item backup-apr.tar @tab backup-feb.tar.lz @tab 3259 / 4161 = 78.32% +@item gawk-5.0.0.tar @tab gawk-5.0.1.tar.lz @tab 4369 / 5844 = 74.76% +@item gawk-5.0.1.tar @tab gawk-5.0.0.tar.lz @tab 4379 / 5603 = 78.15% +@item gmp-6.1.1.tar @tab gmp-6.1.2.tar.lz @tab 2454 / 3787 = 64.8% +@item gmp-6.1.2.tar @tab gmp-6.1.1.tar.lz @tab 2461 / 3782 = 65.07% +@end multitable + +@multitable {mailbox-mar.tar} {mailbox-aug.tar.lz} {4036 / 4252 = 94.92%} +@headitem Reference file @tab File @tab Reproducible (4096) +@item mailbox-mar.tar @tab mailbox-aug.tar.lz @tab 4036 / 4252 = 94.92% +@item backup-feb.tar @tab backup-apr.tar.lz @tab 264 / 542 = 48.71% +@item backup-apr.tar @tab backup-feb.tar.lz @tab 264 / 520 = 50.77% +@item gawk-5.0.0.tar @tab gawk-5.0.1.tar.lz @tab 327 / 730 = 44.79% +@item gawk-5.0.1.tar @tab gawk-5.0.0.tar.lz @tab 326 / 700 = 46.57% +@item gmp-6.1.1.tar @tab gmp-6.1.2.tar.lz @tab 175 / 473 = 37% +@item gmp-6.1.2.tar @tab gmp-6.1.1.tar.lz @tab 181 / 472 = 38.35% +@end multitable + +Note that the "performance of reproduce" is a probability, not a partial +recovery. The data is either fully recovered (with the probability X shown +in the last column of the tables above) or not recovered at all (with +probability @w{1 - X}). + +Example 1: Recover a damaged source tarball with a zeroed sector of 512 +bytes at file position 1019904, using as reference another source tarball +for a different version of the software. 
+ +@example +lziprecover -vv -e --reference-file=gmp-6.1.1.tar gmp-6.1.2.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 512, value = 0x00) + (master mpos = 1019904, dpos = 6292134) +warning: gmp-6.1.1.tar: Partial match found at offset 6277798, len 8716. +Reference data may be mixed with other data. +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +@sp 1 +@anchor{ddrescue-example2} +@noindent +Example 2: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a previous backup. The damaged +backup comes from a damaged partition copied with ddrescue. + +@example +ddrescue -b4096 -r10 /dev/sdc1 hdimage mapfile +mount -o loop,ro hdimage /mnt/hdimage +cp /mnt/hdimage/backup.tar.lz backup.tar.lz +umount /mnt/hdimage +lzip -t backup.tar.lz + backup.tar.lz: Decoder error at pos 1020530 +lziprecover -vv -e --reference-file=old_backup.tar backup.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) +warning: old_backup.tar: Partial match found at offset 5743778, len 9546. +Reference data may be mixed with other data. +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +@sp 1 +@noindent +Example 3: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a file from the filesystem. (If +the zeroed sector encodes (part of) a tar header, the tarball can't be +reproduced). + +@example +# List the contents of the backup tarball to locate the damaged member. +tarlz -n0 -tvf backup.tar.lz + [...] + example.txt +tarlz: Skipping to next header. +tarlz: backup.tar.lz: Archive ends unexpectedly. +# Find in the filesystem the last file listed and use it as reference. 
+lziprecover -vv -e --reference-file=/somedir/example.txt backup.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) +/somedir/example.txt: Match found at offset 9378 +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +If @samp{backup.tar.lz} is a multimember file with more than one member +damaged and lziprecover shows the message @samp{One member reproduced. Copy +of input file still contains errors.}, the procedure shown in the example +above can be repeated until all the members have been reproduced. + +@samp{tarlz --keep-damaged -n0 -xf backup.tar.lz example.txt} produces a +partial copy of the reference file @samp{example.txt} that may help locate a +complete copy in the filesystem or in another backup, even if +@samp{example.txt} has been renamed. + @node Tarlz @chapter Options supporting the tar.lz format @cindex tarlz -@uref{http://www.nongnu.org/lzip/manual/tarlz_manual.html,,Tarlz} is an -implementation of the tar archiver which by default creates archives -compressed with lzip on a per file basis. Tarlz can append files to the -end of such compressed archives because each tar member is compressed in -its own lzip member, as well as the end-of-file blocks. Thus tarlz -archives are multimember lzip files, which has some safety advantages -over solidly compressed tar.lz archives. For example, in case of -corruption, tarlz can extract all the undamaged members from the tar.lz -archive, skipping over the damaged members, just like the standard -(uncompressed) tar. In this chapter we'll explain the ways in which -lziprecover can recover and process multimember tar.lz archives. 
+@uref{http://www.nongnu.org/lzip/manual/tarlz_manual.html,,Tarlz} is a +massively parallel (multi-threaded) combined implementation of the tar +archiver and the +@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html,,lzip} compressor. + +Tarlz creates tar archives using a simplified and safer variant of the POSIX +pax format compressed in lzip format, keeping the alignment between tar +members and lzip members. The resulting multimember tar.lz archive is fully +backward compatible with standard tar tools like GNU tar, which treat it +like any other tar.lz archive. @ifnothtml -@xref{Top,tarlz manual,,tarlz}. +@xref{Top,tarlz manual,,tarlz}, and @ref{Top,lzip manual,,lzip}. @end ifnothtml +Multimember tar.lz archives have some safety advantages over solidly +compressed tar.lz archives. For example, in case of corruption, tarlz can +extract all the undamaged members from the tar.lz archive, skipping over the +damaged members, just like the standard (uncompressed) tar. Keeping the +alignment between tar members and lzip members minimizes the amount of data +lost in case of corruption. In this chapter we'll explain the ways in which +lziprecover can recover and process multimember tar.lz archives. + @sp 1 @section Recovering damaged multimember tar.lz archives -If you have several copies of the damaged archive, try merging -them first because merging has a high probability of success. If the -command below prints something like -@w{@code{Input files merged successfully.}} you are done and -@code{archive.tar.lz} now contains the recovered archive: +If you have several copies of the damaged archive, try merging them first +because merging has a high probability of success. @xref{Merging files}. 
If +the command below prints something like +@w{@samp{Input files merged successfully.}} you are done and +@samp{archive.tar.lz} now contains the recovered archive: @example lziprecover -m -v -o archive.tar.lz a/archive.tar.lz b/archive.tar.lz @end example -If you only have one copy of the damaged archive, you may try to repair -the archive, but this has a lower probability of success. If the command -below prints something like -@w{@code{Copy of input file repaired successfully.}} you are done and -@code{archive_fixed.tar.lz} now contains the recovered archive: +If you only have one copy of the damaged archive with a zeroed block of data +caused by an I/O error, you may try to reproduce the archive. +@xref{Reproducing one sector}. If the command below prints something like +@w{@samp{Copy of input file reproduced successfully.}} you are done and +@samp{archive_fixed.tar.lz} now contains the recovered archive: + +@example +lziprecover -vv -e --reference-file=old_archive.tar archive.tar.lz +@end example + +If you only have one copy of the damaged archive, you may try to repair the +archive, but this has a lower probability of success. @xref{Repairing one +byte}. If the command below prints something like +@w{@samp{Copy of input file repaired successfully.}} you are done and +@samp{archive_fixed.tar.lz} now contains the recovered archive: @example lziprecover -v -R archive.tar.lz @end example -If all the above fails, you may save the damaged members for later and -then copy the good members to another archive. If the two commands below -succeed, @code{bad_members.tar.lz} will contain all the damaged members -and @code{archive_cleaned.tar.lz} will contain a good archive with the -damaged members removed: +If all the above fails, and the archive was created with tarlz, you may save +the damaged members for later and then copy the good members to another +archive. 
If the two commands below succeed, @samp{bad_members.tar.lz} will +contain all the damaged members and @samp{archive_cleaned.tar.lz} will +contain a good archive with the damaged members removed: @example lziprecover -v --dump=damaged -o bad_members.tar.lz archive.tar.lz lziprecover -v --strip=damaged -o archive_cleaned.tar.lz archive.tar.lz @end example -You can then use @code{tarlz --keep-damaged} to recover as much data as +You can then use @samp{tarlz --keep-damaged} to recover as much data as possible from each damaged member in @samp{bad_members.tar.lz}: @example @@ -651,22 +1112,22 @@ tarlz --keep-damaged -xvf ../bad_members.tar.lz Lziprecover is able to copy a list of members from a file to another. For example the command -@w{@code{lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz}} -creates a subset archive containing the first ten members, the -end-of-file blocks, and the trailing data (if any) of -@code{archive.tar.lz}. The @code{r1} part selects the last member, which -in an appendable tar.lz archive contains the end-of-file blocks. +@w{@samp{lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz}} +creates a subset archive containing the first ten members, the end-of-file +blocks, and the trailing data (if any) of @samp{archive.tar.lz}. The +@samp{r1} part selects the last member, which in an appendable tar.lz +archive contains the end-of-file blocks. @node File names @chapter Names of the files produced by lziprecover @cindex file names -The name of the fixed file produced by @samp{--merge} and -@samp{--repair} is made by appending the string @samp{_fixed.lz} to the -original file name. If the original file name ends with one of the -extensions @samp{.tar.lz}, @samp{.lz} or @samp{.tlz}, the string -@samp{_fixed} is inserted before the extension. +The name of the fixed file produced by @samp{--merge} and @samp{--repair} is +made by appending the string @samp{_fixed.lz} to the original file name. 
If +the original file name ends with one of the extensions @samp{.tar.lz}, +@samp{.lz}, or @samp{.tlz}, the string @samp{_fixed} is inserted before the +extension. @node File format @@ -679,6 +1140,7 @@ when there is no longer anything to take away.@* @sp 1 In the diagram below, a box like this: + @verbatim +---+ | | <-- the vertical bars might be missing @@ -686,6 +1148,7 @@ In the diagram below, a box like this: @end verbatim represents one byte; a box like this: + @verbatim +==============+ | | @@ -700,6 +1163,7 @@ The members simply appear one after another in the file, with no additional information before, between, or after them. Each member has the following structure: + @verbatim +--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ID string | VN | DS | LZMA stream | CRC32 | Data size | Member size | @@ -718,8 +1182,7 @@ Just in case something needs to be modified in the future. 1 for now. @item DS (coded dictionary size, 1 byte) The dictionary size is calculated by taking a power of 2 (the base size) -and subtracting from it a fraction between 0/16 and 7/16 of the base -size.@* +and subtracting from it a fraction between 0/16 and 7/16 of the base size.@* Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract from the base size to obtain the dictionary size.@* @@ -727,8 +1190,8 @@ Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* Valid values for dictionary size range from 4 KiB to 512 MiB. @item LZMA stream -The LZMA stream, finished by an end of stream marker. Uses default -values for encoder properties. +The LZMA stream, finished by an end of stream marker. Uses default values +for encoder properties. @ifnothtml @xref{Stream format,,,lzip}, @end ifnothtml @@ -739,7 +1202,7 @@ See for a complete description. @item CRC32 (4 bytes) -CRC of the uncompressed original data. 
+Cyclic Redundancy Check (CRC) of the uncompressed original data. @item Data size (8 bytes) Size of the uncompressed original data. @@ -836,9 +1299,18 @@ lziprecover --strip=tdata file.lz | sha256sum -c \ @chapter A small tutorial with examples @cindex examples -Example 1: Restore a regular file from its compressed version -@samp{file.lz}. If the operation is successful, @samp{file.lz} is -removed. +Example 1: Extract all the files from archive @samp{foo.tar.lz}. + +@example + tar -xf foo.tar.lz +or + lziprecover -cd foo.tar.lz | tar -xf - +@end example + +@sp 1 +@noindent +Example 2: Restore a regular file from its compressed version +@samp{file.lz}. If the operation is successful, @samp{file.lz} is removed. @example lziprecover -d file.lz @@ -846,8 +1318,8 @@ lziprecover -d file.lz @sp 1 @noindent -Example 2: Verify the integrity of the compressed file @samp{file.lz} -and show status. +Example 3: Verify the integrity of the compressed file @samp{file.lz} and +show status. @example lziprecover -tv file.lz @@ -856,8 +1328,8 @@ lziprecover -tv file.lz @sp 1 @anchor{concat-example} @noindent -Example 3: The right way of concatenating the decompressed output of two -or more compressed files. @xref{Trailing data}. +Example 4: The right way of concatenating the decompressed output of two or +more compressed files. @xref{Trailing data}. @example Don't do this @@ -872,7 +1344,7 @@ Or keeping the trailing data of the last file like this @sp 1 @noindent -Example 4: Decompress @samp{file.lz} partially until @w{10 KiB} of +Example 5: Decompress @samp{file.lz} partially until @w{10 KiB} of decompressed data are produced. @example @@ -881,8 +1353,8 @@ lziprecover -D 0,10KiB file.lz @sp 1 @noindent -Example 5: Decompress @samp{file.lz} partially from decompressed byte -10000 to decompressed byte 15000 (5000 bytes are produced). +Example 6: Decompress @samp{file.lz} partially from decompressed byte at +offset 10000 to decompressed byte at offset 14999 (5000 bytes are produced). 
@example lziprecover -D 10000-15000 file.lz @@ -890,8 +1362,8 @@ lziprecover -D 10000-15000 file.lz @sp 1 @noindent -Example 6: Repair small errors in the file @samp{file.lz}. (Indented -lines are abridged diagnostic messages from lziprecover). +Example 7: Repair small errors in the file @samp{file.lz}. (Indented lines +are abridged diagnostic messages from lziprecover). @example lziprecover -v -R file.lz @@ -903,126 +1375,74 @@ mv file_fixed.lz file.lz @sp 1 @noindent -Example 7: Split the multimember file @samp{file.lz} and write each -member in its own @samp{recXXXfile.lz} file. Then use -@w{@samp{lziprecover -t}} to test the integrity of the resulting files. +Example 8: Split the multimember file @samp{file.lz} and write each member +in its own @samp{recXXXfile.lz} file. Then use @w{@samp{lziprecover -t}} to +test the integrity of the resulting files. @example lziprecover -s file.lz lziprecover -tv rec*file.lz @end example -@sp 1 -@anchor{ddrescue-example} -@noindent -Example 8: Recover a compressed backup from two copies on CD-ROM with -error-checked merging of copies. -@ifnothtml -@xref{Top,GNU ddrescue manual,,ddrescue}, -@end ifnothtml -@ifhtml -See the -@uref{http://www.gnu.org/software/ddrescue/manual/ddrescue_manual.html,,ddrescue manual} -@end ifhtml -for details about ddrescue. - -@example -ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 -mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage -cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz -umount /mnt/cdimage - (insert second copy in the CD drive) -ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 -mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage -cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz -umount /mnt/cdimage -lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz - Input files merged successfully. 
-lziprecover -tv backup.tar.lz - backup.tar.lz: ok -@end example - -@sp 1 -@noindent -Example 9: Recover the first volume of those created with the command -@w{@samp{lzip -b 32MiB -S 650MB big_db}} from two copies, -@samp{big_db1_00001.lz} and @samp{big_db2_00001.lz}, with member 07 -damaged in the first copy, member 18 damaged in the second copy, and -member 12 damaged in both copies. The correct file produced is saved in -@samp{big_db_00001.lz}. - -@example -lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz - Input files merged successfully. -lziprecover -tv big_db_00001.lz - big_db_00001.lz: ok -@end example - @node Unzcrash @chapter Testing the robustness of decompressors @cindex unzcrash -The lziprecover package also includes unzcrash, a program written to -test robustness to decompression of corrupted data, inspired by -unzcrash.c from Julian Seward's bzip2. Type @samp{make unzcrash} in the -lziprecover source directory to build it. - -By default, unzcrash reads the specified file and then repeatedly -decompresses it, increasing 256 times each byte of the compressed data, -so as to test all possible one-byte errors. Note that it may take years -or even centuries to test all possible one-byte errors in a large file -(tens of MB). - -If the @code{--block} option is given, unzcrash reads the specified file -and then repeatedly decompresses it, setting all bytes in each -successive block to the value given, so as to test all possible full -sector errors. - -If the @code{--truncate} option is given, unzcrash reads the specified -file and then repeatedly decompresses it, truncating the file to -increasing lengths, so as to test all possible truncation points. - -None of the three test modes described above should cause any invalid -memory accesses. If any of them does, please, report it as a bug to the -maintainers of the decompressor being tested. 
- -Unzcrash really executes as a subprocess the shell command specified in -the first non-option argument, and then writes the file specified in the -second non-option argument to the standard input of the subprocess, -modifying the corresponding byte each time. Therefore unzcrash can be -used to test any decompressor (not only lzip), or even other decoder -programs having a suitable command line syntax. - -If the decompressor returns with zero status, unzcrash compares the -output of the decompressor for the original and corrupt files. If the -outputs differ, it means that the decompressor returned a false -negative; it failed to recognize the corruption and produced garbage -output. The only exception is when a multimember file is truncated just -after the last byte of a member, producing a shorter but valid -compressed file. Except in this latter case, please, report any false -negative as a bug. +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type @samp{make unzcrash} in the lziprecover source +directory to build it. + +By default, unzcrash reads the file specified and then repeatedly +decompresses it, increasing 256 times each byte of the compressed data, so +as to test all possible one-byte errors. Note that it may take years or even +centuries to test all possible one-byte errors in a large file (tens of MB). + +If the option @samp{--block} is given, unzcrash reads the file specified and +then repeatedly decompresses it, setting all bytes in each successive block +to the value given, so as to test all possible full sector errors. + +If the option @samp{--truncate} is given, unzcrash reads the file specified +and then repeatedly decompresses it, truncating the file to increasing +lengths, so as to test all possible truncation points. + +None of the three test modes described above should cause any invalid memory +accesses. 
If any of them does, please, report it as a bug to the maintainers +of the decompressor being tested. + +Unzcrash really executes as a subprocess the shell command specified in the +first non-option argument, and then writes the file specified in the second +non-option argument to the standard input of the subprocess, modifying the +corresponding byte each time. Therefore unzcrash can be used to test any +decompressor (not only lzip), or even other decoder programs having a +suitable command line syntax. + +If the decompressor returns with zero status, unzcrash compares the output +of the decompressor for the original and corrupt files. If the outputs +differ, it means that the decompressor returned a false negative; it failed +to recognize the corruption and produced garbage output. The only exception +is when a multimember file is truncated just after the last byte of a +member, producing a shorter but valid compressed file. Except in this latter +case, please, report any false negative as a bug. In order to compare the outputs, unzcrash needs a @samp{zcmp} program able to understand the format being tested. For example the @samp{zcmp} provided -by @samp{zutils}. +by @uref{http://www.nongnu.org/zutils/manual/zutils_manual.html#Zcmp,,zutils}. +Use @samp{--zcmp=false} to disable comparisons. @ifnothtml @xref{Zcmp,,,zutils}. @end ifnothtml -@ifhtml -See -@uref{http://www.nongnu.org/zutils/manual/zutils_manual.html#Zcmp,,zcmp}. -@end ifhtml The format for running unzcrash is: @example -unzcrash [@var{options}] 'lzip -t' @var{file}.lz +unzcrash [@var{options}] 'lzip -t' @var{file} @end example @noindent -@var{file}.lz must not contain errors and must be correctly decompressed -by the decompressor being tested for the comparisons to work. +The compressed @var{file} must not contain errors and the decompressor being +tested must decompress it correctly for the comparisons to work. 
unzcrash supports the following options: @@ -1045,45 +1465,44 @@ original value in the bit position N.@* The number of N-bit errors per byte (N = 1 to 8) is: @w{8 28 56 70 56 28 8 1} -@multitable {Examples of @var{range}} {1, 2, 3, 5, 6, 7 and 8} -@item Examples of @var{range} @tab Tests errors of N-bit +@multitable {Examples of @var{range}} {Tests errors of N-bits} +@item Examples of @var{range} @tab Tests errors of N-bits @item 1 @tab 1 -@item 1,2,3 @tab 1, 2 and 3 -@item 2-4 @tab 2, 3 and 4 -@item 1,3-5,8 @tab 1, 3, 4, 5 and 8 -@item 1-3,5-8 @tab 1, 2, 3, 5, 6, 7 and 8 +@item 1,2,3 @tab 1, 2, 3 +@item 2-4 @tab 2, 3, 4 +@item 1,3-5,8 @tab 1, 3, 4, 5, 8 +@item 1-3,5-8 @tab 1, 2, 3, 5, 6, 7, 8 @end multitable @item -B[@var{size}][,@var{value}] @itemx --block[=@var{size}][,@var{value}] -Test block errors of given @var{size}, simulating a whole sector I/O -error. Block @var{size} defaults to 512 bytes. @var{value} defaults to -0. By default, only blocks aligned to a @var{size}-byte boundary are -tested, but this may be changed with the @code{--delta} option. +Test block errors of given @var{size}, simulating a whole sector I/O error. +@var{size} defaults to 512 bytes. @var{value} defaults to 0. By default, +only contiguous, non-overlapping blocks are tested, but this may be changed +with the option @samp{--delta}. @item -d @var{n} @itemx --delta=@var{n} -Test only one byte, block, or truncation size every @var{n} bytes, -instead of all of them. If the @code{--block} option is given, @var{n} -defaults to the block size. Else @var{n} defaults to 1. Values of -@var{n} smaller than the block size will result in overlappinng blocks. -(Which is convenient for testing because there are usually too few -non-overlappinng blocks in a file). +Test one byte, block, or truncation size every @var{n} bytes. If +@samp{--delta} is not specified, unzcrash tests all the bytes, +non-overlapping blocks, or truncation sizes. 
Values of @var{n} smaller than +the block size will result in overlapping blocks. (Which is convenient for +testing because there are usually too few non-overlapping blocks in a file). @item -e @var{position},@var{value} @itemx --set-byte=@var{position},@var{value} Set byte at @var{position} to @var{value} in the internal buffer after -reading and testing @var{file}.lz but before the first test call to the -decompressor. If @var{value} is preceded by @samp{+}, it is added to the -original value of the byte at @var{position}. If @var{value} is preceded -by @samp{f} (flip), it is XORed with the original value of the byte at -@var{position}. This option can be used to run tests with a changed -dictionary size, for example. +reading and testing @var{file} but before the first test call to the +decompressor. Byte positions start at 0. If @var{value} is preceded by +@samp{+}, it is added to the original value of the byte at @var{position}. +If @var{value} is preceded by @samp{f} (flip), it is XORed with the original +value of the byte at @var{position}. This option can be used to run tests +with a changed dictionary size, for example. @item -n @itemx --no-verify -Skip initial verification of @var{file}.lz and @samp{zcmp}. May speed up -things a lot when testing many (or large) known good files. +Skip initial verification of @var{file} and @samp{zcmp}. May speed up things +a lot when testing many (or large) known good files. @item -p @var{bytes} @itemx --position=@var{bytes} @@ -1097,13 +1516,13 @@ Quiet operation. Suppress all messages. @item -s @var{bytes} @itemx --size=@var{bytes} Number of byte positions to test. If not specified, the rest of the file -is tested (from @code{--position} to end of file). Negative values are +is tested (from @samp{--position} to end of file). Negative values are relative to the rest of the file. @item -t @itemx --truncate Test all possible truncation points in the range specified by -@code{--position} and @code{--size}. 
+@samp{--position} and @samp{--size}. @item -v @itemx --verbose @@ -1111,11 +1530,11 @@ Verbose mode. @item -z @itemx --zcmp=<command> -Set zcmp command name and options. Defaults to @code{zcmp}. Use -@code{--zcmp=false} to disable comparisons. If testing a decompressor +Set zcmp command name and options. Defaults to @samp{zcmp}. Use +@samp{--zcmp=false} to disable comparisons. If testing a decompressor different from the one used by default by zcmp, it is needed to force unzcrash and zcmp to use the same decompressor with a command like -@w{@code{unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' @var{file}.lz}} +@w{@samp{unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' @var{file}}} @end table @@ -1137,7 +1556,7 @@ for all eternity, if not longer. If you find a bug in lziprecover, please send electronic mail to @email{lzip-bug@@nongnu.org}. Include the version number, which you can -find by running @w{@code{lziprecover --version}}. +find by running @w{@samp{lziprecover --version}}. @node Concept index diff --git a/dump_remove.cc b/dump_remove.cc index 7bbe829..d33551f 100644 --- a/dump_remove.cc +++ b/dump_remove.cc @@ -1,22 +1,23 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
- This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 +#include <algorithm> #include <cerrno> #include <cstdio> #include <cstring> @@ -37,14 +38,15 @@ int dump_members( const std::vector< std::string > & filenames, const std::string & default_output_filename, const Member_list & member_list, const bool force, bool ignore_errors, bool ignore_trailing, - const bool loose_trailing, const bool strip ) + const bool loose_trailing, const bool strip, + const bool to_stdout ) { - if( default_output_filename.empty() ) outfd = STDOUT_FILENO; + if( to_stdout || default_output_filename.empty() ) outfd = STDOUT_FILENO; else { output_filename = default_output_filename; set_signal_handler(); - if( !open_outstream( force, true, false, false ) ) return 1; + if( !open_outstream( force, false, false, false ) ) return 1; } unsigned long long copied_size = 0, stripped_size = 0; unsigned long long copied_tsize = 0, stripped_tsize = 0; @@ -61,15 +63,15 @@ int dump_members( const std::vector< std::string > & filenames, from_stdin ? "(stdin)" : filenames[i].c_str(); struct stat in_stats; // not used const int infd = from_stdin ? 
STDIN_FILENO : - open_instream( input_filename, &in_stats, true, true ); - if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, ignore_errors, ignore_errors ); if( lzip_index.retval() != 0 ) { show_file_error( input_filename, lzip_index.error().c_str() ); - if( retval < lzip_index.retval() ) retval = lzip_index.retval(); + set_retval( retval, lzip_index.retval() ); close( infd ); continue; } @@ -131,7 +133,7 @@ int dump_members( const std::vector< std::string > & filenames, else if( trailing_size > 0 ) { stripped_tsize += trailing_size; ++tfiles; } close( infd ); } - if( close_outstream( 0 ) != 0 && retval < 1 ) retval = 1; + if( close_outstream( 0 ) != 0 ) set_retval( retval, 1 ); if( verbosity >= 1 ) { if( !strip ) @@ -173,20 +175,20 @@ int remove_members( const std::vector< std::string > & filenames, { const char * const filename = filenames[i].c_str(); struct stat in_stats, dummy_stats; - const int infd = open_instream( filename, &in_stats, true, true ); - if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } + const int infd = open_instream( filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, ignore_errors, ignore_errors ); if( lzip_index.retval() != 0 ) { show_file_error( filename, lzip_index.error().c_str() ); - if( retval < lzip_index.retval() ) retval = lzip_index.retval(); + set_retval( retval, lzip_index.retval() ); close( infd ); continue; } const int fd = open_truncable_stream( filename, &dummy_stats ); - if( fd < 0 ) { close( infd ); if( retval < 1 ) retval = 1; continue; } + if( fd < 0 ) { close( infd ); set_retval( retval, 1 ); continue; } if( !safe_seek( infd, 0 ) ) return 1; const long blocks = lzip_index.blocks( false ); // not counting tdata @@ -206,7 
+208,7 @@ int remove_members( const std::vector< std::string > & filenames, ( !safe_seek( infd, prev_end ) || !safe_seek( fd, stream_pos ) || !copy_file( infd, fd, mb.pos() - prev_end ) ) ) - { error = true; if( retval < 1 ) retval = 1; break; } + { error = true; set_retval( retval, 1 ); break; } stream_pos += mb.pos() - prev_end; } else ++members; @@ -216,7 +218,7 @@ int remove_members( const std::vector< std::string > & filenames, if( !in && member_list.damaged ) { if( !safe_seek( infd, mb.pos() ) ) - { error = true; if( retval < 1 ) retval = 1; break; } + { error = true; set_retval( retval, 1 ); break; } in = ( test_member_from_file( infd, mb.size() ) != 0 ); // damaged } if( !in ) @@ -225,7 +227,7 @@ int remove_members( const std::vector< std::string > & filenames, ( !safe_seek( infd, mb.pos() ) || !safe_seek( fd, stream_pos ) || !copy_file( infd, fd, mb.size() ) ) ) - { error = true; if( retval < 1 ) retval = 1; break; } + { error = true; set_retval( retval, 1 ); break; } stream_pos += mb.size(); } else ++members; @@ -233,7 +235,7 @@ int remove_members( const std::vector< std::string > & filenames, if( error ) { close( fd ); close( infd ); break; } if( stream_pos == 0 ) // all members were removed { show_file_error( filename, "All members would be removed, skipping." 
); - close( fd ); close( infd ); if( retval < 2 ) retval = 2; + close( fd ); close( infd ); set_retval( retval, 2 ); members = prev_members; continue; } const long long cdata_size = lzip_index.cdata_size(); if( cdata_size > stream_pos ) @@ -248,7 +250,7 @@ int remove_members( const std::vector< std::string > & filenames, ( !safe_seek( infd, cdata_size ) || !safe_seek( fd, stream_pos ) || !copy_file( infd, fd, trailing_size ) ) ) - { close( fd ); close( infd ); if( retval < 1 ) retval = 1; break; } + { close( fd ); close( infd ); set_retval( retval, 1 ); break; } stream_pos += trailing_size; } else { removed_tsize += trailing_size; ++tfiles; } @@ -261,12 +263,12 @@ int remove_members( const std::vector< std::string > & filenames, if( result != 0 ) { show_file_error( filename, "Can't truncate file", errno ); - close( fd ); close( infd ); if( retval < 1 ) retval = 1; break; + close( fd ); close( infd ); set_retval( retval, 1 ); break; } if( close( fd ) != 0 || close( infd ) != 0 ) { show_file_error( filename, "Error closing file", errno ); - if( retval < 1 ) { retval = 1; } break; + set_retval( retval, 1 ); break; } struct utimbuf t; t.actime = in_stats.st_atime; @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
- This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 @@ -36,11 +36,11 @@ void list_line( const unsigned long long uncomp_size, const char * const input_filename ) { if( uncomp_size > 0 ) - std::printf( "%15llu %15llu %6.2f%% %s\n", uncomp_size, comp_size, + std::printf( "%14llu %14llu %6.2f%% %s\n", uncomp_size, comp_size, 100.0 - ( ( 100.0 * comp_size ) / uncomp_size ), input_filename ); else - std::printf( "%15llu %15llu -INF%% %s\n", uncomp_size, comp_size, + std::printf( "%14llu %14llu -INF%% %s\n", uncomp_size, comp_size, input_filename ); } @@ -55,6 +55,7 @@ int list_files( const std::vector< std::string > & filenames, int files = 0, retval = 0; bool first_post = true; bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) { const bool from_stdin = ( filenames[i] == "-" ); @@ -63,8 +64,8 @@ int list_files( const std::vector< std::string > & filenames, from_stdin ? "(stdin)" : filenames[i].c_str(); struct stat in_stats; // not used const int infd = from_stdin ? 
STDIN_FILENO : - open_instream( input_filename, &in_stats, true, true ); - if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, ignore_errors, ignore_errors ); @@ -72,7 +73,7 @@ int list_files( const std::vector< std::string > & filenames, if( lzip_index.retval() != 0 ) { show_file_error( input_filename, lzip_index.error().c_str() ); - if( retval < lzip_index.retval() ) retval = lzip_index.retval(); + set_retval( retval, lzip_index.retval() ); continue; } if( verbosity >= 0 ) @@ -80,39 +81,35 @@ int list_files( const std::vector< std::string > & filenames, const unsigned long long udata_size = lzip_index.udata_size(); const unsigned long long cdata_size = lzip_index.cdata_size(); total_comp += cdata_size; total_uncomp += udata_size; ++files; + const long members = lzip_index.members(); if( first_post ) { first_post = false; if( verbosity >= 1 ) std::fputs( " dict memb trail ", stdout ); - std::fputs( " uncompressed compressed saved name\n", stdout ); + std::fputs( " uncompressed compressed saved name\n", stdout ); } if( verbosity >= 1 ) - { - unsigned dictionary_size = 0; - for( long i = 0; i < lzip_index.members(); ++i ) - dictionary_size = - std::max( dictionary_size, lzip_index.dictionary_size( i ) ); - const long long trailing_size = lzip_index.file_size() - cdata_size; - std::printf( "%s %5ld %6lld ", format_ds( dictionary_size ), - lzip_index.members(), trailing_size ); - } + std::printf( "%s %5ld %6lld ", + format_ds( lzip_index.dictionary_size() ), members, + lzip_index.file_size() - cdata_size ); list_line( udata_size, cdata_size, input_filename ); - if( verbosity >= 2 && lzip_index.members() > 1 ) + if( verbosity >= 2 && ( members > 1 || + ( members == 1 && lzip_index.mblock( 0 ).pos() > 0 ) ) ) { - std::fputs( " member data_pos data_size member_pos member_size\n", 
stdout ); + std::fputs( " member data_pos data_size member_pos member_size\n", stdout ); long long prev_end = 0; - for( long i = 0, gaps = 0; i < lzip_index.members(); ++i ) + for( long i = 0, gaps = 0; i < members; ++i ) { const Block & db = lzip_index.dblock( i ); const Block & mb = lzip_index.mblock( i ); if( mb.pos() > prev_end ) { - std::printf( " gap - - %15llu %15llu\n", + std::printf( " gap - - %14llu %14llu\n", prev_end, mb.pos() - prev_end ); ++gaps; } - std::printf( "%5ld %15llu %15llu %15llu %15llu\n", + std::printf( "%6ld %14llu %14llu %14llu %14llu\n", i + gaps + 1, db.pos(), db.size(), mb.pos(), mb.size() ); prev_end = mb.end(); } diff --git a/lunzcrash.cc b/lunzcrash.cc new file mode 100644 index 0000000..b07b748 --- /dev/null +++ b/lunzcrash.cc @@ -0,0 +1,250 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "md5.h" +#include "mtester.h" +#include "lzip_index.h" + + +namespace { + +bool verify_member( const uint8_t * const mbuffer, const long long msize, + const unsigned dictionary_size, const char * const name, + uint8_t digest[16] ) + { + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, &md5sum ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { show_file_error( name, "Error verifying input file." ); return false; } + md5sum.md5_finish( digest ); + return true; + } + + +bool compare_member( const uint8_t * const mbuffer, const long long msize, + const unsigned dictionary_size, + const long long byte_pos, const uint8_t digest[16] ) + { + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, &md5sum ); + bool error = ( mtester.test_member() != 0 || !mtester.finished() ); + if( !error ) + { + uint8_t new_digest[16]; + md5sum.md5_finish( new_digest ); + if( std::memcmp( digest, new_digest, 16 ) != 0 ) error = true; + } + if( error && verbosity >= 0 ) + std::printf( "byte %llu comparison failed\n", byte_pos ); + return !error; + } + + +int test_member_rest( const LZ_mtester & master, long * const failure_posp, + const unsigned long long byte_pos ) + { + LZ_mtester mtester( master ); + mtester.duplicate_buffer(); + int result = mtester.test_member( LLONG_MAX, LLONG_MAX, stdout, byte_pos ); + if( result == 0 && !mtester.finished() ) result = -1; + if( result != 0 ) *failure_posp = mtester.member_position(); + return result; + } + + +long next_pct_pos( const Lzip_index & lzip_index, const int i, const int pct ) + { + if( pct <= 0 ) return 0; + const long long cdata_size = lzip_index.cdata_size(); + const long long mpos = 
lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + long long pct_pos = (long long)( cdata_size / ( 100.0 / pct ) ); + + if( pct_pos <= mpos ) pct_pos = 0; + else if( pct_pos == cdata_size ) pct_pos = msize - 21; // 100% + else if( pct_pos >= mpos + msize ) pct_pos = msize; + else pct_pos -= mpos; + return pct_pos; + } + +} // end namespace + + +/* Test 1-bit errors in LZMA streams in file. + Unless verbosity >= 1, print only the bytes with interesting results. */ +int lunzcrash( const std::string & input_filename ) + { + struct stat in_stats; // not used + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + if( verbosity >= 2 ) printf( "Testing file '%s'\n", input_filename.c_str() ); + + const long long cdata_size = lzip_index.cdata_size(); + long positions = 0, decompressions = 0, successes = 0, failed_comparisons = 0; + int pct = ( cdata_size >= 1000 && isatty( STDERR_FILENO ) ) ? 
0 : 100; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) return 1; + uint8_t md5_orig[16]; + if( !verify_member( mbuffer, msize, dictionary_size, + input_filename.c_str(), md5_orig ) ) return 2; + long pct_pos = next_pct_pos( lzip_index, i, pct ); + long pos = Lzip_header::size + 1, printed = 0; // last pos printed + const long end = msize - 20; + if( verbosity == 0 ) // give a clue of the range being tested + std::printf( "Testing bytes %llu to %llu\n", mpos + pos, mpos + end - 1 ); + LZ_mtester master( mbuffer, msize, dictionary_size ); + for( ; pos < end; ++pos ) + { + const long pos_limit = pos - 16; + if( pos_limit > 0 && master.test_member( pos_limit ) != -1 ) + { show_error( "Can't advance master." ); return 1; } + if( verbosity >= 0 && pos >= pct_pos ) + { std::fprintf( stderr, "\r%3u%% done\r", pct ); ++pct; + pct_pos = next_pct_pos( lzip_index, i, pct ); } + if( verbosity >= 1 ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + ++positions; + const uint8_t byte = mbuffer[pos]; + for( uint8_t mask = 1; mask != 0; mask <<= 1 ) + { + ++decompressions; + mbuffer[pos] ^= mask; + long failure_pos = 0; + const int result = test_member_rest( master, &failure_pos, + ( printed < pos ) ? 
mpos + pos : 0 ); + if( result == 0 ) + { + ++successes; + if( verbosity >= 0 ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + std::printf( "0x%02X (0x%02X^0x%02X) passed the test\n", + mbuffer[pos], byte, mask ); + } + if( !compare_member( mbuffer, msize, dictionary_size, mpos + pos, + md5_orig ) ) ++failed_comparisons; + } + else if( result == 1 ) + { + if( verbosity >= 2 || + ( verbosity >= 1 && failure_pos - pos >= 10000 ) || + ( verbosity >= 0 && failure_pos - pos >= 50000 ) ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + std::printf( "Decoder error at pos %llu\n", mpos + failure_pos ); + } + } + else if( result == 3 || result == 4 ) // test_member printed the error + { if( verbosity >= 0 && printed < pos ) printed = pos; } + else if( verbosity >= 0 ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + if( result == 2 ) + std::printf( "File ends unexpectedly at pos %llu\n", + mpos + failure_pos ); + else + std::printf( "Unknown error code '%d'\n", result ); + } + mbuffer[pos] ^= mask; + } + } + delete[] mbuffer; + } + + if( verbosity >= 0 ) + { + std::printf( "\n%8ld bytes tested\n%8ld total decompressions" + "\n%8ld decompressions returned with zero status", + positions, decompressions, successes ); + if( successes > 0 ) + { + if( failed_comparisons > 0 ) + std::printf( ", of which\n%8ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stdout ); + } + else std::fputc( '\n', stdout ); + } + return 0; + } + + +int md5sum_files( const std::vector< std::string > & filenames ) + { + int retval = 0; + bool stdin_used = false; + + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = filenames[i].c_str(); + struct stat in_stats; // 
not used + const int infd = from_stdin ? STDIN_FILENO : + open_instream( input_filename, &in_stats, false ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + enum { buffer_size = 16384 }; + uint8_t buffer[buffer_size], md5_digest[16]; + MD5SUM md5sum; + while( true ) + { + const int len = readblock( infd, buffer, buffer_size ); + if( len != buffer_size && errno ) throw Error( "Read error" ); + if( len > 0 ) md5sum.md5_update( buffer, len ); + if( len < buffer_size ) break; + } + md5sum.md5_finish( md5_digest ); + if( close( infd ) != 0 ) + { show_file_error( input_filename, "Error closing input file", errno ); + return 1; } + + for( int i = 0; i < 16; ++i ) std::printf( "%02x", md5_digest[i] ); + std::printf( " %s\n", input_filename ); + std::fflush( stdout ); + } + return retval; + } @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ class State @@ -77,7 +77,7 @@ inline int get_len_state( const int len ) { return std::min( len - min_match_len, len_states - 1 ); } inline int get_lit_state( const uint8_t prev_byte ) - { return ( prev_byte >> ( 8 - literal_context_bits ) ); } + { return prev_byte >> ( 8 - literal_context_bits ); } enum { bit_model_move_bits = 5, @@ -180,6 +180,15 @@ public: c = data[(c^buffer[i])&0xFF] ^ ( c >> 8 ); crc = c; } + + uint32_t compute_crc( const uint8_t * const buffer, + const long long size ) const + { + uint32_t crc = 0xFFFFFFFFU; + for( long long i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + return crc ^ 0xFFFFFFFFU; + } }; extern const CRC32 crc32; @@ -204,7 +213,7 @@ struct Lzip_header { uint8_t data[6]; // 0-3 magic bytes // 4 version - // 5 coded_dict_size + // 5 coded dictionary size enum { size = 6 }; void set_magic() { std::memcpy( data, lzip_magic, 4 ); data[4] = 1; } @@ -250,6 +259,10 @@ struct Lzip_header } return true; } + + bool verify( const bool ignore_bad_ds ) const + { return verify_magic() && verify_version() && + ( ignore_bad_ds || isvalid_ds( dictionary_size() ) ); } }; @@ -352,6 +365,8 @@ public: { return ( pos_ <= pos && end() > pos ); } bool overlaps( const Block & b ) const { return ( pos_ < b.end() && b.pos_ < end() ); } + bool overlaps( const long long pos, const long long size ) const + { return ( pos_ < pos + size && pos < end() ); } void shift( Block & b ) { ++size_; ++b.pos_; --b.size_; } Block split( const long long pos ); @@ -395,11 +410,12 @@ struct Error explicit Error( const char * const s ) : msg( s ) {} }; - inline unsigned long long positive_diff( const unsigned long 
long x, const unsigned long long y ) { return ( ( x > y ) ? x - y : 0 ); } +inline void set_retval( int & retval, const int new_val ) + { if( retval < new_val ) retval = new_val; } const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; const char * const bad_dict_msg = "Invalid dictionary size in member header."; @@ -410,15 +426,17 @@ const char * const trailing_msg = "Trailing data not allowed."; int alone_to_lz( const int infd, const Pretty_print & pp ); // defined in decoder.cc -long readblock( const int fd, uint8_t * const buf, const long size ); -long writeblock( const int fd, const uint8_t * const buf, const long size ); +long long readblock( const int fd, uint8_t * const buf, const long long size ); +long long writeblock( const int fd, const uint8_t * const buf, + const long long size ); // defined in dump_remove.cc int dump_members( const std::vector< std::string > & filenames, const std::string & default_output_filename, const Member_list & member_list, const bool force, bool ignore_errors, bool ignore_trailing, - const bool loose_trailing, const bool strip ); + const bool loose_trailing, const bool strip, + const bool to_stdout ); int remove_members( const std::vector< std::string > & filenames, const Member_list & member_list, bool ignore_errors, bool ignore_trailing, const bool loose_trailing ); @@ -432,7 +450,12 @@ int list_files( const std::vector< std::string > & filenames, int seek_read( const int fd, uint8_t * const buf, const int size, const long long pos ); +// defined in lunzcrash.cc +int lunzcrash( const std::string & input_filename ); +int md5sum_files( const std::vector< std::string > & filenames ); + // defined in main.cc +extern const char * const program_name; extern std::string output_filename; // global vars for output file extern int outfd; struct stat; @@ -440,10 +463,10 @@ const char * bad_version( const unsigned version ); const char * format_ds( const unsigned dictionary_size ); void show_header( const 
unsigned dictionary_size ); int open_instream( const char * const name, struct stat * const in_statsp, - const bool no_ofile, const bool reg_only = false ); + const bool one_to_one, const bool reg_only = false ); int open_truncable_stream( const char * const name, struct stat * const in_statsp ); -bool open_outstream( const bool force, const bool from_stdin, +bool open_outstream( const bool force, const bool protect, const bool rw = false, const bool skipping = true ); bool file_exists( const std::string & filename ); void cleanup_and_fail( const int retval ); @@ -456,7 +479,7 @@ void show_file_error( const char * const filename, const char * const msg, const int errcode = 0 ); void internal_error( const char * const msg ); void show_2file_error( const char * const msg1, const char * const name1, - const char * const name2, const char * const msg2 ); + const char * const name2, const char * const msg2 ); class Range_decoder; void show_dprogress( const unsigned long long cfile_size = 0, const unsigned long long partial_size = 0, @@ -470,9 +493,17 @@ int test_member_from_file( const int infd, const unsigned long long msize, long long * const failure_posp = 0 ); int merge_files( const std::vector< std::string > & filenames, const std::string & default_output_filename, - const bool force, const char terminator ); + const char terminator, const bool force ); + +// defined in nrep_stats.cc +int print_nrep_stats( const std::vector< std::string > & filenames, + const int repeated_byte, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ); // defined in range_dec.cc +const char * format_num( unsigned long long num, + unsigned long long limit = -1ULL, + const int set_prefix = 0 ); bool safe_seek( const int fd, const long long pos ); int range_decompress( const std::string & input_filename, const std::string & default_output_filename, @@ -481,9 +512,13 @@ int range_decompress( const std::string & input_filename, const bool to_stdout ); // 
defined in repair.cc +long long seek_write( const int fd, const uint8_t * const buf, + const long long size, const long long pos ); +uint8_t * read_member( const int infd, const long long mpos, + const long long msize ); int repair_file( const std::string & input_filename, const std::string & default_output_filename, - const bool force, const char terminator ); + const char terminator, const bool force ); int debug_delay( const std::string & input_filename, Block range, const char terminator ); int debug_repair( const std::string & input_filename, @@ -491,6 +526,19 @@ int debug_repair( const std::string & input_filename, int debug_decompress( const std::string & input_filename, const Bad_byte & bad_byte, const bool show_packets ); +// defined in reproduce.cc +int reproduce_file( const std::string & input_filename, + const std::string & default_output_filename, + const char * const lzip_name, + const char * const reference_filename, + const int lzip_level, const char terminator, + const bool force ); +int debug_reproduce_file( const std::string & input_filename, + const char * const lzip_name, + const char * const reference_filename, + const Block & range, const int sector_size, + const int lzip_level ); + // defined in split.cc int split_file( const std::string & input_filename, const std::string & default_output_filename, const bool force ); diff --git a/lzip_index.cc b/lzip_index.cc index f70307c..66eb30d 100644 --- a/lzip_index.cc +++ b/lzip_index.cc @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. 
+ This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #define _FILE_OFFSET_BITS 64 @@ -39,6 +39,18 @@ int seek_read( const int fd, uint8_t * const buf, const int size, } +bool Lzip_index::check_header_error( const Lzip_header & header, + const bool ignore_bad_ds ) + { + if( !header.verify_magic() ) + { error_ = bad_magic_msg; retval_ = 2; return true; } + if( !header.verify_version() ) + { error_ = bad_version( header.version() ); retval_ = 2; return true; } + if( !ignore_bad_ds && !isvalid_ds( header.dictionary_size() ) ) + { error_ = bad_dict_msg; retval_ = 2; return true; } + return false; + } + void Lzip_index::set_errno_error( const char * const msg ) { error_ = msg; error_ += std::strerror( errno ); @@ -53,22 +65,40 @@ void Lzip_index::set_num_error( const char * const msg, unsigned long long num ) retval_ = 2; } + +bool Lzip_index::read_header( const int fd, Lzip_header & header, + const long long pos ) + { + if( seek_read( fd, header.data, Lzip_header::size, pos ) != Lzip_header::size ) + { set_errno_error( "Error reading member header: " ); return false; } + return true; + } + +bool Lzip_index::read_trailer( const int fd, Lzip_trailer & trailer, + const long long pos ) + { + if( seek_read( fd, trailer.data, Lzip_trailer::size, + pos - Lzip_trailer::size ) != Lzip_trailer::size ) + { set_errno_error( "Error reading member trailer: " ); return false; } + return true; + } + + /* Skip backwards the gap or trailing data ending at pos. 'ignore_gaps' also ignores format errors and a truncated last member. If successful, push member preceding gap and set pos to member header. 
*/ -bool Lzip_index::skip_gap( const int fd, long long & pos, +bool Lzip_index::skip_gap( const int fd, unsigned long long & pos, const bool ignore_trailing, const bool loose_trailing, const bool ignore_bad_ds, const bool ignore_gaps ) { - enum { block_size = 16384, - buffer_size = block_size + Lzip_trailer::size - 1 + Lzip_header::size }; - uint8_t buffer[buffer_size]; if( pos < min_member_size ) { - if( pos >= 0 && ignore_gaps && !member_vector.empty() ) - { pos = 0; return true; } + if( ignore_gaps && !member_vector.empty() ) { pos = 0; return true; } return false; } + enum { block_size = 16384, + buffer_size = block_size + Lzip_trailer::size - 1 + Lzip_header::size }; + uint8_t buffer[buffer_size]; int bsize = pos % block_size; // total bytes in buffer if( bsize <= buffer_size - block_size ) bsize += block_size; int search_size = bsize; // bytes to search for trailer @@ -91,47 +121,55 @@ bool Lzip_index::skip_gap( const int fd, long long & pos, if( member_size > ipos + i || !trailer.verify_consistency() ) continue; Lzip_header header; - if( seek_read( fd, header.data, Lzip_header::size, - ipos + i - member_size ) != Lzip_header::size ) - { set_errno_error( "Error reading member header: " ); return false; } - const unsigned dictionary_size = header.dictionary_size(); - if( !header.verify_magic() || !header.verify_version() || - ( !ignore_bad_ds && !isvalid_ds( dictionary_size ) ) ) continue; - if( member_vector.empty() ) // trailing data or truncated member + if( !read_header( fd, header, ipos + i - member_size ) ) return false; + if( !header.verify( ignore_bad_ds ) ) continue; + const Lzip_header & header2 = *(const Lzip_header *)( buffer + i ); + const bool full_h2 = bsize - i >= Lzip_header::size; + if( header2.verify_prefix( bsize - i ) ) // next header { - const Lzip_header & last_header = *(const Lzip_header *)( buffer + i ); - if( last_header.verify_prefix( bsize - i ) ) + if( !ignore_gaps && member_vector.empty() ) // last member { - if( !ignore_gaps ) - 
{ error_ = "Last member in input file is truncated or corrupt."; - retval_ = 2; return false; } - const unsigned dictionary_size = - ( bsize - i >= Lzip_header::size ) ? - last_header.dictionary_size() : 0; - const unsigned long long member_size = pos - ( ipos + i ); - pos = ipos + i; - member_vector.push_back( Member( 0, 0, pos, - member_size, dictionary_size ) ); - return true; + if( !full_h2 ) error_ = "Last member in input file is truncated."; + else if( !check_header_error( header2, ignore_bad_ds ) ) + error_ = "Last member in input file is truncated or corrupt."; + retval_ = 2; return false; } + const unsigned dictionary_size = + full_h2 ? header2.dictionary_size() : 0; + const unsigned long long member_size = pos - ( ipos + i ); + pos = ipos + i; + // approximate data and member sizes for '-i -D' + member_vector.push_back( Member( 0, member_size, pos, + member_size, dictionary_size ) ); } if( !ignore_gaps && member_vector.empty() ) { - if( !loose_trailing && bsize - i >= Lzip_header::size && - (*(const Lzip_header *)( buffer + i )).verify_corrupt() ) + if( !loose_trailing && full_h2 && header2.verify_corrupt() ) { error_ = corrupt_mm_msg; retval_ = 2; return false; } if( !ignore_trailing ) { error_ = trailing_msg; retval_ = 2; return false; } } pos = ipos + i - member_size; + const unsigned dictionary_size = header.dictionary_size(); member_vector.push_back( Member( 0, trailer.data_size(), pos, member_size, dictionary_size ) ); + if( dictionary_size_ < dictionary_size ) + dictionary_size_ = dictionary_size; return true; } - if( ipos <= 0 ) - { if( ignore_gaps && !member_vector.empty() ) { pos = 0; return true; } - set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); - return false; } + if( ipos == 0 ) + { + if( ignore_gaps && !member_vector.empty() ) + { + const Lzip_header * header = (const Lzip_header *)buffer; + const unsigned dictionary_size = header->dictionary_size(); + // approximate data and member sizes for '-i -D' + 
member_vector.push_back( Member( 0, pos, 0, pos, dictionary_size ) ); + pos = 0; return true; + } + set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); + return false; + } bsize = buffer_size; search_size = bsize - Lzip_header::size; rd_size = block_size; @@ -144,7 +182,7 @@ bool Lzip_index::skip_gap( const int fd, long long & pos, Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, const bool loose_trailing, const bool ignore_bad_ds, const bool ignore_gaps, const long long max_pos ) - : insize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) + : insize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ), dictionary_size_( 0 ) { if( insize < 0 ) { set_errno_error( "Input file is not seekable: " ); return; } @@ -155,25 +193,20 @@ Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, retval_ = 2; return; } Lzip_header header; - if( seek_read( infd, header.data, Lzip_header::size, 0 ) != Lzip_header::size ) - { set_errno_error( "Error reading member header: " ); return; } - if( !header.verify_magic() ) - { error_ = bad_magic_msg; retval_ = 2; return; } - if( !header.verify_version() ) - { error_ = bad_version( header.version() ); retval_ = 2; return; } - if( !ignore_bad_ds && !isvalid_ds( header.dictionary_size() ) ) - { error_ = bad_dict_msg; retval_ = 2; return; } + if( !read_header( infd, header, 0 ) ) return; + if( check_header_error( header, ignore_bad_ds ) ) return; // pos always points to a header or to ( EOF || max_pos ) - long long pos = ( max_pos > 0 ) ? max_pos : insize; + unsigned long long pos = ( max_pos > 0 ) ? 
max_pos : insize; while( pos >= min_member_size ) { Lzip_trailer trailer; - if( seek_read( infd, trailer.data, Lzip_trailer::size, - pos - Lzip_trailer::size ) != Lzip_trailer::size ) - { set_errno_error( "Error reading member trailer: " ); break; } + if( !read_trailer( infd, trailer, pos ) ) break; const unsigned long long member_size = trailer.member_size(); - if( member_size > (unsigned long long)pos || !trailer.verify_consistency() ) + // if gaps are being ignored, verify consistency of last trailer only. + if( member_size > pos || member_size < min_member_size || + ( ( !ignore_gaps || member_vector.empty() ) && + !trailer.verify_consistency() ) ) // bad trailer { if( ignore_gaps || member_vector.empty() ) { if( skip_gap( infd, pos, ignore_trailing, loose_trailing, @@ -181,12 +214,8 @@ Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); break; } - if( seek_read( infd, header.data, Lzip_header::size, - pos - member_size ) != Lzip_header::size ) - { set_errno_error( "Error reading member header: " ); break; } - const unsigned dictionary_size = header.dictionary_size(); - if( !header.verify_magic() || !header.verify_version() || - ( !ignore_bad_ds && !isvalid_ds( dictionary_size ) ) ) + if( !read_header( infd, header, pos - member_size ) ) break; + if( !header.verify( ignore_bad_ds ) ) // bad header { if( ignore_gaps || member_vector.empty() ) { if( skip_gap( infd, pos, ignore_trailing, loose_trailing, @@ -195,10 +224,14 @@ Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, break; } pos -= member_size; + const unsigned dictionary_size = header.dictionary_size(); member_vector.push_back( Member( 0, trailer.data_size(), pos, member_size, dictionary_size ) ); + if( dictionary_size_ < dictionary_size ) + dictionary_size_ = dictionary_size; } - if( pos < 0 || pos >= min_member_size || ( pos != 0 && !ignore_gaps ) || + // block at pos == 0 must be a member unless shorter 
than min_member_size + if( pos >= min_member_size || ( pos != 0 && !ignore_gaps ) || member_vector.empty() ) { member_vector.clear(); @@ -217,6 +250,8 @@ Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, } if( i + 1 >= member_vector.size() ) break; member_vector[i+1].dblock.pos( end ); + if( member_vector[i].mblock.end() > member_vector[i+1].mblock.pos() ) + internal_error( "two mblocks overlap after constructing a Lzip_index." ); } } @@ -224,7 +259,7 @@ Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, // All files in 'infd_vector' must be at least 'fsize' bytes long. Lzip_index::Lzip_index( const std::vector< int > & infd_vector, const long long fsize ) - : insize( fsize ), retval_( 0 ) + : insize( fsize ), retval_( 0 ), dictionary_size_( 0 ) // DS not used { if( insize < 0 ) { set_errno_error( "Input file is not seekable: " ); return; } @@ -240,8 +275,7 @@ Lzip_index::Lzip_index( const std::vector< int > & infd_vector, for( int i = 0; i < files && !done; ++i ) { const int infd = infd_vector[i]; - if( seek_read( infd, header.data, Lzip_header::size, 0 ) != Lzip_header::size ) - { set_errno_error( "Error reading member header: " ); return; } + if( !read_header( infd, header, 0 ) ) return; if( header.verify_magic() && header.verify_version() ) done = true; } if( !done ) @@ -256,17 +290,13 @@ Lzip_index::Lzip_index( const std::vector< int > & infd_vector, for( int it = 0; it < files && !done; ++it ) { const int tfd = infd_vector[it]; - if( seek_read( tfd, trailer.data, Lzip_trailer::size, - pos - Lzip_trailer::size ) != Lzip_trailer::size ) - { set_errno_error( "Error reading member trailer: " ); goto error; } + if( !read_trailer( tfd, trailer, pos ) ) goto error; member_size = trailer.member_size(); if( member_size <= (unsigned long long)pos && trailer.verify_consistency() ) for( int ih = 0; ih < files && !done; ++ih ) { const int hfd = infd_vector[ih]; - if( seek_read( hfd, header.data, Lzip_header::size, - pos - member_size ) 
!= Lzip_header::size ) - { set_errno_error( "Error reading member header: " ); goto error; } + if( !read_header( hfd, header, pos - member_size ) ) goto error; if( header.verify_magic() && header.verify_version() ) done = true; } } @@ -323,6 +353,6 @@ long Lzip_index::blocks( const bool count_tdata ) const long n = member_vector.size() + ( count_tdata && cdata_size() < file_size() ); if( member_vector.size() && member_vector[0].mblock.pos() > 0 ) ++n; for( unsigned long i = 1; i < member_vector.size(); ++i ) - if( member_vector[i].mblock.pos() > member_vector[i-1].mblock.end() ) ++n; + if( member_vector[i-1].mblock.end() < member_vector[i].mblock.pos() ) ++n; return n; } diff --git a/lzip_index.h b/lzip_index.h index d4f2ef9..717c06c 100644 --- a/lzip_index.h +++ b/lzip_index.h @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
- You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ class Lzip_index @@ -30,21 +30,28 @@ class Lzip_index bool operator!=( const Member & m ) const { return ( mblock != m.mblock ); } }; - // member_vector only contains good members. + // member_vector only contains members with a valid header. // Garbage between members is represented by gaps between mblocks. std::vector< Member > member_vector; std::string error_; long long insize; int retval_; + unsigned dictionary_size_; // largest dictionary size in the file + bool check_header_error( const Lzip_header & header, + const bool ignore_bad_ds ); void set_errno_error( const char * const msg ); void set_num_error( const char * const msg, unsigned long long num ); - bool skip_gap( const int fd, long long & pos, - const bool ignore_trailing, const bool loose_trailing, - const bool ignore_bad_ds, const bool ignore_gaps ); + bool read_header( const int fd, Lzip_header & header, const long long pos ); + bool read_trailer( const int fd, Lzip_trailer & trailer, + const long long pos ); + bool skip_gap( const int fd, unsigned long long & pos, + const bool ignore_trailing, const bool loose_trailing, + const bool ignore_bad_ds, const bool ignore_gaps ); public: - Lzip_index() : error_( "No index" ), insize( 0 ), retval_( 2 ) {} + Lzip_index() + : error_( "No index" ), insize( 0 ), retval_( 2 ), dictionary_size_( 0 ) {} Lzip_index( const int infd, const bool ignore_trailing, const bool loose_trailing, const bool ignore_bad_ds = false, const bool ignore_gaps = false, const long long max_pos = 0 ); @@ -54,6 +61,7 @@ public: long blocks( const bool count_tdata ) const; // members + gaps [+ tdata] const std::string & error() const { return error_; } int retval() const { return retval_; } + unsigned 
dictionary_size() const { return dictionary_size_; } bool operator==( const Lzip_index & li ) const { @@ -1,24 +1,24 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* - Exit status: 0 for a normal exit, 1 for environmental problems - (file not found, invalid flags, I/O errors, etc), 2 to indicate a - corrupt or invalid input file, 3 for an internal consistency error - (eg, bug) which caused lziprecover to panic. 
+ Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file, 3 for an internal consistency error + (eg, bug) which caused lziprecover to panic. */ #define _FILE_OFFSET_BITS 64 @@ -31,6 +31,7 @@ #include <cstdio> #include <cstdlib> #include <cstring> +#include <new> #include <string> #include <vector> #include <fcntl.h> @@ -71,13 +72,14 @@ #endif int verbosity = 0; + +const char * const program_name = "lziprecover"; std::string output_filename; // global vars for output file int outfd = -1; // see 'delete_output_on_interrupt' below namespace { -const char * const program_name = "lziprecover"; -const char * invocation_name = 0; +const char * invocation_name = program_name; // default value const struct { const char * from; const char * to; } known_extensions[] = { { ".lz", "" }, @@ -85,9 +87,9 @@ const struct { const char * from; const char * to; } known_extensions[] = { { 0, 0 } }; enum Mode { m_none, m_alone_to_lz, m_debug_decompress, m_debug_delay, - m_debug_repair, m_decompress, m_dump, m_list, m_merge, - m_range_dec, m_remove, m_repair, m_show_packets, m_split, - m_strip, m_test }; + m_debug_repair, m_decompress, m_dump, m_list, m_md5sum, m_merge, + m_nrep_stats, m_range_dec, m_remove, m_repair, m_reproduce, + m_show_packets, m_split, m_strip, m_test, m_unzcrash }; /* Variable used in signal handler context. It is not declared volatile because the handler never returns. */ @@ -99,8 +101,9 @@ void show_help() std::printf( "Lziprecover is a data recovery tool and decompressor for files in the lzip\n" "compressed data format (.lz). 
Lziprecover is able to repair slightly damaged\n" "files, produce a correct file by merging the good parts of two or more\n" - "damaged copies, extract data from damaged files, decompress files and test\n" - "integrity of files.\n" + "damaged copies, reproduce a missing (zeroed) sector using a reference file,\n" + "extract data from damaged files, decompress files, and test integrity of\n" + "files.\n" "\nLziprecover can repair perfectly most files with small errors (up to one\n" "single-byte error per member), without the need of any extra redundance\n" "at all. Losing an entire archive just because of a corrupt byte near the\n" @@ -122,8 +125,12 @@ void show_help() " -c, --stdout write to standard output, keep input files\n" " -d, --decompress decompress\n" " -D, --range-decompress=<n-m> decompress a range of bytes to stdout\n" + " -e, --reproduce try to reproduce a zeroed sector in file\n" + " --lzip-level=N|a|m[N] reproduce one level, all, or match length\n" + " --lzip-name=<name> name of lzip executable for --reproduce\n" + " --reference-file=<file> reference file for --reproduce\n" " -f, --force overwrite existing output files\n" - " -i, --ignore-errors all errors in -D, format errors in -l, --dump\n" + " -i, --ignore-errors ignore some errors in -d, -D, -l, -t, --dump\n" " -k, --keep keep (don't delete) input files\n" " -l, --list print (un)compressed file sizes\n" " -m, --merge correct errors in file using several copies\n" @@ -139,15 +146,22 @@ void show_help() " --strip=<list>:d:t copy files to stdout stripping members given\n" ); if( verbosity >= 1 ) { - std::printf( " -W, --debug-decompress=<pos>,<val> set pos to val and decompress to stdout\n" + std::printf( "\nDebug options for experts:\n" + " -E, --debug-reproduce=<range>[,ss] set range to 0 and try to reproduce file\n" + " -M, --md5sum print the MD5 digests of the input files\n" + " -S, --nrep-stats[=<val>] print stats of N-byte repeated sequences\n" + " -U, --unzcrash test 1-bit errors in the input 
file\n" + " -W, --debug-decompress=<pos>,<val> set pos to val and decompress to stdout\n" " -X, --show-packets[=<pos>,<val>] show in stdout the decoded LZMA packets\n" " -Y, --debug-delay=<range> find max error detection delay in <range>\n" " -Z, --debug-repair=<pos>,<val> test repair one-byte error at <pos>\n" ); } - std::printf( "If no file names are given, or if a file is '-', lziprecover decompresses\n" + std::printf( "\nIf no file names are given, or if a file is '-', lziprecover decompresses\n" "from standard input to standard output.\n" "Numbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n" "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n" + "\nTo extract all the files from archive 'foo.tar.lz', use the commands\n" + "'tar -xf foo.tar.lz' or 'lziprecover -cd foo.tar.lz | tar -xf -'.\n" "\nExit status: 0 for a normal exit, 1 for environmental problems (file\n" "not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or\n" "invalid input file, 3 for an internal consistency error (eg, bug) which\n" @@ -203,7 +217,7 @@ const char * format_ds( const unsigned dictionary_size ) void show_header( const unsigned dictionary_size ) { - std::fprintf( stderr, "dictionary %s, ", format_ds( dictionary_size ) ); + std::fprintf( stderr, "dict %s, ", format_ds( dictionary_size ) ); } @@ -218,7 +232,7 @@ void Member_list::parse( const char * p ) const char * tp = p; // points to terminator; ':' or null while( *tp && *tp != ':' ) ++tp; const unsigned len = tp - p; - if( std::isalpha( (const unsigned char)*p ) ) + if( std::isalpha( *(const unsigned char *)p ) ) { if( len <= 7 && std::strncmp( "damaged", p, len ) == 0 ) { damaged = true; goto next; } @@ -230,7 +244,7 @@ void Member_list::parse( const char * p ) if( reverse ) ++p; if( *p == '^' ) { ++p; if( reverse ) rin = false; else in = false; } std::vector< Block > * rvp = reverse ? 
&rrange_vector : &range_vector; - while( std::isdigit( (const unsigned char)*p ) ) + while( std::isdigit( *(const unsigned char *)p ) ) { const char * tail; const int pos = getnum( p, 0, 1, INT_MAX, &tail ) - 1; @@ -252,9 +266,26 @@ next: namespace { -// Recognized formats: <begin> <begin>-<end> <begin>,<size> ,<size> +// Recognized formats: <digit> 'a' m[<match_length>] // -void parse_range( const char * const ptr, Block & range ) +int parse_lzip_level( const char * const p ) + { + if( *p == 'a' || std::isdigit( *(const unsigned char *)p ) ) return *p; + if( *p != 'm' ) + { + show_error( "Bad argument in option '--lzip-level'.", 0, true ); + std::exit( 1 ); + } + if( p[1] == 0 ) return -1; + return -getnum( p + 1, 0, min_match_len_limit, max_match_len ); + } + + +/* Recognized format: <range>[,<sector_size>] + range formats: <begin> <begin>-<end> <begin>,<size> ,<size> +*/ +void parse_range( const char * const ptr, Block & range, + int * const sector_sizep = 0 ) { const char * tail = ptr; long long value = @@ -264,11 +295,18 @@ void parse_range( const char * const ptr, Block & range ) range.pos( value ); if( tail[0] == 0 ) { range.size( INT64_MAX - value ); return; } const bool is_size = ( tail[0] == ',' ); - value = getnum( tail + 1, 0, 1, INT64_MAX ); // size + if( sector_sizep && tail[1] == ',' ) { value = INT64_MAX - value; ++tail; } + else value = getnum( tail + 1, 0, 1, INT64_MAX, &tail ); // size if( is_size || value > range.pos() ) { if( !is_size ) value -= range.pos(); - if( INT64_MAX - range.pos() >= value ) { range.size( value ); return; } + if( INT64_MAX - range.pos() >= value ) + { + range.size( value ); + if( sector_sizep && tail[0] == ',' ) + *sector_sizep = getnum( tail + 1, 0, 8, INT_MAX ); + return; + } } } show_error( "Bad decompression range.", 0, true ); @@ -361,7 +399,7 @@ void set_d_outname( const std::string & name, const int eindex ) } // end namespace int open_instream( const char * const name, struct stat * const in_statsp, - const bool 
no_ofile, const bool reg_only ) + const bool one_to_one, const bool reg_only ) { int infd = open( name, O_RDONLY | O_BINARY ); if( infd < 0 ) @@ -373,13 +411,12 @@ int open_instream( const char * const name, struct stat * const in_statsp, const bool can_read = ( i == 0 && !reg_only && ( S_ISBLK( mode ) || S_ISCHR( mode ) || S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); - if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) ) + if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || one_to_one ) ) ) { if( verbosity >= 0 ) std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", - program_name, name, - ( can_read && !no_ofile ) ? - ",\n and '--stdout' was not specified" : "" ); + program_name, name, ( can_read && one_to_one ) ? + ",\n and neither '-c' nor '-o' were specified" : "" ); close( infd ); infd = -1; } @@ -399,24 +436,18 @@ int open_truncable_stream( const char * const name, const int i = fstat( fd, in_statsp ); const mode_t mode = in_statsp->st_mode; if( i != 0 || !S_ISREG( mode ) ) - { - if( verbosity >= 0 ) - std::fprintf( stderr, "%s: File '%s' is not a regular file.\n", - program_name, name ); - close( fd ); - fd = -1; - } + { show_file_error( name, "Not a regular file." ); close( fd ); fd = -1; } } return fd; } -bool open_outstream( const bool force, const bool from_stdin, +bool open_outstream( const bool force, const bool protect, const bool rw, const bool skipping ) { const mode_t usr_rw = S_IRUSR | S_IWUSR; const mode_t all_rw = usr_rw | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - const mode_t outfd_mode = from_stdin ? all_rw : usr_rw; + const mode_t outfd_mode = protect ? usr_rw : all_rw; int flags = O_CREAT | ( rw ? 
O_RDWR : O_WRONLY ) | O_BINARY; if( force ) flags |= O_TRUNC; else flags |= O_EXCL; @@ -451,24 +482,6 @@ bool file_exists( const std::string & filename ) } -bool check_tty( const char * const input_filename, const int infd, - const Mode program_mode ) - { - if( program_mode == m_alone_to_lz && isatty( outfd ) ) - { - show_error( "I won't write compressed data to a terminal.", 0, true ); - return false; - } - if( isatty( infd ) ) // all modes read compressed data - { - show_file_error( input_filename, - "I won't read compressed data from a terminal." ); - return false; - } - return true; - } - - void set_signals( void (*action)(int) ) { std::signal( SIGHUP, action ); @@ -502,7 +515,30 @@ extern "C" void signal_handler( int ) } - // Set permissions, owner and times. +bool check_tty_in( const char * const input_filename, const int infd, + const Mode program_mode, int & retval ) + { + if( isatty( infd ) ) // all modes read compressed data + { show_file_error( input_filename, + "I won't read compressed data from a terminal." ); + close( infd ); set_retval( retval, 1 ); + if( program_mode != m_test ) cleanup_and_fail( retval ); + return false; } + return true; + } + +bool check_tty_out( const Mode program_mode ) + { + if( program_mode == m_alone_to_lz && isatty( outfd ) ) + { show_file_error( output_filename.size() ? + output_filename.c_str() : "(stdout)", + "I won't write compressed data to a terminal." ); + return false; } + return true; + } + + +// Set permissions, owner, and times. 
void close_and_set_permissions( const struct stat * const in_statsp ) { bool warning = false; @@ -571,76 +607,76 @@ bool show_trailing_data( const uint8_t * const data, const int size, int decompress( const unsigned long long cfile_size, const int infd, - const Pretty_print & pp, const bool ignore_trailing, - const bool loose_trailing, const bool testing ) + const Pretty_print & pp, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing, + const bool testing ) { int retval = 0; - - try { - unsigned long long partial_file_pos = 0; - Range_decoder rdec( infd ); - for( bool first_member = true; ; first_member = false ) + unsigned long long partial_file_pos = 0; + Range_decoder rdec( infd ); + for( bool first_member = true; ; first_member = false ) + { + Lzip_header header; + rdec.reset_member_position(); + const int size = rdec.read_header_carefully( header, ignore_errors ); + if( rdec.finished() || // End Of File + ( size < Lzip_header::size && !rdec.find_header( header ) ) ) { - Lzip_header header; - rdec.reset_member_position(); - const int size = rdec.read_data( header.data, Lzip_header::size ); - if( rdec.finished() ) // End Of File - { - if( first_member ) - { show_file_error( pp.name(), "File ends unexpectedly at member header." ); - retval = 2; } - else if( header.verify_prefix( size ) ) - { pp( "Truncated header in multimember file." 
); - show_trailing_data( header.data, size, pp, true, -1 ); - retval = 2; } - else if( size > 0 && !show_trailing_data( header.data, size, pp, - true, ignore_trailing ) ) - retval = 2; - break; - } - if( !header.verify_magic() ) - { - if( first_member ) - { show_file_error( pp.name(), bad_magic_msg ); retval = 2; } - else if( !loose_trailing && header.verify_corrupt() ) - { pp( corrupt_mm_msg ); - show_trailing_data( header.data, size, pp, false, -1 ); - retval = 2; } - else if( !show_trailing_data( header.data, size, pp, false, ignore_trailing ) ) - retval = 2; - break; - } - if( !header.verify_version() ) - { pp( bad_version( header.version() ) ); retval = 2; break; } - const unsigned dictionary_size = header.dictionary_size(); - if( !isvalid_ds( dictionary_size ) ) - { pp( bad_dict_msg ); retval = 2; break; } - - if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) pp(); - - LZ_decoder decoder( rdec, dictionary_size, outfd ); - show_dprogress( cfile_size, partial_file_pos, &rdec, &pp ); // init - const int result = decoder.decode_member( pp ); - partial_file_pos += rdec.member_position(); - if( result != 0 ) + if( first_member ) + { show_file_error( pp.name(), "File ends unexpectedly at member header." ); + retval = 2; } + else if( header.verify_prefix( size ) ) + { pp( "Truncated header in multimember file." 
); + show_trailing_data( header.data, size, pp, true, -1 ); + retval = 2; } + else if( size > 0 && !show_trailing_data( header.data, size, pp, + true, ignore_trailing ) ) + retval = 2; + break; + } + if( !header.verify_magic() ) + { + if( first_member ) + { show_file_error( pp.name(), bad_magic_msg ); retval = 2; } + else if( !loose_trailing && header.verify_corrupt() ) + { pp( corrupt_mm_msg ); + show_trailing_data( header.data, size, pp, false, -1 ); + retval = 2; } + else if( !show_trailing_data( header.data, size, pp, false, ignore_trailing ) ) + retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; + } + if( !header.verify_version() ) + { pp( bad_version( header.version() ) ); retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; } + const unsigned dictionary_size = header.dictionary_size(); + if( !isvalid_ds( dictionary_size ) ) + { pp( bad_dict_msg ); retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; } + + if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) pp(); + + LZ_decoder decoder( rdec, dictionary_size, outfd ); + show_dprogress( cfile_size, partial_file_pos, &rdec, &pp ); // init + const int result = decoder.decode_member( pp ); + partial_file_pos += rdec.member_position(); + if( result != 0 ) + { + if( verbosity >= 0 && result <= 2 ) { - if( verbosity >= 0 && result <= 2 ) - { - pp(); - std::fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ? - "File ends unexpectedly" : "Decoder error", - partial_file_pos ); - } - retval = 2; break; + pp(); + std::fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ? + "File ends unexpectedly" : "Decoder error", + partial_file_pos ); } - if( verbosity >= 2 ) - { std::fputs( testing ? "ok\n" : "done\n", stderr ); pp.reset(); } + retval = 2; if( ignore_errors ) { pp.reset(); continue; } else break; } + if( verbosity >= 2 ) + { std::fputs( testing ? "ok\n" : "done\n", stderr ); pp.reset(); } } - catch( std::bad_alloc & ) { pp( "Not enough memory." 
); retval = 1; } - catch( Error & e ) { pp(); show_error( e.msg, errno ); retval = 1; } if( verbosity == 1 && retval == 0 ) std::fputs( testing ? "ok\n" : "done\n", stderr ); + if( retval == 2 && ignore_errors ) retval = 0; return retval; } @@ -725,20 +761,27 @@ void show_dprogress( const unsigned long long cfile_size, int main( const int argc, const char * const argv[] ) { Block range( 0, 0 ); + int sector_size = INT_MAX; // default larger than practical range Bad_byte bad_byte; Member_list member_list; std::string default_output_filename; std::vector< std::string > filenames; + const char * lzip_name = "lzip"; // default is lzip + const char * reference_filename = 0; Mode program_mode = m_none; + int lzip_level = 0; // 0 = test all levels and match lengths + // '0'..'9' = level, 'a' = all levels + // -5..-273 = match length, -1 = all lengths + int repeated_byte = -1; // 0 to 255, or -1 for all values bool force = false; bool ignore_errors = false; bool ignore_trailing = true; bool keep_input_files = false; bool loose_trailing = false; bool to_stdout = false; - invocation_name = argv[0]; + if( argc > 0 ) invocation_name = argv[0]; - enum { opt_du = 256, opt_dtd, opt_lt, opt_re, opt_rtd, opt_st, opt_std }; + enum { opt_du = 256, opt_lt, opt_lzl, opt_lzn, opt_ref, opt_re, opt_st }; const Arg_parser::Option options[] = { { 'a', "trailing-error", Arg_parser::no }, @@ -746,18 +789,23 @@ int main( const int argc, const char * const argv[] ) { 'c', "stdout", Arg_parser::no }, { 'd', "decompress", Arg_parser::no }, { 'D', "range-decompress", Arg_parser::yes }, + { 'e', "reproduce", Arg_parser::no }, + { 'E', "debug-reproduce", Arg_parser::yes }, { 'f', "force", Arg_parser::no }, { 'h', "help", Arg_parser::no }, { 'i', "ignore-errors", Arg_parser::no }, { 'k', "keep", Arg_parser::no }, { 'l', "list", Arg_parser::no }, { 'm', "merge", Arg_parser::no }, + { 'M', "md5sum", Arg_parser::no }, { 'n', "threads", Arg_parser::yes }, { 'o', "output", Arg_parser::yes }, { 'q', 
"quiet", Arg_parser::no }, { 'R', "repair", Arg_parser::no }, { 's', "split", Arg_parser::no }, + { 'S', "nrep-stats", Arg_parser::maybe }, { 't', "test", Arg_parser::no }, + { 'U', "unzcrash", Arg_parser::no }, { 'v', "verbose", Arg_parser::no }, { 'V', "version", Arg_parser::no }, { 'W', "debug-decompress", Arg_parser::yes }, @@ -765,12 +813,12 @@ int main( const int argc, const char * const argv[] ) { 'Y', "debug-delay", Arg_parser::yes }, { 'Z', "debug-repair", Arg_parser::yes }, { opt_du, "dump", Arg_parser::yes }, - { opt_dtd, "dump-tdata", Arg_parser::no }, { opt_lt, "loose-trailing", Arg_parser::no }, + { opt_lzl, "lzip-level", Arg_parser::yes }, + { opt_lzn, "lzip-name", Arg_parser::yes }, + { opt_ref, "reference-file", Arg_parser::yes }, { opt_re, "remove", Arg_parser::yes }, - { opt_rtd, "remove-tdata", Arg_parser::no }, { opt_st, "strip", Arg_parser::yes }, - { opt_std, "strip-tdata", Arg_parser::no }, { 0 , 0, Arg_parser::no } }; const Arg_parser parser( argc, argv, options ); @@ -792,18 +840,26 @@ int main( const int argc, const char * const argv[] ) case 'd': set_mode( program_mode, m_decompress ); break; case 'D': set_mode( program_mode, m_range_dec ); parse_range( arg, range ); break; + case 'e': set_mode( program_mode, m_reproduce ); break; + case 'E': set_mode( program_mode, m_reproduce ); + parse_range( arg, range, &sector_size ); break; case 'f': force = true; break; case 'h': show_help(); return 0; case 'i': ignore_errors = true; break; case 'k': keep_input_files = true; break; case 'l': set_mode( program_mode, m_list ); break; case 'm': set_mode( program_mode, m_merge ); break; + case 'M': set_mode( program_mode, m_md5sum ); break; case 'n': break; - case 'o': default_output_filename = sarg; break; + case 'o': if( sarg == "-" ) to_stdout = true; + else { default_output_filename = sarg; } break; case 'q': verbosity = -1; break; case 'R': set_mode( program_mode, m_repair ); break; case 's': set_mode( program_mode, m_split ); break; + case 'S': if( 
arg[0] ) repeated_byte = getnum( arg, 0, 0, 255 ); + set_mode( program_mode, m_nrep_stats ); break; case 't': set_mode( program_mode, m_test ); break; + case 'U': set_mode( program_mode, m_unzcrash ); break; case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; case 'W': set_mode( program_mode, m_debug_decompress ); @@ -816,17 +872,14 @@ int main( const int argc, const char * const argv[] ) parse_pos_value( arg, bad_byte ); break; case opt_du: set_mode( program_mode, m_dump ); member_list.parse( arg ); break; - case opt_dtd: set_mode( program_mode, m_dump ); - member_list.parse( "tdata" ); break; case opt_lt: loose_trailing = true; break; + case opt_lzl: lzip_level = parse_lzip_level( arg ); break; + case opt_lzn: lzip_name = arg; break; + case opt_ref: reference_filename = arg; break; case opt_re: set_mode( program_mode, m_remove ); member_list.parse( arg ); break; - case opt_rtd: set_mode( program_mode, m_remove ); - member_list.parse( "tdata" ); break; case opt_st: set_mode( program_mode, m_strip ); member_list.parse( arg ); break; - case opt_std: set_mode( program_mode, m_strip ); - member_list.parse( "tdata" ); break; default : internal_error( "uncaught option." 
); } } // end process options @@ -871,12 +924,15 @@ int main( const int argc, const char * const argv[] ) { show_error( "You must specify at least 1 file.", 0, true ); return 1; } return dump_members( filenames, default_output_filename, member_list, force, ignore_errors, ignore_trailing, - loose_trailing, program_mode == m_strip ); + loose_trailing, program_mode == m_strip, to_stdout ); case m_list: break; + case m_md5sum: break; case m_merge: if( filenames.size() < 2 ) { show_error( "You must specify at least 2 files.", 0, true ); return 1; } - return merge_files( filenames, default_output_filename, force, terminator ); + return merge_files( filenames, default_output_filename, terminator, force ); + case m_nrep_stats: return print_nrep_stats( filenames, repeated_byte, + ignore_errors, ignore_trailing, loose_trailing ); case m_range_dec: one_file( filenames.size() ); return range_decompress( filenames[0], default_output_filename, range, @@ -889,7 +945,17 @@ int main( const int argc, const char * const argv[] ) ignore_trailing, loose_trailing ); case m_repair: one_file( filenames.size() ); - return repair_file( filenames[0], default_output_filename, force, terminator ); + return repair_file( filenames[0], default_output_filename, terminator, force ); + case m_reproduce: + one_file( filenames.size() ); + if( !reference_filename || !reference_filename[0] ) + { show_error( "You must specify a reference file.", 0, true ); return 1; } + if( range.size() > 0 ) + return debug_reproduce_file( filenames[0], lzip_name, + reference_filename, range, sector_size, lzip_level ); + else + return reproduce_file( filenames[0], default_output_filename, + lzip_name, reference_filename, lzip_level, terminator, force ); case m_show_packets: one_file( filenames.size() ); return debug_decompress( filenames[0], bad_byte, true ); @@ -897,127 +963,116 @@ int main( const int argc, const char * const argv[] ) one_file( filenames.size() ); return split_file( filenames[0], 
default_output_filename, force ); case m_test: break; + case m_unzcrash: + one_file( filenames.size() ); + return lunzcrash( filenames[0] ); } } - catch( std::bad_alloc & ) - { show_error( "Not enough memory." ); cleanup_and_fail( 1 ); } + catch( std::bad_alloc & ) { show_error( mem_msg ); cleanup_and_fail( 1 ); } catch( Error & e ) { show_error( e.msg, errno ); cleanup_and_fail( 1 ); } if( filenames.empty() ) filenames.push_back("-"); if( program_mode == m_list ) - return list_files( filenames, ignore_errors, ignore_trailing, - loose_trailing ); + return list_files( filenames, ignore_errors, ignore_trailing, loose_trailing ); + if( program_mode == m_md5sum ) + return md5sum_files( filenames ); - if( program_mode == m_test ) - outfd = -1; - else if( program_mode != m_alone_to_lz && program_mode != m_decompress ) + if( program_mode != m_alone_to_lz && program_mode != m_decompress && + program_mode != m_test ) internal_error( "invalid decompressor operation." ); - if( !to_stdout && program_mode != m_test && - ( filenames_given || default_output_filename.size() ) ) + if( program_mode == m_test ) to_stdout = false; // apply overrides + if( program_mode == m_test || to_stdout ) default_output_filename.clear(); + + if( to_stdout && program_mode != m_test ) // check tty only once + { outfd = STDOUT_FILENO; if( !check_tty_out( program_mode ) ) return 1; } + else outfd = -1; + + const bool to_file = !to_stdout && program_mode != m_test && + default_output_filename.size(); + if( !to_stdout && program_mode != m_test && ( filenames_given || to_file ) ) set_signals( signal_handler ); Pretty_print pp( filenames ); int failed_tests = 0; int retval = 0; + const bool one_to_one = !to_stdout && program_mode != m_test && !to_file; bool stdin_used = false; for( unsigned i = 0; i < filenames.size(); ++i ) { std::string input_filename; int infd; struct stat in_stats; - output_filename.clear(); - if( filenames[i].empty() || filenames[i] == "-" ) + pp.set_name( filenames[i] ); + if( 
filenames[i] == "-" ) { if( stdin_used ) continue; else stdin_used = true; infd = STDIN_FILENO; - if( program_mode != m_test ) - { - if( to_stdout || default_output_filename.empty() ) - outfd = STDOUT_FILENO; - else - { - output_filename = default_output_filename; - if( program_mode == m_alone_to_lz && - extension_index( default_output_filename ) < 0 ) - output_filename += known_extensions[0].from; - if( !open_outstream( force, true ) ) - { - if( retval < 1 ) retval = 1; - close( infd ); - continue; - } - } - } + if( !check_tty_in( pp.name(), infd, program_mode, retval ) ) continue; + if( one_to_one ) { outfd = STDOUT_FILENO; output_filename.clear(); } } else { input_filename = filenames[i]; - infd = open_instream( input_filename.c_str(), &in_stats, - to_stdout || program_mode == m_test ); - if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } - if( program_mode != m_test ) + infd = open_instream( input_filename.c_str(), &in_stats, one_to_one ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + if( !check_tty_in( pp.name(), infd, program_mode, retval ) ) continue; + if( one_to_one ) // open outfd after verifying infd { - if( to_stdout ) outfd = STDOUT_FILENO; - else - { - if( program_mode == m_alone_to_lz ) - set_a_outname( input_filename ); - else set_d_outname( input_filename, extension_index( input_filename ) ); - if( !open_outstream( force, false ) ) - { - if( retval < 1 ) retval = 1; - close( infd ); - continue; - } - } + if( program_mode == m_alone_to_lz ) set_a_outname( input_filename ); + else set_d_outname( input_filename, extension_index( input_filename ) ); + if( !open_outstream( force, true ) ) + { close( infd ); set_retval( retval, 1 ); continue; } } } - pp.set_name( input_filename ); - if( !check_tty( pp.name(), infd, program_mode ) ) + if( one_to_one && !check_tty_out( program_mode ) ) + { set_retval( retval, 1 ); return retval; } // don't delete a tty + + if( to_file && outfd < 0 ) // open outfd after verifying infd { - if( retval < 1 ) 
retval = 1; - if( program_mode == m_test ) { close( infd ); continue; } - cleanup_and_fail( retval ); + output_filename = default_output_filename; + if( !open_outstream( force, false ) || !check_tty_out( program_mode ) ) + return 1; // check tty only once and don't try to delete a tty } - const struct stat * const in_statsp = input_filename.size() ? &in_stats : 0; + const struct stat * const in_statsp = + ( input_filename.size() && one_to_one ) ? &in_stats : 0; const unsigned long long cfile_size = - ( in_statsp && S_ISREG( in_statsp->st_mode ) ) ? - ( in_statsp->st_size + 99 ) / 100 : 0; + ( input_filename.size() && S_ISREG( in_stats.st_mode ) ) ? + ( in_stats.st_size + 99 ) / 100 : 0; int tmp; - if( program_mode == m_alone_to_lz ) - tmp = alone_to_lz( infd, pp ); - else - tmp = decompress( cfile_size, infd, pp, ignore_trailing, - loose_trailing, program_mode == m_test ); - if( close( infd ) != 0 ) - { - show_error( input_filename.size() ? "Error closing input file" : - "Error closing stdin", errno ); - if( tmp < 1 ) tmp = 1; + try { + if( program_mode == m_alone_to_lz ) + tmp = alone_to_lz( infd, pp ); + else + tmp = decompress( cfile_size, infd, pp, ignore_errors, ignore_trailing, + loose_trailing, program_mode == m_test ); } - if( tmp > retval ) retval = tmp; + catch( std::bad_alloc & ) { pp( mem_msg ); tmp = 1; } + catch( Error & e ) { pp(); show_error( e.msg, errno ); tmp = 1; } + if( close( infd ) != 0 ) + { show_file_error( pp.name(), "Error closing input file", errno ); + set_retval( tmp, 1 ); } + set_retval( retval, tmp ); if( tmp ) { if( program_mode != m_test ) cleanup_and_fail( retval ); else ++failed_tests; } - if( delete_output_on_interrupt ) + if( delete_output_on_interrupt && one_to_one ) close_and_set_permissions( in_statsp ); - if( input_filename.size() ) - { - if( !keep_input_files && !to_stdout && program_mode != m_test ) - std::remove( input_filename.c_str() ); - } + if( input_filename.size() && !keep_input_files && one_to_one && + ( 
program_mode != m_decompress || !ignore_errors ) ) + std::remove( input_filename.c_str() ); } - if( outfd >= 0 && close( outfd ) != 0 ) + if( delete_output_on_interrupt ) close_and_set_permissions( 0 ); // -o + else if( outfd >= 0 && close( outfd ) != 0 ) // -c { show_error( "Error closing stdout", errno ); - if( retval < 1 ) retval = 1; + set_retval( retval, 1 ); } if( failed_tests > 0 && verbosity >= 1 && filenames.size() > 1 ) std::fprintf( stderr, "%s: warning: %d %s failed the test.\n", diff --git a/main_common.cc b/main_common.cc index d7a2e81..386a5b1 100644 --- a/main_common.cc +++ b/main_common.cc @@ -1,23 +1,24 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ namespace { -const char * const program_year = "2019"; +const char * const program_year = "2021"; +const char * const mem_msg = "Not enough memory."; void show_version() { @@ -0,0 +1,206 @@ +/* Functions to compute MD5 message digest of memory blocks according to the + definition of MD5 in RFC 1321 from April 1992. + Copyright (C) 2020, 2021 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cstring> +#include <stdint.h> + +#include "md5.h" + + +namespace { + +/* These are the four functions used in the four steps of the MD5 algorithm + as defined in RFC 1321. */ +#define F(x, y, z) ((x & y) | (~x & z)) +#define G(x, y, z) ((x & z) | (y & ~z)) +#define H(x, y, z) (x ^ y ^ z) +#define I(x, y, z) (y ^ (x | ~z)) + +/* Rotate x left n bits. + It is unfortunate that C++ does not provide an operator for rotation. + Hope the compiler is smart enough. */ +#define ROTATE_LEFT(x, n) (x = (x << n) | (x >> (32 - n))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. 
+#define FF(a, b, c, d, x, s, ac) \ + { a += F(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define GG(a, b, c, d, x, s, ac) \ + { a += G(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define HH(a, b, c, d, x, s, ac) \ + { a += H(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define II(a, b, c, d, x, s, ac) \ + { a += I(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } + +} // end namespace + + +void MD5SUM::md5_process_block( const uint8_t block[64] ) + { + uint32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + for( int i = 0, j = 0; i < 16; ++i, j += 4 ) // fill x in little endian + x[i] = block[j] | (block[j+1] << 8) | (block[j+2] << 16) | (block[j+3] << 24); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], 7, 0xD76AA478); // 1 + FF (d, a, b, c, x[ 1], 12, 0xE8C7B756); // 2 + FF (c, d, a, b, x[ 2], 17, 0x242070DB); // 3 + FF (b, c, d, a, x[ 3], 22, 0xC1BDCEEE); // 4 + FF (a, b, c, d, x[ 4], 7, 0xF57C0FAF); // 5 + FF (d, a, b, c, x[ 5], 12, 0x4787C62A); // 6 + FF (c, d, a, b, x[ 6], 17, 0xA8304613); // 7 + FF (b, c, d, a, x[ 7], 22, 0xFD469501); // 8 + FF (a, b, c, d, x[ 8], 7, 0x698098D8); // 9 + FF (d, a, b, c, x[ 9], 12, 0x8B44F7AF); // 10 + FF (c, d, a, b, x[10], 17, 0xFFFF5BB1); // 11 + FF (b, c, d, a, x[11], 22, 0x895CD7BE); // 12 + FF (a, b, c, d, x[12], 7, 0x6B901122); // 13 + FF (d, a, b, c, x[13], 12, 0xFD987193); // 14 + FF (c, d, a, b, x[14], 17, 0xA679438E); // 15 + FF (b, c, d, a, x[15], 22, 0x49B40821); // 16 + + /* Round 2 */ + GG (a, b, c, d, x[ 1], 5, 0xF61E2562); // 17 + GG (d, a, b, c, x[ 6], 9, 0xC040B340); // 18 + GG (c, d, a, b, x[11], 14, 0x265E5A51); // 19 + GG (b, c, d, a, x[ 0], 20, 0xE9B6C7AA); // 20 + GG (a, b, c, d, x[ 5], 5, 0xD62F105D); // 21 + GG (d, a, b, c, x[10], 9, 0x02441453); // 22 + GG (c, d, a, b, x[15], 14, 0xD8A1E681); // 23 + GG (b, c, d, a, x[ 4], 20, 0xE7D3FBC8); // 24 + GG (a, b, c, d, x[ 9], 5, 0x21E1CDE6); // 25 + GG (d, a, b, c, x[14], 9, 0xC33707D6); // 26 + GG (c, d, a, b, x[ 3], 14, 
0xF4D50D87); // 27 + GG (b, c, d, a, x[ 8], 20, 0x455A14ED); // 28 + GG (a, b, c, d, x[13], 5, 0xA9E3E905); // 29 + GG (d, a, b, c, x[ 2], 9, 0xFCEFA3F8); // 30 + GG (c, d, a, b, x[ 7], 14, 0x676F02D9); // 31 + GG (b, c, d, a, x[12], 20, 0x8D2A4C8A); // 32 + + /* Round 3 */ + HH (a, b, c, d, x[ 5], 4, 0xFFFA3942); // 33 + HH (d, a, b, c, x[ 8], 11, 0x8771F681); // 34 + HH (c, d, a, b, x[11], 16, 0x6D9D6122); // 35 + HH (b, c, d, a, x[14], 23, 0xFDE5380C); // 36 + HH (a, b, c, d, x[ 1], 4, 0xA4BEEA44); // 37 + HH (d, a, b, c, x[ 4], 11, 0x4BDECFA9); // 38 + HH (c, d, a, b, x[ 7], 16, 0xF6BB4B60); // 39 + HH (b, c, d, a, x[10], 23, 0xBEBFBC70); // 40 + HH (a, b, c, d, x[13], 4, 0x289B7EC6); // 41 + HH (d, a, b, c, x[ 0], 11, 0xEAA127FA); // 42 + HH (c, d, a, b, x[ 3], 16, 0xD4EF3085); // 43 + HH (b, c, d, a, x[ 6], 23, 0x04881D05); // 44 + HH (a, b, c, d, x[ 9], 4, 0xD9D4D039); // 45 + HH (d, a, b, c, x[12], 11, 0xE6DB99E5); // 46 + HH (c, d, a, b, x[15], 16, 0x1FA27CF8); // 47 + HH (b, c, d, a, x[ 2], 23, 0xC4AC5665); // 48 + + /* Round 4 */ + II (a, b, c, d, x[ 0], 6, 0xF4292244); // 49 + II (d, a, b, c, x[ 7], 10, 0x432AFF97); // 50 + II (c, d, a, b, x[14], 15, 0xAB9423A7); // 51 + II (b, c, d, a, x[ 5], 21, 0xFC93A039); // 52 + II (a, b, c, d, x[12], 6, 0x655B59C3); // 53 + II (d, a, b, c, x[ 3], 10, 0x8F0CCC92); // 54 + II (c, d, a, b, x[10], 15, 0xFFEFF47D); // 55 + II (b, c, d, a, x[ 1], 21, 0x85845DD1); // 56 + II (a, b, c, d, x[ 8], 6, 0x6FA87E4F); // 57 + II (d, a, b, c, x[15], 10, 0xFE2CE6E0); // 58 + II (c, d, a, b, x[ 6], 15, 0xA3014314); // 59 + II (b, c, d, a, x[13], 21, 0x4E0811A1); // 60 + II (a, b, c, d, x[ 4], 6, 0xF7537E82); // 61 + II (d, a, b, c, x[11], 10, 0xBD3AF235); // 62 + II (c, d, a, b, x[ 2], 15, 0x2AD7D2BB); // 63 + II (b, c, d, a, x[ 9], 21, 0xEB86D391); // 64 + + // add the processed values to the context + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + } + + +/* Update the context for the next 'len' bytes of 'buffer'. 
+ 'len' does not need to be a multiple of 64. +*/ +void MD5SUM::md5_update( const uint8_t * const buffer, const unsigned long len ) + { + unsigned index = count & 0x3F; // data length in bytes mod 64 + count += len; // update data length + const unsigned rest = 64 - index; + unsigned long i; + + if( len >= rest ) // process as many bytes as possible + { + std::memcpy( ibuf + index, buffer, rest ); + md5_process_block( ibuf ); + for( i = rest; i + 63 < len; i += 64 ) + md5_process_block( buffer + i ); + index = 0; + } + else i = 0; + + std::memcpy( ibuf + index, buffer + i, len - i ); // save remaining input + } + + +// finish computation and return the digest +void MD5SUM::md5_finish( uint8_t digest[16] ) + { + uint8_t padding[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + uint8_t bits[8]; + uint64_t c = count << 3; // save data length in bits + for( int i = 0; i <= 7; ++i ) { bits[i] = (uint8_t)c; c >>= 8; } + + const unsigned index = count & 0x3F; // data length in bytes mod 64 + const unsigned len = (index < 56) ? 
(56 - index) : (120 - index); + md5_update( padding, len ); // pad to 56 mod 64 + md5_update( bits, 8 ); // append data length in bits + + for( int i = 0, j = 0; i < 4; i++, j += 4 ) // store state in digest + { + digest[j ] = (uint8_t)state[i]; + digest[j+1] = (uint8_t)(state[i] >> 8); + digest[j+2] = (uint8_t)(state[i] >> 16); + digest[j+3] = (uint8_t)(state[i] >> 24); + } + } + + +void compute_md5( const uint8_t * const buffer, const unsigned long len, + uint8_t digest[16] ) + { + MD5SUM md5sum; + if( len > 0 ) md5sum.md5_update( buffer, len ); + md5sum.md5_finish( digest ); + } + + +bool check_md5( const uint8_t * const buffer, const unsigned long len, + const uint8_t digest[16] ) + { + uint8_t new_digest[16]; + compute_md5( buffer, len, new_digest ); + return ( std::memcmp( digest, new_digest, 16 ) == 0 ); + } @@ -0,0 +1,49 @@ +/* Functions to compute MD5 message digest of memory blocks according to the + definition of MD5 in RFC 1321 from April 1992. + Copyright (C) 2020, 2021 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ + +class MD5SUM + { + uint64_t count; // data length in bytes, modulo 2^64 + uint32_t state[4]; // state (ABCD) + uint8_t ibuf[64]; // input buffer with space for a block + + void md5_process_block( const uint8_t block[64] ); + +public: + MD5SUM() { reset(); } + + void reset() + { + count = 0; + state[0] = 0x67452301; // magic initialization constants + state[1] = 0xEFCDAB89; + state[2] = 0x98BADCFE; + state[3] = 0x10325476; + } + + void md5_update( const uint8_t * const buffer, const unsigned long len ); + void md5_finish( uint8_t digest[16] ); + }; + +void compute_md5( const uint8_t * const buffer, const unsigned long len, + uint8_t digest[16] ); + +bool check_md5( const uint8_t * const buffer, const unsigned long len, + const uint8_t digest[16] ); @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
- You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 @@ -206,7 +206,7 @@ long ipow( const unsigned base, const unsigned exponent ) unsigned long result = 1; for( unsigned i = 0; i < exponent; ++i ) { - if( LONG_MAX / base >= result ) result *= base; + if( LONG_MAX / result >= base ) result *= base; else { result = LONG_MAX; break; } } return result; @@ -229,7 +229,7 @@ int open_input_files( const std::vector< std::string > & filenames, { struct stat in_stats; // not used infd_vector[i] = open_instream( filenames[i].c_str(), - ( i == 0 ) ? in_statsp : &in_stats, true, true ); + ( i == 0 ) ? in_statsp : &in_stats, false, true ); if( infd_vector[i] < 0 ) return 1; if( !file_crc( crc_vector[i], infd_vector[i], filenames[i].c_str() ) ) return 1; @@ -344,7 +344,7 @@ bool color_done( const std::vector< int > & color_vector, const int i ) } - // try dividing blocks in 2 color groups at every gap +// try dividing blocks in 2 color groups at every gap bool try_merge_member2( const long long mpos, const long long msize, const std::vector< Block > & block_vector, const std::vector< int > & color_vector, @@ -390,7 +390,7 @@ bool try_merge_member2( const long long mpos, const long long msize, } - // merge block by block +// merge block by block bool try_merge_member( const long long mpos, const long long msize, const std::vector< Block > & block_vector, const std::vector< int > & color_vector, @@ -447,7 +447,7 @@ bool try_merge_member( const long long mpos, const long long msize, } - // merge a single block split at every possible position +// merge a single block split at every possible position bool try_merge_member1( const long long mpos, const long long msize, const std::vector< Block > & block_vector, const std::vector< 
int > & color_vector, @@ -562,7 +562,7 @@ int test_member_from_file( const int infd, const unsigned long long msize, int merge_files( const std::vector< std::string > & filenames, const std::string & default_output_filename, - const bool force, const char terminator ) + const char terminator, const bool force ) { const int files = filenames.size(); std::vector< int > infd_vector( files ); @@ -576,7 +576,7 @@ int merge_files( const std::vector< std::string > & filenames, output_filename = default_output_filename.empty() ? insert_fixed( filenames[0] ) : default_output_filename; set_signal_handler(); - if( !open_outstream( force, false, true, false ) ) return 1; + if( !open_outstream( force, true, true, false ) ) return 1; if( !copy_file( infd_vector[0], outfd ) ) // copy whole file cleanup_and_fail( 1 ); @@ -611,21 +611,19 @@ int merge_files( const std::vector< std::string > & filenames, } bool done = false; - if( lzip_index.members() > 1 || block_vector.size() > 1 ) + if( block_vector.size() > 1 ) { - if( block_vector.size() > 1 ) - { - maybe_cluster_blocks( block_vector ); - done = try_merge_member2( mpos, msize, block_vector, color_vector, - infd_vector, terminator ); - print_pending_newline( terminator ); - } - if( !done ) - { - done = try_merge_member( mpos, msize, block_vector, color_vector, - infd_vector, terminator ); - print_pending_newline( terminator ); - } + maybe_cluster_blocks( block_vector ); + done = try_merge_member2( mpos, msize, block_vector, color_vector, + infd_vector, terminator ); + print_pending_newline( terminator ); + } + // With just one member and one differing block the merge can't succeed. + if( !done && ( lzip_index.members() > 1 || block_vector.size() > 1 ) ) + { + done = try_merge_member( mpos, msize, block_vector, color_vector, + infd_vector, terminator ); + print_pending_newline( terminator ); } if( !done ) { @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. 
+/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #define _FILE_OFFSET_BITS 64 @@ -29,6 +29,7 @@ #include <unistd.h> #include "lzip.h" +#include "md5.h" #include "mtester.h" @@ -80,6 +81,7 @@ void LZ_mtester::flush_data() { const int size = pos - stream_pos; crc32.update_buf( crc_, buffer + stream_pos, size ); + if( md5sum ) md5sum->md5_update( buffer + stream_pos, size ); if( outfd >= 0 && writeblock( outfd, buffer + stream_pos, size ) != size ) throw Error( "Write error" ); if( pos >= dictionary_size ) @@ -89,27 +91,68 @@ void LZ_mtester::flush_data() } -bool LZ_mtester::verify_trailer() +bool LZ_mtester::verify_trailer( FILE * const f, unsigned long long byte_pos ) { const Lzip_trailer * const trailer = rdec.get_trailer(); + if( !trailer ) + { + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fputs( "Can't get trailer.\n", f ); } + return false; + } + const unsigned long long data_size = data_position(); + const unsigned long long member_size = member_position(); + bool error = false; - return ( trailer && - trailer->data_crc() == crc() && - trailer->data_size() == data_position() && - trailer->member_size() == member_position() ); + const unsigned td_crc = trailer->data_crc(); + if( td_crc != crc() ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "CRC mismatch; stored %08X, computed %08X\n", + td_crc, crc() ); } + } + const unsigned long long td_size = trailer->data_size(); + if( td_size != data_size ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "Data size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + td_size, td_size, data_size, data_size ); } + } + const unsigned long long tm_size = trailer->member_size(); + if( tm_size != member_size ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { 
std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "Member size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + tm_size, tm_size, member_size, member_size ); } + } + return !error; } /* Return value: 0 = OK, 1 = decoder error, 2 = unexpected EOF, 3 = trailer error, 4 = unknown marker found, -1 = pos_limit reached. */ -int LZ_mtester::test_member( const unsigned long pos_limit ) +int LZ_mtester::test_member( const unsigned long long mpos_limit, + const unsigned long long dpos_limit, + FILE * const f, const unsigned long long byte_pos ) { - if( pos_limit < Lzip_header::size + 5 ) return -1; + if( mpos_limit < Lzip_header::size + 5 ) return -1; if( member_position() == Lzip_header::size ) rdec.load(); while( !rdec.finished() ) { - if( member_position() >= pos_limit ) { flush_data(); return -1; } + if( member_position() >= mpos_limit || data_position() >= dpos_limit ) + { flush_data(); return -1; } const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { @@ -172,14 +215,19 @@ int LZ_mtester::test_member( const unsigned long pos_limit ) flush_data(); if( len == min_match_len ) // End Of Stream marker { - if( verify_trailer() ) return 0; else return 3; + if( verify_trailer( f, byte_pos ) ) return 0; else return 3; + } + if( verbosity >= 0 && f ) + { + if( byte_pos ) std::fprintf( f, "byte %llu\n", byte_pos ); + std::fprintf( f, "Unsupported marker code '%d'\n", len ); } return 4; } - if( distance > max_rep0 ) max_rep0 = distance; } } rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + if( rep0 > max_rep0 ) max_rep0 = rep0; state.set_match(); if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) { flush_data(); return 1; } @@ -197,10 +245,15 @@ int LZ_mtester::debug_decode_member( const long long dpos, const long long mpos, const bool show_packets ) { rdec.load(); + unsigned old_tmpos = member_position(); // truncated member_position while( 
!rdec.finished() ) { const unsigned long long dp = data_position() + dpos; const unsigned long long mp = member_position() + mpos - 4; + const unsigned tmpos = member_position(); + set_max_packet( tmpos - old_tmpos, mp ); + old_tmpos = tmpos; + ++total_packets_; const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { @@ -285,6 +338,9 @@ int LZ_mtester::debug_decode_member( const long long dpos, const long long mpos, { rdec.normalize(); flush_data(); + const unsigned tmpos = member_position(); + set_max_marker( tmpos - old_tmpos ); + old_tmpos = tmpos; if( show_packets ) std::printf( "%6llu %6llu marker code '%d'\n", mp, dp, len ); if( len == min_match_len ) // End Of Stream marker @@ -292,8 +348,7 @@ int LZ_mtester::debug_decode_member( const long long dpos, const long long mpos, if( show_packets ) std::printf( "%6llu %6llu member trailer\n", mpos + member_position(), dpos + data_position() ); - if( verify_trailer() ) return 0; - if( show_packets ) std::fputs( "trailer error\n", stdout ); + if( verify_trailer( show_packets ? stdout : 0 ) ) return 0; return 3; } if( len == min_match_len + 1 ) // Sync Flush marker @@ -302,10 +357,10 @@ int LZ_mtester::debug_decode_member( const long long dpos, const long long mpos, } return 4; } - if( distance > max_rep0 ) max_rep0 = distance; } } rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + if( rep0 > max_rep0 ) { max_rep0 = rep0; max_rep0_pos = mp; } state.set_match(); if( show_packets ) std::printf( "%6llu %6llu match %6u,%3d (%6lld)", @@ -1,33 +1,31 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. 
- This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ class Range_mtester { const uint8_t * const buffer; // input buffer - const long buffer_size; - long pos; // current pos in buffer + const long long buffer_size; + long long pos; // current pos in buffer uint32_t code; uint32_t range; bool at_stream_end; - void operator=( const Range_mtester & ); // declared as private - public: - Range_mtester( const uint8_t * const buf, const long buf_size ) + Range_mtester( const uint8_t * const buf, const long long buf_size ) : buffer( buf ), buffer_size( buf_size ), @@ -38,7 +36,7 @@ public: {} bool finished() { return pos >= buffer_size; } - unsigned long member_position() const { return pos; } + unsigned long long member_position() const { return pos; } uint8_t get_byte() { @@ -58,7 +56,7 @@ public: void load() { code = 0; - for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); + for( int i = 0; i < 5; ++i ) code = ( code << 8 ) | get_byte(); range = 0xFFFFFFFFU; code &= range; // make sure that first byte is discarded } @@ -66,7 +64,7 @@ public: void normalize() { if( range <= 0x00FFFFFFU ) - { range <<= 8; code = (code << 8) | get_byte(); } + { range <<= 8; code = ( code << 8 ) | get_byte(); } } unsigned decode( const int num_bits ) @@ -79,7 +77,7 @@ public: // symbol <<= 1; // if( code >= range ) { code -= range; symbol |= 1; } const bool bit = ( code >= range ); - symbol = ( symbol << 1 ) + bit; + symbol <<= 1; symbol += bit; code -= range & ( 0U - bit ); } return symbol; @@ -92,7 +90,8 @@ public: if( code < bound ) { range = bound; - bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; return 0; } else @@ -106,8 +105,7 @@ public: unsigned decode_tree3( Bit_model bm[] ) { - unsigned symbol = 1; - symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + unsigned symbol = 2 | decode_bit( bm[1] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return 
symbol & 7; @@ -115,8 +113,7 @@ public: unsigned decode_tree6( Bit_model bm[] ) { - unsigned symbol = 1; - symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + unsigned symbol = 2 | decode_bit( bm[1] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); @@ -140,7 +137,7 @@ public: for( int i = 0; i < num_bits; ++i ) { const unsigned bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; + model <<= 1; model += bit; symbol |= ( bit << i ); } return symbol; @@ -149,12 +146,9 @@ public: unsigned decode_tree_reversed4( Bit_model bm[] ) { unsigned symbol = decode_bit( bm[1] ); - unsigned model = 2 + symbol; - unsigned bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; symbol |= ( bit << 1 ); - bit = decode_bit( bm[model] ); - model = ( model << 1 ) + bit; symbol |= ( bit << 2 ); - symbol |= ( decode_bit( bm[model] ) << 3 ); + symbol += decode_bit( bm[2+symbol] ) << 1; + symbol += decode_bit( bm[4+symbol] ) << 2; + symbol += decode_bit( bm[8+symbol] ) << 3; return symbol; } @@ -165,9 +159,9 @@ public: while( symbol < 0x100 ) { const unsigned match_bit = ( match_byte <<= 1 ) & 0x100; - const unsigned bit = decode_bit( bm1[match_bit+symbol] ); - symbol = ( symbol << 1 ) | bit; - if( match_bit != bit << 8 ) + const bool bit = decode_bit( bm1[symbol+match_bit] ); + symbol <<= 1; symbol |= bit; + if( match_bit >> 8 != bit ) { while( symbol < 0x100 ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); @@ -187,6 +181,7 @@ public: } }; +class MD5SUM; // forward declaration class LZ_mtester { @@ -203,7 +198,13 @@ class LZ_mtester unsigned rep2; // repeated distances unsigned rep3; State state; - unsigned max_rep0; // maximum distance found + MD5SUM * const md5sum; + unsigned long long total_packets_; // total number of packets in member + unsigned long long max_rep0_pos; // file position of maximum distance + unsigned max_rep0; // maximum distance 
found + std::vector< unsigned long long > max_packet_posv_; // file pos of large packets + unsigned max_packet_size_; // maximum packet size found + unsigned max_marker_size_; // maximum marker size found bool pos_wrapped; Bit_model bm_literal[1<<literal_context_bits][0x300]; @@ -222,7 +223,7 @@ class LZ_mtester void print_block( const int len ); void flush_data(); - bool verify_trailer(); + bool verify_trailer( FILE * const f = 0, unsigned long long byte_pos = 0 ); uint8_t peek_prev() const { return buffer[((pos > 0) ? pos : dictionary_size)-1]; } @@ -271,11 +272,21 @@ class LZ_mtester } } - void operator=( const LZ_mtester & ); // declared as private +void set_max_packet( const unsigned new_size, const unsigned long long pos ) + { + if( max_packet_size_ > new_size || new_size == 0 ) return; + if( max_packet_size_ < new_size ) // new max size + { max_packet_size_ = new_size; max_packet_posv_.clear(); } + max_packet_posv_.push_back( pos - new_size ); // pos of first byte + } + +void set_max_marker( const unsigned new_size ) + { if( max_marker_size_ < new_size ) max_marker_size_ = new_size; } public: - LZ_mtester( const uint8_t * const ibuf, const long ibuf_size, - const unsigned dict_size, const int ofd = -1 ) + LZ_mtester( const uint8_t * const ibuf, const long long ibuf_size, + const unsigned dict_size, const int ofd = -1, + MD5SUM * const md5sum_ = 0 ) : partial_data_pos( 0 ), rdec( ibuf, ibuf_size ), @@ -289,7 +300,12 @@ public: rep1( 0 ), rep2( 0 ), rep3( 0 ), + md5sum( md5sum_ ), + total_packets_( -1ULL ), // don't count EOS marker + max_rep0_pos( 0 ), max_rep0( 0 ), + max_packet_size_( 0 ), + max_marker_size_( 0 ), pos_wrapped( false ) // prev_byte of first byte; also for peek( 0 ) on corrupt file { buffer[dictionary_size-1] = 0; } @@ -299,11 +315,28 @@ public: unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } unsigned long long data_position() const { return partial_data_pos + pos; } bool finished() { return rdec.finished(); } - unsigned long 
member_position() const { return rdec.member_position(); } + unsigned long long member_position() const { return rdec.member_position(); } + unsigned long long total_packets() const { return total_packets_; } + unsigned long long max_distance_pos() const { return max_rep0_pos; } unsigned max_distance() const { return max_rep0 + 1; } + const std::vector< unsigned long long > & max_packet_posv() const + { return max_packet_posv_; } + unsigned max_packet_size() const { return max_packet_size_; } + unsigned max_marker_size() const { return max_marker_size_; } + + const uint8_t * get_buffers( const uint8_t ** prev_bufferp, + int * sizep, int * prev_sizep ) const + { *sizep = ( pos_wrapped && pos == 0 ) ? dictionary_size : pos; + *prev_sizep = ( pos_wrapped && pos > 0 ) ? dictionary_size - pos : 0; + *prev_bufferp = buffer + pos; return buffer; } void duplicate_buffer(); - int test_member( const unsigned long pos_limit = LONG_MAX ); // sets max_rep0 + // these two functions set max_rep0 + int test_member( const unsigned long long mpos_limit = LLONG_MAX, + const unsigned long long dpos_limit = LLONG_MAX, + FILE * const f = 0, const unsigned long long byte_pos = 0 ); + /* this function also sets max_rep0_pos, total_packets_, max_packet_size_, + max_packet_posv_, and max_marker_size_ */ int debug_decode_member( const long long dpos, const long long mpos, - const bool show_packets ); // sets max_rep0 + const bool show_packets ); }; diff --git a/nrep_stats.cc b/nrep_stats.cc new file mode 100644 index 0000000..2f335e6 --- /dev/null +++ b/nrep_stats.cc @@ -0,0 +1,117 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "lzip_index.h" + + +/* Show how well the frequency of sequences of N repeated bytes in LZMA data + matches the value expected for random data. ( 1 / 2^( 8 * N ) ) + Print cumulative data for all files followed by the name of the first + file with the longest sequence. +*/ +int print_nrep_stats( const std::vector< std::string > & filenames, + const int repeated_byte, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ) + { + std::vector< unsigned long > len_vector; + unsigned long long best_pos = 0, lzma_size = 0; + int best_name = -1, retval = 0; + const bool count_all = ( repeated_byte < 0 || repeated_byte >= 256 ); + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? 
STDIN_FILENO : + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + close( infd ); + continue; + } + const unsigned long long cdata_size = lzip_index.cdata_size(); + const uint8_t * const buffer = + (const uint8_t *)mmap( 0, cdata_size, PROT_READ, MAP_PRIVATE, infd, 0 ); + close( infd ); + if( buffer == MAP_FAILED ) + { show_file_error( input_filename, "Can't mmap", errno ); + set_retval( retval, 1 ); continue; } + for( long j = 0; j < lzip_index.members(); ++j ) + { + const Block & mb = lzip_index.mblock( j ); + long long pos = mb.pos() + 7; // skip header (+1 byte) and + const long long end = mb.end() - 20; // trailer of each member + lzma_size += end - pos; + while( pos < end ) + { + const uint8_t byte = buffer[pos++]; + if( buffer[pos] == byte ) + { + unsigned len = 2; + ++pos; + while( pos < end && buffer[pos] == byte ) { ++pos; ++len; } + if( !count_all && repeated_byte != (int)byte ) continue; + if( len >= len_vector.size() ) { len_vector.resize( len + 1 ); + best_name = i; best_pos = pos - len; } + ++len_vector[len]; + } + } + } + munmap( (void *)buffer, cdata_size ); + } + + if( count_all ) + std::fputs( "\nShowing repeated sequences of any byte value.\n", stdout ); + else + std::printf( "\nShowing repeated sequences of the byte value 0x%02X\n", + repeated_byte ); + std::printf( "Total size of LZMA data: %llu bytes (%sBytes)\n", + lzma_size, format_num( lzma_size, 999 ) ); + for( unsigned len = 2; len < len_vector.size(); ++len ) + if( len_vector[len] > 0 ) + std::printf( "len %u found %lu times, 1 every %llu bytes " + "(expected 1 every %sB)\n", + len, len_vector[len], lzma_size / len_vector[len], + format_num( 1ULL << ( 8 * ( len - count_all ) 
), -1ULL, -1 ) ); + if( best_name >= 0 ) + std::printf( "Longest sequence found at position %llu of '%s'\n", + best_pos, filenames[best_name].c_str() ); + return retval; + } diff --git a/range_dec.cc b/range_dec.cc index 78d586f..24ac5e8 100644 --- a/range_dec.cc +++ b/range_dec.cc @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #define _FILE_OFFSET_BITS 64 @@ -35,21 +35,21 @@ namespace { -int decompress_member( const int infd, const Pretty_print & pp, - const unsigned long long mpos, - const unsigned long long outskip, - const unsigned long long outend ) +bool decompress_member( const int infd, const Pretty_print & pp, + const unsigned long long mpos, + const unsigned long long outskip, + const unsigned long long outend ) { Range_decoder rdec( infd ); Lzip_header header; rdec.read_data( header.data, Lzip_header::size ); if( rdec.finished() ) // End Of File - { pp( "File ends unexpectedly at member header." ); return 2; } - if( !header.verify_magic() ) { pp( bad_magic_msg ); return 2; } + { pp( "File ends unexpectedly at member header." ); return false; } + if( !header.verify_magic() ) { pp( bad_magic_msg ); return false; } if( !header.verify_version() ) - { pp( bad_version( header.version() ) ); return 2; } + { pp( bad_version( header.version() ) ); return false; } const unsigned dictionary_size = header.dictionary_size(); - if( !isvalid_ds( dictionary_size ) ) { pp( bad_dict_msg ); return 2; } + if( !isvalid_ds( dictionary_size ) ) { pp( bad_dict_msg ); return false; } if( verbosity >= 2 ) pp(); @@ -64,16 +64,27 @@ int decompress_member( const int infd, const Pretty_print & pp, "File ends unexpectedly" : "Decoder error", mpos + rdec.member_position() ); } - return 2; + return false; + } + if( decoder.data_position() < outend - outskip ) + { + if( verbosity >= 0 ) + { pp(); std::fprintf( stderr, + "%sMember at pos %llu contains only %llu bytes of %llu requested.\n", + ( verbosity >= 2 ) ? 
"\n" : "", mpos, + decoder.data_position() - outskip, outend - outskip ); } + return false; } if( verbosity >= 2 ) std::fputs( "done\n", stderr ); - return 0; + return true; } +} // end namespace + const char * format_num( unsigned long long num, - unsigned long long limit = -1ULL, - const int set_prefix = 0 ) + unsigned long long limit, + const int set_prefix ) { const char * const si_prefix[8] = { "k", "M", "G", "T", "P", "E", "Z", "Y" }; @@ -85,20 +96,22 @@ const char * format_num( unsigned long long num, static bool si = true; if( set_prefix ) si = ( set_prefix > 0 ); + unsigned long long den = 1; const unsigned factor = ( si ? 1000 : 1024 ); char * const buf = buffer[current++]; current %= buffers; const char * const * prefix = ( si ? si_prefix : binary_prefix ); const char * p = ""; - bool exact = ( num % factor == 0 ); - for( int i = 0; i < 8 && ( num > limit || ( exact && num >= factor ) ); ++i ) - { num /= factor; if( num % factor != 0 ) exact = false; p = prefix[i]; } - snprintf( buf, bufsize, "%llu %s", num, p ); + for( int i = 0; i < 8 && num / den >= factor && den * factor > den; ++i ) + { if( num / den <= limit && num % ( den * factor ) != 0 ) break; + den *= factor; p = prefix[i]; } + if( num % den == 0 ) + snprintf( buf, bufsize, "%llu %s", num / den, p ); + else + snprintf( buf, bufsize, "%3.2f %s", (double)num / den, p ); return buf; } -} // end namespace - bool safe_seek( const int fd, const long long pos ) { @@ -114,7 +127,8 @@ int range_decompress( const std::string & input_filename, const bool to_stdout ) { struct stat in_stats; - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, @@ -123,30 +137,30 @@ int range_decompress( const std::string & input_filename, { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); return 
lzip_index.retval(); } - if( range.end() > lzip_index.udata_size() ) - range.size( std::max( 0LL, lzip_index.udata_size() - range.pos() ) ); + const long long udata_size = lzip_index.udata_size(); + if( range.end() > udata_size ) + range.size( std::max( 0LL, udata_size - range.pos() ) ); if( range.size() <= 0 ) - { show_file_error( input_filename.c_str(), "Nothing to do." ); return 0; } + { if( udata_size > 0 ) + show_file_error( input_filename.c_str(), "Nothing to do." ); + return 0; } - if( to_stdout || default_output_filename.empty() ) - outfd = STDOUT_FILENO; + if( to_stdout || default_output_filename.empty() ) outfd = STDOUT_FILENO; else { output_filename = default_output_filename; set_signal_handler(); - if( !open_outstream( force, false, false, false ) ) - { close( infd ); return 1; } + if( !open_outstream( force, true, false, false ) ) return 1; } if( verbosity >= 1 ) - std::fprintf( stderr, "Decompressing range %sB to %sB (%sof %sBytes)\n", + std::fprintf( stderr, "Decompressing range %sB to %sB (%sB of %sBytes)\n", format_num( range.pos() ), format_num( range.pos() + range.size() ), - format_num( range.size() ), - format_num( lzip_index.udata_size() ) ); + format_num( range.size() ), format_num( udata_size ) ); Pretty_print pp( input_filename ); - int retval = 0; + bool error = false; for( long i = 0; i < lzip_index.members(); ++i ) { const Block & db = lzip_index.dblock( i ); @@ -157,16 +171,15 @@ int range_decompress( const std::string & input_filename, const long long outskip = std::max( 0LL, range.pos() - db.pos() ); const long long outend = std::min( db.size(), range.end() - db.pos() ); const long long mpos = lzip_index.mblock( i ).pos(); - if( !safe_seek( infd, mpos ) ) { retval = 1; break; } - const int tmp = decompress_member( infd, pp, mpos, outskip, outend ); - if( tmp && ( tmp != 2 || !ignore_errors ) ) cleanup_and_fail( tmp ); - if( tmp > retval ) retval = tmp; + if( !safe_seek( infd, mpos ) ) cleanup_and_fail( 1 ); + if( !decompress_member( 
infd, pp, mpos, outskip, outend ) ) + { if( !ignore_errors ) cleanup_and_fail( 2 ); else error = true; } pp.reset(); } } close( infd ); - retval = std::max( retval, close_outstream( &in_stats ) ); - if( verbosity >= 2 && retval == 0 ) + if( close_outstream( &in_stats ) != 0 ) cleanup_and_fail( 1 ); + if( verbosity >= 2 && !error ) std::fputs( "Byte range decompressed successfully.\n", stderr ); - return retval; + return 0; // either no error or ignored } @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #define _FILE_OFFSET_BITS 64 @@ -43,24 +43,9 @@ void print_pending_newline( const char terminator ) pending_newline = false; } -uint8_t * read_member( const int infd, const long long mpos, - const long long msize ) - { - if( msize <= 0 || msize > LONG_MAX ) - { show_error( "Member is larger than LONG_MAX." ); return 0; } - if( !safe_seek( infd, mpos ) ) return 0; - uint8_t * const buffer = new uint8_t[msize]; - - if( readblock( infd, buffer, msize ) != msize ) - { show_error( "Error reading input file", errno ); - delete[] buffer; return 0; } - return buffer; - } - - bool gross_damage( const long long msize, const uint8_t * const mbuffer ) { - enum { maxlen = 6 }; // max number of consecutive identical bytes + enum { maxlen = 7 }; // max number of consecutive identical bytes long i = Lzip_header::size; const long end = msize - Lzip_trailer::size - maxlen; while( i < end ) @@ -73,19 +58,10 @@ bool gross_damage( const long long msize, const uint8_t * const mbuffer ) } -int seek_write( const int fd, const uint8_t * const buf, const int size, - const long long pos ) - { - if( lseek( fd, pos, SEEK_SET ) == pos ) - return writeblock( fd, buf, size ); - return 0; - } - - // Return value: 0 = no change, 5 = repaired pos int repair_dictionary_size( const long long msize, uint8_t * const mbuffer ) { - enum { dictionary_size_9 = 1 << 25 }; // dictionary size of option -9 + const unsigned long long dictionary_size_9 = 1 << 25; // dict size of opt -9 Lzip_header & header = *(Lzip_header *)mbuffer; unsigned dictionary_size = header.dictionary_size(); const Lzip_trailer & trailer = @@ -96,8 +72,7 @@ int repair_dictionary_size( const long long msize, uint8_t * const mbuffer ) if( !valid_ds || dictionary_size < dictionary_size_9 ) { - dictionary_size = - std::min( data_size, (unsigned long long)dictionary_size_9 ); + dictionary_size = std::min( data_size, dictionary_size_9 ); if( dictionary_size < min_dictionary_size ) dictionary_size = min_dictionary_size; LZ_mtester mtester( 
mbuffer, msize, dictionary_size ); @@ -176,12 +151,37 @@ long repair_member( const long long mpos, const long long msize, } // end namespace +long long seek_write( const int fd, const uint8_t * const buf, + const long long size, const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return writeblock( fd, buf, size ); + return 0; + } + + +uint8_t * read_member( const int infd, const long long mpos, + const long long msize ) + { + if( msize <= 0 || msize > LONG_MAX ) + { show_error( "Member is larger than LONG_MAX." ); return 0; } + if( !safe_seek( infd, mpos ) ) return 0; + uint8_t * const buffer = new uint8_t[msize]; + + if( readblock( infd, buffer, msize ) != msize ) + { show_error( "Error reading input file", errno ); + delete[] buffer; return 0; } + return buffer; + } + + int repair_file( const std::string & input_filename, const std::string & default_output_filename, - const bool force, const char terminator ) + const char terminator, const bool force ) { struct stat in_stats; - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; const Lzip_index lzip_index( infd, true, true, true ); @@ -221,20 +221,21 @@ int repair_file( const std::string & input_filename, pos = repair_dictionary_size( msize, mbuffer ); if( pos == 0 ) pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 1, - Lzip_header::size + 5, dictionary_size, terminator ); + Lzip_header::size + 6, dictionary_size, terminator ); if( pos == 0 ) - pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 6, + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 7, failure_pos, dictionary_size, terminator ); print_pending_newline( terminator ); } - if( pos < 0 ) cleanup_and_fail( 1 ); + if( pos < 0 ) + { show_error( "Can't prepare master." 
); cleanup_and_fail( 1 ); } if( pos > 0 ) { if( outfd < 0 ) // first damaged member repaired { if( !safe_seek( infd, 0 ) ) return 1; set_signal_handler(); - if( !open_outstream( true, false ) ) { close( infd ); return 1; } + if( !open_outstream( true, true ) ) return 1; if( !copy_file( infd, outfd ) ) // copy whole file cleanup_and_fail( 1 ); } @@ -267,7 +268,8 @@ int debug_delay( const std::string & input_filename, Block range, const char terminator ) { struct stat in_stats; // not used - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; const Lzip_index lzip_index( infd, true, true ); @@ -346,7 +348,8 @@ int debug_repair( const std::string & input_filename, const Bad_byte & bad_byte, const char terminator ) { struct stat in_stats; // not used - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; const Lzip_index lzip_index( infd, true, true ); @@ -368,9 +371,9 @@ int debug_repair( const std::string & input_filename, if( test_member_from_file( infd, msize, &failure_pos ) != 0 ) { if( verbosity >= 0 ) - std::printf( "Member %ld of %ld already damaged (failure pos = %llu)\n", - idx + 1, lzip_index.members(), mpos + failure_pos ); - return 1; + std::fprintf( stderr, "Member %ld of %ld already damaged (failure pos = %llu)\n", + idx + 1, lzip_index.members(), mpos + failure_pos ); + return 2; } } uint8_t * const mbuffer = read_member( infd, mpos, msize ); @@ -410,26 +413,35 @@ int debug_repair( const std::string & input_filename, long pos = repair_dictionary_size( msize, mbuffer ); if( pos == 0 ) pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 1, - Lzip_header::size + 5, dictionary_size, terminator ); + Lzip_header::size + 6, dictionary_size, terminator ); if( pos == 0 ) - pos 
= repair_member( mpos, msize, mbuffer, Lzip_header::size + 6, + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 7, failure_pos, dictionary_size, terminator ); print_pending_newline( terminator ); delete[] mbuffer; - if( pos < 0 ) - { show_error( "Can't prepare master." ); return 1; } + if( pos < 0 ) { show_error( "Can't prepare master." ); return 1; } if( pos == 0 ) internal_error( "can't repair input file." ); - if( verbosity >= 1 ) - std::fputs( "Member repaired successfully.\n", stdout ); + if( verbosity >= 1 ) std::fputs( "Member repaired successfully.\n", stdout ); return 0; } +/* If show_packets is true, print to stdout descriptions of the decoded LZMA + packets. Print also some global values; total number of packets in + member, max distance (rep0) and its file position, max LZMA packet size + in each member and the file position of these packets. + (Packet sizes are a fractionary number of bytes. The packet and marker + sizes shown by option -X are the number of extra bytes required to decode + the packet, not counting the data present in the range decoder before and + after the decoding. The max marker size of a 'Sync Flush marker' does not + include the 5 bytes read by rdec.load). 
+*/ int debug_decompress( const std::string & input_filename, const Bad_byte & bad_byte, const bool show_packets ) { struct stat in_stats; - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; const Lzip_index lzip_index( infd, true, true ); @@ -465,6 +477,22 @@ int debug_decompress( const std::string & input_filename, LZ_mtester mtester( mbuffer, msize, dictionary_size, outfd ); const int result = mtester.debug_decode_member( dpos, mpos, show_packets ); delete[] mbuffer; + if( show_packets ) + { + const std::vector< unsigned long long > & mppv = mtester.max_packet_posv(); + const unsigned mpackets = mppv.size(); + std::printf( "Total packets in member = %llu\n" + "Max distance in any match = %u at file position %llu\n" + "Max marker size found = %u\n" + "Max packet size found = %u (%u packets)%s", + mtester.total_packets(), mtester.max_distance(), + mtester.max_distance_pos(), mtester.max_marker_size(), + mtester.max_packet_size(), mpackets, + mpackets ? " at file positions" : "" ); + for( unsigned i = 0; i < mpackets; ++i ) + std::printf( " %llu", mppv[i] ); + std::fputc( '\n', stdout ); + } if( result != 0 ) { if( verbosity >= 0 && result <= 2 && show_packets ) diff --git a/reproduce.cc b/reproduce.cc new file mode 100644 index 0000000..40104b7 --- /dev/null +++ b/reproduce.cc @@ -0,0 +1,785 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <csignal> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/wait.h> + +#include "lzip.h" +#include "md5.h" +#include "mtester.h" +#include "lzip_index.h" + + +namespace { + +const char * final_msg = 0; + +bool pending_newline = false; + +void print_pending_newline( const char terminator ) + { if( pending_newline && terminator != '\n' ) std::fputc( '\n', stdout ); + pending_newline = false; } + +int fatal_retval = 0; + +int fatal( const int retval ) + { if( fatal_retval == 0 ) fatal_retval = retval; return retval; } + +// Returns the position of the damaged area in the member, or -1 if error. +long long zeroed_sector_pos( const char * const input_filename, + const uint8_t * const mbuffer, const long long msize, + long long * const sizep, uint8_t * const valuep ) + { + enum { minlen = 8 }; // min number of consecutive identical bytes + long long i = Lzip_header::size; + const long long end = msize - minlen; + long long begin = -1; + long long size = 0; + uint8_t value = 0; + while( i < end ) // leave i pointing to the first differing byte + { + const uint8_t byte = mbuffer[i++]; + if( mbuffer[i] == byte ) + { + const long long pos = i - 1; + ++i; + while( i < msize && mbuffer[i] == byte ) ++i; + if( i - pos >= minlen ) + { + if( size > 0 ) + { show_file_error( input_filename, + "Member contains more than one damaged area." ); + return -1; } + begin = pos; + size = i - pos; + value = byte; + break; + } + } + } + if( begin < 0 || size <= 0 ) + { show_file_error( input_filename, "Can't locate damaged area." 
); + return -1; } + *sizep = size; + *valuep = value; + return begin; + } + + +const LZ_mtester * prepare_master2( const uint8_t * const mbuffer, + const long long msize, + const long long begin, + const unsigned dictionary_size ) + { + long long pos_limit = std::max( begin - 16, (long long)Lzip_header::size ); + LZ_mtester * master = new LZ_mtester( mbuffer, msize, dictionary_size ); + if( master->test_member( pos_limit ) != -1 || + master->member_position() > (unsigned long long)begin ) + { delete master; return 0; } + // decompress as much data as possible without surpassing begin + while( pos_limit < begin && master->test_member( pos_limit + 1 ) == -1 && + master->member_position() <= (unsigned long long)begin ) + ++pos_limit; + delete master; + master = new LZ_mtester( mbuffer, msize, dictionary_size ); + if( master->test_member( pos_limit ) == -1 && + master->member_position() <= (unsigned long long)begin ) return master; + delete master; + return 0; + } + + +/* Locate in the reference file (rbuf) the truncated data in the dictionary. + The reference file must match from the last byte decoded back to the + beginning of the file or to the beginning of the dictionary. + Choose the match nearest to the beginning of the file. + As a fallback, locate the longest partial match at least 512 bytes long. + Returns the offset in file of the first undecoded byte, or -1 if no match. 
*/ +long long match_file( const LZ_mtester & master, const uint8_t * const rbuf, + const long long rsize, + const char * const reference_filename ) + { + const uint8_t * prev_buffer; + int dec_size, prev_size; + const uint8_t * const dec_buffer = + master.get_buffers( &prev_buffer, &dec_size, &prev_size ); + if( dec_size < 4 ) + { if( verbosity >= 1 ) + { std::printf( "'%s' can't match: not enough data in dictionary.\n", + reference_filename ); pending_newline = false; } + return -1; } + long long offset = -1; // offset in file of the first undecoded byte + bool multiple = false; + const uint8_t last_byte = dec_buffer[dec_size-1]; + for( long long i = rsize - 1; i >= 3; --i ) // match at least 4 bytes at bof + if( rbuf[i] == last_byte ) + { + // compare file with the two parts of the dictionary + int len = std::min( (long long)dec_size - 1, i ); + if( std::memcmp( rbuf + i - len, dec_buffer + dec_size - 1 - len, len ) == 0 ) + { + int len2 = std::min( (long long)prev_size, i - len ); + if( len2 <= 0 || !prev_buffer || + std::memcmp( rbuf + i - len - len2, + prev_buffer + prev_size - len2, len2 ) == 0 ) + { + if( offset >= 0 ) multiple = true; + offset = i + 1; + i -= len + len2; + } + } + } + if( offset >= 0 ) + { + if( multiple && verbosity >= 1 ) + { std::printf( "warning: %s: Multiple matches. 
Using match at offset %lld\n", + reference_filename, offset ); std::fflush( stdout ); } + if( !multiple && verbosity >= 2 ) + { std::printf( "%s: Match found at offset %lld\n", + reference_filename, offset ); std::fflush( stdout ); } + return offset; + } + int maxlen = 0; // choose longest match in reference file + for( long long i = rsize - 1; i >= 0; --i ) + if( rbuf[i] == last_byte ) + { + // compare file with the two parts of the dictionary + const int size1 = std::min( (long long)dec_size, i + 1 ); + int len = 1; + while( len < size1 && rbuf[i-len] == dec_buffer[dec_size-len-1] ) ++len; + if( len == size1 ) + { + int size2 = std::min( (long long)prev_size, i + 1 - size1 ); + while( len < size1 + size2 && + rbuf[i-len] == prev_buffer[prev_size+size1-len] ) ++len; + } + if( len > maxlen ) { maxlen = len; offset = i + 1; i -= len; } + } + if( maxlen >= 512 && offset >= 0 ) + { + if( verbosity >= 1 ) + { std::printf( "warning: %s: Partial match found at offset %lld, len %d." + " Reference data may be mixed with other data.\n", + reference_filename, offset, maxlen ); + std::fflush( stdout ); } + return offset; + } + if( verbosity >= 1 ) + { std::printf( "'%s' does not match with decoded data.\n", + reference_filename ); pending_newline = false; } + return -1; + } + + +void show_close_error( const char * const prog_name = "data feeder" ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error closing output of %s: %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +void show_exec_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't exec '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +void show_fork_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't fork '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +/* Returns -1 if child not terminated, 1 in case of error, or exit status of + child process 
'pid'. */ +int child_status( const pid_t pid, const char * const name ) + { + int status; + while( true ) + { + const int tmp = waitpid( pid, &status, WNOHANG ); + if( tmp == -1 && errno != EINTR ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error checking status of '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return 1; + } + if( tmp == 0 ) return -1; // child not terminated + if( tmp == pid ) break; // child terminated + } + if( WIFEXITED( status ) ) return WEXITSTATUS( status ); + return 1; + } + + +// Returns exit status of child process 'pid', or 1 in case of error. +// +int wait_for_child( const pid_t pid, const char * const name ) + { + int status; + while( waitpid( pid, &status, 0 ) == -1 ) + { + if( errno != EINTR ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error waiting termination of '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return 1; + } + } + if( WIFEXITED( status ) ) return WEXITSTATUS( status ); + return 1; + } + + +bool good_status( const pid_t pid, const char * const name, const bool finished ) + { + bool error = false; + if( pid ) + { + if( !finished ) + { + const int tmp = child_status( pid, name ); + if( tmp < 0 ) // child not terminated + { kill( pid, SIGTERM ); wait_for_child( pid, name ); } + else if( tmp != 0 ) error = true; // child status != 0 + } + else + if( wait_for_child( pid, name ) != 0 ) error = true; + if( error ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s: Child terminated with error status.\n", + program_name, name ); + return false; + } + } + return !error; + } + + +/* Feed to lzip through 'ofd' the data decompressed up to 'good_dsize' + (master->data_position) followed by the reference data from byte at + offset 'offset' of reference file, up to a total of 'dsize' bytes. 
*/ +bool feed_data( uint8_t * const mbuffer, const long long msize, + const long long dsize, const unsigned long long good_dsize, + const uint8_t * const rbuf, const long long rsize, + const long long offset, const unsigned dictionary_size, + const int ofd ) + { + LZ_mtester mtester( mbuffer, msize, dictionary_size, ofd ); + if( mtester.test_member( LLONG_MAX, good_dsize ) != -1 || + good_dsize != mtester.data_position() ) + { show_error( "Error decompressing prefix data for compressor." ); + return false; } + // limit reference data to remaining decompressed data in member + const long long end = + std::min( (unsigned long long)rsize, dsize - good_dsize + offset ); + for( long long i = offset; i < end; ) + { + const int size = std::min( end - i, 65536LL ); + if( writeblock( ofd, rbuf + i, size ) != size ) + { show_error( "Error writing reference data to compressor", errno ); + return false; } + i += size; + } + return true; + } + + +/* Try to reproduce the zeroed sector. + Return value: -1 = failure, 0 = success, > 0 = fatal error. 
*/ +int try_reproduce( uint8_t * const mbuffer, const long long msize, + const long long dsize, const unsigned long long good_dsize, + const long long begin, const long long end, + const uint8_t * const rbuf, const long long rsize, + const long long offset, const unsigned dictionary_size, + const char ** const lzip_argv, MD5SUM * const md5sump, + const char terminator, const bool auto0 = false ) + { + int fda[2]; // pipe to compressor + int fda2[2]; // pipe from compressor + if( pipe( fda ) < 0 || pipe( fda2 ) < 0 ) + { show_error( "Can't create pipe", errno ); return fatal( 1 ); } + const pid_t pid = fork(); + if( pid == 0 ) // child 1 (compressor feeder) + { + if( close( fda[0] ) != 0 || + close( fda2[0] ) != 0 || close( fda2[1] ) != 0 || + !feed_data( mbuffer, msize, dsize, good_dsize, rbuf, rsize, offset, + dictionary_size, fda[1] ) ) + { close( fda[1] ); _exit( 2 ); } + if( close( fda[1] ) != 0 ) + { show_close_error(); _exit( 2 ); } + _exit( 0 ); + } + if( pid < 0 ) // parent + { show_fork_error( "data feeder" ); return fatal( 1 ); } + + const pid_t pid2 = fork(); + if( pid2 == 0 ) // child 2 (compressor) + { + if( dup2( fda[0], STDIN_FILENO ) >= 0 && + dup2( fda2[1], STDOUT_FILENO ) >= 0 && + close( fda[0] ) == 0 && close( fda[1] ) == 0 && + close( fda2[0] ) == 0 && close( fda2[1] ) == 0 ) + execvp( lzip_argv[0], (char **)lzip_argv ); + show_exec_error( lzip_argv[0] ); + _exit( 2 ); + } + if( pid2 < 0 ) // parent + { show_fork_error( lzip_argv[0] ); return fatal( 1 ); } + + close( fda[0] ); close( fda[1] ); close( fda2[1] ); + const long long xend = std::min( end + 4, msize ); + int retval = 0; // -1 = mismatch + bool first_post = true; + bool same_ds = true; // reproduced DS == header DS + bool tail_mismatch = false; // mismatch after end + for( long long i = 0; i < xend; ) + { + enum { buffer_size = 16384 }; // 65536 makes it slower + uint8_t buffer[buffer_size]; + if( verbosity >= 2 && i >= 65536 && terminator ) + { + if( first_post ) + { first_post = 
false; print_pending_newline( terminator ); } + std::printf( " Reproducing position %lld %c", i, terminator ); + std::fflush( stdout ); pending_newline = true; + } + const int rd = readblock( fda2[0], buffer, buffer_size ); + // not enough reference data to fill zeroed sector at this level + if( rd <= 0 ) { if( i < end ) retval = -1; break; } + int j = 0; + /* Compare reproduced bytes with data in mbuffer. + Do not fail because of a mismatch beyond the end of the zeroed sector + to prevent the reproduction from failing because of the reference file + just covering the zeroed sector. */ + for( ; j < rd && i < begin; ++j, ++i ) + if( mbuffer[i] != buffer[j] ) // mismatch + { + if( i != 5 ) { retval = -1; goto done; } // ignore different DS + const Lzip_header * header = (const Lzip_header *)buffer; + if( header->dictionary_size() != dictionary_size ) same_ds = false; + } + // copy reproduced bytes into zeroed sector of mbuffer + for( ; j < rd && i < end; ++j, ++i ) mbuffer[i] = buffer[j]; + for( ; j < rd && i < xend; ++j, ++i ) + if( mbuffer[i] != buffer[j] ) { tail_mismatch = true; goto done; } + } +done: + if( !first_post && terminator ) print_pending_newline( terminator ); + if( close( fda2[0] ) != 0 ) { show_close_error( "compressor" ); retval = 1; } + if( !good_status( pid, "data feeder", false ) || + !good_status( pid2, lzip_argv[0], false ) ) retval = auto0 ? -1 : 1; + if( !retval ) // test whole member after reproduction + { + if( md5sump ) md5sump->reset(); + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, md5sump ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { + if( verbosity >= 2 && same_ds && begin >= 4096 && terminator ) + { + if( !tail_mismatch ) + final_msg = " Zeroed sector reproduced, but CRC does not match." + " (Multiple damages in file?).\n"; + else if( !final_msg ) + final_msg = " Zeroed sector reproduced, but data after it does not" + " match. 
(Maybe wrong reference data or lzip version).\n"; + } + retval = -1; // incorrect reproduction of zeroed sector + } + } + return retval; + } + + +// Return value: -1 = master failed, 0 = success, > 0 = failure +int reproduce_member( uint8_t * const mbuffer, const long long msize, + const long long dsize, const char * const lzip_name, + const char * const reference_filename, + const long long begin, const long long size, + const int lzip_level, MD5SUM * const md5sump, + const char terminator ) + { + struct stat st; + const int rfd = open_instream( reference_filename, &st, false, true ); + if( rfd < 0 ) return fatal( 1 ); + if( st.st_size > LLONG_MAX ) + { show_file_error( reference_filename, "File too large." ); close( rfd ); + return fatal( 2 ); } + const long long rsize = st.st_size; + const uint8_t * const rbuf = + (const uint8_t *)mmap( 0, rsize, PROT_READ, MAP_PRIVATE, rfd, 0 ); + close( rfd ); + if( rbuf == MAP_FAILED ) + { show_file_error( reference_filename, "Can't mmap", errno ); + return fatal( 1 ); } + + const Lzip_header & header = *(const Lzip_header *)mbuffer; + const unsigned dictionary_size = header.dictionary_size(); + const LZ_mtester * const master = + prepare_master2( mbuffer, msize, begin, dictionary_size ); + if( !master ) return -1; + if( verbosity >= 2 ) + { + std::printf( " (master mpos = %llu, dpos = %llu)\n", + master->member_position(), master->data_position() ); + std::fflush( stdout ); + } + + const long long offset = match_file( *master, rbuf, rsize, reference_filename ); + if( offset < 0 ) { delete master; return 2; } // no match + // Reference data from offset must be at least as large as zeroed sector + // minus member trailer if trailer is inside the zeroed sector. + const int t = ( begin + size >= msize ) ? 16 + Lzip_trailer::size : 0; + if( rsize - offset < size - t ) + { show_file_error( reference_filename, "Not enough reference data after match." 
); + delete master; return 2; } + + const unsigned long long good_dsize = master->data_position(); + const long long end = begin + size; + char level_str[8] = "-0"; // compression level or match length limit + char dict_str[16]; + snprintf( dict_str, sizeof dict_str, "-s%u", dictionary_size ); + const char * lzip0_argv[3] = { lzip_name, "-0", 0 }; + const char * lzip_argv[4] = { lzip_name, level_str, dict_str, 0 }; + if( lzip_level >= 0 ) + for( unsigned char level = '0'; level <= '9'; ++level ) + { + if( std::isdigit( lzip_level ) && level != lzip_level ) continue; + level_str[1] = level; + if( verbosity >= 1 && terminator ) + { + std::printf( "Trying level %s %c", level_str, terminator ); + std::fflush( stdout ); pending_newline = true; + } + const bool level0 = level == '0'; + const bool auto0 = ( level0 && lzip_level != '0' ); + int ret = try_reproduce( mbuffer, msize, dsize, good_dsize, begin, end, + rbuf, rsize, offset, dictionary_size, + level0 ? lzip0_argv : lzip_argv, md5sump, terminator, auto0 ); + if( ret >= 0 ) + { delete master; munmap( (void *)rbuf, rsize ); return ret; } + } + if( lzip_level <= 0 ) + { + for( int len = min_match_len_limit; len <= max_match_len; ++len ) + { + if( lzip_level < -1 && -lzip_level != len ) continue; + snprintf( level_str, sizeof level_str, "-m%u", len ); + if( verbosity >= 1 && terminator ) + { + std::printf( "Trying match length limit %d %c", len, terminator ); + std::fflush( stdout ); pending_newline = true; + } + int ret = try_reproduce( mbuffer, msize, dsize, good_dsize, begin, end, + rbuf, rsize, offset, dictionary_size, + lzip_argv, md5sump, terminator ); + if( ret >= 0 ) + { delete master; munmap( (void *)rbuf, rsize ); return ret; } + } + } + delete master; + munmap( (void *)rbuf, rsize ); + return 2; + } + +} // end namespace + + +int reproduce_file( const std::string & input_filename, + const std::string & default_output_filename, + const char * const lzip_name, + const char * const reference_filename, + const 
int lzip_level, const char terminator, + const bool force ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + output_filename = default_output_filename.empty() ? + insert_fixed( input_filename ) : default_output_filename; + if( !force && file_exists( output_filename ) ) return 1; + outfd = -1; + int errors = 0; + const long page_size = std::max( 1L, sysconf( _SC_PAGESIZE ) ); + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long dsize = lzip_index.dblock( i ).size(); + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + if( verbosity >= 1 && lzip_index.members() > 1 ) + { + std::printf( "Testing member %ld of %ld %c", + i + 1, lzip_index.members(), terminator ); + std::fflush( stdout ); pending_newline = true; + } + if( !safe_seek( infd, mpos ) ) return 1; + long long failure_pos = 0; + if( test_member_from_file( infd, msize, &failure_pos ) == 0 ) + continue; // member is not damaged + print_pending_newline( terminator ); + if( ++errors > 1 ) break; // only one member can be reproduced + if( failure_pos < Lzip_header::size ) // End Of File + { show_file_error( input_filename.c_str(), "Unexpected end of file." 
); + return 2; } + + // without mmap, 3 times more memory are required because of fork + const long mpos_rem = mpos % page_size; + uint8_t * const mbuffer_base = (uint8_t *)mmap( 0, msize + mpos_rem, + PROT_READ | PROT_WRITE, MAP_PRIVATE, infd, mpos - mpos_rem ); + if( mbuffer_base == MAP_FAILED ) + { show_file_error( input_filename.c_str(), "Can't mmap", errno ); return 1; } + uint8_t * const mbuffer = mbuffer_base + mpos_rem; + long long size = 0; + uint8_t value = 0; + const long long begin = zeroed_sector_pos( input_filename.c_str(), mbuffer, + msize, &size, &value ); + if( begin < 0 ) return 2; + if( failure_pos < begin ) + { show_file_error( input_filename.c_str(), + "Data error found before damaged area." ); return 2; } + if( verbosity >= 1 ) + { + std::printf( "Reproducing bad area in member %ld of %ld\n" + " (begin = %lld, size = %lld, value = 0x%02X)\n", + i + 1, lzip_index.members(), begin, size, value ); + std::fflush( stdout ); + } + const int ret = reproduce_member( mbuffer, msize, dsize, lzip_name, + reference_filename, begin, size, lzip_level, 0, terminator ); + if( ret <= 0 ) print_pending_newline( terminator ); + if( ret < 0 ) { show_error( "Can't prepare master." ); return 1; } + if( ret == 0 ) + { + if( outfd < 0 ) // first damaged member reproduced + { + if( !safe_seek( infd, 0 ) ) return 1; + set_signal_handler(); + if( !open_outstream( true, true ) ) return 1; + if( !copy_file( infd, outfd ) ) // copy whole file + cleanup_and_fail( 1 ); + } + if( seek_write( outfd, mbuffer + begin, size, mpos + begin ) != size ) + { show_file_error( output_filename.c_str(), "Error writing file", errno ); + cleanup_and_fail( 1 ); } + if( verbosity >= 1 ) + std::fputs( "Member reproduced successfully.\n", stdout ); + } + munmap( mbuffer_base, msize + mpos_rem ); + if( ret > 0 ) + { + if( final_msg ) + { std::fputs( final_msg, stdout ); std::fflush( stdout ); } + show_file_error( input_filename.c_str(), + "Unable to reproduce member." 
); return ret; + } + } + + if( outfd < 0 ) + { + if( verbosity >= 1 ) + std::fputs( "Input file has no errors. Recovery is not needed.\n", stdout ); + return 0; + } + if( close_outstream( &in_stats ) != 0 ) return 1; + if( verbosity >= 0 ) + { + if( errors > 1 ) + std::fputs( "One member reproduced." + " Copy of input file still contains errors.\n", stdout ); + else + std::fputs( "Copy of input file reproduced successfully.\n", stdout ); + } + return 0; + } + + +/* Passes a 0 terminator to other functions to prevent intramember feedback. + Exits only in case of fatal error. (reference file too large, etc). */ +int debug_reproduce_file( const std::string & input_filename, + const char * const lzip_name, + const char * const reference_filename, + const Block & range, const int sector_size, + const int lzip_level ) + { + struct stat in_stats; // not used + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + const long long cdata_size = lzip_index.cdata_size(); + if( range.pos() >= cdata_size ) + { show_file_error( input_filename.c_str(), + "Range is beyond end of last member." 
); return 1; } + + const long page_size = std::max( 1L, sysconf( _SC_PAGESIZE ) ); + const long long positions_to_test = + ( ( std::min( range.end(), cdata_size ) - range.pos() ) + + sector_size - 9 ) / sector_size; + long positions = 0, successes = 0, failed_comparisons = 0; + long alternative_reproductions = 0; + const bool pct_enabled = cdata_size > sector_size && + isatty( STDERR_FILENO ) && !isatty( STDOUT_FILENO ); + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + if( !range.overlaps( mpos, msize ) ) continue; + const long long dsize = lzip_index.dblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + + // md5sums of original not damaged member (compressed and decompressed) + uint8_t md5_digest_c[16], md5_digest_d[16]; + bool md5_valid = false; + const long long rm_end = std::min( range.end(), mpos + msize ); + for( long long sector_pos = std::max( range.pos(), mpos ); + sector_pos + 8 <= rm_end; sector_pos += sector_size ) + { + // without mmap, 3 times more memory are required because of fork + const long mpos_rem = mpos % page_size; + uint8_t * const mbuffer_base = (uint8_t *)mmap( 0, msize + mpos_rem, + PROT_READ | PROT_WRITE, MAP_PRIVATE, infd, mpos - mpos_rem ); + if( mbuffer_base == MAP_FAILED ) + { show_file_error( input_filename.c_str(), "Can't mmap", errno ); + return 1; } + uint8_t * const mbuffer = mbuffer_base + mpos_rem; + if( !md5_valid ) + { + if( verbosity >= 0 ) // give a clue of the range being tested + { std::printf( "Reproducing: %s\nReference file: %s\nTesting " + "sectors of size %llu at file positions %llu to %llu\n", + input_filename.c_str(), reference_filename, + std::min( (long long)sector_size, rm_end - sector_pos ), + sector_pos, rm_end - 1 ); std::fflush( stdout ); } + md5_valid = true; compute_md5( mbuffer, msize, md5_digest_c ); + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, 
msize, dictionary_size, -1, &md5sum ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { + if( verbosity >= 0 ) + { std::printf( "Member %ld of %ld already damaged (failure pos " + "= %llu)\n", i + 1, lzip_index.members(), + mpos + mtester.member_position() ); + std::fflush( stdout ); } + munmap( mbuffer_base, msize + mpos_rem ); break; + } + md5sum.md5_finish( md5_digest_d ); + } + ++positions; + const int sector_sz = + std::min( rm_end - sector_pos, (long long)sector_size ); + // set mbuffer[sector] to 0 + std::memset( mbuffer + ( sector_pos - mpos ), 0, sector_sz ); + long long size = 0; + uint8_t value = 0; + const long long begin = zeroed_sector_pos( input_filename.c_str(), mbuffer, + msize, &size, &value ); + if( begin < 0 ) return 2; + MD5SUM md5sum; + const int ret = reproduce_member( mbuffer, msize, dsize, lzip_name, + reference_filename, begin, size, lzip_level, &md5sum, 0 ); + if( ret < 0 ) { show_error( "Can't prepare master." ); return 1; } + if( ret == 0 ) + { + ++successes; + uint8_t new_digest[16]; + md5sum.md5_finish( new_digest ); + if( std::memcmp( md5_digest_d, new_digest, 16 ) != 0 ) + { + ++failed_comparisons; + if( verbosity >= 0 ) + std::printf( "Comparison failed at pos %llu\n", sector_pos ); + } + else if( !check_md5( mbuffer, msize, md5_digest_c ) ) + { + ++alternative_reproductions; + if( verbosity >= 0 ) + std::printf( "Alternative reproduction at pos %llu\n", sector_pos ); + } + else if( verbosity >= 0 ) + std::printf( "Reproduction succeeded at pos %llu\n", sector_pos ); + } + else if( verbosity >= 0 ) // ret > 0 + std::printf( "Unable to reproduce at pos %llu\n", sector_pos ); + if( verbosity >= 0 ) + { + std::fflush( stdout ); // flush result line + if( pct_enabled ) // show feedback + std::fprintf( stderr, "\r%ld sectors %ld successes %ld failcomp " + "%ld altrep %3u%% done\r", positions, successes, + failed_comparisons, alternative_reproductions, + (unsigned)( ( positions * 100.0 ) / positions_to_test ) ); + } + 
munmap( mbuffer_base, msize + mpos_rem ); + if( fatal_retval ) goto done; + } + } +done: + if( verbosity >= 0 ) + { + std::printf( "\n%8ld sectors tested" + "\n%8ld reproductions returned with zero status", + positions, successes ); + if( successes > 0 ) + { + if( failed_comparisons > 0 ) + std::printf( ", of which\n%8ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stdout ); + if( alternative_reproductions > 0 ) + std::printf( "%8ld alternative reproductions found\n", + alternative_reproductions ); + } + else std::fputc( '\n', stdout ); + if( fatal_retval ) + std::fputs( "Exiting because of a fatal error\n", stdout ); + } + return fatal_retval; + } @@ -1,18 +1,18 @@ -/* Lziprecover - Data recovery tool for the lzip format - Copyright (C) 2009-2019 Antonio Diaz Diaz. +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
- You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define _FILE_OFFSET_BITS 64 @@ -71,7 +71,8 @@ int split_file( const std::string & input_filename, const std::string & default_output_filename, const bool force ) { struct stat in_stats; - const int infd = open_instream( input_filename.c_str(), &in_stats, true, true ); + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); if( infd < 0 ) return 1; Lzip_index lzip_index( infd, true, true, true, true ); @@ -115,15 +116,13 @@ int split_file( const std::string & input_filename, const Block & mb = lzip_index.mblock( i ); if( mb.pos() > stream_pos ) // gap { - if( !open_outstream( force, false, false, false ) ) - { close( infd ); return 1; } + if( !open_outstream( force, true, false, false ) ) return 1; if( !copy_file( infd, outfd, mb.pos() - stream_pos ) || close_outstream( &in_stats ) != 0 ) cleanup_and_fail( 1 ); next_filename( max_digits ); } - if( !open_outstream( force, false, false, false ) ) // member - { close( infd ); return 1; } + if( !open_outstream( force, true, false, false ) ) return 1; // member if( !copy_file( infd, outfd, mb.size() ) || close_outstream( &in_stats ) != 0 ) cleanup_and_fail( 1 ); @@ -132,8 +131,7 @@ int split_file( const std::string & input_filename, } if( lzip_index.file_size() > stream_pos ) // trailing data { - if( !open_outstream( force, false, false, false ) ) - { close( infd ); return 1; } + if( !open_outstream( force, true, false, false ) ) return 1; if( !copy_file( infd, outfd, lzip_index.file_size() - stream_pos ) || close_outstream( &in_stats ) != 0 ) cleanup_and_fail( 1 ); diff --git a/testsuite/check.sh b/testsuite/check.sh index a902d93..e78b7f7 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,9 +1,9 @@ #! 
/bin/sh # check script for Lziprecover - Data recovery tool for the lzip format -# Copyright (C) 2009-2019 Antonio Diaz Diaz. +# Copyright (C) 2009-2021 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission -# to copy, distribute and modify it. +# to copy, distribute, and modify it. LC_ALL=C export LC_ALL @@ -32,6 +32,7 @@ cd "${objdir}"/tmp || framework_failure cat "${testdir}"/test.txt > in || framework_failure in_lz="${testdir}"/test.txt.lz in_lzma="${testdir}"/test.txt.lzma +in_em="${testdir}"/test_em.txt.lz inD="${testdir}"/test21723.txt bad1_lz="${testdir}"/test_bad1.lz bad2_lz="${testdir}"/test_bad2.lz @@ -60,11 +61,15 @@ test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } # Description of test files for lziprecover: # single-member files with one or more errors -# test_bad1.lz: byte at offset 66 changed from 0xA6 to 0x46 +# test_bad1.lz: byte at offset 66 changed from 0xA6 to 0x26 # test_bad2.lz: [ 34- 65] --> copy of bytes [ 68- 99] # test_bad3.lz: [ 512-1535] --> zeroed [2560-3583] --> zeroed # test_bad4.lz: [3072-4095] --> random errors [4608-5631] --> zeroed # test_bad5.lz: [1024-2047] --> random errors [5120-6143] --> random data +# test_bad6.lz: [ 512-1023] --> zeroed (reference test.txt [ 891- 2137]) +# test_bad7.lz: [6656-7167] --> zeroed (reference test.txt [20428-32231]) +# test_bad8.lz: [ 66- 73] --> zeroed (reference test.txt [ 89- 110]) +# test_bad9.lz: [6491-6498] --> zeroed (reference test.txt [17977-18120]) # # 6-member files with one or more errors # fox6_bad1.lz: byte at offset 5 changed from 0x0C to 0x00 (DS) @@ -108,6 +113,11 @@ printf "testing lziprecover-%s..." "$2" [ $? = 2 ] || test_failed $LINENO "${LZIP}" -dq -o in < "${in_lz}" [ $? = 1 ] || test_failed $LINENO +"${LZIP}" -dq -o in "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -dq -o out nx_file.lz +[ $? = 1 ] || test_failed $LINENO +[ ! 
-e out ] || test_failed $LINENO # these are for code coverage "${LZIP}" -lt "${in_lz}" 2> /dev/null [ $? = 1 ] || test_failed $LINENO @@ -115,7 +125,9 @@ printf "testing lziprecover-%s..." "$2" [ $? = 1 ] || test_failed $LINENO "${LZIP}" -cdt "${in_lz}" > out 2> /dev/null [ $? = 1 ] || test_failed $LINENO -"${LZIP}" -t -- nx_file 2> /dev/null +"${LZIP}" -t -- nx_file.lz 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -t "" < /dev/null 2> /dev/null [ $? = 1 ] || test_failed $LINENO "${LZIP}" --help > /dev/null || test_failed $LINENO "${LZIP}" -n1 -V > /dev/null || test_failed $LINENO @@ -136,6 +148,9 @@ printf "testing lziprecover-%s..." "$2" printf "LZIP\001-.............................." | "${LZIP}" -t 2> /dev/null printf "LZIP\002-.............................." | "${LZIP}" -t 2> /dev/null printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null + +"${LZIPRECOVER}" -eq "${testdir}"/test_bad6.lz +[ $? = 1 ] || test_failed $LINENO "${LZIPRECOVER}" -mq "${bad1_lz}" [ $? = 1 ] || test_failed $LINENO "${LZIPRECOVER}" -Rq @@ -201,8 +216,14 @@ printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null [ $? = 1 ] || test_failed $LINENO "${LZIPRECOVER}" -Akq "${in_lzma}" [ $? 
= 1 ] || test_failed $LINENO +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -A "${in_lzma}" -o copy.lz || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO "${LZIPRECOVER}" -Ac "${in_lzma}" > copy.lz || test_failed $LINENO cmp "${in_lz}" copy.lz || test_failed $LINENO +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -A -o copy.lz < "${in_lzma}" || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO "${LZIPRECOVER}" -A < "${in_lzma}" > copy.lz || test_failed $LINENO cmp "${in_lz}" copy.lz || test_failed $LINENO rm -f copy.lz || framework_failure @@ -220,21 +241,35 @@ printf "to be overwritten" > copy.tar.lz || framework_failure "${LZIPRECOVER}" -Af copy.tlz || test_failed $LINENO cmp "${in_lz}" copy.tar.lz || test_failed $LINENO rm -f copy.tar.lz || framework_failure -cat "${in_lzma}" > anyothername || framework_failure -"${LZIPRECOVER}" -A -o copy - anyothername - < "${in_lzma}" || +cat in in > in2 || framework_failure +"${LZIPRECOVER}" -A -o out2.lz - "${in_lzma}" - < "${in_lzma}" || test_failed $LINENO -cmp "${in_lz}" copy.lz || test_failed $LINENO -cmp "${in_lz}" anyothername.lz || test_failed $LINENO -rm -f copy.lz anyothername.lz || framework_failure +"${LZIP}" -cd out2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +rm -f out2.lz copy2 || framework_failure printf "\ntesting decompression..." 
-"${LZIP}" -lq "${in_lz}" || test_failed $LINENO -"${LZIP}" -t "${in_lz}" || test_failed $LINENO -"${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO -cmp in copy || test_failed $LINENO +for i in "${in_lz}" "${in_em}" ; do + "${LZIP}" -lq "$i" || test_failed $LINENO "$i" + "${LZIP}" -t "$i" || test_failed $LINENO "$i" + "${LZIP}" -d "$i" -o copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -cd "$i" > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -d "$i" -o - > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -d < "$i" > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + rm -f copy || framework_failure +done + +lines=$("${LZIP}" -tvv "${in_em}" 2>&1 | wc -l) || test_failed $LINENO +[ "${lines}" -eq 8 ] || test_failed $LINENO "${lines}" + +lines=$("${LZIP}" -lvv "${in_em}" | wc -l) || test_failed $LINENO +[ "${lines}" -eq 11 ] || test_failed $LINENO "${lines}" -rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure "${LZIP}" -dk copy.lz || test_failed $LINENO cmp in copy || test_failed $LINENO @@ -248,10 +283,16 @@ cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure "${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO cmp in copy || test_failed $LINENO +rm -f out copy || framework_failure +"${LZIP}" -d -o ./- "${in_lz}" || test_failed $LINENO +cmp in ./- || test_failed $LINENO +rm -f ./- || framework_failure +"${LZIP}" -d -o ./- < "${in_lz}" || test_failed $LINENO +cmp in ./- || test_failed $LINENO +rm -f ./- || framework_failure -rm -f copy || framework_failure cat "${in_lz}" > anyothername || framework_failure -"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null || +"${LZIP}" -dv - anyothername - < "${in_lz}" > copy 2> /dev/null || test_failed $LINENO cmp in copy || test_failed $LINENO cmp in anyothername.out || test_failed $LINENO @@ 
-291,18 +332,20 @@ done [ ! -e nx_file ] || test_failed $LINENO cmp in copy || test_failed $LINENO -cat in in > in2 || framework_failure -cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure -"${LZIP}" -lq in2.lz || test_failed $LINENO -"${LZIP}" -t in2.lz || test_failed $LINENO -"${LZIP}" -cd in2.lz > copy2 || test_failed $LINENO +"${LZIP}" -lq "${in_lz}" "${in_lz}" || test_failed $LINENO +"${LZIP}" -t "${in_lz}" "${in_lz}" || test_failed $LINENO +"${LZIP}" -cd "${in_lz}" "${in_lz}" -o out > copy2 || test_failed $LINENO +[ ! -e out ] || test_failed $LINENO # override -o +cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure +"${LZIP}" -d "${in_lz}" "${in_lz}" -o copy2 || test_failed $LINENO cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure -cat in2.lz > copy2.lz || framework_failure +cat "${in_lz}" "${in_lz}" > copy2.lz || framework_failure printf "\ngarbage" >> copy2.lz || framework_failure "${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO -rm -f copy2 || framework_failure -"${LZIP}" -aD0 -q copy2.lz +"${LZIPRECOVER}" -aD0 -q copy2.lz [ $? = 2 ] || test_failed $LINENO "${LZIP}" -alq copy2.lz [ $? = 2 ] || test_failed $LINENO @@ -319,6 +362,7 @@ rm -f copy2 || framework_failure printf "to be overwritten" > copy2 || framework_failure "${LZIP}" -df copy2.lz || test_failed $LINENO cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure "${LZIPRECOVER}" -D ,18000 "${in_lz}" > copy || test_failed $LINENO "${LZIPRECOVER}" -D 18000 "${in_lz}" >> copy || test_failed $LINENO @@ -327,19 +371,6 @@ cmp in copy || test_failed $LINENO cmp "${inD}" copy || test_failed $LINENO "${LZIPRECOVER}" -D 21723,397 "${in_lz}" > copy || test_failed $LINENO cmp "${inD}" copy || test_failed $LINENO -"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" -fo copy -[ $? = 2 ] || test_failed $LINENO -cmp "${f6b1}" copy || test_failed $LINENO -"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" > copy -[ $? 
= 2 ] || test_failed $LINENO -cmp "${f6b1}" copy || test_failed $LINENO - -printf "LZIP\001+" > in2t.lz || framework_failure # gap size < 36 bytes -cat "${in_lz}" in "${in_lz}" >> in2t.lz || framework_failure -printf "LZIP\001-" >> in2t.lz || framework_failure # truncated member -"${LZIPRECOVER}" -D0 -i in2t.lz > copy2 || test_failed $LINENO -cmp in2 copy2 || test_failed $LINENO -rm -f in2 in2t.lz copy2 || framework_failure printf "\ntesting bad input..." @@ -411,6 +442,24 @@ else fi rm -f int.lz || framework_failure +for i in fox_v2.lz fox_s11.lz fox_de20.lz \ + fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do + "${LZIP}" -tq "${testdir}"/$i + [ $? = 2 ] || test_failed $LINENO $i +done + +"${LZIP}" -cd "${fox_lz}" > fox || test_failed $LINENO +for i in fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do + "${LZIP}" -cdq "${testdir}"/$i > out + [ $? = 2 ] || test_failed $LINENO $i + cmp fox out || test_failed $LINENO $i + "${LZIPRECOVER}" -tq -i "${testdir}"/$i || test_failed $LINENO $i + "${LZIPRECOVER}" -cdq -i "${testdir}"/$i > out || test_failed $LINENO $i + cmp fox out || test_failed $LINENO $i +done +rm -f fox out || framework_failure + +cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then @@ -434,11 +483,11 @@ rm -f in3.lz trunc.lz out || framework_failure for i in "${f6s1_lz}" "${f6s2_lz}" ; do lines=`"${LZIP}" -lvv "$i" | wc -l || test_failed $LINENO "$i"` - [ "${lines}" -eq 2 ] || test_failed $LINENO "$i" + [ "${lines}" -eq 2 ] || test_failed $LINENO "$i ${lines}" done for i in "${f6s3_lz}" "${f6s4_lz}" "${f6s5_lz}" "${f6s6_lz}" ; do lines=`"${LZIP}" -lvv "$i" | wc -l || test_failed $LINENO "$i"` - [ "${lines}" -eq 9 ] || test_failed $LINENO "$i" + [ "${lines}" -eq 9 ] || test_failed $LINENO "$i ${lines}" done cat "${in_lz}" > ingin.lz || framework_failure 
@@ -446,13 +495,50 @@ printf "g" >> ingin.lz || framework_failure cat "${in_lz}" >> ingin.lz || framework_failure "${LZIP}" -lq ingin.lz [ $? = 2 ] || test_failed $LINENO -"${LZIP}" -lq -i ingin.lz || test_failed $LINENO +"${LZIP}" -atq ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -atq < ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -acdq ingin.lz > out +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -adq < ingin.lz > out +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -lq -i ingin.lz || test_failed $LINENO "${LZIP}" -t ingin.lz || test_failed $LINENO +"${LZIP}" -t < ingin.lz || test_failed $LINENO "${LZIP}" -cd ingin.lz > copy || test_failed $LINENO cmp in copy || test_failed $LINENO -"${LZIP}" -t < ingin.lz || test_failed $LINENO "${LZIP}" -d < ingin.lz > copy || test_failed $LINENO cmp in copy || test_failed $LINENO +"${LZIPRECOVER}" -cd -i ingin.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO + +"${LZIPRECOVER}" -D0 -q "${f6b1_lz}" -fo copy +[ $? = 2 ] || test_failed $LINENO +cmp -s "${f6b1}" copy && test_failed $LINENO +"${LZIPRECOVER}" -D0 -q "${f6b1_lz}" > copy +[ $? = 2 ] || test_failed $LINENO +cmp -s "${f6b1}" copy && test_failed $LINENO +"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" -fo copy || test_failed $LINENO +cmp "${f6b1}" copy || test_failed $LINENO +"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" > copy || test_failed $LINENO +cmp "${f6b1}" copy || test_failed $LINENO + +touch empty || framework_failure +"${LZIPRECOVER}" -D0 -q ingin.lz > copy +[ $? 
= 2 ] || test_failed $LINENO +cmp empty copy || test_failed $LINENO +"${LZIPRECOVER}" -D0 -i ingin.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +printf "LZIP\001+" > in2t.lz || framework_failure # gap size < 36 bytes +cat "${in_lz}" in "${in_lz}" >> in2t.lz || framework_failure +printf "LZIP\001-" >> in2t.lz || framework_failure # truncated member +"${LZIPRECOVER}" -D0 -iq in2t.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +"${LZIPRECOVER}" -cd -iq in2t.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +"${LZIPRECOVER}" -t -iq in2t.lz || test_failed $LINENO +rm -f in2 in2t.lz copy copy2 || framework_failure printf "\ntesting --merge..." @@ -635,7 +721,78 @@ mv copy.tar.lz copy.lz || framework_failure mv copy.lz copy.tlz || framework_failure "${LZIPRECOVER}" -R copy.tlz || test_failed $LINENO [ -e copy_fixed.tlz ] || test_failed $LINENO -rm -f copy_fixed.* copy.tlz || framework_failure +rm -f copy_fixed.tlz copy_fixed.lz copy_fixed.tar.lz copy.tlz || + framework_failure + +printf "\ntesting --reproduce..." + +if [ -z "${LZIP_NAME}" ] ; then LZIP_NAME=lzip ; fi +if /bin/sh -c "${LZIP_NAME} -s18KiB" < in > out 2> /dev/null && + cmp "${in_lz}" out > /dev/null 2>&1 ; then + rm -f out || framework_failure + "${LZIPRECOVER}" --reproduce --lzip-name="${LZIP_NAME}" -o out \ + --reference-file=foo "${in_lz}" || test_failed $LINENO "${LZIP_NAME}" + [ ! 
-e out ] || test_failed $LINENO + + for i in 6 7 8 9 ; do + for f in "${testdir}"/test_bad${i}.txt "${testdir}"/test.txt ; do + rm -f out || framework_failure + "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \ + --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out || + test_failed $LINENO "${LZIP_NAME} $i $f" + cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f" + rm -f out || framework_failure + "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \ + --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out \ + --lzip-level=6 || test_failed $LINENO "${LZIP_NAME} $i $f level=6" + cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f level=6" + rm -f out || framework_failure + "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \ + --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out \ + --lzip-level=m36 || test_failed $LINENO "${LZIP_NAME} $i $f level=m36" + cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f level=m36" + done + done + + cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" > in4.lz || framework_failure + # multimember reproduction using test_bad[6789].txt as reference + cat "${testdir}"/test_bad6.lz "${testdir}"/test_bad7.lz \ + "${testdir}"/test_bad8.lz "${testdir}"/test_bad9.lz > mm_bad.lz || + framework_failure + rm -f out || framework_failure + for i in 6 7 8 9 ; do # reproduce one member each time + "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \ + --reference-file="${testdir}"/test_bad${i}.txt mm_bad.lz -o out || + test_failed $LINENO "${LZIP_NAME} $i" + mv -f out mm_bad.lz + done + cmp in4.lz mm_bad.lz || test_failed $LINENO "${LZIP_NAME}" + + # multimember reproduction using test.txt as reference + cat "${testdir}"/test_bad6.lz "${testdir}"/test_bad7.lz \ + "${testdir}"/test_bad8.lz "${testdir}"/test_bad9.lz > mm_bad.lz || + framework_failure + rm -f out || framework_failure + for i in 6 7 8 9 ; do # reproduce one member each time + "${LZIPRECOVER}" -q --reproduce 
--lzip-name="${LZIP_NAME}" \ + --reference-file="${testdir}"/test.txt mm_bad.lz -o out || + test_failed $LINENO "${LZIP_NAME} $i" + mv -f out mm_bad.lz + done + cmp in4.lz mm_bad.lz || test_failed $LINENO "${LZIP_NAME}" + rm -f in4.lz mm_bad.lz || framework_failure + + "${LZIPRECOVER}" -q --debug-reproduce=13-7356 --lzip-name="${LZIP_NAME}" \ + --reference-file="${testdir}"/test.txt "${testdir}"/test.txt.lz || + test_failed $LINENO "${LZIP_NAME}" + + "${LZIPRECOVER}" -q --debug-reproduce=512,5120,512 --lzip-name="${LZIP_NAME}" \ + --reference-file="${testdir}"/test.txt "${testdir}"/test.txt.lz || + test_failed $LINENO "${LZIP_NAME}" +else + printf "\nwarning: skipping --reproduce test: ${LZIP_NAME} not found or not the right version." + printf "\nTry 'make LZIP_NAME=<name_of_lzip_executable> check'." +fi printf "\ntesting --split..." @@ -785,7 +942,6 @@ rm -f rec*ingin.lz || framework_failure printf "\ntesting --*=damaged..." -touch empty || framework_failure cat "${in_lz}" > in.lz || framework_failure cat "${in_lz}" in > int.lz || framework_failure "${LZIPRECOVER}" --dump=damaged in.lz > copy || test_failed $LINENO diff --git a/testsuite/fox_bcrc.lz b/testsuite/fox_bcrc.lz Binary files differnew file mode 100644 index 0000000..8f6a7c4 --- /dev/null +++ b/testsuite/fox_bcrc.lz diff --git a/testsuite/fox_crc0.lz b/testsuite/fox_crc0.lz Binary files differnew file mode 100644 index 0000000..1abe926 --- /dev/null +++ b/testsuite/fox_crc0.lz diff --git a/testsuite/fox_das46.lz b/testsuite/fox_das46.lz Binary files differnew file mode 100644 index 0000000..43ed9f9 --- /dev/null +++ b/testsuite/fox_das46.lz diff --git a/testsuite/fox_de20.lz b/testsuite/fox_de20.lz Binary files differnew file mode 100644 index 0000000..10949d8 --- /dev/null +++ b/testsuite/fox_de20.lz diff --git a/testsuite/fox_mes81.lz b/testsuite/fox_mes81.lz Binary files differnew file mode 100644 index 0000000..d50ef2e --- /dev/null +++ b/testsuite/fox_mes81.lz diff --git a/testsuite/fox_s11.lz 
b/testsuite/fox_s11.lz Binary files differnew file mode 100644 index 0000000..dca909c --- /dev/null +++ b/testsuite/fox_s11.lz diff --git a/testsuite/fox_v2.lz b/testsuite/fox_v2.lz Binary files differnew file mode 100644 index 0000000..8620981 --- /dev/null +++ b/testsuite/fox_v2.lz diff --git a/testsuite/test_bad1.lz b/testsuite/test_bad1.lz Binary files differindex 16762ca..2129c90 100644 --- a/testsuite/test_bad1.lz +++ b/testsuite/test_bad1.lz diff --git a/testsuite/test_bad6.lz b/testsuite/test_bad6.lz Binary files differnew file mode 100644 index 0000000..cfea88c --- /dev/null +++ b/testsuite/test_bad6.lz diff --git a/testsuite/test_bad6.txt b/testsuite/test_bad6.txt new file mode 100644 index 0000000..b47462e --- /dev/null +++ b/testsuite/test_bad6.txt @@ -0,0 +1,26 @@ +) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to
\ No newline at end of file diff --git a/testsuite/test_bad7.lz b/testsuite/test_bad7.lz Binary files differnew file mode 100644 index 0000000..77f2b85 --- /dev/null +++ b/testsuite/test_bad7.lz diff --git a/testsuite/test_bad7.txt b/testsuite/test_bad7.txt new file mode 100644 index 0000000..be54c7c --- /dev/null +++ b/testsuite/test_bad7.txt @@ -0,0 +1,215 @@ +, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
\ No newline at end of file diff --git a/testsuite/test_bad8.lz b/testsuite/test_bad8.lz Binary files differnew file mode 100644 index 0000000..fca701b --- /dev/null +++ b/testsuite/test_bad8.lz diff --git a/testsuite/test_bad8.txt b/testsuite/test_bad8.txt new file mode 100644 index 0000000..3cb3ff4 --- /dev/null +++ b/testsuite/test_bad8.txt @@ -0,0 +1,3 @@ +1 + + Copyright (C) 1989
\ No newline at end of file diff --git a/testsuite/test_bad9.lz b/testsuite/test_bad9.lz Binary files differnew file mode 100644 index 0000000..becb0ec --- /dev/null +++ b/testsuite/test_bad9.lz diff --git a/testsuite/test_bad9.txt b/testsuite/test_bad9.txt new file mode 100644 index 0000000..b72a626 --- /dev/null +++ b/testsuite/test_bad9.txt @@ -0,0 +1,5 @@ +General +Public License instead of this License. + GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
diff --git a/testsuite/test_em.txt.lz b/testsuite/test_em.txt.lz Binary files differnew file mode 100644 index 0000000..7e96250 --- /dev/null +++ b/testsuite/test_em.txt.lz diff --git a/unzcrash.cc b/unzcrash.cc index d22b650..d897021 100644 --- a/unzcrash.cc +++ b/unzcrash.cc @@ -1,25 +1,25 @@ -/* Unzcrash - Tests robustness of decompressors to corrupted data. - Inspired by unzcrash.c from Julian Seward's bzip2. - Copyright (C) 2008-2019 Antonio Diaz Diaz. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. +/* Unzcrash - Tests robustness of decompressors to corrupted data. + Inspired by unzcrash.c from Julian Seward's bzip2. + Copyright (C) 2008-2021 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ /* - Exit status: 0 for a normal exit, 1 for environmental problems - (file not found, invalid flags, I/O errors, etc), 2 to indicate a - corrupt or invalid input file, 3 for an internal consistency error - (eg, bug) which caused unzcrash to panic. + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file, 3 for an internal consistency error + (eg, bug) which caused unzcrash to panic. */ #define _FILE_OFFSET_BITS 64 @@ -52,7 +52,7 @@ void show_error( const char * const msg, const int errcode = 0, namespace { const char * const program_name = "unzcrash"; -const char * invocation_name = 0; +const char * invocation_name = program_name; // default value int verbosity = 0; @@ -60,31 +60,28 @@ int verbosity = 0; void show_help() { std::printf( "Unzcrash tests the robustness of decompressors to corrupted data.\n" - "\nBy default, unzcrash reads the specified file and then repeatedly\n" - "decompresses it, increasing 256 times each byte of the compressed data,\n" - "so as to test all possible one-byte errors. Note that it may take years\n" - "or even centuries to test all possible one-byte errors in a large file\n" - "(tens of MB).\n" - "\nIf the '--block' option is given, unzcrash reads the specified file\n" - "and then repeatedly decompresses it, setting all bytes in each\n" - "successive block to the value given, so as to test all possible full\n" - "sector errors.\n" - "\nIf the '--truncate' option is given, unzcrash reads the specified\n" - "file and then repeatedly decompresses it, truncating the file to\n" - "increasing lengths, so as to test all possible truncation points.\n" - "\nNone of the three test modes described above should cause any invalid\n" - "memory accesses. 
If any of them does, please, report it as a bug to the\n" - "maintainers of the decompressor being tested.\n" - "\nIf the decompressor returns with zero status, unzcrash compares the\n" - "output of the decompressor for the original and corrupt files. If the\n" - "outputs differ, it means that the decompressor returned a false\n" - "negative; it failed to recognize the corruption and produced garbage\n" - "output. The only exception is when a multimember file is truncated just\n" - "after the last byte of a member, producing a shorter but valid\n" - "compressed file. Except in this latter case, please, report any false\n" - "negative as a bug.\n" - "\nIn order to compare the outputs, unzcrash needs a zcmp program able to\n" - "understand the format being tested. For example the one provided by zutils.\n" + "\nBy default, unzcrash reads the file specified and then repeatedly\n" + "decompresses it, increasing 256 times each byte of the compressed data, so\n" + "as to test all possible one-byte errors. Note that it may take years or even\n" + "centuries to test all possible one-byte errors in a large file (tens of MB).\n" + "\nIf the option '--block' is given, unzcrash reads the file specified and\n" + "then repeatedly decompresses it, setting all bytes in each successive block\n" + "to the value given, so as to test all possible full sector errors.\n" + "\nIf the option '--truncate' is given, unzcrash reads the file specified\n" + "and then repeatedly decompresses it, truncating the file to increasing\n" + "lengths, so as to test all possible truncation points.\n" + "\nNone of the three test modes described above should cause any invalid memory\n" + "accesses. If any of them does, please, report it as a bug to the maintainers\n" + "of the decompressor being tested.\n" + "\nIf the decompressor returns with zero status, unzcrash compares the output\n" + "of the decompressor for the original and corrupt files. 
If the outputs\n" + "differ, it means that the decompressor returned a false negative; it failed\n" + "to recognize the corruption and produced garbage output. The only exception\n" + "is when a multimember file is truncated just after the last byte of a\n" + "member, producing a shorter but valid compressed file. Except in this latter\n" + "case, please, report any false negative as a bug.\n" + "\nIn order to compare the outputs, unzcrash needs a 'zcmp' program able to\n" + "understand the format being tested. For example the zcmp provided by zutils.\n" "Use '--zcmp=false' to disable comparisons.\n" "\nUsage: %s [options] 'lzip -t' file.lz\n", invocation_name ); std::printf( "\nOptions:\n" @@ -188,7 +185,7 @@ uint8_t * read_file( const char * const name, long * const size ) long buffer_size = 1 << 20; uint8_t * buffer = (uint8_t *)std::malloc( buffer_size ); - if( !buffer ) { show_error( "Not enough memory." ); return 0; } + if( !buffer ) { show_error( mem_msg ); return 0; } long file_size = std::fread( buffer, 1, buffer_size, f ); while( file_size >= buffer_size ) { @@ -201,8 +198,7 @@ uint8_t * read_file( const char * const name, long * const size ) } buffer_size = ( buffer_size <= LONG_MAX / 2 ) ? 2 * buffer_size : LONG_MAX; uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); - if( !tmp ) - { show_error( "Not enough memory." 
); std::free( buffer ); return 0; } + if( !tmp ) { show_error( mem_msg ); std::free( buffer ); return 0; } buffer = tmp; file_size += std::fread( buffer + file_size, 1, buffer_size - file_size, f ); } @@ -304,7 +300,7 @@ int main( const int argc, const char * const argv[] ) Mode program_mode = m_byte; uint8_t block_value = 0; bool verify = true; - invocation_name = argv[0]; + if( argc > 0 ) invocation_name = argv[0]; const Arg_parser::Option options[] = { @@ -439,7 +435,7 @@ int main( const int argc, const char * const argv[] ) { ++failed_comparisons; if( verbosity >= 0 ) - std::fprintf( stderr, "byte %ld comparison failed\n", i ); + std::fprintf( stderr, "length %ld comparison failed\n", i ); } } } @@ -447,7 +443,7 @@ int main( const int argc, const char * const argv[] ) else if( program_mode == m_block ) { uint8_t * block = (uint8_t *)std::malloc( block_size ); - if( !block ) { show_error( "Not enough memory." ); return 1; } + if( !block ) { show_error( mem_msg ); return 1; } for( long i = pos; i < end; i += std::min( delta, end - i ) ) { const long size = std::min( block_size, file_size - i ); @@ -497,7 +493,7 @@ int main( const int argc, const char * const argv[] ) { ++decompressions; if( verbosity >= 2 ) - std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", + std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", buffer[i], byte, j ); FILE * f = popen( command, "w" ); if( !f ) { show_error( "Can't open pipe", errno ); return 1; } @@ -506,8 +502,8 @@ int main( const int argc, const char * const argv[] ) { ++successes; if( verbosity >= 0 ) - { if( verbosity < 2 ) - std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", + { if( verbosity < 2 ) // else already printed above + std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", buffer[i], byte, j ); std::fputs( "passed the test\n", stderr ); } if( zcmp_command[0] ) |