diff options
Diffstat (limited to '')
-rw-r--r-- | AUTHORS | 1 | ||||
-rw-r--r-- | COPYING | 338 | ||||
-rw-r--r-- | ChangeLog | 233 | ||||
-rw-r--r-- | INSTALL | 83 | ||||
-rw-r--r-- | Makefile.in | 168 | ||||
-rw-r--r-- | NEWS | 28 | ||||
-rw-r--r-- | README | 94 | ||||
-rw-r--r-- | alone_to_lz.cc | 149 | ||||
-rw-r--r-- | arg_parser.cc | 197 | ||||
-rw-r--r-- | arg_parser.h | 110 | ||||
-rw-r--r-- | common.h | 43 | ||||
-rwxr-xr-x | configure | 193 | ||||
-rw-r--r-- | decoder.cc | 300 | ||||
-rw-r--r-- | decoder.h | 383 | ||||
-rw-r--r-- | doc/lziprecover.1 | 143 | ||||
-rw-r--r-- | doc/lziprecover.info | 1499 | ||||
-rw-r--r-- | doc/lziprecover.texi | 1587 | ||||
-rw-r--r-- | dump_remove.cc | 292 | ||||
-rw-r--r-- | list.cc | 125 | ||||
-rw-r--r-- | lunzcrash.cc | 370 | ||||
-rw-r--r-- | lzip.h | 523 | ||||
-rw-r--r-- | lzip_index.cc | 358 | ||||
-rw-r--r-- | lzip_index.h | 94 | ||||
-rw-r--r-- | main.cc | 1090 | ||||
-rw-r--r-- | main_common.cc | 196 | ||||
-rw-r--r-- | md5.cc | 206 | ||||
-rw-r--r-- | md5.h | 49 | ||||
-rw-r--r-- | merge.cc | 649 | ||||
-rw-r--r-- | mtester.cc | 377 | ||||
-rw-r--r-- | mtester.h | 395 | ||||
-rw-r--r-- | nrep_stats.cc | 117 | ||||
-rw-r--r-- | range_dec.cc | 185 | ||||
-rw-r--r-- | repair.cc | 517 | ||||
-rw-r--r-- | reproduce.cc | 786 | ||||
-rw-r--r-- | split.cc | 142 | ||||
-rwxr-xr-x | testsuite/check.sh | 1448 | ||||
-rw-r--r-- | testsuite/fox.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox6.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad1.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad1.txt | 4 | ||||
-rw-r--r-- | testsuite/fox6_bad2.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad3.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad4.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad5.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_bad6.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_sc1.lz | bin | 0 -> 480 bytes | |||
-rw-r--r-- | testsuite/fox6_sc2.lz | bin | 0 -> 500 bytes | |||
-rw-r--r-- | testsuite/fox6_sc3.lz | bin | 0 -> 500 bytes | |||
-rw-r--r-- | testsuite/fox6_sc4.lz | bin | 0 -> 500 bytes | |||
-rw-r--r-- | testsuite/fox6_sc5.lz | bin | 0 -> 500 bytes | |||
-rw-r--r-- | testsuite/fox6_sc6.lz | bin | 0 -> 500 bytes | |||
-rw-r--r-- | testsuite/fox_bcrc.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_crc0.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_das46.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_de20.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_mes81.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_s11.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/fox_v2.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/numbers.lz | bin | 0 -> 369 bytes | |||
-rw-r--r-- | testsuite/numbersbt.lz | bin | 0 -> 392 bytes | |||
-rw-r--r-- | testsuite/test.txt | 676 | ||||
-rw-r--r-- | testsuite/test.txt.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test.txt.lzma | bin | 0 -> 7363 bytes | |||
-rw-r--r-- | testsuite/test21723.txt | 7 | ||||
-rw-r--r-- | testsuite/test_bad1.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad2.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad3.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad4.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad5.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad6.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad6.txt | 26 | ||||
-rw-r--r-- | testsuite/test_bad7.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad7.txt | 215 | ||||
-rw-r--r-- | testsuite/test_bad8.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad8.txt | 3 | ||||
-rw-r--r-- | testsuite/test_bad9.lz | bin | 0 -> 7376 bytes | |||
-rw-r--r-- | testsuite/test_bad9.txt | 5 | ||||
-rw-r--r-- | testsuite/test_em.txt.lz | bin | 0 -> 14024 bytes | |||
-rw-r--r-- | unzcrash.cc | 645 |
79 files changed, 15049 insertions, 0 deletions
@@ -0,0 +1 @@ +Lziprecover was written by Antonio Diaz Diaz. @@ -0,0 +1,338 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) <year> <name of author> + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. 
If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..363ce18 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,233 @@ +2022-01-21 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.23 released. + * Decompression time has been reduced by 5-12% depending on the file. + * main_common.cc (getnum): Show option name and valid range if error. + * dump_remove.cc (dump_members): Check tty except for --dump=tdata. + * Option '-U, --unzcrash' now takes an argument ('1' or 'B<size>'). + * mtester.cc (duplicate_buffer): Use an external buffer. + * repair.cc (debug_decompress): Continue decoding on CRC mismatch. + * unzcrash.cc: Make zcmp_command a string of unlimited size. + Use execvp instead of popen to avoid invoking /bin/sh. + Print byte or block position in messages. + * New file common.h. + * Improve several descriptions in manual, '--help', and man page. + * lziprecover.texi: Change GNU Texinfo category to 'Compression'. + (Reported by Alfred M. Szmidt). + +2021-01-02 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.22 released. + * New options '-e, --reproduce', '--lzip-level', '--lzip-name', + '--reference-file', and '-E, --debug-reproduce'. + * Remove '--dump-tdata', '--remove-tdata', and '--strip-tdata'. + * main.cc (main): Report an error if a file name is empty. + Make '-o' behave like '-c', but writing to file. + Make '-c' and '-o' check whether the output is a terminal only once. + Do not open output if input is a terminal. + * main.cc (decompress): With '-i', ignore data errors, keep files. + * range_dec.cc: '-i -D' now decompresses a truncated last member. + * '-i -D' now returns 0 if only ignored errors are found. + * '-i' now considers any block > 36 with header a member, not a gap. 
+ * Replace 'decompressed', 'compressed' with 'out', 'in' in output. + * Fix several compiler warnings. (Reported by Nissanka Gooneratne). + * lzip_index.cc: Improve messages for corruption in last header. + * New debug options '-M, --md5sum' and '-U, --unzcrash'. + * main.cc: Set a valid invocation_name even if argc == 0. + * Document extraction from tar.lz in manual, '--help', and man page. + * New files lunzcrash.cc, md5.h, md5.cc, nrep_stats.cc, reproduce.cc. + * lziprecover.texi: New chapter 'Reproducing one sector'. + New sections 'Merging with a backup' and 'Reproducing a mailbox'. + Document the debug options for experts. + * check.sh: Lzip 1.16 or newer is required to run the tests. + * testsuite: Add 9 new test files. + +2019-01-04 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.21 released. + * Rename File_* to Lzip_*. + * New options '--dump', '--remove', and '--strip'. They + replace '--dump-tdata', '--remove-tdata', and '--strip-tdata', + which are now aliases and will be removed in version 1.22. + * lzip.h (Lzip_trailer): New function 'verify_consistency'. + * lzip_index.cc: Lzip_index now detects gaps between members, + some kinds of corrupt trailers and + some fake trailers embedded in trailing data. + * split.cc: Use Lzip_index to split members, gaps and trailing data. + * split.cc: Verify last member before writing anything. + * list.cc (list_files): With '-i', ignore format errors, show gaps. + * range_dec.cc: With '-i', ignore a truncated last member. + * main.cc (main): Check return value of close( infd ). + * Improve and add new diagnostic messages. + * Print '\n' instead of '\r' if !isatty( 1 ) in merge, repair. + * main.cc: Compile on DOS with DJGPP. + * lziprecover.texi: New chapter 'Tarlz'. + * configure: Accept appending to CXXFLAGS; 'CXXFLAGS+=OPTIONS'. + * INSTALL: Document use of CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO'. + * New test files fox.lz, fox6_sc[1-6].lz. 
+ +2018-02-12 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.20 released. + * split.cc: Fix splitting of files > 64 KiB broken since 1.16. + * New options '--dump-tdata', '--remove-tdata', '--strip-tdata', and + '--loose-trailing'. + * Improve corrupt header detection to HD=3. + * main.cc: Show corrupt or truncated header in multimember file. + * Replace 'bits/byte' with inverse compression ratio in output. + * Show progress of decompression at verbosity level 2 (-vv). + * Show progress of decompression only if stderr is a terminal. + * main.cc: Show final diagnostic when testing multiple files. + * decoder.cc (verify_trailer): Show stored sizes also in hex. + Show dictionary size at verbosity level 4 (-vvvv). + +2017-04-10 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.19 released. + * merge.cc: Fix members with thousands of scattered errors. + * Option '-a' now works with '-l' and '-D'. + * The output of option '-l, --list' has been simplified. + * main.cc: Continue testing if any input file is a terminal. + * main.cc: Show trailing data in both hexadecimal and ASCII. + * lzip_index.cc: Improve detection of bad dict and trailing data. + * lzip_index.cc: Skip trailing data more efficiently. + * lzip.h: Unify messages for bad magic, trailing data, etc. + * New struct Bad_byte allows delta and flip modes for bad_value. + * unzcrash.cc: New option '-e, --set-byte'. + +2016-05-12 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.18 released. + * New option '-a, --trailing-error'. + * merge.cc (open_input_files): Use CRC to test identical files. + * repair.cc (repair_file): Detect gross damage before repairing. + * repair.cc: Repair a damaged dictionary size in the header. + * repair.cc: Try bytes at offsets 7 to 11 first. + * Decompression time has been reduced by 2%. + * main.cc (decompress): Print up to 6 bytes of trailing data + when '-tvvvv' is specified. + * decoder.cc (verify_trailer): Remove test of final code. 
+ * main.cc (main): Delete '--output' file if infd is a terminal. + * main.cc (main): Don't use stdin more than once. + * Use 'close_and_set_permissions' and 'set_signals' in all modes. + * range_dec.cc (list_file): Show dictionary size and size of + trailing data (if any) with '-lv'. + * New options '-A, --alone-to-lz', '-W, --debug-decompress', and + '-X, --show-packets'. + * Change short name of option '--debug-delay' to '-Y'. + * Change short name of option '--debug-repair' to '-Z'. + * unzcrash.cc: New options '-B, --block', '-d, --delta', + '-t, --truncate', and '-z, --zcmp'. + * unzcrash.cc: Read files as large as RAM allows. + * unzcrash.cc: Compare output using zcmp if decompressor returns 0. + * unzcrash.cc: Accept negative position and size. + * lziprecover.texi: New chapter 'Trailing data'. + * configure: Avoid warning on some shells when testing for g++. + * Makefile.in: Detect the existence of install-info. + * check.sh: Don't check error messages. + * check.sh: A POSIX shell is required to run the tests. + +2015-05-28 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.17 released. + * New block selection algorithm makes merge up to 100 times faster. + * repair.cc: Repair time has been reduced by 15%. + * New options '-y, --debug-delay' and '-z, --debug-repair'. + * Makefile.in: New targets 'install*-compress'. + * testsuite/unzcrash.cc: Move to top directory. + * lziprecover.texi: New chapter 'File names'. + +2014-08-29 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.16 released. + * New class LZ_mtester makes repair up to 10 times faster. + * main.cc (close_and_set_permissions): Behave like 'cp -p'. + * lziprecover.texinfo: Rename to lziprecover.texi. + * Change license to GPL version 2 or later. + +2013-09-14 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.15 released. + * repair.cc: Repair multimember files with up to one byte error + per member. + * merge.cc: Merge multimember files. + * main.cc (show_header): Don't show header version. 
+ * lziprecover.texinfo: New chapters 'Repairing files', + 'Merging files', and 'Unzcrash'. + +2013-05-31 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.14 released. + * New option '-i, --ignore-errors'. + * Option '-l, --list' now accepts more than one file. + * Decompression time has been reduced by 12%. + * split.cc: Use as few digits as possible in file names. + * split.cc: In verbose mode show names of files being created. + * main.cc (show_header): Show header version if verbosity >= 4. + * configure: Options now accept a separate argument. + * Makefile.in: New targets 'install-as-lzip' and 'install-bin'. + * main.cc: Use 'setmode' instead of '_setmode' on Windows and OS/2. + +2012-02-24 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.13 released. + * Lziprecover is now distributed in its own package. Until + version 1.12 it was included in the lzip package. Previous + entries in this file are taken from there. + * lziprecover.cc: Rename to main.cc. + * New files merge.cc, repair.cc, split.cc, and range_dec.cc. + * main.cc: Add decompressor options (-c, -d, -k, -t) so that + an external decompressor is not needed for recovery nor for + "make check". + * New option '-D, --range-decompress', which extracts a range of + bytes decompressing only the members containing the desired data. + * New option '-l, --list', which prints correct total file sizes + even for multimember files. + * merge.cc, repair.cc: Remove output file if recovery fails. + * Change quote characters in messages as advised by GNU Standards. + * split.cc: Use Boyer-Moore algorithm to search for headers. + * configure: Rename 'datadir' to 'datarootdir'. + +2011-04-30 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.12 released. + * lziprecover.cc: If '-v' is not specified show errors only. + * unzcrash.cc: Use Arg_parser. + * unzcrash.cc: New options '-b, --bits', '-p, --position', and + '-s, --size'. 
+ +2010-09-16 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.11 released. + * lziprecover.cc: New option '-m, --merge', which tries to produce a + correct file by merging the good parts of two or more damaged copies. + * lziprecover.cc: New option '-R, --repair' for repairing a + 1-byte error in single-member files. + * decoder.cc (decode_member): Detect file errors earlier to improve + efficiency of lziprecover's new repair capability. + This change also prevents (harmless) access to uninitialized + memory when decompressing a corrupt file. + * lziprecover.cc: New options '-f, --force' and '-o, --output'. + * lziprecover.cc: New option '-s, --split' to select the until + now only operation of splitting multimember files. + * lziprecover.cc: If no operation is specified, warn the user and do + nothing. + +2009-06-22 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.6 released. + * lziprecover.1: New man page. + * check.sh: Test lziprecover. + +2009-01-24 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.4 released. + * Add 'lziprecover', a member recoverer program. + * unzcrash.cc: Test all 1-byte errors. + + +Copyright (C) 2009-2022 Antonio Diaz Diaz. + +This file is a collection of facts, and thus it is not copyrightable, +but just in case, you have unlimited permission to copy, distribute, and +modify it. @@ -0,0 +1,83 @@ +Requirements +------------ +You will need a C++98 compiler with support for 'long long'. +(gcc 3.3.6 or newer is recommended). +I use gcc 6.1.0 and 3.3.6, but the code should compile with any standards +compliant compiler. +Gcc is available at http://gcc.gnu.org. + +Lzip 1.16 (or clzip 1.6) or newer is required to run the tests. + +Unzcrash needs a 'zcmp' program able to understand the format being tested. +For example the zcmp provided by zutils. 
+Zutils is available at http://www.nongnu.org/zutils/zutils.html + +The operating system must allow signal handlers read access to objects with +static storage duration so that the cleanup handler for Control-C can delete +the partial output file. + + +Procedure +--------- +1. Unpack the archive if you have not done so already: + + tar -xf lziprecover[version].tar.lz +or + lzip -cd lziprecover[version].tar.lz | tar -xf - + +This creates the directory ./lziprecover[version] containing the source from +the main archive. + +2. Change to lziprecover directory and run configure. + (Try 'configure --help' for usage instructions). + + cd lziprecover[version] + ./configure + + If you are compiling on MinGW, use: + + ./configure CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' + +3. Run make. + + make + +4. Optionally, type 'make check' to run the tests that come with lziprecover. + +5. Type 'make install' to install the program and any data files and + documentation. + + Or type 'make install-compress', which additionally compresses the + info manual and the man page after installation. + (Installing compressed docs may become the default in the future). + + You can install only the program, the info manual, or the man page by + typing 'make install-bin', 'make install-info', or 'make install-man' + respectively. + + Instead of 'make install', you can type 'make install-as-lzip' to + install the program and any data files and documentation, and link + the program to the name 'lzip'. + + +Another way +----------- +You can also compile lziprecover into a separate directory. +To do this, you must use a version of 'make' that supports the variable +'VPATH', such as GNU 'make'. 'cd' to the directory where you want the +object files and executables to go and run the 'configure' script. +'configure' automatically checks for the source code in '.', in '..', and +in the directory that 'configure' is in. 
+ +'configure' recognizes the option '--srcdir=DIR' to control where to +look for the sources. Usually 'configure' can determine that directory +automatically. + +After running 'configure', you can run 'make' and 'make install' as +explained above. + + +Copyright (C) 2009-2022 Antonio Diaz Diaz. + +This file is free documentation: you have unlimited permission to copy, +distribute, and modify it. diff --git a/Makefile.in b/Makefile.in new file mode 100644 index 0000000..a8bd012 --- /dev/null +++ b/Makefile.in @@ -0,0 +1,168 @@ + +DISTNAME = $(pkgname)-$(pkgversion) +INSTALL = install +INSTALL_PROGRAM = $(INSTALL) -m 755 +INSTALL_DATA = $(INSTALL) -m 644 +INSTALL_DIR = $(INSTALL) -d -m 755 +SHELL = /bin/sh +CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 + +objs = arg_parser.o alone_to_lz.o lzip_index.o list.o dump_remove.o \ + lunzcrash.o md5.o merge.o mtester.o nrep_stats.o range_dec.o \ + repair.o reproduce.o split.o decoder.o main.o +unzobjs = arg_parser.o unzcrash.o + + +.PHONY : all install install-bin install-info install-man \ + install-strip install-compress install-strip-compress \ + install-bin-strip install-info-compress install-man-compress \ + install-as-lzip \ + uninstall uninstall-bin uninstall-info uninstall-man \ + doc info man check dist clean distclean + +all : $(progname) + +$(progname) : $(objs) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(objs) + +unzcrash : $(unzobjs) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(unzobjs) + +main.o : main.cc + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -DPROGVERSION=\"$(pkgversion)\" -c -o $@ $< + +unzcrash.o : unzcrash.cc + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -DPROGVERSION=\"$(pkgversion)\" -c -o $@ $< + +%.o : %.cc + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< + +$(objs) : Makefile +lzip.h : common.h +alone_to_lz.o : lzip.h mtester.h +arg_parser.o : arg_parser.h +decoder.o : lzip.h decoder.h +dump_remove.o : lzip.h lzip_index.h +list.o : lzip.h lzip_index.h +lunzcrash.o : lzip.h md5.h mtester.h lzip_index.h 
+lzip_index.o : lzip.h lzip_index.h +main.o : arg_parser.h lzip.h decoder.h main_common.cc +md5.o : md5.h +merge.o : lzip.h decoder.h lzip_index.h +mtester.o : lzip.h md5.h mtester.h +nrep_stats.o : lzip.h lzip_index.h +range_dec.o : lzip.h decoder.h lzip_index.h +repair.o : lzip.h mtester.h lzip_index.h +reproduce.o : lzip.h md5.h mtester.h lzip_index.h +split.o : lzip.h lzip_index.h +unzcrash.o : Makefile arg_parser.h main_common.cc + + +doc : info man + +info : $(VPATH)/doc/$(pkgname).info + +$(VPATH)/doc/$(pkgname).info : $(VPATH)/doc/$(pkgname).texi + cd $(VPATH)/doc && makeinfo $(pkgname).texi + +man : $(VPATH)/doc/$(progname).1 + +$(VPATH)/doc/$(progname).1 : $(progname) + help2man -n 'recovers data from damaged lzip files' -o $@ ./$(progname) + +Makefile : $(VPATH)/configure $(VPATH)/Makefile.in + ./config.status + +check : all + @$(VPATH)/testsuite/check.sh $(VPATH)/testsuite $(pkgversion) + +install : install-bin install-info install-man +install-strip : install-bin-strip install-info install-man +install-compress : install-bin install-info-compress install-man-compress +install-strip-compress : install-bin-strip install-info-compress install-man-compress + +install-bin : all + if [ ! -d "$(DESTDIR)$(bindir)" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(bindir)" ; fi + $(INSTALL_PROGRAM) ./$(progname) "$(DESTDIR)$(bindir)/$(progname)" + +install-bin-strip : all + $(MAKE) INSTALL_PROGRAM='$(INSTALL_PROGRAM) -s' install-bin + +install-info : + if [ ! -d "$(DESTDIR)$(infodir)" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(infodir)" ; fi + -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* + $(INSTALL_DATA) $(VPATH)/doc/$(pkgname).info "$(DESTDIR)$(infodir)/$(pkgname).info" + -if $(CAN_RUN_INSTALLINFO) ; then \ + install-info --info-dir="$(DESTDIR)$(infodir)" "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ + fi + +install-info-compress : install-info + lzip -v -9 "$(DESTDIR)$(infodir)/$(pkgname).info" + +install-man : + if [ ! 
-d "$(DESTDIR)$(mandir)/man1" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" ; fi + -rm -f "$(DESTDIR)$(mandir)/man1/$(progname).1"* + $(INSTALL_DATA) $(VPATH)/doc/$(progname).1 "$(DESTDIR)$(mandir)/man1/$(progname).1" + +install-man-compress : install-man + lzip -v -9 "$(DESTDIR)$(mandir)/man1/$(progname).1" + +install-as-lzip : install + -rm -f "$(DESTDIR)$(bindir)/lzip" + cd "$(DESTDIR)$(bindir)" && ln -s $(progname) lzip + +uninstall : uninstall-man uninstall-info uninstall-bin + +uninstall-bin : + -rm -f "$(DESTDIR)$(bindir)/$(progname)" + +uninstall-info : + -if $(CAN_RUN_INSTALLINFO) ; then \ + install-info --info-dir="$(DESTDIR)$(infodir)" --remove "$(DESTDIR)$(infodir)/$(pkgname).info" ; \ + fi + -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* + +uninstall-man : + -rm -f "$(DESTDIR)$(mandir)/man1/$(progname).1"* + +dist : doc + ln -sf $(VPATH) $(DISTNAME) + tar -Hustar --owner=root --group=root -cvf $(DISTNAME).tar \ + $(DISTNAME)/AUTHORS \ + $(DISTNAME)/COPYING \ + $(DISTNAME)/ChangeLog \ + $(DISTNAME)/INSTALL \ + $(DISTNAME)/Makefile.in \ + $(DISTNAME)/NEWS \ + $(DISTNAME)/README \ + $(DISTNAME)/configure \ + $(DISTNAME)/doc/$(progname).1 \ + $(DISTNAME)/doc/$(pkgname).info \ + $(DISTNAME)/doc/$(pkgname).texi \ + $(DISTNAME)/*.h \ + $(DISTNAME)/*.cc \ + $(DISTNAME)/testsuite/check.sh \ + $(DISTNAME)/testsuite/fox6_bad1.txt \ + $(DISTNAME)/testsuite/test.txt \ + $(DISTNAME)/testsuite/test21723.txt \ + $(DISTNAME)/testsuite/test_bad[6-9].txt \ + $(DISTNAME)/testsuite/fox.lz \ + $(DISTNAME)/testsuite/fox_*.lz \ + $(DISTNAME)/testsuite/fox6.lz \ + $(DISTNAME)/testsuite/fox6_sc[1-6].lz \ + $(DISTNAME)/testsuite/fox6_bad[1-6].lz \ + $(DISTNAME)/testsuite/numbers.lz \ + $(DISTNAME)/testsuite/numbersbt.lz \ + $(DISTNAME)/testsuite/test.txt.lz \ + $(DISTNAME)/testsuite/test.txt.lzma \ + $(DISTNAME)/testsuite/test_bad[1-9].lz \ + $(DISTNAME)/testsuite/test_em.txt.lz + rm -f $(DISTNAME) + lzip -v -9 $(DISTNAME).tar + +clean : + -rm -f $(progname) $(objs) + -rm 
-f unzcrash unzcrash.o + +distclean : clean + -rm -f Makefile config.status *.tar *.tar.lz @@ -0,0 +1,28 @@ +Changes in version 1.23: + +Decompression time has been reduced by 5-12% depending on the file. + +In case of error in a numerical argument to a command line option, lziprecover +now shows the name of the option and the range of valid values. + +Options '--dump' and '--strip' now refuse to write compressed data to a +terminal except when dumping trailing data with '--dump=tdata'. + +The option '-U, --unzcrash' now requires an argument: '1' to test 1-bit +errors, or 'B<size>' to test zeroed blocks. + +The memory tester now allocates the dictionary once per member instead of +doing it for each test. This makes '-U, --unzcrash' about two times faster +on my machine on files with an uncompressed size larger than about 30 MB. + +'-W, --debug-decompress' now continues decompressing the members following +the damaged member if it has been fully decompressed (just failed with a CRC +mismatch). + +The tool unzcrash now uses execvp instead of popen to avoid invoking /bin/sh +and run faster. It also prints byte or block position in messages. + +Several descriptions have been improved in manual, '--help', and man page. + +The texinfo category of the manual has been changed from 'Data Compression' +to 'Compression' to match that of gzip. (Reported by Alfred M. Szmidt). @@ -0,0 +1,94 @@ +Description + +Lziprecover is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files (up to one single-byte error per member), produce a correct file by +merging the good parts of two or more damaged copies, reproduce a missing +(zeroed) sector using a reference file, extract data from damaged files, +decompress files, and test integrity of files. + +Lziprecover can remove the damaged members from multimember files, for +example multimember tar.lz archives. 
+ +Lziprecover provides random access to the data in multimember files; it only +decompresses the members containing the desired data. + +Lziprecover facilitates the management of metadata stored as trailing data +in lzip files. + +Lziprecover is not a replacement for regular backups, but a last line of +defense for the case where the backups are also damaged. + +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: + + * The lzip format provides very safe integrity checking and some data + recovery means. The program lziprecover can repair bit flip errors + (one of the most common forms of data corruption) in lzip files, and + provides data recovery capabilities, including error-checked merging + of damaged copies of a file. + + * The lzip format is as simple as possible (but not simpler). The lzip + manual provides the source code of a simple decompressor along with a + detailed explanation of how it works, so that with the only help of the + lzip manual it would be possible for a digital archaeologist to extract + the data from a lzip file long after quantum computers eventually + render LZMA obsolete. + + * Additionally the lzip reference implementation is copylefted, which + guarantees that it will remain free forever. + +A nice feature of the lzip format is that a corrupt byte is easier to repair +the nearer it is from the beginning of the file. Therefore, with the help of +lziprecover, losing an entire archive just because of a corrupt byte near +the beginning is a thing of the past. + +Compression may be good for long-term archiving. For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. 
This is especially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. + +Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family: lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. + +If the cause of file corruption is a damaged medium, the combination +GNU ddrescue + lziprecover is the recommended option for recovering data +from damaged lzip files. + +If a file is too damaged for lziprecover to repair it, all the recoverable +data in all members of the file can be extracted in one step with the +command 'lziprecover -cd -i file.lz > file'. + +When recovering data, lziprecover takes as arguments the names of the +damaged files and writes zero or more recovered files depending on the +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. + +When decompressing or testing file integrity, lziprecover behaves like lzip +or lunzip. + +To give you an idea of its possibilities, when merging two copies, each of +them with one damaged area affecting 1 percent of the copy, the probability +of obtaining a correct file is about 98 percent. With three such copies the +probability rises to 99.97 percent. For large files (a few MB) with small +errors (one sector damaged per copy), the probability approaches 100 percent +even with only two copies. (Supposing that the errors are randomly located +inside each copy). + +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type 'make unzcrash' in the lziprecover source +directory to build it. Then try 'unzcrash --help'. + + +Copyright (C) 2009-2022 Antonio Diaz Diaz. + +This file is free documentation: you have unlimited permission to copy, +distribute, and modify it. 
+ +The file Makefile.in is a data file used by configure to produce the +Makefile. It has the same copyright owner and permissions that configure +itself. diff --git a/alone_to_lz.cc b/alone_to_lz.cc new file mode 100644 index 0000000..9e5b330 --- /dev/null +++ b/alone_to_lz.cc @@ -0,0 +1,149 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <new> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> + +#include "lzip.h" +#include "mtester.h" + + +namespace { + +/* Return the address of a malloc'd buffer containing the file data and + the file size in '*size'. The buffer is at least 20 bytes larger. + In case of error, return 0 and do not modify '*size'. 
+*/ +uint8_t * read_file( const int infd, long * const size, + const char * const filename ) + { + long buffer_size = 1 << 20; /* initial capacity: 1 MiB */ + uint8_t * buffer = (uint8_t *)std::malloc( buffer_size ); + if( !buffer ) throw std::bad_alloc(); + + long file_size = readblock( infd, buffer, buffer_size - 20 ); /* the last 20 bytes of the buffer are always kept unused */ + while( file_size >= buffer_size - 20 && !errno ) /* usable space is full: grow the buffer and keep reading */ + { + if( buffer_size >= LONG_MAX ) + { show_file_error( filename, "File is too large" ); std::free( buffer ); + return 0; } + buffer_size = ( buffer_size <= LONG_MAX / 2 ) ? 2 * buffer_size : LONG_MAX; /* double the capacity, saturating at LONG_MAX */ + uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); + if( !tmp ) { std::free( buffer ); throw std::bad_alloc(); } /* a failed realloc leaves the old block allocated, so free it first */ + buffer = tmp; + file_size += + readblock( infd, buffer + file_size, buffer_size - 20 - file_size ); + } + if( errno ) /* NOTE(review): relies on errno being 0 unless readblock failed -- confirm the readblock contract */ + { + show_file_error( filename, "Error reading file", errno ); + std::free( buffer ); return 0; + } + *size = file_size; + return buffer; + } + + +/* Clamp *dictionary_size to the range [min_dictionary_size, max_dictionary_size]. Return true if it was already in range, false if it had to be changed. */ bool validate_ds( unsigned * const dictionary_size ) + { + if( *dictionary_size < min_dictionary_size ) + { *dictionary_size = min_dictionary_size; return false; } + if( *dictionary_size > max_dictionary_size ) + { *dictionary_size = max_dictionary_size; return false; } + return true; + } + +} // end namespace + + +/* Convert the LZMA-alone stream read from infd to an lzip member, in place in memory, and write it to outfd. The input must start with byte 93 and have bytes 5 to 12 all equal to 0xFF; bytes 1 to 4 give the dictionary size, least significant byte first, which is then clamped with validate_ds. The lzip header is written over the last Lzip_header::size bytes of the 13-byte LZMA header, and a zeroed trailer is appended in the spare space left by read_file (assumes Lzip_trailer::size <= 20 -- TODO confirm). A first LZ_mtester pass supplies the CRC, the sizes, and the maximum match distance (used to shrink the dictionary size); a second pass checks the finished member before it is written. Return 0 on success, 1 on read or write error, 2 if the input is rejected or the conversion fails. */ int alone_to_lz( const int infd, const Pretty_print & pp ) + { + enum { lzma_header_size = 13, offset = lzma_header_size - Lzip_header::size }; /* offset = position in the buffer where the lzip header starts */ + long file_size = 0; + uint8_t * const buffer = read_file( infd, &file_size, pp.name() ); + if( !buffer ) return 1; + if( file_size < lzma_header_size ) + { show_file_error( pp.name(), "file is too short" ); + std::free( buffer ); return 2; } + + if( buffer[0] != 93 ) // (45 * 2) + (9 * 0) + 3 + { + const Lzip_header & header = *(const Lzip_header *)buffer; + if( header.verify_magic() && header.verify_version() && + isvalid_ds( header.dictionary_size() ) ) + show_file_error( pp.name(), "file is already in lzip format" ); + else + show_file_error( pp.name(), "file has non-default 
LZMA properties" ); + std::free( buffer ); return 2; + } + for( int i = 5; i < 13; ++i ) if( buffer[i] != 0xFF ) + { show_file_error( pp.name(), "file is non-streamed" ); + std::free( buffer ); return 2; } + + if( verbosity >= 1 ) pp(); + unsigned dictionary_size = 0; + for( int i = 4; i > 0; --i ) + { dictionary_size <<= 8; dictionary_size += buffer[i]; } + const unsigned orig_dictionary_size = dictionary_size; + validate_ds( &dictionary_size ); + Lzip_header & header = *(Lzip_header *)( buffer + offset ); + header.set_magic(); + header.dictionary_size( dictionary_size ); + for( int i = 0; i < Lzip_trailer::size; ++i ) buffer[file_size++] = 0; + { + LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); + const int result = mtester.test_member(); + if( result == 1 && orig_dictionary_size > max_dictionary_size ) + { pp( "dictionary size is too large" ); std::free( buffer ); return 2; } + if( result != 3 || !mtester.finished() ) + { pp( "file is corrupt" ); std::free( buffer ); return 2; } + if( mtester.max_distance() < dictionary_size && + dictionary_size > min_dictionary_size ) + { + dictionary_size = + std::max( mtester.max_distance(), (unsigned)min_dictionary_size ); + header.dictionary_size( dictionary_size ); + } + Lzip_trailer & trailer = + *(Lzip_trailer *)( buffer + file_size - Lzip_trailer::size ); + trailer.data_crc( mtester.crc() ); + trailer.data_size( mtester.data_position() ); + trailer.member_size( mtester.member_position() ); + } + LZ_mtester mtester( buffer + offset, file_size - offset, dictionary_size ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { pp( "conversion failed" ); std::free( buffer ); return 2; } + if( writeblock( outfd, buffer + offset, file_size - offset ) != file_size - offset ) + { + show_error( "Error writing output file", errno ); + std::free( buffer ); return 1; + } + std::free( buffer ); + if( verbosity >= 1 ) std::fputs( "done\n", stderr ); + return 0; + } diff --git a/arg_parser.cc 
b/arg_parser.cc new file mode 100644 index 0000000..59998ac --- /dev/null +++ b/arg_parser.cc @@ -0,0 +1,197 @@ +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006-2022 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#include <cstring> +#include <string> +#include <vector> + +#include "arg_parser.h" + + +/* Parse the long option in opt, written as --name or --name=argument; arg is the command line token following opt, or null if there is none. A name may be abbreviated to any prefix that is not ambiguous. On success a Record is appended to data, argind is advanced past every token consumed, and true is returned. On failure false is returned with a message in error_. */ bool Arg_parser::parse_long_option( const char * const opt, const char * const arg, + const Option options[], int & argind ) + { + unsigned len; + int index = -1; + bool exact = false, ambig = false; + + for( len = 0; opt[len+2] && opt[len+2] != '='; ++len ) ; /* len = length of the name that follows the two leading dashes, up to the = sign or the end of opt */ + + // Test all long options for either exact match or abbreviated matches. 
+ for( int i = 0; options[i].code != 0; ++i ) + if( options[i].long_name && + std::strncmp( options[i].long_name, &opt[2], len ) == 0 ) + { + if( std::strlen( options[i].long_name ) == len ) // Exact match found + { index = i; exact = true; break; } + else if( index < 0 ) index = i; // First nonexact match found + else if( options[index].code != options[i].code || + options[index].has_arg != options[i].has_arg ) + ambig = true; // Second or later nonexact match found + } + + if( ambig && !exact ) /* an exact match always wins over ambiguous abbreviations */ + { + error_ = "option '"; error_ += opt; error_ += "' is ambiguous"; + return false; + } + + if( index < 0 ) // nothing found + { + error_ = "unrecognized option '"; error_ += opt; error_ += '\''; + return false; + } + + ++argind; /* opt itself is consumed */ + data.push_back( Record( options[index].code, options[index].long_name ) ); /* the full long name is recorded even if an abbreviation was typed */ + + if( opt[len+2] ) // '--<long_option>=<argument>' syntax + { + if( options[index].has_arg == no ) + { + error_ = "option '--"; error_ += options[index].long_name; + error_ += "' doesn't allow an argument"; + return false; + } + if( options[index].has_arg == yes && !opt[len+3] ) + { + error_ = "option '--"; error_ += options[index].long_name; + error_ += "' requires an argument"; + return false; + } + data.back().argument = &opt[len+3]; /* text following the = sign */ + return true; + } + + if( options[index].has_arg == yes ) /* a required argument not attached with = is taken from the next token; an optional one is never taken from there */ + { + if( !arg || !arg[0] ) + { + error_ = "option '--"; error_ += options[index].long_name; + error_ += "' requires an argument"; + return false; + } + ++argind; data.back().argument = arg; + return true; + } + + return true; + } + + +/* Parse the cluster of short options in opt (a dash followed by one or more option letters). The argument of an option may follow its letter directly inside opt; a required argument not given that way is taken from arg, the next command line token. One Record is appended to data per option and argind is advanced past the tokens consumed. Return false with a message in error_ on failure. */ bool Arg_parser::parse_short_option( const char * const opt, const char * const arg, + const Option options[], int & argind ) + { + int cind = 1; // character index in opt + + while( cind > 0 ) /* cind is set to 0 once opt has been fully consumed */ + { + int index = -1; + const unsigned char c = opt[cind]; + + if( c != 0 ) + for( int i = 0; options[i].code; ++i ) + if( c == options[i].code ) + { index = i; break; } + + if( index < 0 ) + { + error_ = "invalid option -- '"; error_ += c; error_ += '\''; 
+ return false; + } + + data.push_back( Record( c ) ); + if( opt[++cind] == 0 ) { ++argind; cind = 0; } // opt finished + + if( options[index].has_arg != no && cind > 0 && opt[cind] ) /* the rest of opt is the argument of this option */ + { + data.back().argument = &opt[cind]; ++argind; cind = 0; + } + else if( options[index].has_arg == yes ) /* a required argument must come from the next token */ + { + if( !arg || !arg[0] ) + { + error_ = "option requires an argument -- '"; error_ += c; + error_ += '\''; + return false; + } + data.back().argument = arg; ++argind; cind = 0; /* this second increment of argind skips the token used as argument */ + } + } + return true; + } + + +/* Parse argv[1] to argv[argc-1] against the options table, which ends at the entry whose code is 0. Unless in_order is true, non-option arguments are held back and appended after all the options. A lone -- token ends option parsing; every token after it is stored as a non-option. On a parse error, parsing stops and data is left empty with the message in error_. */ Arg_parser::Arg_parser( const int argc, const char * const argv[], + const Option options[], const bool in_order ) + { + if( argc < 2 || !argv || !options ) return; + + std::vector< const char * > non_options; // skipped non-options + int argind = 1; // index in argv + + while( argind < argc ) + { + const unsigned char ch1 = argv[argind][0]; + const unsigned char ch2 = ch1 ? argv[argind][1] : 0; + + if( ch1 == '-' && ch2 ) // we found an option + { + const char * const opt = argv[argind]; + const char * const arg = ( argind + 1 < argc ) ? 
argv[argind+1] : 0; + if( ch2 == '-' ) + { + if( !argv[argind][2] ) { ++argind; break; } // we found "--" + else if( !parse_long_option( opt, arg, options, argind ) ) break; + } + else if( !parse_short_option( opt, arg, options, argind ) ) break; + } + else + { + if( in_order ) data.push_back( Record( argv[argind++] ) ); + else non_options.push_back( argv[argind++] ); + } + } + if( !error_.empty() ) data.clear(); /* discard partial results on error */ + else + { + for( unsigned i = 0; i < non_options.size(); ++i ) + data.push_back( Record( non_options[i] ) ); + while( argind < argc ) /* tokens left after a lone -- are all non-options */ + data.push_back( Record( argv[argind++] ) ); + } + } + + +/* Restricted constructor: parse the single token opt, with arg as its candidate argument. If opt is not an option it is stored as one non-option Record. On a parse error data is left empty with the message in error_. */ Arg_parser::Arg_parser( const char * const opt, const char * const arg, + const Option options[] ) + { + if( !opt || !opt[0] || !options ) return; + + if( opt[0] == '-' && opt[1] ) // we found an option + { + int argind = 1; // dummy + if( opt[1] == '-' ) + { if( opt[2] ) parse_long_option( opt, arg, options, argind ); } + else + parse_short_option( opt, arg, options, argind ); + if( !error_.empty() ) data.clear(); + } + else data.push_back( Record( opt ) ); + } diff --git a/arg_parser.h b/arg_parser.h new file mode 100644 index 0000000..e854838 --- /dev/null +++ b/arg_parser.h @@ -0,0 +1,110 @@ +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006-2022 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +/* Arg_parser reads the arguments in 'argv' and creates a number of + option codes, option arguments, and non-option arguments. + + In case of error, 'error' returns a non-empty error message. + + 'options' is an array of 'struct Option' terminated by an element + containing a code which is zero. A null long_name means a short-only + option. A code value outside the unsigned char range means a long-only + option. + + Arg_parser normally makes it appear as if all the option arguments + were specified before all the non-option arguments for the purposes + of parsing, even if the user of your program intermixed option and + non-option arguments. If you want the arguments in the exact order + the user typed them, call 'Arg_parser' with 'in_order' = true. + + The argument '--' terminates all options; any following arguments are + treated as non-option arguments, even if they begin with a hyphen. + + The syntax for optional option arguments is '-<short_option><argument>' + (without whitespace), or '--<long_option>=<argument>'. 
+*/ + +class Arg_parser + { +public: + enum Has_arg { no, yes, maybe }; /* the option takes no argument, a required one, or an optional one */ + + struct Option + { + int code; // Short option letter or code ( code != 0 ) + const char * long_name; // Long option name (maybe null) + Has_arg has_arg; + }; + +private: + struct Record /* one parsed item: an option with its name and argument, or (code 0) a non-option stored in argument */ + { + int code; + std::string parsed_name; + std::string argument; + explicit Record( const unsigned char c ) /* short option: parsed_name is a dash plus the letter */ + : code( c ), parsed_name( "-" ) { parsed_name += c; } + Record( const int c, const char * const long_name ) /* long option: parsed_name is two dashes plus long_name */ + : code( c ), parsed_name( "--" ) { parsed_name += long_name; } + explicit Record( const char * const arg ) : code( 0 ), argument( arg ) {} + }; + + const std::string empty_arg; /* returned by reference when an index is out of range */ + std::string error_; + std::vector< Record > data; + + bool parse_long_option( const char * const opt, const char * const arg, + const Option options[], int & argind ); + bool parse_short_option( const char * const opt, const char * const arg, + const Option options[], int & argind ); + +public: + Arg_parser( const int argc, const char * const argv[], + const Option options[], const bool in_order = false ); + + // Restricted constructor. Parses a single token and argument (if any). + Arg_parser( const char * const opt, const char * const arg, + const Option options[] ); + + const std::string & error() const { return error_; } + + // The number of arguments parsed. May be different from argc. + int arguments() const { return data.size(); } + + /* If code( i ) is 0, argument( i ) is a non-option. + Else argument( i ) is the option's argument (or empty). */ + int code( const int i ) const + { + if( i >= 0 && i < arguments() ) return data[i].code; + else return 0; + } + + // Full name of the option parsed (short or long). 
+ const std::string & parsed_name( const int i ) const + { + if( i >= 0 && i < arguments() ) return data[i].parsed_name; + else return empty_arg; + } + + /* Argument of option i, or the non-option itself when code( i ) is 0. Returns an empty string if i is out of range. */ const std::string & argument( const int i ) const + { + if( i >= 0 && i < arguments() ) return data[i].argument; + else return empty_arg; + } + }; diff --git a/common.h b/common.h new file mode 100644 index 0000000..c3d0691 --- /dev/null +++ b/common.h @@ -0,0 +1,43 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/* Describes how to alter one byte. operator() maps the old byte value to the new one: old value plus value (delta), old value XOR value (flip), or value itself (literal). A default-constructed object has pos -1, a null option_name, mode literal and value 0. NOTE(review): pos is presumably the file position of the byte, with -1 meaning unset -- confirm in parse_bb. */ struct Bad_byte + { + enum Mode { literal, delta, flip }; + long long pos; + const char * option_name; + Mode mode; + uint8_t value; + + Bad_byte() : pos( -1 ), option_name( 0 ), mode( literal ), value( 0 ) {} + uint8_t operator()( const uint8_t old_value ) const + { + if( mode == delta ) return old_value + value; /* the sum is narrowed to 8 bits by the uint8_t return type */ + if( mode == flip ) return old_value ^ value; + return value; + } + + void parse_bb( const char * const arg, const char * const pn ); + }; + + +// defined in main_common.cc +void show_error( const char * const msg, const int errcode = 0, + const bool help = false ); +void show_file_error( const char * const filename, const char * const msg, + const int errcode = 0 ); +void internal_error( const char * const msg ); diff --git a/configure b/configure new file mode 100755 index 0000000..577f04f --- /dev/null +++ b/configure @@ -0,0 +1,193 @@ +#! /bin/sh +# configure script for Lziprecover - Data recovery tool for the lzip format +# Copyright (C) 2009-2022 Antonio Diaz Diaz. +# +# This configure script is free software: you have unlimited permission +# to copy, distribute, and modify it. + +pkgname=lziprecover +pkgversion=1.23 +progname=lziprecover +srctrigger=doc/${pkgname}.texi + +# clear some things potentially inherited from environment. +LC_ALL=C +export LC_ALL +srcdir= +prefix=/usr/local +exec_prefix='$(prefix)' +bindir='$(exec_prefix)/bin' +datarootdir='$(prefix)/share' +infodir='$(datarootdir)/info' +mandir='$(datarootdir)/man' +CXX=g++ +CPPFLAGS= +CXXFLAGS='-Wall -W -O2' +LDFLAGS= + +# checking whether we are using GNU C++. 
+/bin/sh -c "${CXX} --version" > /dev/null 2>&1 || { CXX=c++ ; CXXFLAGS=-O2 ; } + +# Loop over all args +args= +no_create= +while [ $# != 0 ] ; do + + # Get the first arg, and shuffle + option=$1 ; arg2=no + shift + + # Add the argument quoted to args + if [ -z "${args}" ] ; then args="\"${option}\"" + else args="${args} \"${option}\"" ; fi + + # Split out the argument for options that take them + case ${option} in + *=*) optarg=`echo "${option}" | sed -e 's,^[^=]*=,,;s,/$,,'` ;; + esac + + # Process the options + case ${option} in + --help | -h) + echo "Usage: $0 [OPTION]... [VAR=VALUE]..." + echo + echo "To assign makefile variables (e.g., CXX, CXXFLAGS...), specify them as" + echo "arguments to configure in the form VAR=VALUE." + echo + echo "Options and variables: [defaults in brackets]" + echo " -h, --help display this help and exit" + echo " -V, --version output version information and exit" + echo " --srcdir=DIR find the sources in DIR [. or ..]" + echo " --prefix=DIR install into DIR [${prefix}]" + echo " --exec-prefix=DIR base directory for arch-dependent files [${exec_prefix}]" + echo " --bindir=DIR user executables directory [${bindir}]" + echo " --datarootdir=DIR base directory for doc and data [${datarootdir}]" + echo " --infodir=DIR info files directory [${infodir}]" + echo " --mandir=DIR man pages directory [${mandir}]" + echo " CXX=COMPILER C++ compiler to use [${CXX}]" + echo " CPPFLAGS=OPTIONS command line options for the preprocessor [${CPPFLAGS}]" + echo " CXXFLAGS=OPTIONS command line options for the C++ compiler [${CXXFLAGS}]" + echo " CXXFLAGS+=OPTIONS append options to the current value of CXXFLAGS" + echo " LDFLAGS=OPTIONS command line options for the linker [${LDFLAGS}]" + echo + exit 0 ;; + --version | -V) + echo "Configure script for ${pkgname} version ${pkgversion}" + exit 0 ;; + --srcdir) srcdir=$1 ; arg2=yes ;; + --prefix) prefix=$1 ; arg2=yes ;; + --exec-prefix) exec_prefix=$1 ; arg2=yes ;; + --bindir) bindir=$1 ; arg2=yes ;; + 
--datarootdir) datarootdir=$1 ; arg2=yes ;; + --infodir) infodir=$1 ; arg2=yes ;; + --mandir) mandir=$1 ; arg2=yes ;; + + --srcdir=*) srcdir=${optarg} ;; + --prefix=*) prefix=${optarg} ;; + --exec-prefix=*) exec_prefix=${optarg} ;; + --bindir=*) bindir=${optarg} ;; + --datarootdir=*) datarootdir=${optarg} ;; + --infodir=*) infodir=${optarg} ;; + --mandir=*) mandir=${optarg} ;; + --no-create) no_create=yes ;; + + CXX=*) CXX=${optarg} ;; + CPPFLAGS=*) CPPFLAGS=${optarg} ;; + CXXFLAGS=*) CXXFLAGS=${optarg} ;; + CXXFLAGS+=*) CXXFLAGS="${CXXFLAGS} ${optarg}" ;; + LDFLAGS=*) LDFLAGS=${optarg} ;; + + --*) + echo "configure: WARNING: unrecognized option: '${option}'" 1>&2 ;; + *=* | *-*-*) ;; + *) + echo "configure: unrecognized option: '${option}'" 1>&2 + echo "Try 'configure --help' for more information." 1>&2 + exit 1 ;; + esac + + # Check if the option took a separate argument + if [ "${arg2}" = yes ] ; then + if [ $# != 0 ] ; then args="${args} \"$1\"" ; shift + else echo "configure: Missing argument to '${option}'" 1>&2 + exit 1 + fi + fi +done + +# Find the source files, if location was not specified. +srcdirtext= +if [ -z "${srcdir}" ] ; then + srcdirtext="or . or .." ; srcdir=. + if [ ! -r "${srcdir}/${srctrigger}" ] ; then srcdir=.. ; fi + if [ ! -r "${srcdir}/${srctrigger}" ] ; then + ## the sed command below emulates the dirname command + srcdir=`echo "$0" | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + fi +fi + +if [ ! -r "${srcdir}/${srctrigger}" ] ; then + echo "configure: Can't find sources in ${srcdir} ${srcdirtext}" 1>&2 + echo "configure: (At least ${srctrigger} is missing)." 1>&2 + exit 1 +fi + +# Set srcdir to . if that's what it is. +if [ "`pwd`" = "`cd "${srcdir}" ; pwd`" ] ; then srcdir=. ; fi + +echo +if [ -z "${no_create}" ] ; then + echo "creating config.status" + rm -f config.status + cat > config.status << EOF +#! /bin/sh +# This file was generated automatically by configure. Don't edit. +# Run this file to recreate the current configuration. 
+# +# This script is free software: you have unlimited permission +# to copy, distribute, and modify it. + +exec /bin/sh $0 ${args} --no-create +EOF + chmod +x config.status +fi + +echo "creating Makefile" +echo "VPATH = ${srcdir}" +echo "prefix = ${prefix}" +echo "exec_prefix = ${exec_prefix}" +echo "bindir = ${bindir}" +echo "datarootdir = ${datarootdir}" +echo "infodir = ${infodir}" +echo "mandir = ${mandir}" +echo "CXX = ${CXX}" +echo "CPPFLAGS = ${CPPFLAGS}" +echo "CXXFLAGS = ${CXXFLAGS}" +echo "LDFLAGS = ${LDFLAGS}" +rm -f Makefile +cat > Makefile << EOF +# Makefile for Lziprecover - Data recovery tool for the lzip format +# Copyright (C) 2009-2022 Antonio Diaz Diaz. +# This file was generated automatically by configure. Don't edit. +# +# This Makefile is free software: you have unlimited permission +# to copy, distribute, and modify it. + +pkgname = ${pkgname} +pkgversion = ${pkgversion} +progname = ${progname} +VPATH = ${srcdir} +prefix = ${prefix} +exec_prefix = ${exec_prefix} +bindir = ${bindir} +datarootdir = ${datarootdir} +infodir = ${infodir} +mandir = ${mandir} +CXX = ${CXX} +CPPFLAGS = ${CPPFLAGS} +CXXFLAGS = ${CXXFLAGS} +LDFLAGS = ${LDFLAGS} +EOF +cat "${srcdir}/Makefile.in" >> Makefile + +echo "OK. Now you can run make." diff --git a/decoder.cc b/decoder.cc new file mode 100644 index 0000000..345d02b --- /dev/null +++ b/decoder.cc @@ -0,0 +1,300 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> + +#include "lzip.h" +#include "decoder.h" + + +const CRC32 crc32; + + +/* Return the number of bytes really read. + If (value returned < size) and (errno == 0), means EOF was reached. +*/ +long long readblock( const int fd, uint8_t * const buf, const long long size ) + { + long long sz = 0; + errno = 0; + while( sz < size ) + { + const int n = read( fd, buf + sz, std::min( 1LL << 20, size - sz ) ); + if( n > 0 ) sz += n; + else if( n == 0 ) break; // EOF + else if( errno != EINTR ) break; + errno = 0; + } + return sz; + } + + +/* Return the number of bytes really written. + If (value returned < size), it is always an error. +*/ +long long writeblock( const int fd, const uint8_t * const buf, + const long long size ) + { + long long sz = 0; + errno = 0; + while( sz < size ) + { + const int n = write( fd, buf + sz, std::min( 1LL << 20, size - sz ) ); + if( n > 0 ) sz += n; + else if( n < 0 && errno != EINTR ) break; + errno = 0; + } + return sz; + } + + +bool Range_decoder::read_block() + { + if( !at_stream_end ) + { + stream_pos = readblock( infd, buffer, buffer_size ); + if( stream_pos != buffer_size && errno ) throw Error( "Read error" ); + at_stream_end = ( stream_pos < buffer_size ); + partial_member_pos += pos; + pos = 0; + show_dprogress(); + } + return pos < stream_pos; + } + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const int size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + if( outfd >= 0 ) + { + const unsigned long long sp = stream_position(); + const long long i = positive_diff( outskip, sp ); + const long long s = + std::min( positive_diff( 
outend, sp ), (unsigned long long)size ) - i; + if( s > 0 && writeblock( outfd, buffer + stream_pos + i, s ) != s ) + throw Error( "Write error" ); + } + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const + { + Lzip_trailer trailer; + int size = rdec.read_data( trailer.data, Lzip_trailer::size ); + const unsigned long long data_size = data_position(); + const unsigned long long member_size = rdec.member_position(); + bool error = false; + + if( size < Lzip_trailer::size ) + { + error = true; + if( verbosity >= 0 ) + { + pp(); + std::fprintf( stderr, "Trailer truncated at trailer position %d;" + " some checks may fail.\n", size ); + } + while( size < Lzip_trailer::size ) trailer.data[size++] = 0; + } + + const unsigned td_crc = trailer.data_crc(); + if( td_crc != crc() ) + { + error = true; + if( verbosity >= 0 ) + { + pp(); + std::fprintf( stderr, "CRC mismatch; stored %08X, computed %08X\n", + td_crc, crc() ); + } + } + const unsigned long long td_size = trailer.data_size(); + if( td_size != data_size ) + { + error = true; + if( verbosity >= 0 ) + { + pp(); + std::fprintf( stderr, "Data size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + td_size, td_size, data_size, data_size ); + } + } + const unsigned long long tm_size = trailer.member_size(); + if( tm_size != member_size ) + { + error = true; + if( verbosity >= 0 ) + { + pp(); + std::fprintf( stderr, "Member size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + tm_size, tm_size, member_size, member_size ); + } + } + if( error ) return false; + if( verbosity >= 2 ) + { + if( verbosity >= 4 ) show_header( dictionary_size ); + if( data_size == 0 || member_size == 0 ) + std::fputs( "no data compressed. ", stderr ); + else + std::fprintf( stderr, "%6.3f:1, %5.2f%% ratio, %5.2f%% saved. 
", + (double)data_size / member_size, + ( 100.0 * member_size ) / data_size, + 100.0 - ( ( 100.0 * member_size ) / data_size ) ); + if( verbosity >= 4 ) std::fprintf( stderr, "CRC %08X, ", td_crc ); + if( verbosity >= 3 ) + std::fprintf( stderr, "%9llu out, %8llu in. ", data_size, member_size ); + } + if( rdec.get_code() != 0 && verbosity >= 1 ) + { // corruption in the last 4 bytes of the EOS marker + pp(); + std::fprintf( stderr, "Range decoder final code is %08X\n", rdec.get_code() ); + } + return true; + } + + +/* Return value: 0 = OK, 1 = decoder error, 2 = unexpected EOF, + 3 = trailer error, 4 = unknown marker found. */ +int LZ_decoder::decode_member( const Pretty_print & pp ) + { + Bit_model bm_literal[1<<literal_context_bits][0x300]; + Bit_model bm_match[State::states][pos_states]; + Bit_model bm_rep[State::states]; + Bit_model bm_rep0[State::states]; + Bit_model bm_rep1[State::states]; + Bit_model bm_rep2[State::states]; + Bit_model bm_len[State::states][pos_states]; + Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; + Bit_model bm_align[dis_align_size]; + Len_model match_len_model; + Len_model rep_len_model; + unsigned rep0 = 0; // rep[0-3] latest four distances + unsigned rep1 = 0; // used for efficient coding of + unsigned rep2 = 0; // repeated distances + unsigned rep3 = 0; + State state; + + rdec.load(); + while( !rdec.finished() ) + { + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + // literal byte + Bit_model * const bm = bm_literal[get_lit_state(peek_prev())]; + if( state.is_char_set_char() ) + put_byte( rdec.decode_tree8( bm ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + continue; + } + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( 
bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + len = rdec.decode_len( match_len_model, pos_state ); + unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) + { + const unsigned dis_slot = distance; + const int direct_bits = ( dis_slot >> 1 ) - 1; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + distance += rdec.decode_tree_reversed( + bm_dis + ( distance - dis_slot ), direct_bits ); + else + { + distance += + rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + distance += rdec.decode_tree_reversed4( bm_align ); + if( distance == 0xFFFFFFFFU ) // marker found + { + rdec.normalize(); + flush_data(); + if( len == min_match_len ) // End Of Stream marker + { + if( verify_trailer( pp ) ) return 0; else return 3; + } + if( len == min_match_len + 1 ) // Sync Flush marker + { + rdec.load(); continue; + } + if( verbosity >= 0 ) + { + pp(); + std::fprintf( stderr, "Unsupported marker code '%d'\n", len ); + } + return 4; + } + } + } + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return 1; } + } + copy_block( rep0, len ); + } + flush_data(); + return 2; + } diff --git a/decoder.h b/decoder.h new file mode 100644 index 0000000..5b06b25 --- /dev/null +++ b/decoder.h @@ -0,0 +1,383 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +class Range_decoder + { + enum { buffer_size = 16384 }; + unsigned long long partial_member_pos; + uint8_t * const buffer; // input buffer + int pos; // current pos in buffer + int stream_pos; // when reached, a new block must be read + uint32_t code; + uint32_t range; + const int infd; // input file descriptor + bool at_stream_end; + + bool read_block(); + + Range_decoder( const Range_decoder & ); // declared as private + void operator=( const Range_decoder & ); // declared as private + +public: + explicit Range_decoder( const int ifd ) + : + partial_member_pos( 0 ), + buffer( new uint8_t[buffer_size] ), + pos( 0 ), + stream_pos( 0 ), + code( 0 ), + range( 0xFFFFFFFFU ), + infd( ifd ), + at_stream_end( false ) + {} + + ~Range_decoder() { delete[] buffer; } + + unsigned get_code() const { return code; } + bool finished() { return pos >= stream_pos && !read_block(); } + + unsigned long long member_position() const + { return partial_member_pos + pos; } + + void reset_member_position() + { partial_member_pos = 0; partial_member_pos -= pos; } + + uint8_t get_byte() + { + // 0xFF avoids decoder error if member is truncated at EOS marker + if( finished() ) return 0xFF; + return buffer[pos++]; + } + + int read_data( uint8_t * const outbuf, const int size ) + { + int sz = 0; + while( sz < size && !finished() ) + { + const int rd = std::min( 
size - sz, stream_pos - pos ); + std::memcpy( outbuf + sz, buffer + pos, rd ); + pos += rd; + sz += rd; + } + return sz; + } + + /* if ignore_errors, stop reading before the first wrong byte, so that + unreading the header is not required to sync to next member */ + int read_header_carefully( Lzip_header & header, const bool ignore_errors ) + { + int sz = 0; + while( sz < Lzip_header::size && !finished() ) + { + header.data[sz] = buffer[pos]; + if( ignore_errors && + ( ( sz < 4 && header.data[sz] != lzip_magic[sz] ) || + ( sz == 4 && !header.verify_version() ) || + ( sz == 5 && !isvalid_ds( header.dictionary_size() ) ) ) ) break; + ++pos; ++sz; + } + return sz; + } + + bool find_header( Lzip_header & header ) + { + while( !finished() ) + { + if( buffer[pos] != lzip_magic[0] ) { ++pos; continue; } + reset_member_position(); + Lzip_header h; + if( read_header_carefully( h, true ) == Lzip_header::size ) + { header = h; return true; } + } + return false; + } + + void load() + { + code = 0; + for( int i = 0; i < 5; ++i ) code = ( code << 8 ) | get_byte(); + range = 0xFFFFFFFFU; + code &= range; // make sure that first byte is discarded + } + + void normalize() + { + if( range <= 0x00FFFFFFU ) + { range <<= 8; code = ( code << 8 ) | get_byte(); } + } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + normalize(); + range >>= 1; +// symbol <<= 1; +// if( code >= range ) { code -= range; symbol |= 1; } + const bool bit = ( code >= range ); + symbol <<= 1; symbol += bit; + code -= range & ( 0U - bit ); + } + return symbol; + } + + unsigned decode_bit( Bit_model & bm ) + { + normalize(); + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + return 0; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + return 
1; + } + } + + void decode_symbol_bit( Bit_model & bm, unsigned & symbol ) + { + normalize(); + symbol <<= 1; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol |= 1; + } + } + + void decode_symbol_bit_reversed( Bit_model & bm, unsigned & model, + unsigned & symbol, const int i ) + { + normalize(); + model <<= 1; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + model |= 1; + symbol |= 1 << i; + } + } + + unsigned decode_tree6( Bit_model bm[] ) + { + unsigned symbol = 1; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return symbol & 0x3F; + } + + unsigned decode_tree8( Bit_model bm[] ) + { + unsigned symbol = 1; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return symbol & 0xFF; + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned model = 1; + unsigned symbol = 0; + for( int i = 0; i < num_bits; ++i ) + decode_symbol_bit_reversed( bm[model], model, symbol, i ); + return symbol; + } + + unsigned 
decode_tree_reversed4( Bit_model bm[] ) + { + unsigned model = 1; + unsigned symbol = 0; + decode_symbol_bit_reversed( bm[model], model, symbol, 0 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 1 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 2 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 3 ); + return symbol; + } + + unsigned decode_matched( Bit_model bm[], unsigned match_byte ) + { + Bit_model * const bm1 = bm + 0x100; + unsigned symbol = 1; + while( symbol < 0x100 ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & 0x100; + const bool bit = decode_bit( bm1[symbol+match_bit] ); + symbol <<= 1; symbol |= bit; + if( match_bit >> 8 != bit ) + { + while( symbol < 0x100 ) decode_symbol_bit( bm[symbol], symbol ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + Bit_model * bm; + unsigned mask, offset, symbol = 1; + + if( decode_bit( lm.choice1 ) == 0 ) + { bm = lm.bm_low[pos_state]; mask = 7; offset = 0; goto len3; } + if( decode_bit( lm.choice2 ) == 0 ) + { bm = lm.bm_mid[pos_state]; mask = 7; offset = len_low_symbols; goto len3; } + bm = lm.bm_high; mask = 0xFF; offset = len_low_symbols + len_mid_symbols; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); +len3: + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return ( symbol & mask ) + min_match_len + offset; + } + }; + + +class LZ_decoder + { + const unsigned long long outskip; + const unsigned long long outend; + unsigned long long partial_data_pos; + Range_decoder & rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to file + uint32_t crc_; 
+ const int outfd; // output file descriptor + bool pos_wrapped; + + unsigned long long stream_position() const + { return partial_data_pos + stream_pos; } + void flush_data(); + bool verify_trailer( const Pretty_print & pp ) const; + + uint8_t peek_prev() const + { return buffer[((pos > 0) ? pos : dictionary_size)-1]; } + + uint8_t peek( const unsigned distance ) const + { + const unsigned i = ( ( pos > distance ) ? 0 : dictionary_size ) + + pos - distance - 1; + return buffer[i]; + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + + void copy_block( const unsigned distance, unsigned len ) + { + unsigned lpos = pos, i = lpos - distance - 1; + bool fast, fast2; + if( lpos > distance ) + { + fast = ( len < dictionary_size - lpos ); + fast2 = ( fast && len <= lpos - i ); + } + else + { + i += dictionary_size; + fast = ( len < dictionary_size - i ); // (i == pos) may happen + fast2 = ( fast && len <= i - lpos ); + } + if( fast ) // no wrap + { + pos += len; + if( fast2 ) // no wrap, no overlap + std::memcpy( buffer + lpos, buffer + i, len ); + else + for( ; len > 0; --len ) buffer[lpos++] = buffer[i++]; + } + else for( ; len > 0; --len ) + { + buffer[pos] = buffer[i]; + if( ++pos >= dictionary_size ) flush_data(); + if( ++i >= dictionary_size ) i = 0; + } + } + + LZ_decoder( const LZ_decoder & ); // declared as private + void operator=( const LZ_decoder & ); // declared as private + +public: + LZ_decoder( Range_decoder & rde, const unsigned dict_size, const int ofd, + const unsigned long long oskip = 0, + const unsigned long long oend = -1ULL ) + : + outskip( oskip ), + outend( oend ), + partial_data_pos( 0 ), + rdec( rde ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + outfd( ofd ), + pos_wrapped( false ) + // prev_byte of first byte; also for peek( 0 ) on corrupt file + { buffer[dictionary_size-1] = 0; } + + 
~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + int decode_member( const Pretty_print & pp ); + }; diff --git a/doc/lziprecover.1 b/doc/lziprecover.1 new file mode 100644 index 0000000..e05a366 --- /dev/null +++ b/doc/lziprecover.1 @@ -0,0 +1,143 @@ +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.16. +.TH LZIPRECOVER "1" "January 2022" "lziprecover 1.23" "User Commands" +.SH NAME +lziprecover \- recovers data from damaged lzip files +.SH SYNOPSIS +.B lziprecover +[\fI\,options\/\fR] [\fI\,files\/\fR] +.SH DESCRIPTION +Lziprecover is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files (up to one single\-byte error per member), produce a correct file by +merging the good parts of two or more damaged copies, reproduce a missing +(zeroed) sector using a reference file, extract data from damaged files, +decompress files, and test integrity of files. +.PP +With the help of lziprecover, losing an entire archive just because of a +corrupt byte near the beginning is a thing of the past. +.PP +Lziprecover can remove the damaged members from multimember files, for +example multimember tar.lz archives. +.PP +Lziprecover provides random access to the data in multimember files; it only +decompresses the members containing the desired data. +.PP +Lziprecover facilitates the management of metadata stored as trailing data +in lzip files. +.PP +Lziprecover is not a replacement for regular backups, but a last line of +defense for the case where the backups are also damaged. 
+.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +display this help and exit +.TP +\fB\-V\fR, \fB\-\-version\fR +output version information and exit +.TP +\fB\-a\fR, \fB\-\-trailing\-error\fR +exit with error status if trailing data +.TP +\fB\-A\fR, \fB\-\-alone\-to\-lz\fR +convert lzma\-alone files to lzip format +.TP +\fB\-c\fR, \fB\-\-stdout\fR +write to standard output, keep input files +.TP +\fB\-d\fR, \fB\-\-decompress\fR +decompress +.TP +\fB\-D\fR, \fB\-\-range\-decompress=\fR<n\-m> +decompress a range of bytes to stdout +.TP +\fB\-e\fR, \fB\-\-reproduce\fR +try to reproduce a zeroed sector in file +.TP +\fB\-\-lzip\-level\fR=\fI\,N\/\fR|a|m[N] +reproduce one level, all, or match length +.TP +\fB\-\-lzip\-name=\fR<name> +name of lzip executable for \fB\-\-reproduce\fR +.TP +\fB\-\-reference\-file=\fR<file> +reference file for \fB\-\-reproduce\fR +.TP +\fB\-f\fR, \fB\-\-force\fR +overwrite existing output files +.TP +\fB\-i\fR, \fB\-\-ignore\-errors\fR +ignore some errors in \fB\-d\fR, \fB\-D\fR, \fB\-l\fR, \fB\-t\fR, \fB\-\-dump\fR +.TP +\fB\-k\fR, \fB\-\-keep\fR +keep (don't delete) input files +.TP +\fB\-l\fR, \fB\-\-list\fR +print (un)compressed file sizes +.TP +\fB\-m\fR, \fB\-\-merge\fR +correct errors in file using several copies +.TP +\fB\-o\fR, \fB\-\-output=\fR<file> +place the output into <file> +.TP +\fB\-q\fR, \fB\-\-quiet\fR +suppress all messages +.TP +\fB\-R\fR, \fB\-\-repair\fR +try to repair a small error in file +.TP +\fB\-s\fR, \fB\-\-split\fR +split multimember file in single\-member files +.TP +\fB\-t\fR, \fB\-\-test\fR +test compressed file integrity +.TP +\fB\-v\fR, \fB\-\-verbose\fR +be verbose (a 2nd \fB\-v\fR gives more) +.TP +\fB\-\-loose\-trailing\fR +allow trailing data seeming corrupt header +.TP +\fB\-\-dump=\fR<list>:d:t +dump members listed/damaged, tdata to stdout +.TP +\fB\-\-remove=\fR<list>:d:t +remove members, tdata from files in place +.TP +\fB\-\-strip=\fR<list>:d:t +copy files to stdout stripping members given +.PP +If 
no file names are given, or if a file is '\-', lziprecover decompresses +from standard input to standard output. +Numbers may be followed by a multiplier: k = kB = 10^3 = 1000, +Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc... +.PP +To extract all the files from archive 'foo.tar.lz', use the commands +\&'tar \fB\-xf\fR foo.tar.lz' or 'lziprecover \fB\-cd\fR foo.tar.lz | tar \fB\-xf\fR \-'. +.PP +Exit status: 0 for a normal exit, 1 for environmental problems (file +not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or +invalid input file, 3 for an internal consistency error (e.g., bug) which +caused lziprecover to panic. +.SH "REPORTING BUGS" +Report bugs to lzip\-bug@nongnu.org +.br +Lziprecover home page: http://www.nongnu.org/lzip/lziprecover.html +.SH COPYRIGHT +Copyright \(co 2022 Antonio Diaz Diaz. +License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html> +.br +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. +.SH "SEE ALSO" +The full documentation for +.B lziprecover +is maintained as a Texinfo manual. If the +.B info +and +.B lziprecover +programs are properly installed at your site, the command +.IP +.B info lziprecover +.PP +should give you access to the complete manual. diff --git a/doc/lziprecover.info b/doc/lziprecover.info new file mode 100644 index 0000000..112f65b --- /dev/null +++ b/doc/lziprecover.info @@ -0,0 +1,1499 @@ +This is lziprecover.info, produced by makeinfo version 4.13+ from +lziprecover.texi. + +INFO-DIR-SECTION Compression +START-INFO-DIR-ENTRY +* Lziprecover: (lziprecover). Data recovery tool for the lzip format +END-INFO-DIR-ENTRY + + +File: lziprecover.info, Node: Top, Next: Introduction, Up: (dir) + +Lziprecover Manual +****************** + +This manual is for Lziprecover (version 1.23, 21 January 2022). 
+ +* Menu: + +* Introduction:: Purpose and features of lziprecover +* Invoking lziprecover:: Command line interface +* Data safety:: Protecting data from accidental loss +* Repairing one byte:: Fixing bit flips and similar errors +* Merging files:: Fixing several damaged copies +* Reproducing one sector:: Fixing a missing (zeroed) sector +* Tarlz:: Options supporting the tar.lz format +* File names:: Names of the files produced by lziprecover +* File format:: Detailed format of the compressed file +* Trailing data:: Extra data appended to the file +* Examples:: A small tutorial with examples +* Unzcrash:: Testing the robustness of decompressors +* Problems:: Reporting bugs +* Concept index:: Index of concepts + + + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This manual is free documentation: you have unlimited permission to copy, +distribute, and modify it. + + +File: lziprecover.info, Node: Introduction, Next: Invoking lziprecover, Prev: Top, Up: Top + +1 Introduction +************** + +Lziprecover is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files (up to one single-byte error per member), produce a correct file by +merging the good parts of two or more damaged copies, reproduce a missing +(zeroed) sector using a reference file, extract data from damaged files, +decompress files, and test integrity of files. + + Lziprecover can remove the damaged members from multimember files, for +example multimember tar.lz archives. + + Lziprecover provides random access to the data in multimember files; it +only decompresses the members containing the desired data. + + Lziprecover facilitates the management of metadata stored as trailing +data in lzip files. + + Lziprecover is not a replacement for regular backups, but a last line of +defense for the case where the backups are also damaged. 
+ + The lzip file format is designed for data sharing and long-term +archiving, taking into account both data integrity and decoder availability: + + * The lzip format provides very safe integrity checking and some data + recovery means. The program lziprecover can repair bit flip errors + (one of the most common forms of data corruption) in lzip files, and + provides data recovery capabilities, including error-checked merging + of damaged copies of a file. *Note Data safety::. + + * The lzip format is as simple as possible (but not simpler). The lzip + manual provides the source code of a simple decompressor along with a + detailed explanation of how it works, so that with the only help of the + lzip manual it would be possible for a digital archaeologist to extract + the data from a lzip file long after quantum computers eventually + render LZMA obsolete. + + * Additionally the lzip reference implementation is copylefted, which + guarantees that it will remain free forever. + + A nice feature of the lzip format is that a corrupt byte is easier to +repair the nearer it is from the beginning of the file. Therefore, with the +help of lziprecover, losing an entire archive just because of a corrupt +byte near the beginning is a thing of the past. + + Compression may be good for long-term archiving. For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. This is specially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. + + Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family: lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. 
+ + If the cause of file corruption is a damaged medium, the combination +GNU ddrescue + lziprecover is the recommended option for recovering data +from damaged lzip files. *Note ddrescue-example::, and *note +ddrescue-example2::, for examples. + + If a file is too damaged for lziprecover to repair it, all the +recoverable data in all members of the file can be extracted with the +following command (the resulting file may contain errors and some garbage +data may be produced at the end of each damaged member): + + lziprecover -cd -i file.lz > file + + When recovering data, lziprecover takes as arguments the names of the +damaged files and writes zero or more recovered files depending on the +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. + + When decompressing or testing file integrity, lziprecover behaves like +lzip or lunzip. + + LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. + + +File: lziprecover.info, Node: Invoking lziprecover, Next: Data safety, Prev: Introduction, Up: Top + +2 Invoking lziprecover +********************** + +The format for running lziprecover is: + + lziprecover [OPTIONS] [FILES] + +When decompressing or testing, a hyphen '-' used as a FILE argument means +standard input. It can be mixed with other FILES and is read just once, the +first time it appears in the command line. If no file names are specified, +lziprecover decompresses from standard input to standard output. + + lziprecover supports the following options: *Note Argument syntax: +(arg_parser)Argument syntax. + +'-h' +'--help' + Print an informative help message describing the options and exit. + +'-V' +'--version' + Print the version number of lziprecover on the standard output and + exit. This version number should be included in all bug reports. 
+ +'-a' +'--trailing-error' + Exit with error status 2 if any remaining input is detected after + decompressing the last member. Such remaining input is usually trailing + garbage that can be safely ignored. *Note concat-example::. + +'-A' +'--alone-to-lz' + Convert lzma-alone files to lzip format without recompressing, just + adding a lzip header and trailer. The conversion minimizes the + dictionary size of the resulting file (and therefore the amount of + memory required to decompress it). Only streamed files with default + LZMA properties can be converted; non-streamed lzma-alone files lack + the "End Of Stream" marker required in lzip files. + + The name of the converted lzip file is derived from that of the + original lzma-alone file as follows: + + filename.lzma becomes filename.lz + filename.tlz becomes filename.tar.lz + anyothername becomes anyothername.lz + +'-c' +'--stdout' + Write decompressed data to standard output; keep input files + unchanged. This option (or '-o') is needed when reading from a named + pipe (fifo) or from a device. Use it also to recover as much of the + decompressed data as possible when decompressing a corrupt file. '-c' + overrides '-o'. '-c' has no effect when merging, removing members, + repairing, reproducing, splitting, testing or listing. + +'-d' +'--decompress' + Decompress the files specified. If a file does not exist, can't be + opened, or the destination file already exists and '--force' has not + been specified, lziprecover continues decompressing the rest of the + files and exits with error status 1. If a file fails to decompress, or + is a terminal, lziprecover exits immediately with error status 2 + without decompressing the rest of the files. A terminal is considered + an uncompressed file, and therefore invalid. + +'-D RANGE' +'--range-decompress=RANGE' + Decompress only a range of bytes starting at decompressed byte position + BEGIN and up to byte position END - 1. Byte positions start at 0. 
This + option provides random access to the data in multimember files; it + only decompresses the members containing the desired data. In order to + guarantee the correctness of the data produced, all members containing + any part of the desired data are decompressed and their integrity is + verified. + + Four formats of RANGE are recognized, 'BEGIN', 'BEGIN-END', + 'BEGIN,SIZE', and ',SIZE'. If only BEGIN is specified, END is taken as + the end of the file. If only SIZE is specified, BEGIN is taken as the + beginning of the file. The bytes produced are sent to standard output + unless the option '--output' is used. + +'-e' +'--reproduce' + Try to recover a missing (zeroed) sector in FILE using a reference + file and the same version of lzip that created FILE. If successful, a + repaired copy is written to the file 'FILE_fixed.lz'. FILE is not + modified at all. The exit status is 0 if the member containing the + zeroed sector could be repaired, 2 otherwise. Note that + 'FILE_fixed.lz' may still contain errors in the members following the + one repaired. *Note Reproducing one sector::, for a complete + description of the reproduce mode. + +'--lzip-level=DIGIT|a|m[LENGTH]' + Try only the given compression level or match length limit when + reproducing a zeroed sector. '--lzip-level=a' tries all the + compression levels (0 to 9), while '--lzip-level=m' tries all the + match length limits (5 to 273). + +'--lzip-name=NAME' + Set the name of the lzip executable used by '--reproduce'. If + '--lzip-name' is not specified, 'lzip' is used. + +'--reference-file=FILE' + Set the reference file used by '--reproduce'. It must contain the + uncompressed data corresponding to the missing compressed data of the + zeroed sector, plus some context data before and after them. + +'-f' +'--force' + Force overwrite of output files. 
+ +'-i' +'--ignore-errors' + Make '--decompress', '--test', and '--range-decompress' ignore format + and data errors and continue decompressing the remaining members in + the file; keep input files unchanged. For example, the commands + 'lziprecover -cd -i file.lz > file' or + 'lziprecover -D0 -i file.lz > file' decompress all the recoverable + data in all members of 'file.lz' without having to split it first. The + '-cd -i' method resyncs to the next member header after each error, + and is immune to some format errors that make '-D0 -i' fail. The range + decompressed may be smaller than the range requested, because of the + errors. The exit status is set to 0 unless other errors are found (I/O + errors, for example). + + Make '--list', '--dump', '--remove', and '--strip' ignore format + errors. The sizes of the members with errors (specially the last) may + be wrong. + +'-k' +'--keep' + Keep (don't delete) input files during decompression. + +'-l' +'--list' + Print the uncompressed size, compressed size, and percentage saved of + the files specified. Trailing data are ignored. The values produced + are correct even for multimember files. If more than one file is + given, a final line containing the cumulative sizes is printed. With + '-v', the dictionary size, the number of members in the file, and the + amount of trailing data (if any) are also printed. With '-vv', the + positions and sizes of each member in multimember files are also + printed. With '-i', format errors are ignored, and with '-ivv', gaps + between members are shown. The member numbers shown coincide with the + file numbers produced by '--split'. + + If any file is damaged, does not exist, can't be opened, or is not + regular, the final exit status will be > 0. '-lq' can be used to verify + quickly (without decompressing) the structural integrity of the files + specified. (Use '--test' to verify the data integrity). 
'-alq' + additionally verifies that none of the files specified contain + trailing data. + +'-m' +'--merge' + Try to produce a correct file by merging the good parts of two or more + damaged copies. If successful, a repaired copy is written to the file + 'FILE_fixed.lz'. The exit status is 0 if a correct file could be + produced, 2 otherwise. *Note Merging files::, for a complete + description of the merge mode. + +'-o FILE' +'--output=FILE' + Place the output into FILE instead of into 'FILE_fixed.lz'. If + splitting, the names of the files produced are in the form + 'rec01FILE', 'rec02FILE', etc. + + If decompressing, or converting lzma-alone files, and '-c' has not been + also specified, write the decompressed or converted output to FILE; + keep input files unchanged. This option (or '-c') is needed when + reading from a named pipe (fifo) or from a device. '-o -' is + equivalent to '-c'. '-o' has no effect when testing or listing. + +'-q' +'--quiet' + Quiet operation. Suppress all messages. + +'-R' +'--repair' + Try to repair a FILE with small errors (up to one single-byte error + per member). If successful, a repaired copy is written to the file + 'FILE_fixed.lz'. FILE is not modified at all. The exit status is 0 if + the file could be repaired, 2 otherwise. *Note Repairing one byte::, + for a complete description of the repair mode. + +'-s' +'--split' + Search for members in FILE and write each member in its own file. Gaps + between members are detected and each gap is saved in its own file. + Trailing data (if any) are saved alone in the last file. You can then + use 'lziprecover -t' to test the integrity of the resulting files, + decompress those which are undamaged, and try to repair or partially + decompress those which are damaged. Gaps may contain garbage or may be + members with corrupt headers or trailers. 
If other lziprecover + functions fail to work on a multimember FILE because of damage in + headers or trailers, try to split FILE and then work on each member + individually. + + The names of the files produced are in the form 'rec01FILE', + 'rec02FILE', etc, and are designed so that the use of wildcards in + subsequent processing, for example, + 'lziprecover -cd rec*FILE > recovered_data', processes the files in + the correct order. The number of digits used in the names varies + depending on the number of members in FILE. + +'-t' +'--test' + Check integrity of the files specified, but don't decompress them. This + really performs a trial decompression and throws away the result. Use + it together with '-v' to see information about the files. If a file + fails the test, does not exist, can't be opened, or is a terminal, + lziprecover continues checking the rest of the files. A final + diagnostic is shown at verbosity level 1 or higher if any file fails + the test when testing multiple files. + +'-v' +'--verbose' + Verbose mode. + When decompressing or testing, further -v's (up to 4) increase the + verbosity level, showing status, compression ratio, dictionary size, + trailer contents (CRC, data size, member size), and up to 6 bytes of + trailing data (if any) both in hexadecimal and as a string of printable + ASCII characters. + Two or more '-v' options show the progress of decompression. + In other modes, increasing verbosity levels show final status, progress + of operations, and extra information (for example, the failed areas). + +'--loose-trailing' + When decompressing, testing, or listing, allow trailing data whose + first bytes are so similar to the magic bytes of a lzip header that + they can be confused with a corrupt header. Use this option if a file + triggers a "corrupt header" error and the cause is not indeed a + corrupt header. 
+ +'--dump=[MEMBER_LIST][:damaged][:tdata]' + Dump the members listed, the damaged members (if any), or the trailing + data (if any) of one or more regular multimember files to standard + output, or to a file if the option '--output' is used. If more than + one file is given, the elements dumped from all files are concatenated. + If a file does not exist, can't be opened, or is not regular, + lziprecover continues processing the rest of the files. If the dump + fails in one file, lziprecover exits immediately without processing the + rest of the files. Only '--dump=tdata' can write to a terminal. + + The argument to '--dump' is a colon-separated list of the following + element specifiers; a member list (1,3-6), a reverse member list + (r1,3-6), and the strings "damaged" and "tdata" (which may be shortened + to 'd' and 't' respectively). A member list selects the members (or + gaps) listed, whose numbers coincide with those shown by '--list'. A + reverse member list selects the members listed counting from the last + member in the file (r1). Negated versions of both kinds of lists exist + (^1,3-6:r^1,3-6) which selects all the members except those in the + list. The strings "damaged" and "tdata" select the damaged members and + the trailing data respectively. If the same member is selected more + than once, for example by '1:r1' in a single-member file, it is dumped + just once. 
See the following examples: + + '--dump' argument Elements dumped + --------------------------------------------------------------------- + '1,3-6' members 1, 3, 4, 5, 6 + 'r1-3' last 3 members in file + '^13,15' all but 13th and 15th members in file + 'r^1' all but last member in file + 'damaged' all damaged members in file + 'tdata' trailing data + '1-5:r1:tdata' members 1 to 5, last member, trailing data + 'damaged:tdata' damaged members, trailing data + '3,12:damaged:tdata' members 3, 12, damaged members, trailing data + +'--remove=[MEMBER_LIST][:damaged][:tdata]' + Remove the members listed, the damaged members (if any), or the + trailing data (if any) from regular multimember files in place. The + date of each file is preserved if possible. If all members in a file + are selected to be removed, the file is left unchanged and the exit + status is set to 2. If a file does not exist, can't be opened, is not + regular, or is left unchanged, lziprecover continues processing the + rest of the files. In case of I/O error, lziprecover exits immediately + without processing the rest of the files. See '--dump' above for a + description of the argument. + + This option may be dangerous even if only the trailing data is being + removed because the file may be corrupt or the trailing data may + contain a forbidden combination of characters. *Note Trailing data::. + It is advisable to make a backup before attempting the removal. At + least verify that 'lzip -cd file.lz | wc -c' and the uncompressed size + shown by 'lzip -l file.lz' match before attempting the removal of + trailing data. + +'--strip=[MEMBER_LIST][:damaged][:tdata]' + Copy one or more regular multimember files to standard output (or to a + file if the option '--output' is used), stripping the members listed, + the damaged members (if any), or the trailing data (if any) from each + file. 
If all members in a file are selected to be stripped, the + trailing data (if any) are also stripped even if 'tdata' is not + specified. If more than one file is given, the files are concatenated. + In this case the trailing data are also stripped from all but the last + file even if 'tdata' is not specified. If a file does not exist, can't + be opened, or is not regular, lziprecover continues processing the + rest of the files. If a file fails to copy, lziprecover exits + immediately without processing the rest of the files. See '--dump' + above for a description of the argument. + + + Lziprecover also supports the following debug options (for experts): + +'-E RANGE[,SECTOR_SIZE]' +'--debug-reproduce=RANGE[,SECTOR_SIZE]' + Load the compressed FILE into memory, set all bytes in the positions + specified by RANGE to 0, and try to reproduce a correct compressed + file. *Note --reproduce::. *Note range-format::, for a description of + RANGE. If a SECTOR_SIZE is specified, set each sector to 0 in sequence + and try to reproduce the file, printing to standard output final + statistics of the number of sectors reproduced successfully. Exit with + nonzero status only in case of fatal error. + +'-M' +'--md5sum' + Print to standard output the MD5 digests of the input FILES one per + line in the same format produced by the 'md5sum' tool. Lziprecover + uses MD5 digests to verify the result of some operations. This option + allows the verification of lziprecover's implementation of the MD5 + algorithm. + +'-S[VALUE]' +'--nrep-stats[=VALUE]' + Compare the frequency of sequences of N repeated bytes of a given + VALUE in the compressed LZMA streams of the input FILES with the + frequency expected for random data (1 / 2^(8N)). If VALUE is not + specified, print the frequency of repeated sequences of all possible + byte values. Print cumulative data for all files followed by the name + of the first file with the longest sequence. 
+ +'-U 1|BSIZE' +'--unzcrash=1|BSIZE' + With argument '1', test 1-bit errors in the LZMA stream of the + compressed input FILE like the command + 'unzcrash -b1 -p7 -s-20 'lzip -t' FILE' but in memory, and therefore + much faster. *Note Unzcrash::. This option tests all the members + independently in a multimember file, skipping headers and trailers. If + a decompression succeeds, the decompressed output is compared with the + decompressed output of the original FILE using MD5 digests. FILE must + not contain errors and must decompress correctly for the comparisons to + work. + + With argument 'B', test zeroed sectors (blocks of bytes) in the LZMA + stream of the compressed input FILE like the command + 'unzcrash --block=SIZE -d1 -p7 -s-(SIZE+20) 'lzip -t' FILE' but in + memory, and therefore much faster. Testing and comparisons work just + like with the argument '1' explained above. + + By default '--unzcrash' only prints the interesting cases; CRC + mismatches, size mismatches, unsupported marker codes, unexpected EOFs, + apparently successful decompressions, and decoder errors detected + 50_000 or more bytes beyond the byte (or the start of the block) being + tested. At verbosity level 1 (-v) it also prints decoder errors + detected 10_000 or more bytes beyond the byte being tested. At + verbosity level 2 (-vv) it prints all cases for 1-bit errors or the + decoder errors detected beyond the end of the block for zeroed blocks. + +'-W POSITION,VALUE' +'--debug-decompress=POSITION,VALUE' + Load the compressed FILE into memory, set the byte at POSITION to + VALUE, and decompress the modified compressed data to standard output. + If the damaged member is decompressed fully (just fails with a CRC + mismatch), the members following it are also decompressed. 
+ +'-X[POSITION,VALUE]' +'--show-packets[=POSITION,VALUE]' + Load the compressed FILE into memory, optionally set the byte at + POSITION to VALUE, decompress the modified compressed data (discarding + the output), and print to standard output descriptions of the LZMA + packets being decoded. + +'-Y RANGE' +'--debug-delay=RANGE' + Load the compressed FILE into memory and then repeatedly decompress + it, increasing 256 times each byte of the subset of the compressed data + positions specified by RANGE, so as to test all possible one-byte + errors. For each decompression error find the error detection delay and + print to standard output the maximum delay. The error detection delay + is the difference between the position of the error and the position + where the decoder realized that the data contains an error. *Note + range-format::, for a description of RANGE. + +'-Z POSITION,VALUE' +'--debug-repair=POSITION,VALUE' + Load the compressed FILE into memory, set the byte at POSITION to + VALUE, and then try to repair the error. *Note --repair::. + + + Numbers given as arguments to options may be followed by a multiplier +and an optional 'B' for "byte". + + Table of SI and binary prefixes (unit multipliers): + +Prefix Value | Prefix Value +k kilobyte (10^3 = 1000) | Ki kibibyte (2^10 = 1024) +M megabyte (10^6) | Mi mebibyte (2^20) +G gigabyte (10^9) | Gi gibibyte (2^30) +T terabyte (10^12) | Ti tebibyte (2^40) +P petabyte (10^15) | Pi pebibyte (2^50) +E exabyte (10^18) | Ei exbibyte (2^60) +Z zettabyte (10^21) | Zi zebibyte (2^70) +Y yottabyte (10^24) | Yi yobibyte (2^80) + + + Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid +input file, 3 for an internal consistency error (e.g., bug) which caused +lziprecover to panic. 
+ + +File: lziprecover.info, Node: Data safety, Next: Repairing one byte, Prev: Invoking lziprecover, Up: Top + +3 Protecting data from accidental loss +************************************** + +It is a fact of life that sometimes data will become corrupt. Software has +errors. Hardware may misbehave or fail. RAM may be struck by a cosmic ray. +This is why a safe enough integrity checking is needed in compressed +formats, and the reason why a data recovery tool is sometimes needed. + + There are 3 main types of data corruption that may cause data loss: +single-byte errors, multibyte errors (generally affecting a whole sector in +a block device), and total device failure. + + Lziprecover protects natively against single-byte errors as long as file +integrity is checked frequently enough that a second single-byte error does +not develop in the same member before the first one is repaired. *Note +Repairing one byte::. + + Lziprecover also protects against multibyte errors if at least one backup +copy of the file is made (*note Merging files::), or if the error is a +zeroed sector and the uncompressed data corresponding to the zeroed sector +are available (*note Reproducing one sector::). If you can choose between +merging and reproducing, try merging first because it is usually faster, +easier to use, and has a high probability of success. + + Lziprecover can't help in case of device failure. The only remedy for +total device failure is storing backup copies in separate media. + + The extraordinary safety of the lzip format allows lziprecover to exploit +the redundance that occurs naturally when making compressed backups. +Lziprecover can recover data that would not be recoverable from files +compressed in other formats. 
Let's see two examples of how much better +lzip is compared with gzip and bzip2 with respect to data safety: + +* Menu: + +* Merging with a backup:: Recovering a file using a damaged backup +* Reproducing a mailbox:: Recovering new messages using an old backup + + +File: lziprecover.info, Node: Merging with a backup, Next: Reproducing a mailbox, Up: Data safety + +3.1 Recovering a file using a damaged backup +============================================ + +Let's suppose that you made a compressed backup of your valuable scientific +data and stored two copies on separate media. Years later you notice that +both copies are corrupt. + + If you compressed the data with gzip and both copies suffer any damage in +the data stream, even if it is just one altered bit, the original data can +only be recovered by an expert, if at all. + + If you used bzip2, and if the file is large enough to contain more than +one compressed data block (usually larger than 900 kB uncompressed), and if +no block is damaged in both files, then the data can be manually recovered +by splitting the files with bzip2recover, verifying every block, and then +copying the right blocks in the right order into another file. + + But if you used lzip, the data can be automatically recovered with +'lziprecover --merge' as long as the damaged areas don't overlap. + + Note that each error in a bzip2 file makes a whole block unusable, but +each error in a lzip file only affects the damaged bytes, making it +possible to recover a file with thousands of errors. + + +File: lziprecover.info, Node: Reproducing a mailbox, Prev: Merging with a backup, Up: Data safety + +3.2 Recovering new messages using an old backup +=============================================== + +Let's suppose that you make periodic backups of your email messages stored +in one or more mailboxes. (A mailbox is a file containing a possibly large +number of email messages). 
New messages are appended to the end of each +mailbox, therefore the initial part of two consecutive backups is identical +unless some messages have been changed or deleted in the meantime. The new +messages added to each backup are usually a small part of the whole mailbox. + ++========================================================+ +| Older backup containing some messages | ++========================================================+ ++========================================================+================+ +| Newer backup containing the messages above plus some | new messages | ++========================================================+================+ + + One day you discover that your mailbox has disappeared because you +deleted it inadvertently or because of a bug in your email reader. Not only +that. You need to recover a recent message, but the last backup you made of +the mailbox (the newer backup above) has lost the data corresponding to a +whole sector because of an I/O error in the part containing the old +messages. + + If you compressed the mailbox with gzip, usually none of the new messages +can be recovered even if they are intact because all the data beyond the +missing sector can't be decoded. + + If you used bzip2, and if the newer backup is large enough that the new +messages are in a different compressed data block than the one damaged +(usually larger than 900 kB uncompressed), then you can recover the new +messages manually with bzip2recover. If the backups are identical except for +the new messages appended, you may even recover the whole newer backup by +combining the good blocks from both backups. + + But if you used lzip, the whole newer backup can be automatically +recovered with 'lziprecover --reproduce' as long as the missing bytes can be +recovered from the older backup, even if other messages in the common part +have been changed or deleted. Mailboxes seem to be specially easy to +reproduce. 
The probability of reproducing a mailbox (*note +performance-of-reproduce::) is almost as high as that of merging two +identical backups (*note performance-of-merge::). + + +File: lziprecover.info, Node: Repairing one byte, Next: Merging files, Prev: Data safety, Up: Top + +4 Repairing one byte +******************** + +Lziprecover can repair perfectly most files with small errors (up to one +single-byte error per member), without the need of any extra redundance at +all. If the repair is successful, the repaired file will be identical +bit for bit to the original. This makes lzip files resistant to bit flip, +one of the most common forms of data corruption. + + The file is repaired in memory. Therefore, enough virtual memory +(RAM + swap) to contain the largest damaged member is required. + + The error may be located anywhere in the file except in the first 5 +bytes of each member header or in the 'Member size' field of the trailer +(last 8 bytes of each member). If the error is in the header it can be +easily repaired with a text editor like GNU Moe (*note File format::). If +the error is in the member size, it is enough to ignore the message about +'bad member size' when decompressing. + + Bit flip happens when one bit in the file is changed from 0 to 1 or vice +versa. It may be caused by bad RAM or even by natural radiation. I have +seen a case of bit flip in a file stored on a USB flash drive. + + One byte may seem small, but most file corruptions not produced by +transmission errors or I/O errors just affect one byte, or even one bit, of +the file. Also, unlike magnetic media, where errors usually affect a whole +sector, solid-state storage devices tend to produce single-byte errors, +making lzip the perfect format for data stored on such devices. + + Repairing a file can take some time. Small files or files with the error +located near the beginning can be repaired in a few seconds. 
But repairing +a large file compressed with a large dictionary size and with the error +located far from the beginning, may take hours. + + On the other hand, errors located near the beginning of the file cause +much more loss of data than errors located near the end. So lziprecover +repairs more efficiently the worst errors. + + +File: lziprecover.info, Node: Merging files, Next: Reproducing one sector, Prev: Repairing one byte, Up: Top + +5 Merging files +*************** + +If you have several copies of a file but all of them are too damaged to +repair them (*note Repairing one byte::), lziprecover can try to produce a +correct file by merging the good parts of the damaged copies. + + The merge may succeed even if some copies of the file have all the +headers and trailers damaged, as long as there is at least one copy of +every header and trailer intact, even if they are in different copies of +the file. + + The merge will fail if the damaged areas overlap (at least one byte is +damaged in all copies), or are adjacent and the boundary can't be +determined, or if the copies have too many damaged areas. + + All the copies to be merged must have the same size. If any of them is +larger or smaller than it should, either because it has been truncated or +because it got some garbage data appended at the end, it can be brought to +the correct size with the following command before merging it with the +other copies: + + ddrescue -s<correct_size> -x<correct_size> file.lz correct_size_file.lz + + To give you an idea of its possibilities, when merging two copies, each +of them with one damaged area affecting 1 percent of the copy, the +probability of obtaining a correct file is about 98 percent. With three +such copies the probability rises to 99.97 percent. For large files (a few +MB) with small errors (one sector damaged per copy), the probability +approaches 100 percent even with only two copies. (Supposing that the +errors are randomly located inside each copy). 
+ + Some types of solid-state device (NAND flash, for example) can produce +bursts of scattered single-bit errors. Lziprecover is able to merge files +with thousands of such scattered errors by grouping the errors into +clusters and then merging the files as if each cluster were a single error. + + Here is a real case of successful merging. Two copies of the file +'icecat-3.5.3-x86.tar.lz' (compressed size 9 MB) became corrupt while +stored on the same NAND flash device. One of the copies had 76 single-bit +errors scattered in an area of 1020 bytes, and the other had 3028 such +errors in an area of 31729 bytes. Lziprecover produced a correct file, +identical to the original, in just 5 seconds: + + lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz + Merging member 1 of 1 (2552 errors) + 2552 errors have been grouped in 16 clusters. + Trying variation 2 of 2, block 2 + Input files merged successfully. + + Note that the number of errors reported by lziprecover (2552) is lower +than the number of corrupt bytes (3104) because contiguous corrupt bytes +are counted as a single multibyte error. + + +Example 1: Recover a compressed backup from two copies on CD-ROM with +error-checked merging of copies. *Note GNU ddrescue manual: (ddrescue)Top, +for details about ddrescue. + + ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 + mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage + cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz + umount /mnt/cdimage + (insert second copy in the CD drive) + ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 + mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage + cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz + umount /mnt/cdimage + lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz + Input files merged successfully. 
+ lziprecover -tv backup.tar.lz + backup.tar.lz: ok + + +Example 2: Recover the first volume of those created with the command +'lzip -b 32MiB -S 650MB big_db' from two copies, 'big_db1_00001.lz' and +'big_db2_00001.lz', with member 07 damaged in the first copy, member 18 +damaged in the second copy, and member 12 damaged in both copies. The +correct file produced is saved in 'big_db_00001.lz'. + + lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz + Input files merged successfully. + lziprecover -tv big_db_00001.lz + big_db_00001.lz: ok + + +File: lziprecover.info, Node: Reproducing one sector, Next: Tarlz, Prev: Merging files, Up: Top + +6 Reproducing one sector +************************ + +Lziprecover can recover a zeroed sector in a lzip file by concatenating the +decompressed contents of the file up to the beginning of the zeroed sector +and the uncompressed data corresponding to the zeroed sector, and then +feeding the concatenated data to the same version of lzip that created the +file. For this to work, a reference file is required containing the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. It is possible to +recover a large file using just a few KB of reference data. + + The difficult part is finding a suitable reference file. It must contain +the exact data required (possibly mixed with other data). Containing similar +data is not enough. + + A zeroed sector may be caused by the incomplete recovery of a damaged +storage device (with I/O errors) using, for example, ddrescue. The +reproduction can't be done if the zeroed sector overlaps with the first 15 +bytes of a member, or if the zeroed sector is smaller than 8 bytes. + + The file is reproduced in memory. Therefore, enough virtual memory +(RAM + swap) to contain the damaged member is required. 
+ + To understand how it works, take any lzipped file, say 'foo.lz', +decompress it (keeping the original), and try to reproduce an artificially +zeroed sector in it by running the following commands: + + lzip -kd foo.lz + lziprecover -vv --debug-reproduce=65536,512 --reference-file=foo foo.lz + +which should produce an output like the following: + + Reproducing: foo.lz + Reference file: foo + Testing sectors of size 512 at file positions 65536 to 66047 + (master mpos = 65536, dpos = 296892) + foo: Match found at offset 296892 + Reproduction succeeded at pos 65536 + + 1 sectors tested + 1 reproductions returned with zero status + all comparisons passed + + Using 'foo' as reference file guarantees that any zeroed sector in +'foo.lz' can be reproduced because both files contain the same data. In +real use, the reference file needs to contain the data corresponding to the +zeroed sector, but the rest of the data (if any) may differ between both +files. The reference data may be obtained from the partial decompression of +the damaged file itself if it contains repeated data. For example if the +damaged file is a compressed tarball containing several partially modified +versions of the same file. + + The offset reported by lziprecover is the position in the reference file +of the first byte that could not be decompressed. This is the first byte +that will be compressed to reproduce the zeroed sector. + + The reproduce mode tries to reproduce the missing compressed data +originally present in the zeroed sector. It is based on the perfect +reproducibility of lzip files (lzip produces identical compressed output +from identical input). Therefore, the same version of lzip that created the +file to be reproduced should be used to reproduce the zeroed sector. Near +versions may also work because the output of lzip changes infrequently. 
If +reproducing a tar.lz archive created with tarlz, the version of lzip, +clzip, or minilzip corresponding to the version of the lzlib library used +by tarlz to create the archive should be used. + + When recovering a tar.lz archive and using as reference a file from the +filesystem, if the zeroed sector encodes (part of) a tar header, the archive +can't be reproduced. Therefore, the less overhead (smaller headers) a tar +archive has, the more probable it is that the zeroed sector does not include a +header, and that the archive can be reproduced. The tarlz format has minimum +overhead. It uses basic ustar headers, and only adds extended pax headers +when they are required. + +6.1 Performance of '--reproduce' +================================ + +Reproduce mode is especially useful when recovering a corrupt backup (or a +corrupt source tarball) that is part of a series. Usually only a small +fraction of the data changes from one backup to the next or from one version +of a source tarball to the next. This sometimes makes it possible to reproduce +a given corrupted version using reference data from a near version. The +following two tables show the fraction of reproducible sectors (reproducible +sectors divided by total sectors in archive) for some archives, using sector +sizes of 512 and 4096 bytes. 'mailbox-aug.tar.lz' is a backup of some of my +mailboxes. 
'backup-feb.tar.lz' and 'backup-apr.tar.lz' are real backups of +my own working directory: + +Reference file File Reproducible (512) +--------------------------------------------------------- +backup-feb.tar backup-apr.tar.lz 3273 / 4342 = 75.38% +backup-apr.tar backup-feb.tar.lz 3259 / 4161 = 78.32% +gawk-5.0.0.tar gawk-5.0.1.tar.lz 4369 / 5844 = 74.76% +gawk-5.0.1.tar gawk-5.0.0.tar.lz 4379 / 5603 = 78.15% +gmp-6.1.1.tar gmp-6.1.2.tar.lz 2454 / 3787 = 64.8% +gmp-6.1.2.tar gmp-6.1.1.tar.lz 2461 / 3782 = 65.07% + +Reference file File Reproducible (4096) +----------------------------------------------------------- +mailbox-mar.tar mailbox-aug.tar.lz 4036 / 4252 = 94.92% +backup-feb.tar backup-apr.tar.lz 264 / 542 = 48.71% +backup-apr.tar backup-feb.tar.lz 264 / 520 = 50.77% +gawk-5.0.0.tar gawk-5.0.1.tar.lz 327 / 730 = 44.79% +gawk-5.0.1.tar gawk-5.0.0.tar.lz 326 / 700 = 46.57% +gmp-6.1.1.tar gmp-6.1.2.tar.lz 175 / 473 = 37% +gmp-6.1.2.tar gmp-6.1.1.tar.lz 181 / 472 = 38.35% + + Note that the "performance of reproduce" is a probability, not a partial +recovery. The data is either recovered fully (with the probability X shown +in the last column of the tables above) or not recovered at all (with +probability 1 - X). + + Example 1: Recover a damaged source tarball with a zeroed sector of 512 +bytes at file position 1019904, using as reference another source tarball +for a different version of the software. + + lziprecover -vv -e --reference-file=gmp-6.1.1.tar gmp-6.1.2.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 512, value = 0x00) + (master mpos = 1019904, dpos = 6292134) + warning: gmp-6.1.1.tar: Partial match found at offset 6277798, len 8716. + Reference data may be mixed with other data. + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. 
+ + +Example 2: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a previous backup. The damaged +backup comes from a damaged partition copied with ddrescue. + + ddrescue -b4096 -r10 /dev/sdc1 hdimage mapfile + mount -o loop,ro hdimage /mnt/hdimage + cp /mnt/hdimage/backup.tar.lz backup.tar.lz + umount /mnt/hdimage + lzip -t backup.tar.lz + backup.tar.lz: Decoder error at pos 1020530 + lziprecover -vv -e --reference-file=old_backup.tar backup.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) + warning: old_backup.tar: Partial match found at offset 5743778, len 9546. + Reference data may be mixed with other data. + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. + + +Example 3: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a file from the filesystem. (If +the zeroed sector encodes (part of) a tar header, the tarball can't be +reproduced). + + # List the contents of the backup tarball to locate the damaged member. + tarlz -n0 -tvf backup.tar.lz + [...] + example.txt + tarlz: Skipping to next header. + tarlz: backup.tar.lz: Archive ends unexpectedly. + # Find in the filesystem the last file listed and use it as reference. + lziprecover -vv -e --reference-file=/somedir/example.txt backup.tar.lz + Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) + /somedir/example.txt: Match found at offset 9378 + Trying level -9 + Reproducing position 1015808 + Member reproduced successfully. + Copy of input file reproduced successfully. + + If 'backup.tar.lz' is a multimember file with more than one member +damaged and lziprecover shows the message 'One member reproduced. 
Copy of +input file still contains errors.', the procedure shown in the example +above can be repeated until all the members have been reproduced. + + 'tarlz --keep-damaged -n0 -xf backup.tar.lz example.txt' produces a +partial copy of the reference file 'example.txt' that may help locate a +complete copy in the filesystem or in another backup, even if 'example.txt' +has been renamed. + + +File: lziprecover.info, Node: Tarlz, Next: File names, Prev: Reproducing one sector, Up: Top + +7 Options supporting the tar.lz format +************************************** + +Tarlz is a massively parallel (multi-threaded) combined implementation of +the tar archiver and the lzip compressor. + + Tarlz creates tar archives using a simplified and safer variant of the +POSIX pax format compressed in lzip format, keeping the alignment between +tar members and lzip members. The resulting multimember tar.lz archive is +fully backward compatible with standard tar tools like GNU tar, which treat +it like any other tar.lz archive. *Note tarlz manual: (tarlz)Top, and *note +lzip manual: (lzip)Top. + + Multimember tar.lz archives have some safety advantages over solidly +compressed tar.lz archives. For example, in case of corruption, tarlz can +extract all the undamaged members from the tar.lz archive, skipping over the +damaged members, just like the standard (uncompressed) tar. Keeping the +alignment between tar members and lzip members minimizes the amount of data +lost in case of corruption. In this chapter we'll explain the ways in which +lziprecover can recover and process multimember tar.lz archives. + + +7.1 Recovering damaged multimember tar.lz archives +================================================== + +If you have several copies of the damaged archive, try merging them first +because merging has a high probability of success. *Note Merging files::. If +the command below prints something like 'Input files merged successfully.' 
+you are done and 'archive.tar.lz' now contains the recovered archive: + + lziprecover -m -v -o archive.tar.lz a/archive.tar.lz b/archive.tar.lz + + If you only have one copy of the damaged archive with a zeroed block of +data caused by an I/O error, you may try to reproduce the archive. *Note +Reproducing one sector::. If the command below prints something like +'Copy of input file reproduced successfully.' you are done and +'archive_fixed.tar.lz' now contains the recovered archive: + + lziprecover -vv -e --reference-file=old_archive.tar archive.tar.lz + + If you only have one copy of the damaged archive, you may try to repair +the archive, but this has a lower probability of success. *Note Repairing +one byte::. If the command below prints something like +'Copy of input file repaired successfully.' you are done and +'archive_fixed.tar.lz' now contains the recovered archive: + + lziprecover -v -R archive.tar.lz + + If all the above fails, and the archive was created with tarlz, you may +save the damaged members for later and then copy the good members to another +archive. If the two commands below succeed, 'bad_members.tar.lz' will +contain all the damaged members and 'archive_cleaned.tar.lz' will contain a +good archive with the damaged members removed: + + lziprecover -v --dump=damaged -o bad_members.tar.lz archive.tar.lz + lziprecover -v --strip=damaged -o archive_cleaned.tar.lz archive.tar.lz + + You can then use 'tarlz --keep-damaged' to recover as much data as +possible from each damaged member in 'bad_members.tar.lz': + + mkdir tmp + cd tmp + tarlz --keep-damaged -xvf ../bad_members.tar.lz + + +7.2 Processing multimember tar.lz archives +========================================== + +Lziprecover is able to copy a list of members from one file to another. 
For +example the command +'lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz' creates +a subset archive containing the first ten members, the end-of-file blocks, +and the trailing data (if any) of 'archive.tar.lz'. The 'r1' part selects +the last member, which in an appendable tar.lz archive contains the +end-of-file blocks. + + +File: lziprecover.info, Node: File names, Next: File format, Prev: Tarlz, Up: Top + +8 Names of the files produced by lziprecover +******************************************** + +The name of the fixed file produced by '--merge' and '--repair' is made by +appending the string '_fixed.lz' to the original file name. If the original +file name ends with one of the extensions '.tar.lz', '.lz', or '.tlz', the +string '_fixed' is inserted before the extension. + + +File: lziprecover.info, Node: File format, Next: Trailing data, Prev: File names, Up: Top + +9 File format +************* + +Perfection is reached, not when there is no longer anything to add, but +when there is no longer anything to take away. +-- Antoine de Saint-Exupery + + + In the diagram below, a box like this: + ++---+ +| | <-- the vertical bars might be missing ++---+ + + represents one byte; a box like this: + ++==============+ +| | ++==============+ + + represents a variable number of bytes. + + + A lzip file consists of a series of independent "members" (compressed +data sets). The members simply appear one after another in the file, with no +additional information before, between, or after them. Each member can +encode in compressed form up to 16 EiB - 1 byte of uncompressed data. The +size of a multimember file is unlimited. + + Each member has the following structure: + ++--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ID string | VN | DS | LZMA stream | CRC32 | Data size | Member size | ++--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + All multibyte values are stored in little endian order. 
+ +'ID string (the "magic" bytes)' + A four byte string, identifying the lzip format, with the value "LZIP" + (0x4C, 0x5A, 0x49, 0x50). + +'VN (version number, 1 byte)' + Just in case something needs to be modified in the future. 1 for now. + +'DS (coded dictionary size, 1 byte)' + The dictionary size is calculated by taking a power of 2 (the base + size) and subtracting from it a fraction between 0/16 and 7/16 of the + base size. + Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). + Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract + from the base size to obtain the dictionary size. + Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB + Valid values for dictionary size range from 4 KiB to 512 MiB. + +'LZMA stream' + The LZMA stream, finished by an "End Of Stream" marker. Uses default + values for encoder properties. *Note Stream format: (lzip)Stream + format, for a complete description. + +'CRC32 (4 bytes)' + Cyclic Redundancy Check (CRC) of the original uncompressed data. + +'Data size (8 bytes)' + Size of the original uncompressed data. + +'Member size (8 bytes)' + Total size of the member, including header and trailer. This field acts + as a distributed index, allows the verification of stream integrity, + and facilitates the safe recovery of undamaged members from + multimember files. Member size should be limited to 2 PiB to prevent + the data size field from overflowing. + + + +File: lziprecover.info, Node: Trailing data, Next: Examples, Prev: File format, Up: Top + +10 Extra data appended to the file +********************************** + +Sometimes extra data are found appended to a lzip file after the last +member. Such trailing data may be: + + * Padding added to make the file size a multiple of some block size, for + example when writing to a tape. It is safe to append any amount of + padding zero bytes to a lzip file. 
+ + * Useful data added by the user; a cryptographically secure hash, a + description of file contents, etc. It is safe to append any amount of + text to a lzip file as long as none of the first four bytes of the text + match the corresponding byte in the string "LZIP", and the text does + not contain any zero bytes (null characters). Nonzero bytes and zero + bytes can't be safely mixed in trailing data. + + * Garbage added by some not totally successful copy operation. + + * Malicious data added to the file in order to make its total size and + hash value (for a chosen hash) coincide with those of another file. + + * In rare cases, trailing data could be the corrupt header of another + member. In multimember or concatenated files the probability of + corruption happening in the magic bytes is 5 times smaller than the + probability of getting a false positive caused by the corruption of the + integrity information itself. Therefore it can be considered to be + below the noise level. Additionally, the test used by lziprecover to + discriminate trailing data from a corrupt header has a Hamming + distance (HD) of 3, and the 3 bit flips must happen in different magic + bytes for the test to fail. In any case, the option '--trailing-error' + guarantees that any corrupt header will be detected. + + Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + + Trailing data can be safely ignored in most cases. In some cases, like +that of user-added data, they are expected to be ignored. In those cases +where a file containing trailing data must be rejected, the option +'--trailing-error' can be used. *Note --trailing-error::. + + Lziprecover facilitates the management of metadata stored as trailing +data in lzip files. See the following examples: + +Example 1: Add a comment or description to a compressed file. 
+ + # First append the comment as trailing data to a lzip file + echo 'This file contains this and that' >> file.lz + # This command prints the comment to standard output + lziprecover --dump=tdata file.lz + # This command outputs file.lz without the comment + lziprecover --strip=tdata file.lz > stripped_file.lz + # This command removes the comment from file.lz + lziprecover --remove=tdata file.lz + + +Example 2: Add and verify a cryptographically secure hash. (This may be +convenient, but a separate copy of the hash must be kept in a safe place to +guarantee that both file and hash have not been maliciously replaced). + + sha256sum < file.lz >> file.lz + lziprecover --strip=tdata file.lz | sha256sum -c \ + <(lziprecover --dump=tdata file.lz) + + +File: lziprecover.info, Node: Examples, Next: Unzcrash, Prev: Trailing data, Up: Top + +11 A small tutorial with examples +********************************* + +Example 1: Extract all the files from archive 'foo.tar.lz'. + + tar -xf foo.tar.lz + or + lziprecover -cd foo.tar.lz | tar -xf - + + +Example 2: Restore a regular file from its compressed version 'file.lz'. If +the operation is successful, 'file.lz' is removed. + + lziprecover -d file.lz + + +Example 3: Verify the integrity of the compressed file 'file.lz' and show +status. + + lziprecover -tv file.lz + + +Example 4: The right way of concatenating the decompressed output of two or +more compressed files. *Note Trailing data::. + + Don't do this + cat file1.lz file2.lz file3.lz | lziprecover -d - + Do this instead + lziprecover -cd file1.lz file2.lz file3.lz + You may also concatenate the compressed files like this + lziprecover --strip=tdata file1.lz file2.lz file3.lz > file123.lz + Or keeping the trailing data of the last file like this + lziprecover --strip=damaged file1.lz file2.lz file3.lz > file123.lz + + +Example 5: Decompress 'file.lz' partially until 10 KiB of decompressed data +are produced. 
+ + lziprecover -D 0,10KiB file.lz + + +Example 6: Decompress 'file.lz' partially from decompressed byte at offset +10000 to decompressed byte at offset 14999 (5000 bytes are produced). + + lziprecover -D 10000-15000 file.lz + + +Example 7: Repair small errors in the file 'file.lz'. (Indented lines are +abridged diagnostic messages from lziprecover). + + lziprecover -v -R file.lz + Copy of input file repaired successfully. + lziprecover -tv file_fixed.lz + file_fixed.lz: ok + mv file_fixed.lz file.lz + + +Example 8: Split the multimember file 'file.lz' and write each member in +its own 'recXXXfile.lz' file. Then use 'lziprecover -t' to test the +integrity of the resulting files. + + lziprecover -s file.lz + lziprecover -tv rec*file.lz + + +File: lziprecover.info, Node: Unzcrash, Next: Problems, Prev: Examples, Up: Top + +12 Testing the robustness of decompressors +****************************************** + +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type 'make unzcrash' in the lziprecover source +directory to build it. + + By default, unzcrash reads the file specified and then repeatedly +decompresses it, increasing 256 times each byte of the compressed data, so +as to test all possible one-byte errors. Note that it may take years or even +centuries to test all possible one-byte errors in a large file (tens of MB). + + If the option '--block' is given, unzcrash reads the file specified and +then repeatedly decompresses it, setting all bytes in each successive block +to the value given, so as to test all possible full sector errors. + + If the option '--truncate' is given, unzcrash reads the file specified +and then repeatedly decompresses it, truncating the file to increasing +lengths, so as to test all possible truncation points. + + None of the three test modes described above should cause any invalid +memory accesses. 
If any of them does, please, report it as a bug to the +maintainers of the decompressor being tested. + + Unzcrash really executes as a subprocess the shell command specified in +the first non-option argument, and then writes the file specified in the +second non-option argument to the standard input of the subprocess, +modifying the corresponding byte each time. Therefore unzcrash can be used +to test any decompressor (not only lzip), or even other decoder programs +having a suitable command line syntax. + + If the decompressor returns with zero status, unzcrash compares the +output of the decompressor for the original and corrupt files. If the +outputs differ, it means that the decompressor returned a false negative; +it failed to recognize the corruption and produced garbage output. The only +exception is when a multimember file is truncated just after the last byte +of a member, producing a shorter but valid compressed file. Except in this +latter case, please, report any false negative as a bug. + + In order to compare the outputs, unzcrash needs a 'zcmp' program able to +understand the format being tested. For example the 'zcmp' provided by +zutils. If the 'zcmp' program used does not understand the format being +tested, all the comparisons will fail because the compressed files will be +compared without being decompressed first. Use '--zcmp=false' to disable +comparisons. *Note Zcmp: (zutils)Zcmp. + + The format for running unzcrash is: + + unzcrash [OPTIONS] 'lzip -t' FILE + +The compressed FILE must not contain errors and the decompressor being +tested must decompress it correctly for the comparisons to work. + + unzcrash supports the following options: + +'-h' +'--help' + Print an informative help message describing the options and exit. + +'-V' +'--version' + Print the version number of unzcrash on the standard output and exit. + This version number should be included in all bug reports. 
+ +'-b RANGE' +'--bits=RANGE' + Test N-bit errors only, instead of testing all the 255 wrong values for + each byte. 'N-bit error' means any value differing from the original + value in N bit positions, not a value differing from the original + value in the bit position N. + The number of N-bit errors per byte (N = 1 to 8) is: + 8 28 56 70 56 28 8 1 + + Examples of RANGE Tests errors of N-bits + 1 1 + 1,2,3 1, 2, 3 + 2-4 2, 3, 4 + 1,3-5,8 1, 3, 4, 5, 8 + 1-3,5-8 1, 2, 3, 5, 6, 7, 8 + +'-B[SIZE][,VALUE]' +'--block[=SIZE][,VALUE]' + Test block errors of given SIZE, simulating a whole sector I/O error. + SIZE defaults to 512 bytes. VALUE defaults to 0. By default, only + contiguous, non-overlapping blocks are tested, but this may be changed + with the option '--delta'. + +'-d N' +'--delta=N' + Test one byte, block, or truncation size every N bytes. If '--delta' + is not specified, unzcrash tests all the bytes, non-overlapping + blocks, or truncation sizes. Values of N smaller than the block size + will result in overlapping blocks. (Which is convenient for testing + because there are usually too few non-overlapping blocks in a file). + +'-e POSITION,VALUE' +'--set-byte=POSITION,VALUE' + Set byte at POSITION to VALUE in the internal buffer after reading and + testing FILE but before the first test call to the decompressor. Byte + positions start at 0. If VALUE is preceded by '+', it is added to the + original value of the byte at POSITION. If VALUE is preceded by 'f' + (flip), it is XORed with the original value of the byte at POSITION. + This option can be used to run tests with a changed dictionary size, + for example. + +'-n' +'--no-verify' + Skip initial verification of FILE and 'zcmp'. May speed up things a + lot when testing many (or large) known good files. + +'-p BYTES' +'--position=BYTES' + First byte position to test in the file. Defaults to 0. Negative values + are relative to the end of the file. + +'-q' +'--quiet' + Quiet operation. Suppress all messages. 
+ +'-s BYTES' +'--size=BYTES' + Number of byte positions to test. If not specified, the rest of the + file is tested (from '--position' to end of file). Negative values are + relative to the rest of the file. + +'-t' +'--truncate' + Test all possible truncation points in the range specified by + '--position' and '--size'. + +'-v' +'--verbose' + Verbose mode. + +'-z' +'--zcmp=<command>' + Set zcmp command name and options. Defaults to 'zcmp'. Use + '--zcmp=false' to disable comparisons. If testing a decompressor + different from the one used by default by zcmp, it is needed to force + unzcrash and zcmp to use the same decompressor with a command like + 'unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' FILE' + + + Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid +input file, 3 for an internal consistency error (e.g., bug) which caused +unzcrash to panic. + + +File: lziprecover.info, Node: Problems, Next: Concept index, Prev: Unzcrash, Up: Top + +13 Reporting bugs +***************** + +There are probably bugs in lziprecover. There are certainly errors and +omissions in this manual. If you report them, they will get fixed. If you +don't, no one will ever know about them and they will remain unfixed for +all eternity, if not longer. + + If you find a bug in lziprecover, please send electronic mail to +<lzip-bug@nongnu.org>. Include the version number, which you can find by +running 'lziprecover --version'. + + +File: lziprecover.info, Node: Concept index, Prev: Problems, Up: Top + +Concept index +************* + + +* Menu: + +* bugs: Problems. (line 6) +* data safety: Data safety. (line 6) +* examples: Examples. (line 6) +* file format: File format. (line 6) +* file names: File names. (line 6) +* getting help: Problems. (line 6) +* introduction: Introduction. (line 6) +* invoking: Invoking lziprecover. (line 6) +* merging files: Merging files. 
(line 6) +* merging with a backup: Merging with a backup. (line 6) +* options: Invoking lziprecover. (line 6) +* repairing one byte: Repairing one byte. (line 6) +* reproducing a mailbox: Reproducing a mailbox. (line 6) +* reproducing one sector: Reproducing one sector. (line 6) +* tarlz: Tarlz. (line 6) +* trailing data: Trailing data. (line 6) +* unzcrash: Unzcrash. (line 6) +* usage: Invoking lziprecover. (line 6) +* version: Invoking lziprecover. (line 6) + + + +Tag Table: +Node: Top226 +Node: Introduction1406 +Node: Invoking lziprecover5398 +Ref: --trailing-error6265 +Ref: range-format8644 +Ref: --reproduce8979 +Ref: --repair13278 +Node: Data safety25584 +Node: Merging with a backup27572 +Node: Reproducing a mailbox28836 +Node: Repairing one byte31337 +Node: Merging files33402 +Ref: performance-of-merge34572 +Ref: ddrescue-example36181 +Node: Reproducing one sector37468 +Ref: performance-of-reproduce41351 +Ref: ddrescue-example244026 +Node: Tarlz46446 +Node: File names50110 +Node: File format50567 +Node: Trailing data53258 +Node: Examples56499 +Ref: concat-example57075 +Node: Unzcrash58467 +Node: Problems64739 +Node: Concept index65291 + +End Tag Table + + +Local Variables: +coding: iso-8859-15 +End: diff --git a/doc/lziprecover.texi b/doc/lziprecover.texi new file mode 100644 index 0000000..7b3449e --- /dev/null +++ b/doc/lziprecover.texi @@ -0,0 +1,1587 @@ +\input texinfo @c -*-texinfo-*- +@c %**start of header +@setfilename lziprecover.info +@documentencoding ISO-8859-15 +@settitle Lziprecover Manual +@finalout +@c %**end of header + +@set UPDATED 21 January 2022 +@set VERSION 1.23 + +@dircategory Compression +@direntry +* Lziprecover: (lziprecover). 
Data recovery tool for the lzip format +@end direntry + + +@ifnothtml +@titlepage +@title Lziprecover +@subtitle Data recovery tool for the lzip format +@subtitle for Lziprecover version @value{VERSION}, @value{UPDATED} +@author by Antonio Diaz Diaz + +@page +@vskip 0pt plus 1filll +@end titlepage + +@contents +@end ifnothtml + +@ifnottex +@node Top +@top + +This manual is for Lziprecover (version @value{VERSION}, @value{UPDATED}). + +@menu +* Introduction:: Purpose and features of lziprecover +* Invoking lziprecover:: Command line interface +* Data safety:: Protecting data from accidental loss +* Repairing one byte:: Fixing bit flips and similar errors +* Merging files:: Fixing several damaged copies +* Reproducing one sector:: Fixing a missing (zeroed) sector +* Tarlz:: Options supporting the tar.lz format +* File names:: Names of the files produced by lziprecover +* File format:: Detailed format of the compressed file +* Trailing data:: Extra data appended to the file +* Examples:: A small tutorial with examples +* Unzcrash:: Testing the robustness of decompressors +* Problems:: Reporting bugs +* Concept index:: Index of concepts +@end menu + +@sp 1 +Copyright @copyright{} 2009-2022 Antonio Diaz Diaz. + +This manual is free documentation: you have unlimited permission to copy, +distribute, and modify it. +@end ifnottex + + +@node Introduction +@chapter Introduction +@cindex introduction + +@uref{http://www.nongnu.org/lzip/lziprecover.html,,Lziprecover} +is a data recovery tool and decompressor for files in the lzip +compressed data format (.lz). Lziprecover is able to repair slightly damaged +files (up to one single-byte error per member), produce a correct file by +merging the good parts of two or more damaged copies, reproduce a missing +(zeroed) sector using a reference file, extract data from damaged files, +decompress files, and test integrity of files. 
+ +Lziprecover can remove the damaged members from multimember files, for +example multimember tar.lz archives. + +Lziprecover provides random access to the data in multimember files; it only +decompresses the members containing the desired data. + +Lziprecover facilitates the management of metadata stored as trailing data +in lzip files. + +Lziprecover is not a replacement for regular backups, but a last line of +defense for the case where the backups are also damaged. + +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: + +@itemize @bullet +@item +The lzip format provides very safe integrity checking and some data +recovery means. The program lziprecover can repair bit flip errors +(one of the most common forms of data corruption) in lzip files, and +provides data recovery capabilities, including error-checked merging +of damaged copies of a file. @xref{Data safety}. + +@item +The lzip format is as simple as possible (but not simpler). The lzip +manual provides the source code of a simple decompressor along with a +detailed explanation of how it works, so that with only the help of the +lzip manual it would be possible for a digital archaeologist to extract +the data from a lzip file long after quantum computers eventually +render LZMA obsolete. + +@item +Additionally the lzip reference implementation is copylefted, which +guarantees that it will remain free forever. +@end itemize + +A nice feature of the lzip format is that a corrupt byte is easier to repair +the nearer it is to the beginning of the file. Therefore, with the help of +lziprecover, losing an entire archive just because of a corrupt byte near +the beginning is a thing of the past. + +Compression may be good for long-term archiving. 
For compressible data, +multiple compressed copies may provide redundancy in a more useful form and +may have a better chance of surviving intact than one uncompressed copy +using the same amount of storage space. This is especially true if the format +provides recovery capabilities like those of lziprecover, which is able to +find and combine the good parts of several damaged copies. + +Lziprecover is able to recover or decompress files produced by any of the +compressors in the lzip family: lzip, plzip, minilzip/lzlib, clzip, and +pdlzip. + +If the cause of file corruption is a damaged medium, the combination +@w{GNU ddrescue + lziprecover} is the recommended option for recovering data +from damaged lzip files. @xref{ddrescue-example}, and +@ref{ddrescue-example2}, for examples. + +If a file is too damaged for lziprecover to repair it, all the recoverable +data in all members of the file can be extracted with the following command +(the resulting file may contain errors and some garbage data may be produced +at the end of each damaged member): + +@example +lziprecover -cd -i file.lz > file +@end example + +When recovering data, lziprecover takes as arguments the names of the +damaged files and writes zero or more recovered files depending on the +operation selected and whether the recovery succeeded or not. The damaged +files themselves are kept unchanged. + +When decompressing or testing file integrity, lziprecover behaves like lzip +or lunzip. + +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never have +been compressed. Decompressed is used to refer to data which have undergone +the process of decompression. 
+ + +@node Invoking lziprecover +@chapter Invoking lziprecover +@cindex invoking +@cindex options +@cindex usage +@cindex version + +The format for running lziprecover is: + +@example +lziprecover [@var{options}] [@var{files}] +@end example + +@noindent +When decompressing or testing, a hyphen @samp{-} used as a @var{file} +argument means standard input. It can be mixed with other @var{files} and is +read just once, the first time it appears in the command line. If no file +names are specified, lziprecover decompresses from standard input to +standard output. + +lziprecover supports the following +@uref{http://www.nongnu.org/arg-parser/manual/arg_parser_manual.html#Argument-syntax,,options}: +@ifnothtml +@xref{Argument syntax,,,arg_parser}. +@end ifnothtml + +@table @code +@item -h +@itemx --help +Print an informative help message describing the options and exit. + +@item -V +@itemx --version +Print the version number of lziprecover on the standard output and exit. +This version number should be included in all bug reports. + +@anchor{--trailing-error} +@item -a +@itemx --trailing-error +Exit with error status 2 if any remaining input is detected after +decompressing the last member. Such remaining input is usually trailing +garbage that can be safely ignored. @xref{concat-example}. + +@item -A +@itemx --alone-to-lz +Convert lzma-alone files to lzip format without recompressing, just +adding a lzip header and trailer. The conversion minimizes the +dictionary size of the resulting file (and therefore the amount of +memory required to decompress it). Only streamed files with default LZMA +properties can be converted; non-streamed lzma-alone files lack the "End +Of Stream" marker required in lzip files. 
+ +The name of the converted lzip file is derived from that of the original +lzma-alone file as follows: + +@multitable {filename.lzma} {becomes} {anyothername.lz} +@item filename.lzma @tab becomes @tab filename.lz +@item filename.tlz @tab becomes @tab filename.tar.lz +@item anyothername @tab becomes @tab anyothername.lz +@end multitable + +@item -c +@itemx --stdout +Write decompressed data to standard output; keep input files unchanged. This +option (or @samp{-o}) is needed when reading from a named pipe (fifo) or +from a device. Use it also to recover as much of the decompressed data as +possible when decompressing a corrupt file. @samp{-c} overrides @samp{-o}. +@samp{-c} has no effect when merging, removing members, repairing, +reproducing, splitting, testing or listing. + +@item -d +@itemx --decompress +Decompress the files specified. If a file does not exist, can't be opened, +or the destination file already exists and @samp{--force} has not been +specified, lziprecover continues decompressing the rest of the files and +exits with error status 1. If a file fails to decompress, or is a terminal, +lziprecover exits immediately with error status 2 without decompressing the +rest of the files. A terminal is considered an uncompressed file, and +therefore invalid. + +@item -D @var{range} +@itemx --range-decompress=@var{range} +Decompress only a range of bytes starting at decompressed byte position +@var{begin} and up to byte position @w{@var{end} - 1}. Byte positions start +at 0. This option provides random access to the data in multimember files; +it only decompresses the members containing the desired data. In order to +guarantee the correctness of the data produced, all members containing any +part of the desired data are decompressed and their integrity is verified. + +@anchor{range-format} +Four formats of @var{range} are recognized, @samp{@var{begin}}, +@samp{@var{begin}-@var{end}}, @samp{@var{begin},@var{size}}, and +@samp{,@var{size}}. 
If only @var{begin} is specified, @var{end} is taken as +the end of the file. If only @var{size} is specified, @var{begin} is taken +as the beginning of the file. The bytes produced are sent to standard output +unless the option @samp{--output} is used. + +@anchor{--reproduce} +@item -e +@itemx --reproduce +Try to recover a missing (zeroed) sector in @var{file} using a reference +file and the same version of lzip that created @var{file}. If successful, a +repaired copy is written to the file @samp{@var{file}_fixed.lz}. @var{file} +is not modified at all. The exit status is 0 if the member containing the +zeroed sector could be repaired, 2 otherwise. Note that +@samp{@var{file}_fixed.lz} may still contain errors in the members following +the one repaired. @xref{Reproducing one sector}, for a complete description +of the reproduce mode. + +@item --lzip-level=@var{digit}|a|m[@var{length}] +Try only the given compression level or match length limit when reproducing +a zeroed sector. @samp{--lzip-level=a} tries all the compression levels +@w{(0 to 9)}, while @samp{--lzip-level=m} tries all the match length limits +@w{(5 to 273)}. + +@item --lzip-name=@var{name} +Set the name of the lzip executable used by @samp{--reproduce}. If +@samp{--lzip-name} is not specified, @samp{lzip} is used. + +@item --reference-file=@var{file} +Set the reference file used by @samp{--reproduce}. It must contain the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. + +@item -f +@itemx --force +Force overwrite of output files. + +@item -i +@itemx --ignore-errors +Make @samp{--decompress}, @samp{--test}, and @samp{--range-decompress} +ignore format and data errors and continue decompressing the remaining +members in the file; keep input files unchanged. 
For example, the commands +@w{@samp{lziprecover -cd -i file.lz > file}} or +@w{@samp{lziprecover -D0 -i file.lz > file}} decompress all the recoverable +data in all members of @samp{file.lz} without having to split it first. The +@w{@samp{-cd -i}} method resyncs to the next member header after each error, +and is immune to some format errors that make @w{@samp{-D0 -i}} fail. The +range decompressed may be smaller than the range requested, because of the +errors. The exit status is set to 0 unless other errors are found (I/O +errors, for example). + +Make @samp{--list}, @samp{--dump}, @samp{--remove}, and @samp{--strip} +ignore format errors. The sizes of the members with errors (specially the +last) may be wrong. + +@item -k +@itemx --keep +Keep (don't delete) input files during decompression. + +@item -l +@itemx --list +Print the uncompressed size, compressed size, and percentage saved of the +files specified. Trailing data are ignored. The values produced are correct +even for multimember files. If more than one file is given, a final line +containing the cumulative sizes is printed. With @samp{-v}, the dictionary +size, the number of members in the file, and the amount of trailing data (if +any) are also printed. With @samp{-vv}, the positions and sizes of each +member in multimember files are also printed. With @samp{-i}, format errors +are ignored, and with @samp{-ivv}, gaps between members are shown. The +member numbers shown coincide with the file numbers produced by @samp{--split}. + +If any file is damaged, does not exist, can't be opened, or is not regular, +the final exit status will be @w{> 0}. @samp{-lq} can be used to verify +quickly (without decompressing) the structural integrity of the files +specified. (Use @samp{--test} to verify the data integrity). @samp{-alq} +additionally verifies that none of the files specified contain trailing data. 
+ +@item -m +@itemx --merge +Try to produce a correct file by merging the good parts of two or more +damaged copies. If successful, a repaired copy is written to the file +@samp{@var{file}_fixed.lz}. The exit status is 0 if a correct file could +be produced, 2 otherwise. @xref{Merging files}, for a complete +description of the merge mode. + +@item -o @var{file} +@itemx --output=@var{file} +Place the output into @var{file} instead of into @samp{@var{file}_fixed.lz}. +If splitting, the names of the files produced are in the form +@samp{rec01@var{file}}, @samp{rec02@var{file}}, etc. + +If decompressing, or converting lzma-alone files, and @samp{-c} has not been +also specified, write the decompressed or converted output to @var{file}; +keep input files unchanged. This option (or @samp{-c}) is needed when +reading from a named pipe (fifo) or from a device. @w{@samp{-o -}} is +equivalent to @samp{-c}. @samp{-o} has no effect when testing or listing. + +@item -q +@itemx --quiet +Quiet operation. Suppress all messages. + +@anchor{--repair} +@item -R +@itemx --repair +Try to repair a @var{file} with small errors (up to one single-byte error +per member). If successful, a repaired copy is written to the file +@samp{@var{file}_fixed.lz}. @var{file} is not modified at all. The exit +status is 0 if the file could be repaired, 2 otherwise. @xref{Repairing one +byte}, for a complete description of the repair mode. + +@item -s +@itemx --split +Search for members in @var{file} and write each member in its own file. Gaps +between members are detected and each gap is saved in its own file. Trailing +data (if any) are saved alone in the last file. You can then use +@w{@samp{lziprecover -t}} to test the integrity of the resulting files, +decompress those which are undamaged, and try to repair or partially +decompress those which are damaged. Gaps may contain garbage or may be +members with corrupt headers or trailers. 
If other lziprecover functions +fail to work on a multimember @var{file} because of damage in headers or +trailers, try to split @var{file} and then work on each member individually. + +The names of the files produced are in the form @samp{rec01@var{file}}, +@samp{rec02@var{file}}, etc, and are designed so that the use of wildcards +in subsequent processing, for example, +@w{@samp{lziprecover -cd rec*@var{file} > recovered_data}}, processes the +files in the correct order. The number of digits used in the names varies +depending on the number of members in @var{file}. + +@item -t +@itemx --test +Check integrity of the files specified, but don't decompress them. This +really performs a trial decompression and throws away the result. Use it +together with @samp{-v} to see information about the files. If a file +fails the test, does not exist, can't be opened, or is a terminal, lziprecover +continues checking the rest of the files. A final diagnostic is shown at +verbosity level 1 or higher if any file fails the test when testing +multiple files. + +@item -v +@itemx --verbose +Verbose mode.@* +When decompressing or testing, further -v's (up to 4) increase the +verbosity level, showing status, compression ratio, dictionary size, +trailer contents (CRC, data size, member size), and up to 6 bytes of +trailing data (if any) both in hexadecimal and as a string of printable +ASCII characters.@* +Two or more @samp{-v} options show the progress of decompression.@* +In other modes, increasing verbosity levels show final status, progress +of operations, and extra information (for example, the failed areas). + +@item --loose-trailing +When decompressing, testing, or listing, allow trailing data whose first +bytes are so similar to the magic bytes of a lzip header that they can +be confused with a corrupt header. Use this option if a file triggers a +"corrupt header" error and the cause is not indeed a corrupt header. 
+ +@item --dump=[@var{member_list}][:damaged][:tdata] +Dump the members listed, the damaged members (if any), or the trailing +data (if any) of one or more regular multimember files to standard +output, or to a file if the option @samp{--output} is used. If more than +one file is given, the elements dumped from all files are concatenated. +If a file does not exist, can't be opened, or is not regular, +lziprecover continues processing the rest of the files. If the dump +fails in one file, lziprecover exits immediately without processing the +rest of the files. Only @samp{--dump=tdata} can write to a terminal. + +The argument to @samp{--dump} is a colon-separated list of the following +element specifiers; a member list (1,3-6), a reverse member list +(r1,3-6), and the strings "damaged" and "tdata" (which may be shortened +to 'd' and 't' respectively). A member list selects the members (or +gaps) listed, whose numbers coincide with those shown by @samp{--list}. +A reverse member list selects the members listed counting from the last +member in the file (r1). Negated versions of both kinds of lists exist +(^1,3-6:r^1,3-6) which selects all the members except those in the list. +The strings "damaged" and "tdata" select the damaged members and the +trailing data respectively. If the same member is selected more than +once, for example by @samp{1:r1} in a single-member file, it is dumped +just once. 
See the following examples: + +@multitable {@code{3,12:damaged:tdata}} {members 3, 12, damaged members, trailing data} +@headitem @code{--dump} argument @tab Elements dumped +@item @code{1,3-6} @tab members 1, 3, 4, 5, 6 +@item @code{r1-3} @tab last 3 members in file +@item @code{^13,15} @tab all but 13th and 15th members in file +@item @code{r^1} @tab all but last member in file +@item @code{damaged} @tab all damaged members in file +@item @code{tdata} @tab trailing data +@item @code{1-5:r1:tdata} @tab members 1 to 5, last member, trailing data +@item @code{damaged:tdata} @tab damaged members, trailing data +@item @code{3,12:damaged:tdata} @tab members 3, 12, damaged members, trailing data +@end multitable + +@item --remove=[@var{member_list}][:damaged][:tdata] +Remove the members listed, the damaged members (if any), or the trailing +data (if any) from regular multimember files in place. The date of each +file is preserved if possible. If all members in a file are selected to +be removed, the file is left unchanged and the exit status is set to 2. +If a file does not exist, can't be opened, is not regular, or is left +unchanged, lziprecover continues processing the rest of the files. In case +of I/O error, lziprecover exits immediately without processing the rest of +the files. See @samp{--dump} above for a description of the argument. + +This option may be dangerous even if only the trailing data is being +removed because the file may be corrupt or the trailing data may contain +a forbidden combination of characters. @xref{Trailing data}. It is +advisable to make a backup before attempting the removal. At least +verify that @w{@samp{lzip -cd file.lz | wc -c}} and the uncompressed +size shown by @w{@samp{lzip -l file.lz}} match before attempting the +removal of trailing data. 
+ +@item --strip=[@var{member_list}][:damaged][:tdata] +Copy one or more regular multimember files to standard output (or to a +file if the option @samp{--output} is used), stripping the members +listed, the damaged members (if any), or the trailing data (if any) from +each file. If all members in a file are selected to be stripped, the +trailing data (if any) are also stripped even if @samp{tdata} is not +specified. If more than one file is given, the files are concatenated. +In this case the trailing data are also stripped from all but the last +file even if @samp{tdata} is not specified. If a file does not exist, +can't be opened, or is not regular, lziprecover continues processing the +rest of the files. If a file fails to copy, lziprecover exits +immediately without processing the rest of the files. See @samp{--dump} +above for a description of the argument. + +@end table + +Lziprecover also supports the following debug options (for experts): + +@table @code +@item -E @var{range}[,@var{sector_size}] +@itemx --debug-reproduce=@var{range}[,@var{sector_size}] +Load the compressed @var{file} into memory, set all bytes in the positions +specified by @var{range} to 0, and try to reproduce a correct compressed +file. @xref{--reproduce}. @xref{range-format}, for a description of +@var{range}. If a @var{sector_size} is specified, set each sector to 0 in +sequence and try to reproduce the file, printing to standard output final +statistics of the number of sectors reproduced successfully. Exit with +nonzero status only in case of fatal error. + +@item -M +@itemx --md5sum +Print to standard output the MD5 digests of the input @var{files} one per +line in the same format produced by the @command{md5sum} tool. Lziprecover +uses MD5 digests to verify the result of some operations. This option allows +the verification of lziprecover's implementation of the MD5 algorithm. 
+ +@item -S[@var{value}] +@itemx --nrep-stats[=@var{value}] +Compare the frequency of sequences of N repeated bytes of a given +@var{value} in the compressed LZMA streams of the input @var{files} with the +frequency expected for random data (1 / 2^(8N)). If @var{value} is not +specified, print the frequency of repeated sequences of all possible byte +values. Print cumulative data for all files followed by the name of the +first file with the longest sequence. + +@item -U 1|B@var{size} +@itemx --unzcrash=1|B@var{size} +With argument @samp{1}, test 1-bit errors in the LZMA stream of the +compressed input @var{file} like the command +@w{@samp{unzcrash -b1 -p7 -s-20 'lzip -t' @var{file}}} but in memory, and +therefore much faster. @xref{Unzcrash}. This option tests all the members +independently in a multimember file, skipping headers and trailers. If a +decompression succeeds, the decompressed output is compared with the +decompressed output of the original @var{file} using MD5 digests. @var{file} +must not contain errors and must decompress correctly for the comparisons to +work. + +With argument @samp{B}, test zeroed sectors (blocks of bytes) in the LZMA +stream of the compressed input @var{file} like the command +@w{@samp{unzcrash --block=@var{size} -d1 -p7 -s-(@var{size}+20) 'lzip -t' @var{file}}} +but in memory, and therefore much faster. Testing and comparisons work just +like with the argument @samp{1} explained above. + +By default @samp{--unzcrash} only prints the interesting cases; CRC +mismatches, size mismatches, unsupported marker codes, unexpected EOFs, +apparently successful decompressions, and decoder errors detected 50_000 or +more bytes beyond the byte (or the start of the block) being tested. At +verbosity level 1 (-v) it also prints decoder errors detected 10_000 or more +bytes beyond the byte being tested. At verbosity level 2 (-vv) it prints all +cases for 1-bit errors or the decoder errors detected beyond the end of the +block for zeroed blocks. 
+ +@item -W @var{position},@var{value} +@itemx --debug-decompress=@var{position},@var{value} +Load the compressed @var{file} into memory, set the byte at @var{position} +to @var{value}, and decompress the modified compressed data to standard +output. If the damaged member is decompressed fully (just fails with a CRC +mismatch), the members following it are also decompressed. + +@item -X[@var{position},@var{value}] +@itemx --show-packets[=@var{position},@var{value}] +Load the compressed @var{file} into memory, optionally set the byte at +@var{position} to @var{value}, decompress the modified compressed data +(discarding the output), and print to standard output descriptions of the +LZMA packets being decoded. + +@item -Y @var{range} +@itemx --debug-delay=@var{range} +Load the compressed @var{file} into memory and then repeatedly decompress +it, increasing 256 times each byte of the subset of the compressed data +positions specified by @var{range}, so as to test all possible one-byte +errors. For each decompression error find the error detection delay and +print to standard output the maximum delay. The error detection delay is the +difference between the position of the error and the position where the +decoder realized that the data contains an error. @xref{range-format}, for a +description of @var{range}. + +@item -Z @var{position},@var{value} +@itemx --debug-repair=@var{position},@var{value} +Load the compressed @var{file} into memory, set the byte at @var{position} +to @var{value}, and then try to repair the error. @xref{--repair}. + +@end table + +Numbers given as arguments to options may be followed by a multiplier +and an optional @samp{B} for "byte". 
+ +Table of SI and binary prefixes (unit multipliers): + +@multitable {Prefix} {kilobyte (10^3 = 1000)} {|} {Prefix} {kibibyte (2^10 = 1024)} +@item Prefix @tab Value @tab | @tab Prefix @tab Value +@item k @tab kilobyte (10^3 = 1000) @tab | @tab Ki @tab kibibyte (2^10 = 1024) +@item M @tab megabyte (10^6) @tab | @tab Mi @tab mebibyte (2^20) +@item G @tab gigabyte (10^9) @tab | @tab Gi @tab gibibyte (2^30) +@item T @tab terabyte (10^12) @tab | @tab Ti @tab tebibyte (2^40) +@item P @tab petabyte (10^15) @tab | @tab Pi @tab pebibyte (2^50) +@item E @tab exabyte (10^18) @tab | @tab Ei @tab exbibyte (2^60) +@item Z @tab zettabyte (10^21) @tab | @tab Zi @tab zebibyte (2^70) +@item Y @tab yottabyte (10^24) @tab | @tab Yi @tab yobibyte (2^80) +@end multitable + +@sp 1 +Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid +input file, 3 for an internal consistency error (e.g., bug) which caused +lziprecover to panic. + + +@node Data safety +@chapter Protecting data from accidental loss +@cindex data safety + +It is a fact of life that sometimes data will become corrupt. Software has +errors. Hardware may misbehave or fail. RAM may be struck by a cosmic ray. +This is why a safe enough integrity checking is needed in compressed +formats, and the reason why a data recovery tool is sometimes needed. + +There are 3 main types of data corruption that may cause data loss: +single-byte errors, multibyte errors (generally affecting a whole sector +in a block device), and total device failure. + +Lziprecover protects natively against single-byte errors as long as file +integrity is checked frequently enough that a second single-byte error does +not develop in the same member before the first one is repaired. +@xref{Repairing one byte}. 
+ +Lziprecover also protects against multibyte errors if at least one backup +copy of the file is made (@pxref{Merging files}), or if the error is a +zeroed sector and the uncompressed data corresponding to the zeroed sector +are available (@pxref{Reproducing one sector}). If you can choose between +merging and reproducing, try merging first because it is usually faster, +easier to use, and has a high probability of success. + +Lziprecover can't help in case of device failure. The only remedy for total +device failure is storing backup copies in separate media. + +The extraordinary safety of the lzip format allows lziprecover to exploit +the redundancy that occurs naturally when making compressed backups. +Lziprecover can recover data that would not be recoverable from files +compressed in other formats. Let's see two examples of how much better +lzip is compared with gzip and bzip2 with respect to data safety: + +@menu +* Merging with a backup:: Recovering a file using a damaged backup +* Reproducing a mailbox:: Recovering new messages using an old backup +@end menu + + +@node Merging with a backup +@section Recovering a file using a damaged backup +@cindex merging with a backup + +Let's suppose that you made a compressed backup of your valuable scientific +data and stored two copies on separate media. Years later you notice that +both copies are corrupt. + +If you compressed the data with gzip and both copies suffer any damage in +the data stream, even if it is just one altered bit, the original data can +only be recovered by an expert, if at all. + +If you used bzip2, and if the file is large enough to contain more than one +compressed data block (usually larger than @w{900 kB} uncompressed), and if +no block is damaged in both files, then the data can be manually recovered +by splitting the files with bzip2recover, verifying every block, and then +copying the right blocks in the right order into another file. 
+ +But if you used lzip, the data can be automatically recovered with +@w{@samp{lziprecover --merge}} as long as the damaged areas don't overlap. + +Note that each error in a bzip2 file makes a whole block unusable, but each +error in a lzip file only affects the damaged bytes, making it possible to +recover a file with thousands of errors. + + +@node Reproducing a mailbox +@section Recovering new messages using an old backup +@cindex reproducing a mailbox + +Let's suppose that you make periodic backups of your email messages stored +in one or more mailboxes. (A mailbox is a file containing a possibly large +number of email messages). New messages are appended to the end of each +mailbox, therefore the initial part of two consecutive backups is identical +unless some messages have been changed or deleted in the meantime. The new +messages added to each backup are usually a small part of the whole mailbox. + +@verbatim ++========================================================+ +| Older backup containing some messages | ++========================================================+ ++========================================================+================+ +| Newer backup containing the messages above plus some | new messages | ++========================================================+================+ +@end verbatim + +One day you discover that your mailbox has disappeared because you deleted +it inadvertently or because of a bug in your email reader. Not only that. +You need to recover a recent message, but the last backup you made of the +mailbox (the newer backup above) has lost the data corresponding to a whole +sector because of an I/O error in the part containing the old messages. + +If you compressed the mailbox with gzip, usually none of the new messages +can be recovered even if they are intact because all the data beyond the +missing sector can't be decoded. 
+ +If you used bzip2, and if the newer backup is large enough that the new +messages are in a different compressed data block than the one damaged +(usually larger than @w{900 kB} uncompressed), then you can recover the new +messages manually with bzip2recover. If the backups are identical except for +the new messages appended, you may even recover the whole newer backup by +combining the good blocks from both backups. + +But if you used lzip, the whole newer backup can be automatically recovered +with @w{@samp{lziprecover --reproduce}} as long as the missing bytes can be +recovered from the older backup, even if other messages in the common part +have been changed or deleted. Mailboxes seem to be especially easy to +reproduce. The probability of reproducing a mailbox +(@pxref{performance-of-reproduce}) is almost as high as that of merging two +identical backups (@pxref{performance-of-merge}). + + +@node Repairing one byte +@chapter Repairing one byte +@cindex repairing one byte + +Lziprecover can perfectly repair most files with small errors (up to one +single-byte error per member), without the need for any extra redundancy +at all. If the repair is successful, the repaired file will be +identical bit for bit to the original. This makes lzip files resistant +to bit flip, one of the most common forms of data corruption. + +The file is repaired in memory. Therefore, enough virtual memory +@w{(RAM + swap)} to contain the largest damaged member is required. + +The error may be located anywhere in the file except in the first 5 +bytes of each member header or in the @samp{Member size} field of the +trailer (last 8 bytes of each member). If the error is in the header it +can be easily repaired with a text editor like GNU Moe (@pxref{File +format}). If the error is in the member size, it is enough to ignore the +message about @samp{bad member size} when decompressing. + +Bit flip happens when one bit in the file is changed from 0 to 1 or vice +versa. 
It may be caused by bad RAM or even by natural radiation. I have +seen a case of bit flip in a file stored on a USB flash drive. + +One byte may seem small, but most file corruptions not produced by +transmission errors or I/O errors just affect one byte, or even one bit, +of the file. Also, unlike magnetic media, where errors usually affect a +whole sector, solid-state storage devices tend to produce single-byte +errors, making lzip the perfect format for data stored on such devices. + +Repairing a file can take some time. Small files or files with the error +located near the beginning can be repaired in a few seconds. But +repairing a large file compressed with a large dictionary size and with +the error located far from the beginning may take hours. + +On the other hand, errors located near the beginning of the file cause +much more loss of data than errors located near the end. So lziprecover +repairs the worst errors more efficiently. + + +@node Merging files +@chapter Merging files +@cindex merging files + +If you have several copies of a file but all of them are too damaged to +repair them (@pxref{Repairing one byte}), lziprecover can try to produce a +correct file by merging the good parts of the damaged copies. + +The merge may succeed even if some copies of the file have all the +headers and trailers damaged, as long as there is at least one copy of +every header and trailer intact, even if they are in different copies of +the file. + +The merge will fail if the damaged areas overlap (at least one byte is +damaged in all copies), or are adjacent and the boundary can't be +determined, or if the copies have too many damaged areas. + +All the copies to be merged must have the same size. 
If any of them is +larger or smaller than it should be, either because it has been truncated +or because it got some garbage data appended at the end, it can be +brought to the correct size with the following command before merging it +with the other copies: + +@example +ddrescue -s<correct_size> -x<correct_size> file.lz correct_size_file.lz +@end example + +@anchor{performance-of-merge} +To give you an idea of its possibilities, when merging two copies, each of +them with one damaged area affecting 1 percent of the copy, the probability +of obtaining a correct file is about 98 percent. With three such copies the +probability rises to 99.97 percent. For large files (a few MB) with small +errors (one sector damaged per copy), the probability approaches 100 percent +even with only two copies. (Supposing that the errors are randomly located +inside each copy). + +Some types of solid-state device (NAND flash, for example) can produce +bursts of scattered single-bit errors. Lziprecover is able to merge +files with thousands of such scattered errors by grouping the errors +into clusters and then merging the files as if each cluster were a +single error. + +Here is a real case of successful merging. Two copies of the file +@samp{icecat-3.5.3-x86.tar.lz} (compressed size @w{9 MB}) became corrupt +while stored on the same NAND flash device. One of the copies had 76 +single-bit errors scattered in an area of 1020 bytes, and the other had +3028 such errors in an area of 31729 bytes. Lziprecover produced a +correct file, identical to the original, in just 5 seconds: + +@example +lziprecover -vvm a/icecat-3.5.3-x86.tar.lz b/icecat-3.5.3-x86.tar.lz +Merging member 1 of 1 (2552 errors) + 2552 errors have been grouped in 16 clusters. + Trying variation 2 of 2, block 2 +Input files merged successfully. 
+@end example + +Note that the number of errors reported by lziprecover (2552) is lower +than the number of corrupt bytes (3104) because contiguous corrupt bytes +are counted as a single multibyte error. + +@sp 1 +@anchor{ddrescue-example} +@noindent +Example 1: Recover a compressed backup from two copies on CD-ROM with +error-checked merging of copies. +@ifnothtml +@xref{Top,GNU ddrescue manual,,ddrescue}, +@end ifnothtml +@ifhtml +See the +@uref{http://www.gnu.org/software/ddrescue/manual/ddrescue_manual.html,,ddrescue manual} +@end ifhtml +for details about ddrescue. + +@example +ddrescue -d -r1 -b2048 /dev/cdrom cdimage1 mapfile1 +mount -t iso9660 -o loop,ro cdimage1 /mnt/cdimage +cp /mnt/cdimage/backup.tar.lz rescued1.tar.lz +umount /mnt/cdimage + (insert second copy in the CD drive) +ddrescue -d -r1 -b2048 /dev/cdrom cdimage2 mapfile2 +mount -t iso9660 -o loop,ro cdimage2 /mnt/cdimage +cp /mnt/cdimage/backup.tar.lz rescued2.tar.lz +umount /mnt/cdimage +lziprecover -m -v -o backup.tar.lz rescued1.tar.lz rescued2.tar.lz + Input files merged successfully. +lziprecover -tv backup.tar.lz + backup.tar.lz: ok +@end example + +@sp 1 +@noindent +Example 2: Recover the first volume of those created with the command +@w{@samp{lzip -b 32MiB -S 650MB big_db}} from two copies, +@samp{big_db1_00001.lz} and @samp{big_db2_00001.lz}, with member 07 +damaged in the first copy, member 18 damaged in the second copy, and +member 12 damaged in both copies. The correct file produced is saved in +@samp{big_db_00001.lz}. + +@example +lziprecover -m -v -o big_db_00001.lz big_db1_00001.lz big_db2_00001.lz + Input files merged successfully. 
+lziprecover -tv big_db_00001.lz + big_db_00001.lz: ok +@end example + + +@node Reproducing one sector +@chapter Reproducing one sector +@cindex reproducing one sector + +Lziprecover can recover a zeroed sector in a lzip file by concatenating the +decompressed contents of the file up to the beginning of the zeroed sector +and the uncompressed data corresponding to the zeroed sector, and then +feeding the concatenated data to the same version of lzip that created the +file. For this to work, a reference file is required containing the +uncompressed data corresponding to the missing compressed data of the zeroed +sector, plus some context data before and after them. It is possible to +recover a large file using just a few KB of reference data. + +The difficult part is finding a suitable reference file. It must contain the +exact data required (possibly mixed with other data). Containing similar +data is not enough. + +A zeroed sector may be caused by the incomplete recovery of a damaged +storage device (with I/O errors) using, for example, ddrescue. The +reproduction can't be done if the zeroed sector overlaps with the first 15 +bytes of a member, or if the zeroed sector is smaller than 8 bytes. + +The file is reproduced in memory. Therefore, enough virtual memory +@w{(RAM + swap)} to contain the damaged member is required. 
+ +To understand how it works, take any lzipped file, say @samp{foo.lz}, +decompress it (keeping the original), and try to reproduce an artificially +zeroed sector in it by running the following commands: + +@example +lzip -kd foo.lz +lziprecover -vv --debug-reproduce=65536,512 --reference-file=foo foo.lz +@end example + +@noindent +which should produce an output like the following: + +@example +Reproducing: foo.lz +Reference file: foo +Testing sectors of size 512 at file positions 65536 to 66047 + (master mpos = 65536, dpos = 296892) +foo: Match found at offset 296892 +Reproduction succeeded at pos 65536 + + 1 sectors tested + 1 reproductions returned with zero status + all comparisons passed +@end example + +Using @samp{foo} as reference file guarantees that any zeroed sector in +@samp{foo.lz} can be reproduced because both files contain the same data. In +real use, the reference file needs to contain the data corresponding to the +zeroed sector, but the rest of the data (if any) may differ between both +files. The reference data may be obtained from the partial decompression of +the damaged file itself if it contains repeated data. For example if the +damaged file is a compressed tarball containing several partially modified +versions of the same file. + +The offset reported by lziprecover is the position in the reference file of +the first byte that could not be decompressed. This is the first byte that +will be compressed to reproduce the zeroed sector. + +The reproduce mode tries to reproduce the missing compressed data originally +present in the zeroed sector. It is based on the perfect reproducibility of +lzip files (lzip produces identical compressed output from identical input). +Therefore, the same version of lzip that created the file to be reproduced +should be used to reproduce the zeroed sector. Near versions may also work +because the output of lzip changes infrequently. 
If reproducing a tar.lz +archive created with tarlz, the version of lzip, clzip, or minilzip +corresponding to the version of the lzlib library used by tarlz to create +the archive should be used. + +When recovering a tar.lz archive and using as reference a file from the +filesystem, if the zeroed sector encodes (part of) a tar header, the archive +can't be reproduced. Therefore, the less overhead (smaller headers) a tar +archive has, the more probable it is that the zeroed sector does not include a +header, and that the archive can be reproduced. The tarlz format has minimum +overhead. It uses basic ustar headers, and only adds extended pax headers +when they are required. + +@anchor{performance-of-reproduce} +@section Performance of @samp{--reproduce} +Reproduce mode is especially useful when recovering a corrupt backup (or a +corrupt source tarball) that is part of a series. Usually only a small +fraction of the data changes from one backup to the next or from one version +of a source tarball to the next. This sometimes makes it possible to reproduce +a given corrupted version using reference data from a near version. The +following two tables show the fraction of reproducible sectors (reproducible +sectors divided by total sectors in archive) for some archives, using sector +sizes of 512 and 4096 bytes. @samp{mailbox-aug.tar.lz} is a backup of some +of my mailboxes. 
@samp{backup-feb.tar.lz} and @samp{backup-apr.tar.lz} are +real backups of my own working directory: + +@multitable {Reference file} {gawk-5.0.1.tar.lz} {4369 / 5844 = 74.76%} +@headitem Reference file @tab File @tab Reproducible (512) +@item backup-feb.tar @tab backup-apr.tar.lz @tab 3273 / 4342 = 75.38% +@item backup-apr.tar @tab backup-feb.tar.lz @tab 3259 / 4161 = 78.32% +@item gawk-5.0.0.tar @tab gawk-5.0.1.tar.lz @tab 4369 / 5844 = 74.76% +@item gawk-5.0.1.tar @tab gawk-5.0.0.tar.lz @tab 4379 / 5603 = 78.15% +@item gmp-6.1.1.tar @tab gmp-6.1.2.tar.lz @tab 2454 / 3787 = 64.8% +@item gmp-6.1.2.tar @tab gmp-6.1.1.tar.lz @tab 2461 / 3782 = 65.07% +@end multitable + +@multitable {mailbox-mar.tar} {mailbox-aug.tar.lz} {4036 / 4252 = 94.92%} +@headitem Reference file @tab File @tab Reproducible (4096) +@item mailbox-mar.tar @tab mailbox-aug.tar.lz @tab 4036 / 4252 = 94.92% +@item backup-feb.tar @tab backup-apr.tar.lz @tab 264 / 542 = 48.71% +@item backup-apr.tar @tab backup-feb.tar.lz @tab 264 / 520 = 50.77% +@item gawk-5.0.0.tar @tab gawk-5.0.1.tar.lz @tab 327 / 730 = 44.79% +@item gawk-5.0.1.tar @tab gawk-5.0.0.tar.lz @tab 326 / 700 = 46.57% +@item gmp-6.1.1.tar @tab gmp-6.1.2.tar.lz @tab 175 / 473 = 37% +@item gmp-6.1.2.tar @tab gmp-6.1.1.tar.lz @tab 181 / 472 = 38.35% +@end multitable + +Note that the "performance of reproduce" is a probability, not a partial +recovery. The data is either recovered fully (with the probability X shown +in the last column of the tables above) or not recovered at all (with +probability @w{1 - X}). + +Example 1: Recover a damaged source tarball with a zeroed sector of 512 +bytes at file position 1019904, using as reference another source tarball +for a different version of the software. 
+ +@example +lziprecover -vv -e --reference-file=gmp-6.1.1.tar gmp-6.1.2.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 512, value = 0x00) + (master mpos = 1019904, dpos = 6292134) +warning: gmp-6.1.1.tar: Partial match found at offset 6277798, len 8716. +Reference data may be mixed with other data. +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +@sp 1 +@anchor{ddrescue-example2} +@noindent +Example 2: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a previous backup. The damaged +backup comes from a damaged partition copied with ddrescue. + +@example +ddrescue -b4096 -r10 /dev/sdc1 hdimage mapfile +mount -o loop,ro hdimage /mnt/hdimage +cp /mnt/hdimage/backup.tar.lz backup.tar.lz +umount /mnt/hdimage +lzip -t backup.tar.lz + backup.tar.lz: Decoder error at pos 1020530 +lziprecover -vv -e --reference-file=old_backup.tar backup.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) +warning: old_backup.tar: Partial match found at offset 5743778, len 9546. +Reference data may be mixed with other data. +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +@sp 1 +@noindent +Example 3: Recover a damaged backup with a zeroed sector of 4096 bytes at +file position 1019904, using as reference a file from the filesystem. (If +the zeroed sector encodes (part of) a tar header, the tarball can't be +reproduced). + +@example +# List the contents of the backup tarball to locate the damaged member. +tarlz -n0 -tvf backup.tar.lz + [...] + example.txt +tarlz: Skipping to next header. +tarlz: backup.tar.lz: Archive ends unexpectedly. +# Find in the filesystem the last file listed and use it as reference. 
+lziprecover -vv -e --reference-file=/somedir/example.txt backup.tar.lz +Reproducing bad area in member 1 of 1 + (begin = 1019904, size = 4096, value = 0x00) + (master mpos = 1019903, dpos = 5857954) +/somedir/example.txt: Match found at offset 9378 +Trying level -9 + Reproducing position 1015808 +Member reproduced successfully. +Copy of input file reproduced successfully. +@end example + +If @samp{backup.tar.lz} is a multimember file with more than one member +damaged and lziprecover shows the message @samp{One member reproduced. Copy +of input file still contains errors.}, the procedure shown in the example +above can be repeated until all the members have been reproduced. + +@samp{tarlz --keep-damaged -n0 -xf backup.tar.lz example.txt} produces a +partial copy of the reference file @samp{example.txt} that may help locate a +complete copy in the filesystem or in another backup, even if +@samp{example.txt} has been renamed. + + +@node Tarlz +@chapter Options supporting the tar.lz format +@cindex tarlz + +@uref{http://www.nongnu.org/lzip/manual/tarlz_manual.html,,Tarlz} is a +massively parallel (multi-threaded) combined implementation of the tar +archiver and the +@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html,,lzip} compressor. + +Tarlz creates tar archives using a simplified and safer variant of the POSIX +pax format compressed in lzip format, keeping the alignment between tar +members and lzip members. The resulting multimember tar.lz archive is fully +backward compatible with standard tar tools like GNU tar, which treat it +like any other tar.lz archive. +@ifnothtml +@xref{Top,tarlz manual,,tarlz}, and @ref{Top,lzip manual,,lzip}. +@end ifnothtml + +Multimember tar.lz archives have some safety advantages over solidly +compressed tar.lz archives. For example, in case of corruption, tarlz can +extract all the undamaged members from the tar.lz archive, skipping over the +damaged members, just like the standard (uncompressed) tar. 
Keeping the +alignment between tar members and lzip members minimizes the amount of data +lost in case of corruption. In this chapter we'll explain the ways in which +lziprecover can recover and process multimember tar.lz archives. + +@sp 1 +@section Recovering damaged multimember tar.lz archives + +If you have several copies of the damaged archive, try merging them first +because merging has a high probability of success. @xref{Merging files}. If +the command below prints something like +@w{@samp{Input files merged successfully.}} you are done and +@samp{archive.tar.lz} now contains the recovered archive: + +@example +lziprecover -m -v -o archive.tar.lz a/archive.tar.lz b/archive.tar.lz +@end example + +If you only have one copy of the damaged archive with a zeroed block of data +caused by an I/O error, you may try to reproduce the archive. +@xref{Reproducing one sector}. If the command below prints something like +@w{@samp{Copy of input file reproduced successfully.}} you are done and +@samp{archive_fixed.tar.lz} now contains the recovered archive: + +@example +lziprecover -vv -e --reference-file=old_archive.tar archive.tar.lz +@end example + +If you only have one copy of the damaged archive, you may try to repair the +archive, but this has a lower probability of success. @xref{Repairing one +byte}. If the command below prints something like +@w{@samp{Copy of input file repaired successfully.}} you are done and +@samp{archive_fixed.tar.lz} now contains the recovered archive: + +@example +lziprecover -v -R archive.tar.lz +@end example + +If all the above fails, and the archive was created with tarlz, you may save +the damaged members for later and then copy the good members to another +archive. 
If the two commands below succeed, @samp{bad_members.tar.lz} will +contain all the damaged members and @samp{archive_cleaned.tar.lz} will +contain a good archive with the damaged members removed: + +@example +lziprecover -v --dump=damaged -o bad_members.tar.lz archive.tar.lz +lziprecover -v --strip=damaged -o archive_cleaned.tar.lz archive.tar.lz +@end example + +You can then use @samp{tarlz --keep-damaged} to recover as much data as +possible from each damaged member in @samp{bad_members.tar.lz}: + +@example +mkdir tmp +cd tmp +tarlz --keep-damaged -xvf ../bad_members.tar.lz +@end example + +@sp 1 +@section Processing multimember tar.lz archives + +Lziprecover is able to copy a list of members from a file to another. +For example the command +@w{@samp{lziprecover --dump=1-10:r1:tdata archive.tar.lz > subarch.tar.lz}} +creates a subset archive containing the first ten members, the end-of-file +blocks, and the trailing data (if any) of @samp{archive.tar.lz}. The +@samp{r1} part selects the last member, which in an appendable tar.lz +archive contains the end-of-file blocks. + + +@node File names +@chapter Names of the files produced by lziprecover +@cindex file names + +The name of the fixed file produced by @samp{--merge} and @samp{--repair} is +made by appending the string @samp{_fixed.lz} to the original file name. If +the original file name ends with one of the extensions @samp{.tar.lz}, +@samp{.lz}, or @samp{.tlz}, the string @samp{_fixed} is inserted before the +extension. 
+ + +@node File format +@chapter File format +@cindex file format + +Perfection is reached, not when there is no longer anything to add, but +when there is no longer anything to take away.@* +--- Antoine de Saint-Exupery + +@sp 1 +In the diagram below, a box like this: + +@verbatim ++---+ +| | <-- the vertical bars might be missing ++---+ +@end verbatim + +represents one byte; a box like this: + +@verbatim ++==============+ +| | ++==============+ +@end verbatim + +represents a variable number of bytes. + +@sp 1 +A lzip file consists of a series of independent "members" (compressed data +sets). The members simply appear one after another in the file, with no +additional information before, between, or after them. Each member can +encode in compressed form up to @w{16 EiB - 1 byte} of uncompressed data. +The size of a multimember file is unlimited. + +Each member has the following structure: + +@verbatim ++--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ID string | VN | DS | LZMA stream | CRC32 | Data size | Member size | ++--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +@end verbatim + +All multibyte values are stored in little endian order. + +@table @samp +@item ID string (the "magic" bytes) +A four byte string, identifying the lzip format, with the value "LZIP" +(0x4C, 0x5A, 0x49, 0x50). + +@item VN (version number, 1 byte) +Just in case something needs to be modified in the future. 1 for now. + +@item DS (coded dictionary size, 1 byte) +The dictionary size is calculated by taking a power of 2 (the base size) +and subtracting from it a fraction between 0/16 and 7/16 of the base size.@* +Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* +Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract +from the base size to obtain the dictionary size.@* +Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* +Valid values for dictionary size range from 4 KiB to 512 MiB. 
+ +@item LZMA stream +The LZMA stream, finished by an "End Of Stream" marker. Uses default values +for encoder properties. +@ifnothtml +@xref{Stream format,,,lzip}, +@end ifnothtml +@ifhtml +See +@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html#Stream-format,,Stream format} +@end ifhtml +for a complete description. + +@item CRC32 (4 bytes) +Cyclic Redundancy Check (CRC) of the original uncompressed data. + +@item Data size (8 bytes) +Size of the original uncompressed data. + +@item Member size (8 bytes) +Total size of the member, including header and trailer. This field acts +as a distributed index, allows the verification of stream integrity, and +facilitates the safe recovery of undamaged members from multimember files. +Member size should be limited to @w{2 PiB} to prevent the data size field +from overflowing. + +@end table + + +@node Trailing data +@chapter Extra data appended to the file +@cindex trailing data + +Sometimes extra data are found appended to a lzip file after the last +member. Such trailing data may be: + +@itemize @bullet +@item +Padding added to make the file size a multiple of some block size, for +example when writing to a tape. It is safe to append any amount of +padding zero bytes to a lzip file. + +@item +Useful data added by the user; a cryptographically secure hash, a +description of file contents, etc. It is safe to append any amount of +text to a lzip file as long as none of the first four bytes of the text +match the corresponding byte in the string "LZIP", and the text does not +contain any zero bytes (null characters). Nonzero bytes and zero bytes +can't be safely mixed in trailing data. + +@item +Garbage added by some not totally successful copy operation. + +@item +Malicious data added to the file in order to make its total size and +hash value (for a chosen hash) coincide with those of another file. + +@item +In rare cases, trailing data could be the corrupt header of another +member. 
In multimember or concatenated files the probability of +corruption happening in the magic bytes is 5 times smaller than the +probability of getting a false positive caused by the corruption of the +integrity information itself. Therefore it can be considered to be below +the noise level. Additionally, the test used by lziprecover to discriminate +trailing data from a corrupt header has a Hamming distance (HD) of 3, +and the 3 bit flips must happen in different magic bytes for the test to +fail. In any case, the option @samp{--trailing-error} guarantees that +any corrupt header will be detected. +@end itemize + +Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + +Trailing data can be safely ignored in most cases. In some cases, like +that of user-added data, they are expected to be ignored. In those cases +where a file containing trailing data must be rejected, the option +@samp{--trailing-error} can be used. @xref{--trailing-error}. + +Lziprecover facilitates the management of metadata stored as trailing +data in lzip files. See the following examples: + +@noindent +Example 1: Add a comment or description to a compressed file. + +@example +# First append the comment as trailing data to a lzip file +echo 'This file contains this and that' >> file.lz +# This command prints the comment to standard output +lziprecover --dump=tdata file.lz +# This command outputs file.lz without the comment +lziprecover --strip=tdata file.lz > stripped_file.lz +# This command removes the comment from file.lz +lziprecover --remove=tdata file.lz +@end example + +@sp 1 +@noindent +Example 2: Add and verify a cryptographically secure hash. (This may be +convenient, but a separate copy of the hash must be kept in a safe place +to guarantee that both file and hash have not been maliciously replaced). 
+ +@example +sha256sum < file.lz >> file.lz +lziprecover --strip=tdata file.lz | sha256sum -c \ + <(lziprecover --dump=tdata file.lz) +@end example + + +@node Examples +@chapter A small tutorial with examples +@cindex examples + +Example 1: Extract all the files from archive @samp{foo.tar.lz}. + +@example + tar -xf foo.tar.lz +or + lziprecover -cd foo.tar.lz | tar -xf - +@end example + +@sp 1 +@noindent +Example 2: Restore a regular file from its compressed version +@samp{file.lz}. If the operation is successful, @samp{file.lz} is removed. + +@example +lziprecover -d file.lz +@end example + +@sp 1 +@noindent +Example 3: Verify the integrity of the compressed file @samp{file.lz} and +show status. + +@example +lziprecover -tv file.lz +@end example + +@sp 1 +@anchor{concat-example} +@noindent +Example 4: The right way of concatenating the decompressed output of two or +more compressed files. @xref{Trailing data}. + +@example +Don't do this + cat file1.lz file2.lz file3.lz | lziprecover -d - +Do this instead + lziprecover -cd file1.lz file2.lz file3.lz +You may also concatenate the compressed files like this + lziprecover --strip=tdata file1.lz file2.lz file3.lz > file123.lz +Or keeping the trailing data of the last file like this + lziprecover --strip=damaged file1.lz file2.lz file3.lz > file123.lz +@end example + +@sp 1 +@noindent +Example 5: Decompress @samp{file.lz} partially until @w{10 KiB} of +decompressed data are produced. + +@example +lziprecover -D 0,10KiB file.lz +@end example + +@sp 1 +@noindent +Example 6: Decompress @samp{file.lz} partially from decompressed byte at +offset 10000 to decompressed byte at offset 14999 (5000 bytes are produced). + +@example +lziprecover -D 10000-15000 file.lz +@end example + +@sp 1 +@noindent +Example 7: Repair small errors in the file @samp{file.lz}. (Indented lines +are abridged diagnostic messages from lziprecover). + +@example +lziprecover -v -R file.lz + Copy of input file repaired successfully. 
+lziprecover -tv file_fixed.lz + file_fixed.lz: ok +mv file_fixed.lz file.lz +@end example + +@sp 1 +@noindent +Example 8: Split the multimember file @samp{file.lz} and write each member +in its own @samp{recXXXfile.lz} file. Then use @w{@samp{lziprecover -t}} to +test the integrity of the resulting files. + +@example +lziprecover -s file.lz +lziprecover -tv rec*file.lz +@end example + + +@node Unzcrash +@chapter Testing the robustness of decompressors +@cindex unzcrash + +The lziprecover package also includes unzcrash, a program written to test +robustness to decompression of corrupted data, inspired by unzcrash.c from +Julian Seward's bzip2. Type @samp{make unzcrash} in the lziprecover source +directory to build it. + +By default, unzcrash reads the file specified and then repeatedly +decompresses it, increasing 256 times each byte of the compressed data, so +as to test all possible one-byte errors. Note that it may take years or even +centuries to test all possible one-byte errors in a large file (tens of MB). + +If the option @samp{--block} is given, unzcrash reads the file specified and +then repeatedly decompresses it, setting all bytes in each successive block +to the value given, so as to test all possible full sector errors. + +If the option @samp{--truncate} is given, unzcrash reads the file specified +and then repeatedly decompresses it, truncating the file to increasing +lengths, so as to test all possible truncation points. + +None of the three test modes described above should cause any invalid memory +accesses. If any of them does, please, report it as a bug to the maintainers +of the decompressor being tested. + +Unzcrash really executes as a subprocess the shell command specified in the +first non-option argument, and then writes the file specified in the second +non-option argument to the standard input of the subprocess, modifying the +corresponding byte each time. 
Therefore unzcrash can be used to test any +decompressor (not only lzip), or even other decoder programs having a +suitable command line syntax. + +If the decompressor returns with zero status, unzcrash compares the output +of the decompressor for the original and corrupt files. If the outputs +differ, it means that the decompressor returned a false negative; it failed +to recognize the corruption and produced garbage output. The only exception +is when a multimember file is truncated just after the last byte of a +member, producing a shorter but valid compressed file. Except in this latter +case, please, report any false negative as a bug. + +In order to compare the outputs, unzcrash needs a @samp{zcmp} program able +to understand the format being tested. For example the @samp{zcmp} provided +by @uref{http://www.nongnu.org/zutils/manual/zutils_manual.html#Zcmp,,zutils}. +If the @samp{zcmp} program used does not understand the format being tested, +all the comparisons will fail because the compressed files will be compared +without being decompressed first. Use @samp{--zcmp=false} to disable +comparisons. +@ifnothtml +@xref{Zcmp,,,zutils}. +@end ifnothtml + +The format for running unzcrash is: + +@example +unzcrash [@var{options}] 'lzip -t' @var{file} +@end example + +@noindent +The compressed @var{file} must not contain errors and the decompressor being +tested must decompress it correctly for the comparisons to work. + +unzcrash supports the following options: + +@table @code +@item -h +@itemx --help +Print an informative help message describing the options and exit. + +@item -V +@itemx --version +Print the version number of unzcrash on the standard output and exit. +This version number should be included in all bug reports. + +@item -b @var{range} +@itemx --bits=@var{range} +Test N-bit errors only, instead of testing all the 255 wrong values for +each byte. 
@samp{N-bit error} means any value differing from the +original value in N bit positions, not a value differing from the +original value in the bit position N.@* +The number of N-bit errors per byte (N = 1 to 8) is: +@w{8 28 56 70 56 28 8 1} + +@multitable {Examples of @var{range}} {Tests errors of N-bits} +@item Examples of @var{range} @tab Tests errors of N-bits +@item 1 @tab 1 +@item 1,2,3 @tab 1, 2, 3 +@item 2-4 @tab 2, 3, 4 +@item 1,3-5,8 @tab 1, 3, 4, 5, 8 +@item 1-3,5-8 @tab 1, 2, 3, 5, 6, 7, 8 +@end multitable + +@item -B[@var{size}][,@var{value}] +@itemx --block[=@var{size}][,@var{value}] +Test block errors of given @var{size}, simulating a whole sector I/O error. +@var{size} defaults to 512 bytes. @var{value} defaults to 0. By default, +only contiguous, non-overlapping blocks are tested, but this may be changed +with the option @samp{--delta}. + +@item -d @var{n} +@itemx --delta=@var{n} +Test one byte, block, or truncation size every @var{n} bytes. If +@samp{--delta} is not specified, unzcrash tests all the bytes, +non-overlapping blocks, or truncation sizes. Values of @var{n} smaller than +the block size will result in overlapping blocks. (Which is convenient for +testing because there are usually too few non-overlapping blocks in a file). + +@item -e @var{position},@var{value} +@itemx --set-byte=@var{position},@var{value} +Set byte at @var{position} to @var{value} in the internal buffer after +reading and testing @var{file} but before the first test call to the +decompressor. Byte positions start at 0. If @var{value} is preceded by +@samp{+}, it is added to the original value of the byte at @var{position}. +If @var{value} is preceded by @samp{f} (flip), it is XORed with the original +value of the byte at @var{position}. This option can be used to run tests +with a changed dictionary size, for example. + +@item -n +@itemx --no-verify +Skip initial verification of @var{file} and @samp{zcmp}. 
May speed up things +a lot when testing many (or large) known good files. + +@item -p @var{bytes} +@itemx --position=@var{bytes} +First byte position to test in the file. Defaults to 0. Negative values +are relative to the end of the file. + +@item -q +@itemx --quiet +Quiet operation. Suppress all messages. + +@item -s @var{bytes} +@itemx --size=@var{bytes} +Number of byte positions to test. If not specified, the rest of the file +is tested (from @samp{--position} to end of file). Negative values are +relative to the rest of the file. + +@item -t +@itemx --truncate +Test all possible truncation points in the range specified by +@samp{--position} and @samp{--size}. + +@item -v +@itemx --verbose +Verbose mode. + +@item -z +@itemx --zcmp=<command> +Set zcmp command name and options. Defaults to @samp{zcmp}. Use +@samp{--zcmp=false} to disable comparisons. If testing a decompressor +different from the one used by default by zcmp, it is needed to force +unzcrash and zcmp to use the same decompressor with a command like +@w{@samp{unzcrash --zcmp='zcmp --lz=plzip' 'plzip -t' @var{file}}} + +@end table + +Exit status: 0 for a normal exit, 1 for environmental problems (file not +found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or +invalid input file, 3 for an internal consistency error (e.g., bug) which +caused unzcrash to panic. + + +@node Problems +@chapter Reporting bugs +@cindex bugs +@cindex getting help + +There are probably bugs in lziprecover. There are certainly errors and +omissions in this manual. If you report them, they will get fixed. If +you don't, no one will ever know about them and they will remain unfixed +for all eternity, if not longer. + +If you find a bug in lziprecover, please send electronic mail to +@email{lzip-bug@@nongnu.org}. Include the version number, which you can +find by running @w{@samp{lziprecover --version}}. 
+ + +@node Concept index +@unnumbered Concept index + +@printindex cp + +@bye diff --git a/dump_remove.cc b/dump_remove.cc new file mode 100644 index 0000000..37f7f00 --- /dev/null +++ b/dump_remove.cc @@ -0,0 +1,292 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <utime.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "lzip_index.h" + + +// If strip is false, dump to outfd members/gaps/tdata in member_list. +// If strip is true, dump to outfd members/gaps/tdata not in member_list. 
+int dump_members( const std::vector< std::string > & filenames, + const std::string & default_output_filename, + const Member_list & member_list, const bool force, + bool ignore_errors, bool ignore_trailing, + const bool loose_trailing, const bool strip, + const bool to_stdout ) + { + if( to_stdout || default_output_filename.empty() ) outfd = STDOUT_FILENO; + else + { + output_filename = default_output_filename; + set_signal_handler(); + if( !open_outstream( force, false, false, false ) ) return 1; + } + if( ( strip || !member_list.tdata || member_list.damaged || member_list.range() ) && + !check_tty_out() ) return 1; // check tty except for --dump=tdata + unsigned long long copied_size = 0, stripped_size = 0; + unsigned long long copied_tsize = 0, stripped_tsize = 0; + long members = 0, smembers = 0; + int files = 0, tfiles = 0, retval = 0; + if( member_list.damaged ) ignore_errors = true; + if( member_list.tdata ) ignore_trailing = true; + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? 
STDIN_FILENO : + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + close( infd ); + continue; + } + if( !safe_seek( infd, 0 ) ) cleanup_and_fail( 1 ); + const long blocks = lzip_index.blocks( false ); // not counting tdata + long long stream_pos = 0; // first pos not yet read from file + long gaps = 0; + const long prev_members = members, prev_smembers = smembers; + const unsigned long long prev_stripped_size = stripped_size; + for( long j = 0; j < lzip_index.members(); ++j ) // copy members and gaps + { + const Block & mb = lzip_index.mblock( j ); + if( mb.pos() > stream_pos ) // gap + { + const bool in = member_list.damaged || + member_list.includes( j + gaps, blocks ); + if( in == !strip ) + { + if( !safe_seek( infd, stream_pos ) || + !copy_file( infd, outfd, mb.pos() - stream_pos ) ) + cleanup_and_fail( 1 ); + copied_size += mb.pos() - stream_pos; ++members; + } + else { stripped_size += mb.pos() - stream_pos; ++smembers; } + ++gaps; + } + bool in = member_list.includes( j + gaps, blocks ); // member + if( !in && member_list.damaged ) + { + if( !safe_seek( infd, mb.pos() ) ) cleanup_and_fail( 1 ); + in = ( test_member_from_file( infd, mb.size() ) != 0 ); // damaged + } + if( in == !strip ) + { + if( !safe_seek( infd, mb.pos() ) || + !copy_file( infd, outfd, mb.size() ) ) cleanup_and_fail( 1 ); + copied_size += mb.size(); ++members; + } + else { stripped_size += mb.size(); ++smembers; } + stream_pos = mb.end(); + } + if( strip && members == prev_members ) // all members were stripped + { if( verbosity >= 1 ) + show_file_error( input_filename, "All members stripped, skipping." 
); + stripped_size = prev_stripped_size; smembers = prev_smembers; + close( infd ); continue; } + if( ( !strip && members > prev_members ) || + ( strip && smembers > prev_smembers ) ) ++files; + // copy trailing data + const unsigned long long cdata_size = lzip_index.cdata_size(); + const long long trailing_size = lzip_index.file_size() - cdata_size; + if( member_list.tdata == !strip && trailing_size > 0 && + ( !strip || i + 1 >= filenames.size() ) ) // strip all but last + { + if( !safe_seek( infd, cdata_size ) || + !copy_file( infd, outfd, trailing_size ) ) cleanup_and_fail( 1 ); + copied_tsize += trailing_size; + } + else if( trailing_size > 0 ) { stripped_tsize += trailing_size; ++tfiles; } + close( infd ); + } + if( close_outstream( 0 ) != 0 ) set_retval( retval, 1 ); + if( verbosity >= 1 ) + { + if( !strip ) + { + if( member_list.damaged || member_list.range() ) + std::fprintf( stderr, "%llu bytes dumped from %ld %s from %d %s.\n", + copied_size, + members, ( members == 1 ) ? "member" : "members", + files, ( files == 1 ) ? "file" : "files" ); + if( member_list.tdata ) + std::fprintf( stderr, "%llu trailing bytes dumped.\n", copied_tsize ); + } + else + { + if( member_list.damaged || member_list.range() ) + std::fprintf( stderr, "%llu bytes stripped from %ld %s from %d %s.\n", + stripped_size, + smembers, ( smembers == 1 ) ? "member" : "members", + files, ( files == 1 ) ? "file" : "files" ); + if( member_list.tdata ) + std::fprintf( stderr, "%llu trailing bytes stripped from %d %s.\n", + stripped_tsize, tfiles, ( tfiles == 1 ) ? 
"file" : "files" ); + } + } + return retval; + } + + +int remove_members( const std::vector< std::string > & filenames, + const Member_list & member_list, bool ignore_errors, + bool ignore_trailing, const bool loose_trailing ) + { + unsigned long long removed_size = 0, removed_tsize = 0; + long members = 0; + int files = 0, tfiles = 0, retval = 0; + if( member_list.damaged ) ignore_errors = true; + if( member_list.tdata ) ignore_trailing = true; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const char * const filename = filenames[i].c_str(); + struct stat in_stats, dummy_stats; + const int infd = open_instream( filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { + show_file_error( filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + close( infd ); + continue; + } + const int fd = open_truncable_stream( filename, &dummy_stats ); + if( fd < 0 ) { close( infd ); set_retval( retval, 1 ); continue; } + + if( !safe_seek( infd, 0 ) ) return 1; + const long blocks = lzip_index.blocks( false ); // not counting tdata + long long stream_pos = 0; // first pos not yet written to file + long gaps = 0; + bool error = false; + const long prev_members = members; + for( long j = 0; j < lzip_index.members(); ++j ) // copy members and gaps + { + const Block & mb = lzip_index.mblock( j ); + const long long prev_end = (j > 0) ? 
lzip_index.mblock(j - 1).end() : 0; + if( mb.pos() > prev_end ) // gap + { + if( !member_list.damaged && !member_list.includes( j + gaps, blocks ) ) + { + if( stream_pos != prev_end && + ( !safe_seek( infd, prev_end ) || + !safe_seek( fd, stream_pos ) || + !copy_file( infd, fd, mb.pos() - prev_end ) ) ) + { error = true; set_retval( retval, 1 ); break; } + stream_pos += mb.pos() - prev_end; + } + else ++members; + ++gaps; + } + bool in = member_list.includes( j + gaps, blocks ); // member + if( !in && member_list.damaged ) + { + if( !safe_seek( infd, mb.pos() ) ) + { error = true; set_retval( retval, 1 ); break; } + in = ( test_member_from_file( infd, mb.size() ) != 0 ); // damaged + } + if( !in ) + { + if( stream_pos != mb.pos() && + ( !safe_seek( infd, mb.pos() ) || + !safe_seek( fd, stream_pos ) || + !copy_file( infd, fd, mb.size() ) ) ) + { error = true; set_retval( retval, 1 ); break; } + stream_pos += mb.size(); + } + else ++members; + } + if( error ) { close( fd ); close( infd ); break; } + if( stream_pos == 0 ) // all members were removed + { show_file_error( filename, "All members would be removed, skipping." 
); + close( fd ); close( infd ); set_retval( retval, 2 ); + members = prev_members; continue; } + const long long cdata_size = lzip_index.cdata_size(); + if( cdata_size > stream_pos ) + { removed_size += cdata_size - stream_pos; ++files; } + const long long file_size = lzip_index.file_size(); + const long long trailing_size = file_size - cdata_size; + if( trailing_size > 0 ) + { + if( !member_list.tdata ) // copy trailing data + { + if( stream_pos != cdata_size && + ( !safe_seek( infd, cdata_size ) || + !safe_seek( fd, stream_pos ) || + !copy_file( infd, fd, trailing_size ) ) ) + { close( fd ); close( infd ); set_retval( retval, 1 ); break; } + stream_pos += trailing_size; + } + else { removed_tsize += trailing_size; ++tfiles; } + } + if( stream_pos >= file_size ) // no members were removed + { close( fd ); close( infd ); continue; } + int result; + do result = ftruncate( fd, stream_pos ); + while( result != 0 && errno == EINTR ); + if( result != 0 ) + { + show_file_error( filename, "Can't truncate file", errno ); + close( fd ); close( infd ); set_retval( retval, 1 ); break; + } + if( close( fd ) != 0 || close( infd ) != 0 ) + { + show_file_error( filename, "Error closing file", errno ); + set_retval( retval, 1 ); break; + } + struct utimbuf t; + t.actime = in_stats.st_atime; + t.modtime = in_stats.st_mtime; + utime( filename, &t ); + } + if( verbosity >= 1 ) + { + if( member_list.damaged || member_list.range() ) + std::fprintf( stderr, "%llu bytes removed from %ld %s from %d %s.\n", + removed_size, + members, ( members == 1 ) ? "member" : "members", + files, ( files == 1 ) ? "file" : "files" ); + if( member_list.tdata ) + std::fprintf( stderr, "%llu trailing bytes removed from %d %s.\n", + removed_tsize, tfiles, ( tfiles == 1 ) ? "file" : "files" ); + } + return retval; + } @@ -0,0 +1,125 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "lzip_index.h" + + +namespace { + +void list_line( const unsigned long long uncomp_size, + const unsigned long long comp_size, + const char * const input_filename ) + { + if( uncomp_size > 0 ) + std::printf( "%14llu %14llu %6.2f%% %s\n", uncomp_size, comp_size, + 100.0 - ( ( 100.0 * comp_size ) / uncomp_size ), + input_filename ); + else + std::printf( "%14llu %14llu -INF%% %s\n", uncomp_size, comp_size, + input_filename ); + } + +} // end namespace + + +int list_files( const std::vector< std::string > & filenames, + const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ) + { + unsigned long long total_comp = 0, total_uncomp = 0; + int files = 0, retval = 0; + bool first_post = true; + bool stdin_used = false; + + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? 
STDIN_FILENO : + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + close( infd ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + continue; + } + if( verbosity < 0 ) continue; + const unsigned long long udata_size = lzip_index.udata_size(); + const unsigned long long cdata_size = lzip_index.cdata_size(); + total_comp += cdata_size; total_uncomp += udata_size; ++files; + const long members = lzip_index.members(); + if( first_post ) + { + first_post = false; + if( verbosity >= 1 ) std::fputs( " dict memb trail ", stdout ); + std::fputs( " uncompressed compressed saved name\n", stdout ); + } + if( verbosity >= 1 ) + std::printf( "%s %5ld %6lld ", format_ds( lzip_index.dictionary_size() ), + members, lzip_index.file_size() - cdata_size ); + list_line( udata_size, cdata_size, input_filename ); + + if( verbosity >= 2 && ( members > 1 || + ( members == 1 && lzip_index.mblock( 0 ).pos() > 0 ) ) ) + { + std::fputs( " member data_pos data_size member_pos member_size\n", stdout ); + long long prev_end = 0; + for( long i = 0, gaps = 0; i < members; ++i ) + { + const Block & db = lzip_index.dblock( i ); + const Block & mb = lzip_index.mblock( i ); + if( mb.pos() > prev_end ) + { + std::printf( " gap - - %14llu %14llu\n", + prev_end, mb.pos() - prev_end ); + ++gaps; + } + std::printf( "%6ld %14llu %14llu %14llu %14llu\n", + i + gaps + 1, db.pos(), db.size(), mb.pos(), mb.size() ); + prev_end = mb.end(); + } + first_post = true; // reprint heading after list of members + } + std::fflush( stdout ); + } + if( verbosity >= 0 && files > 1 ) + { + if( verbosity >= 1 ) std::fputs( " ", stdout ); + list_line( total_uncomp, total_comp, "(totals)" ); + std::fflush( stdout ); + } + return retval; + } diff --git 
a/lunzcrash.cc b/lunzcrash.cc new file mode 100644 index 0000000..577d355 --- /dev/null +++ b/lunzcrash.cc @@ -0,0 +1,370 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "md5.h" +#include "mtester.h" +#include "lzip_index.h" + + +namespace { + +bool verify_member( const uint8_t * const mbuffer, const long long msize, + const unsigned dictionary_size, const char * const name, + uint8_t digest[16] ) + { + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, &md5sum ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { show_file_error( name, "Error verifying input file." 
); return false; } + md5sum.md5_finish( digest ); + return true; + } + + +bool compare_member( const uint8_t * const mbuffer, const long long msize, + const unsigned dictionary_size, + const long long byte_pos, const uint8_t digest[16] ) + { + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, &md5sum ); + bool error = ( mtester.test_member() != 0 || !mtester.finished() ); + if( !error ) + { + uint8_t new_digest[16]; + md5sum.md5_finish( new_digest ); + if( std::memcmp( digest, new_digest, 16 ) != 0 ) error = true; + } + if( error && verbosity >= 0 ) + std::printf( "byte %llu comparison failed\n", byte_pos ); + return !error; + } + + +int test_member_rest( const LZ_mtester & master, uint8_t * const buffer2, + long * const failure_posp, + const unsigned long long byte_pos ) + { + LZ_mtester mtester( master ); // tester with external buffer + mtester.duplicate_buffer( buffer2 ); + int result = mtester.test_member( LLONG_MAX, LLONG_MAX, stdout, byte_pos ); + if( result == 0 && !mtester.finished() ) result = -1; // false negative + if( result != 0 ) *failure_posp = mtester.member_position(); + return result; + } + + +long next_pct_pos( const Lzip_index & lzip_index, const int i, const int pct, + const int sector_size = 0 ) + { + if( pct <= 0 ) return 0; + const long long cdata_size = lzip_index.cdata_size() - sector_size; + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size() - sector_size; + long long pct_pos = (long long)( cdata_size / ( 100.0 / pct ) ); + + if( pct_pos <= mpos ) pct_pos = 0; + else if( pct_pos == cdata_size ) pct_pos = msize - 21; // 100% + else if( pct_pos >= mpos + msize ) pct_pos = msize; + else pct_pos -= mpos; + return pct_pos; + } + +} // end namespace + + +/* Test 1-bit errors in LZMA streams in file. + Unless verbosity >= 1, print only the bytes with interesting results. 
*/ +int lunzcrash_bit( const char * const input_filename ) + { + struct stat in_stats; // not used + const int infd = open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename, lzip_index.error().c_str() ); + return lzip_index.retval(); } + if( verbosity >= 2 ) printf( "Testing file '%s'\n", input_filename ); + + const long long cdata_size = lzip_index.cdata_size(); + long positions = 0, decompressions = 0, successes = 0, failed_comparisons = 0; + int pct = ( cdata_size >= 1000 && isatty( STDERR_FILENO ) ) ? 0 : 100; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) return 1; + uint8_t md5_orig[16]; + if( !verify_member( mbuffer, msize, dictionary_size, input_filename, + md5_orig ) ) return 2; + long pct_pos = next_pct_pos( lzip_index, i, pct ); + long pos = Lzip_header::size + 1, printed = 0; // last pos printed + const long end = msize - 20; + if( verbosity == 0 ) // give a clue of the range being tested + std::printf( "Testing bytes %llu to %llu\n", mpos + pos, mpos + end - 1 ); + LZ_mtester master( mbuffer, msize, dictionary_size ); + uint8_t * const buffer2 = new uint8_t[dictionary_size]; + for( ; pos < end; ++pos ) + { + const long pos_limit = pos - 16; + if( pos_limit > 0 && master.test_member( pos_limit ) != -1 ) + { show_error( "Can't advance master." 
); return 1; } + if( verbosity >= 0 && pos >= pct_pos ) + { std::fprintf( stderr, "\r%3u%% done\r", pct ); ++pct; + pct_pos = next_pct_pos( lzip_index, i, pct ); } + if( verbosity >= 1 ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + ++positions; + const uint8_t byte = mbuffer[pos]; + for( uint8_t mask = 1; mask != 0; mask <<= 1 ) + { + ++decompressions; + mbuffer[pos] ^= mask; + long failure_pos = 0; + const int result = test_member_rest( master, buffer2, &failure_pos, + ( printed < pos ) ? mpos + pos : 0 ); + if( result <= 0 ) + { + ++successes; + if( verbosity >= 0 ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + std::printf( "0x%02X (0x%02X^0x%02X) passed the test%s", + mbuffer[pos], byte, mask, ( result < 0 ) ? "" : "\n" ); + if( result < 0 ) + std::printf( ", but only consumed %lu bytes of %llu\n", + failure_pos, msize ); + } + if( !compare_member( mbuffer, msize, dictionary_size, mpos + pos, + md5_orig ) ) ++failed_comparisons; + } + else if( result == 1 ) + { + if( verbosity >= 2 || + ( verbosity >= 1 && failure_pos - pos >= 10000 ) || + ( verbosity >= 0 && failure_pos - pos >= 50000 ) ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + std::printf( "Decoder error at pos %llu\n", mpos + failure_pos ); + } + } + else if( result == 3 || result == 4 ) // test_member printed the error + { if( verbosity >= 0 && printed < pos ) printed = pos; } + else if( verbosity >= 0 ) + { + if( printed < pos ) + { std::printf( "byte %llu\n", mpos + pos ); printed = pos; } + if( result == 2 ) + std::printf( "File ends unexpectedly at pos %llu\n", + mpos + failure_pos ); + else + std::printf( "Unknown error code '%d'\n", result ); + } + mbuffer[pos] ^= mask; + } + } + delete[] buffer2; + if( !compare_member( mbuffer, msize, dictionary_size, mpos + pos, md5_orig ) ) + internal_error( "Some byte was not properly restored." 
); + delete[] mbuffer; + } + + if( verbosity >= 0 ) + { + std::printf( "\n%9ld bytes tested\n%9ld total decompressions" + "\n%9ld decompressions returned with zero status", + positions, decompressions, successes ); + if( successes > 0 ) + { + if( failed_comparisons > 0 ) + std::printf( ", of which\n%9ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stdout ); + } + else std::fputc( '\n', stdout ); + } + return 0; + } + + +/* Test zeroed blocks of given size in LZMA streams in file. + Unless verbosity >= 1, print only the bytes with interesting results. */ +int lunzcrash_block( const char * const input_filename, const int sector_size ) + { + struct stat in_stats; // not used + const int infd = open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename, lzip_index.error().c_str() ); + return lzip_index.retval(); } + if( verbosity >= 2 ) printf( "Testing file '%s'\n", input_filename ); + + const long long cdata_size = lzip_index.cdata_size(); + long decompressions = 0, successes = 0, failed_comparisons = 0; + int pct = ( cdata_size >= 1000 && isatty( STDERR_FILENO ) ) ? 
0 : 100; + uint8_t * const block = new uint8_t[sector_size]; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + long pos = Lzip_header::size + 1; + const long end = msize - sector_size - 20; + if( end <= pos ) continue; // sector_size larger than LZMA stream + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) return 1; + uint8_t md5_orig[16]; + if( !verify_member( mbuffer, msize, dictionary_size, input_filename, + md5_orig ) ) return 2; + long pct_pos = next_pct_pos( lzip_index, i, pct, sector_size ); + if( verbosity >= 0 ) // give a clue of the range being tested + std::printf( "Testing blocks of size %u from pos %llu to %llu\n", + sector_size, mpos + pos, mpos + end - 1 ); + LZ_mtester master( mbuffer, msize, dictionary_size ); + uint8_t * const buffer2 = new uint8_t[dictionary_size]; + for( ; pos < end; ++pos ) + { + const long pos_limit = pos - 16; + if( pos_limit > 0 && master.test_member( pos_limit ) != -1 ) + { show_error( "Can't advance master." ); return 1; } + if( verbosity >= 0 && pos >= pct_pos ) + { std::fprintf( stderr, "\r%3u%% done\r", pct ); ++pct; + pct_pos = next_pct_pos( lzip_index, i, pct, sector_size ); } + std::memcpy( block, mbuffer + pos, sector_size ); // save block + std::memset( mbuffer + pos, 0, sector_size ); + ++decompressions; + long failure_pos = 0; + const int result = + test_member_rest( master, buffer2, &failure_pos, mpos + pos ); + if( result <= 0 ) + { + ++successes; + if( verbosity >= 0 ) + { + std::printf( "block %llu,%u passed the test%s", + mpos + pos, sector_size, ( result < 0 ) ? 
"" : "\n" ); + if( result < 0 ) + std::printf( ", but only consumed %lu bytes of %llu\n", + failure_pos, msize ); + } + if( !compare_member( mbuffer, msize, dictionary_size, mpos + pos, + md5_orig ) ) ++failed_comparisons; + } + else if( result == 1 ) + { + if( verbosity >= 3 || + ( verbosity >= 2 && failure_pos - pos >= sector_size ) || + ( verbosity >= 1 && failure_pos - pos >= 10000 ) || + ( verbosity >= 0 && failure_pos - pos >= 50000 ) ) + std::printf( "block %llu,%u\nDecoder error at pos %llu\n", + mpos + pos, sector_size, mpos + failure_pos ); + } + else if( result == 3 || result == 4 ) // test_member printed the error + {} + else if( verbosity >= 0 ) + { + std::printf( "block %llu,%u\n", mpos + pos, sector_size ); + if( result == 2 ) + std::printf( "File ends unexpectedly at pos %llu\n", + mpos + failure_pos ); + else + std::printf( "Unknown error code '%d'\n", result ); + } + std::memcpy( mbuffer + pos, block, sector_size ); // restore block + } + delete[] buffer2; + if( !compare_member( mbuffer, msize, dictionary_size, mpos + pos, md5_orig ) ) + internal_error( "Block was not properly restored." 
); + delete[] mbuffer; + } + delete[] block; + + if( verbosity >= 0 ) + { + std::printf( "\n%9ld blocks tested\n%9ld total decompressions" + "\n%9ld decompressions returned with zero status", + decompressions, decompressions, successes ); + if( successes > 0 ) + { + if( failed_comparisons > 0 ) + std::printf( ", of which\n%9ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stdout ); + } + else std::fputc( '\n', stdout ); + } + return 0; + } + + +int md5sum_files( const std::vector< std::string > & filenames ) + { + int retval = 0; + bool stdin_used = false; + + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? STDIN_FILENO : + open_instream( input_filename, &in_stats, false ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + enum { buffer_size = 16384 }; + uint8_t buffer[buffer_size], md5_digest[16]; + MD5SUM md5sum; + while( true ) + { + const int len = readblock( infd, buffer, buffer_size ); + if( len != buffer_size && errno ) throw Error( "Read error" ); + if( len > 0 ) md5sum.md5_update( buffer, len ); + if( len < buffer_size ) break; + } + md5sum.md5_finish( md5_digest ); + if( close( infd ) != 0 ) + { show_file_error( input_filename, "Error closing input file", errno ); + return 1; } + + for( int i = 0; i < 16; ++i ) std::printf( "%02x", md5_digest[i] ); + std::printf( " %s\n", input_filename ); + std::fflush( stdout ); + } + return retval; + } @@ -0,0 +1,523 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "common.h"

/* Small state machine holding one of 'states' (12) values, starting at 0.
   Values below 7 count as "char" states (see is_char).
   NOTE(review): presumably the LZMA coder state recording the kind of the
   latest packets; confirm against the encoder/decoder that use it. */
class State
  {
  int st;                       // current state, 0 to states - 1

public:
  enum { states = 12 };
  State() : st( 0 ) {}
  int operator()() const { return st; }         // current state value
  bool is_char() const { return st < 7; }

  // Move to the state given by the transition table below.
  void set_char()
    {
    static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
    st = next[st];
    }
  // Same as set_char; returns true if the new state is below 4.
  bool is_char_set_char() { set_char(); return st < 4; }
  // Each setter below picks its target depending on whether the current
  // state is a "char" state (st < 7) or not.
  void set_match()     { st = ( st < 7 ) ? 7 : 10; }
  void set_rep()       { st = ( st < 7 ) ? 8 : 11; }
  void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
  };


// Size limits and model dimensions. Derived values are computed from the
// *_bits constants so that they stay consistent with each other.
enum {
  min_dictionary_bits = 12,
  min_dictionary_size = 1 << min_dictionary_bits,       // >= modeled_distances
  max_dictionary_bits = 29,
  max_dictionary_size = 1 << max_dictionary_bits,
  min_member_size = 36,
  literal_context_bits = 3,
  literal_pos_state_bits = 0,                           // not used
  pos_state_bits = 2,
  pos_states = 1 << pos_state_bits,
  pos_state_mask = pos_states - 1,

  len_states = 4,
  dis_slot_bits = 6,
  start_dis_model = 4,
  end_dis_model = 14,
  modeled_distances = 1 << (end_dis_model / 2),         // 128
  dis_align_bits = 4,
  dis_align_size = 1 << dis_align_bits,

  len_low_bits = 3,
  len_mid_bits = 3,
  len_high_bits = 8,
  len_low_symbols = 1 << len_low_bits,
  len_mid_symbols = 1 << len_mid_bits,
  len_high_symbols = 1 << len_high_bits,
  max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,

  min_match_len = 2,                                    // must be 2
  max_match_len = min_match_len + max_len_symbols - 1,  // 273
  min_match_len_limit = 5 };

// Map a match length to a length state: len - min_match_len, capped at
// len_states - 1.
inline int get_len_state( const int len )
  { return std::min( len - min_match_len, len_states - 1 ); }

// Literal state: the literal_context_bits most significant bits of the
// previous byte.
inline int get_lit_state( const uint8_t prev_byte )
  { return prev_byte >> ( 8 - literal_context_bits ); }


enum { bit_model_move_bits = 5,
       bit_model_total_bits = 11,
       bit_model_total = 1 << bit_model_total_bits };

// One adaptive probability, initialized to half of bit_model_total.
struct Bit_model
  {
  int probability;
  Bit_model() : probability( bit_model_total / 2 ) {}
  };

// Bit models used to code a length: two choice bits plus the models for
// the low, mid and high ranges (low and mid are indexed by pos_state).
struct Len_model
  {
  Bit_model choice1;
  Bit_model choice2;
  Bit_model bm_low[pos_states][len_low_symbols];
  Bit_model bm_mid[pos_states][len_mid_symbols];
  Bit_model bm_high[len_high_symbols];
  };


// defined in main.cc
extern int verbosity;

/* Holds the name of the file being processed and a padded version of it,
   so that messages for names of different lengths line up.
   The name "-" (or an empty name) is shown as "(stdin)".
   NOTE(review): the literal assigned to padded_name in set_name may have
   had a run of spaces collapsed by text extraction; confirm upstream. */
class Pretty_print              // requires global var 'int verbosity'
  {
  std::string name_;
  std::string padded_name;
  const char * const stdin_name;
  unsigned longest_name;        // length of the longest name, for padding
  mutable bool first_post;      // set by set_name and reset

public:
  // Compute the length of the longest name in 'filenames'.
  // The lengths are not computed if verbosity <= 0.
  Pretty_print( const std::vector< std::string > & filenames )
    : stdin_name( "(stdin)" ), longest_name( 0 ), first_post( false )
    {
    if( verbosity <= 0 ) return;
    const unsigned stdin_name_len = std::strlen( stdin_name );
    for( unsigned i = 0; i < filenames.size(); ++i )
      {
      const std::string & s = filenames[i];
      const unsigned len = ( s == "-" ) ? stdin_name_len : s.size();
      if( longest_name < len ) longest_name = len;
      }
    if( longest_name == 0 ) longest_name = stdin_name_len;
    }

  // Single-file variant; also sets the current name to 'filename'.
  Pretty_print( const std::string & filename )
    : stdin_name( "(stdin)" ), first_post( false )
    {
    const unsigned stdin_name_len = std::strlen( stdin_name );
    longest_name = ( filename == "-" ) ? stdin_name_len : filename.size();
    if( longest_name == 0 ) longest_name = stdin_name_len;
    set_name( filename );
    }

  // Set the current name and rebuild the padded name from it.
  void set_name( const std::string & filename )
    {
    if( filename.size() && filename != "-" ) name_ = filename;
    else name_ = stdin_name;
    padded_name = " "; padded_name += name_; padded_name += ": ";
    if( longest_name > name_.size() )
      padded_name.append( longest_name - name_.size(), ' ' );
    first_post = true;
    }

  void reset() const { if( name_.size() ) first_post = true; }
  const char * name() const { return name_.c_str(); }
  // defined elsewhere
  void operator()( const char * const msg = 0, FILE * const f = stderr ) const;
  };


class CRC32
  {
  uint32_t data[256];           // Table of CRCs of all 8-bit messages.
+ +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + uint32_t operator[]( const uint8_t byte ) const { return data[byte]; } + + void update_byte( uint32_t & crc, const uint8_t byte ) const + { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); } + + // about as fast as it is possible without messing with endianness + void update_buf( uint32_t & crc, const uint8_t * const buffer, + const int size ) const + { + uint32_t c = crc; + for( int i = 0; i < size; ++i ) + c = data[(c^buffer[i])&0xFF] ^ ( c >> 8 ); + crc = c; + } + + uint32_t compute_crc( const uint8_t * const buffer, + const long long size ) const + { + uint32_t crc = 0xFFFFFFFFU; + for( long long i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + return crc ^ 0xFFFFFFFFU; + } + }; + +extern const CRC32 crc32; + + +inline bool isvalid_ds( const unsigned dictionary_size ) + { return ( dictionary_size >= min_dictionary_size && + dictionary_size <= max_dictionary_size ); } + + +inline int real_bits( unsigned value ) + { + int bits = 0; + while( value > 0 ) { value >>= 1; ++bits; } + return bits; + } + + +const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" + +struct Lzip_header + { + uint8_t data[6]; // 0-3 magic bytes + // 4 version + // 5 coded dictionary size + enum { size = 6 }; + + void set_magic() { std::memcpy( data, lzip_magic, 4 ); data[4] = 1; } + bool verify_magic() const + { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); } + + bool verify_prefix( const int sz ) const // detect (truncated) header + { + for( int i = 0; i < sz && i < 4; ++i ) + if( data[i] != lzip_magic[i] ) return false; + return ( sz > 0 ); + } + bool verify_corrupt() const // detect corrupt header + { + int matches = 0; + for( int i = 0; i < 4; ++i ) + if( data[i] == lzip_magic[i] ) ++matches; + return ( matches > 1 && matches < 4 ); + } + + 
uint8_t version() const { return data[4]; } + bool verify_version() const { return ( data[4] == 1 ); } + + unsigned dictionary_size() const + { + unsigned sz = ( 1 << ( data[5] & 0x1F ) ); + if( sz > min_dictionary_size ) + sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 ); + return sz; + } + + bool dictionary_size( const unsigned sz ) + { + if( !isvalid_ds( sz ) ) return false; + data[5] = real_bits( sz - 1 ); + if( sz > min_dictionary_size ) + { + const unsigned base_size = 1 << data[5]; + const unsigned fraction = base_size / 16; + for( unsigned i = 7; i >= 1; --i ) + if( base_size - ( i * fraction ) >= sz ) + { data[5] |= ( i << 5 ); break; } + } + return true; + } + + bool verify( const bool ignore_bad_ds ) const + { return verify_magic() && verify_version() && + ( ignore_bad_ds || isvalid_ds( dictionary_size() ) ); } + }; + + +struct Lzip_trailer + { + uint8_t data[20]; // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + enum { size = 20 }; + + unsigned data_crc() const + { + unsigned tmp = 0; + for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + void data_crc( unsigned crc ) + { for( int i = 0; i <= 3; ++i ) { data[i] = (uint8_t)crc; crc >>= 8; } } + + unsigned long long data_size() const + { + unsigned long long tmp = 0; + for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + void data_size( unsigned long long sz ) + { for( int i = 4; i <= 11; ++i ) { data[i] = (uint8_t)sz; sz >>= 8; } } + + unsigned long long member_size() const + { + unsigned long long tmp = 0; + for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; } + return tmp; + } + + void member_size( unsigned long long sz ) + { for( int i = 12; i <= 19; ++i ) { data[i] = (uint8_t)sz; sz >>= 8; } } + + bool verify_consistency() const // check internal consistency + { + const unsigned crc = data_crc(); + const unsigned long long dsize = data_size(); + if( ( 
crc == 0 ) != ( dsize == 0 ) ) return false; + const unsigned long long msize = member_size(); + if( msize < min_member_size ) return false; + const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size; + if( mlimit > dsize && msize > mlimit ) return false; + const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1; + if( dlimit > msize && dsize > dlimit ) return false; + return true; + } + }; + + +#ifndef INT64_MAX +#define INT64_MAX 0x7FFFFFFFFFFFFFFFLL +#endif + +class Block + { + long long pos_, size_; // pos + size <= INT64_MAX + +public: + Block( const long long p, const long long s ) : pos_( p ), size_( s ) {} + + long long pos() const { return pos_; } + long long size() const { return size_; } + long long end() const { return pos_ + size_; } + + void pos( const long long p ) { pos_ = p; } + void size( const long long s ) { size_ = s; } + + bool operator==( const Block & b ) const + { return pos_ == b.pos_ && size_ == b.size_; } + bool operator!=( const Block & b ) const + { return pos_ != b.pos_ || size_ != b.size_; } + + bool operator<( const Block & b ) const { return pos_ < b.pos_; } + + bool includes( const long long pos ) const + { return ( pos_ <= pos && end() > pos ); } + bool overlaps( const Block & b ) const + { return ( pos_ < b.end() && b.pos_ < end() ); } + bool overlaps( const long long pos, const long long size ) const + { return ( pos_ < pos + size && pos < end() ); } + + void shift( Block & b ) { ++size_; ++b.pos_; --b.size_; } + Block split( const long long pos ); + }; + + +struct Member_list // members/gaps/tdata to be dumped/removed/stripped + { + bool damaged; + bool tdata; + bool in, rin; + std::vector< Block > range_vector, rrange_vector; + + Member_list() : damaged( false ), tdata( false ), in( true ), rin( true ) {} + void parse_ml( const char * p, const char * const option_name ); + + bool range() const { return range_vector.size() || rrange_vector.size(); } + + // blocks is the sum of members + gaps, excluding 
trailing data + bool includes( const long i, const long blocks ) const + { + for( unsigned j = 0; j < range_vector.size(); ++j ) + { + if( range_vector[j].pos() > i ) break; + if( range_vector[j].end() > i ) return in; + } + if( i >= 0 && i < blocks ) + for( unsigned j = 0; j < rrange_vector.size(); ++j ) + { + if( rrange_vector[j].pos() > blocks - i - 1 ) break; + if( rrange_vector[j].end() > blocks - i - 1 ) return rin; + } + return !in || !rin; + } + }; + + +struct Error + { + const char * const msg; + explicit Error( const char * const s ) : msg( s ) {} + }; + +inline unsigned long long positive_diff( const unsigned long long x, + const unsigned long long y ) + { return ( ( x > y ) ? x - y : 0 ); } + +inline void set_retval( int & retval, const int new_val ) + { if( retval < new_val ) retval = new_val; } + +const char * const bad_magic_msg = "Bad magic number (file not in lzip format)."; +const char * const bad_dict_msg = "Invalid dictionary size in member header."; +const char * const corrupt_mm_msg = "Corrupt header in multimember file."; +const char * const trailing_msg = "Trailing data not allowed."; + +// defined in alone_to_lz.cc +int alone_to_lz( const int infd, const Pretty_print & pp ); + +// defined in decoder.cc +long long readblock( const int fd, uint8_t * const buf, const long long size ); +long long writeblock( const int fd, const uint8_t * const buf, + const long long size ); + +// defined in dump_remove.cc +int dump_members( const std::vector< std::string > & filenames, + const std::string & default_output_filename, + const Member_list & member_list, const bool force, + bool ignore_errors, bool ignore_trailing, + const bool loose_trailing, const bool strip, + const bool to_stdout ); +int remove_members( const std::vector< std::string > & filenames, + const Member_list & member_list, bool ignore_errors, + bool ignore_trailing, const bool loose_trailing ); + +// defined in list.cc +int list_files( const std::vector< std::string > & filenames, + 
const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ); + +// defined in lzip_index.cc +int seek_read( const int fd, uint8_t * const buf, const int size, + const long long pos ); + +// defined in lunzcrash.cc +int lunzcrash_bit( const char * const input_filename ); +int lunzcrash_block( const char * const input_filename, const int sector_size ); +int md5sum_files( const std::vector< std::string > & filenames ); + +// defined in main.cc +extern const char * const program_name; +extern std::string output_filename; // global vars for output file +extern int outfd; +struct stat; +const char * bad_version( const unsigned version ); +const char * format_ds( const unsigned dictionary_size ); +void show_header( const unsigned dictionary_size ); +int open_instream( const char * const name, struct stat * const in_statsp, + const bool one_to_one, const bool reg_only = false ); +int open_truncable_stream( const char * const name, + struct stat * const in_statsp ); +bool open_outstream( const bool force, const bool protect, + const bool rw = false, const bool skipping = true ); +bool file_exists( const std::string & filename ); +void cleanup_and_fail( const int retval ); +bool check_tty_out(); +void set_signal_handler(); +int close_outstream( const struct stat * const in_statsp ); +std::string insert_fixed( std::string name ); +void show_2file_error( const char * const msg1, const char * const name1, + const char * const name2, const char * const msg2 ); +class Range_decoder; +void show_dprogress( const unsigned long long cfile_size = 0, + const unsigned long long partial_size = 0, + const Range_decoder * const d = 0, + const Pretty_print * const p = 0 ); + +// defined in merge.cc +bool copy_file( const int infd, const int outfd, + const long long max_size = -1 ); +int test_member_from_file( const int infd, const unsigned long long msize, + long long * const failure_posp = 0 ); +int merge_files( const std::vector< std::string > & filenames, + const 
std::string & default_output_filename, + const char terminator, const bool force ); + +// defined in nrep_stats.cc +int print_nrep_stats( const std::vector< std::string > & filenames, + const int repeated_byte, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ); + +// defined in range_dec.cc +const char * format_num( unsigned long long num, + unsigned long long limit = -1ULL, + const int set_prefix = 0 ); +bool safe_seek( const int fd, const long long pos ); +int range_decompress( const std::string & input_filename, + const std::string & default_output_filename, + Block range, const bool force, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing, + const bool to_stdout ); + +// defined in repair.cc +long long seek_write( const int fd, const uint8_t * const buf, + const long long size, const long long pos ); +uint8_t * read_member( const int infd, const long long mpos, + const long long msize ); +int repair_file( const std::string & input_filename, + const std::string & default_output_filename, + const char terminator, const bool force ); +int debug_delay( const std::string & input_filename, Block range, + const char terminator ); +int debug_repair( const std::string & input_filename, + const Bad_byte & bad_byte, const char terminator ); +int debug_decompress( const std::string & input_filename, + const Bad_byte & bad_byte, const bool show_packets ); + +// defined in reproduce.cc +int reproduce_file( const std::string & input_filename, + const std::string & default_output_filename, + const char * const lzip_name, + const char * const reference_filename, + const int lzip_level, const char terminator, + const bool force ); +int debug_reproduce_file( const std::string & input_filename, + const char * const lzip_name, + const char * const reference_filename, + const Block & range, const int sector_size, + const int lzip_level ); + +// defined in split.cc +int split_file( const std::string & input_filename, + 
const std::string & default_output_filename, const bool force ); diff --git a/lzip_index.cc b/lzip_index.cc new file mode 100644 index 0000000..eff4d05 --- /dev/null +++ b/lzip_index.cc @@ -0,0 +1,358 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> + +#include "lzip.h" +#include "lzip_index.h" + + +int seek_read( const int fd, uint8_t * const buf, const int size, + const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return readblock( fd, buf, size ); + return 0; + } + + +bool Lzip_index::check_header_error( const Lzip_header & header, + const bool ignore_bad_ds ) + { + if( !header.verify_magic() ) + { error_ = bad_magic_msg; retval_ = 2; return true; } + if( !header.verify_version() ) + { error_ = bad_version( header.version() ); retval_ = 2; return true; } + if( !ignore_bad_ds && !isvalid_ds( header.dictionary_size() ) ) + { error_ = bad_dict_msg; retval_ = 2; return true; } + return false; + } + +void Lzip_index::set_errno_error( const char * const msg ) + { + error_ = msg; error_ += std::strerror( errno ); + retval_ = 1; + } + +void Lzip_index::set_num_error( const char * 
const msg, unsigned long long num ) + { + char buf[80]; + snprintf( buf, sizeof buf, "%s%llu", msg, num ); + error_ = buf; + retval_ = 2; + } + + +bool Lzip_index::read_header( const int fd, Lzip_header & header, + const long long pos ) + { + if( seek_read( fd, header.data, Lzip_header::size, pos ) != Lzip_header::size ) + { set_errno_error( "Error reading member header: " ); return false; } + return true; + } + +bool Lzip_index::read_trailer( const int fd, Lzip_trailer & trailer, + const long long pos ) + { + if( seek_read( fd, trailer.data, Lzip_trailer::size, + pos - Lzip_trailer::size ) != Lzip_trailer::size ) + { set_errno_error( "Error reading member trailer: " ); return false; } + return true; + } + + +/* Skip backwards the gap or trailing data ending at pos. + 'ignore_gaps' also ignores format errors and a truncated last member. + If successful, push member preceding gap and set pos to member header. */ +bool Lzip_index::skip_gap( const int fd, unsigned long long & pos, + const bool ignore_trailing, const bool loose_trailing, + const bool ignore_bad_ds, const bool ignore_gaps ) + { + if( pos < min_member_size ) + { + if( ignore_gaps && !member_vector.empty() ) { pos = 0; return true; } + return false; + } + enum { block_size = 16384, + buffer_size = block_size + Lzip_trailer::size - 1 + Lzip_header::size }; + uint8_t buffer[buffer_size]; + int bsize = pos % block_size; // total bytes in buffer + if( bsize <= buffer_size - block_size ) bsize += block_size; + int search_size = bsize; // bytes to search for trailer + int rd_size = bsize; // bytes to read from file + unsigned long long ipos = pos - rd_size; // aligned to block_size + + while( true ) + { + if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) + { set_errno_error( "Error seeking member trailer: " ); return false; } + const uint8_t max_msb = ( ipos + search_size ) >> 56; + for( int i = search_size; i >= Lzip_trailer::size; --i ) + if( buffer[i-1] <= max_msb ) // most significant byte of 
member_size + { + const Lzip_trailer & trailer = + *(const Lzip_trailer *)( buffer + i - Lzip_trailer::size ); + const unsigned long long member_size = trailer.member_size(); + if( member_size == 0 ) // skip trailing zeros + { while( i > Lzip_trailer::size && buffer[i-9] == 0 ) --i; continue; } + if( member_size > ipos + i || !trailer.verify_consistency() ) + continue; + Lzip_header header; + if( !read_header( fd, header, ipos + i - member_size ) ) return false; + if( !header.verify( ignore_bad_ds ) ) continue; + const Lzip_header & header2 = *(const Lzip_header *)( buffer + i ); + const bool full_h2 = bsize - i >= Lzip_header::size; + if( header2.verify_prefix( bsize - i ) ) // next header + { + if( !ignore_gaps && member_vector.empty() ) // last member + { + if( !full_h2 ) error_ = "Last member in input file is truncated."; + else if( !check_header_error( header2, ignore_bad_ds ) ) + error_ = "Last member in input file is truncated or corrupt."; + retval_ = 2; return false; + } + const unsigned dictionary_size = + full_h2 ? 
header2.dictionary_size() : 0; + const unsigned long long member_size = pos - ( ipos + i ); + pos = ipos + i; + // approximate data and member sizes for '-i -D' + member_vector.push_back( Member( 0, member_size, pos, + member_size, dictionary_size ) ); + } + if( !ignore_gaps && member_vector.empty() ) + { + if( !loose_trailing && full_h2 && header2.verify_corrupt() ) + { error_ = corrupt_mm_msg; retval_ = 2; return false; } + if( !ignore_trailing ) + { error_ = trailing_msg; retval_ = 2; return false; } + } + pos = ipos + i - member_size; + const unsigned dictionary_size = header.dictionary_size(); + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + if( dictionary_size_ < dictionary_size ) + dictionary_size_ = dictionary_size; + return true; + } + if( ipos == 0 ) + { + if( ignore_gaps && !member_vector.empty() ) + { + const Lzip_header * header = (const Lzip_header *)buffer; + const unsigned dictionary_size = header->dictionary_size(); + // approximate data and member sizes for '-i -D' + member_vector.push_back( Member( 0, pos, 0, pos, dictionary_size ) ); + pos = 0; return true; + } + set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); + return false; + } + bsize = buffer_size; + search_size = bsize - Lzip_header::size; + rd_size = block_size; + ipos -= rd_size; + std::memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); + } + } + + +Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, + const bool loose_trailing, const bool ignore_bad_ds, + const bool ignore_gaps, const long long max_pos ) + : insize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ), dictionary_size_( 0 ) + { + if( insize < 0 ) + { set_errno_error( "Input file is not seekable: " ); return; } + if( insize < min_member_size ) + { error_ = "Input file is too short."; retval_ = 2; return; } + if( insize > INT64_MAX ) + { error_ = "Input file is too long (2^63 bytes or more)."; + retval_ = 2; return; } + + Lzip_header 
header; + if( !read_header( infd, header, 0 ) ) return; + if( check_header_error( header, ignore_bad_ds ) ) return; + + // pos always points to a header or to ( EOF || max_pos ) + unsigned long long pos = ( max_pos > 0 ) ? max_pos : insize; + while( pos >= min_member_size ) + { + Lzip_trailer trailer; + if( !read_trailer( infd, trailer, pos ) ) break; + const unsigned long long member_size = trailer.member_size(); + // if gaps are being ignored, verify consistency of last trailer only. + if( member_size > pos || member_size < min_member_size || + ( ( !ignore_gaps || member_vector.empty() ) && + !trailer.verify_consistency() ) ) // bad trailer + { + if( ignore_gaps || member_vector.empty() ) + { if( skip_gap( infd, pos, ignore_trailing, loose_trailing, + ignore_bad_ds, ignore_gaps ) ) continue; else return; } + set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); + break; + } + if( !read_header( infd, header, pos - member_size ) ) break; + if( !header.verify( ignore_bad_ds ) ) // bad header + { + if( ignore_gaps || member_vector.empty() ) + { if( skip_gap( infd, pos, ignore_trailing, loose_trailing, + ignore_bad_ds, ignore_gaps ) ) continue; else return; } + set_num_error( "Bad header at pos ", pos - member_size ); + break; + } + pos -= member_size; + const unsigned dictionary_size = header.dictionary_size(); + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, dictionary_size ) ); + if( dictionary_size_ < dictionary_size ) + dictionary_size_ = dictionary_size; + } + // block at pos == 0 must be a member unless shorter than min_member_size + if( pos >= min_member_size || ( pos != 0 && !ignore_gaps ) || + member_vector.empty() ) + { + member_vector.clear(); + if( retval_ == 0 ) { error_ = "Can't create file index."; retval_ = 2; } + return; + } + std::reverse( member_vector.begin(), member_vector.end() ); + for( unsigned long i = 0; ; ++i ) + { + const long long end = member_vector[i].dblock.end(); + if( end < 0 || end > 
INT64_MAX ) + { + member_vector.clear(); + error_ = "Data in input file is too long (2^63 bytes or more)."; + retval_ = 2; return; + } + if( i + 1 >= member_vector.size() ) break; + member_vector[i+1].dblock.pos( end ); + if( member_vector[i].mblock.end() > member_vector[i+1].mblock.pos() ) + internal_error( "two mblocks overlap after constructing a Lzip_index." ); + } + } + + +// All files in 'infd_vector' must be at least 'fsize' bytes long. +Lzip_index::Lzip_index( const std::vector< int > & infd_vector, + const long long fsize ) + : insize( fsize ), retval_( 0 ), dictionary_size_( 0 ) // DS not used + { + if( insize < 0 ) + { set_errno_error( "Input file is not seekable: " ); return; } + if( insize < min_member_size ) + { error_ = "Input file is too short."; retval_ = 2; return; } + if( insize > INT64_MAX ) + { error_ = "Input file is too long (2^63 bytes or more)."; + retval_ = 2; return; } + + const int files = infd_vector.size(); + Lzip_header header; + bool done = false; + for( int i = 0; i < files && !done; ++i ) + { + const int infd = infd_vector[i]; + if( !read_header( infd, header, 0 ) ) return; + if( header.verify_magic() && header.verify_version() ) done = true; + } + if( !done ) + { error_ = bad_magic_msg; retval_ = 2; return; } + + long long pos = insize; // always points to a header or to EOF + while( pos >= min_member_size ) + { + unsigned long long member_size; + Lzip_trailer trailer; + done = false; + for( int it = 0; it < files && !done; ++it ) + { + const int tfd = infd_vector[it]; + if( !read_trailer( tfd, trailer, pos ) ) goto error; + member_size = trailer.member_size(); + if( member_size <= (unsigned long long)pos && trailer.verify_consistency() ) + for( int ih = 0; ih < files && !done; ++ih ) + { + const int hfd = infd_vector[ih]; + if( !read_header( hfd, header, pos - member_size ) ) goto error; + if( header.verify_magic() && header.verify_version() ) done = true; + } + } + if( !done ) + { + if( member_vector.empty() ) { --pos; continue; 
} // maybe trailing data + set_num_error( "Member size in trailer may be corrupt at pos ", pos - 8 ); + break; + } + if( member_vector.empty() && insize > pos ) + { + const int size = std::min( (long long)Lzip_header::size, insize - pos ); + for( int i = 0; i < files; ++i ) + { + const int infd = infd_vector[i]; + if( seek_read( infd, header.data, size, pos ) == size && + header.verify_prefix( size ) ) + { + error_ = "Last member in input file is truncated or corrupt."; + retval_ = 2; goto error; + } + } + } + pos -= member_size; + member_vector.push_back( Member( 0, trailer.data_size(), pos, + member_size, 0 ) ); + } +error: + if( pos != 0 || member_vector.empty() ) + { + member_vector.clear(); + if( retval_ == 0 ) { error_ = "Can't create file index."; retval_ = 2; } + return; + } + std::reverse( member_vector.begin(), member_vector.end() ); + for( unsigned long i = 0; ; ++i ) + { + const long long end = member_vector[i].dblock.end(); + if( end < 0 || end > INT64_MAX ) + { + member_vector.clear(); + error_ = "Data in input file is too long (2^63 bytes or more)."; + retval_ = 2; return; + } + if( i + 1 >= member_vector.size() ) break; + member_vector[i+1].dblock.pos( end ); + } + } + + +// Return members + gaps [+ trailing data]. +long Lzip_index::blocks( const bool count_tdata ) const + { + long n = member_vector.size() + ( count_tdata && cdata_size() < file_size() ); + if( member_vector.size() && member_vector[0].mblock.pos() > 0 ) ++n; + for( unsigned long i = 1; i < member_vector.size(); ++i ) + if( member_vector[i-1].mblock.end() < member_vector[i].mblock.pos() ) ++n; + return n; + } diff --git a/lzip_index.h b/lzip_index.h new file mode 100644 index 0000000..0b8ace1 --- /dev/null +++ b/lzip_index.h @@ -0,0 +1,94 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +class Lzip_index + { + struct Member + { + Block dblock, mblock; // data block, member block + unsigned dictionary_size; + + Member( const long long dp, const long long ds, + const long long mp, const long long ms, const unsigned dict_size ) + : dblock( dp, ds ), mblock( mp, ms ), dictionary_size( dict_size ) {} + + bool operator==( const Member & m ) const { return ( mblock == m.mblock ); } + bool operator!=( const Member & m ) const { return ( mblock != m.mblock ); } + }; + + // member_vector only contains members with a valid header. + // Garbage between members is represented by gaps between mblocks. 
+ std::vector< Member > member_vector; + std::string error_; + long long insize; + int retval_; + unsigned dictionary_size_; // largest dictionary size in the file + + bool check_header_error( const Lzip_header & header, + const bool ignore_bad_ds ); + void set_errno_error( const char * const msg ); + void set_num_error( const char * const msg, unsigned long long num ); + bool read_header( const int fd, Lzip_header & header, const long long pos ); + bool read_trailer( const int fd, Lzip_trailer & trailer, + const long long pos ); + bool skip_gap( const int fd, unsigned long long & pos, + const bool ignore_trailing, const bool loose_trailing, + const bool ignore_bad_ds, const bool ignore_gaps ); + +public: + Lzip_index() + : error_( "No index" ), insize( 0 ), retval_( 2 ), dictionary_size_( 0 ) {} + Lzip_index( const int infd, const bool ignore_trailing, + const bool loose_trailing, const bool ignore_bad_ds = false, + const bool ignore_gaps = false, const long long max_pos = 0 ); + Lzip_index( const std::vector< int > & infd_vector, const long long fsize ); + + long members() const { return member_vector.size(); } + long blocks( const bool count_tdata ) const; // members + gaps [+ tdata] + const std::string & error() const { return error_; } + int retval() const { return retval_; } + unsigned dictionary_size() const { return dictionary_size_; } + + bool operator==( const Lzip_index & li ) const + { + if( retval_ || li.retval_ || insize != li.insize || + member_vector.size() != li.member_vector.size() ) return false; + for( unsigned long i = 0; i < member_vector.size(); ++i ) + if( member_vector[i] != li.member_vector[i] ) return false; + return true; + } + bool operator!=( const Lzip_index & li ) const { return !( *this == li ); } + + long long udata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().dblock.end(); } + + long long cdata_size() const + { if( member_vector.empty() ) return 0; + return member_vector.back().mblock.end(); 
} + + // total size including trailing data (if any) + long long file_size() const + { if( insize >= 0 ) return insize; else return 0; } + + const Block & dblock( const long i ) const + { return member_vector[i].dblock; } + const Block & mblock( const long i ) const + { return member_vector[i].mblock; } + unsigned dictionary_size( const long i ) const + { return member_vector[i].dictionary_size; } + }; @@ -0,0 +1,1090 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file, 3 for an internal consistency error + (e.g., bug) which caused lziprecover to panic. 
+*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cctype> +#include <cerrno> +#include <climits> +#include <csignal> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <new> +#include <string> +#include <vector> +#include <fcntl.h> +#include <stdint.h> +#include <unistd.h> +#include <utime.h> +#include <sys/stat.h> +#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__ +#include <io.h> +#if defined __MSVCRT__ +#define fchmod(x,y) 0 +#define fchown(x,y,z) 0 +#define SIGHUP SIGTERM +#define S_ISSOCK(x) 0 +#ifndef S_IRGRP +#define S_IRGRP 0 +#define S_IWGRP 0 +#define S_IROTH 0 +#define S_IWOTH 0 +#endif +#endif +#if defined __DJGPP__ +#define S_ISSOCK(x) 0 +#define S_ISVTX 0 +#endif +#endif + +#include "arg_parser.h" +#include "lzip.h" +#include "decoder.h" + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +#if CHAR_BIT != 8 +#error "Environments where CHAR_BIT != 8 are not supported." +#endif + +#if ( defined SIZE_MAX && SIZE_MAX < UINT_MAX ) || \ + ( defined SSIZE_MAX && SSIZE_MAX < INT_MAX ) +#error "Environments where 'size_t' is narrower than 'int' are not supported." +#endif + +int verbosity = 0; + +const char * const program_name = "lziprecover"; +std::string output_filename; // global vars for output file +int outfd = -1; // see 'delete_output_on_interrupt' below + +namespace { + +const char * invocation_name = program_name; // default value + +const struct { const char * from; const char * to; } known_extensions[] = { + { ".lz", "" }, + { ".tlz", ".tar" }, + { 0, 0 } }; + +enum Mode { m_none, m_alone_to_lz, m_debug_decompress, m_debug_delay, + m_debug_repair, m_decompress, m_dump, m_list, m_md5sum, m_merge, + m_nrep_stats, m_range_dec, m_remove, m_repair, m_reproduce, + m_show_packets, m_split, m_strip, m_test, m_unzcrash_bit, + m_unzcrash_block }; + +/* Variable used in signal handler context. + It is not declared volatile because the handler never returns. 
*/ +bool delete_output_on_interrupt = false; + + +void show_help() + { + std::printf( "Lziprecover is a data recovery tool and decompressor for files in the lzip\n" + "compressed data format (.lz). Lziprecover is able to repair slightly damaged\n" + "files (up to one single-byte error per member), produce a correct file by\n" + "merging the good parts of two or more damaged copies, reproduce a missing\n" + "(zeroed) sector using a reference file, extract data from damaged files,\n" + "decompress files, and test integrity of files.\n" + "\nWith the help of lziprecover, losing an entire archive just because of a\n" + "corrupt byte near the beginning is a thing of the past.\n" + "\nLziprecover can remove the damaged members from multimember files, for\n" + "example multimember tar.lz archives.\n" + "\nLziprecover provides random access to the data in multimember files; it only\n" + "decompresses the members containing the desired data.\n" + "\nLziprecover facilitates the management of metadata stored as trailing data\n" + "in lzip files.\n" + "\nLziprecover is not a replacement for regular backups, but a last line of\n" + "defense for the case where the backups are also damaged.\n" + "\nUsage: %s [options] [files]\n", invocation_name ); + std::printf( "\nOptions:\n" + " -h, --help display this help and exit\n" + " -V, --version output version information and exit\n" + " -a, --trailing-error exit with error status if trailing data\n" + " -A, --alone-to-lz convert lzma-alone files to lzip format\n" + " -c, --stdout write to standard output, keep input files\n" + " -d, --decompress decompress\n" + " -D, --range-decompress=<n-m> decompress a range of bytes to stdout\n" + " -e, --reproduce try to reproduce a zeroed sector in file\n" + " --lzip-level=N|a|m[N] reproduce one level, all, or match length\n" + " --lzip-name=<name> name of lzip executable for --reproduce\n" + " --reference-file=<file> reference file for --reproduce\n" + " -f, --force overwrite existing output 
files\n" + " -i, --ignore-errors ignore some errors in -d, -D, -l, -t, --dump\n" + " -k, --keep keep (don't delete) input files\n" + " -l, --list print (un)compressed file sizes\n" + " -m, --merge correct errors in file using several copies\n" + " -o, --output=<file> place the output into <file>\n" + " -q, --quiet suppress all messages\n" + " -R, --repair try to repair a small error in file\n" + " -s, --split split multimember file in single-member files\n" + " -t, --test test compressed file integrity\n" + " -v, --verbose be verbose (a 2nd -v gives more)\n" + " --loose-trailing allow trailing data seeming corrupt header\n" + " --dump=<list>:d:t dump members listed/damaged, tdata to stdout\n" + " --remove=<list>:d:t remove members, tdata from files in place\n" + " --strip=<list>:d:t copy files to stdout stripping members given\n" ); + if( verbosity >= 1 ) + { + std::printf( "\nDebug options for experts:\n" + " -E, --debug-reproduce=<range>[,ss] set range to 0 and try to reproduce file\n" + " -M, --md5sum print the MD5 digests of the input files\n" + " -S, --nrep-stats[=<val>] print stats of N-byte repeated sequences\n" + " -U, --unzcrash=1|B<size> test 1-bit or block errors in input file\n" + " -W, --debug-decompress=<pos>,<val> set pos to val and decompress to stdout\n" + " -X, --show-packets[=<pos>,<val>] show in stdout the decoded LZMA packets\n" + " -Y, --debug-delay=<range> find max error detection delay in <range>\n" + " -Z, --debug-repair=<pos>,<val> test repair one-byte error at <pos>\n" ); + } + std::printf( "\nIf no file names are given, or if a file is '-', lziprecover decompresses\n" + "from standard input to standard output.\n" + "Numbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n" + "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n" + "\nTo extract all the files from archive 'foo.tar.lz', use the commands\n" + "'tar -xf foo.tar.lz' or 'lziprecover -cd foo.tar.lz | tar -xf -'.\n" + "\nExit status: 0 for a normal 
exit, 1 for environmental problems (file\n" + "not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or\n" + "invalid input file, 3 for an internal consistency error (e.g., bug) which\n" + "caused lziprecover to panic.\n" + "\nReport bugs to lzip-bug@nongnu.org\n" + "Lziprecover home page: http://www.nongnu.org/lzip/lziprecover.html\n" ); + } + +} // end namespace + +void Pretty_print::operator()( const char * const msg, FILE * const f ) const + { + if( verbosity < 0 ) return; + if( first_post ) + { + first_post = false; + std::fputs( padded_name.c_str(), f ); + if( !msg ) std::fflush( f ); + } + if( msg ) std::fprintf( f, "%s\n", msg ); + } + + +const char * bad_version( const unsigned version ) + { + static char buf[80]; + snprintf( buf, sizeof buf, "Version %u member format not supported.", + version ); + return buf; + } + + +const char * format_ds( const unsigned dictionary_size ) + { + enum { bufsize = 16, factor = 1024 }; + static char buf[bufsize]; + const char * const prefix[8] = + { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; + const char * p = ""; + const char * np = " "; + unsigned num = dictionary_size; + bool exact = ( num % factor == 0 ); + + for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) + { num /= factor; if( num % factor != 0 ) exact = false; + p = prefix[i]; np = ""; } + snprintf( buf, bufsize, "%s%4u %sB", np, num, p ); + return buf; + } + + +void show_header( const unsigned dictionary_size ) + { + std::fprintf( stderr, "dict %s, ", format_ds( dictionary_size ) ); + } + + +#include "main_common.cc" + + +// Colon-separated list of "damaged", "tdata", [r][^]<list> (1 1,3-5,8) +void Member_list::parse_ml( const char * arg, const char * const option_name ) + { + while( true ) + { + const char * tp = arg; // points to terminator (':' or '\0') + while( *tp && *tp != ':' ) ++tp; + const unsigned len = tp - arg; + if( std::islower( *(const unsigned char *)arg ) ) + { + if( len <= 7 && std::strncmp( 
"damaged", arg, len ) == 0 ) + { damaged = true; goto next; } + if( len <= 5 && std::strncmp( "tdata", arg, len ) == 0 ) + { tdata = true; goto next; } + } + { + const bool reverse = ( *arg == 'r' ); + if( reverse ) ++arg; + if( *arg == '^' ) { ++arg; if( reverse ) rin = false; else in = false; } + std::vector< Block > * rvp = reverse ? &rrange_vector : &range_vector; + while( std::isdigit( *(const unsigned char *)arg ) ) + { + const char * tail; + const int pos = getnum( arg, option_name, 0, 1, INT_MAX, &tail ) - 1; + if( rvp->size() && pos < rvp->back().end() ) break; + const int size = (*tail == '-') ? + getnum( tail + 1, option_name, 0, pos + 1, INT_MAX, &tail ) - pos : 1; + rvp->push_back( Block( pos, size ) ); + if( tail == tp ) goto next; + if( *tail == ',' ) arg = tail + 1; else break; + } + } + show_error( "Invalid list of members." ); + std::exit( 1 ); +next: + if( *(arg = tp) != 0 ) ++arg; else return; + } + } + + +namespace { + +// Recognized formats: <digit> 'a' m[<match_length>] +// +int parse_lzip_level( const char * const arg, const char * const option_name ) + { + if( *arg == 'a' || std::isdigit( *(const unsigned char *)arg ) ) return *arg; + if( *arg != 'm' ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad argument in option '%s'.\n", + program_name, option_name ); + std::exit( 1 ); + } + if( arg[1] == 0 ) return -1; + return -getnum( arg + 1, option_name, 0, min_match_len_limit, max_match_len ); + } + + +/* Recognized format: <range>[,<sector_size>] + range formats: <begin> <begin>-<end> <begin>,<size> ,<size> +*/ +void parse_range( const char * const arg, const char * const pn, + Block & range, int * const sector_sizep = 0 ) + { + const char * tail = arg; + long long value = + ( arg[0] == ',' ) ? 
0 : getnum( arg, pn, 0, 0, INT64_MAX - 1, &tail ); + if( tail[0] == 0 || tail[0] == ',' || tail[0] == '-' ) + { + range.pos( value ); + if( tail[0] == 0 ) { range.size( INT64_MAX - value ); return; } + const bool is_size = ( tail[0] == ',' ); + if( sector_sizep && tail[1] == ',' ) { value = INT64_MAX - value; ++tail; } + else value = getnum( tail + 1, pn, 0, 1, INT64_MAX, &tail ); // size + if( !is_size && value <= range.pos() ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Begin must be < end in range argument " + "of option '%s'.\n", program_name, pn ); + std::exit( 1 ); + } + if( !is_size ) value -= range.pos(); + if( INT64_MAX - value >= range.pos() ) + { + range.size( value ); + if( sector_sizep && tail[0] == ',' ) + *sector_sizep = getnum( tail + 1, pn, 0, 8, INT_MAX ); + return; + } + } + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad decompression range in option '%s'.\n", + program_name, pn ); + std::exit( 1 ); + } + + +void one_file( const int files ) + { + if( files != 1 ) + { + show_error( "You must specify exactly 1 file.", 0, true ); + std::exit( 1 ); + } + } + + +void set_mode( Mode & program_mode, const Mode new_mode ) + { + if( program_mode != m_none && program_mode != new_mode ) + { + show_error( "Only one operation can be specified.", 0, true ); + std::exit( 1 ); + } + program_mode = new_mode; + } + + +void parse_u( const char * const arg, const char * const option_name, + Mode & program_mode, int & sector_size ) + { + if( arg[0] == '1' ) set_mode( program_mode, m_unzcrash_bit ); + else if( arg[0] == 'B' ) + { set_mode( program_mode, m_unzcrash_block ); + sector_size = getnum( arg + 1, option_name, 0, 1, INT_MAX ); } + else + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad argument for option '%s'.\n", + program_name, option_name ); + std::exit( 1 ); + } + } + + +int extension_index( const std::string & name ) + { + for( int eindex = 0; known_extensions[eindex].from; ++eindex ) + { + const std::string ext( 
known_extensions[eindex].from ); + if( name.size() > ext.size() && + name.compare( name.size() - ext.size(), ext.size(), ext ) == 0 ) + return eindex; + } + return -1; + } + + +void set_a_outname( const std::string & name ) + { + output_filename = name; + if( name.size() > 5 && name.compare( name.size() - 5, 5, ".lzma" ) == 0 ) + output_filename.erase( name.size() - 2 ); + else if( name.size() > 4 && name.compare( name.size() - 4, 4, ".tlz" ) == 0 ) + output_filename.insert( name.size() - 2, "ar." ); + else if( name.size() <= 3 || name.compare( name.size() - 3, 3, ".lz" ) != 0 ) + output_filename += known_extensions[0].from; + } + + +void set_d_outname( const std::string & name, const int eindex ) + { + if( eindex >= 0 ) + { + const std::string from( known_extensions[eindex].from ); + if( name.size() > from.size() ) + { + output_filename.assign( name, 0, name.size() - from.size() ); + output_filename += known_extensions[eindex].to; + return; + } + } + output_filename = name; output_filename += ".out"; + if( verbosity >= 1 ) + std::fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'\n", + program_name, name.c_str(), output_filename.c_str() ); + } + +} // end namespace + +int open_instream( const char * const name, struct stat * const in_statsp, + const bool one_to_one, const bool reg_only ) + { + int infd = open( name, O_RDONLY | O_BINARY ); + if( infd < 0 ) + show_file_error( name, "Can't open input file", errno ); + else + { + const int i = fstat( infd, in_statsp ); + const mode_t mode = in_statsp->st_mode; + const bool can_read = ( i == 0 && !reg_only && + ( S_ISBLK( mode ) || S_ISCHR( mode ) || + S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); + if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || one_to_one ) ) ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", + program_name, name, ( can_read && one_to_one ) ? 
+ ",\n and neither '-c' nor '-o' were specified" : "" ); + close( infd ); + infd = -1; + } + } + return infd; + } + + +int open_truncable_stream( const char * const name, + struct stat * const in_statsp ) + { + int fd = open( name, O_RDWR | O_BINARY ); + if( fd < 0 ) + show_file_error( name, "Can't open input file", errno ); + else + { + const int i = fstat( fd, in_statsp ); + const mode_t mode = in_statsp->st_mode; + if( i != 0 || !S_ISREG( mode ) ) + { show_file_error( name, "Not a regular file." ); close( fd ); fd = -1; } + } + return fd; + } + + +bool open_outstream( const bool force, const bool protect, + const bool rw, const bool skipping ) + { + const mode_t usr_rw = S_IRUSR | S_IWUSR; + const mode_t all_rw = usr_rw | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + const mode_t outfd_mode = protect ? usr_rw : all_rw; + int flags = O_CREAT | ( rw ? O_RDWR : O_WRONLY ) | O_BINARY; + if( force ) flags |= O_TRUNC; else flags |= O_EXCL; + + outfd = open( output_filename.c_str(), flags, outfd_mode ); + if( outfd >= 0 ) delete_output_on_interrupt = true; + else if( verbosity >= 0 ) + { + if( errno == EEXIST ) + std::fprintf( stderr, "%s: Output file '%s' already exists%s.\n", + program_name, output_filename.c_str(), skipping ? + ", skipping" : ". Use '--force' to overwrite it" ); + else + std::fprintf( stderr, "%s: Can't create output file '%s': %s\n", + program_name, output_filename.c_str(), std::strerror( errno ) ); + } + return ( outfd >= 0 ); + } + + +bool file_exists( const std::string & filename ) + { + struct stat st; + if( stat( filename.c_str(), &st ) == 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Output file '%s' already exists." 
+ " Use '--force' to overwrite it.\n", + program_name, filename.c_str() ); + return true; + } + return false; + } + + +void set_signals( void (*action)(int) ) + { + std::signal( SIGHUP, action ); + std::signal( SIGINT, action ); + std::signal( SIGTERM, action ); + } + + +void cleanup_and_fail( const int retval ) + { + set_signals( SIG_IGN ); // ignore signals + if( delete_output_on_interrupt ) + { + delete_output_on_interrupt = false; + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Deleting output file '%s', if it exists.\n", + program_name, output_filename.c_str() ); + if( outfd >= 0 ) { close( outfd ); outfd = -1; } + if( std::remove( output_filename.c_str() ) != 0 && errno != ENOENT ) + show_error( "WARNING: deletion of output file (apparently) failed." ); + } + std::exit( retval ); + } + + +bool check_tty_out() + { + if( isatty( outfd ) ) + { show_file_error( output_filename.size() ? + output_filename.c_str() : "(stdout)", + "I won't write compressed data to a terminal." ); + return false; } + return true; + } + +namespace { + +extern "C" void signal_handler( int ) + { + show_error( "Control-C or similar caught, quitting." ); + cleanup_and_fail( 1 ); + } + + +bool check_tty_in( const char * const input_filename, const int infd, + const Mode program_mode, int & retval ) + { + if( isatty( infd ) ) // all modes read compressed data + { show_file_error( input_filename, + "I won't read compressed data from a terminal." ); + close( infd ); set_retval( retval, 2 ); + if( program_mode != m_test ) cleanup_and_fail( retval ); + return false; } + return true; + } + +bool check_tty_out( const Mode program_mode ) + { return program_mode != m_alone_to_lz || ::check_tty_out(); } + + +// Set permissions, owner, and times. +void close_and_set_permissions( const struct stat * const in_statsp ) + { + bool warning = false; + if( in_statsp ) + { + const mode_t mode = in_statsp->st_mode; + // fchown will in many cases return with EPERM, which can be safely ignored. 
+ if( fchown( outfd, in_statsp->st_uid, in_statsp->st_gid ) == 0 ) + { if( fchmod( outfd, mode ) != 0 ) warning = true; } + else + if( errno != EPERM || + fchmod( outfd, mode & ~( S_ISUID | S_ISGID | S_ISVTX ) ) != 0 ) + warning = true; + } + if( close( outfd ) != 0 ) + { + show_error( "Error closing output file", errno ); + cleanup_and_fail( 1 ); + } + outfd = -1; + delete_output_on_interrupt = false; + if( in_statsp ) + { + struct utimbuf t; + t.actime = in_statsp->st_atime; + t.modtime = in_statsp->st_mtime; + if( utime( output_filename.c_str(), &t ) != 0 ) warning = true; + } + if( warning && verbosity >= 1 ) + show_error( "Can't change output file attributes." ); + } + + +unsigned char xdigit( const unsigned value ) + { + if( value <= 9 ) return '0' + value; + if( value <= 15 ) return 'A' + value - 10; + return 0; + } + + +bool show_trailing_data( const uint8_t * const data, const int size, + const Pretty_print & pp, const bool all, + const int ignore_trailing ) // -1 = show + { + if( verbosity >= 4 || ignore_trailing <= 0 ) + { + std::string msg; + if( !all ) msg = "first bytes of "; + msg += "trailing data = "; + for( int i = 0; i < size; ++i ) + { + msg += xdigit( data[i] >> 4 ); + msg += xdigit( data[i] & 0x0F ); + msg += ' '; + } + msg += '\''; + for( int i = 0; i < size; ++i ) + { if( std::isprint( data[i] ) ) msg += data[i]; else msg += '.'; } + msg += '\''; + pp( msg.c_str() ); + if( ignore_trailing == 0 ) show_file_error( pp.name(), trailing_msg ); + } + return ( ignore_trailing > 0 ); + } + + +int decompress( const unsigned long long cfile_size, const int infd, + const Pretty_print & pp, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing, + const bool testing ) + { + unsigned long long partial_file_pos = 0; + Range_decoder rdec( infd ); + int retval = 0; + + for( bool first_member = true; ; first_member = false ) + { + Lzip_header header; + rdec.reset_member_position(); + const int size = rdec.read_header_carefully( 
header, ignore_errors ); + if( rdec.finished() || // End Of File + ( size < Lzip_header::size && !rdec.find_header( header ) ) ) + { + if( first_member ) + { show_file_error( pp.name(), "File ends unexpectedly at member header." ); + retval = 2; } + else if( header.verify_prefix( size ) ) + { pp( "Truncated header in multimember file." ); + show_trailing_data( header.data, size, pp, true, -1 ); + retval = 2; } + else if( size > 0 && !show_trailing_data( header.data, size, pp, + true, ignore_trailing ) ) + retval = 2; + break; + } + if( !header.verify_magic() ) + { + if( first_member ) + { show_file_error( pp.name(), bad_magic_msg ); retval = 2; } + else if( !loose_trailing && header.verify_corrupt() ) + { pp( corrupt_mm_msg ); + show_trailing_data( header.data, size, pp, false, -1 ); + retval = 2; } + else if( !show_trailing_data( header.data, size, pp, false, ignore_trailing ) ) + retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; + } + if( !header.verify_version() ) + { pp( bad_version( header.version() ) ); retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; } + const unsigned dictionary_size = header.dictionary_size(); + if( !isvalid_ds( dictionary_size ) ) + { pp( bad_dict_msg ); retval = 2; + if( ignore_errors ) { pp.reset(); continue; } else break; } + + if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) pp(); + + LZ_decoder decoder( rdec, dictionary_size, outfd ); + show_dprogress( cfile_size, partial_file_pos, &rdec, &pp ); // init + const int result = decoder.decode_member( pp ); + partial_file_pos += rdec.member_position(); + if( result != 0 ) + { + if( verbosity >= 0 && result <= 2 ) + { + pp(); + std::fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ? + "File ends unexpectedly" : "Decoder error", + partial_file_pos ); + } + retval = 2; if( ignore_errors ) { pp.reset(); continue; } else break; + } + if( verbosity >= 2 ) + { std::fputs( testing ? 
"ok\n" : "done\n", stderr ); pp.reset(); } + } + if( verbosity == 1 && retval == 0 ) + std::fputs( testing ? "ok\n" : "done\n", stderr ); + if( retval == 2 && ignore_errors ) retval = 0; + return retval; + } + +} // end namespace + +void set_signal_handler() { set_signals( signal_handler ); } + +int close_outstream( const struct stat * const in_statsp ) + { + if( delete_output_on_interrupt ) + close_and_set_permissions( in_statsp ); + if( outfd >= 0 && close( outfd ) != 0 ) + { show_error( "Error closing stdout", errno ); return 1; } + outfd = -1; + return 0; + } + + +std::string insert_fixed( std::string name ) + { + if( name.size() > 7 && name.compare( name.size() - 7, 7, ".tar.lz" ) == 0 ) + name.insert( name.size() - 7, "_fixed" ); + else if( name.size() > 3 && name.compare( name.size() - 3, 3, ".lz" ) == 0 ) + name.insert( name.size() - 3, "_fixed" ); + else if( name.size() > 4 && name.compare( name.size() - 4, 4, ".tlz" ) == 0 ) + name.insert( name.size() - 4, "_fixed" ); + else name += "_fixed.lz"; + return name; + } + + +void show_2file_error( const char * const msg1, const char * const name1, + const char * const name2, const char * const msg2 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s '%s' and '%s' %s\n", + program_name, msg1, name1, name2, msg2 ); + } + + +void show_dprogress( const unsigned long long cfile_size, + const unsigned long long partial_size, + const Range_decoder * const d, + const Pretty_print * const p ) + { + static unsigned long long csize = 0; // file_size / 100 + static unsigned long long psize = 0; + static const Range_decoder * rdec = 0; + static const Pretty_print * pp = 0; + static int counter = 0; + static bool enabled = true; + + if( !enabled ) return; + if( p ) // initialize static vars + { + if( verbosity < 2 || !isatty( STDERR_FILENO ) ) { enabled = false; return; } + csize = cfile_size; psize = partial_size; rdec = d; pp = p; counter = 0; + } + if( rdec && pp && --counter <= 0 ) + { + const unsigned long long 
pos = psize + rdec->member_position(); + counter = 7; // update display every 114688 bytes + if( csize > 0 ) + std::fprintf( stderr, "%4llu%% %.1f MB\r", pos / csize, pos / 1000000.0 ); + else + std::fprintf( stderr, " %.1f MB\r", pos / 1000000.0 ); + pp->reset(); (*pp)(); // restore cursor position + } + } + + +int main( const int argc, const char * const argv[] ) + { + Block range( 0, 0 ); + int sector_size = INT_MAX; // default larger than practical range + Bad_byte bad_byte; + Member_list member_list; + std::string default_output_filename; + const char * lzip_name = "lzip"; // default is lzip + const char * reference_filename = 0; + Mode program_mode = m_none; + int lzip_level = 0; // 0 = test all levels and match lengths + // '0'..'9' = level, 'a' = all levels + // -5..-273 = match length, -1 = all lengths + int repeated_byte = -1; // 0 to 255, or -1 for all values + bool force = false; + bool ignore_errors = false; + bool ignore_trailing = true; + bool keep_input_files = false; + bool loose_trailing = false; + bool to_stdout = false; + if( argc > 0 ) invocation_name = argv[0]; + + enum { opt_du = 256, opt_lt, opt_lzl, opt_lzn, opt_ref, opt_re, opt_st }; + const Arg_parser::Option options[] = + { + { 'a', "trailing-error", Arg_parser::no }, + { 'A', "alone-to-lz", Arg_parser::no }, + { 'c', "stdout", Arg_parser::no }, + { 'd', "decompress", Arg_parser::no }, + { 'D', "range-decompress", Arg_parser::yes }, + { 'e', "reproduce", Arg_parser::no }, + { 'E', "debug-reproduce", Arg_parser::yes }, + { 'f', "force", Arg_parser::no }, + { 'h', "help", Arg_parser::no }, + { 'i', "ignore-errors", Arg_parser::no }, + { 'k', "keep", Arg_parser::no }, + { 'l', "list", Arg_parser::no }, + { 'm', "merge", Arg_parser::no }, + { 'M', "md5sum", Arg_parser::no }, + { 'n', "threads", Arg_parser::yes }, + { 'o', "output", Arg_parser::yes }, + { 'q', "quiet", Arg_parser::no }, + { 'R', "repair", Arg_parser::no }, + { 's', "split", Arg_parser::no }, + { 'S', "nrep-stats", 
Arg_parser::maybe }, + { 't', "test", Arg_parser::no }, + { 'U', "unzcrash", Arg_parser::yes }, + { 'v', "verbose", Arg_parser::no }, + { 'V', "version", Arg_parser::no }, + { 'W', "debug-decompress", Arg_parser::yes }, + { 'X', "show-packets", Arg_parser::maybe }, + { 'Y', "debug-delay", Arg_parser::yes }, + { 'Z', "debug-repair", Arg_parser::yes }, + { opt_du, "dump", Arg_parser::yes }, + { opt_lt, "loose-trailing", Arg_parser::no }, + { opt_lzl, "lzip-level", Arg_parser::yes }, + { opt_lzn, "lzip-name", Arg_parser::yes }, + { opt_ref, "reference-file", Arg_parser::yes }, + { opt_re, "remove", Arg_parser::yes }, + { opt_st, "strip", Arg_parser::yes }, + { 0 , 0, Arg_parser::no } }; + + const Arg_parser parser( argc, argv, options ); + if( parser.error().size() ) // bad option + { show_error( parser.error().c_str(), 0, true ); return 1; } + + int argind = 0; + for( ; argind < parser.arguments(); ++argind ) + { + const int code = parser.code( argind ); + if( !code ) break; // no more options + const char * const pn = parser.parsed_name( argind ).c_str(); + const std::string & sarg = parser.argument( argind ); + const char * const arg = sarg.c_str(); + switch( code ) + { + case 'a': ignore_trailing = false; break; + case 'A': set_mode( program_mode, m_alone_to_lz ); break; + case 'c': to_stdout = true; break; + case 'd': set_mode( program_mode, m_decompress ); break; + case 'D': set_mode( program_mode, m_range_dec ); + parse_range( arg, pn, range ); break; + case 'e': set_mode( program_mode, m_reproduce ); break; + case 'E': set_mode( program_mode, m_reproduce ); + parse_range( arg, pn, range, &sector_size ); break; + case 'f': force = true; break; + case 'h': show_help(); return 0; + case 'i': ignore_errors = true; break; + case 'k': keep_input_files = true; break; + case 'l': set_mode( program_mode, m_list ); break; + case 'm': set_mode( program_mode, m_merge ); break; + case 'M': set_mode( program_mode, m_md5sum ); break; + case 'n': break; + case 'o': if( sarg == 
"-" ) to_stdout = true; + else { default_output_filename = sarg; } break; + case 'q': verbosity = -1; break; + case 'R': set_mode( program_mode, m_repair ); break; + case 's': set_mode( program_mode, m_split ); break; + case 'S': if( arg[0] ) repeated_byte = getnum( arg, pn, 0, 0, 255 ); + set_mode( program_mode, m_nrep_stats ); break; + case 't': set_mode( program_mode, m_test ); break; + case 'U': parse_u( arg, pn, program_mode, sector_size ); break; + case 'v': if( verbosity < 4 ) ++verbosity; break; + case 'V': show_version(); return 0; + case 'W': set_mode( program_mode, m_debug_decompress ); + bad_byte.parse_bb( arg, pn ); break; + case 'X': set_mode( program_mode, m_show_packets ); + if( arg[0] ) { bad_byte.parse_bb( arg, pn ); } break; + case 'Y': set_mode( program_mode, m_debug_delay ); + parse_range( arg, pn, range ); break; + case 'Z': set_mode( program_mode, m_debug_repair ); + bad_byte.parse_bb( arg, pn ); break; + case opt_du: set_mode( program_mode, m_dump ); + member_list.parse_ml( arg, pn ); break; + case opt_lt: loose_trailing = true; break; + case opt_lzl: lzip_level = parse_lzip_level( arg, pn ); break; + case opt_lzn: lzip_name = arg; break; + case opt_ref: reference_filename = arg; break; + case opt_re: set_mode( program_mode, m_remove ); + member_list.parse_ml( arg, pn ); break; + case opt_st: set_mode( program_mode, m_strip ); + member_list.parse_ml( arg, pn ); break; + default : internal_error( "uncaught option." 
); + } + } // end process options + +#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__ + setmode( STDIN_FILENO, O_BINARY ); + setmode( STDOUT_FILENO, O_BINARY ); +#endif + + if( program_mode == m_none ) + { + show_error( "You must specify the operation to be performed.", 0, true ); + return 1; + } + + std::vector< std::string > filenames; + bool filenames_given = false; + for( ; argind < parser.arguments(); ++argind ) + { + filenames.push_back( parser.argument( argind ) ); + if( filenames.back() != "-" ) filenames_given = true; + } + + const char terminator = isatty( STDOUT_FILENO ) ? '\r' : '\n'; + try { + switch( program_mode ) + { + case m_none: internal_error( "invalid operation." ); break; + case m_alone_to_lz: break; + case m_debug_decompress: + one_file( filenames.size() ); + return debug_decompress( filenames[0], bad_byte, false ); + case m_debug_delay: + one_file( filenames.size() ); + return debug_delay( filenames[0], range, terminator ); + case m_debug_repair: + one_file( filenames.size() ); + return debug_repair( filenames[0], bad_byte, terminator ); + case m_decompress: break; + case m_dump: + case m_strip: + if( filenames.size() < 1 ) + { show_error( "You must specify at least 1 file.", 0, true ); return 1; } + return dump_members( filenames, default_output_filename, member_list, + force, ignore_errors, ignore_trailing, + loose_trailing, program_mode == m_strip, to_stdout ); + case m_list: break; + case m_md5sum: break; + case m_merge: + if( filenames.size() < 2 ) + { show_error( "You must specify at least 2 files.", 0, true ); return 1; } + return merge_files( filenames, default_output_filename, terminator, force ); + case m_nrep_stats: return print_nrep_stats( filenames, repeated_byte, + ignore_errors, ignore_trailing, loose_trailing ); + case m_range_dec: + one_file( filenames.size() ); + return range_decompress( filenames[0], default_output_filename, range, + force, ignore_errors, ignore_trailing, + loose_trailing, to_stdout ); + case 
m_remove: + if( filenames.size() < 1 ) + { show_error( "You must specify at least 1 file.", 0, true ); return 1; } + return remove_members( filenames, member_list, ignore_errors, + ignore_trailing, loose_trailing ); + case m_repair: + one_file( filenames.size() ); + return repair_file( filenames[0], default_output_filename, terminator, force ); + case m_reproduce: + one_file( filenames.size() ); + if( !reference_filename || !reference_filename[0] ) + { show_error( "You must specify a reference file.", 0, true ); return 1; } + if( range.size() > 0 ) + return debug_reproduce_file( filenames[0], lzip_name, + reference_filename, range, sector_size, lzip_level ); + else + return reproduce_file( filenames[0], default_output_filename, + lzip_name, reference_filename, lzip_level, terminator, force ); + case m_show_packets: + one_file( filenames.size() ); + return debug_decompress( filenames[0], bad_byte, true ); + case m_split: + one_file( filenames.size() ); + return split_file( filenames[0], default_output_filename, force ); + case m_test: break; + case m_unzcrash_bit: + one_file( filenames.size() ); + return lunzcrash_bit( filenames[0].c_str() ); + case m_unzcrash_block: + one_file( filenames.size() ); + return lunzcrash_block( filenames[0].c_str(), sector_size ); + } + } + catch( std::bad_alloc & ) { show_error( mem_msg ); cleanup_and_fail( 1 ); } + catch( Error & e ) { show_error( e.msg, errno ); cleanup_and_fail( 1 ); } + + if( filenames.empty() ) filenames.push_back("-"); + + if( program_mode == m_list ) + return list_files( filenames, ignore_errors, ignore_trailing, loose_trailing ); + if( program_mode == m_md5sum ) + return md5sum_files( filenames ); + + if( program_mode != m_alone_to_lz && program_mode != m_decompress && + program_mode != m_test ) + internal_error( "invalid decompressor operation." 
); + + if( program_mode == m_test ) to_stdout = false; // apply overrides + if( program_mode == m_test || to_stdout ) default_output_filename.clear(); + + if( to_stdout && program_mode != m_test ) // check tty only once + { outfd = STDOUT_FILENO; if( !check_tty_out( program_mode ) ) return 1; } + else outfd = -1; + + const bool to_file = !to_stdout && program_mode != m_test && + default_output_filename.size(); + if( !to_stdout && program_mode != m_test && ( filenames_given || to_file ) ) + set_signals( signal_handler ); + + Pretty_print pp( filenames ); + + int failed_tests = 0; + int retval = 0; + const bool one_to_one = !to_stdout && program_mode != m_test && !to_file; + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + std::string input_filename; + int infd; + struct stat in_stats; + + pp.set_name( filenames[i] ); + if( filenames[i] == "-" ) + { + if( stdin_used ) continue; else stdin_used = true; + infd = STDIN_FILENO; + if( !check_tty_in( pp.name(), infd, program_mode, retval ) ) continue; + if( one_to_one ) { outfd = STDOUT_FILENO; output_filename.clear(); } + } + else + { + input_filename = filenames[i]; + infd = open_instream( input_filename.c_str(), &in_stats, one_to_one ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + if( !check_tty_in( pp.name(), infd, program_mode, retval ) ) continue; + if( one_to_one ) // open outfd after verifying infd + { + if( program_mode == m_alone_to_lz ) set_a_outname( input_filename ); + else set_d_outname( input_filename, extension_index( input_filename ) ); + if( !open_outstream( force, true ) ) + { close( infd ); set_retval( retval, 1 ); continue; } + } + } + + if( one_to_one && !check_tty_out( program_mode ) ) + { set_retval( retval, 1 ); return retval; } // don't delete a tty + + if( to_file && outfd < 0 ) // open outfd after verifying infd + { + output_filename = default_output_filename; + if( !open_outstream( force, false ) || !check_tty_out( program_mode ) ) + return 1; // 
check tty only once and don't try to delete a tty + } + + const struct stat * const in_statsp = + ( input_filename.size() && one_to_one ) ? &in_stats : 0; + const unsigned long long cfile_size = + ( input_filename.size() && S_ISREG( in_stats.st_mode ) ) ? + ( in_stats.st_size + 99 ) / 100 : 0; + int tmp; + try { + if( program_mode == m_alone_to_lz ) + tmp = alone_to_lz( infd, pp ); + else + tmp = decompress( cfile_size, infd, pp, ignore_errors, ignore_trailing, + loose_trailing, program_mode == m_test ); + } + catch( std::bad_alloc & ) { pp( mem_msg ); tmp = 1; } + catch( Error & e ) { pp(); show_error( e.msg, errno ); tmp = 1; } + if( close( infd ) != 0 ) + { show_file_error( pp.name(), "Error closing input file", errno ); + set_retval( tmp, 1 ); } + set_retval( retval, tmp ); + if( tmp ) + { if( program_mode != m_test ) cleanup_and_fail( retval ); + else ++failed_tests; } + + if( delete_output_on_interrupt && one_to_one ) + close_and_set_permissions( in_statsp ); + if( input_filename.size() && !keep_input_files && one_to_one && + ( program_mode != m_decompress || !ignore_errors ) ) + std::remove( input_filename.c_str() ); + } + if( delete_output_on_interrupt ) close_and_set_permissions( 0 ); // -o + else if( outfd >= 0 && close( outfd ) != 0 ) // -c + { + show_error( "Error closing stdout", errno ); + set_retval( retval, 1 ); + } + if( failed_tests > 0 && verbosity >= 1 && filenames.size() > 1 ) + std::fprintf( stderr, "%s: warning: %d %s failed the test.\n", + program_name, failed_tests, + ( failed_tests == 1 ) ? "file" : "files" ); + return retval; + } diff --git a/main_common.cc b/main_common.cc new file mode 100644 index 0000000..8f56a13 --- /dev/null +++ b/main_common.cc @@ -0,0 +1,196 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +namespace { + +const char * const program_year = "2022"; +const char * const mem_msg = "Not enough memory."; + +void show_version() + { + std::printf( "%s %s\n", program_name, PROGVERSION ); + std::printf( "Copyright (C) %s Antonio Diaz Diaz.\n", program_year ); + std::printf( "License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html>\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" ); + } + + +// separate large numbers >= 100_000 in groups of 3 digits using '_' +const char * format_num3( long long num ) + { + const char * const si_prefix = "kMGTPEZY"; + const char * const binary_prefix = "KMGTPEZY"; + enum { buffers = 8, bufsize = 4 * sizeof (long long) }; + static char buffer[buffers][bufsize]; // circle of static buffers for printf + static int current = 0; + + char * const buf = buffer[current++]; current %= buffers; + char * p = buf + bufsize - 1; // fill the buffer backwards + *p = 0; // terminator + const bool negative = num < 0; + if( negative ) num = -num; + if( num > 1024 ) + { + char prefix = 0; // try binary first, then si + for( int i = 0; i < 8 && num >= 1024 && num % 1024 == 0; ++i ) + { num /= 1024; prefix = binary_prefix[i]; } + if( prefix ) *(--p) = 'i'; + else + for( int i = 0; i < 8 && num >= 1000 && num % 1000 
== 0; ++i ) + { num /= 1000; prefix = si_prefix[i]; } + if( prefix ) *(--p) = prefix; + } + const bool split = num >= 100000; + + for( int i = 0; ; ) + { + *(--p) = num % 10 + '0'; num /= 10; if( num == 0 ) break; + if( split && ++i >= 3 ) { i = 0; *(--p) = '_'; } + } + if( negative ) *(--p) = '-'; + return p; + } + + +// Recognized formats: <num>[YZEPTGM][i][Bs], <num>k[Bs], <num>Ki[Bs] +// +long long getnum( const char * const arg, const char * const option_name, + const int hardbs, const long long llimit = -LLONG_MAX, + const long long ulimit = LLONG_MAX, + const char ** const tailp = 0 ) + { + char * tail; + errno = 0; + long long result = strtoll( arg, &tail, 0 ); + if( tail == arg ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad or missing numerical argument in " + "option '%s'.\n", program_name, option_name ); + std::exit( 1 ); + } + + if( !errno && tail[0] ) + { + char * const p = tail++; + int factor = 1000; // default factor + int exponent = -1; // -1 = bad multiplier + char usuf = 0; // 'B' or 's' unit suffix is present + switch( *p ) + { + case 'Y': exponent = 8; break; + case 'Z': exponent = 7; break; + case 'E': exponent = 6; break; + case 'P': exponent = 5; break; + case 'T': exponent = 4; break; + case 'G': exponent = 3; break; + case 'M': exponent = 2; break; + case 'K': if( tail[0] == 'i' ) { ++tail; factor = 1024; exponent = 1; } break; + case 'k': if( tail[0] != 'i' ) exponent = 1; break; + case 'B': + case 's': usuf = *p; exponent = 0; break; + default : if( tailp ) { tail = p; exponent = 0; } + } + if( exponent > 1 && tail[0] == 'i' ) { ++tail; factor = 1024; } + if( exponent > 0 && usuf == 0 && ( tail[0] == 'B' || tail[0] == 's' ) ) + { usuf = tail[0]; ++tail; } + if( exponent < 0 || ( usuf == 's' && hardbs <= 0 ) || + ( !tailp && tail[0] != 0 ) ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad multiplier in numerical argument of " + "option '%s'.\n", program_name, option_name ); + std::exit( 1 ); + } + for( int i = 
0; i < exponent; ++i ) + { + if( LLONG_MAX / factor >= llabs( result ) ) result *= factor; + else { errno = ERANGE; break; } + } + if( usuf == 's' ) + { + if( LLONG_MAX / hardbs >= llabs( result ) ) result *= hardbs; + else errno = ERANGE; + } + } + if( !errno && ( result < llimit || result > ulimit ) ) errno = ERANGE; + if( errno ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Numerical argument out of limits [%s,%s] " + "in option '%s'.\n", program_name, format_num3( llimit ), + format_num3( ulimit ), option_name ); + std::exit( 1 ); + } + if( tailp ) *tailp = tail; + return result; + } + +} // end namespace + + +// Recognized formats: <pos>,<value> <pos>,+<value> <pos>,f<value> +// +void Bad_byte::parse_bb( const char * const arg, const char * const pn ) + { + option_name = pn; + const char * tail; + pos = getnum( arg, option_name, 0, 0, LLONG_MAX, &tail ); + if( tail[0] != ',' ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad separator between <pos> and <val> in " + "argument of option '%s'.\n", program_name, option_name ); + std::exit( 1 ); + } + if( tail[1] == '+' ) { ++tail; mode = delta; } + else if( tail[1] == 'f' ) { ++tail; mode = flip; } + else mode = literal; + value = getnum( tail + 1, option_name, 0, 0, 255 ); + } + + +void show_error( const char * const msg, const int errcode, const bool help ) + { + if( verbosity < 0 ) return; + if( msg && msg[0] ) + std::fprintf( stderr, "%s: %s%s%s\n", program_name, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? std::strerror( errcode ) : "" ); + if( help ) + std::fprintf( stderr, "Try '%s --help' for more information.\n", + invocation_name ); + } + + +void show_file_error( const char * const filename, const char * const msg, + const int errcode ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s: %s%s%s\n", program_name, filename, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? 
std::strerror( errcode ) : "" ); + } + + +void internal_error( const char * const msg ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: internal error: %s\n", program_name, msg ); + std::exit( 3 ); + } @@ -0,0 +1,206 @@ +/* Functions to compute MD5 message digest of memory blocks according to the + definition of MD5 in RFC 1321 from April 1992. + Copyright (C) 2020-2022 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <cstring> +#include <stdint.h> + +#include "md5.h" + + +namespace { + +/* These are the four functions used in the four steps of the MD5 algorithm + as defined in RFC 1321. */ +#define F(x, y, z) ((x & y) | (~x & z)) +#define G(x, y, z) ((x & z) | (y & ~z)) +#define H(x, y, z) (x ^ y ^ z) +#define I(x, y, z) (y ^ (x | ~z)) + +/* Rotate x left n bits. + It is unfortunate that C++ does not provide an operator for rotation. + Hope the compiler is smart enough. */ +#define ROTATE_LEFT(x, n) (x = (x << n) | (x >> (32 - n))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. 
+#define FF(a, b, c, d, x, s, ac) \ + { a += F(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define GG(a, b, c, d, x, s, ac) \ + { a += G(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define HH(a, b, c, d, x, s, ac) \ + { a += H(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } +#define II(a, b, c, d, x, s, ac) \ + { a += I(b, c, d) + x + ac; ROTATE_LEFT(a, s); a += b; } + +} // end namespace + + +void MD5SUM::md5_process_block( const uint8_t block[64] ) + { + uint32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + for( int i = 0, j = 0; i < 16; ++i, j += 4 ) // fill x in little endian + x[i] = block[j] | (block[j+1] << 8) | (block[j+2] << 16) | (block[j+3] << 24); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], 7, 0xD76AA478); // 1 + FF (d, a, b, c, x[ 1], 12, 0xE8C7B756); // 2 + FF (c, d, a, b, x[ 2], 17, 0x242070DB); // 3 + FF (b, c, d, a, x[ 3], 22, 0xC1BDCEEE); // 4 + FF (a, b, c, d, x[ 4], 7, 0xF57C0FAF); // 5 + FF (d, a, b, c, x[ 5], 12, 0x4787C62A); // 6 + FF (c, d, a, b, x[ 6], 17, 0xA8304613); // 7 + FF (b, c, d, a, x[ 7], 22, 0xFD469501); // 8 + FF (a, b, c, d, x[ 8], 7, 0x698098D8); // 9 + FF (d, a, b, c, x[ 9], 12, 0x8B44F7AF); // 10 + FF (c, d, a, b, x[10], 17, 0xFFFF5BB1); // 11 + FF (b, c, d, a, x[11], 22, 0x895CD7BE); // 12 + FF (a, b, c, d, x[12], 7, 0x6B901122); // 13 + FF (d, a, b, c, x[13], 12, 0xFD987193); // 14 + FF (c, d, a, b, x[14], 17, 0xA679438E); // 15 + FF (b, c, d, a, x[15], 22, 0x49B40821); // 16 + + /* Round 2 */ + GG (a, b, c, d, x[ 1], 5, 0xF61E2562); // 17 + GG (d, a, b, c, x[ 6], 9, 0xC040B340); // 18 + GG (c, d, a, b, x[11], 14, 0x265E5A51); // 19 + GG (b, c, d, a, x[ 0], 20, 0xE9B6C7AA); // 20 + GG (a, b, c, d, x[ 5], 5, 0xD62F105D); // 21 + GG (d, a, b, c, x[10], 9, 0x02441453); // 22 + GG (c, d, a, b, x[15], 14, 0xD8A1E681); // 23 + GG (b, c, d, a, x[ 4], 20, 0xE7D3FBC8); // 24 + GG (a, b, c, d, x[ 9], 5, 0x21E1CDE6); // 25 + GG (d, a, b, c, x[14], 9, 0xC33707D6); // 26 + GG (c, d, a, b, x[ 3], 14, 
0xF4D50D87); // 27 + GG (b, c, d, a, x[ 8], 20, 0x455A14ED); // 28 + GG (a, b, c, d, x[13], 5, 0xA9E3E905); // 29 + GG (d, a, b, c, x[ 2], 9, 0xFCEFA3F8); // 30 + GG (c, d, a, b, x[ 7], 14, 0x676F02D9); // 31 + GG (b, c, d, a, x[12], 20, 0x8D2A4C8A); // 32 + + /* Round 3 */ + HH (a, b, c, d, x[ 5], 4, 0xFFFA3942); // 33 + HH (d, a, b, c, x[ 8], 11, 0x8771F681); // 34 + HH (c, d, a, b, x[11], 16, 0x6D9D6122); // 35 + HH (b, c, d, a, x[14], 23, 0xFDE5380C); // 36 + HH (a, b, c, d, x[ 1], 4, 0xA4BEEA44); // 37 + HH (d, a, b, c, x[ 4], 11, 0x4BDECFA9); // 38 + HH (c, d, a, b, x[ 7], 16, 0xF6BB4B60); // 39 + HH (b, c, d, a, x[10], 23, 0xBEBFBC70); // 40 + HH (a, b, c, d, x[13], 4, 0x289B7EC6); // 41 + HH (d, a, b, c, x[ 0], 11, 0xEAA127FA); // 42 + HH (c, d, a, b, x[ 3], 16, 0xD4EF3085); // 43 + HH (b, c, d, a, x[ 6], 23, 0x04881D05); // 44 + HH (a, b, c, d, x[ 9], 4, 0xD9D4D039); // 45 + HH (d, a, b, c, x[12], 11, 0xE6DB99E5); // 46 + HH (c, d, a, b, x[15], 16, 0x1FA27CF8); // 47 + HH (b, c, d, a, x[ 2], 23, 0xC4AC5665); // 48 + + /* Round 4 */ + II (a, b, c, d, x[ 0], 6, 0xF4292244); // 49 + II (d, a, b, c, x[ 7], 10, 0x432AFF97); // 50 + II (c, d, a, b, x[14], 15, 0xAB9423A7); // 51 + II (b, c, d, a, x[ 5], 21, 0xFC93A039); // 52 + II (a, b, c, d, x[12], 6, 0x655B59C3); // 53 + II (d, a, b, c, x[ 3], 10, 0x8F0CCC92); // 54 + II (c, d, a, b, x[10], 15, 0xFFEFF47D); // 55 + II (b, c, d, a, x[ 1], 21, 0x85845DD1); // 56 + II (a, b, c, d, x[ 8], 6, 0x6FA87E4F); // 57 + II (d, a, b, c, x[15], 10, 0xFE2CE6E0); // 58 + II (c, d, a, b, x[ 6], 15, 0xA3014314); // 59 + II (b, c, d, a, x[13], 21, 0x4E0811A1); // 60 + II (a, b, c, d, x[ 4], 6, 0xF7537E82); // 61 + II (d, a, b, c, x[11], 10, 0xBD3AF235); // 62 + II (c, d, a, b, x[ 2], 15, 0x2AD7D2BB); // 63 + II (b, c, d, a, x[ 9], 21, 0xEB86D391); // 64 + + // add the processed values to the context + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + } + + +/* Update the context for the next 'len' bytes of 'buffer'. 
+ 'len' does not need to be a multiple of 64. +*/ +void MD5SUM::md5_update( const uint8_t * const buffer, const unsigned long len ) + { + unsigned index = count & 0x3F; // data length in bytes mod 64 + count += len; // update data length + const unsigned rest = 64 - index; + unsigned long i; + + if( len >= rest ) // process as many bytes as possible + { + std::memcpy( ibuf + index, buffer, rest ); + md5_process_block( ibuf ); + for( i = rest; i + 63 < len; i += 64 ) + md5_process_block( buffer + i ); + index = 0; + } + else i = 0; + + std::memcpy( ibuf + index, buffer + i, len - i ); // save remaining input + } + + +// finish computation and return the digest +void MD5SUM::md5_finish( uint8_t digest[16] ) + { + uint8_t padding[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + uint8_t bits[8]; + uint64_t c = count << 3; // save data length in bits + for( int i = 0; i <= 7; ++i ) { bits[i] = (uint8_t)c; c >>= 8; } + + const unsigned index = count & 0x3F; // data length in bytes mod 64 + const unsigned len = (index < 56) ? 
(56 - index) : (120 - index); + md5_update( padding, len ); // pad to 56 mod 64 + md5_update( bits, 8 ); // append data length in bits + + for( int i = 0, j = 0; i < 4; i++, j += 4 ) // store state in digest + { + digest[j ] = (uint8_t)state[i]; + digest[j+1] = (uint8_t)(state[i] >> 8); + digest[j+2] = (uint8_t)(state[i] >> 16); + digest[j+3] = (uint8_t)(state[i] >> 24); + } + } + + +void compute_md5( const uint8_t * const buffer, const unsigned long len, + uint8_t digest[16] ) + { + MD5SUM md5sum; + if( len > 0 ) md5sum.md5_update( buffer, len ); + md5sum.md5_finish( digest ); + } + + +bool check_md5( const uint8_t * const buffer, const unsigned long len, + const uint8_t digest[16] ) + { + uint8_t new_digest[16]; + compute_md5( buffer, len, new_digest ); + return ( std::memcmp( digest, new_digest, 16 ) == 0 ); + } @@ -0,0 +1,49 @@ +/* Functions to compute MD5 message digest of memory blocks according to the + definition of MD5 in RFC 1321 from April 1992. + Copyright (C) 2020-2022 Antonio Diaz Diaz. + + This library is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ + +class MD5SUM + { + uint64_t count; // data length in bytes, modulo 2^64 + uint32_t state[4]; // state (ABCD) + uint8_t ibuf[64]; // input buffer with space for a block + + void md5_process_block( const uint8_t block[64] ); + +public: + MD5SUM() { reset(); } + + void reset() + { + count = 0; + state[0] = 0x67452301; // magic initialization constants + state[1] = 0xEFCDAB89; + state[2] = 0x98BADCFE; + state[3] = 0x10325476; + } + + void md5_update( const uint8_t * const buffer, const unsigned long len ); + void md5_finish( uint8_t digest[16] ); + }; + +void compute_md5( const uint8_t * const buffer, const unsigned long len, + uint8_t digest[16] ); + +bool check_md5( const uint8_t * const buffer, const unsigned long len, + const uint8_t digest[16] ); diff --git a/merge.cc b/merge.cc new file mode 100644 index 0000000..8335841 --- /dev/null +++ b/merge.cc @@ -0,0 +1,649 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "decoder.h" +#include "lzip_index.h" + + +Block Block::split( const long long pos ) + { + if( pos > pos_ && pos < end() ) + { + const Block b( pos_, pos - pos_ ); + pos_ = pos; size_ -= b.size_; + return b; + } + return Block( 0, 0 ); + } + +namespace { + +bool pending_newline = false; + +void print_pending_newline( const char terminator ) + { if( pending_newline && terminator != '\n' ) std::fputc( '\n', stdout ); + pending_newline = false; } + + +bool file_crc( uint32_t & crc, const int infd, const char * const filename ) + { + const int buffer_size = 65536; + crc = 0xFFFFFFFFU; + uint8_t * const buffer = new uint8_t[buffer_size]; + bool error = false; + + while( true ) + { + const int rd = readblock( infd, buffer, buffer_size ); + if( rd != buffer_size && errno ) + { show_file_error( filename, "Error reading input file", errno ); + error = true; break; } + if( rd > 0 ) + crc32.update_buf( crc, buffer, rd ); + if( rd < buffer_size ) break; // EOF + } + delete[] buffer; + crc ^= 0xFFFFFFFFU; + return !error; + } + + +// Add 'bv' to 'block_vector' splitting blocks as needed to keep all the +// edges (pos and end of every block). +// 'block_vector' contains the result. 'bv' is destroyed. 
+void combine( std::vector< Block > & block_vector, std::vector< Block > & bv ) + { + if( block_vector.empty() ) { block_vector.swap( bv ); return; } + unsigned i1 = 0, i2 = 0; + while( i1 < block_vector.size() && i2 < bv.size() ) + { + Block & b1 = block_vector[i1]; + Block & b2 = bv[i2]; + if( b1.overlaps( b2 ) ) + { + if( b1 < b2 ) + { + Block b = b1.split( b2.pos() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; + } + else if( b2 < b1 ) + { + Block b( b2.pos(), b1.pos() - b2.pos() ); + b2.split( b1.pos() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; + } + else if( b1.end() < b2.end() ) { b2.split( b1.end() ); ++i1; } + else if( b2.end() < b1.end() ) + { + Block b = b1.split( b2.end() ); + block_vector.insert( block_vector.begin() + i1, b ); ++i1; ++i2; + } + else { ++i1; ++i2; } // blocks are identical + } + else if( b1 < b2 ) ++i1; + else { block_vector.insert( block_vector.begin() + i1, b2 ); ++i1; ++i2; } + } + if( i2 < bv.size() ) // tail copy + block_vector.insert( block_vector.end(), bv.begin() + i2, bv.end() ); + } + + +// positions in 'block_vector' are absolute file positions. +// blocks in 'block_vector' are ascending and don't overlap. 
+bool diff_member( const long long mpos, const long long msize, + const std::vector< std::string > & filenames, + const std::vector< int > & infd_vector, + std::vector< Block > & block_vector, + std::vector< int > & color_vector ) + { + const int files = infd_vector.size(); + const int buffer_size = 65536; + uint8_t * const buffer1 = new uint8_t[buffer_size]; + uint8_t * const buffer2 = new uint8_t[buffer_size]; + int next_color = 1; + + bool error = false; + for( int i1 = 0; i1 < files && !error; ++i1 ) + { + for( int i2 = i1 + 1; i2 < files && !error; ++i2 ) + { + if( color_vector[i1] != 0 && color_vector[i1] == color_vector[i2] ) + continue; + std::vector< Block > bv; + long long partial_pos = 0; + const int fd1 = infd_vector[i1], fd2 = infd_vector[i2]; + int begin = -1; // begin of block. -1 means no block + bool prev_equal = true; + if( !safe_seek( fd1, mpos ) || !safe_seek( fd2, mpos ) ) + { error = true; break; } + + while( partial_pos < msize ) + { + const int size = std::min( (long long)buffer_size, msize - partial_pos ); + const int rd = readblock( fd1, buffer1, size ); + if( rd != size && errno ) + { show_file_error( filenames[i1].c_str(), "Error reading input file", + errno ); error = true; break; } + if( rd > 0 ) + { + if( readblock( fd2, buffer2, rd ) != rd ) + { show_file_error( filenames[i2].c_str(), "Error reading input file", + errno ); error = true; break; } + for( int i = 0; i < rd; ++i ) + { + if( buffer1[i] != buffer2[i] ) + { + prev_equal = false; + if( begin < 0 ) begin = partial_pos + i; // begin block + } + else if( !prev_equal ) prev_equal = true; + else if( begin >= 0 ) // end block + { + Block b( mpos + begin, partial_pos + i - 1 - begin ); + begin = -1; + bv.push_back( b ); + } + } + partial_pos += rd; + } + if( rd < buffer_size ) break; // EOF + } + if( begin >= 0 ) // finish last block + { + Block b( mpos + begin, partial_pos - prev_equal - begin ); + bv.push_back( b ); + } + if( bv.empty() ) // members are identical, set to same 
color + { + if( color_vector[i1] == 0 ) + { + if( color_vector[i2] != 0 ) color_vector[i1] = color_vector[i2]; + else color_vector[i1] = color_vector[i2] = next_color++; + } + else if( color_vector[i2] == 0 ) color_vector[i2] = color_vector[i1]; + else internal_error( "different colors assigned to identical members." ); + } + combine( block_vector, bv ); + } + if( color_vector[i1] == 0 ) color_vector[i1] = next_color++; + } + delete[] buffer2; delete[] buffer1; + return !error; + } + + +long ipow( const unsigned base, const unsigned exponent ) + { + unsigned long result = 1; + for( unsigned i = 0; i < exponent; ++i ) + { + if( LONG_MAX / result >= base ) result *= base; + else { result = LONG_MAX; break; } + } + return result; + } + + +int open_input_files( const std::vector< std::string > & filenames, + std::vector< int > & infd_vector, + Lzip_index & lzip_index, struct stat * const in_statsp ) + { + const int files = filenames.size(); + for( int i = 0; i + 1 < files; ++i ) + for( int j = i + 1; j < files; ++j ) + if( filenames[i] == filenames[j] ) + { show_file_error( filenames[i].c_str(), "Input file given twice." ); + return 2; } + { + std::vector< uint32_t > crc_vector( files ); + for( int i = 0; i < files; ++i ) + { + struct stat in_stats; // not used + infd_vector[i] = open_instream( filenames[i].c_str(), + ( i == 0 ) ? in_statsp : &in_stats, false, true ); + if( infd_vector[i] < 0 ) return 1; + if( !file_crc( crc_vector[i], infd_vector[i], filenames[i].c_str() ) ) + return 1; + for( int j = 0; j < i; ++j ) + if( crc_vector[i] == crc_vector[j] ) + { show_2file_error( "Input files", filenames[j].c_str(), + filenames[i].c_str(), "are identical." 
); return 2; } + } + } + + long long insize = 0; + int good_i = -1; + for( int i = 0; i < files; ++i ) + { + long long tmp; + const Lzip_index li( infd_vector[i], true, true, true ); + if( li.retval() == 0 ) // file format is intact + { + if( good_i < 0 ) { good_i = i; lzip_index = li; } + else if( lzip_index != li ) + { show_2file_error( "Input files", filenames[good_i].c_str(), + filenames[i].c_str(), "are different." ); return 2; } + tmp = lzip_index.file_size(); + } + else // file format is damaged + { + tmp = lseek( infd_vector[i], 0, SEEK_END ); + if( tmp < 0 ) + { + show_file_error( filenames[i].c_str(), "Input file is not seekable." ); + return 1; + } + } + if( tmp < min_member_size ) + { show_file_error( filenames[i].c_str(), "Input file is too short." ); + return 2; } + if( i == 0 ) insize = tmp; + else if( insize != tmp ) + { show_2file_error( "Sizes of input files", filenames[0].c_str(), + filenames[i].c_str(), "are different." ); return 2; } + } + + if( lzip_index.retval() != 0 ) + { + const Lzip_index li( infd_vector, insize ); + if( li.retval() == 0 ) // file format could be recovered + lzip_index = li; + else + { show_error( "Format damaged in all input files." ); return 2; } + } + + for( int i = 0; i < files; ++i ) + { + const int infd = infd_vector[i]; + bool error = false; + for( long j = 0; j < lzip_index.members(); ++j ) + { + const long long mpos = lzip_index.mblock( j ).pos(); + const long long msize = lzip_index.mblock( j ).size(); + if( !safe_seek( infd, mpos ) ) return 1; + if( test_member_from_file( infd, msize ) != 0 ) { error = true; break; } + } + if( !error ) + { + if( verbosity >= 1 ) + std::printf( "File '%s' has no errors. 
Recovery is not needed.\n", + filenames[i].c_str() ); + return 0; + } + } + return -1; + } + + +void maybe_cluster_blocks( std::vector< Block > & block_vector ) + { + const unsigned long old_size = block_vector.size(); + if( old_size <= 16 ) return; + do { + int min_gap = INT_MAX; + bool same = true; // all gaps have the same size + for( unsigned i = 1; i < block_vector.size(); ++i ) + { + const long long gap = block_vector[i].pos() - block_vector[i-1].end(); + if( gap < min_gap ) + { if( min_gap < INT_MAX ) same = false; min_gap = gap; } + else if( gap != min_gap ) same = false; + } + if( min_gap >= INT_MAX || same ) break; + for( unsigned i = block_vector.size() - 1; i > 0; --i ) + { + const long long gap = block_vector[i].pos() - block_vector[i-1].end(); + if( gap == min_gap ) + { + block_vector[i-1].size( block_vector[i-1].size() + gap + + block_vector[i].size() ); + block_vector.erase( block_vector.begin() + i ); + } + } + } while( block_vector.size() > 16 ); + if( verbosity >= 1 && old_size > block_vector.size() ) + std::printf( " %lu errors have been grouped in %lu clusters.\n", + old_size, (long)block_vector.size() ); + } + + +bool color_done( const std::vector< int > & color_vector, const int i ) + { + for( int j = i - 1; j >= 0; --j ) + if( color_vector[j] == color_vector[i] ) return true; + return false; + } + + +// try dividing blocks in 2 color groups at every gap +bool try_merge_member2( const long long mpos, const long long msize, + const std::vector< Block > & block_vector, + const std::vector< int > & color_vector, + const std::vector< int > & infd_vector, + const char terminator ) + { + const int blocks = block_vector.size(); + const int files = infd_vector.size(); + const int variations = files * ( files - 1 ); + + for( int i1 = 0; i1 < files; ++i1 ) + for( int i2 = 0; i2 < files; ++i2 ) + { + if( i1 == i2 || color_vector[i1] == color_vector[i2] || + color_done( color_vector, i1 ) ) continue; + for( int bi = 0; bi < blocks; ++bi ) + if( 
!safe_seek( infd_vector[i2], block_vector[bi].pos() ) || + !safe_seek( outfd, block_vector[bi].pos() ) || + !copy_file( infd_vector[i2], outfd, block_vector[bi].size() ) ) + cleanup_and_fail( 1 ); + const int infd = infd_vector[i1]; + const int var = ( i1 * ( files - 1 ) ) + i2 - ( i2 > i1 ) + 1; + for( int bi = 0; bi + 1 < blocks; ++bi ) + { + if( verbosity >= 2 ) + { + std::printf( " Trying variation %d of %d, block %d %c", + var, variations, bi + 1, terminator ); + std::fflush( stdout ); pending_newline = true; + } + if( !safe_seek( infd, block_vector[bi].pos() ) || + !safe_seek( outfd, block_vector[bi].pos() ) || + !copy_file( infd, outfd, block_vector[bi].size() ) || + !safe_seek( outfd, mpos ) ) + cleanup_and_fail( 1 ); + long long failure_pos = 0; + if( test_member_from_file( outfd, msize, &failure_pos ) == 0 ) + return true; + if( mpos + failure_pos < block_vector[bi].end() ) break; + } + } + return false; + } + + +// merge block by block +bool try_merge_member( const long long mpos, const long long msize, + const std::vector< Block > & block_vector, + const std::vector< int > & color_vector, + const std::vector< int > & infd_vector, + const char terminator ) + { + const int blocks = block_vector.size(); + const int files = infd_vector.size(); + const long variations = ipow( files, blocks ); + if( variations >= LONG_MAX ) + { + if( files > 2 ) + show_error( "Too many damaged blocks. Try merging fewer files." ); + else + show_error( "Too many damaged blocks. Merging is not possible." 
); + cleanup_and_fail( 2 ); + } + int bi = 0; // block index + std::vector< int > file_idx( blocks, 0 ); // file to read each block from + + while( bi >= 0 ) + { + if( verbosity >= 2 ) + { + long var = 0; + for( int i = 0; i < blocks; ++i ) + var = ( var * files ) + file_idx[i]; + std::printf( " Trying variation %ld of %ld %c", + var + 1, variations, terminator ); + std::fflush( stdout ); pending_newline = true; + } + while( bi < blocks ) + { + const int infd = infd_vector[file_idx[bi]]; + if( !safe_seek( infd, block_vector[bi].pos() ) || + !safe_seek( outfd, block_vector[bi].pos() ) || + !copy_file( infd, outfd, block_vector[bi].size() ) ) + cleanup_and_fail( 1 ); + ++bi; + } + if( !safe_seek( outfd, mpos ) ) cleanup_and_fail( 1 ); + long long failure_pos = 0; + if( test_member_from_file( outfd, msize, &failure_pos ) == 0 ) return true; + while( bi > 0 && mpos + failure_pos < block_vector[bi-1].pos() ) --bi; + while( --bi >= 0 ) + { + while( ++file_idx[bi] < files && + color_done( color_vector, file_idx[bi] ) ); + if( file_idx[bi] < files ) break; + file_idx[bi] = 0; + } + } + return false; + } + + +// merge a single block split at every possible position +bool try_merge_member1( const long long mpos, const long long msize, + const std::vector< Block > & block_vector, + const std::vector< int > & color_vector, + const std::vector< int > & infd_vector, + const char terminator ) + { + if( block_vector.size() != 1 || block_vector[0].size() <= 1 ) return false; + const long long pos = block_vector[0].pos(); + const long long size = block_vector[0].size(); + const int files = infd_vector.size(); + const int variations = files * ( files - 1 ); + uint8_t byte; + + for( int i1 = 0; i1 < files; ++i1 ) + for( int i2 = 0; i2 < files; ++i2 ) + { + if( i1 == i2 || color_vector[i1] == color_vector[i2] || + color_done( color_vector, i1 ) ) continue; + const int infd = infd_vector[i1]; + if( !safe_seek( infd, pos ) || + !safe_seek( infd_vector[i2], pos ) || + !safe_seek( outfd, 
pos ) || + !copy_file( infd_vector[i2], outfd, size ) ) + cleanup_and_fail( 1 ); + const int var = ( i1 * ( files - 1 ) ) + i2 - ( i2 > i1 ) + 1; + for( long long i = 0; i + 1 < size; ++i ) + { + if( verbosity >= 2 ) + { + std::printf( " Trying variation %d of %d, position %lld %c", + var, variations, pos + i, terminator ); + std::fflush( stdout ); pending_newline = true; + } + if( !safe_seek( outfd, pos + i ) || + readblock( infd, &byte, 1 ) != 1 || + writeblock( outfd, &byte, 1 ) != 1 || + !safe_seek( outfd, mpos ) ) + cleanup_and_fail( 1 ); + long long failure_pos = 0; + if( test_member_from_file( outfd, msize, &failure_pos ) == 0 ) + return true; + if( mpos + failure_pos <= pos + i ) break; + } + } + return false; + } + +} // end namespace + + +// infd and outfd can refer to the same file if copying to a lower file +// position or if source and destination blocks don't overlap. +// max_size < 0 means no size limit. +bool copy_file( const int infd, const int outfd, const long long max_size ) + { + const int buffer_size = 65536; + // remaining number of bytes to copy + long long rest = ( ( max_size >= 0 ) ? max_size : buffer_size ); + long long copied_size = 0; + uint8_t * const buffer = new uint8_t[buffer_size]; + bool error = false; + + while( rest > 0 ) + { + const int size = std::min( (long long)buffer_size, rest ); + if( max_size >= 0 ) rest -= size; + const int rd = readblock( infd, buffer, size ); + if( rd != size && errno ) + { show_error( "Error reading input file", errno ); error = true; break; } + if( rd > 0 ) + { + const int wr = writeblock( outfd, buffer, rd ); + if( wr != rd ) + { show_error( "Error writing output file", errno ); + error = true; break; } + copied_size += rd; + } + if( rd < size ) break; // EOF + } + delete[] buffer; + if( !error && max_size >= 0 && copied_size != max_size ) + { show_error( "Input file ends unexpectedly." 
); error = true; } + return !error; + } + + +// Return value: 0 = OK, 1 = bad msize, 2 = data error +// 'failure_pos' is relative to the beginning of the member +int test_member_from_file( const int infd, const unsigned long long msize, + long long * const failure_posp ) + { + Range_decoder rdec( infd ); + Lzip_header header; + rdec.read_data( header.data, Lzip_header::size ); + const unsigned dictionary_size = header.dictionary_size(); + bool done = false; + if( !rdec.finished() && header.verify_magic() && + header.verify_version() && isvalid_ds( dictionary_size ) ) + { + LZ_decoder decoder( rdec, dictionary_size, -1 ); + const int old_verbosity = verbosity; + verbosity = -1; // suppress all messages + Pretty_print dummy_pp( "" ); + done = ( decoder.decode_member( dummy_pp ) == 0 ); + verbosity = old_verbosity; // restore verbosity level + if( done && rdec.member_position() == msize ) return 0; + } + if( failure_posp ) *failure_posp = rdec.member_position(); + return done ? 1 : 2; + } + + +int merge_files( const std::vector< std::string > & filenames, + const std::string & default_output_filename, + const char terminator, const bool force ) + { + const int files = filenames.size(); + std::vector< int > infd_vector( files ); + Lzip_index lzip_index; + struct stat in_stats; + const int retval = + open_input_files( filenames, infd_vector, lzip_index, &in_stats ); + if( retval >= 0 ) return retval; + if( !safe_seek( infd_vector[0], 0 ) ) return 1; + + output_filename = default_output_filename.empty() ? 
+ insert_fixed( filenames[0] ) : default_output_filename; + set_signal_handler(); + if( !open_outstream( force, true, true, false ) ) return 1; + if( !copy_file( infd_vector[0], outfd ) ) // copy whole file + cleanup_and_fail( 1 ); + + for( long j = 0; j < lzip_index.members(); ++j ) + { + const long long mpos = lzip_index.mblock( j ).pos(); + const long long msize = lzip_index.mblock( j ).size(); + // vector of data blocks differing among the copies of the current member + std::vector< Block > block_vector; + // different color means members are different + std::vector< int > color_vector( files, 0 ); + if( !diff_member( mpos, msize, filenames, infd_vector, block_vector, + color_vector ) || !safe_seek( outfd, mpos ) ) + cleanup_and_fail( 1 ); + + if( block_vector.empty() ) + { + if( lzip_index.members() > 1 && test_member_from_file( outfd, msize ) == 0 ) + continue; + if( verbosity >= 0 ) + std::fprintf( stderr, "Member %ld is damaged and identical in all files." + " Merging is not possible.\n", j + 1 ); + cleanup_and_fail( 2 ); + } + + if( verbosity >= 2 ) + { + std::printf( "Merging member %ld of %ld (%lu error%s)\n", + j + 1, lzip_index.members(), (long)block_vector.size(), + ( block_vector.size() == 1 ) ? "" : "s" ); + std::fflush( stdout ); + } + + bool done = false; + if( block_vector.size() > 1 ) + { + maybe_cluster_blocks( block_vector ); + done = try_merge_member2( mpos, msize, block_vector, color_vector, + infd_vector, terminator ); + print_pending_newline( terminator ); + } + // With just one member and one differing block the merge can't succeed. 
+ if( !done && ( lzip_index.members() > 1 || block_vector.size() > 1 ) ) + { + done = try_merge_member( mpos, msize, block_vector, color_vector, + infd_vector, terminator ); + print_pending_newline( terminator ); + } + if( !done ) + { + done = try_merge_member1( mpos, msize, block_vector, color_vector, + infd_vector, terminator ); + print_pending_newline( terminator ); + } + if( !done ) + { + if( verbosity >= 3 ) + for( unsigned i = 0; i < block_vector.size(); ++i ) + std::fprintf( stderr, "area %2d from position %6lld to %6lld\n", i + 1, + block_vector[i].pos(), block_vector[i].end() - 1 ); + show_error( "Some error areas overlap. Merging is not possible." ); + cleanup_and_fail( 2 ); + } + } + + if( close_outstream( &in_stats ) != 0 ) return 1; + if( verbosity >= 1 ) + std::fputs( "Input files merged successfully.\n", stdout ); + return 0; + } diff --git a/mtester.cc b/mtester.cc new file mode 100644 index 0000000..ecdb9c0 --- /dev/null +++ b/mtester.cc @@ -0,0 +1,377 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> + +#include "lzip.h" +#include "md5.h" +#include "mtester.h" + + +namespace { + +const char * format_byte( const uint8_t byte ) + { + enum { buffers = 8, bufsize = 16 }; + static char buffer[buffers][bufsize]; // circle of static buffers for printf + static int current = 0; + char * const buf = buffer[current++]; current %= buffers; + if( ( byte >= 0x20 && byte <= 0x7E ) || byte >= 0xA0 ) + snprintf( buf, bufsize, "'%c' (0x%02X)", byte, byte ); + else + snprintf( buf, bufsize, " (0x%02X)", byte ); + return buf; + } + +} // end namespace + + +void LZ_mtester::print_block( const int len ) + { + std::fputs( " \"", stdout ); + for( int i = len - 1; i >= 0; --i ) + { + uint8_t byte = peek( i ); + if( byte < 0x20 || ( byte > 0x7E && byte < 0xA0 ) ) byte = '.'; + std::fputc( byte, stdout ); + } + std::fputs( "\"\n", stdout ); + } + + +void LZ_mtester::duplicate_buffer( uint8_t * const buffer2 ) + { + if( data_position() > 0 ) + std::memcpy( buffer2, buffer, std::min( data_position(), + (unsigned long long)dictionary_size ) ); + else buffer2[dictionary_size-1] = 0; // prev_byte of first byte + buffer = buffer2; + buffer_is_external = true; + } + + +void LZ_mtester::flush_data() + { + if( pos > stream_pos ) + { + const int size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + if( md5sum ) md5sum->md5_update( buffer + stream_pos, size ); + if( outfd >= 0 && writeblock( outfd, buffer + stream_pos, size ) != size ) + throw Error( "Write error" ); + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_mtester::verify_trailer( FILE * const f, unsigned long long byte_pos ) + { + const Lzip_trailer * const trailer = rdec.get_trailer(); + if( 
!trailer ) + { + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fputs( "Can't get trailer.\n", f ); } + return false; + } + const unsigned long long data_size = data_position(); + const unsigned long long member_size = rdec.member_position(); + bool error = false; + + const unsigned td_crc = trailer->data_crc(); + if( td_crc != crc() ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "CRC mismatch; stored %08X, computed %08X\n", + td_crc, crc() ); } + } + const unsigned long long td_size = trailer->data_size(); + if( td_size != data_size ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "Data size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + td_size, td_size, data_size, data_size ); } + } + const unsigned long long tm_size = trailer->member_size(); + if( tm_size != member_size ) + { + error = true; + if( verbosity >= 0 && f ) + { if( byte_pos ) + { std::fprintf( f, "byte %llu\n", byte_pos ); byte_pos = 0; } + std::fprintf( f, "Member size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", + tm_size, tm_size, member_size, member_size ); } + } + return !error; + } + + +/* Return value: 0 = OK, 1 = decoder error, 2 = unexpected EOF, + 3 = trailer error, 4 = unknown marker found, + -1 = pos_limit reached. 
*/ +int LZ_mtester::test_member( const unsigned long long mpos_limit, + const unsigned long long dpos_limit, + FILE * const f, const unsigned long long byte_pos ) + { + if( mpos_limit < Lzip_header::size + 5 ) return -1; + if( member_position() == Lzip_header::size ) rdec.load(); + while( !rdec.finished() ) + { + if( member_position() >= mpos_limit || data_position() >= dpos_limit ) + { flush_data(); return -1; } + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + // literal byte + Bit_model * const bm = bm_literal[get_lit_state(peek_prev())]; + if( state.is_char_set_char() ) + put_byte( rdec.decode_tree8( bm ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + continue; + } + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + len = rdec.decode_len( match_len_model, pos_state ); + unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) + { + const unsigned dis_slot = distance; + const int direct_bits = ( dis_slot >> 1 ) - 1; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + distance += rdec.decode_tree_reversed( + bm_dis + ( distance - dis_slot ), direct_bits ); + else + { + distance += + rdec.decode( direct_bits - dis_align_bits ) << 
dis_align_bits; + distance += rdec.decode_tree_reversed4( bm_align ); + if( distance == 0xFFFFFFFFU ) // marker found + { + rdec.normalize(); + flush_data(); + if( len == min_match_len ) // End Of Stream marker + { + if( verify_trailer( f, byte_pos ) ) return 0; else return 3; + } + if( verbosity >= 0 && f ) + { + if( byte_pos ) std::fprintf( f, "byte %llu\n", byte_pos ); + std::fprintf( f, "Unsupported marker code '%d'\n", len ); + } + return 4; + } + } + } + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + if( rep0 > max_rep0 ) max_rep0 = rep0; + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { if( outfd >= 0 ) { flush_data(); } return 1; } + } + copy_block( rep0, len ); + } + if( outfd >= 0 ) flush_data(); + return 2; + } + + +/* Return value: 0 = OK, 1 = decoder error, 2 = unexpected EOF, + 3 = trailer error, 4 = unknown marker found. */ +int LZ_mtester::debug_decode_member( const long long dpos, const long long mpos, + const bool show_packets ) + { + rdec.load(); + unsigned old_tmpos = member_position(); // truncated member_position + while( !rdec.finished() ) + { + const unsigned long long dp = data_position() + dpos; + const unsigned long long mp = member_position() + mpos - 4; + const unsigned tmpos = member_position(); + set_max_packet( tmpos - old_tmpos, mp ); + old_tmpos = tmpos; + ++total_packets_; + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + // literal byte + Bit_model * const bm = bm_literal[get_lit_state(peek_prev())]; + if( state.is_char_set_char() ) + { + const uint8_t cur_byte = rdec.decode_tree8( bm ); + put_byte( cur_byte ); + if( show_packets ) + std::printf( "%6llu %6llu literal %s\n", + mp, dp, format_byte( cur_byte ) ); + } + else + { + const uint8_t match_byte = peek( rep0 ); + const uint8_t cur_byte = rdec.decode_matched( bm, match_byte ); + put_byte( cur_byte ); + if( show_packets ) + std::printf( 
"%6llu %6llu literal %s, match byte %6llu %s\n", + mp, dp, format_byte( cur_byte ), dp - rep0 - 1, + format_byte( match_byte ) ); + } + continue; + } + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + int rep = 0; + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { + if( show_packets ) + std::printf( "%6llu %6llu shortrep %s %6u (%6llu)\n", + mp, dp, format_byte( peek( rep0 ) ), + rep0 + 1, dp - rep0 - 1 ); + state.set_short_rep(); put_byte( peek( rep0 ) ); continue; + } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + { distance = rep1; rep = 1; } + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + { distance = rep2; rep = 2; } + else + { distance = rep3; rep3 = rep2; rep = 3; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = rdec.decode_len( rep_len_model, pos_state ); + if( show_packets ) + std::printf( "%6llu %6llu rep%c %6u,%3d (%6llu)", + mp, dp, rep + '0', rep0 + 1, len, dp - rep0 - 1 ); + } + else // match + { + len = rdec.decode_len( match_len_model, pos_state ); + unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) + { + const unsigned dis_slot = distance; + const int direct_bits = ( dis_slot >> 1 ) - 1; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + distance += rdec.decode_tree_reversed( + bm_dis + ( distance - dis_slot ), direct_bits ); + else + { + distance += + rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + distance += rdec.decode_tree_reversed4( bm_align ); + if( distance == 0xFFFFFFFFU ) // marker found + { + rdec.normalize(); + flush_data(); + const unsigned tmpos = member_position(); + set_max_marker( tmpos - old_tmpos ); + old_tmpos = tmpos; + if( show_packets ) + std::printf( "%6llu %6llu 
marker code '%d'\n", mp, dp, len ); + if( len == min_match_len ) // End Of Stream marker + { + if( show_packets ) + std::printf( "%6llu %6llu member trailer\n", + mpos + member_position(), dpos + data_position() ); + if( verify_trailer( show_packets ? stdout : 0 ) ) return 0; + return 3; + } + if( len == min_match_len + 1 ) // Sync Flush marker + { + rdec.load(); continue; + } + return 4; + } + } + } + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + if( rep0 > max_rep0 ) { max_rep0 = rep0; max_rep0_pos = mp; } + state.set_match(); + if( show_packets ) + std::printf( "%6llu %6llu match %6u,%3d (%6lld)", + mp, dp, rep0 + 1, len, dp - rep0 - 1 ); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); if( show_packets ) std::fputc( '\n', stdout ); + return 1; } + } + copy_block( rep0, len ); + if( show_packets ) print_block( len ); + } + flush_data(); + return 2; + } diff --git a/mtester.h b/mtester.h new file mode 100644 index 0000000..12c7d2d --- /dev/null +++ b/mtester.h @@ -0,0 +1,395 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +class Range_mtester + { + const uint8_t * const buffer; // input buffer + const long long buffer_size; + long long pos; // current pos in buffer + uint32_t code; + uint32_t range; + bool at_stream_end; + +public: + Range_mtester( const uint8_t * const buf, const long long buf_size ) + : + buffer( buf ), + buffer_size( buf_size ), + pos( Lzip_header::size ), + code( 0 ), + range( 0xFFFFFFFFU ), + at_stream_end( false ) + {} + + bool finished() { return pos >= buffer_size; } + unsigned long long member_position() const { return pos; } + + uint8_t get_byte() + { + // 0xFF avoids decoder error if member is truncated at EOS marker + if( finished() ) return 0xFF; + return buffer[pos++]; + } + + const Lzip_trailer * get_trailer() + { + if( buffer_size - pos < Lzip_trailer::size ) return 0; + const Lzip_trailer * const p = (const Lzip_trailer *)( buffer + pos ); + pos += Lzip_trailer::size; + return p; + } + + void load() + { + code = 0; + for( int i = 0; i < 5; ++i ) code = ( code << 8 ) | get_byte(); + range = 0xFFFFFFFFU; + code &= range; // make sure that first byte is discarded + } + + void normalize() + { + if( range <= 0x00FFFFFFU ) + { range <<= 8; code = ( code << 8 ) | get_byte(); } + } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + normalize(); + range >>= 1; +// symbol <<= 1; +// if( code >= range ) { code -= range; symbol |= 1; } + const bool bit = ( code >= range ); + symbol <<= 1; symbol += bit; + code -= range & ( 0U - bit ); + } + return symbol; + } + + unsigned decode_bit( Bit_model & bm ) + { + normalize(); + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + return 0; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + return 1; + } + } + + void decode_symbol_bit( Bit_model & 
bm, unsigned & symbol ) + { + normalize(); + symbol <<= 1; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol |= 1; + } + } + + void decode_symbol_bit_reversed( Bit_model & bm, unsigned & model, + unsigned & symbol, const int i ) + { + normalize(); + model <<= 1; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + model |= 1; + symbol |= 1 << i; + } + } + + unsigned decode_tree6( Bit_model bm[] ) + { + unsigned symbol = 1; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return symbol & 0x3F; + } + + unsigned decode_tree8( Bit_model bm[] ) + { + unsigned symbol = 1; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return symbol & 0xFF; + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned model = 1; + unsigned symbol = 0; + for( int i = 0; i < num_bits; ++i ) + decode_symbol_bit_reversed( bm[model], model, symbol, i ); + return symbol; + } + + unsigned decode_tree_reversed4( Bit_model bm[] ) + { + unsigned 
model = 1; + unsigned symbol = 0; + decode_symbol_bit_reversed( bm[model], model, symbol, 0 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 1 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 2 ); + decode_symbol_bit_reversed( bm[model], model, symbol, 3 ); + return symbol; + } + + unsigned decode_matched( Bit_model bm[], unsigned match_byte ) + { + Bit_model * const bm1 = bm + 0x100; + unsigned symbol = 1; + while( symbol < 0x100 ) + { + const unsigned match_bit = ( match_byte <<= 1 ) & 0x100; + const bool bit = decode_bit( bm1[symbol+match_bit] ); + symbol <<= 1; symbol |= bit; + if( match_bit >> 8 != bit ) + { + while( symbol < 0x100 ) decode_symbol_bit( bm[symbol], symbol ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + Bit_model * bm; + unsigned mask, offset, symbol = 1; + + if( decode_bit( lm.choice1 ) == 0 ) + { bm = lm.bm_low[pos_state]; mask = 7; offset = 0; goto len3; } + if( decode_bit( lm.choice2 ) == 0 ) + { bm = lm.bm_mid[pos_state]; mask = 7; offset = len_low_symbols; goto len3; } + bm = lm.bm_high; mask = 0xFF; offset = len_low_symbols + len_mid_symbols; + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); +len3: + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + decode_symbol_bit( bm[symbol], symbol ); + return ( symbol & mask ) + min_match_len + offset; + } + }; + +class MD5SUM; // forward declaration + +class LZ_mtester + { + unsigned long long partial_data_pos; + Range_mtester rdec; + const unsigned dictionary_size; + uint8_t * buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to file + uint32_t crc_; + const int outfd; // output file descriptor + unsigned rep0; // rep[0-3] latest four distances 
+ unsigned rep1; // used for efficient coding of + unsigned rep2; // repeated distances + unsigned rep3; + State state; + MD5SUM * const md5sum; + unsigned long long total_packets_; // total number of packets in member + unsigned long long max_rep0_pos; // file position of maximum distance + unsigned max_rep0; // maximum distance found + std::vector< unsigned long long > max_packet_posv_; // file pos of large packets + unsigned max_packet_size_; // maximum packet size found + unsigned max_marker_size_; // maximum marker size found + bool pos_wrapped; + bool buffer_is_external; + + Bit_model bm_literal[1<<literal_context_bits][0x300]; + Bit_model bm_match[State::states][pos_states]; + Bit_model bm_rep[State::states]; + Bit_model bm_rep0[State::states]; + Bit_model bm_rep1[State::states]; + Bit_model bm_rep2[State::states]; + Bit_model bm_len[State::states][pos_states]; + Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; + Bit_model bm_align[dis_align_size]; + + Len_model match_len_model; + Len_model rep_len_model; + + void print_block( const int len ); + void flush_data(); + bool verify_trailer( FILE * const f = 0, unsigned long long byte_pos = 0 ); + + uint8_t peek_prev() const + { return buffer[((pos > 0) ? pos : dictionary_size)-1]; } + + uint8_t peek( const unsigned distance ) const + { + const unsigned i = ( ( pos > distance ) ? 
0 : dictionary_size ) + + pos - distance - 1; + return buffer[i]; + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + + void copy_block( const unsigned distance, unsigned len ) + { + unsigned lpos = pos, i = lpos - distance - 1; + bool fast, fast2; + if( lpos > distance ) + { + fast = ( len < dictionary_size - lpos ); + fast2 = ( fast && len <= lpos - i ); + } + else + { + i += dictionary_size; + fast = ( len < dictionary_size - i ); // (i == pos) may happen + fast2 = ( fast && len <= i - lpos ); + } + if( fast ) // no wrap + { + pos += len; + if( fast2 ) // no wrap, no overlap + std::memcpy( buffer + lpos, buffer + i, len ); + else + for( ; len > 0; --len ) buffer[lpos++] = buffer[i++]; + } + else for( ; len > 0; --len ) + { + buffer[pos] = buffer[i]; + if( ++pos >= dictionary_size ) flush_data(); + if( ++i >= dictionary_size ) i = 0; + } + } + +void set_max_packet( const unsigned new_size, const unsigned long long pos ) + { + if( max_packet_size_ > new_size || new_size == 0 ) return; + if( max_packet_size_ < new_size ) // new max size + { max_packet_size_ = new_size; max_packet_posv_.clear(); } + max_packet_posv_.push_back( pos - new_size ); // pos of first byte + } + +void set_max_marker( const unsigned new_size ) + { if( max_marker_size_ < new_size ) max_marker_size_ = new_size; } + +public: + LZ_mtester( const uint8_t * const ibuf, const long long ibuf_size, + const unsigned dict_size, const int ofd = -1, + MD5SUM * const md5sum_ = 0 ) + : + partial_data_pos( 0 ), + rdec( ibuf, ibuf_size ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + outfd( ofd ), + rep0( 0 ), + rep1( 0 ), + rep2( 0 ), + rep3( 0 ), + md5sum( md5sum_ ), + total_packets_( -1ULL ), // don't count EOS marker + max_rep0_pos( 0 ), + max_rep0( 0 ), + max_packet_size_( 0 ), + max_marker_size_( 0 ), + pos_wrapped( false ), buffer_is_external( false ) + 
// prev_byte of first byte; also for peek( 0 ) on corrupt file + { buffer[dictionary_size-1] = 0; } + + ~LZ_mtester() { if( !buffer_is_external ) delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + bool finished() { return rdec.finished(); } + unsigned long long member_position() const { return rdec.member_position(); } + unsigned long long total_packets() const { return total_packets_; } + unsigned long long max_distance_pos() const { return max_rep0_pos; } + unsigned max_distance() const { return max_rep0 + 1; } + const std::vector< unsigned long long > & max_packet_posv() const + { return max_packet_posv_; } + unsigned max_packet_size() const { return max_packet_size_; } + unsigned max_marker_size() const { return max_marker_size_; } + + const uint8_t * get_buffers( const uint8_t ** const prev_bufferp, + int * const sizep, int * const prev_sizep ) const + { *sizep = ( pos_wrapped && pos == 0 ) ? dictionary_size : pos; + *prev_sizep = ( pos_wrapped && pos > 0 ) ? dictionary_size - pos : 0; + *prev_bufferp = buffer + pos; return buffer; } + + void duplicate_buffer( uint8_t * const buffer2 ); + + // these two functions set max_rep0 + int test_member( const unsigned long long mpos_limit = LLONG_MAX, + const unsigned long long dpos_limit = LLONG_MAX, + FILE * const f = 0, const unsigned long long byte_pos = 0 ); + /* this function also sets max_rep0_pos, total_packets_, max_packet_size_, + max_packet_posv_, and max_marker_size_ */ + int debug_decode_member( const long long dpos, const long long mpos, + const bool show_packets ); + }; diff --git a/nrep_stats.cc b/nrep_stats.cc new file mode 100644 index 0000000..1f249ff --- /dev/null +++ b/nrep_stats.cc @@ -0,0 +1,117 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "lzip_index.h" + + +/* Show how well the frequency of sequences of N repeated bytes in LZMA data + matches the value expected for random data. ( 1 / 2^( 8 * N ) ) + Print cumulative data for all files followed by the name of the first + file with the longest sequence. +*/ +int print_nrep_stats( const std::vector< std::string > & filenames, + const int repeated_byte, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ) + { + std::vector< unsigned long > len_vector; + unsigned long long best_pos = 0, lzma_size = 0; + int best_name = -1, retval = 0; + const bool count_all = ( repeated_byte < 0 || repeated_byte >= 256 ); + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? 
STDIN_FILENO : + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + close( infd ); + continue; + } + const unsigned long long cdata_size = lzip_index.cdata_size(); + const uint8_t * const buffer = + (const uint8_t *)mmap( 0, cdata_size, PROT_READ, MAP_PRIVATE, infd, 0 ); + close( infd ); + if( buffer == MAP_FAILED ) + { show_file_error( input_filename, "Can't mmap", errno ); + set_retval( retval, 1 ); continue; } + for( long j = 0; j < lzip_index.members(); ++j ) + { + const Block & mb = lzip_index.mblock( j ); + long long pos = mb.pos() + 7; // skip header (+1 byte) and + const long long end = mb.end() - 20; // trailer of each member + lzma_size += end - pos; + while( pos < end ) + { + const uint8_t byte = buffer[pos++]; + if( buffer[pos] == byte ) + { + unsigned len = 2; + ++pos; + while( pos < end && buffer[pos] == byte ) { ++pos; ++len; } + if( !count_all && repeated_byte != (int)byte ) continue; + if( len >= len_vector.size() ) { len_vector.resize( len + 1 ); + best_name = i; best_pos = pos - len; } + ++len_vector[len]; + } + } + } + munmap( (void *)buffer, cdata_size ); + } + + if( count_all ) + std::fputs( "\nShowing repeated sequences of any byte value.\n", stdout ); + else + std::printf( "\nShowing repeated sequences of the byte value 0x%02X\n", + repeated_byte ); + std::printf( "Total size of LZMA data: %llu bytes (%sBytes)\n", + lzma_size, format_num( lzma_size, 999 ) ); + for( unsigned len = 2; len < len_vector.size(); ++len ) + if( len_vector[len] > 0 ) + std::printf( "len %u found %lu times, 1 every %llu bytes " + "(expected 1 every %sB)\n", + len, len_vector[len], lzma_size / len_vector[len], + format_num( 1ULL << ( 8 * ( len - count_all ) 
), -1ULL, -1 ) ); + if( best_name >= 0 ) + std::printf( "Longest sequence found at position %llu of '%s'\n", + best_pos, filenames[best_name].c_str() ); + return retval; + } diff --git a/range_dec.cc b/range_dec.cc new file mode 100644 index 0000000..ea7f7e7 --- /dev/null +++ b/range_dec.cc @@ -0,0 +1,185 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "decoder.h" +#include "lzip_index.h" + + +namespace { + +bool decompress_member( const int infd, const Pretty_print & pp, + const unsigned long long mpos, + const unsigned long long outskip, + const unsigned long long outend ) + { + Range_decoder rdec( infd ); + Lzip_header header; + rdec.read_data( header.data, Lzip_header::size ); + if( rdec.finished() ) // End Of File + { pp( "File ends unexpectedly at member header." 
); return false; } + if( !header.verify_magic() ) { pp( bad_magic_msg ); return false; } + if( !header.verify_version() ) + { pp( bad_version( header.version() ) ); return false; } + const unsigned dictionary_size = header.dictionary_size(); + if( !isvalid_ds( dictionary_size ) ) { pp( bad_dict_msg ); return false; } + + if( verbosity >= 2 ) pp(); + + LZ_decoder decoder( rdec, dictionary_size, outfd, outskip, outend ); + const int result = decoder.decode_member( pp ); + if( result != 0 ) + { + if( verbosity >= 0 && result <= 2 ) + { + pp(); + std::fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ? + "File ends unexpectedly" : "Decoder error", + mpos + rdec.member_position() ); + } + return false; + } + if( decoder.data_position() < outend - outskip ) + { + if( verbosity >= 0 ) + { pp(); std::fprintf( stderr, + "%sMember at pos %llu contains only %llu bytes of %llu requested.\n", + ( verbosity >= 2 ) ? "\n" : "", mpos, + decoder.data_position() - outskip, outend - outskip ); } + return false; + } + if( verbosity >= 2 ) std::fputs( "done\n", stderr ); + return true; + } + +} // end namespace + + +const char * format_num( unsigned long long num, + unsigned long long limit, + const int set_prefix ) + { + const char * const si_prefix[8] = + { "k", "M", "G", "T", "P", "E", "Z", "Y" }; + const char * const binary_prefix[8] = + { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; + enum { buffers = 8, bufsize = 32 }; + static char buffer[buffers][bufsize]; // circle of static buffers for printf + static int current = 0; + static bool si = true; + + if( set_prefix ) si = ( set_prefix > 0 ); + unsigned long long den = 1; + const unsigned factor = ( si ? 1000 : 1024 ); + char * const buf = buffer[current++]; current %= buffers; + const char * const * prefix = ( si ? 
si_prefix : binary_prefix ); + const char * p = ""; + + for( int i = 0; i < 8 && num / den >= factor && den * factor > den; ++i ) + { if( num / den <= limit && num % ( den * factor ) != 0 ) break; + den *= factor; p = prefix[i]; } + if( num % den == 0 ) + snprintf( buf, bufsize, "%llu %s", num / den, p ); + else + snprintf( buf, bufsize, "%3.2f %s", (double)num / den, p ); + return buf; + } + + +bool safe_seek( const int fd, const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) return true; + show_error( "Seek error", errno ); return false; + } + + +int range_decompress( const std::string & input_filename, + const std::string & default_output_filename, + Block range, const bool force, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing, + const bool to_stdout ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + const long long udata_size = lzip_index.udata_size(); + if( range.end() > udata_size ) + range.size( std::max( 0LL, udata_size - range.pos() ) ); + if( range.size() <= 0 ) + { if( udata_size > 0 ) + show_file_error( input_filename.c_str(), "Nothing to do." 
); + return 0; } + + if( to_stdout || default_output_filename.empty() ) outfd = STDOUT_FILENO; + else + { + output_filename = default_output_filename; + set_signal_handler(); + if( !open_outstream( force, true, false, false ) ) return 1; + } + + if( verbosity >= 1 ) + std::fprintf( stderr, "Decompressing range %sB to %sB (%sB of %sBytes)\n", + format_num( range.pos() ), + format_num( range.pos() + range.size() ), + format_num( range.size() ), format_num( udata_size ) ); + + Pretty_print pp( input_filename ); + bool error = false; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const Block & db = lzip_index.dblock( i ); + if( range.overlaps( db ) ) + { + if( verbosity >= 3 && lzip_index.members() > 1 ) + std::fprintf( stderr, "Decompressing member %3ld\n", i + 1 ); + const long long outskip = std::max( 0LL, range.pos() - db.pos() ); + const long long outend = std::min( db.size(), range.end() - db.pos() ); + const long long mpos = lzip_index.mblock( i ).pos(); + if( !safe_seek( infd, mpos ) ) cleanup_and_fail( 1 ); + if( !decompress_member( infd, pp, mpos, outskip, outend ) ) + { if( !ignore_errors ) cleanup_and_fail( 2 ); else error = true; } + pp.reset(); + } + } + close( infd ); + if( close_outstream( &in_stats ) != 0 ) cleanup_and_fail( 1 ); + if( verbosity >= 2 && !error ) + std::fputs( "Byte range decompressed successfully.\n", stderr ); + return 0; // either no error or ignored + } diff --git a/repair.cc b/repair.cc new file mode 100644 index 0000000..c49fbdb --- /dev/null +++ b/repair.cc @@ -0,0 +1,517 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "mtester.h" +#include "lzip_index.h" + + +namespace { + +bool pending_newline = false; + +void print_pending_newline( const char terminator ) + { if( pending_newline && terminator != '\n' ) std::fputc( '\n', stdout ); + pending_newline = false; } + + +bool gross_damage( const long long msize, const uint8_t * const mbuffer ) + { + enum { maxlen = 7 }; // max number of consecutive identical bytes + long i = Lzip_header::size; + const long end = msize - Lzip_trailer::size - maxlen; + while( i < end ) + { + const uint8_t byte = mbuffer[i]; + int len = 0; // does not count the first byte + while( mbuffer[++i] == byte ) if( ++len >= maxlen ) return true; + } + return false; + } + + +// Return value: 0 = no change, 5 = repaired pos +int repair_dictionary_size( const long long msize, uint8_t * const mbuffer ) + { + const unsigned long long dictionary_size_9 = 1 << 25; // dict size of opt -9 + Lzip_header & header = *(Lzip_header *)mbuffer; + unsigned dictionary_size = header.dictionary_size(); + const Lzip_trailer & trailer = + *(const Lzip_trailer *)( mbuffer + msize - Lzip_trailer::size ); + const unsigned long long data_size = trailer.data_size(); + const bool valid_ds = isvalid_ds( dictionary_size ); + if( valid_ds && dictionary_size >= data_size ) return 0; // can't be bad + + if( !valid_ds || 
dictionary_size < dictionary_size_9 ) + { + dictionary_size = std::min( data_size, dictionary_size_9 ); + if( dictionary_size < min_dictionary_size ) + dictionary_size = min_dictionary_size; + LZ_mtester mtester( mbuffer, msize, dictionary_size ); + const int result = mtester.test_member(); + if( result == 0 ) + { header.dictionary_size( dictionary_size ); return 5; } // fix DS + if( result != 1 || mtester.max_distance() <= dictionary_size || + mtester.max_distance() > max_dictionary_size ) return 0; + } + if( data_size > dictionary_size_9 ) + { + dictionary_size = + std::min( data_size, (unsigned long long)max_dictionary_size ); + LZ_mtester mtester( mbuffer, msize, dictionary_size ); + if( mtester.test_member() == 0 ) + { header.dictionary_size( dictionary_size ); return 5; } // fix DS + } + return 0; + } + + +const LZ_mtester * prepare_master( const uint8_t * const buffer, + const long buffer_size, + const unsigned long pos_limit, + const unsigned dictionary_size ) + { + LZ_mtester * const master = + new LZ_mtester( buffer, buffer_size, dictionary_size ); + if( master->test_member( pos_limit ) == -1 ) return master; + delete master; + return 0; + } + + +bool test_member_rest( const LZ_mtester & master, uint8_t * const buffer2, + long * const failure_posp = 0 ) + { + LZ_mtester mtester( master ); // tester with external buffer + mtester.duplicate_buffer( buffer2 ); + if( mtester.test_member() == 0 && mtester.finished() ) return true; + if( failure_posp ) *failure_posp = mtester.member_position(); + return false; + } + + +// Return value: -1 = master failed, 0 = begin reached, >0 = repaired pos +long repair_member( const long long mpos, const long long msize, + uint8_t * const mbuffer, const long begin, const long end, + const unsigned dictionary_size, const char terminator ) + { + uint8_t * const buffer2 = new uint8_t[dictionary_size]; + for( long pos = end; pos >= begin && pos > end - 50000; ) + { + const long min_pos = std::max( begin, pos - 100 ); + const 
unsigned long pos_limit = std::max( min_pos - 16, 0L ); + const LZ_mtester * master = + prepare_master( mbuffer, msize, pos_limit, dictionary_size ); + if( !master ) { delete[] buffer2; return -1; } + for( ; pos >= min_pos; --pos ) + { + if( verbosity >= 2 ) + { + std::printf( " Trying position %llu %c", mpos + pos, terminator ); + std::fflush( stdout ); pending_newline = true; + } + for( int j = 0; j < 255; ++j ) + { + ++mbuffer[pos]; + if( test_member_rest( *master, buffer2 ) ) + { delete master; delete[] buffer2; return pos; } + } + ++mbuffer[pos]; + } + delete master; + } + delete[] buffer2; + return 0; + } + +} // end namespace + + +long long seek_write( const int fd, const uint8_t * const buf, + const long long size, const long long pos ) + { + if( lseek( fd, pos, SEEK_SET ) == pos ) + return writeblock( fd, buf, size ); + return 0; + } + + +uint8_t * read_member( const int infd, const long long mpos, + const long long msize ) + { + if( msize <= 0 || msize > LONG_MAX ) + { show_error( "Member is larger than LONG_MAX." ); return 0; } + if( !safe_seek( infd, mpos ) ) return 0; + uint8_t * const buffer = new uint8_t[msize]; + + if( readblock( infd, buffer, msize ) != msize ) + { show_error( "Error reading input file", errno ); + delete[] buffer; return 0; } + return buffer; + } + + +int repair_file( const std::string & input_filename, + const std::string & default_output_filename, + const char terminator, const bool force ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + output_filename = default_output_filename.empty() ? 
+ insert_fixed( input_filename ) : default_output_filename; + if( !force && file_exists( output_filename ) ) return 1; + outfd = -1; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + if( !safe_seek( infd, mpos ) ) cleanup_and_fail( 1 ); + long long failure_pos = 0; + if( test_member_from_file( infd, msize, &failure_pos ) == 0 ) continue; + if( failure_pos < Lzip_header::size ) // End Of File + { show_error( "Can't repair error in input file." ); + cleanup_and_fail( 2 ); } + if( failure_pos >= msize - 8 ) failure_pos = msize - 8 - 1; + + if( verbosity >= 2 ) // damaged member found + { + std::printf( "Repairing member %ld of %ld (failure pos = %llu)\n", + i + 1, lzip_index.members(), mpos + failure_pos ); + std::fflush( stdout ); + } + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) cleanup_and_fail( 1 ); + const Lzip_header & header = *(const Lzip_header *)mbuffer; + const unsigned dictionary_size = header.dictionary_size(); + long pos = 0; + if( !gross_damage( msize, mbuffer ) ) + { + pos = repair_dictionary_size( msize, mbuffer ); + if( pos == 0 ) + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 1, + Lzip_header::size + 6, dictionary_size, terminator ); + if( pos == 0 ) + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 7, + failure_pos, dictionary_size, terminator ); + print_pending_newline( terminator ); + } + if( pos < 0 ) + { show_error( "Can't prepare master." 
); cleanup_and_fail( 1 ); } + if( pos > 0 ) + { + if( outfd < 0 ) // first damaged member repaired + { + if( !safe_seek( infd, 0 ) ) return 1; + set_signal_handler(); + if( !open_outstream( true, true ) ) return 1; + if( !copy_file( infd, outfd ) ) // copy whole file + cleanup_and_fail( 1 ); + } + if( seek_write( outfd, mbuffer + pos, 1, mpos + pos ) != 1 ) + { show_error( "Error writing output file", errno ); + cleanup_and_fail( 1 ); } + } + delete[] mbuffer; + if( pos == 0 ) + { + show_error( "Can't repair input file. Error is probably larger than 1 byte." ); + cleanup_and_fail( 2 ); + } + } + + if( outfd < 0 ) + { + if( verbosity >= 1 ) + std::fputs( "Input file has no errors. Recovery is not needed.\n", stdout ); + return 0; + } + if( close_outstream( &in_stats ) != 0 ) return 1; + if( verbosity >= 1 ) + std::fputs( "Copy of input file repaired successfully.\n", stdout ); + return 0; + } + + +int debug_delay( const std::string & input_filename, Block range, + const char terminator ) + { + struct stat in_stats; // not used + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + if( range.end() > lzip_index.cdata_size() ) + range.size( std::max( 0LL, lzip_index.cdata_size() - range.pos() ) ); + if( range.size() <= 0 ) + { show_file_error( input_filename.c_str(), "Nothing to do." 
); return 0; } + + for( long i = 0; i < lzip_index.members(); ++i ) + { + const Block & mb = lzip_index.mblock( i ); + if( !range.overlaps( mb ) ) continue; + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + if( verbosity >= 2 ) + { + std::printf( "Finding max delay in member %ld of %ld (mpos = %llu, msize = %llu)\n", + i + 1, lzip_index.members(), mpos, msize ); + std::fflush( stdout ); + } + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) return 1; + uint8_t * const buffer2 = new uint8_t[dictionary_size]; + long pos = std::max( range.pos() - mpos, Lzip_header::size + 1LL ); + const long end = std::min( range.end() - mpos, msize ); + long max_delay = 0; + while( pos < end ) + { + const unsigned long pos_limit = std::max( pos - 16, 0L ); + const LZ_mtester * master = + prepare_master( mbuffer, msize, pos_limit, dictionary_size ); + if( !master ) { show_error( "Can't prepare master." 
); + delete[] buffer2; delete[] mbuffer; return 1; } + const long partial_end = std::min( pos + 100, end ); + for( ; pos < partial_end; ++pos ) + { + if( verbosity >= 2 ) + { + std::printf( " Delays at position %llu %c", mpos + pos, terminator ); + std::fflush( stdout ); pending_newline = true; + } + int value = -1; + for( int j = 0; j < 256; ++j ) + { + ++mbuffer[pos]; + if( j == 255 ) break; + long failure_pos = 0; + if( test_member_rest( *master, buffer2, &failure_pos ) ) continue; + const long delay = failure_pos - pos; + if( delay > max_delay ) { max_delay = delay; value = mbuffer[pos]; } + } + if( value >= 0 && verbosity >= 2 ) + { + std::printf( " New max delay %lu at position %llu (0x%02X)\n", + max_delay, mpos + pos, value ); + std::fflush( stdout ); pending_newline = false; + } + if( pos + max_delay >= msize ) { pos = end; break; } + } + delete master; + } + delete[] buffer2; + delete[] mbuffer; + print_pending_newline( terminator ); + } + + if( verbosity >= 1 ) std::fputs( "Done.\n", stdout ); + return 0; + } + + +int debug_repair( const std::string & input_filename, + const Bad_byte & bad_byte, const char terminator ) + { + struct stat in_stats; // not used + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + long idx = 0; + for( ; idx < lzip_index.members(); ++idx ) + if( lzip_index.mblock( idx ).includes( bad_byte.pos ) ) break; + if( idx >= lzip_index.members() ) + { show_file_error( input_filename.c_str(), "Nothing to do." 
); return 0; } + + const long long mpos = lzip_index.mblock( idx ).pos(); + const long long msize = lzip_index.mblock( idx ).size(); + { + long long failure_pos = 0; + if( !safe_seek( infd, mpos ) ) return 1; + if( test_member_from_file( infd, msize, &failure_pos ) != 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "Member %ld of %ld already damaged (failure pos = %llu)\n", + idx + 1, lzip_index.members(), mpos + failure_pos ); + return 2; + } + } + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) return 1; + const Lzip_header & header = *(const Lzip_header *)mbuffer; + const unsigned dictionary_size = header.dictionary_size(); + const uint8_t good_value = mbuffer[bad_byte.pos-mpos]; + const uint8_t bad_value = bad_byte( good_value ); + mbuffer[bad_byte.pos-mpos] = bad_value; + long failure_pos = 0; + if( bad_byte.pos != 5 || isvalid_ds( header.dictionary_size() ) ) + { + LZ_mtester mtester( mbuffer, msize, header.dictionary_size() ); + if( mtester.test_member() == 0 && mtester.finished() ) + { + if( verbosity >= 1 ) + std::fputs( "Member decompressed with no errors.\n", stdout ); + delete[] mbuffer; + return 0; + } + failure_pos = mtester.member_position(); + } + if( verbosity >= 2 ) + { + std::printf( "Test repairing member %ld of %ld (mpos = %llu, msize = %llu)\n" + " (damage pos = %llu (0x%02X->0x%02X), failure pos = %llu, delay = %lld )\n", + idx + 1, lzip_index.members(), mpos, msize, + bad_byte.pos, good_value, bad_value, mpos + failure_pos, + mpos + failure_pos - bad_byte.pos ); + std::fflush( stdout ); + } + if( failure_pos >= msize ) failure_pos = msize - 1; + long pos = repair_dictionary_size( msize, mbuffer ); + if( pos == 0 ) + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 1, + Lzip_header::size + 6, dictionary_size, terminator ); + if( pos == 0 ) + pos = repair_member( mpos, msize, mbuffer, Lzip_header::size + 7, + failure_pos, dictionary_size, terminator ); + print_pending_newline( terminator ); + 
delete[] mbuffer; + if( pos < 0 ) { show_error( "Can't prepare master." ); return 1; } + if( pos == 0 ) internal_error( "can't repair input file." ); + if( verbosity >= 1 ) std::fputs( "Member repaired successfully.\n", stdout ); + return 0; + } + + +/* If show_packets is true, print to stdout descriptions of the decoded LZMA + packets. Print also some global values; total number of packets in + member, max distance (rep0) and its file position, max LZMA packet size + in each member and the file position of these packets. + (Packet sizes are a fractionary number of bytes. The packet and marker + sizes shown by option -X are the number of extra bytes required to decode + the packet, not counting the data present in the range decoder before and + after the decoding. The max marker size of a 'Sync Flush marker' does not + include the 5 bytes read by rdec.load). + if bad_byte.pos >= cdata_size, bad_byte is ignored. +*/ +int debug_decompress( const std::string & input_filename, + const Bad_byte & bad_byte, const bool show_packets ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + outfd = show_packets ? 
-1 : STDOUT_FILENO; + int retval = 0; + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long dpos = lzip_index.dblock( i ).pos(); + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + if( verbosity >= 1 && show_packets ) + std::printf( "Decoding LZMA packets in member %ld of %ld (mpos = %llu, msize = %llu)\n" + " mpos dpos\n", + i + 1, lzip_index.members(), mpos, msize ); + if( !isvalid_ds( dictionary_size ) ) + { show_error( bad_dict_msg ); retval = 2; break; } + uint8_t * const mbuffer = read_member( infd, mpos, msize ); + if( !mbuffer ) { retval = 1; break; } + if( bad_byte.pos >= 0 && lzip_index.mblock( i ).includes( bad_byte.pos ) ) + { + const uint8_t good_value = mbuffer[bad_byte.pos-mpos]; + const uint8_t bad_value = bad_byte( good_value ); + mbuffer[bad_byte.pos-mpos] = bad_value; + if( verbosity >= 1 && show_packets ) + std::printf( "Byte at pos %llu changed from 0x%02X to 0x%02X\n", + bad_byte.pos, good_value, bad_value ); + } + LZ_mtester mtester( mbuffer, msize, dictionary_size, outfd ); + const int result = mtester.debug_decode_member( dpos, mpos, show_packets ); + delete[] mbuffer; + if( show_packets ) + { + const std::vector< unsigned long long > & mppv = mtester.max_packet_posv(); + const unsigned mpackets = mppv.size(); + std::printf( "Total packets in member = %llu\n" + "Max distance in any match = %u at file position %llu\n" + "Max marker size found = %u\n" + "Max packet size found = %u (%u packets)%s", + mtester.total_packets(), mtester.max_distance(), + mtester.max_distance_pos(), mtester.max_marker_size(), + mtester.max_packet_size(), mpackets, + mpackets ? 
" at file positions" : "" ); + for( unsigned i = 0; i < mpackets; ++i ) + std::printf( " %llu", mppv[i] ); + std::fputc( '\n', stdout ); + } + if( result != 0 ) + { + if( verbosity >= 0 && result <= 2 && show_packets ) + std::printf( "%s at pos %llu\n", ( result == 2 ) ? + "File ends unexpectedly" : "Decoder error", + mpos + mtester.member_position() ); + retval = 2; + if( result != 3 || !mtester.finished() || mtester.data_position() != + (unsigned long long)lzip_index.dblock( i ).size() ) break; + } + if( i + 1 < lzip_index.members() && show_packets ) + std::fputc( '\n', stdout ); + } + + retval = std::max( retval, close_outstream( &in_stats ) ); + if( verbosity >= 1 && show_packets && retval == 0 ) + std::fputs( "Done.\n", stdout ); + return retval; + } diff --git a/reproduce.cc b/reproduce.cc new file mode 100644 index 0000000..58a0c5d --- /dev/null +++ b/reproduce.cc @@ -0,0 +1,786 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/

#define _FILE_OFFSET_BITS 64

#include <algorithm>
#include <cerrno>
#include <climits>
#include <csignal>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/wait.h>

#include "lzip.h"
#include "md5.h"
#include "mtester.h"
#include "lzip_index.h"


namespace {

// Deferred diagnostic text; null until some function in this file sets it.
const char * final_msg = 0;

// True while a progress line has been printed without a trailing newline.
bool pending_newline = false;

/* Emit the newline owed by the last progress line, unless 'terminator'
   is itself a newline. Always clears 'pending_newline'. */
void print_pending_newline( const char terminator )
  { if( pending_newline && terminator != '\n' ) std::fputc( '\n', stdout );
    pending_newline = false; }

// First nonzero value passed to fatal(); 0 while no fatal error was recorded.
int fatal_retval = 0;

/* Record 'retval' in 'fatal_retval' if nothing was recorded before, and
   return 'retval' unchanged so that callers can write 'return fatal( n )'. */
int fatal( const int retval )
  { if( fatal_retval == 0 ) fatal_retval = retval; return retval; }

/* Return the position of the damaged area in the member, or -1 if error.
   The damaged area is the first run of at least 'minlen' identical bytes
   found scanning 'mbuffer' from offset Lzip_header::size. On success the
   length of the run is stored in '*sizep' and the repeated byte in
   '*valuep'. On failure a message naming 'input_filename' is shown and the
   output parameters are left untouched. */
long long zeroed_sector_pos( const char * const input_filename,
                             const uint8_t * const mbuffer, const long long msize,
                             long long * const sizep, uint8_t * const valuep )
  {
  enum { minlen = 8 };		// min number of consecutive identical bytes
  long long i = Lzip_header::size;
  const long long end = msize - minlen;	// a run can't start at or after this
  long long begin = -1;
  long long size = 0;
  uint8_t value = 0;
  while( i < end )	// leave i pointing to the first differing byte
    {
    const uint8_t byte = mbuffer[i++];
    if( mbuffer[i] == byte )
      {
      const long long pos = i - 1;		// first byte of the run
      ++i;
      while( i < msize && mbuffer[i] == byte ) ++i;	// extend run to its end
      if( i - pos >= minlen )
        {
        // NOTE(review): 'size' is still 0 the first time a run is accepted and
        // the 'break' below leaves the loop at once, so this check looks
        // unreachable as written -- confirm whether a second run should be
        // searched for.
        if( size > 0 )
          { show_file_error( input_filename,
                             "Member contains more than one damaged area." );
            return -1; }
        begin = pos;
        size = i - pos;
        value = byte;
        break;
        }
      }
    }
  if( begin < 0 || size <= 0 )
    { show_file_error( input_filename, "Can't locate damaged area." );
      return -1; }
  *sizep = size;
  *valuep = value;
  return begin;
  }


/* Build a tester over 'mbuffer' that has consumed as much of the member as
   possible without its member position surpassing 'begin'.
   Return the new tester (owned by the caller, release with delete), or 0 if
   such a tester can't be obtained.
   NOTE(review): test_member( limit ) == -1 is treated throughout this
   function as "stopped at the limit without error"; confirm in mtester.h. */
const LZ_mtester * prepare_master2( const uint8_t * const mbuffer,
                                    const long long msize,
                                    const long long begin,
                                    const unsigned dictionary_size )
  {
  // start with a limit a few bytes before the damaged area
  long long pos_limit = std::max( begin - 16, (long long)Lzip_header::size );
  LZ_mtester * master = new LZ_mtester( mbuffer, msize, dictionary_size );
  if( master->test_member( pos_limit ) != -1 ||
      master->member_position() > (unsigned long long)begin )
    { delete master; return 0; }
  // decompress as much data as possible without surpassing begin
  while( pos_limit < begin && master->test_member( pos_limit + 1 ) == -1 &&
         master->member_position() <= (unsigned long long)begin )
    ++pos_limit;
  // the probing tester may have gone past 'begin'; discard it and decode
  // again from scratch up to the last limit known to be good
  delete master;
  master = new LZ_mtester( mbuffer, msize, dictionary_size );
  if( master->test_member( pos_limit ) == -1 &&
      master->member_position() <= (unsigned long long)begin ) return master;
  delete master;
  return 0;
  }


/* Locate in the reference file (rbuf) the truncated data in the dictionary.
   The reference file must match from the last byte decoded back to the
   beginning of the file or to the beginning of the dictionary.
   Choose the match nearest to the beginning of the file.
   As a fallback, locate the longest partial match at least 512 bytes long.
   Return the offset in file of the first undecoded byte, or -1 if no match.
*/ +long long match_file( const LZ_mtester & master, const uint8_t * const rbuf, + const long long rsize, + const char * const reference_filename ) + { + const uint8_t * prev_buffer; + int dec_size, prev_size; + const uint8_t * const dec_buffer = + master.get_buffers( &prev_buffer, &dec_size, &prev_size ); + if( dec_size < 4 ) + { if( verbosity >= 1 ) + { std::printf( "'%s' can't match: not enough data in dictionary.\n", + reference_filename ); pending_newline = false; } + return -1; } + long long offset = -1; // offset in file of the first undecoded byte + bool multiple = false; + const uint8_t last_byte = dec_buffer[dec_size-1]; + for( long long i = rsize - 1; i >= 3; --i ) // match at least 4 bytes at bof + if( rbuf[i] == last_byte ) + { + // compare file with the two parts of the dictionary + int len = std::min( (long long)dec_size - 1, i ); + if( std::memcmp( rbuf + i - len, dec_buffer + dec_size - 1 - len, len ) == 0 ) + { + int len2 = std::min( (long long)prev_size, i - len ); + if( len2 <= 0 || !prev_buffer || + std::memcmp( rbuf + i - len - len2, + prev_buffer + prev_size - len2, len2 ) == 0 ) + { + if( offset >= 0 ) multiple = true; + offset = i + 1; + i -= len + len2; + } + } + } + if( offset >= 0 ) + { + if( multiple && verbosity >= 1 ) + { std::printf( "warning: %s: Multiple matches. 
Using match at offset %lld\n", + reference_filename, offset ); std::fflush( stdout ); } + if( !multiple && verbosity >= 2 ) + { std::printf( "%s: Match found at offset %lld\n", + reference_filename, offset ); std::fflush( stdout ); } + return offset; + } + int maxlen = 0; // choose longest match in reference file + for( long long i = rsize - 1; i >= 0; --i ) + if( rbuf[i] == last_byte ) + { + // compare file with the two parts of the dictionary + const int size1 = std::min( (long long)dec_size, i + 1 ); + int len = 1; + while( len < size1 && rbuf[i-len] == dec_buffer[dec_size-len-1] ) ++len; + if( len == size1 ) + { + int size2 = std::min( (long long)prev_size, i + 1 - size1 ); + while( len < size1 + size2 && + rbuf[i-len] == prev_buffer[prev_size+size1-len] ) ++len; + } + if( len > maxlen ) { maxlen = len; offset = i + 1; i -= len; } + } + if( maxlen >= 512 && offset >= 0 ) + { + if( verbosity >= 1 ) + { std::printf( "warning: %s: Partial match found at offset %lld, len %d." + " Reference data may be mixed with other data.\n", + reference_filename, offset, maxlen ); + std::fflush( stdout ); } + return offset; + } + if( verbosity >= 1 ) + { std::printf( "'%s' does not match with decoded data.\n", + reference_filename ); pending_newline = false; } + return -1; + } + + +void show_close_error( const char * const prog_name = "data feeder" ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error closing output of %s: %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +void show_exec_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't exec '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +void show_fork_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't fork '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +/* Return -1 if child not terminated, 1 in case of error, or exit status of + child process 
'pid'. +*/ +int child_status( const pid_t pid, const char * const name ) + { + int status; + while( true ) + { + const int tmp = waitpid( pid, &status, WNOHANG ); + if( tmp == -1 && errno != EINTR ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error checking status of '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return 1; + } + if( tmp == 0 ) return -1; // child not terminated + if( tmp == pid ) break; // child terminated + } + if( WIFEXITED( status ) ) return WEXITSTATUS( status ); + return 1; + } + + +// Return exit status of child process 'pid', or 1 in case of error. +// +int wait_for_child( const pid_t pid, const char * const name ) + { + int status; + while( waitpid( pid, &status, 0 ) == -1 ) + { + if( errno != EINTR ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error waiting termination of '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return 1; + } + } + if( WIFEXITED( status ) ) return WEXITSTATUS( status ); + return 1; + } + + +bool good_status( const pid_t pid, const char * const name, const bool finished ) + { + bool error = false; + if( pid ) + { + if( !finished ) + { + const int tmp = child_status( pid, name ); + if( tmp < 0 ) // child not terminated + { kill( pid, SIGTERM ); wait_for_child( pid, name ); } + else if( tmp != 0 ) error = true; // child status != 0 + } + else + if( wait_for_child( pid, name ) != 0 ) error = true; + if( error ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s: Child terminated with error status.\n", + program_name, name ); + return false; + } + } + return !error; + } + + +/* Feed to lzip through 'ofd' the data decompressed up to 'good_dsize' + (master->data_position) followed by the reference data from byte at + offset 'offset' of reference file, up to a total of 'dsize' bytes. 
*/ +bool feed_data( uint8_t * const mbuffer, const long long msize, + const long long dsize, const unsigned long long good_dsize, + const uint8_t * const rbuf, const long long rsize, + const long long offset, const unsigned dictionary_size, + const int ofd ) + { + LZ_mtester mtester( mbuffer, msize, dictionary_size, ofd ); + if( mtester.test_member( LLONG_MAX, good_dsize ) != -1 || + good_dsize != mtester.data_position() ) + { show_error( "Error decompressing prefix data for compressor." ); + return false; } + // limit reference data to remaining decompressed data in member + const long long end = + std::min( (unsigned long long)rsize, dsize - good_dsize + offset ); + for( long long i = offset; i < end; ) + { + const int size = std::min( end - i, 65536LL ); + if( writeblock( ofd, rbuf + i, size ) != size ) + { show_error( "Error writing reference data to compressor", errno ); + return false; } + i += size; + } + return true; + } + + +/* Try to reproduce the zeroed sector. + Return value: -1 = failure, 0 = success, > 0 = fatal error. 
*/ +int try_reproduce( uint8_t * const mbuffer, const long long msize, + const long long dsize, const unsigned long long good_dsize, + const long long begin, const long long end, + const uint8_t * const rbuf, const long long rsize, + const long long offset, const unsigned dictionary_size, + const char ** const lzip_argv, MD5SUM * const md5sump, + const char terminator, const bool auto0 = false ) + { + int fda[2]; // pipe to compressor + int fda2[2]; // pipe from compressor + if( pipe( fda ) < 0 || pipe( fda2 ) < 0 ) + { show_error( "Can't create pipe", errno ); return fatal( 1 ); } + const pid_t pid = fork(); + if( pid == 0 ) // child 1 (compressor feeder) + { + if( close( fda[0] ) != 0 || + close( fda2[0] ) != 0 || close( fda2[1] ) != 0 || + !feed_data( mbuffer, msize, dsize, good_dsize, rbuf, rsize, offset, + dictionary_size, fda[1] ) ) + { close( fda[1] ); _exit( 2 ); } + if( close( fda[1] ) != 0 ) + { show_close_error(); _exit( 2 ); } + _exit( 0 ); + } + if( pid < 0 ) // parent + { show_fork_error( "data feeder" ); return fatal( 1 ); } + + const pid_t pid2 = fork(); + if( pid2 == 0 ) // child 2 (compressor) + { + if( dup2( fda[0], STDIN_FILENO ) >= 0 && + dup2( fda2[1], STDOUT_FILENO ) >= 0 && + close( fda[0] ) == 0 && close( fda[1] ) == 0 && + close( fda2[0] ) == 0 && close( fda2[1] ) == 0 ) + execvp( lzip_argv[0], (char **)lzip_argv ); + show_exec_error( lzip_argv[0] ); + _exit( 2 ); + } + if( pid2 < 0 ) // parent + { show_fork_error( lzip_argv[0] ); return fatal( 1 ); } + + close( fda[0] ); close( fda[1] ); close( fda2[1] ); + const long long xend = std::min( end + 4, msize ); + int retval = 0; // -1 = mismatch + bool first_post = true; + bool same_ds = true; // reproduced DS == header DS + bool tail_mismatch = false; // mismatch after end + for( long long i = 0; i < xend; ) + { + enum { buffer_size = 16384 }; // 65536 makes it slower + uint8_t buffer[buffer_size]; + if( verbosity >= 2 && i >= 65536 && terminator ) + { + if( first_post ) + { first_post = 
false; print_pending_newline( terminator ); } + std::printf( " Reproducing position %lld %c", i, terminator ); + std::fflush( stdout ); pending_newline = true; + } + const int rd = readblock( fda2[0], buffer, buffer_size ); + // not enough reference data to fill zeroed sector at this level + if( rd <= 0 ) { if( i < end ) retval = -1; break; } + int j = 0; + /* Compare reproduced bytes with data in mbuffer. + Do not fail because of a mismatch beyond the end of the zeroed sector + to prevent the reproduction from failing because of the reference file + just covering the zeroed sector. */ + for( ; j < rd && i < begin; ++j, ++i ) + if( mbuffer[i] != buffer[j] ) // mismatch + { + if( i != 5 ) { retval = -1; goto done; } // ignore different DS + const Lzip_header * header = (const Lzip_header *)buffer; + if( header->dictionary_size() != dictionary_size ) same_ds = false; + } + // copy reproduced bytes into zeroed sector of mbuffer + for( ; j < rd && i < end; ++j, ++i ) mbuffer[i] = buffer[j]; + for( ; j < rd && i < xend; ++j, ++i ) + if( mbuffer[i] != buffer[j] ) { tail_mismatch = true; goto done; } + } +done: + if( !first_post && terminator ) print_pending_newline( terminator ); + if( close( fda2[0] ) != 0 ) { show_close_error( "compressor" ); retval = 1; } + if( !good_status( pid, "data feeder", false ) || + !good_status( pid2, lzip_argv[0], false ) ) retval = auto0 ? -1 : 1; + if( !retval ) // test whole member after reproduction + { + if( md5sump ) md5sump->reset(); + LZ_mtester mtester( mbuffer, msize, dictionary_size, -1, md5sump ); + if( mtester.test_member() != 0 || !mtester.finished() ) + { + if( verbosity >= 2 && same_ds && begin >= 4096 && terminator ) + { + if( !tail_mismatch ) + final_msg = " Zeroed sector reproduced, but CRC does not match." + " (Multiple damages in file?).\n"; + else if( !final_msg ) + final_msg = " Zeroed sector reproduced, but data after it does not" + " match. 
(Maybe wrong reference data or lzip version).\n"; + } + retval = -1; // incorrect reproduction of zeroed sector + } + } + return retval; + } + + +// Return value: -1 = master failed, 0 = success, > 0 = failure +int reproduce_member( uint8_t * const mbuffer, const long long msize, + const long long dsize, const char * const lzip_name, + const char * const reference_filename, + const long long begin, const long long size, + const int lzip_level, MD5SUM * const md5sump, + const char terminator ) + { + struct stat st; + const int rfd = open_instream( reference_filename, &st, false, true ); + if( rfd < 0 ) return fatal( 1 ); + if( st.st_size > LLONG_MAX ) + { show_file_error( reference_filename, "File too large." ); close( rfd ); + return fatal( 2 ); } + const long long rsize = st.st_size; + const uint8_t * const rbuf = + (const uint8_t *)mmap( 0, rsize, PROT_READ, MAP_PRIVATE, rfd, 0 ); + close( rfd ); + if( rbuf == MAP_FAILED ) + { show_file_error( reference_filename, "Can't mmap", errno ); + return fatal( 1 ); } + + const Lzip_header & header = *(const Lzip_header *)mbuffer; + const unsigned dictionary_size = header.dictionary_size(); + const LZ_mtester * const master = + prepare_master2( mbuffer, msize, begin, dictionary_size ); + if( !master ) return -1; + if( verbosity >= 2 ) + { + std::printf( " (master mpos = %llu, dpos = %llu)\n", + master->member_position(), master->data_position() ); + std::fflush( stdout ); + } + + const long long offset = match_file( *master, rbuf, rsize, reference_filename ); + if( offset < 0 ) { delete master; return 2; } // no match + // Reference data from offset must be at least as large as zeroed sector + // minus member trailer if trailer is inside the zeroed sector. + const int t = ( begin + size >= msize ) ? 16 + Lzip_trailer::size : 0; + if( rsize - offset < size - t ) + { show_file_error( reference_filename, "Not enough reference data after match." 
); + delete master; return 2; } + + const unsigned long long good_dsize = master->data_position(); + const long long end = begin + size; + char level_str[8] = "-0"; // compression level or match length limit + char dict_str[16]; + snprintf( dict_str, sizeof dict_str, "-s%u", dictionary_size ); + const char * lzip0_argv[3] = { lzip_name, "-0", 0 }; + const char * lzip_argv[4] = { lzip_name, level_str, dict_str, 0 }; + if( lzip_level >= 0 ) + for( unsigned char level = '0'; level <= '9'; ++level ) + { + if( std::isdigit( lzip_level ) && level != lzip_level ) continue; + level_str[1] = level; + if( verbosity >= 1 && terminator ) + { + std::printf( "Trying level %s %c", level_str, terminator ); + std::fflush( stdout ); pending_newline = true; + } + const bool level0 = level == '0'; + const bool auto0 = ( level0 && lzip_level != '0' ); + int ret = try_reproduce( mbuffer, msize, dsize, good_dsize, begin, end, + rbuf, rsize, offset, dictionary_size, + level0 ? lzip0_argv : lzip_argv, md5sump, terminator, auto0 ); + if( ret >= 0 ) + { delete master; munmap( (void *)rbuf, rsize ); return ret; } + } + if( lzip_level <= 0 ) + { + for( int len = min_match_len_limit; len <= max_match_len; ++len ) + { + if( lzip_level < -1 && -lzip_level != len ) continue; + snprintf( level_str, sizeof level_str, "-m%u", len ); + if( verbosity >= 1 && terminator ) + { + std::printf( "Trying match length limit %d %c", len, terminator ); + std::fflush( stdout ); pending_newline = true; + } + int ret = try_reproduce( mbuffer, msize, dsize, good_dsize, begin, end, + rbuf, rsize, offset, dictionary_size, + lzip_argv, md5sump, terminator ); + if( ret >= 0 ) + { delete master; munmap( (void *)rbuf, rsize ); return ret; } + } + } + delete master; + munmap( (void *)rbuf, rsize ); + return 2; + } + +} // end namespace + + +int reproduce_file( const std::string & input_filename, + const std::string & default_output_filename, + const char * const lzip_name, + const char * const reference_filename, + const 
int lzip_level, const char terminator, + const bool force ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + output_filename = default_output_filename.empty() ? + insert_fixed( input_filename ) : default_output_filename; + if( !force && file_exists( output_filename ) ) return 1; + outfd = -1; + int errors = 0; + const long page_size = std::max( 1L, sysconf( _SC_PAGESIZE ) ); + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long dsize = lzip_index.dblock( i ).size(); + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + if( verbosity >= 1 && lzip_index.members() > 1 ) + { + std::printf( "Testing member %ld of %ld %c", + i + 1, lzip_index.members(), terminator ); + std::fflush( stdout ); pending_newline = true; + } + if( !safe_seek( infd, mpos ) ) return 1; + long long failure_pos = 0; + if( test_member_from_file( infd, msize, &failure_pos ) == 0 ) + continue; // member is not damaged + print_pending_newline( terminator ); + if( ++errors > 1 ) break; // only one member can be reproduced + if( failure_pos < Lzip_header::size ) // End Of File + { show_file_error( input_filename.c_str(), "Unexpected end of file." 
); + return 2; } + + // without mmap, 3 times more memory are required because of fork + const long mpos_rem = mpos % page_size; + uint8_t * const mbuffer_base = (uint8_t *)mmap( 0, msize + mpos_rem, + PROT_READ | PROT_WRITE, MAP_PRIVATE, infd, mpos - mpos_rem ); + if( mbuffer_base == MAP_FAILED ) + { show_file_error( input_filename.c_str(), "Can't mmap", errno ); return 1; } + uint8_t * const mbuffer = mbuffer_base + mpos_rem; + long long size = 0; + uint8_t value = 0; + const long long begin = zeroed_sector_pos( input_filename.c_str(), mbuffer, + msize, &size, &value ); + if( begin < 0 ) return 2; + if( failure_pos < begin ) + { show_file_error( input_filename.c_str(), + "Data error found before damaged area." ); return 2; } + if( verbosity >= 1 ) + { + std::printf( "Reproducing bad area in member %ld of %ld\n" + " (begin = %lld, size = %lld, value = 0x%02X)\n", + i + 1, lzip_index.members(), begin, size, value ); + std::fflush( stdout ); + } + const int ret = reproduce_member( mbuffer, msize, dsize, lzip_name, + reference_filename, begin, size, lzip_level, 0, terminator ); + if( ret <= 0 ) print_pending_newline( terminator ); + if( ret < 0 ) { show_error( "Can't prepare master." ); return 1; } + if( ret == 0 ) + { + if( outfd < 0 ) // first damaged member reproduced + { + if( !safe_seek( infd, 0 ) ) return 1; + set_signal_handler(); + if( !open_outstream( true, true ) ) return 1; + if( !copy_file( infd, outfd ) ) // copy whole file + cleanup_and_fail( 1 ); + } + if( seek_write( outfd, mbuffer + begin, size, mpos + begin ) != size ) + { show_file_error( output_filename.c_str(), "Error writing file", errno ); + cleanup_and_fail( 1 ); } + if( verbosity >= 1 ) + std::fputs( "Member reproduced successfully.\n", stdout ); + } + munmap( mbuffer_base, msize + mpos_rem ); + if( ret > 0 ) + { + if( final_msg ) + { std::fputs( final_msg, stdout ); std::fflush( stdout ); } + show_file_error( input_filename.c_str(), + "Unable to reproduce member." 
); return ret; + } + } + + if( outfd < 0 ) + { + if( verbosity >= 1 ) + std::fputs( "Input file has no errors. Recovery is not needed.\n", stdout ); + return 0; + } + if( close_outstream( &in_stats ) != 0 ) return 1; + if( verbosity >= 0 ) + { + if( errors > 1 ) + std::fputs( "One member reproduced." + " Copy of input file still contains errors.\n", stdout ); + else + std::fputs( "Copy of input file reproduced successfully.\n", stdout ); + } + return 0; + } + + +/* Passes a 0 terminator to other functions to prevent intramember feedback. + Exits only in case of fatal error. (reference file too large, etc). */ +/* For every member overlapping 'range', zeroes one sector at a time in a private mapping of the member, calls reproduce_member on it, and compares MD5 digests of the result with those of the unmodified member. Returns 1 or 2 on early errors; otherwise prints a summary and returns fatal_retval. */ int debug_reproduce_file( const std::string & input_filename, + const char * const lzip_name, + const char * const reference_filename, + const Block & range, const int sector_size, + const int lzip_level ) + { + struct stat in_stats; // not used + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + const Lzip_index lzip_index( infd, true, true ); + if( lzip_index.retval() != 0 ) + { show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); } + + const long long cdata_size = lzip_index.cdata_size(); + if( range.pos() >= cdata_size ) + { show_file_error( input_filename.c_str(), + "Range is beyond end of last member." 
); return 1; } + + const long page_size = std::max( 1L, sysconf( _SC_PAGESIZE ) ); + /* used only as the denominator of the progress percentage shown on stderr */ const long long positions_to_test = + ( ( std::min( range.end(), cdata_size ) - range.pos() ) + + sector_size - 9 ) / sector_size; + long positions = 0, successes = 0, failed_comparisons = 0; + long alternative_reproductions = 0; + const bool pct_enabled = cdata_size > sector_size && + isatty( STDERR_FILENO ) && !isatty( STDOUT_FILENO ); + for( long i = 0; i < lzip_index.members(); ++i ) + { + const long long mpos = lzip_index.mblock( i ).pos(); + const long long msize = lzip_index.mblock( i ).size(); + if( !range.overlaps( mpos, msize ) ) continue; + const long long dsize = lzip_index.dblock( i ).size(); + const unsigned dictionary_size = lzip_index.dictionary_size( i ); + + // md5sums of original not damaged member (compressed and decompressed) + uint8_t md5_digest_c[16], md5_digest_d[16]; + bool md5_valid = false; + const long long rm_end = std::min( range.end(), mpos + msize ); + for( long long sector_pos = std::max( range.pos(), mpos ); + sector_pos + 8 <= rm_end; sector_pos += sector_size ) + { + // without mmap, 3 times more memory are required because of fork + const long mpos_rem = mpos % page_size; + /* the file offset passed to mmap is rounded down to a page boundary; mbuffer then skips the mpos_rem extra bytes */ uint8_t * const mbuffer_base = (uint8_t *)mmap( 0, msize + mpos_rem, + PROT_READ | PROT_WRITE, MAP_PRIVATE, infd, mpos - mpos_rem ); + if( mbuffer_base == MAP_FAILED ) + { show_file_error( input_filename.c_str(), "Can't mmap", errno ); + return 1; } + uint8_t * const mbuffer = mbuffer_base + mpos_rem; + /* first tested sector of this member: record the digests of the unmodified member */ if( !md5_valid ) + { + if( verbosity >= 0 ) // give a clue of the range being tested + { std::printf( "Reproducing: %s\nReference file: %s\nTesting " + "sectors of size %llu at file positions %llu to %llu\n", + input_filename.c_str(), reference_filename, + std::min( (long long)sector_size, rm_end - sector_pos ), + sector_pos, rm_end - 1 ); std::fflush( stdout ); } + md5_valid = true; compute_md5( mbuffer, msize, md5_digest_c ); + MD5SUM md5sum; + LZ_mtester mtester( mbuffer, 
msize, dictionary_size, -1, &md5sum ); + /* a member that does not test cleanly before any sector is zeroed is skipped */ if( mtester.test_member() != 0 || !mtester.finished() ) + { + if( verbosity >= 0 ) + { std::printf( "Member %ld of %ld already damaged (failure pos " + "= %llu)\n", i + 1, lzip_index.members(), + mpos + mtester.member_position() ); + std::fflush( stdout ); } + munmap( mbuffer_base, msize + mpos_rem ); break; + } + md5sum.md5_finish( md5_digest_d ); + } + ++positions; + /* the last sector of the range may be shorter than sector_size */ const int sector_sz = + std::min( rm_end - sector_pos, (long long)sector_size ); + // set mbuffer[sector] to 0 + std::memset( mbuffer + ( sector_pos - mpos ), 0, sector_sz ); + long long size = 0; + uint8_t value = 0; + const long long begin = zeroed_sector_pos( input_filename.c_str(), mbuffer, + msize, &size, &value ); + if( begin < 0 ) return 2; + MD5SUM md5sum; + const int ret = reproduce_member( mbuffer, msize, dsize, lzip_name, + reference_filename, begin, size, lzip_level, &md5sum, 0 ); + if( ret < 0 ) { show_error( "Can't prepare master." ); return 1; } + if( ret == 0 ) + { + ++successes; + uint8_t new_digest[16]; + md5sum.md5_finish( new_digest ); + /* NOTE(review): new_digest is presumably the MD5 of the data decompressed from the reproduced member; confirm in try_reproduce */ if( std::memcmp( md5_digest_d, new_digest, 16 ) != 0 ) + { + ++failed_comparisons; + if( verbosity >= 0 ) + std::printf( "Comparison failed at pos %llu\n", sector_pos ); + } + else if( !check_md5( mbuffer, msize, md5_digest_c ) ) + { + ++alternative_reproductions; + if( verbosity >= 0 ) + std::printf( "Alternative reproduction at pos %llu\n", sector_pos ); + } + else if( verbosity >= 0 ) + std::printf( "Reproduction succeeded at pos %llu\n", sector_pos ); + } + else if( verbosity >= 0 ) // ret > 0 + std::printf( "Unable to reproduce at pos %llu\n", sector_pos ); + if( verbosity >= 0 ) + { + std::fflush( stdout ); // flush result line + if( pct_enabled ) // show feedback + std::fprintf( stderr, "\r%ld sectors %ld successes %ld failcomp " + "%ld altrep %3u%% done\r", positions, successes, + failed_comparisons, alternative_reproductions, + (unsigned)( ( positions * 100.0 ) / positions_to_test ) ); + } + 
/* the mapping is MAP_PRIVATE and is recreated for each sector, so the next iteration starts from the file contents again */ munmap( mbuffer_base, msize + mpos_rem ); + /* fatal_retval is declared outside this function; presumably set by fatal() - confirm */ if( fatal_retval ) goto done; + } + } +done: + if( verbosity >= 0 ) + { + std::printf( "\n%8ld sectors tested" + "\n%8ld reproductions returned with zero status", + positions, successes ); + if( successes > 0 ) + { + if( failed_comparisons > 0 ) + std::printf( ", of which\n%8ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stdout ); + if( alternative_reproductions > 0 ) + std::printf( "%8ld alternative reproductions found\n", + alternative_reproductions ); + } + else std::fputc( '\n', stdout ); + if( fatal_retval ) + std::fputs( "Exiting because of a fatal error\n", stdout ); + } + return fatal_retval; + } diff --git a/split.cc b/split.cc new file mode 100644 index 0000000..269f051 --- /dev/null +++ b/split.cc @@ -0,0 +1,142 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "lzip.h" +#include "lzip_index.h" + + +namespace { + +/* Sets output_filename to the name of the first output file: the default output name (or the input name if that is empty) with "rec", max_digits - 1 zeros and a '1' inserted in front of the part after the last '/'. */ void first_filename( const std::string & input_filename, + const std::string & default_output_filename, + const int max_digits ) + { + output_filename = default_output_filename.empty() ? + input_filename : default_output_filename; + int b = output_filename.size(); + while( b > 0 && output_filename[b-1] != '/' ) --b; + output_filename.insert( b, "rec1" ); + if( max_digits > 1 ) output_filename.insert( b + 3, max_digits - 1, '0' ); + } + + +/* Increments the decimal counter that follows "rec" in output_filename, carrying from the last digit towards the first. Returns false if every digit was '9' and the counter wrapped to all zeros. */ bool next_filename( const int max_digits ) + { + if( verbosity >= 1 ) + { + std::printf( "Member '%s' done \n", output_filename.c_str() ); + std::fflush( stdout ); + } + int b = output_filename.size(); + while( b > 0 && output_filename[b-1] != '/' ) --b; + for( int i = b + max_digits + 2; i > b + 2; --i ) // "rec<max_digits>" + { + if( output_filename[i] < '9' ) { ++output_filename[i]; return true; } + else output_filename[i] = '0'; + } + return false; + } + +} // end namespace + + +/* Copies each member of the input file, each gap found before a member, and any data after the last member to its own numbered output file. Returns 0 on success, or a nonzero status if opening, indexing or seeking the input fails. */ int split_file( const std::string & input_filename, + const std::string & default_output_filename, const bool force ) + { + struct stat in_stats; + const int infd = + open_instream( input_filename.c_str(), &in_stats, false, true ); + if( infd < 0 ) return 1; + + Lzip_index lzip_index( infd, true, true, true, true ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); + } + // verify last member + const Block b = lzip_index.mblock( lzip_index.members() - 1 ); + long long mpos = b.pos(); + long long msize = b.size(); + long long failure_pos = 0; + if( !safe_seek( infd, mpos ) ) return 1; + if( test_member_from_file( infd, msize, &failure_pos ) == 1 ) + { // corrupt 
or fake trailer + while( true ) + { + /* skip the part already tested and retry from the failure position */ mpos += failure_pos; msize -= failure_pos; + if( msize < min_member_size ) break; // trailing data + if( !safe_seek( infd, mpos ) ) return 1; + if( test_member_from_file( infd, msize, &failure_pos ) != 1 ) break; + } + /* rebuild the index, passing the position found above as the extra argument */ lzip_index = Lzip_index( infd, true, true, true, true, mpos ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename.c_str(), lzip_index.error().c_str() ); + return lzip_index.retval(); + } + } + + if( !safe_seek( infd, 0 ) ) return 1; + /* number of decimal digits in lzip_index.blocks( true ); presumably the count of output files - confirm */ int max_digits = 1; + for( long i = lzip_index.blocks( true ); i >= 10; i /= 10 ) ++max_digits; + first_filename( input_filename, default_output_filename, max_digits ); + + long long stream_pos = 0; // first pos not yet written to file + set_signal_handler(); + for( long i = 0; i < lzip_index.members(); ++i ) + { + const Block & mb = lzip_index.mblock( i ); + if( mb.pos() > stream_pos ) // gap + { + if( !open_outstream( force, true, false, false ) ) return 1; + if( !copy_file( infd, outfd, mb.pos() - stream_pos ) || + close_outstream( &in_stats ) != 0 ) + cleanup_and_fail( 1 ); + next_filename( max_digits ); + } + if( !open_outstream( force, true, false, false ) ) return 1; // member + if( !copy_file( infd, outfd, mb.size() ) || + close_outstream( &in_stats ) != 0 ) + cleanup_and_fail( 1 ); + next_filename( max_digits ); + stream_pos = mb.end(); + } + if( lzip_index.file_size() > stream_pos ) // trailing data + { + if( !open_outstream( force, true, false, false ) ) return 1; + if( !copy_file( infd, outfd, lzip_index.file_size() - stream_pos ) || + close_outstream( &in_stats ) != 0 ) + cleanup_and_fail( 1 ); + next_filename( max_digits ); + } + close( infd ); + return 0; + } diff --git a/testsuite/check.sh b/testsuite/check.sh new file mode 100755 index 0000000..af8d787 --- /dev/null +++ b/testsuite/check.sh @@ -0,0 +1,1448 @@ +#! /bin/sh +# check script for Lziprecover - Data recovery tool for the lzip format +# Copyright (C) 2009-2022 Antonio Diaz Diaz. 
+# +# This script is free software: you have unlimited permission +# to copy, distribute, and modify it. + +LC_ALL=C +export LC_ALL +objdir=`pwd` +testdir=`cd "$1" ; pwd` +LZIP="${objdir}"/lziprecover +LZIPRECOVER="${LZIP}" +framework_failure() { echo "failure in testing framework" ; exit 1 ; } + +if [ ! -f "${LZIP}" ] || [ ! -x "${LZIP}" ] ; then + echo "${LZIP}: cannot execute" + exit 1 +fi + +[ -e "${LZIP}" ] 2> /dev/null || + { + echo "$0: a POSIX shell is required to run the tests" + echo "Try bash -c \"$0 $1 $2\"" + exit 1 + } + +if [ -d tmp ] ; then rm -rf tmp ; fi +mkdir tmp +cd "${objdir}"/tmp || framework_failure + +cat "${testdir}"/test.txt > in || framework_failure +in_lz="${testdir}"/test.txt.lz +in_lzma="${testdir}"/test.txt.lzma +in_em="${testdir}"/test_em.txt.lz +inD="${testdir}"/test21723.txt +bad1_lz="${testdir}"/test_bad1.lz +bad2_lz="${testdir}"/test_bad2.lz +bad3_lz="${testdir}"/test_bad3.lz +bad4_lz="${testdir}"/test_bad4.lz +bad5_lz="${testdir}"/test_bad5.lz +fox_lz="${testdir}"/fox.lz +fox6_lz="${testdir}"/fox6.lz +f6b1="${testdir}"/fox6_bad1.txt +f6b1_lz="${testdir}"/fox6_bad1.lz +f6b2_lz="${testdir}"/fox6_bad2.lz +f6b3_lz="${testdir}"/fox6_bad3.lz +f6b4_lz="${testdir}"/fox6_bad4.lz +f6b5_lz="${testdir}"/fox6_bad5.lz +f6b6_lz="${testdir}"/fox6_bad6.lz +f6s1_lz="${testdir}"/fox6_sc1.lz +f6s2_lz="${testdir}"/fox6_sc2.lz +f6s3_lz="${testdir}"/fox6_sc3.lz +f6s4_lz="${testdir}"/fox6_sc4.lz +f6s5_lz="${testdir}"/fox6_sc5.lz +f6s6_lz="${testdir}"/fox6_sc6.lz +num_lz="${testdir}"/numbers.lz +nbt_lz="${testdir}"/numbersbt.lz +fail=0 +test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } + +# Description of test files for lziprecover: +# single-member files with one or more errors +# test_bad1.lz: byte at offset 66 changed from 0xA6 to 0x26 +# test_bad2.lz: [ 34- 65] --> copy of bytes [ 68- 99] +# test_bad3.lz: [ 512-1535] --> zeroed [2560-3583] --> zeroed +# test_bad4.lz: [3072-4095] --> random errors [4608-5631] --> zeroed +# 
test_bad5.lz: [1024-2047] --> random errors [5120-6143] --> random data +# test_bad6.lz: [ 512-1023] --> zeroed (reference test.txt [ 891- 2137]) +# test_bad7.lz: [6656-7167] --> zeroed (reference test.txt [20428-32231]) +# test_bad8.lz: [ 66- 73] --> zeroed (reference test.txt [ 89- 110]) +# test_bad9.lz: [6491-6498] --> zeroed (reference test.txt [17977-18120]) +# +# 6-member files with one or more errors +# fox6_bad1.lz: byte at offset 5 changed from 0x0C to 0x00 (DS) +# byte at offset 142 changed from 0x50 to 0x70 (CRC) +# byte at offset 224 changed from 0x2D to 0x2E (data_size) +# byte at offset 268 changed from 0x34 to 0x33 (mid stream) +# byte at offset 327 changed from 0x2A to 0x2B (byte 7) +# byte at offset 458 changed from 0xA0 to 0x20 (EOS marker) +# fox6_bad2.lz: [110-129] --> zeroed (member 2) +# fox6_bad3.lz: [180-379] --> zeroed (members 3-5) +# fox6_bad4.lz: [330-429] --> zeroed (members 5,6) +# fox6_bad5.lz: [380-479] --> zeroed (members 5,6) +# fox6_bad6.lz: [430-439] --> zeroed (member 6) +# +# 6-member files "shortcircuited" by a corrupt or fake trailer +# fox6_sc1.lz: (corrupt but consistent last trailer) +# last CRC != 0 ; dsize = 4 * msize ; msize = 480 (file size) +# fox6_sc2.lz: (appended fake but consistent trailer) +# fake CRC != 0 ; dsize = 4 * msize ; msize = 500 (file size) +# fox6_sc3.lz: fake CRC = 0 +# fox6_sc4.lz: fake dsize = 0 +# fox6_sc5.lz: fake dsize = 411 (< 8 * ( fake msize - 36 ) / 9) +# fox6_sc6.lz: fake dsize = 3360660 (>= 7090 * ( fake msize - 26 )) +# +# 9-member files "one_" "two_" "three_" "four_" "five_" "six_" "seven_" +# "eight_" "nine_" +# numbers.lz : good file containing the 9 members shown above +# numbersbt.lz: "gap" after "three_", "damaged" after "six_", "trailing data" + +printf "testing lziprecover-%s..." "$2" + +"${LZIP}" -lq in +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq in +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq < in +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -cdq in +[ $? 
= 2 ] || test_failed $LINENO +"${LZIP}" -cdq < in +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -dq -o in < "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -dq -o in "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -dq -o out nx_file.lz +[ $? = 1 ] || test_failed $LINENO +[ ! -e out ] || test_failed $LINENO +# these are for code coverage +"${LZIP}" -lt "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdl "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdt "${in_lz}" > out 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -t -- nx_file.lz 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -t "" < /dev/null 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --help > /dev/null || test_failed $LINENO +"${LZIP}" -n1 -V > /dev/null || test_failed $LINENO +"${LZIP}" -m 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -z 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --bad_option 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --t 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --test=2 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output= 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" --output 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +printf "LZIP\001-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\002-.............................." | "${LZIP}" -t 2> /dev/null +printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null + +"${LZIPRECOVER}" -eq "${testdir}"/test_bad6.lz +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -mq "${bad1_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -Rq +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -sq +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -t --remove=damaged "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged -t "${in_lz}" 2> /dev/null +[ $? 
= 1 ] || test_failed $LINENO +"${LZIPRECOVER}" --remove=tdata -t "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -t --strip=tdata "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=tdata --strip=damaged "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" --remove=tdata --strip=damaged "${in_lz}" 2> /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=damaged +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=damaged in > /dev/null +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=damagedd "${in_lz}" > /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged in > /dev/null +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damagedd "${in_lz}" > /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=damaged +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=damaged in +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=damagedd "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=tdata +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=tdata in > /dev/null +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --dump=tdataa "${in_lz}" > /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=tdata +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=tdata in > /dev/null +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=tdataa "${in_lz}" > /dev/null +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=tdata +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=tdata in +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=tdataa "${in_lz}" +[ $? = 1 ] || test_failed $LINENO + +"${LZIPRECOVER}" -Aq in +[ $? 
= 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -Aq < in > copy.lz # /dev/null returns 1 on OS/2 +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -Aq < "${in_lz}" > copy.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -Aq "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIPRECOVER}" -Akq "${in_lzma}" +[ $? = 1 ] || test_failed $LINENO +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -A "${in_lzma}" -o copy.lz || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -Ac "${in_lzma}" > copy.lz || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -A -o copy.lz < "${in_lzma}" || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -A < "${in_lzma}" > copy.lz || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +rm -f copy.lz || framework_failure +cat "${in_lzma}" > copy.lzma || framework_failure +"${LZIPRECOVER}" -Ak copy.lzma || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +printf "to be overwritten" > copy.lz || framework_failure +"${LZIPRECOVER}" -Af copy.lzma || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +rm -f copy.lz || framework_failure +cat "${in_lzma}" > copy.tlz || framework_failure +"${LZIPRECOVER}" -Ak copy.tlz || test_failed $LINENO +cmp "${in_lz}" copy.tar.lz || test_failed $LINENO +printf "to be overwritten" > copy.tar.lz || framework_failure +"${LZIPRECOVER}" -Af copy.tlz || test_failed $LINENO +cmp "${in_lz}" copy.tar.lz || test_failed $LINENO +rm -f copy.tar.lz || framework_failure +cat in in > in2 || framework_failure +"${LZIPRECOVER}" -A -o out2.lz - "${in_lzma}" - < "${in_lzma}" || + test_failed $LINENO +"${LZIP}" -cd out2.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +rm -f out2.lz copy2 || framework_failure + +printf "\ntesting decompression..." 
+ +for i in "${in_lz}" "${in_em}" ; do + "${LZIP}" -lq "$i" || test_failed $LINENO "$i" + "${LZIP}" -t "$i" || test_failed $LINENO "$i" + "${LZIP}" -d "$i" -o copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -cd "$i" > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -d "$i" -o - > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + "${LZIP}" -d < "$i" > copy || test_failed $LINENO "$i" + cmp in copy || test_failed $LINENO "$i" + rm -f copy || framework_failure +done + +lines=$("${LZIP}" -tvv "${in_em}" 2>&1 | wc -l) || test_failed $LINENO +[ "${lines}" -eq 8 ] || test_failed $LINENO "${lines}" + +lines=$("${LZIP}" -lvv "${in_em}" | wc -l) || test_failed $LINENO +[ "${lines}" -eq 11 ] || test_failed $LINENO "${lines}" + +"${LZIP}" -cd "${fox_lz}" > fox || test_failed $LINENO +cat "${in_lz}" > copy.lz || framework_failure +"${LZIP}" -dk copy.lz || test_failed $LINENO +cmp in copy || test_failed $LINENO +cat fox > copy || framework_failure +cat "${in_lz}" > out.lz || framework_failure +rm -f out || framework_failure +"${LZIP}" -d copy.lz out.lz 2> /dev/null # skip copy, decompress out +[ $? = 1 ] || test_failed $LINENO +cmp fox copy || test_failed $LINENO +cmp in out || test_failed $LINENO +"${LZIP}" -df copy.lz || test_failed $LINENO +[ ! 
-e copy.lz ] || test_failed $LINENO +cmp in copy || test_failed $LINENO +rm -f out || framework_failure + +printf "to be overwritten" > copy || framework_failure +"${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO +cmp in copy || test_failed $LINENO +rm -f out copy || framework_failure +"${LZIP}" -d -o ./- "${in_lz}" || test_failed $LINENO +cmp in ./- || test_failed $LINENO +rm -f ./- || framework_failure +"${LZIP}" -d -o ./- < "${in_lz}" || test_failed $LINENO +cmp in ./- || test_failed $LINENO +rm -f ./- || framework_failure + +cat "${in_lz}" > anyothername || framework_failure +"${LZIP}" -dv - anyothername - < "${in_lz}" > copy 2> /dev/null || + test_failed $LINENO +cmp in copy || test_failed $LINENO +cmp in anyothername.out || test_failed $LINENO +rm -f copy anyothername.out || framework_failure + +"${LZIP}" -lq in "${in_lz}" +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -lq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -tq in "${in_lz}" +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq nx_file.lz "${in_lz}" +[ $? = 1 ] || test_failed $LINENO +"${LZIP}" -cdq in "${in_lz}" > copy +[ $? = 2 ] || test_failed $LINENO +cat copy in | cmp in - || test_failed $LINENO # copy must be empty +"${LZIP}" -cdq nx_file.lz "${in_lz}" > copy +[ $? = 1 ] || test_failed $LINENO +cmp in copy || test_failed $LINENO +rm -f copy || framework_failure +cat "${in_lz}" > copy.lz || framework_failure +for i in 1 2 3 4 5 6 7 ; do + printf "g" >> copy.lz || framework_failure + "${LZIP}" -alvv copy.lz "${in_lz}" > /dev/null 2>&1 + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -atvvvv copy.lz "${in_lz}" 2> /dev/null + [ $? = 2 ] || test_failed $LINENO $i +done +"${LZIP}" -dq in copy.lz +[ $? = 2 ] || test_failed $LINENO +[ -e copy.lz ] || test_failed $LINENO +[ ! -e copy ] || test_failed $LINENO +[ ! -e in.out ] || test_failed $LINENO +"${LZIP}" -dq nx_file.lz copy.lz +[ $? = 1 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +[ ! 
-e nx_file ] || test_failed $LINENO +cmp in copy || test_failed $LINENO + +"${LZIP}" -lq "${in_lz}" "${in_lz}" || test_failed $LINENO +"${LZIP}" -t "${in_lz}" "${in_lz}" || test_failed $LINENO +"${LZIP}" -cd "${in_lz}" "${in_lz}" -o out > copy2 || test_failed $LINENO +[ ! -e out ] || test_failed $LINENO # override -o +cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure +"${LZIP}" -d "${in_lz}" "${in_lz}" -o copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure + +cat "${in_lz}" "${in_lz}" > copy2.lz || framework_failure +printf "\ngarbage" >> copy2.lz || framework_failure +"${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO +"${LZIPRECOVER}" -aD0 -q copy2.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -alq copy2.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -atq copy2.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -atq < copy2.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -adkq copy2.lz +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO +"${LZIP}" -adkq -o copy2 < copy2.lz +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO +printf "to be overwritten" > copy2 || framework_failure +"${LZIP}" -df copy2.lz || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +rm -f copy2 || framework_failure + +"${LZIPRECOVER}" -D ,18000 "${in_lz}" > copy || test_failed $LINENO +"${LZIPRECOVER}" -D 18000 "${in_lz}" >> copy || test_failed $LINENO +cmp in copy || test_failed $LINENO +"${LZIPRECOVER}" -D 21723-22120 -fo copy "${in_lz}" || test_failed $LINENO +cmp "${inD}" copy || test_failed $LINENO +"${LZIPRECOVER}" -D 21723,397 "${in_lz}" > copy || test_failed $LINENO +cmp "${inD}" copy || test_failed $LINENO + +printf "\ntesting bad input..." 
+ +headers='LZIp LZiP LZip LzIP LzIp LziP lZIP lZIp lZiP lzIP' +body='\001\014\000\203\377\373\377\377\300\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000$\000\000\000\000\000\000\000' +cat "${in_lz}" > int.lz +printf "LZIP${body}" >> int.lz +if "${LZIP}" -tq int.lz ; then + for header in ${headers} ; do + printf "${header}${body}" > int.lz # first member + "${LZIP}" -lq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq < int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing < int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq --loose-trailing int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + cat "${in_lz}" > int.lz + printf "${header}${body}" >> int.lz # trailing data + "${LZIP}" -lq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq < int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing < int.lz || + test_failed $LINENO ${header} + "${LZIP}" -cd --loose-trailing int.lz > /dev/null || + test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing --trailing-error int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing --trailing-error int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing --trailing-error < int.lz + [ $? 
= 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq --loose-trailing --trailing-error int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIPRECOVER}" -q --dump=tdata int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIPRECOVER}" -q --strip=tdata int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIPRECOVER}" --dump=tdata --loose-trailing int.lz > \ + /dev/null || test_failed $LINENO ${header} + "${LZIPRECOVER}" --strip=tdata --loose-trailing int.lz > \ + /dev/null || test_failed $LINENO ${header} + "${LZIPRECOVER}" -q --remove=tdata int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIPRECOVER}" --remove=tdata --loose-trailing int.lz || + test_failed $LINENO ${header} + cmp "${in_lz}" int.lz || test_failed $LINENO ${header} + done +else + printf "\nwarning: skipping header test: 'printf' does not work on your system." +fi +rm -f int.lz || framework_failure + +for i in fox_v2.lz fox_s11.lz fox_de20.lz \ + fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do + "${LZIP}" -tq "${testdir}"/$i + [ $? = 2 ] || test_failed $LINENO $i +done + +for i in fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do + "${LZIP}" -cdq "${testdir}"/$i > out + [ $? = 2 ] || test_failed $LINENO $i + cmp fox out || test_failed $LINENO $i + "${LZIPRECOVER}" -tq -i "${testdir}"/$i || test_failed $LINENO $i + "${LZIPRECOVER}" -cdq -i "${testdir}"/$i > out || test_failed $LINENO $i + cmp fox out || test_failed $LINENO $i +done +rm -f fox out || framework_failure + +cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure +cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure +if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && + [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then + for i in 6 20 14734 14753 14754 14755 14756 14757 14758 ; do + dd if=in3.lz of=trunc.lz bs=$i count=1 2> /dev/null + "${LZIP}" -lq trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -tq trunc.lz + [ $? 
= 2 ] || test_failed $LINENO $i + "${LZIP}" -tq < trunc.lz + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -cdq trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + "${LZIP}" -dq < trunc.lz > out + [ $? = 2 ] || test_failed $LINENO $i + done +else + printf "\nwarning: skipping truncation test: 'dd' does not work on your system." +fi +rm -f in3.lz trunc.lz out || framework_failure + +for i in "${f6s1_lz}" "${f6s2_lz}" ; do + lines=`"${LZIP}" -lvv "$i" | wc -l || test_failed $LINENO "$i"` + [ "${lines}" -eq 2 ] || test_failed $LINENO "$i ${lines}" +done +for i in "${f6s3_lz}" "${f6s4_lz}" "${f6s5_lz}" "${f6s6_lz}" ; do + lines=`"${LZIP}" -lvv "$i" | wc -l || test_failed $LINENO "$i"` + [ "${lines}" -eq 9 ] || test_failed $LINENO "$i ${lines}" +done + +cat "${in_lz}" > ingin.lz || framework_failure +printf "g" >> ingin.lz || framework_failure +cat "${in_lz}" >> ingin.lz || framework_failure +"${LZIP}" -lq ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -atq ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -atq < ingin.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -acdq ingin.lz > out +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -adq < ingin.lz > out +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -lq -i ingin.lz || test_failed $LINENO +"${LZIP}" -t ingin.lz || test_failed $LINENO +"${LZIP}" -t < ingin.lz || test_failed $LINENO +"${LZIP}" -cd ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO +"${LZIP}" -d < ingin.lz > copy || test_failed $LINENO +cmp in copy || test_failed $LINENO +"${LZIPRECOVER}" -cd -i ingin.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO + +"${LZIPRECOVER}" -D0 -q "${f6b1_lz}" -fo copy +[ $? = 2 ] || test_failed $LINENO +cmp -s "${f6b1}" copy && test_failed $LINENO +"${LZIPRECOVER}" -D0 -q "${f6b1_lz}" > copy +[ $? 
= 2 ] || test_failed $LINENO +cmp -s "${f6b1}" copy && test_failed $LINENO +"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" -fo copy || test_failed $LINENO +cmp "${f6b1}" copy || test_failed $LINENO +"${LZIPRECOVER}" -D0 -iq "${f6b1_lz}" > copy || test_failed $LINENO +cmp "${f6b1}" copy || test_failed $LINENO + +touch empty || framework_failure +"${LZIPRECOVER}" -D0 -q ingin.lz > copy +[ $? = 2 ] || test_failed $LINENO +cmp empty copy || test_failed $LINENO +"${LZIPRECOVER}" -D0 -i ingin.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +printf "LZIP\001+" > in2t.lz || framework_failure # gap size < 36 bytes +cat "${in_lz}" in "${in_lz}" >> in2t.lz || framework_failure +printf "LZIP\001-" >> in2t.lz || framework_failure # truncated member +"${LZIPRECOVER}" -D0 -iq in2t.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +"${LZIPRECOVER}" -cd -iq in2t.lz > copy2 || test_failed $LINENO +cmp in2 copy2 || test_failed $LINENO +"${LZIPRECOVER}" -t -iq in2t.lz || test_failed $LINENO +rm -f in2 in2t.lz copy copy2 || framework_failure + +printf "\ntesting --merge..." + +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -m -o copy.lz "${fox6_lz}" "${f6b1_lz}" || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -m -o copy.lz "${f6b1_lz}" "${fox6_lz}" || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -m -o copy.lz "${bad1_lz}" "${bad2_lz}" "${bad1_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -m -o copy.lz "${bad1_lz}" "${bad2_lz}" "${bad2_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +cat "${bad2_lz}" > bad2.lz || framework_failure +"${LZIPRECOVER}" -m -o copy.lz "${bad1_lz}" "${bad2_lz}" bad2.lz -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +rm -f bad2.lz || framework_failure +"${LZIPRECOVER}" -m -o copy.lz "${f6b1_lz}" "${f6b5_lz}" -q +[ $? 
= 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -m -o copy.lz "${f6b3_lz}" "${f6b5_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -m -o copy.lz "${bad3_lz}" "${bad4_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO + +"${LZIPRECOVER}" -mf -o copy.lz "${f6b1_lz}" "${f6b4_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${f6b4_lz}" "${f6b1_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO + +for i in "${f6b1_lz}" "${f6b3_lz}" "${f6b4_lz}" "${f6b5_lz}" "${f6b6_lz}" ; do + "${LZIPRECOVER}" -mf -o copy.lz "${f6b2_lz}" "$i" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "$i" "${f6b2_lz}" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" +done + +for i in "${f6b3_lz}" "${f6b4_lz}" "${f6b5_lz}" "${f6b6_lz}" ; do + "${LZIPRECOVER}" -mf -o copy.lz "${f6b1_lz}" "${f6b2_lz}" "$i" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "${f6b1_lz}" "$i" "${f6b2_lz}" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "${f6b2_lz}" "${f6b1_lz}" "$i" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "${f6b2_lz}" "$i" "${f6b1_lz}" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "$i" "${f6b1_lz}" "${f6b2_lz}" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" -mf -o copy.lz "$i" "${f6b2_lz}" "${f6b1_lz}" || + test_failed $LINENO "$i" + cmp "${fox6_lz}" copy.lz || test_failed $LINENO "$i" +done + +"${LZIPRECOVER}" -mf -o copy.lz "${f6b3_lz}" 
"${f6b4_lz}" "${f6b5_lz}" || + test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${f6b1_lz}" "${f6b3_lz}" "${f6b4_lz}" \ +"${f6b5_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${f6b2_lz}" "${f6b3_lz}" "${f6b4_lz}" \ +"${f6b5_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${f6b1_lz}" "${f6b2_lz}" "${f6b3_lz}" \ +"${f6b4_lz}" "${f6b5_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO + +"${LZIPRECOVER}" -mf -o copy.lz "${bad1_lz}" "${bad2_lz}" || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad2_lz}" "${bad1_lz}" || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO + +cat "${bad1_lz}" "${in_lz}" "${bad1_lz}" "${bad1_lz}" > bad11.lz || framework_failure +cat "${bad1_lz}" "${in_lz}" "${bad2_lz}" "${in_lz}" > bad12.lz || framework_failure +cat "${bad2_lz}" "${in_lz}" "${bad2_lz}" "${bad2_lz}" > bad22.lz || framework_failure +cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" > copy4.lz || framework_failure +"${LZIPRECOVER}" -mf -o out4.lz bad11.lz bad12.lz bad22.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad11.lz bad22.lz bad12.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad12.lz bad11.lz bad22.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad12.lz bad22.lz bad11.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad22.lz bad11.lz bad12.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad22.lz bad12.lz bad11.lz || test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +rm -f bad11.lz bad12.lz bad22.lz || 
framework_failure + +for i in "${bad1_lz}" "${bad2_lz}" ; do + for j in "${bad3_lz}" "${bad4_lz}" "${bad5_lz}" ; do + "${LZIPRECOVER}" -mf -o copy.lz "$i" "$j" || + test_failed $LINENO "$i $j" + cmp "${in_lz}" copy.lz || test_failed $LINENO "$i $j" + "${LZIPRECOVER}" -mf -o copy.lz "$j" "$i" || + test_failed $LINENO "$i $j" + cmp "${in_lz}" copy.lz || test_failed $LINENO "$i $j" + done +done + +"${LZIPRECOVER}" -mf -o copy.lz "${bad3_lz}" "${bad4_lz}" "${bad5_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad3_lz}" "${bad5_lz}" "${bad4_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad4_lz}" "${bad3_lz}" "${bad5_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad4_lz}" "${bad5_lz}" "${bad3_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad5_lz}" "${bad3_lz}" "${bad4_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o copy.lz "${bad5_lz}" "${bad4_lz}" "${bad3_lz}" || + test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO + +cat "${bad3_lz}" "${bad4_lz}" "${bad5_lz}" "${in_lz}" > bad345.lz || framework_failure +cat "${bad4_lz}" "${bad5_lz}" "${bad3_lz}" "${in_lz}" > bad453.lz || framework_failure +cat "${bad5_lz}" "${bad3_lz}" "${bad4_lz}" "${in_lz}" > bad534.lz || framework_failure +cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" > copy4.lz || framework_failure +"${LZIPRECOVER}" -mf -o out4.lz bad345.lz bad453.lz bad534.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad345.lz bad534.lz bad453.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad453.lz bad345.lz bad534.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed 
$LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad453.lz bad534.lz bad345.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad534.lz bad345.lz bad453.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +"${LZIPRECOVER}" -mf -o out4.lz bad534.lz bad453.lz bad345.lz || + test_failed $LINENO +cmp out4.lz copy4.lz || test_failed $LINENO +rm -f bad345.lz bad453.lz bad534.lz out4.lz copy4.lz || framework_failure + +printf "\ntesting --repair..." + +rm -f copy.lz || framework_failure +"${LZIPRECOVER}" -R -o copy.lz "${fox6_lz}" || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -R -o copy.lz "${bad2_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -R -o copy.lz "${bad3_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -R -o copy.lz "${bad4_lz}" -q +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +"${LZIPRECOVER}" -Rf -o copy.lz "${f6b1_lz}" || test_failed $LINENO +cmp "${fox6_lz}" copy.lz || test_failed $LINENO +"${LZIPRECOVER}" -Rf -o copy.lz "${bad1_lz}" || test_failed $LINENO +cmp "${in_lz}" copy.lz || test_failed $LINENO + +cat "${f6b1_lz}" > copy.tar.lz || framework_failure +"${LZIPRECOVER}" -R copy.tar.lz || test_failed $LINENO +[ -e copy_fixed.tar.lz ] || test_failed $LINENO +mv copy.tar.lz copy.lz || framework_failure +"${LZIPRECOVER}" -R copy.lz || test_failed $LINENO +[ -e copy_fixed.lz ] || test_failed $LINENO +mv copy.lz copy.tlz || framework_failure +"${LZIPRECOVER}" -R copy.tlz || test_failed $LINENO +[ -e copy_fixed.tlz ] || test_failed $LINENO +rm -f copy_fixed.tlz copy_fixed.lz copy_fixed.tar.lz copy.tlz || + framework_failure + +printf "\ntesting --reproduce..." 
+
+# --reproduce tests. They drive an external compressor named by LZIP_NAME,
+# which defaults to 'lzip' when unset or empty.
+if [ -z "${LZIP_NAME}" ] ; then LZIP_NAME=lzip ; fi
+# Run the section only if "${LZIP_NAME} -s18KiB" turns 'in' into a byte-exact
+# copy of ${in_lz}; otherwise fall through to the warning in the else branch.
+if /bin/sh -c "${LZIP_NAME} -s18KiB" < in > out 2> /dev/null &&
+    cmp "${in_lz}" out > /dev/null 2>&1 ; then
+  rm -f out || framework_failure
+  # On ${in_lz} the command must succeed and must not create 'out'.
+  "${LZIPRECOVER}" --reproduce --lzip-name="${LZIP_NAME}" -o out \
+    --reference-file=foo "${in_lz}" || test_failed $LINENO "${LZIP_NAME}"
+  [ ! -e out ] || test_failed $LINENO
+
+  # For each test_bad<i>.lz, using either test_bad<i>.txt or test.txt as the
+  # reference file, the output must equal ${in_lz}. Each case is run three
+  # times: without --lzip-level, with --lzip-level=6, with --lzip-level=m36.
+  for i in 6 7 8 9 ; do
+    for f in "${testdir}"/test_bad${i}.txt "${testdir}"/test.txt ; do
+      rm -f out || framework_failure
+      "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \
+        --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out ||
+        test_failed $LINENO "${LZIP_NAME} $i $f"
+      cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f"
+      rm -f out || framework_failure
+      "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \
+        --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out \
+        --lzip-level=6 || test_failed $LINENO "${LZIP_NAME} $i $f level=6"
+      cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f level=6"
+      rm -f out || framework_failure
+      "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \
+        --reference-file="$f" "${testdir}"/test_bad${i}.lz -o out \
+        --lzip-level=m36 || test_failed $LINENO "${LZIP_NAME} $i $f level=m36"
+      cmp "${in_lz}" out || test_failed $LINENO "${LZIP_NAME} $i $f level=m36"
+    done
+  done
+
+  # in4.lz (four copies of ${in_lz}) is the expected result of both
+  # multimember runs below.
+  cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" > in4.lz || framework_failure
+  # multimember reproduction using test_bad[6789].txt as reference
+  cat "${testdir}"/test_bad6.lz "${testdir}"/test_bad7.lz \
+    "${testdir}"/test_bad8.lz "${testdir}"/test_bad9.lz > mm_bad.lz ||
+    framework_failure
+  rm -f out || framework_failure
+  for i in 6 7 8 9 ; do			# reproduce one member each time
+    "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \
+      --reference-file="${testdir}"/test_bad${i}.txt mm_bad.lz -o out ||
+      test_failed $LINENO "${LZIP_NAME} $i"
+    # The output of this pass becomes the input of the next one.
+    mv -f out mm_bad.lz
+  done
+  cmp in4.lz mm_bad.lz || test_failed $LINENO "${LZIP_NAME}"
+
+  # multimember reproduction using test.txt as reference
+  cat "${testdir}"/test_bad6.lz "${testdir}"/test_bad7.lz \
+    "${testdir}"/test_bad8.lz "${testdir}"/test_bad9.lz > mm_bad.lz ||
+    framework_failure
+  rm -f out || framework_failure
+  for i in 6 7 8 9 ; do			# reproduce one member each time
+    "${LZIPRECOVER}" -q --reproduce --lzip-name="${LZIP_NAME}" \
+      --reference-file="${testdir}"/test.txt mm_bad.lz -o out ||
+      test_failed $LINENO "${LZIP_NAME} $i"
+    mv -f out mm_bad.lz
+  done
+  cmp in4.lz mm_bad.lz || test_failed $LINENO "${LZIP_NAME}"
+  rm -f in4.lz mm_bad.lz || framework_failure
+
+  # --debug-reproduce must succeed with both argument forms used here.
+  "${LZIPRECOVER}" -q --debug-reproduce=13-7356 --lzip-name="${LZIP_NAME}" \
+    --reference-file="${testdir}"/test.txt "${testdir}"/test.txt.lz ||
+    test_failed $LINENO "${LZIP_NAME}"
+
+  "${LZIPRECOVER}" -q --debug-reproduce=512,5120,512 --lzip-name="${LZIP_NAME}" \
+    --reference-file="${testdir}"/test.txt "${testdir}"/test.txt.lz ||
+    test_failed $LINENO "${LZIP_NAME}"
+else
+  # LZIP_NAME comes from the user ('make LZIP_NAME=...'); pass it as a %s
+  # argument so that any '%' or '\' in it is printed literally instead of
+  # being interpreted as part of the printf format.
+  printf "\nwarning: skipping --reproduce test: %s not found or not the right version." "${LZIP_NAME}"
+  printf "\nTry 'make LZIP_NAME=<name_of_lzip_executable> check'."
+fi
+
+printf "\ntesting --split..."
+
+# --split (-s) tests. Pieces are written as rec<N><name>; every group ends by
+# checking that the pieces, concatenated in glob order, reproduce the input.
+#
+# Nine concatenated copies of ${in_lz}: rec1..rec9 must each equal ${in_lz}
+# and decompress to 'in'. in9.lz is kept because later tests read it.
+cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" \
+  "${in_lz}" "${in_lz}" "${in_lz}" > in9.lz || framework_failure
+"${LZIPRECOVER}" -s in9.lz || test_failed $LINENO
+for i in 1 2 3 4 5 6 7 8 9 ; do
+  cmp "${in_lz}" rec${i}in9.lz || test_failed $LINENO $i
+  "${LZIP}" -cd rec${i}in9.lz > copy || test_failed $LINENO $i
+  cmp in copy || test_failed $LINENO $i
+done
+cat rec*in9.lz | cmp in9.lz - || test_failed $LINENO
+rm -f rec*in9.lz || framework_failure
+
+# Same input with the string "garbage" appended: piece numbers are now two
+# digits wide, rec01..rec09 equal ${in_lz}, and exactly one more piece
+# (rec10) must exist.
+cat in9.lz > in9t.lz || framework_failure
+printf "garbage" >> in9t.lz || framework_failure
+"${LZIPRECOVER}" -s in9t.lz || test_failed $LINENO
+for i in 01 02 03 04 05 06 07 08 09 ; do
+  cmp "${in_lz}" rec${i}in9t.lz || test_failed $LINENO $i
+  "${LZIP}" -cd rec${i}in9t.lz > copy || test_failed $LINENO $i
+  cmp in copy || test_failed $LINENO $i
+done
+[ -e rec10in9t.lz ] || test_failed $LINENO
+[ ! -e rec11in9t.lz ] || test_failed $LINENO
+cat rec*in9t.lz | cmp in9t.lz - || test_failed $LINENO
+rm -f rec*in9t.lz in9t.lz || framework_failure
+
+# Input layout: 6-byte "LZIP\001+" prefix, 3 x in_lz, 'in', 6 x in_lz, 'in'.
+# Expected pieces: rec01 exists (only its existence is checked), rec02-04 and
+# rec06-11 equal ${in_lz}, rec05 and rec12 equal the plain file 'in', and
+# there is no rec13.
+printf "LZIP\001+" > in9t.lz || framework_failure	# gap size < 36 bytes
+cat "${in_lz}" "${in_lz}" "${in_lz}" in "${in_lz}" "${in_lz}" "${in_lz}" \
+  "${in_lz}" "${in_lz}" "${in_lz}" in >> in9t.lz || framework_failure
+"${LZIPRECOVER}" -s in9t.lz || test_failed $LINENO
+for i in 02 03 04 06 07 08 09 10 11 ; do
+  cmp "${in_lz}" rec${i}in9t.lz || test_failed $LINENO $i
+  "${LZIP}" -cd rec${i}in9t.lz > copy || test_failed $LINENO $i
+  cmp in copy || test_failed $LINENO $i
+done
+cmp in rec05in9t.lz || test_failed $LINENO
+cmp in rec12in9t.lz || test_failed $LINENO
+[ -e rec01in9t.lz ] || test_failed $LINENO
+[ ! -e rec13in9t.lz ] || test_failed $LINENO
+cat rec*in9t.lz | cmp in9t.lz - || test_failed $LINENO
+rm -f rec*in9t.lz in9t.lz || framework_failure
+
+# Input layout: 8 x in_lz, 'in', in_lz, then a trailing "LZIP\001-".
+# Expected pieces: rec01-08 and rec10 equal ${in_lz}, rec09 equals 'in',
+# rec11 exists, and there is no rec12.
+cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" \
+  "${in_lz}" "${in_lz}" in "${in_lz}" > in9t.lz || framework_failure
+printf "LZIP\001-" >> in9t.lz || framework_failure	# truncated member
+"${LZIPRECOVER}" -s in9t.lz || test_failed $LINENO
+for i in 01 02 03 04 05 06 07 08 10 ; do
+  cmp "${in_lz}" rec${i}in9t.lz || test_failed $LINENO $i
+  "${LZIP}" -cd rec${i}in9t.lz > copy || test_failed $LINENO $i
+  cmp in copy || test_failed $LINENO $i
+done
+cmp in rec09in9t.lz || test_failed $LINENO
+[ -e rec11in9t.lz ] || test_failed $LINENO
+[ ! -e rec12in9t.lz ] || test_failed $LINENO
+cat rec*in9t.lz | cmp in9t.lz - || test_failed $LINENO
+rm -f rec*in9t.lz in9t.lz || framework_failure
+
+# Input layout: 3 x in_lz, 'in', in_lz, "LZIP\001-" in the middle, 5 x in_lz.
+# Expected pieces: rec01-03, rec05 and rec07-11 equal ${in_lz}, rec04 equals
+# 'in', rec06 exists, and there is no rec12.
+cat "${in_lz}" "${in_lz}" "${in_lz}" in "${in_lz}" > in9t.lz || framework_failure
+printf "LZIP\001-" >> in9t.lz || framework_failure	# truncated member
+cat "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" "${in_lz}" >> in9t.lz ||
+  framework_failure
+"${LZIPRECOVER}" -s in9t.lz || test_failed $LINENO
+for i in 01 02 03 05 07 08 09 10 11 ; do
+  cmp "${in_lz}" rec${i}in9t.lz || test_failed $LINENO $i
+  "${LZIP}" -cd rec${i}in9t.lz > copy || test_failed $LINENO $i
+  cmp in copy || test_failed $LINENO $i
+done
+cmp in rec04in9t.lz || test_failed $LINENO
+[ -e rec06in9t.lz ] || test_failed $LINENO
+[ ! -e rec12in9t.lz ] || test_failed $LINENO
+cat rec*in9t.lz | cmp in9t.lz - || test_failed $LINENO
+rm -f rec*in9t.lz in9t.lz || framework_failure
+
+# Splitting the f6b*/f6s* fixtures with -o f6.lz, so pieces are rec<N>f6.lz.
+# f6b1: six pieces must exist; their contents are not compared individually.
+"${LZIPRECOVER}" -s "${f6b1_lz}" -o f6.lz || test_failed $LINENO
+for i in 1 2 3 4 5 6 ; do
+  [ -e rec${i}f6.lz ] || test_failed $LINENO
+done
+[ ! -e rec7f6.lz ] || test_failed $LINENO
+cat rec*f6.lz | cmp "${f6b1_lz}" - || test_failed $LINENO
+rm -f rec*f6.lz || framework_failure
+
+# f6b2: six pieces; all but rec2 equal ${fox_lz}.
+"${LZIPRECOVER}" -s "${f6b2_lz}" -o f6.lz || test_failed $LINENO
+for i in 1 3 4 5 6 ; do
+  cmp "${fox_lz}" rec${i}f6.lz || test_failed $LINENO
+done
+[ -e rec2f6.lz ] || test_failed $LINENO
+[ ! -e rec7f6.lz ] || test_failed $LINENO
+cat rec*f6.lz | cmp "${f6b2_lz}" - || test_failed $LINENO
+rm -f rec*f6.lz || framework_failure
+
+# f6b3: four pieces; all but rec3 equal ${fox_lz}.
+"${LZIPRECOVER}" -s "${f6b3_lz}" -o f6.lz || test_failed $LINENO
+for i in 1 2 4 ; do
+  cmp "${fox_lz}" rec${i}f6.lz || test_failed $LINENO
+done
+[ -e rec3f6.lz ] || test_failed $LINENO
+[ ! -e rec5f6.lz ] || test_failed $LINENO
+cat rec*f6.lz | cmp "${f6b3_lz}" - || test_failed $LINENO
+rm -f rec*f6.lz || framework_failure
+
+# f6b4 and f6b5: five pieces; rec1-4 equal ${fox_lz}, rec5 only has to exist.
+for i in "${f6b4_lz}" "${f6b5_lz}" ; do
+  "${LZIPRECOVER}" -s "$i" -o f6.lz || test_failed $LINENO
+  for j in 1 2 3 4 ; do
+    cmp "${fox_lz}" rec${j}f6.lz || test_failed $LINENO
+  done
+  [ -e rec5f6.lz ] || test_failed $LINENO
+  [ ! -e rec6f6.lz ] || test_failed $LINENO
+  cat rec*f6.lz | cmp "$i" - || test_failed $LINENO
+  rm -f rec*f6.lz || framework_failure
+done
+
+# f6b6: six pieces; rec1-5 equal ${fox_lz}, rec6 only has to exist.
+"${LZIPRECOVER}" -s "${f6b6_lz}" -o f6.lz || test_failed $LINENO
+for i in 1 2 3 4 5 ; do
+  cmp "${fox_lz}" rec${i}f6.lz || test_failed $LINENO
+done
+[ -e rec6f6.lz ] || test_failed $LINENO
+[ ! -e rec7f6.lz ] || test_failed $LINENO
+cat rec*f6.lz | cmp "${f6b6_lz}" - || test_failed $LINENO
+rm -f rec*f6.lz || framework_failure
+
+# f6s1: same expectations as f6b6.
+"${LZIPRECOVER}" -s "${f6s1_lz}" -o f6.lz || test_failed $LINENO
+for i in 1 2 3 4 5 ; do
+  cmp "${fox_lz}" rec${i}f6.lz || test_failed $LINENO
+done
+[ -e rec6f6.lz ] || test_failed $LINENO
+[ ! -e rec7f6.lz ] || test_failed $LINENO
+cat rec*f6.lz | cmp "${f6s1_lz}" - || test_failed $LINENO
+rm -f rec*f6.lz || framework_failure
+# f6s2..f6s6: seven pieces; rec1-6 equal ${fox_lz}, rec7 only has to exist.
+for i in "${f6s2_lz}" "${f6s3_lz}" "${f6s4_lz}" "${f6s5_lz}" "${f6s6_lz}" ; do
+  "${LZIPRECOVER}" -s "$i" -o f6.lz || test_failed $LINENO "$i"
+  for j in 1 2 3 4 5 6 ; do
+    cmp "${fox_lz}" rec${j}f6.lz || test_failed $LINENO "$i $j"
+  done
+  [ -e rec7f6.lz ] || test_failed $LINENO "$i"
+  [ ! -e rec8f6.lz ] || test_failed $LINENO "$i"
+  cat rec*f6.lz | cmp "$i" - || test_failed $LINENO "$i"
+  rm -f rec*f6.lz || framework_failure
+done
+
+# ingin.lz (built earlier in the script: in_lz, the byte "g", in_lz) must
+# split into exactly three pieces, the middle one being the lone "g".
+"${LZIPRECOVER}" -s ingin.lz || test_failed $LINENO
+cmp "${in_lz}" rec1ingin.lz || test_failed $LINENO
+cmp "${in_lz}" rec3ingin.lz || test_failed $LINENO
+printf "g" | cmp rec2ingin.lz - || test_failed $LINENO
+[ ! -e rec4ingin.lz ] || test_failed $LINENO
+cat rec*ingin.lz | cmp ingin.lz - || test_failed $LINENO
+rm -f rec*ingin.lz || framework_failure
+
+printf "\ntesting --*=damaged..."
+
+# in.lz is a copy of ${in_lz}; int.lz is ${in_lz} followed by the plain file
+# 'in'. The option argument is written as successively shorter prefixes of
+# "damaged" (damage, damag, dama, dam, da, d); every call must succeed.
+# With these inputs --dump outputs nothing ('copy' equals the file 'empty')
+# and --strip / --remove leave the data unchanged.
+cat "${in_lz}" > in.lz || framework_failure
+cat "${in_lz}" in > int.lz || framework_failure
+"${LZIPRECOVER}" --dump=damaged in.lz > copy || test_failed $LINENO
+cmp empty copy || test_failed $LINENO
+"${LZIPRECOVER}" --dump=damage int.lz > copy || test_failed $LINENO
+cmp empty copy || test_failed $LINENO
+"${LZIPRECOVER}" --strip=damag in.lz > copy || test_failed $LINENO
+cmp in.lz copy || test_failed $LINENO
+"${LZIPRECOVER}" --strip=dama int.lz > copy || test_failed $LINENO
+cmp int.lz copy || test_failed $LINENO
+# strip trailing data from all but the last file
+"${LZIPRECOVER}" --strip=dam int.lz int.lz > copy || test_failed $LINENO
+cat "${in_lz}" "${in_lz}" in | cmp copy - || test_failed $LINENO
+"${LZIPRECOVER}" --remove=da in.lz || test_failed $LINENO
+cmp "${in_lz}" in.lz || test_failed $LINENO
+"${LZIPRECOVER}" --remove=d int.lz || test_failed $LINENO
+cat "${in_lz}" in | cmp int.lz - || test_failed $LINENO
+rm -f in.lz int.lz || framework_failure
+
+# in9t.lz: the nine-member file followed by the plain file 'in'.
+cat in9.lz in > in9t.lz || framework_failure
+"${LZIPRECOVER}" --dump=damaged in9.lz > copy || test_failed $LINENO +cmp empty copy || test_failed $LINENO +"${LZIPRECOVER}" --dump=damaged in9t.lz > copy || test_failed $LINENO +cmp empty copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged in9.lz > copy || test_failed $LINENO +cmp in9.lz copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged in9t.lz > copy || test_failed $LINENO +cmp in9t.lz copy || test_failed $LINENO +"${LZIPRECOVER}" --remove=damaged in9t.lz || test_failed $LINENO +cat in9.lz in | cmp in9t.lz - || test_failed $LINENO +cat in9.lz > in9t.lz || framework_failure +"${LZIPRECOVER}" --remove=damaged in9t.lz || test_failed $LINENO +cmp in9.lz in9t.lz || test_failed $LINENO +rm -f in9t.lz || framework_failure + +printf "LZIP\001+" > in9t.lz || framework_failure # gap size < 36 bytes +cat "${in_lz}" "${in_lz}" "${in_lz}" in "${in_lz}" "${in_lz}" "${in_lz}" \ + "${in_lz}" "${in_lz}" "${in_lz}" >> in9t.lz || framework_failure +printf "LZIP\001-" >> in9t.lz || framework_failure # truncated member +printf "LZIP\001+" > gaps || framework_failure +cat in >> gaps || framework_failure +printf "LZIP\001-" >> gaps || framework_failure +"${LZIPRECOVER}" --dump=damaged in9t.lz > copy || test_failed $LINENO +cmp gaps copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged in9t.lz > copy || test_failed $LINENO +cmp in9.lz copy || test_failed $LINENO +"${LZIPRECOVER}" --remove=damaged in9t.lz || test_failed $LINENO +cmp in9.lz in9t.lz || test_failed $LINENO +rm -f in9.lz in9t.lz gaps || framework_failure + +"${LZIPRECOVER}" --dump=damaged "${f6b1_lz}" > copy || test_failed $LINENO +cmp "${f6b1_lz}" copy || test_failed $LINENO +cat "${f6b1_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" --dump=damaged f6bt.lz > copy || test_failed $LINENO +cmp "${f6b1_lz}" copy || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged "${f6b1_lz}" > copy || test_failed $LINENO +cmp empty copy || test_failed $LINENO +"${LZIPRECOVER}" -q 
--strip=damaged f6bt.lz > copy || test_failed $LINENO +cmp empty copy || test_failed $LINENO +cat "${f6b1_lz}" > f6b.lz || framework_failure +"${LZIPRECOVER}" -q --remove=damaged f6b.lz +[ $? = 2 ] || test_failed $LINENO +cmp "${f6b1_lz}" f6b.lz || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=damaged f6bt.lz +[ $? = 2 ] || test_failed $LINENO +cat "${f6b1_lz}" in | cmp f6bt.lz - || test_failed $LINENO +rm -f f6b.lz f6bt.lz || framework_failure + +"${LZIPRECOVER}" --dump=damaged "${f6b2_lz}" > copy || test_failed $LINENO +cat "${fox_lz}" copy "${fox_lz}" "${fox_lz}" "${fox_lz}" \ + "${fox_lz}" | cmp "${f6b2_lz}" - || test_failed $LINENO +cat "${f6b2_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" --dump=damaged f6bt.lz > copy || test_failed $LINENO +cat "${fox_lz}" copy "${fox_lz}" "${fox_lz}" "${fox_lz}" \ + "${fox_lz}" | cmp "${f6b2_lz}" - || test_failed $LINENO +cat "${fox_lz}" "${fox_lz}" "${fox_lz}" "${fox_lz}" "${fox_lz}" > fox5.lz +"${LZIPRECOVER}" --strip=damaged "${f6b2_lz}" > copy || test_failed $LINENO +cmp fox5.lz copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged f6bt.lz > copy || test_failed $LINENO +cat fox5.lz in | cmp copy - || test_failed $LINENO +cat "${f6b2_lz}" > f6b.lz || framework_failure +"${LZIPRECOVER}" --remove=damaged f6b.lz || test_failed $LINENO +cmp fox5.lz f6b.lz || test_failed $LINENO +"${LZIPRECOVER}" --remove=damaged f6bt.lz || test_failed $LINENO +cat fox5.lz in | cmp f6bt.lz - || test_failed $LINENO +rm -f f6b.lz f6bt.lz || framework_failure + +"${LZIPRECOVER}" --dump=damaged "${f6b3_lz}" > copy || test_failed $LINENO +cat "${fox_lz}" "${fox_lz}" copy "${fox_lz}" | cmp "${f6b3_lz}" - || + test_failed $LINENO +cat "${f6b3_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" --dump=damaged f6bt.lz > copy || test_failed $LINENO +cat "${fox_lz}" "${fox_lz}" copy "${fox_lz}" | cmp "${f6b3_lz}" - || + test_failed $LINENO +cat "${fox_lz}" "${fox_lz}" "${fox_lz}" > fox3.lz +"${LZIPRECOVER}" --strip=damaged 
"${f6b3_lz}" > copy || test_failed $LINENO +cmp fox3.lz copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged f6bt.lz > copy || test_failed $LINENO +cat fox3.lz in | cmp copy - || test_failed $LINENO +cat "${f6b3_lz}" > f6b.lz || framework_failure +"${LZIPRECOVER}" --remove=damaged f6b.lz || test_failed $LINENO +cmp fox3.lz f6b.lz || test_failed $LINENO +"${LZIPRECOVER}" --remove=damaged f6bt.lz || test_failed $LINENO +cat fox3.lz in | cmp f6bt.lz - || test_failed $LINENO +rm -f f6b.lz f6bt.lz fox3.lz || framework_failure + +cat "${fox_lz}" "${fox_lz}" "${fox_lz}" "${fox_lz}" > fox4.lz +for i in "${f6b4_lz}" "${f6b5_lz}" ; do + "${LZIPRECOVER}" --dump=damaged "$i" > copy || test_failed $LINENO "$i" + cat fox4.lz copy | cmp "$i" - || test_failed $LINENO "$i" + cat "$i" in > f6bt.lz || framework_failure + "${LZIPRECOVER}" --dump=damaged f6bt.lz > copy || + test_failed $LINENO "$i" + cat fox4.lz copy | cmp f6bt.lz - || test_failed $LINENO "$i" + "${LZIPRECOVER}" --strip=damaged "$i" > copy || test_failed $LINENO "$i" + cmp fox4.lz copy || test_failed $LINENO "$i" + "${LZIPRECOVER}" --strip=damaged f6bt.lz > copy || + test_failed $LINENO "$i" + cmp fox4.lz copy || test_failed $LINENO "$i" + cat "$i" > f6b.lz || framework_failure + "${LZIPRECOVER}" --remove=damaged f6b.lz || test_failed $LINENO "$i" + cmp fox4.lz f6b.lz || test_failed $LINENO "$i" + "${LZIPRECOVER}" --remove=damaged f6bt.lz || test_failed $LINENO "$i" + cmp fox4.lz f6bt.lz || test_failed $LINENO "$i" +done +rm -f f6b.lz f6bt.lz fox4.lz || framework_failure + +"${LZIPRECOVER}" --dump=damaged "${f6b6_lz}" > copy || test_failed $LINENO +cat fox5.lz copy | cmp "${f6b6_lz}" - || test_failed $LINENO +cat "${f6b6_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" --dump=damaged f6bt.lz > copy || test_failed $LINENO +cat fox5.lz copy | cmp "${f6b6_lz}" - || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged "${f6b6_lz}" > copy || test_failed $LINENO +cmp fox5.lz copy || test_failed $LINENO 
+# Rest of the f6b6 group: f6bt.lz is f6b6 followed by 'in', f6b.lz a copy of
+# f6b6. --strip / --remove must leave fox5.lz, plus 'in' for f6bt.lz.
+"${LZIPRECOVER}" --strip=damaged f6bt.lz > copy || test_failed $LINENO
+cat fox5.lz in | cmp copy - || test_failed $LINENO
+cat "${f6b6_lz}" > f6b.lz || framework_failure
+"${LZIPRECOVER}" --remove=damaged f6b.lz || test_failed $LINENO
+cmp fox5.lz f6b.lz || test_failed $LINENO
+"${LZIPRECOVER}" --remove=damaged f6bt.lz || test_failed $LINENO
+cat fox5.lz in | cmp f6bt.lz - || test_failed $LINENO
+rm -f f6b.lz f6bt.lz || framework_failure
+
+# f6s1 and f6s2, alone and with 'in' appended: --dump outputs the whole
+# original file, --strip outputs nothing ('copy' equals 'empty'), and
+# --remove exits with status 2 leaving the file as it was.
+for i in "${f6s1_lz}" "${f6s2_lz}" ; do
+  "${LZIPRECOVER}" --dump=damaged "$i" > copy || test_failed $LINENO "$i"
+  cmp "$i" copy || test_failed $LINENO "$i"
+  cat "$i" in > f6bt.lz || framework_failure
+  "${LZIPRECOVER}" --dump=damaged f6bt.lz > copy ||
+    test_failed $LINENO "$i"
+  cmp "$i" copy || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" -q --strip=damaged "$i" > copy ||
+    test_failed $LINENO "$i"
+  cmp empty copy || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" -q --strip=damaged f6bt.lz > copy ||
+    test_failed $LINENO "$i"
+  cmp empty copy || test_failed $LINENO "$i"
+  cat "$i" > f6b.lz || framework_failure
+  "${LZIPRECOVER}" -q --remove=damaged f6b.lz
+  [ $? = 2 ] || test_failed $LINENO "$i"
+  cmp "$i" f6b.lz || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" -q --remove=damaged f6bt.lz
+  [ $? = 2 ] || test_failed $LINENO "$i"
+  cat "$i" in | cmp f6bt.lz - || test_failed $LINENO "$i"
+done
+rm -f f6b.lz f6bt.lz || framework_failure
+
+# f6s3..f6s6, alone and with 'in' appended: --dump outputs nothing, and
+# --strip / --remove succeed without changing the data.
+for i in "${f6s3_lz}" "${f6s4_lz}" "${f6s5_lz}" "${f6s6_lz}" ; do
+  "${LZIPRECOVER}" --dump=damaged "$i" > copy || test_failed $LINENO "$i"
+  cmp empty copy || test_failed $LINENO "$i"
+  cat "$i" in > f6bt.lz || framework_failure
+  "${LZIPRECOVER}" --dump=damaged f6bt.lz > copy ||
+    test_failed $LINENO "$i"
+  cmp empty copy || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" --strip=damaged "$i" > copy || test_failed $LINENO "$i"
+  cmp "$i" copy || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" --strip=damaged f6bt.lz > copy ||
+    test_failed $LINENO "$i"
+  cat "$i" in | cmp copy - || test_failed $LINENO "$i"
+  cat "$i" > f6b.lz || framework_failure
+  "${LZIPRECOVER}" --remove=damaged f6b.lz || test_failed $LINENO "$i"
+  cmp "$i" f6b.lz || test_failed $LINENO "$i"
+  "${LZIPRECOVER}" --remove=damaged f6bt.lz || test_failed $LINENO "$i"
+  cat "$i" in | cmp f6bt.lz - || test_failed $LINENO "$i"
+done
+rm -f f6b.lz f6bt.lz || framework_failure
+
+# ingin.lz (in_lz, the byte "g", in_lz) and ingint.lz (ingin.lz followed by
+# ${inD}): the dumped part is the lone "g"; --strip / --remove leave the two
+# members (in2.lz), plus ${inD} for ingint.lz.
+cat ingin.lz "${inD}" > ingint.lz || framework_failure
+"${LZIPRECOVER}" --dump=damaged ingin.lz > copy || test_failed $LINENO
+printf "g" | cmp copy - || test_failed $LINENO
+"${LZIPRECOVER}" --dump=damaged ingint.lz > copy || test_failed $LINENO
+printf "g" | cmp copy - || test_failed $LINENO
+"${LZIPRECOVER}" --strip=damaged ingin.lz > copy || test_failed $LINENO
+cmp in2.lz copy || test_failed $LINENO
+"${LZIPRECOVER}" --strip=damaged ingint.lz > copy || test_failed $LINENO
+cat "${in_lz}" "${in_lz}" "${inD}" | cmp copy - || test_failed $LINENO
+cat ingin.lz > ingin2.lz || framework_failure
+"${LZIPRECOVER}" --remove=damaged ingin2.lz || test_failed $LINENO
+cmp in2.lz ingin2.lz || test_failed $LINENO
+"${LZIPRECOVER}" --remove=damaged ingint.lz || test_failed $LINENO
+cat "${in_lz}" "${in_lz}" "${inD}" | cmp ingint.lz - || test_failed $LINENO
+rm -f ingin2.lz ingint.lz || framework_failure
+ +# concatenate output from several files +"${LZIPRECOVER}" --dump=damaged "${f6b2_lz}" > copy || test_failed $LINENO +"${LZIPRECOVER}" --dump=damaged "${bad2_lz}" "${f6b2_lz}" > copy2 || + test_failed $LINENO +cat "${bad2_lz}" copy | cmp copy2 - || test_failed $LINENO +cat "${bad2_lz}" in > bad2t.lz || framework_failure +cat "${f6b2_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" --dump=damaged bad2t.lz "${f6b2_lz}" "${bad2_lz}" \ +f6bt.lz > copy4 || test_failed $LINENO +cat "${bad2_lz}" copy "${bad2_lz}" copy | cmp copy4 - || test_failed $LINENO +"${LZIPRECOVER}" --dump=damaged "${f6b2_lz}" bad2t.lz f6bt.lz \ +"${bad2_lz}" > copy4 || test_failed $LINENO +cat copy "${bad2_lz}" copy "${bad2_lz}" | cmp copy4 - || test_failed $LINENO +# +"${LZIPRECOVER}" -q --strip=damaged "${bad2_lz}" "${f6b2_lz}" > copy || + test_failed $LINENO +cmp fox5.lz copy || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged bad2t.lz "${f6b2_lz}" > copy || + test_failed $LINENO +cmp fox5.lz copy || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged "${f6b2_lz}" bad2t.lz f6bt.lz > copy || + test_failed $LINENO +cat fox5.lz fox5.lz in | cmp copy - || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged "${f6b2_lz}" f6bt.lz bad2t.lz > copy || + test_failed $LINENO +cat fox5.lz fox5.lz | cmp copy - || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged f6bt.lz bad2t.lz > copy || + test_failed $LINENO +cmp fox5.lz copy || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=damaged f6bt.lz "${in_lz}" > copy || + test_failed $LINENO +cat fox5.lz "${in_lz}" | cmp copy - || test_failed $LINENO +"${LZIPRECOVER}" --strip=damaged --strip=tdata f6bt.lz "${in_lz}" > copy || + test_failed $LINENO +cat fox5.lz "${in_lz}" | cmp copy - || test_failed $LINENO +# +cat "${f6b2_lz}" > f6b.lz || framework_failure +"${LZIPRECOVER}" -q --remove=damaged f6b.lz bad2t.lz f6bt.lz +[ $? 
= 2 ] || test_failed $LINENO +cat "${bad2_lz}" in | cmp bad2t.lz - || test_failed $LINENO +cmp fox5.lz f6b.lz || test_failed $LINENO +cat fox5.lz in | cmp f6bt.lz - || test_failed $LINENO +cat "${bad2_lz}" in > bad2t.lz || framework_failure +cat "${fox6_lz}" "${inD}" > fox6t.lz || framework_failure +cat "${f6b1_lz}" in > f6abt.lz || framework_failure +cat "${f6b2_lz}" > f6b.lz || framework_failure +cat "${f6b2_lz}" in > f6bt.lz || framework_failure +"${LZIPRECOVER}" -q --remove=d:t fox6t.lz f6abt.lz f6b.lz bad2t.lz f6bt.lz +[ $? = 2 ] || test_failed $LINENO +cat "${bad2_lz}" in | cmp bad2t.lz - || test_failed $LINENO +cat "${f6b1_lz}" in | cmp f6abt.lz - || test_failed $LINENO +cmp "${fox6_lz}" fox6t.lz || test_failed $LINENO +cmp fox5.lz f6b.lz || test_failed $LINENO +cmp fox5.lz f6bt.lz || test_failed $LINENO +rm -f fox6t.lz f6b.lz f6bt.lz bad2t.lz fox5.lz copy2 copy4 || framework_failure + +printf "\ntesting trailing data..." + +cat "${in_lz}" "${inD}" > int.lz || framework_failure +"${LZIPRECOVER}" --dump=tdata int.lz > copy || test_failed $LINENO +cmp "${inD}" copy || test_failed $LINENO +rm -f copy || framework_failure +"${LZIPRECOVER}" --dump=tdat int.lz -o copy || test_failed $LINENO +cmp "${inD}" copy || test_failed $LINENO +cat "${fox6_lz}" "${inD}" > fox6t.lz || framework_failure +cat "${inD}" "${inD}" > inD2 || framework_failure +"${LZIPRECOVER}" --dump=tda int.lz fox6t.lz -f -o copy || test_failed $LINENO +cmp inD2 copy || test_failed $LINENO +rm -f inD2 || framework_failure +cat ingin.lz "${inD}" > ingint.lz || framework_failure +"${LZIPRECOVER}" -q --dump=td ingint.lz > /dev/null +[ $? 
= 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=t ingint.lz > copy || test_failed $LINENO +cmp "${inD}" copy || test_failed $LINENO + +"${LZIPRECOVER}" --strip=tdata int.lz > copy || test_failed $LINENO +cmp "${in_lz}" copy || test_failed $LINENO +rm -f copy || framework_failure +"${LZIPRECOVER}" --strip=tdata int.lz -o copy || test_failed $LINENO +cmp "${in_lz}" copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=tdata fox6t.lz -f -o copy || test_failed $LINENO +cmp "${fox6_lz}" copy || test_failed $LINENO +"${LZIPRECOVER}" --strip=tdata int.lz int.lz -f -o copy || test_failed $LINENO +cmp in2.lz copy || test_failed $LINENO +rm -f in2.lz || framework_failure +"${LZIPRECOVER}" --strip=tdata int.lz fox6t.lz > copy || test_failed $LINENO +cat "${in_lz}" "${fox6_lz}" | cmp copy - || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=tdata ingint.lz > /dev/null +[ $? = 2 ] || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=tdata ingint.lz > copy || test_failed $LINENO +cmp ingin.lz copy || test_failed $LINENO + +"${LZIPRECOVER}" --remove=tdata int.lz fox6t.lz || test_failed $LINENO +cmp "${in_lz}" int.lz || test_failed $LINENO +cmp "${fox6_lz}" fox6t.lz || test_failed $LINENO +"${LZIPRECOVER}" --remove=tdata int.lz || test_failed $LINENO +cmp "${in_lz}" int.lz || test_failed $LINENO +"${LZIPRECOVER}" --remove=tdata fox6t.lz || test_failed $LINENO +cmp "${fox6_lz}" fox6t.lz || test_failed $LINENO +"${LZIPRECOVER}" -q --remove=tdata ingint.lz +[ $? 
= 2 ] || test_failed $LINENO +cmp -s ingin.lz ingint.lz && test_failed $LINENO +"${LZIPRECOVER}" -i --remove=tdata ingint.lz || test_failed $LINENO +cmp ingin.lz ingint.lz || test_failed $LINENO +rm -f int.lz fox6t.lz ingint.lz ingin.lz || framework_failure + +for i in "${f6s3_lz}" "${f6s4_lz}" "${f6s5_lz}" "${f6s6_lz}" ; do + "${LZIPRECOVER}" --strip=tdata "$i" > copy || test_failed $LINENO "$i" + "${LZIPRECOVER}" --dump=tdata "$i" > tdata || test_failed $LINENO "$i" + cmp "${fox6_lz}" copy || test_failed $LINENO "$i" + cat copy tdata | cmp "$i" - || test_failed $LINENO "$i" + cat "$i" "${inD}" > f6t.lz || framework_failure + "${LZIPRECOVER}" --strip=tdata f6t.lz > copy || test_failed $LINENO "$i" + "${LZIPRECOVER}" --dump=tdata f6t.lz > tdata || test_failed $LINENO "$i" + cmp "${fox6_lz}" copy || test_failed $LINENO "$i" + cat copy tdata | cmp f6t.lz - || test_failed $LINENO "$i" + "${LZIPRECOVER}" --remove=tdata f6t.lz || test_failed $LINENO "$i" + cmp "${fox6_lz}" f6t.lz || test_failed $LINENO "$i" + rm -f copy tdata f6t.lz || framework_failure +done + +printf "\ntesting --dump/remove/strip..." + +"${LZIPRECOVER}" -s "${num_lz}" -o num.lz || test_failed $LINENO +[ -e rec9num.lz ] || test_failed $LINENO +[ ! -e rec10num.lz ] || test_failed $LINENO +cat rec*num.lz | cmp "${num_lz}" - || test_failed $LINENO +for i in 1 2 3 4 5 6 7 8 9 ; do + "${LZIPRECOVER}" --dump=$i "${num_lz}" | cmp rec${i}num.lz - || + test_failed $LINENO $i + "${LZIPRECOVER}" --strip=^$i "${num_lz}" | cmp rec${i}num.lz - || + test_failed $LINENO $i + cat "${num_lz}" > num.lz || framework_failure + "${LZIPRECOVER}" --remove=^$i num.lz || test_failed $LINENO $i + cmp rec${i}num.lz num.lz || test_failed $LINENO $i +done +"${LZIPRECOVER}" -q --dump=1 in "${num_lz}" > out +[ $? = 2 ] || test_failed $LINENO +cmp rec1num.lz out || test_failed $LINENO +"${LZIPRECOVER}" -q --strip=^1 in "${num_lz}" > out +[ $? 
= 2 ] || test_failed $LINENO +cmp rec1num.lz out || test_failed $LINENO + +"${LZIPRECOVER}" --dump=r1 "${num_lz}" | cmp rec9num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=d:r3 "${num_lz}" | cmp rec7num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=r5:d "${num_lz}" | cmp rec5num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=t:r9 "${num_lz}" | cmp rec1num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=r^1:t "${num_lz}" | cmp rec9num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=d:r^3:t "${num_lz}" | cmp rec7num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=r^5:d:t "${num_lz}" | cmp rec5num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=d:t:r^9 "${num_lz}" | cmp rec1num.lz - || + test_failed $LINENO + +"${LZIPRECOVER}" --dump=1,5 "${num_lz}" > out || test_failed $LINENO +cat rec1num.lz rec5num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --dump=3,6 "${num_lz}" > out || test_failed $LINENO +cat rec3num.lz rec6num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --dump=2-4 "${num_lz}" > out || test_failed $LINENO +cat rec2num.lz rec3num.lz rec4num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --dump=4,6,8 "${num_lz}" > out || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --strip=^1,5 "${num_lz}" > out || test_failed $LINENO +cat rec1num.lz rec5num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --strip=^3,6 "${num_lz}" > out || test_failed $LINENO +cat rec3num.lz rec6num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --strip=^2-4 "${num_lz}" > out || test_failed $LINENO +cat rec2num.lz rec3num.lz rec4num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --strip=^4,6,8 "${num_lz}" > out || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp out - || test_failed $LINENO + +# create a subset tarlz archive +"${LZIPRECOVER}" --dump=1-2:r1:t "${num_lz}" > out || test_failed $LINENO 
+cat rec1num.lz rec2num.lz rec9num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --dump=4-5:r1:t "${num_lz}" > out || test_failed $LINENO +cat rec4num.lz rec5num.lz rec9num.lz | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" --dump=7-8:r1:t "${num_lz}" > out || test_failed $LINENO +cat rec7num.lz rec8num.lz rec9num.lz | cmp out - || test_failed $LINENO + +"${LZIPRECOVER}" --dump=1-9 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=r1-9 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=1-1000 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=r1-1000 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=1-4:r1-4:5 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --dump=^10 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=^1-9 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=r^1-9 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=^1-1000 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=r^1-1000 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=^1-4:r^1-4:^5 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO +"${LZIPRECOVER}" --strip=10 "${num_lz}" | cmp "${num_lz}" - || + test_failed $LINENO + +"${LZIPRECOVER}" -i --dump=r1 "${nbt_lz}" | cmp rec9num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" -i --dump=r3 "${nbt_lz}" | cmp rec7num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" -i --dump=r7 "${nbt_lz}" | cmp rec4num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" -i --strip=r^1:t "${nbt_lz}" | cmp rec9num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" -i --strip=r^3:t "${nbt_lz}" | cmp rec7num.lz - || + test_failed $LINENO +"${LZIPRECOVER}" -i --strip=r^7:t "${nbt_lz}" | cmp rec4num.lz - || + test_failed $LINENO + 
+"${LZIPRECOVER}" -i --dump=4 -f -o out "${nbt_lz}" || test_failed $LINENO +printf "gap" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=8 "${nbt_lz}" > out || test_failed $LINENO +printf "damaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=tdata "${nbt_lz}" > out || test_failed $LINENO +printf "trailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=4:t "${nbt_lz}" > out || test_failed $LINENO +printf "gaptrailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=4,8:t "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamagedtrailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=4,8 "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=damaged "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --dump=d:t "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamagedtrailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=^4:t -f -o out "${nbt_lz}" || test_failed $LINENO +printf "gap" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=^8:t "${nbt_lz}" > out || test_failed $LINENO +printf "damaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=1-11 "${nbt_lz}" > out || test_failed $LINENO +cmp empty out || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=^4 "${nbt_lz}" > out || test_failed $LINENO +printf "gaptrailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=^4,8 "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamagedtrailing data" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=^4,8:t "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" -i --strip=r^4,8:t "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamaged" | cmp out - || test_failed $LINENO +"${LZIPRECOVER}" 
-i --strip=r^4,8 "${nbt_lz}" > out || test_failed $LINENO +printf "gapdamagedtrailing data" | cmp out - || test_failed $LINENO + +cat "${num_lz}" > num.lz || framework_failure +"${LZIPRECOVER}" --remove=1-3,5,7,9 num.lz || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp num.lz - || test_failed $LINENO +cat "${num_lz}" > num.lz || framework_failure +"${LZIPRECOVER}" --remove=^4,6,8 num.lz || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp num.lz - || test_failed $LINENO +cat "${num_lz}" > num.lz || framework_failure +"${LZIPRECOVER}" --remove=r1,3,5,7-9 num.lz || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp num.lz - || test_failed $LINENO +cat "${num_lz}" > num.lz || framework_failure +"${LZIPRECOVER}" --remove=r^2,4,6 num.lz || test_failed $LINENO +cat rec4num.lz rec6num.lz rec8num.lz | cmp num.lz - || test_failed $LINENO + +cat "${nbt_lz}" > nbt.lz || framework_failure +"${LZIPRECOVER}" -i --remove=4,8:tdata nbt.lz || test_failed $LINENO +cmp "${num_lz}" nbt.lz || test_failed $LINENO +cat "${nbt_lz}" > nbt.lz || framework_failure +"${LZIPRECOVER}" -i --remove=r4,8:tdata nbt.lz || test_failed $LINENO +cmp "${num_lz}" nbt.lz || test_failed $LINENO +cat "${nbt_lz}" > nbt.lz || framework_failure +"${LZIPRECOVER}" --remove=damaged:tdata nbt.lz || test_failed $LINENO +cmp "${num_lz}" nbt.lz || test_failed $LINENO +rm -f rec*num.lz nbt.lz empty || framework_failure + +for i in 1 2 3 4 5 6 7 8 9 10 ; do + "${LZIPRECOVER}" -i --strip=1-$i "${nbt_lz}" > out || + test_failed $LINENO $i + cat "${nbt_lz}" > nbt.lz || framework_failure + "${LZIPRECOVER}" -i --remove=1-$i nbt.lz || test_failed $LINENO $i + cmp nbt.lz out || test_failed $LINENO $i +done +rm -f nbt.lz out || framework_failure + +echo +if [ ${fail} = 0 ] ; then + echo "tests completed successfully." + cd "${objdir}" && rm -r tmp +else + echo "tests failed." 
+fi +exit ${fail} diff --git a/testsuite/fox.lz b/testsuite/fox.lz Binary files differnew file mode 100644 index 0000000..509da82 --- /dev/null +++ b/testsuite/fox.lz diff --git a/testsuite/fox6.lz b/testsuite/fox6.lz Binary files differnew file mode 100644 index 0000000..8401b99 --- /dev/null +++ b/testsuite/fox6.lz diff --git a/testsuite/fox6_bad1.lz b/testsuite/fox6_bad1.lz Binary files differnew file mode 100644 index 0000000..4e0d8fd --- /dev/null +++ b/testsuite/fox6_bad1.lz diff --git a/testsuite/fox6_bad1.txt b/testsuite/fox6_bad1.txt new file mode 100644 index 0000000..14e5367 --- /dev/null +++ b/testsuite/fox6_bad1.txt @@ -0,0 +1,4 @@ +The quick brown fox jumps over the lazy dog. +The quick brown fox jumps over the lazy dog. +The quick brown fox c††zzzzzzzzzzzzzzzzzzzzzzVhe quick brown fox jumps over the lazy dog. +The quick brown fox jumps over the lazy dog. diff --git a/testsuite/fox6_bad2.lz b/testsuite/fox6_bad2.lz Binary files differnew file mode 100644 index 0000000..bf8a04a --- /dev/null +++ b/testsuite/fox6_bad2.lz diff --git a/testsuite/fox6_bad3.lz b/testsuite/fox6_bad3.lz Binary files differnew file mode 100644 index 0000000..2d3cff2 --- /dev/null +++ b/testsuite/fox6_bad3.lz diff --git a/testsuite/fox6_bad4.lz b/testsuite/fox6_bad4.lz Binary files differnew file mode 100644 index 0000000..e931d7d --- /dev/null +++ b/testsuite/fox6_bad4.lz diff --git a/testsuite/fox6_bad5.lz b/testsuite/fox6_bad5.lz Binary files differnew file mode 100644 index 0000000..95f44f3 --- /dev/null +++ b/testsuite/fox6_bad5.lz diff --git a/testsuite/fox6_bad6.lz b/testsuite/fox6_bad6.lz Binary files differnew file mode 100644 index 0000000..085b2fd --- /dev/null +++ b/testsuite/fox6_bad6.lz diff --git a/testsuite/fox6_sc1.lz b/testsuite/fox6_sc1.lz Binary files differnew file mode 100644 index 0000000..278f8a8 --- /dev/null +++ b/testsuite/fox6_sc1.lz diff --git a/testsuite/fox6_sc2.lz b/testsuite/fox6_sc2.lz Binary files differnew file mode 100644 index 
0000000..dc17461 --- /dev/null +++ b/testsuite/fox6_sc2.lz diff --git a/testsuite/fox6_sc3.lz b/testsuite/fox6_sc3.lz Binary files differnew file mode 100644 index 0000000..a602938 --- /dev/null +++ b/testsuite/fox6_sc3.lz diff --git a/testsuite/fox6_sc4.lz b/testsuite/fox6_sc4.lz Binary files differnew file mode 100644 index 0000000..d1a77f7 --- /dev/null +++ b/testsuite/fox6_sc4.lz diff --git a/testsuite/fox6_sc5.lz b/testsuite/fox6_sc5.lz Binary files differnew file mode 100644 index 0000000..35453c6 --- /dev/null +++ b/testsuite/fox6_sc5.lz diff --git a/testsuite/fox6_sc6.lz b/testsuite/fox6_sc6.lz Binary files differnew file mode 100644 index 0000000..c1fad92 --- /dev/null +++ b/testsuite/fox6_sc6.lz diff --git a/testsuite/fox_bcrc.lz b/testsuite/fox_bcrc.lz Binary files differnew file mode 100644 index 0000000..8f6a7c4 --- /dev/null +++ b/testsuite/fox_bcrc.lz diff --git a/testsuite/fox_crc0.lz b/testsuite/fox_crc0.lz Binary files differnew file mode 100644 index 0000000..1abe926 --- /dev/null +++ b/testsuite/fox_crc0.lz diff --git a/testsuite/fox_das46.lz b/testsuite/fox_das46.lz Binary files differnew file mode 100644 index 0000000..43ed9f9 --- /dev/null +++ b/testsuite/fox_das46.lz diff --git a/testsuite/fox_de20.lz b/testsuite/fox_de20.lz Binary files differnew file mode 100644 index 0000000..10949d8 --- /dev/null +++ b/testsuite/fox_de20.lz diff --git a/testsuite/fox_mes81.lz b/testsuite/fox_mes81.lz Binary files differnew file mode 100644 index 0000000..d50ef2e --- /dev/null +++ b/testsuite/fox_mes81.lz diff --git a/testsuite/fox_s11.lz b/testsuite/fox_s11.lz Binary files differnew file mode 100644 index 0000000..dca909c --- /dev/null +++ b/testsuite/fox_s11.lz diff --git a/testsuite/fox_v2.lz b/testsuite/fox_v2.lz Binary files differnew file mode 100644 index 0000000..8620981 --- /dev/null +++ b/testsuite/fox_v2.lz diff --git a/testsuite/numbers.lz b/testsuite/numbers.lz Binary files differnew file mode 100644 index 0000000..57460bc --- /dev/null +++ 
b/testsuite/numbers.lz diff --git a/testsuite/numbersbt.lz b/testsuite/numbersbt.lz Binary files differnew file mode 100644 index 0000000..019e54d --- /dev/null +++ b/testsuite/numbersbt.lz diff --git a/testsuite/test.txt b/testsuite/test.txt new file mode 100644 index 0000000..9196a3a --- /dev/null +++ b/testsuite/test.txt @@ -0,0 +1,676 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) 
Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) <year> <name of author> + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. 
If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. + GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) <year> <name of author>
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/testsuite/test.txt.lz b/testsuite/test.txt.lz Binary files differ new file mode 100644 index 0000000..22cea6e --- /dev/null +++ b/testsuite/test.txt.lz diff --git a/testsuite/test.txt.lzma b/testsuite/test.txt.lzma Binary files differ new file mode 100644 index 0000000..53e54ea --- /dev/null +++ b/testsuite/test.txt.lzma diff --git a/testsuite/test21723.txt b/testsuite/test21723.txt new file mode 100644 index 0000000..7194547 --- /dev/null +++ b/testsuite/test21723.txt @@ -0,0 +1,7 @@ +Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
diff --git a/testsuite/test_bad1.lz b/testsuite/test_bad1.lz Binary files differ new file mode 100644 index 0000000..2129c90 --- /dev/null +++ b/testsuite/test_bad1.lz diff --git a/testsuite/test_bad2.lz b/testsuite/test_bad2.lz Binary files differ new file mode 100644 index 0000000..e013c34 --- /dev/null +++ b/testsuite/test_bad2.lz diff --git a/testsuite/test_bad3.lz b/testsuite/test_bad3.lz Binary files differ new file mode 100644 index 0000000..0ae9e7d --- /dev/null +++ b/testsuite/test_bad3.lz diff --git a/testsuite/test_bad4.lz b/testsuite/test_bad4.lz Binary files differ new file mode 100644 index 0000000..ddb0d6b --- /dev/null +++ b/testsuite/test_bad4.lz diff --git a/testsuite/test_bad5.lz b/testsuite/test_bad5.lz Binary files differ new file mode 100644 index 0000000..6fab91c --- /dev/null +++ b/testsuite/test_bad5.lz diff --git a/testsuite/test_bad6.lz b/testsuite/test_bad6.lz Binary files differ new file mode 100644 index 0000000..cfea88c --- /dev/null +++ b/testsuite/test_bad6.lz diff --git a/testsuite/test_bad6.txt b/testsuite/test_bad6.txt new file mode 100644 index 0000000..b47462e --- /dev/null +++ b/testsuite/test_bad6.txt @@ -0,0 +1,26 @@ +) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to
\ No newline at end of file diff --git a/testsuite/test_bad7.lz b/testsuite/test_bad7.lz Binary files differ new file mode 100644 index 0000000..77f2b85 --- /dev/null +++ b/testsuite/test_bad7.lz diff --git a/testsuite/test_bad7.txt b/testsuite/test_bad7.txt new file mode 100644 index 0000000..be54c7c --- /dev/null +++ b/testsuite/test_bad7.txt @@ -0,0 +1,215 @@ +, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
\ No newline at end of file diff --git a/testsuite/test_bad8.lz b/testsuite/test_bad8.lz Binary files differnew file mode 100644 index 0000000..fca701b --- /dev/null +++ b/testsuite/test_bad8.lz diff --git a/testsuite/test_bad8.txt b/testsuite/test_bad8.txt new file mode 100644 index 0000000..3cb3ff4 --- /dev/null +++ b/testsuite/test_bad8.txt @@ -0,0 +1,3 @@ +1 + + Copyright (C) 1989
\ No newline at end of file diff --git a/testsuite/test_bad9.lz b/testsuite/test_bad9.lz Binary files differnew file mode 100644 index 0000000..becb0ec --- /dev/null +++ b/testsuite/test_bad9.lz diff --git a/testsuite/test_bad9.txt b/testsuite/test_bad9.txt new file mode 100644 index 0000000..b72a626 --- /dev/null +++ b/testsuite/test_bad9.txt @@ -0,0 +1,5 @@ +General +Public License instead of this License. + GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
diff --git a/testsuite/test_em.txt.lz b/testsuite/test_em.txt.lz Binary files differnew file mode 100644 index 0000000..7e96250 --- /dev/null +++ b/testsuite/test_em.txt.lz diff --git a/unzcrash.cc b/unzcrash.cc new file mode 100644 index 0000000..0c92af8 --- /dev/null +++ b/unzcrash.cc @@ -0,0 +1,645 @@ +/* Unzcrash - Tests robustness of decompressors to corrupted data. + Inspired by unzcrash.c from Julian Seward's bzip2. + Copyright (C) 2008-2022 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file, 3 for an internal consistency error + (e.g., bug) which caused unzcrash to panic. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cerrno> +#include <climits> // SSIZE_MAX +#include <csignal> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <stdint.h> // SIZE_MAX +#include <unistd.h> +#include <sys/wait.h> + +#include "arg_parser.h" +#include "common.h" + +#if CHAR_BIT != 8 +#error "Environments where CHAR_BIT != 8 are not supported." +#endif + +#if ( defined SIZE_MAX && SIZE_MAX < ULONG_MAX ) || \ + ( defined SSIZE_MAX && SSIZE_MAX < LONG_MAX ) +#error "Environments where 'size_t' is narrower than 'long' are not supported." 
+#endif + +namespace { + +const char * const program_name = "unzcrash"; +const char * invocation_name = program_name; // default value + +int verbosity = 0; + + +void show_help() + { + std::printf( "Unzcrash tests the robustness of decompressors to corrupted data.\n" + "\nBy default, unzcrash reads the file specified and then repeatedly\n" + "decompresses it, increasing 256 times each byte of the compressed data, so\n" + "as to test all possible one-byte errors. Note that it may take years or even\n" + "centuries to test all possible one-byte errors in a large file (tens of MB).\n" + "\nIf the option '--block' is given, unzcrash reads the file specified and\n" + "then repeatedly decompresses it, setting all bytes in each successive block\n" + "to the value given, so as to test all possible full sector errors.\n" + "\nIf the option '--truncate' is given, unzcrash reads the file specified\n" + "and then repeatedly decompresses it, truncating the file to increasing\n" + "lengths, so as to test all possible truncation points.\n" + "\nNone of the three test modes described above should cause any invalid memory\n" + "accesses. If any of them does, please, report it as a bug to the maintainers\n" + "of the decompressor being tested.\n" + "\nIf the decompressor returns with zero status, unzcrash compares the output\n" + "of the decompressor for the original and corrupt files. If the outputs\n" + "differ, it means that the decompressor returned a false negative; it failed\n" + "to recognize the corruption and produced garbage output. The only exception\n" + "is when a multimember file is truncated just after the last byte of a\n" + "member, producing a shorter but valid compressed file. Except in this latter\n" + "case, please, report any false negative as a bug.\n" + "\nIn order to compare the outputs, unzcrash needs a 'zcmp' program able to\n" + "understand the format being tested. 
For example the zcmp provided by zutils.\n" + "Use '--zcmp=false' to disable comparisons.\n" + "\nUsage: %s [options] 'lzip -t' file.lz\n", invocation_name ); + std::printf( "\nOptions:\n" + " -h, --help display this help and exit\n" + " -V, --version output version information and exit\n" + " -b, --bits=<range> test N-bit errors instead of full byte\n" + " -B, --block[=<size>][,<val>] test blocks of given size [512,0]\n" + " -d, --delta=<n> test one byte/block/truncation every n bytes\n" + " -e, --set-byte=<pos>,<val> set byte at position <pos> to value <val>\n" + " -n, --no-verify skip initial verification of file.lz\n" + " -p, --position=<bytes> first byte position to test [default 0]\n" + " -q, --quiet suppress all messages\n" + " -s, --size=<bytes> number of byte positions to test [all]\n" + " -t, --truncate test decompression of truncated file\n" + " -v, --verbose be verbose (a 2nd -v gives more)\n" + " -z, --zcmp=<command> set zcmp command name and options [zcmp]\n" + "Examples of <range>: 1 1,2,3 1-4 1,3-5,8 1-3,5-8\n" + "A negative position is relative to the end of file.\n" + "A negative size is relative to the rest of the file.\n" + "\nExit status: 0 for a normal exit, 1 for environmental problems (file\n" + "not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or\n" + "invalid input file, 3 for an internal consistency error (e.g., bug) which\n" + "caused unzcrash to panic.\n" + "\nReport bugs to lzip-bug@nongnu.org\n" + "Lziprecover home page: http://www.nongnu.org/lzip/lziprecover.html\n" ); + } + +} // end namespace + +#include "main_common.cc" + +namespace { + +void parse_block( const char * const arg, const char * const option_name, + long & size, uint8_t & value ) + { + const char * tail = arg; + + if( tail[0] != ',' ) + size = getnum( arg, option_name, 0, 1, INT_MAX, &tail ); + if( tail[0] == ',' ) + value = getnum( tail + 1, option_name, 0, 0, 255 ); + else if( tail[0] ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Bad 
separator between <size> and <value> in " + "argument of option '%s'.\n", program_name, option_name ); + std::exit( 1 ); + } + } + + +/* Return the address of a malloc'd buffer containing the file data and + the file size in '*size'. + In case of error, return 0 and do not modify '*size'. +*/ +uint8_t * read_file( const char * const name, long * const size ) + { + FILE * const f = std::fopen( name, "rb" ); + if( !f ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't open input file '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return 0; + } + + long buffer_size = 1 << 20; + uint8_t * buffer = (uint8_t *)std::malloc( buffer_size ); + if( !buffer ) { show_error( mem_msg ); return 0; } + long file_size = std::fread( buffer, 1, buffer_size, f ); + while( file_size >= buffer_size ) + { + if( buffer_size >= LONG_MAX ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Input file '%s' is too large.\n", + program_name, name ); + std::free( buffer ); return 0; + } + buffer_size = ( buffer_size <= LONG_MAX / 2 ) ? 
2 * buffer_size : LONG_MAX; + uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); + if( !tmp ) { show_error( mem_msg ); std::free( buffer ); return 0; } + buffer = tmp; + file_size += std::fread( buffer + file_size, 1, buffer_size - file_size, f ); + } + if( std::ferror( f ) || !std::feof( f ) ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error reading file '%s': %s\n", + program_name, name, std::strerror( errno ) ); + std::free( buffer ); return 0; + } + std::fclose( f ); + *size = file_size; + return buffer; + } + + +class Bitset8 // 8 value bitset (1 to 8) + { + bool data[8]; + static bool valid_digit( const unsigned char ch ) + { return ( ch >= '1' && ch <= '8' ); } + +public: + Bitset8() { for( int i = 0; i < 8; ++i ) data[i] = true; } + + bool includes( const int i ) const + { return ( i >= 1 && i <= 8 && data[i-1] ); } + + // Recognized formats: 1 1,2,3 1-4 1,3-5,8 1-3,5-8 + bool parse_bs( const char * p ) + { + for( int i = 0; i < 8; ++i ) data[i] = false; + while( true ) + { + const unsigned char ch1 = *p++; + if( !valid_digit( ch1 ) ) break; + if( *p != '-' ) data[ch1-'1'] = true; + else + { + ++p; + if( !valid_digit( *p ) || ch1 > *p ) break; + for( int c = ch1; c <= *p; ++c ) data[c-'1'] = true; + ++p; + } + if( *p == 0 ) return true; + if( *p == ',' ) ++p; else break; + } + show_error( "Invalid value or range." 
); + return false; + } + + // number of N-bit errors per byte (N=0 to 8): 1 8 28 56 70 56 28 8 1 + void print() const + { + std::fflush( stderr ); + int c = 0; + for( int i = 0; i < 8; ++i ) if( data[i] ) ++c; + if( c == 8 ) std::fputs( "Testing full byte.\n", stdout ); + else if( c == 0 ) std::fputs( "Nothing to test.\n", stdout ); + else + { + std::fputs( "Testing ", stdout ); + for( int i = 0; i < 8; ++i ) + if( data[i] ) + { + std::printf( "%d", i + 1 ); + if( --c ) std::fputc( ',', stdout ); + } + std::fputs( " bit errors.\n", stdout ); + } + std::fflush( stdout ); + } + }; + + +int differing_bits( const uint8_t byte1, const uint8_t byte2 ) + { + int count = 0; + uint8_t dif = byte1 ^ byte2; + while( dif ) + { count += ( dif & 1 ); dif >>= 1; } + return count; + } + + +/* Return the number of bytes really written. + If (value returned < size), it is always an error. +*/ +long writeblock( const int fd, const uint8_t * const buf, const long size ) + { + long sz = 0; + errno = 0; + while( sz < size ) + { + const long n = write( fd, buf + sz, size - sz ); + if( n > 0 ) sz += n; + else if( n < 0 && errno != EINTR ) break; + errno = 0; + } + return sz; + } + + +void show_exec_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't exec '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +void show_fork_error( const char * const prog_name ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't fork '%s': %s\n", + program_name, prog_name, std::strerror( errno ) ); + } + + +int wait_for_child( const pid_t pid, const char * const name ) + { + int status; + while( waitpid( pid, &status, 0 ) == -1 ) + { + if( errno != EINTR ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error waiting termination of '%s': %s\n", + program_name, name, std::strerror( errno ) ); + return -1; + } + } + if( WIFEXITED( status ) ) + { const int ret = WEXITSTATUS( status ); if( ret != 255 ) return ret; } + return 
-1; + } + + +bool word_split( const char * const command, std::vector< std::string > & args ) + { + const unsigned long old_size = args.size(); + for( const char * p = command; *p; ) + { + while( *p && std::isspace( *p ) ) ++p; // strip leading space + if( !*p ) break; + if( *p == '\'' || *p == '"' ) // quoted name + { + const char quote = *p; + const char * const begin = ++p; // skip leading quote + while( *p && *p != quote ) ++p; + if( !*p || begin == p ) return false; // umbalanced or empty + args.push_back( std::string( begin, p - begin ) ); + ++p; continue; // skip trailing quote + } + const char * const begin = p++; + while( *p && !std::isspace( *p ) ) ++p; + args.push_back( std::string( begin, p - begin ) ); + } + return args.size() > old_size; + } + + +// return -1 if fatal error, 0 if OK, >0 if error +int fork_and_feed( const uint8_t * const buffer, const long buffer_size, + const char ** const argv, const bool verify = false ) + { + int fda[2]; // pipe to child + if( pipe( fda ) < 0 ) + { show_error( "Can't create pipe", errno ); return -1; } + + const pid_t pid = vfork(); + if( pid < 0 ) // parent + { show_fork_error( argv[0] ); return -1; } + else if( pid > 0 ) // parent (feed data to child) + { + if( close( fda[0] ) != 0 ) + { show_error( "Error closing unused pipe", errno ); return -1; } + if( writeblock( fda[1], buffer, buffer_size ) != buffer_size && verify ) + { show_error( "Can't write to child process", errno ); return -1; } + if( close( fda[1] ) != 0 ) + { show_error( "Error closing pipe", errno ); return -1; } + } + else if( pid == 0 ) // child + { + if( dup2( fda[0], STDIN_FILENO ) >= 0 && + close( fda[0] ) == 0 && close( fda[1] ) == 0 ) + execvp( argv[0], (char **)argv ); + show_exec_error( argv[0] ); + _exit( 255 ); // 255 means fatal error in wait_for_child + } + + return wait_for_child( pid, argv[0] ); + } + +} // end namespace + + +int main( const int argc, const char * const argv[] ) + { + enum Mode { m_block, m_byte, m_truncate }; + 
const char * mode_str[3] = { "block", "byte", "size" }; + Bitset8 bits; // if Bitset8::parse_bs not called test full byte + Bad_byte bad_byte; + const char * zcmp_program = "zcmp"; + long pos = 0; + long max_size = LONG_MAX; + long delta = 0; // to be set later + long block_size = 512; + Mode program_mode = m_byte; + uint8_t block_value = 0; + bool verify = true; + if( argc > 0 ) invocation_name = argv[0]; + + const Arg_parser::Option options[] = + { + { 'h', "help", Arg_parser::no }, + { 'b', "bits", Arg_parser::yes }, + { 'B', "block", Arg_parser::maybe }, + { 'd', "delta", Arg_parser::yes }, + { 'e', "set-byte", Arg_parser::yes }, + { 'n', "no-verify", Arg_parser::no }, + { 'p', "position", Arg_parser::yes }, + { 'q', "quiet", Arg_parser::no }, + { 's', "size", Arg_parser::yes }, + { 't', "truncate", Arg_parser::no }, + { 'v', "verbose", Arg_parser::no }, + { 'V', "version", Arg_parser::no }, + { 'z', "zcmp", Arg_parser::yes }, + { 0 , 0, Arg_parser::no } }; + + const Arg_parser parser( argc, argv, options ); + if( parser.error().size() ) // bad option + { show_error( parser.error().c_str(), 0, true ); return 1; } + + int argind = 0; + for( ; argind < parser.arguments(); ++argind ) + { + const int code = parser.code( argind ); + if( !code ) break; // no more options + const char * const pn = parser.parsed_name( argind ).c_str(); + const char * const arg = parser.argument( argind ).c_str(); + switch( code ) + { + case 'h': show_help(); return 0; + case 'b': if( !bits.parse_bs( arg ) ) return 1; program_mode = m_byte; break; + case 'B': if( arg[0] ) parse_block( arg, pn, block_size, block_value ); + program_mode = m_block; break; + case 'd': delta = getnum( arg, pn, block_size, 1, INT_MAX ); break; + case 'e': bad_byte.parse_bb( arg, pn ); break; + case 'n': verify = false; break; + case 'p': pos = getnum( arg, pn, block_size, -LONG_MAX, LONG_MAX ); break; + case 'q': verbosity = -1; break; + case 's': max_size = getnum( arg, pn, block_size, -LONG_MAX, LONG_MAX ); 
break; + case 't': program_mode = m_truncate; break; + case 'v': if( verbosity < 4 ) ++verbosity; break; + case 'V': show_version(); return 0; + case 'z': zcmp_program = arg; break; + default : internal_error( "uncaught option." ); + } + } // end process options + + if( parser.arguments() - argind != 2 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "Usage: %s 'lzip -t' file.lz\n", invocation_name ); + return 1; + } + + if( delta <= 0 ) delta = ( program_mode == m_block ) ? block_size : 1; + + const char * const command = parser.argument( argind ).c_str(); + std::vector< std::string > command_args; + if( !word_split( command, command_args ) ) + { show_file_error( command, "Invalid command" ); return 1; } + const char ** const command_argv = new const char *[command_args.size()+1]; + for( unsigned i = 0; i < command_args.size(); ++i ) + command_argv[i] = command_args[i].c_str(); + command_argv[command_args.size()] = 0; + + const char * const filename = parser.argument( argind + 1 ).c_str(); + long file_size = 0; + uint8_t * const buffer = read_file( filename, &file_size ); + if( !buffer ) return 1; + std::string zcmp_command; + std::vector< std::string > zcmp_args; + const char ** zcmp_argv = 0; + if( std::strcmp( zcmp_program, "false" ) != 0 ) + { + zcmp_command = zcmp_program; + zcmp_command += " '"; zcmp_command += filename; zcmp_command += "' -"; + if( !word_split( zcmp_command.c_str(), zcmp_args ) ) + { show_file_error( zcmp_command.c_str(), "Invalid zcmp command" ); + return 1; } + zcmp_argv = new const char *[zcmp_args.size()+1]; + for( unsigned i = 0; i < zcmp_args.size(); ++i ) + zcmp_argv[i] = zcmp_args[i].c_str(); + zcmp_argv[zcmp_args.size()] = 0; + } + + // verify original file + if( verbosity >= 1 ) fprintf( stderr, "Testing file '%s'\n", filename ); + if( verify ) + { + const int ret = fork_and_feed( buffer, file_size, command_argv, true ); + if( ret != 0 ) + { + if( verbosity >= 0 ) + { + if( ret < 0 ) + std::fprintf( stderr, "%s: Can't run 
'%s'.\n", program_name, command ); + else + std::fprintf( stderr, "%s: \"%s\" failed (%d).\n", + program_name, command, ret ); + } + return 1; + } + if( zcmp_command.size() ) + { + const int ret = fork_and_feed( buffer, file_size, zcmp_argv, true ); + if( ret != 0 ) + { + if( verbosity >= 0 ) + { + if( ret < 0 ) + std::fprintf( stderr, "%s: Can't run '%s'.\n", + program_name, zcmp_command.c_str() ); + else + std::fprintf( stderr, "%s: \"%s\" failed (%d). Disabling comparisons.\n", + program_name, zcmp_command.c_str(), ret ); + } + if( ret < 0 ) return 1; + zcmp_command.clear(); + } + } + } + + std::signal( SIGPIPE, SIG_IGN ); + + if( pos < 0 ) pos = std::max( 0L, file_size + pos ); + if( pos >= file_size || max_size == 0 || + ( max_size < 0 && -max_size >= file_size - pos ) ) + { show_error( "Nothing to do; domain is empty." ); return 0; } + if( max_size < 0 ) max_size += file_size - pos; + const long end = ( ( max_size < file_size - pos ) ? pos + max_size : file_size ); + if( bad_byte.pos >= file_size ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Position is beyond end of file " + "in option '%s'.\n", program_name, bad_byte.option_name ); + return 1; + } + if( bad_byte.pos >= 0 ) + buffer[bad_byte.pos] = bad_byte( buffer[bad_byte.pos] ); + long positions = 0, decompressions = 0, successes = 0, failed_comparisons = 0; + if( program_mode == m_truncate ) + for( long i = pos; i < end; i += std::min( delta, end - i ) ) + { + if( verbosity >= 1 ) std::fprintf( stderr, "length %ld\n", i ); + ++positions; ++decompressions; + const int ret = fork_and_feed( buffer, i, command_argv ); + if( ret < 0 ) return 1; + if( ret == 0 ) + { + ++successes; + if( verbosity >= 0 ) + std::fprintf( stderr, "length %ld passed the test\n", i ); + if( zcmp_command.size() ) + { + const int ret = fork_and_feed( buffer, i, zcmp_argv ); + if( ret < 0 ) return 1; + if( ret > 0 ) + { + ++failed_comparisons; + if( verbosity >= 0 ) + std::fprintf( stderr, "length %ld comparison failed\n", 
i ); + } + } + } + } + else if( program_mode == m_block ) + { + uint8_t * block = (uint8_t *)std::malloc( block_size ); + if( !block ) { show_error( mem_msg ); return 1; } + for( long i = pos; i < end; i += std::min( delta, end - i ) ) + { + const long size = std::min( block_size, file_size - i ); + if( verbosity >= 1 ) std::fprintf( stderr, "block %ld,%ld\n", i, size ); + ++positions; ++decompressions; + std::memcpy( block, buffer + i, size ); + std::memset( buffer + i, block_value, size ); + const int ret = fork_and_feed( buffer, file_size, command_argv ); + if( ret < 0 ) return 1; + if( ret == 0 ) + { + ++successes; + if( verbosity >= 0 ) + std::fprintf( stderr, "block %ld,%ld passed the test\n", i, size ); + if( zcmp_command.size() ) + { + const int ret = fork_and_feed( buffer, file_size, zcmp_argv ); + if( ret < 0 ) return 1; + if( ret > 0 ) + { + ++failed_comparisons; + if( verbosity >= 0 ) + std::fprintf( stderr, "block %ld,%ld comparison failed\n", i, size ); + } + } + } + std::memcpy( buffer + i, block, size ); + } + std::free( block ); + } + else + { + if( verbosity >= 1 ) bits.print(); + for( long i = pos; i < end; i += std::min( delta, end - i ) ) + { + if( verbosity >= 1 ) std::fprintf( stderr, "byte %ld\n", i ); + ++positions; + const uint8_t byte = buffer[i]; + for( int j = 1; j < 256; ++j ) + { + ++buffer[i]; + if( bits.includes( differing_bits( byte, buffer[i] ) ) ) + { + ++decompressions; + if( verbosity >= 2 ) + std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", + buffer[i], byte, j ); + const int ret = fork_and_feed( buffer, file_size, command_argv ); + if( ret < 0 ) return 1; + if( ret == 0 ) + { + ++successes; + if( verbosity >= 0 ) + { if( verbosity < 2 ) // else already printed above + std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) ", + buffer[i], byte, j ); + std::fprintf( stderr, "byte %ld passed the test\n", i ); } + if( zcmp_command.size() ) + { + const int ret = fork_and_feed( buffer, file_size, zcmp_argv ); + if( ret < 0 ) return 1; + if( 
ret > 0 ) + { + ++failed_comparisons; + if( verbosity >= 0 ) + std::fprintf( stderr, "byte %ld comparison failed\n", i ); + } + } + } + } + } + buffer[i] = byte; + } + } + + if( verbosity >= 0 ) + { + std::fprintf( stderr, "\n%8ld %ss tested\n%8ld total decompressions" + "\n%8ld decompressions returned with zero status", + positions, mode_str[program_mode], decompressions, successes ); + if( successes > 0 ) + { + if( zcmp_command.empty() ) + std::fputs( "\n comparisons disabled\n", stderr ); + else if( failed_comparisons > 0 ) + std::fprintf( stderr, ", of which\n%8ld comparisons failed\n", + failed_comparisons ); + else std::fputs( "\n all comparisons passed\n", stderr ); + } + else std::fputc( '\n', stderr ); + } + + std::free( buffer ); + return 0; + } |