From 5ec6074f0633939fd17d94111d10c6c6b062978c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 11:49:36 +0200 Subject: Adding upstream version 1:2.30.2. Signed-off-by: Daniel Baumann --- .../howto/recover-corrupted-object-harder.txt | 479 +++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 Documentation/howto/recover-corrupted-object-harder.txt (limited to 'Documentation/howto/recover-corrupted-object-harder.txt') diff --git a/Documentation/howto/recover-corrupted-object-harder.txt b/Documentation/howto/recover-corrupted-object-harder.txt new file mode 100644 index 0000000..8994e25 --- /dev/null +++ b/Documentation/howto/recover-corrupted-object-harder.txt @@ -0,0 +1,479 @@ +Date: Wed, 16 Oct 2013 04:34:01 -0400 +From: Jeff King +Subject: pack corruption post-mortem +Abstract: Recovering a corrupted object when no good copy is available. +Content-type: text/asciidoc + +How to recover an object from scratch +===================================== + +I was recently presented with a repository with a corrupted packfile, +and was asked if the data was recoverable. This post-mortem describes +the steps I took to investigate and fix the problem. I thought others +might find the process interesting, and it might help somebody in the +same situation. + +******************************** +Note: In this case, no good copy of the repository was available. For +the much easier case where you can get the corrupted object from +elsewhere, see link:recover-corrupted-blob-object.html[this howto]. 
+******************************** + +I started with an fsck, which found a problem with exactly one object +(I've used $pack and $obj below to keep the output readable, and also +because I'll refer to them later): + +----------- + $ git fsck + error: $pack SHA1 checksum mismatch + error: index CRC mismatch for object $obj from $pack at offset 51653873 + error: inflate: data stream error (incorrect data check) + error: cannot unpack $obj from $pack at offset 51653873 +----------- + +The pack checksum failing means a byte is munged somewhere, and it is +presumably in the object mentioned (since both the index checksum and +zlib were failing). + +Reading the zlib source code, I found that "incorrect data check" means +that the adler-32 checksum at the end of the zlib data did not match the +inflated data. So stepping the data through zlib would not help, as it +did not fail until the very end, when we realize the checksum does not match. +The problematic bytes could be anywhere in the object data. + +The first thing I did was pull the broken data out of the packfile. I +needed to know how big the object was, which I found out with: + +------------ + $ git show-index <$idx | cut -d' ' -f1 | sort -n | grep -A1 51653873 + 51653873 + 51664736 +------------ + +Show-index gives us the list of objects and their offsets. We throw away +everything but the offsets, and then sort them so that our interesting +offset (which we got from the fsck output above) is followed immediately +by the offset of the next object. Now we know that the object data is +10863 bytes long, and we can grab it with: + +------------ + dd if=$pack of=object bs=1 skip=51653873 count=10863 +------------ + +I inspected a hexdump of the data, looking for any obvious bogosity +(e.g., a 4K run of zeroes would be a good sign of filesystem +corruption). But everything looked pretty reasonable.
+ +Note that the "object" file isn't fit for feeding straight to zlib; it +has the git packed object header, which is variable-length. We want to +strip that off so we can start playing with the zlib data directly. You +can either work your way through it manually (the format is described in +link:../technical/pack-format.html[Documentation/technical/pack-format.txt]), +or you can walk through it in a debugger. I did the latter, creating a +valid pack like: + +------------ + # pack magic and version + printf 'PACK\0\0\0\2' >tmp.pack + # pack has one object + printf '\0\0\0\1' >>tmp.pack + # now add our object data + cat object >>tmp.pack + # and then append the pack trailer + /path/to/git.git/t/helper/test-tool sha1 -b <tmp.pack >trailer + cat trailer >>tmp.pack +------------ + +and then running "git index-pack tmp.pack" in the debugger (stop at +unpack_raw_entry). Doing this, I found that there were 3 bytes of header +(and the header itself had a sane type and size). So I stripped those +off with: + +------------ + dd if=object of=zlib bs=1 skip=3 +------------ + +I ran the result through zlib's inflate using a custom C program. And +while it did report the error, I did get the right number of output +bytes (i.e., it matched git's size header that we decoded above). But +feeding the result back to "git hash-object" didn't produce the same +sha1. So there were some wrong bytes, but I didn't know which. The file +happened to be C source code, so I hoped I could notice something +obviously wrong with it, but I didn't. I even got it to compile! + +I also tried comparing it to other versions of the same path in the +repository, hoping that there would be some part of the diff that didn't +make sense. Unfortunately, this happened to be the only revision of this +particular file in the repository, so I had nothing to compare against. + +So I took a different approach.
Working under the guess that the +corruption was limited to a single byte, I wrote a program to munge each +byte individually, and try inflating the result. Since the object was +only 10K compressed, that worked out to about 2.5M attempts, which took +a few minutes. + +The program I used is here: + +---------------------------------------------- +#include +#include +#include +#include +#include /* NOTE(review): the header names are missing from these five #include lines; the listing uses facilities from stdio.h, unistd.h, string.h, signal.h and zlib.h */ +/* Inflate the len bytes at buf into a scratch buffer and return nonzero when inflate() yields a non-negative code (zlib error codes are negative); the inflated output itself is discarded. */ +static int try_zlib(unsigned char *buf, int len) +{ + /* make this absurdly large so we don't have to loop */ + static unsigned char out[1024*1024]; + z_stream z; + int ret; + + memset(&z, 0, sizeof(z)); + inflateInit(&z); + + z.next_in = buf; + z.avail_in = len; + z.next_out = out; + z.avail_out = sizeof(out); + + ret = inflate(&z, 0); + inflateEnd(&z); + return ret >= 0; +} + +/* eye candy: counter is the number of attempts so far; progress() is installed as the SIGALRM handler, redraws the count on stderr and re-arms the one-second alarm */ +static int counter = 0; +static void progress(int sig) +{ + fprintf(stderr, "\r%d", counter); + alarm(1); +} +/* Read the zlib stream from stdin, then for each byte offset try all 256 values and print every (offset, value) pair that try_zlib() accepts. The original byte is put back before moving to the next offset, so at most one byte differs from the input per attempt. */ +int main(void) +{ + /* oversized so we can read the whole buffer in */ + unsigned char buf[1024*1024]; + int len; + unsigned i, j; + + signal(SIGALRM, progress); + alarm(1); + /* NOTE(review): a single read() whose result is not checked; on error len is -1, and the comparison against the unsigned i below would convert it to a very large value */ + len = read(0, buf, sizeof(buf)); + for (i = 0; i < len; i++) { + unsigned char c = buf[i]; + for (j = 0; j <= 0xff; j++) { + buf[i] = j; + + counter++; + if (try_zlib(buf, len)) + printf("i=%d, j=%x\n", i, j); + } + buf[i] = c; + } + + alarm(0); + fprintf(stderr, "\n"); + return 0; +} +---------------------------------------------- + +I compiled and ran with: + +------- + gcc -Wall -Werror -O3 munge.c -o munge -lz + ./munge +#include +#include +#include +/* Read the compressed stream from stdin into buf and inflate it into out; both are fixed 25MB static buffers. */ +int main(int argc, char **argv) +{ + /* + * oversized so we can read the whole buffer in; + * this could actually be switched to streaming + * to avoid any memory limitations + */ + static unsigned char buf[25 * 1024 * 1024]; + static unsigned char out[25 * 1024 * 1024]; + int len; + z_stream z; + int ret; + + len = read(0, buf, sizeof(buf)); + memset(&z, 0, sizeof(z)); + inflateInit(&z); + + z.next_in = buf; + z.avail_in = len; + z.next_out = 
out; + z.avail_out = sizeof(out); + + ret = inflate(&z, 0); + if (ret != Z_OK && ret != Z_STREAM_END) + fprintf(stderr, "initial inflate failed (%d)\n", ret); + /* an inflate error is only reported, not fatal: the z.total_out bytes that were produced are still written to stdout */ + fprintf(stderr, "outputting %lu bytes", z.total_out); + fwrite(out, 1, z.total_out, stdout); + return 0; +} +-------------------------- + +And here is the `sha1-munge` program: + +-------------------------- +#include +#include +#include +#include +#include +#include /* NOTE(review): the header names are missing from these six #include lines */ + +/* eye candy: counter is the number of attempts so far; progress() redraws the count on stderr and re-arms the one-second alarm */ +static int counter = 0; +static void progress(int sig) +{ + fprintf(stderr, "\r%d", counter); + alarm(1); +} +/* Lookup table indexed by byte value: ASCII 0-9 map to 0..9, A-F and a-f map to 10..15, and every other byte maps to -1. */ +static const signed char hexval_table[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, /* 00-07 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 08-0f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 10-17 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 18-1f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 20-27 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 28-2f */ + 0, 1, 2, 3, 4, 5, 6, 7, /* 30-37 */ + 8, 9, -1, -1, -1, -1, -1, -1, /* 38-3f */ + -1, 10, 11, 12, 13, 14, 15, -1, /* 40-47 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 48-4f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 50-57 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 58-5f */ + -1, 10, 11, 12, 13, 14, 15, -1, /* 60-67 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 68-6f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 70-77 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 78-7f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 80-87 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 88-8f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 90-97 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* 98-9f */ + -1, -1, -1, -1, -1, -1, -1, -1, /* a0-a7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* a8-af */ + -1, -1, -1, -1, -1, -1, -1, -1, /* b0-b7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* b8-bf */ + -1, -1, -1, -1, -1, -1, -1, -1, /* c0-c7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* c8-cf */ + -1, -1, -1, -1, -1, -1, -1, -1, /* d0-d7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* d8-df */ + -1, -1, -1, -1, -1, -1, -1, -1, /* e0-e7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* e8-ef */ + -1, -1, -1, -1, 
-1, -1, -1, -1, /* f0-f7 */ + -1, -1, -1, -1, -1, -1, -1, -1, /* f8-ff */ +}; +/* Look up the hex digit value of c in hexval_table. The table holds signed char, so the -1 stored for a non-hex byte comes back as an all-ones unsigned int. */ +static inline unsigned int hexval(unsigned char c) +{ +return hexval_table[c]; +} +/* Convert the first 40 characters of hex, two hex digits per output byte, into the 20 bytes at sha1. Returns 0 on success and -1 if the string is too short or contains a non-hex character; anything after the 40th character is not examined. */ +static int get_sha1_hex(const char *hex, unsigned char *sha1) +{ + int i; + for (i = 0; i < 20; i++) { + unsigned int val; + /* + * hex[1]=='\0' is caught when val is checked below, + * but if hex[0] is NUL we have to avoid reading + * past the end of the string: + */ + if (!hex[0]) + return -1; + val = (hexval(hex[0]) << 4) | hexval(hex[1]); + if (val & ~0xff) /* an all-ones result from hexval() sets bits above the low byte */ + return -1; + *sha1++ = val; + hex += 2; + } + return 0; +} + +int main(int argc, char **argv) +{ + /* oversized so we can read the whole buffer in */ + static unsigned char buf[25 * 1024 * 1024]; + char header[32]; + int header_len; + unsigned char have[20], want[20]; + int start, len; + SHA_CTX orig; + unsigned i, j; + + if (!argv[1] || get_sha1_hex(argv[1], want)) { + fprintf(stderr, "usage: sha1-munge [start]