diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 14:30:35 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 14:30:35 +0000 |
commit | 378c18e5f024ac5a8aef4cb40d7c9aa9633d144c (patch) | |
tree | 44dfb6ca500d32cabd450649b322a42e70a30683 /lib/fileeq.c | |
parent | Initial commit. (diff) | |
download | util-linux-upstream.tar.xz util-linux-upstream.zip |
Adding upstream version 2.38.1.upstream/2.38.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/fileeq.c')
-rw-r--r-- | lib/fileeq.c | 641 |
1 files changed, 641 insertions, 0 deletions
diff --git a/lib/fileeq.c b/lib/fileeq.c new file mode 100644 index 0000000..d3d9aa3 --- /dev/null +++ b/lib/fileeq.c @@ -0,0 +1,641 @@ +/* + * compare files content + * + * The goal is to minimize number of data we need to read from the files and be + * ready to compare large set of files, it means reuse the previous data if + * possible. It never read entire file if not necessary. + * + * The another goal is to minimize number of open files (imagine "hardlink /"), + * the code can open only two files and reopen the file next time if + * necessary. + * + * This code supports multiple comparison methods. The very basic step which is + * generic for all methods is to read and compare an "intro" (a few bytes from + * the begging of the file). This intro buffer is always cached in 'struct + * ul_fileeq_data', this intro buffer is addresses as block=0. This primitive + * thing can reduce a lot ... + * + * The next steps depend on selected method: + * + * * memcmp method: always read data to userspace, nothing is cached, compare + * directly files content; fast for small sets of the small files. + * + * * Linux crypto API: zero-copy method based on sendfile(), data blocks are + * send to the kernel hash functions (sha1, ...), and only hash digest is read + * and cached in usersapce. Fast for large set of (large) files. + * + * + * No copyright is claimed. This code is in the public domain; do with + * it what you wish. + * + * Written by Karel Zak <kzak@redhat.com> [October 2021] + */ + +#include <inttypes.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* Linux crypto */ +#ifdef HAVE_LINUX_IF_ALG_H +# include <sys/socket.h> +# include <linux/if_alg.h> +# include <sys/param.h> +# include <sys/sendfile.h> +# define USE_HARDLINK_CRYPTOAPI 1 +#endif + +#include "c.h" +#include "all-io.h" +#include "fileeq.h" +#include "debug.h" + +static UL_DEBUG_DEFINE_MASK(ulfileeq); +UL_DEBUG_DEFINE_MASKNAMES(ulfileeq) = UL_DEBUG_EMPTY_MASKNAMES; + +#define ULFILEEQ_DEBUG_INIT (1 << 1) +#define ULFILEEQ_DEBUG_CRYPTO (1 << 2) +#define ULFILEEQ_DEBUG_DATA (1 << 3) +#define ULFILEEQ_DEBUG_EQ (1 << 4) + +#define DBG(m, x) __UL_DBG(ulfileeq, ULFILEEQ_DEBUG_, m, x) +#define ON_DBG(m, x) __UL_DBG_CALL(ulfileeq, ULFILEEQ_DEBUG_, m, x) + +#define UL_DEBUG_CURRENT_MASK UL_DEBUG_MASK(ulfileeq) +#include "debugobj.h" + +static void ul_fileeq_init_debug(void) +{ + if (ulfileeq_debug_mask) + return; + __UL_INIT_DEBUG_FROM_ENV(ulfileeq, ULFILEEQ_DEBUG_, 0, ULFILEEQ_DEBUG); +} + +enum { + UL_FILEEQ_MEMCMP, + UL_FILEEQ_SHA1, + UL_FILEEQ_SHA256, + UL_FILEEQ_CRC32 +}; + +struct ul_fileeq_method { + int id; + const char *name; /* name used by applications */ + const char *kname; /* name used by kernel crypto */ + short digsiz; +}; + +static const struct ul_fileeq_method ul_eq_methods[] = { + [UL_FILEEQ_MEMCMP] = { + .id = UL_FILEEQ_MEMCMP, .name = "memcmp" + }, +#ifdef USE_HARDLINK_CRYPTOAPI + [UL_FILEEQ_SHA1] = { + .id = UL_FILEEQ_SHA1, .name = "sha1", + .digsiz = 20, .kname = "sha1" + }, + [UL_FILEEQ_SHA256] = { + .id = UL_FILEEQ_SHA256, .name = "sha256", + .digsiz = 32, .kname = "sha256" + }, + + [UL_FILEEQ_CRC32] = { + .id = UL_FILEEQ_CRC32, .name = "crc32", + .digsiz = 4, .kname = "crc32c" + } +#endif +}; + +#ifdef USE_HARDLINK_CRYPTOAPI +static void deinit_crypto_api(struct ul_fileeq *eq) +{ + if (!eq) + return; + + DBG(CRYPTO, ul_debugobj(eq, "deinit")); + + if (eq->fd_cip >= 0) + close(eq->fd_cip); + if (eq->fd_api >= 0) + close(eq->fd_api); + + eq->fd_cip = eq->fd_api = -1; + +} + +static int init_crypto_api(struct ul_fileeq *eq) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "hash", + }; + + assert(eq->method); + assert(eq->method->kname); + assert(eq->fd_api == -1); + assert(eq->fd_cip == -1); + + DBG(CRYPTO, ul_debugobj(eq, "init [%s]", eq->method->kname)); + + assert(sizeof(sa.salg_name) > strlen(eq->method->kname) + 1); + memcpy(&sa.salg_name, eq->method->kname, strlen(eq->method->kname) + 1); + + if ((eq->fd_api = socket(AF_ALG, SOCK_SEQPACKET, 0)) < 0) + goto fail; + if (bind(eq->fd_api, (struct sockaddr *) &sa, sizeof(sa)) != 0) + goto fail; + if ((eq->fd_cip = accept(eq->fd_api, NULL, 0)) < 0) + goto fail; + return 0; +fail: + deinit_crypto_api(eq); + return -1; +} +#endif + +int ul_fileeq_init(struct ul_fileeq *eq, const char *method) +{ + size_t i; + + ul_fileeq_init_debug(); + DBG(EQ, ul_debugobj(eq, "init [%s]", method)); + + memset(eq, 0, sizeof(*eq)); + eq->fd_api = -1; + eq->fd_cip = -1; + + for (i = 0; i < ARRAY_SIZE(ul_eq_methods); i++) { + const struct ul_fileeq_method *m = &ul_eq_methods[i]; + + if (strcmp(m->name, method) == 0) { + eq->method = m; + break; + } + } + + if (!eq->method) + return -1; +#ifdef USE_HARDLINK_CRYPTOAPI + if (eq->method->id != UL_FILEEQ_MEMCMP + && init_crypto_api(eq) != 0) + return -1; +#endif + return 0; +} + +void ul_fileeq_deinit(struct ul_fileeq *eq) +{ + if (!eq) + return; + + DBG(EQ, ul_debugobj(eq, "deinit")); +#ifdef USE_HARDLINK_CRYPTOAPI + deinit_crypto_api(eq); +#endif + free(eq->buf_a); + free(eq->buf_b); +} + +void ul_fileeq_data_close_file(struct ul_fileeq_data *data) +{ + assert(data); + + if (data->fd >= 0) { + DBG(DATA, ul_debugobj(data, "close")); + close(data->fd); + } + data->fd = -1; +} + +void ul_fileeq_data_init(struct ul_fileeq_data *data) +{ + DBG(DATA, ul_debugobj(data, "init")); + memset(data, 0, sizeof(*data)); + data->fd = -1; +} + +void ul_fileeq_data_deinit(struct ul_fileeq_data *data) +{ + assert(data); + + DBG(DATA, ul_debugobj(data, "deinit")); + free(data->blocks); + data->blocks = NULL; + data->nblocks = 0; + data->maxblocks = 0; + data->is_eof = 0; + data->name = NULL; + + ul_fileeq_data_close_file(data); +} + +int ul_fileeq_data_associated(struct ul_fileeq_data *data) +{ + return data->name != NULL; +} + +void ul_fileeq_data_set_file(struct ul_fileeq_data *data, const char *name) +{ + assert(data); + assert(name); + + DBG(DATA, ul_debugobj(data, "set file: %s", name)); + ul_fileeq_data_init(data); + data->name = name; +} + +size_t ul_fileeq_set_size(struct ul_fileeq *eq, uint64_t filesiz, + size_t readsiz, size_t memsiz) +{ + uint64_t nreads, maxdigs; + size_t digsiz; + + assert(eq); + + eq->filesiz = filesiz; + + switch (eq->method->id) { + case UL_FILEEQ_MEMCMP: + /* align file size */ + filesiz = (filesiz + readsiz) / readsiz * readsiz; + break; + default: + digsiz = eq->method->digsiz; + if (readsiz < digsiz) + readsiz = digsiz; + /* align file size */ + filesiz = (filesiz + readsiz) / readsiz * readsiz; + /* calculate limits */ + maxdigs = memsiz / digsiz; + if (maxdigs == 0) + maxdigs = 1; + nreads = filesiz / readsiz; + /* enlarge readsize for large files */ + if (nreads > maxdigs) + readsiz = filesiz / maxdigs; + break; + } + + eq->readsiz = readsiz; + eq->blocksmax = filesiz / readsiz; + + DBG(EQ, ul_debugobj(eq, "set sizes: filesiz=%ju, maxblocks=%" PRIu64 ", readsiz=%zu", + eq->filesiz, eq->blocksmax, eq->readsiz)); + return eq->blocksmax; +} + +static unsigned char *get_buffer(struct ul_fileeq *eq) +{ + if (!eq->buf_a) + eq->buf_a = malloc(eq->readsiz); + if (!eq->buf_b) + eq->buf_b = malloc(eq->readsiz); + + if (!eq->buf_a || !eq->buf_b) + return NULL; + + if (eq->buf_last == eq->buf_b) + eq->buf_last = eq->buf_a; + else + eq->buf_last = eq->buf_b; + + return eq->buf_last; +} + +#define get_cached_nblocks(_d) \ + ((_d)->nblocks ? (_d)->nblocks - 1 : 0) + +#define get_cached_offset(_e, _d) \ + ((_d)->nblocks == 0 ? 0 : \ + sizeof((_d)->intro) \ + + (get_cached_nblocks(_d) * (_e)->readsiz)) + + +static int get_fd(struct ul_fileeq *eq, struct ul_fileeq_data *data, off_t *off) +{ + off_t o = get_cached_offset(eq, data); + + assert(eq); + assert(data); + + + if (data->fd < 0) { + DBG(DATA, ul_debugobj(data, "open: %s", data->name)); + data->fd = open(data->name, O_RDONLY); + if (data->fd < 0) + return data->fd; + +#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE) + ignore_result( posix_fadvise(data->fd, o, 0, POSIX_FADV_SEQUENTIAL) ); +#endif + if (o) { + DBG(DATA, ul_debugobj(data, "lseek off=%ju", (uintmax_t) o)); + lseek(data->fd, o, SEEK_SET); + } + } + + if (off) + *off = o; + + return data->fd; +} + +static void memcmp_reset(struct ul_fileeq *eq, struct ul_fileeq_data *data) +{ + /* only intro[] is cached */ + if (data->nblocks) + data->nblocks = 1; + /* reset file possition */ + if (data->fd >= 0) + lseek(data->fd, get_cached_offset(eq, data), SEEK_SET); + data->is_eof = 0; +} + +static ssize_t read_block(struct ul_fileeq *eq, struct ul_fileeq_data *data, + size_t n, unsigned char **block) +{ + int fd; + off_t off = 0; + ssize_t rsz; + + if (data->is_eof || n > eq->blocksmax) + return 0; + + fd = get_fd(eq, data, &off); + if (fd < 0) + return fd; + + DBG(DATA, ul_debugobj(data, " read block off=%ju", (uintmax_t) off)); + + *block = get_buffer(eq); + if (!*block) + return -ENOMEM; + + rsz = read_all(data->fd, (char *) *block, eq->readsiz); + if (rsz < 0) { + DBG(DATA, ul_debugobj(data, " read failed")); + return rsz; + } + off += rsz; + data->nblocks++; + + if (rsz == 0 || (uint64_t) off >= eq->filesiz) { + data->is_eof = 1; + ul_fileeq_data_close_file(data); + } + + DBG(DATA, ul_debugobj(data, " read sz=%zu", rsz)); + return rsz; +} + +#ifdef USE_HARDLINK_CRYPTOAPI +static ssize_t get_digest(struct ul_fileeq *eq, struct ul_fileeq_data *data, + size_t n, unsigned char **block) +{ + off_t off = 0; + ssize_t rsz; + size_t sz; + int fd; + + if (n > eq->blocksmax) + return 0; + + /* return already cached if alvalable */ + if (n < get_cached_nblocks(data)) { + DBG(DATA, ul_debugobj(data, " digest cached")); + assert(data->blocks); + *block = data->blocks + (n * eq->method->digsiz); + return eq->method->digsiz; + } + + if (data->is_eof) { + DBG(DATA, ul_debugobj(data, " file EOF")); + return 0; + } + + /* read new block */ + fd = get_fd(eq, data, &off); + if (fd < 0) + return fd; + + DBG(DATA, ul_debugobj(data, " read digest off=%ju", (uintmax_t) off)); + + sz = eq->method->digsiz; + + if (!data->blocks) { + DBG(DATA, ul_debugobj(data, " alloc cache %zu", eq->blocksmax * sz)); + data->blocks = malloc(eq->blocksmax * sz); + if (!data->blocks) + return -ENOMEM; + } + + assert(n <= eq->blocksmax); + + rsz = sendfile(eq->fd_cip, data->fd, NULL, eq->readsiz); + DBG(DATA, ul_debugobj(data, " sent %zu [%zu wanted] to cipher", rsz, eq->readsiz)); + + if (rsz < 0) + return rsz; + + off += rsz; + + /* get block digest (note 1st block is data->intro */ + *block = data->blocks + (n * eq->method->digsiz); + rsz = read_all(eq->fd_cip, (char *) *block, sz); + + if (rsz > 0) + data->nblocks++; + if (rsz == 0 || (uint64_t) off >= eq->filesiz) { + data->is_eof = 1; + ul_fileeq_data_close_file(data); + } + DBG(DATA, ul_debugobj(data, " get %zuB digest", rsz)); + return rsz; +} +#endif + +static ssize_t get_intro(struct ul_fileeq *eq, struct ul_fileeq_data *data, + unsigned char **block) +{ + if (data->nblocks == 0) { + int fd = get_fd(eq, data, NULL); + ssize_t rsz; + + if (fd < 0) + return -1; + rsz = read_all(fd, (char *) data->intro, sizeof(data->intro)); + DBG(DATA, ul_debugobj(data, " read %zu bytes intro", sizeof(data->intro))); + if (rsz <= 0) + return -1; + data->nblocks = 1; + } + + DBG(DATA, ul_debugobj(data, " return intro")); + *block = data->intro; + return sizeof(data->intro); +} + +static ssize_t get_cmp_data(struct ul_fileeq *eq, struct ul_fileeq_data *data, + size_t blockno, unsigned char **block) +{ + if (blockno == 0) + return get_intro(eq, data, block); + + blockno--; + + switch (eq->method->id) { + case UL_FILEEQ_MEMCMP: + return read_block(eq, data, blockno, block); + default: + break; + } +#ifdef USE_HARDLINK_CRYPTOAPI + return get_digest(eq, data, blockno, block); +#else + return -1; +#endif +} + +#define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0)) + +int ul_fileeq(struct ul_fileeq *eq, + struct ul_fileeq_data *a, struct ul_fileeq_data *b) +{ + int cmp; + size_t n = 0; + + DBG(EQ, ul_debugobj(eq, "--> compare %s %s", a->name, b->name)); + + if (eq->method->id == UL_FILEEQ_MEMCMP) { + memcmp_reset(eq, a); + memcmp_reset(eq, b); + } + + do { + unsigned char *da, *db; + ssize_t ca, cb; + + DBG(EQ, ul_debugobj(eq, "compare block #%zu", n)); + + ca = get_cmp_data(eq, a, n, &da); + if (ca < 0) + goto done; + cb = get_cmp_data(eq, b, n, &db); + if (cb < 0) + goto done; + if (ca != cb || ca == 0) { + cmp = CMP(ca, cb); + break; + + } + cmp = memcmp(da, db, ca); + DBG(EQ, ul_debugobj(eq, "#%zu=%s", n, cmp == 0 ? "match" : "not-match")); + n++; + } while (cmp == 0); + + if (cmp == 0) { + if (!a->is_eof || !b->is_eof) + goto done; /* filesize chnaged? */ + + DBG(EQ, ul_debugobj(eq, "<-- MATCH")); + return 1; + } +done: + DBG(EQ, ul_debugobj(eq, " <-- NOT-MATCH")); + return 0; +} + +#ifdef TEST_PROGRAM_FILEEQ +# include <getopt.h> +# include <err.h> + +int main(int argc, char *argv[]) +{ + struct ul_fileeq eq; + struct ul_fileeq_data a, b, c; + const char *method = "sha1"; + static const struct option longopts[] = { + { "method", required_argument, NULL, 'm' }, + { "help", no_argument, NULL, 'h' }, + { NULL, 0, NULL, 0 } + }; + int ch, rc; + const char *file_a = NULL, *file_b = NULL, *file_c = NULL; + struct stat st_a, st_b, st_c; + + while ((ch = getopt_long(argc, argv, "m:", longopts, NULL)) != -1) { + switch (ch) { + case 'm': + method = optarg; + break; + case 'h': + printf("usage: %s [options] <file> <file>\n" + " -m, --method <memcmp|sha1|crc32> compare method\n", + program_invocation_short_name); + return EXIT_FAILURE; + } + } + + if (optind < argc) + file_a = argv[optind++]; + if (optind < argc) + file_b = argv[optind++]; + if (optind < argc) + file_c = argv[optind++]; + + if (!file_a || !file_b) + errx(EXIT_FAILURE, "no files specified, see --help"); + + if (stat(file_a, &st_a) != 0 || !S_ISREG(st_a.st_mode)) + errx(EXIT_FAILURE, "%s: wrong file", file_a); + if (stat(file_b, &st_b) != 0 || !S_ISREG(st_b.st_mode)) + errx(EXIT_FAILURE, "%s: wrong file", file_b); + if (file_c && (stat(file_c, &st_c) != 0 || !S_ISREG(st_c.st_mode))) + errx(EXIT_FAILURE, "%s: wrong file", file_c); + + + if (st_a.st_size != st_b.st_size || + (file_c && st_a.st_size != st_c.st_size)) + errx(EXIT_FAILURE, "size of the files does not match"); + + + rc = ul_fileeq_init(&eq, method); + if (rc != 0 && strcmp(method, "memcmp") != 0) { + method = "memcmp"; + rc = ul_fileeq_init(&eq, method); + } + if (rc < 0) + err(EXIT_FAILURE, "failed to initialize files comparior"); + + ul_fileeq_data_set_file(&a, file_a); + ul_fileeq_data_set_file(&b, file_b); + + /* 3rd is optional */ + if (file_c) + ul_fileeq_data_set_file(&c, file_c); + + /* filesiz, readsiz, memsiz */ + ul_fileeq_set_size(&eq, st_a.st_size, 1024*1024, 4*1024); + + rc = ul_fileeq(&eq, &a, &b); + + printf("1st vs. 2nd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH"); + if (file_c) { + rc = ul_fileeq(&eq, &a, &c); + printf("1st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH"); + + rc = ul_fileeq(&eq, &b, &c); + printf("2st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH"); + } + + ul_fileeq_data_deinit(&a); + ul_fileeq_data_deinit(&b); + + if (file_c) + ul_fileeq_data_deinit(&c); + + ul_fileeq_deinit(&eq); + return EXIT_FAILURE; +} +#endif /* TEST_PROGRAM_FILEEQ */ |