summaryrefslogtreecommitdiffstats
path: root/lib/fileeq.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/fileeq.c641
1 files changed, 641 insertions, 0 deletions
diff --git a/lib/fileeq.c b/lib/fileeq.c
new file mode 100644
index 0000000..d3d9aa3
--- /dev/null
+++ b/lib/fileeq.c
@@ -0,0 +1,641 @@
+/*
+ * compare files content
+ *
+ * The goal is to minimize number of data we need to read from the files and be
+ * ready to compare large set of files, it means reuse the previous data if
+ * possible. It never read entire file if not necessary.
+ *
+ * The another goal is to minimize number of open files (imagine "hardlink /"),
+ * the code can open only two files and reopen the file next time if
+ * necessary.
+ *
+ * This code supports multiple comparison methods. The very basic step which is
+ * generic for all methods is to read and compare an "intro" (a few bytes from
+ * the begging of the file). This intro buffer is always cached in 'struct
+ * ul_fileeq_data', this intro buffer is addresses as block=0. This primitive
+ * thing can reduce a lot ...
+ *
+ * The next steps depend on selected method:
+ *
+ * * memcmp method: always read data to userspace, nothing is cached, compare
+ * directly files content; fast for small sets of the small files.
+ *
+ * * Linux crypto API: zero-copy method based on sendfile(), data blocks are
+ * send to the kernel hash functions (sha1, ...), and only hash digest is read
+ * and cached in usersapce. Fast for large set of (large) files.
+ *
+ *
+ * No copyright is claimed. This code is in the public domain; do with
+ * it what you wish.
+ *
+ * Written by Karel Zak <kzak@redhat.com> [October 2021]
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* Linux crypto */
+#ifdef HAVE_LINUX_IF_ALG_H
+# include <sys/socket.h>
+# include <linux/if_alg.h>
+# include <sys/param.h>
+# include <sys/sendfile.h>
+# define USE_HARDLINK_CRYPTOAPI 1
+#endif
+
+#include "c.h"
+#include "all-io.h"
+#include "fileeq.h"
+#include "debug.h"
+
+static UL_DEBUG_DEFINE_MASK(ulfileeq);
+UL_DEBUG_DEFINE_MASKNAMES(ulfileeq) = UL_DEBUG_EMPTY_MASKNAMES;
+
+#define ULFILEEQ_DEBUG_INIT (1 << 1)
+#define ULFILEEQ_DEBUG_CRYPTO (1 << 2)
+#define ULFILEEQ_DEBUG_DATA (1 << 3)
+#define ULFILEEQ_DEBUG_EQ (1 << 4)
+
+#define DBG(m, x) __UL_DBG(ulfileeq, ULFILEEQ_DEBUG_, m, x)
+#define ON_DBG(m, x) __UL_DBG_CALL(ulfileeq, ULFILEEQ_DEBUG_, m, x)
+
+#define UL_DEBUG_CURRENT_MASK UL_DEBUG_MASK(ulfileeq)
+#include "debugobj.h"
+
+static void ul_fileeq_init_debug(void)
+{
+ if (ulfileeq_debug_mask)
+ return;
+ __UL_INIT_DEBUG_FROM_ENV(ulfileeq, ULFILEEQ_DEBUG_, 0, ULFILEEQ_DEBUG);
+}
+
+enum {
+ UL_FILEEQ_MEMCMP,
+ UL_FILEEQ_SHA1,
+ UL_FILEEQ_SHA256,
+ UL_FILEEQ_CRC32
+};
+
+struct ul_fileeq_method {
+ int id;
+ const char *name; /* name used by applications */
+ const char *kname; /* name used by kernel crypto */
+ short digsiz;
+};
+
+static const struct ul_fileeq_method ul_eq_methods[] = {
+ [UL_FILEEQ_MEMCMP] = {
+ .id = UL_FILEEQ_MEMCMP, .name = "memcmp"
+ },
+#ifdef USE_HARDLINK_CRYPTOAPI
+ [UL_FILEEQ_SHA1] = {
+ .id = UL_FILEEQ_SHA1, .name = "sha1",
+ .digsiz = 20, .kname = "sha1"
+ },
+ [UL_FILEEQ_SHA256] = {
+ .id = UL_FILEEQ_SHA256, .name = "sha256",
+ .digsiz = 32, .kname = "sha256"
+ },
+
+ [UL_FILEEQ_CRC32] = {
+ .id = UL_FILEEQ_CRC32, .name = "crc32",
+ .digsiz = 4, .kname = "crc32c"
+ }
+#endif
+};
+
+#ifdef USE_HARDLINK_CRYPTOAPI
+static void deinit_crypto_api(struct ul_fileeq *eq)
+{
+ if (!eq)
+ return;
+
+ DBG(CRYPTO, ul_debugobj(eq, "deinit"));
+
+ if (eq->fd_cip >= 0)
+ close(eq->fd_cip);
+ if (eq->fd_api >= 0)
+ close(eq->fd_api);
+
+ eq->fd_cip = eq->fd_api = -1;
+
+}
+
+static int init_crypto_api(struct ul_fileeq *eq)
+{
+ struct sockaddr_alg sa = {
+ .salg_family = AF_ALG,
+ .salg_type = "hash",
+ };
+
+ assert(eq->method);
+ assert(eq->method->kname);
+ assert(eq->fd_api == -1);
+ assert(eq->fd_cip == -1);
+
+ DBG(CRYPTO, ul_debugobj(eq, "init [%s]", eq->method->kname));
+
+ assert(sizeof(sa.salg_name) > strlen(eq->method->kname) + 1);
+ memcpy(&sa.salg_name, eq->method->kname, strlen(eq->method->kname) + 1);
+
+ if ((eq->fd_api = socket(AF_ALG, SOCK_SEQPACKET, 0)) < 0)
+ goto fail;
+ if (bind(eq->fd_api, (struct sockaddr *) &sa, sizeof(sa)) != 0)
+ goto fail;
+ if ((eq->fd_cip = accept(eq->fd_api, NULL, 0)) < 0)
+ goto fail;
+ return 0;
+fail:
+ deinit_crypto_api(eq);
+ return -1;
+}
+#endif
+
+int ul_fileeq_init(struct ul_fileeq *eq, const char *method)
+{
+ size_t i;
+
+ ul_fileeq_init_debug();
+ DBG(EQ, ul_debugobj(eq, "init [%s]", method));
+
+ memset(eq, 0, sizeof(*eq));
+ eq->fd_api = -1;
+ eq->fd_cip = -1;
+
+ for (i = 0; i < ARRAY_SIZE(ul_eq_methods); i++) {
+ const struct ul_fileeq_method *m = &ul_eq_methods[i];
+
+ if (strcmp(m->name, method) == 0) {
+ eq->method = m;
+ break;
+ }
+ }
+
+ if (!eq->method)
+ return -1;
+#ifdef USE_HARDLINK_CRYPTOAPI
+ if (eq->method->id != UL_FILEEQ_MEMCMP
+ && init_crypto_api(eq) != 0)
+ return -1;
+#endif
+ return 0;
+}
+
+void ul_fileeq_deinit(struct ul_fileeq *eq)
+{
+ if (!eq)
+ return;
+
+ DBG(EQ, ul_debugobj(eq, "deinit"));
+#ifdef USE_HARDLINK_CRYPTOAPI
+ deinit_crypto_api(eq);
+#endif
+ free(eq->buf_a);
+ free(eq->buf_b);
+}
+
+void ul_fileeq_data_close_file(struct ul_fileeq_data *data)
+{
+ assert(data);
+
+ if (data->fd >= 0) {
+ DBG(DATA, ul_debugobj(data, "close"));
+ close(data->fd);
+ }
+ data->fd = -1;
+}
+
+void ul_fileeq_data_init(struct ul_fileeq_data *data)
+{
+ DBG(DATA, ul_debugobj(data, "init"));
+ memset(data, 0, sizeof(*data));
+ data->fd = -1;
+}
+
+void ul_fileeq_data_deinit(struct ul_fileeq_data *data)
+{
+ assert(data);
+
+ DBG(DATA, ul_debugobj(data, "deinit"));
+ free(data->blocks);
+ data->blocks = NULL;
+ data->nblocks = 0;
+ data->maxblocks = 0;
+ data->is_eof = 0;
+ data->name = NULL;
+
+ ul_fileeq_data_close_file(data);
+}
+
+int ul_fileeq_data_associated(struct ul_fileeq_data *data)
+{
+ return data->name != NULL;
+}
+
+void ul_fileeq_data_set_file(struct ul_fileeq_data *data, const char *name)
+{
+ assert(data);
+ assert(name);
+
+ DBG(DATA, ul_debugobj(data, "set file: %s", name));
+ ul_fileeq_data_init(data);
+ data->name = name;
+}
+
+size_t ul_fileeq_set_size(struct ul_fileeq *eq, uint64_t filesiz,
+ size_t readsiz, size_t memsiz)
+{
+ uint64_t nreads, maxdigs;
+ size_t digsiz;
+
+ assert(eq);
+
+ eq->filesiz = filesiz;
+
+ switch (eq->method->id) {
+ case UL_FILEEQ_MEMCMP:
+ /* align file size */
+ filesiz = (filesiz + readsiz) / readsiz * readsiz;
+ break;
+ default:
+ digsiz = eq->method->digsiz;
+ if (readsiz < digsiz)
+ readsiz = digsiz;
+ /* align file size */
+ filesiz = (filesiz + readsiz) / readsiz * readsiz;
+ /* calculate limits */
+ maxdigs = memsiz / digsiz;
+ if (maxdigs == 0)
+ maxdigs = 1;
+ nreads = filesiz / readsiz;
+ /* enlarge readsize for large files */
+ if (nreads > maxdigs)
+ readsiz = filesiz / maxdigs;
+ break;
+ }
+
+ eq->readsiz = readsiz;
+ eq->blocksmax = filesiz / readsiz;
+
+ DBG(EQ, ul_debugobj(eq, "set sizes: filesiz=%ju, maxblocks=%" PRIu64 ", readsiz=%zu",
+ eq->filesiz, eq->blocksmax, eq->readsiz));
+ return eq->blocksmax;
+}
+
+static unsigned char *get_buffer(struct ul_fileeq *eq)
+{
+ if (!eq->buf_a)
+ eq->buf_a = malloc(eq->readsiz);
+ if (!eq->buf_b)
+ eq->buf_b = malloc(eq->readsiz);
+
+ if (!eq->buf_a || !eq->buf_b)
+ return NULL;
+
+ if (eq->buf_last == eq->buf_b)
+ eq->buf_last = eq->buf_a;
+ else
+ eq->buf_last = eq->buf_b;
+
+ return eq->buf_last;
+}
+
+#define get_cached_nblocks(_d) \
+ ((_d)->nblocks ? (_d)->nblocks - 1 : 0)
+
+#define get_cached_offset(_e, _d) \
+ ((_d)->nblocks == 0 ? 0 : \
+ sizeof((_d)->intro) \
+ + (get_cached_nblocks(_d) * (_e)->readsiz))
+
+
+static int get_fd(struct ul_fileeq *eq, struct ul_fileeq_data *data, off_t *off)
+{
+ off_t o = get_cached_offset(eq, data);
+
+ assert(eq);
+ assert(data);
+
+
+ if (data->fd < 0) {
+ DBG(DATA, ul_debugobj(data, "open: %s", data->name));
+ data->fd = open(data->name, O_RDONLY);
+ if (data->fd < 0)
+ return data->fd;
+
+#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE)
+ ignore_result( posix_fadvise(data->fd, o, 0, POSIX_FADV_SEQUENTIAL) );
+#endif
+ if (o) {
+ DBG(DATA, ul_debugobj(data, "lseek off=%ju", (uintmax_t) o));
+ lseek(data->fd, o, SEEK_SET);
+ }
+ }
+
+ if (off)
+ *off = o;
+
+ return data->fd;
+}
+
+static void memcmp_reset(struct ul_fileeq *eq, struct ul_fileeq_data *data)
+{
+ /* only intro[] is cached */
+ if (data->nblocks)
+ data->nblocks = 1;
+ /* reset file possition */
+ if (data->fd >= 0)
+ lseek(data->fd, get_cached_offset(eq, data), SEEK_SET);
+ data->is_eof = 0;
+}
+
+static ssize_t read_block(struct ul_fileeq *eq, struct ul_fileeq_data *data,
+ size_t n, unsigned char **block)
+{
+ int fd;
+ off_t off = 0;
+ ssize_t rsz;
+
+ if (data->is_eof || n > eq->blocksmax)
+ return 0;
+
+ fd = get_fd(eq, data, &off);
+ if (fd < 0)
+ return fd;
+
+ DBG(DATA, ul_debugobj(data, " read block off=%ju", (uintmax_t) off));
+
+ *block = get_buffer(eq);
+ if (!*block)
+ return -ENOMEM;
+
+ rsz = read_all(data->fd, (char *) *block, eq->readsiz);
+ if (rsz < 0) {
+ DBG(DATA, ul_debugobj(data, " read failed"));
+ return rsz;
+ }
+ off += rsz;
+ data->nblocks++;
+
+ if (rsz == 0 || (uint64_t) off >= eq->filesiz) {
+ data->is_eof = 1;
+ ul_fileeq_data_close_file(data);
+ }
+
+ DBG(DATA, ul_debugobj(data, " read sz=%zu", rsz));
+ return rsz;
+}
+
+#ifdef USE_HARDLINK_CRYPTOAPI
+static ssize_t get_digest(struct ul_fileeq *eq, struct ul_fileeq_data *data,
+ size_t n, unsigned char **block)
+{
+ off_t off = 0;
+ ssize_t rsz;
+ size_t sz;
+ int fd;
+
+ if (n > eq->blocksmax)
+ return 0;
+
+ /* return already cached if alvalable */
+ if (n < get_cached_nblocks(data)) {
+ DBG(DATA, ul_debugobj(data, " digest cached"));
+ assert(data->blocks);
+ *block = data->blocks + (n * eq->method->digsiz);
+ return eq->method->digsiz;
+ }
+
+ if (data->is_eof) {
+ DBG(DATA, ul_debugobj(data, " file EOF"));
+ return 0;
+ }
+
+ /* read new block */
+ fd = get_fd(eq, data, &off);
+ if (fd < 0)
+ return fd;
+
+ DBG(DATA, ul_debugobj(data, " read digest off=%ju", (uintmax_t) off));
+
+ sz = eq->method->digsiz;
+
+ if (!data->blocks) {
+ DBG(DATA, ul_debugobj(data, " alloc cache %zu", eq->blocksmax * sz));
+ data->blocks = malloc(eq->blocksmax * sz);
+ if (!data->blocks)
+ return -ENOMEM;
+ }
+
+ assert(n <= eq->blocksmax);
+
+ rsz = sendfile(eq->fd_cip, data->fd, NULL, eq->readsiz);
+ DBG(DATA, ul_debugobj(data, " sent %zu [%zu wanted] to cipher", rsz, eq->readsiz));
+
+ if (rsz < 0)
+ return rsz;
+
+ off += rsz;
+
+ /* get block digest (note 1st block is data->intro */
+ *block = data->blocks + (n * eq->method->digsiz);
+ rsz = read_all(eq->fd_cip, (char *) *block, sz);
+
+ if (rsz > 0)
+ data->nblocks++;
+ if (rsz == 0 || (uint64_t) off >= eq->filesiz) {
+ data->is_eof = 1;
+ ul_fileeq_data_close_file(data);
+ }
+ DBG(DATA, ul_debugobj(data, " get %zuB digest", rsz));
+ return rsz;
+}
+#endif
+
+static ssize_t get_intro(struct ul_fileeq *eq, struct ul_fileeq_data *data,
+ unsigned char **block)
+{
+ if (data->nblocks == 0) {
+ int fd = get_fd(eq, data, NULL);
+ ssize_t rsz;
+
+ if (fd < 0)
+ return -1;
+ rsz = read_all(fd, (char *) data->intro, sizeof(data->intro));
+ DBG(DATA, ul_debugobj(data, " read %zu bytes intro", sizeof(data->intro)));
+ if (rsz <= 0)
+ return -1;
+ data->nblocks = 1;
+ }
+
+ DBG(DATA, ul_debugobj(data, " return intro"));
+ *block = data->intro;
+ return sizeof(data->intro);
+}
+
+static ssize_t get_cmp_data(struct ul_fileeq *eq, struct ul_fileeq_data *data,
+ size_t blockno, unsigned char **block)
+{
+ if (blockno == 0)
+ return get_intro(eq, data, block);
+
+ blockno--;
+
+ switch (eq->method->id) {
+ case UL_FILEEQ_MEMCMP:
+ return read_block(eq, data, blockno, block);
+ default:
+ break;
+ }
+#ifdef USE_HARDLINK_CRYPTOAPI
+ return get_digest(eq, data, blockno, block);
+#else
+ return -1;
+#endif
+}
+
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
+
+int ul_fileeq(struct ul_fileeq *eq,
+ struct ul_fileeq_data *a, struct ul_fileeq_data *b)
+{
+ int cmp;
+ size_t n = 0;
+
+ DBG(EQ, ul_debugobj(eq, "--> compare %s %s", a->name, b->name));
+
+ if (eq->method->id == UL_FILEEQ_MEMCMP) {
+ memcmp_reset(eq, a);
+ memcmp_reset(eq, b);
+ }
+
+ do {
+ unsigned char *da, *db;
+ ssize_t ca, cb;
+
+ DBG(EQ, ul_debugobj(eq, "compare block #%zu", n));
+
+ ca = get_cmp_data(eq, a, n, &da);
+ if (ca < 0)
+ goto done;
+ cb = get_cmp_data(eq, b, n, &db);
+ if (cb < 0)
+ goto done;
+ if (ca != cb || ca == 0) {
+ cmp = CMP(ca, cb);
+ break;
+
+ }
+ cmp = memcmp(da, db, ca);
+ DBG(EQ, ul_debugobj(eq, "#%zu=%s", n, cmp == 0 ? "match" : "not-match"));
+ n++;
+ } while (cmp == 0);
+
+ if (cmp == 0) {
+ if (!a->is_eof || !b->is_eof)
+ goto done; /* filesize chnaged? */
+
+ DBG(EQ, ul_debugobj(eq, "<-- MATCH"));
+ return 1;
+ }
+done:
+ DBG(EQ, ul_debugobj(eq, " <-- NOT-MATCH"));
+ return 0;
+}
+
+#ifdef TEST_PROGRAM_FILEEQ
+# include <getopt.h>
+# include <err.h>
+
+int main(int argc, char *argv[])
+{
+ struct ul_fileeq eq;
+ struct ul_fileeq_data a, b, c;
+ const char *method = "sha1";
+ static const struct option longopts[] = {
+ { "method", required_argument, NULL, 'm' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL, 0, NULL, 0 }
+ };
+ int ch, rc;
+ const char *file_a = NULL, *file_b = NULL, *file_c = NULL;
+ struct stat st_a, st_b, st_c;
+
+ while ((ch = getopt_long(argc, argv, "m:", longopts, NULL)) != -1) {
+ switch (ch) {
+ case 'm':
+ method = optarg;
+ break;
+ case 'h':
+ printf("usage: %s [options] <file> <file>\n"
+ " -m, --method <memcmp|sha1|crc32> compare method\n",
+ program_invocation_short_name);
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (optind < argc)
+ file_a = argv[optind++];
+ if (optind < argc)
+ file_b = argv[optind++];
+ if (optind < argc)
+ file_c = argv[optind++];
+
+ if (!file_a || !file_b)
+ errx(EXIT_FAILURE, "no files specified, see --help");
+
+ if (stat(file_a, &st_a) != 0 || !S_ISREG(st_a.st_mode))
+ errx(EXIT_FAILURE, "%s: wrong file", file_a);
+ if (stat(file_b, &st_b) != 0 || !S_ISREG(st_b.st_mode))
+ errx(EXIT_FAILURE, "%s: wrong file", file_b);
+ if (file_c && (stat(file_c, &st_c) != 0 || !S_ISREG(st_c.st_mode)))
+ errx(EXIT_FAILURE, "%s: wrong file", file_c);
+
+
+ if (st_a.st_size != st_b.st_size ||
+ (file_c && st_a.st_size != st_c.st_size))
+ errx(EXIT_FAILURE, "size of the files does not match");
+
+
+ rc = ul_fileeq_init(&eq, method);
+ if (rc != 0 && strcmp(method, "memcmp") != 0) {
+ method = "memcmp";
+ rc = ul_fileeq_init(&eq, method);
+ }
+ if (rc < 0)
+ err(EXIT_FAILURE, "failed to initialize files comparior");
+
+ ul_fileeq_data_set_file(&a, file_a);
+ ul_fileeq_data_set_file(&b, file_b);
+
+ /* 3rd is optional */
+ if (file_c)
+ ul_fileeq_data_set_file(&c, file_c);
+
+ /* filesiz, readsiz, memsiz */
+ ul_fileeq_set_size(&eq, st_a.st_size, 1024*1024, 4*1024);
+
+ rc = ul_fileeq(&eq, &a, &b);
+
+ printf("1st vs. 2nd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
+ if (file_c) {
+ rc = ul_fileeq(&eq, &a, &c);
+ printf("1st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
+
+ rc = ul_fileeq(&eq, &b, &c);
+ printf("2st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
+ }
+
+ ul_fileeq_data_deinit(&a);
+ ul_fileeq_data_deinit(&b);
+
+ if (file_c)
+ ul_fileeq_data_deinit(&c);
+
+ ul_fileeq_deinit(&eq);
+ return EXIT_FAILURE;
+}
+#endif /* TEST_PROGRAM_FILEEQ */