summaryrefslogtreecommitdiffstats
path: root/src/zstd/tests/regression/data.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/zstd/tests/regression/data.c613
1 files changed, 613 insertions, 0 deletions
diff --git a/src/zstd/tests/regression/data.c b/src/zstd/tests/regression/data.c
new file mode 100644
index 000000000..b75ac1192
--- /dev/null
+++ b/src/zstd/tests/regression/data.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2016-2020, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "data.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sys/stat.h>
+
+#include <curl/curl.h>
+
+#include "mem.h"
+#include "util.h"
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash.h"
+
+/**
+ * Data objects
+ */
+
+#define REGRESSION_RELEASE(x) \
+ "https://github.com/facebook/zstd/releases/download/regression-data/" x
+
+data_t silesia = {
+ .name = "silesia",
+ .type = data_type_dir,
+ .data =
+ {
+ .url = REGRESSION_RELEASE("silesia.tar.zst"),
+ .xxhash64 = 0x48a199f92f93e977LL,
+ },
+};
+
+data_t silesia_tar = {
+ .name = "silesia.tar",
+ .type = data_type_file,
+ .data =
+ {
+ .url = REGRESSION_RELEASE("silesia.tar.zst"),
+ .xxhash64 = 0x48a199f92f93e977LL,
+ },
+};
+
+data_t github = {
+ .name = "github",
+ .type = data_type_dir,
+ .data =
+ {
+ .url = REGRESSION_RELEASE("github.tar.zst"),
+ .xxhash64 = 0xa9b1b44b020df292LL,
+ },
+ .dict =
+ {
+ .url = REGRESSION_RELEASE("github.dict.zst"),
+ .xxhash64 = 0x1eddc6f737d3cb53LL,
+
+ },
+};
+
+static data_t* g_data[] = {
+ &silesia,
+ &silesia_tar,
+ &github,
+ NULL,
+};
+
+data_t const* const* data = (data_t const* const*)g_data;
+
+/**
+ * data helpers.
+ */
+
+int data_has_dict(data_t const* data) {
+ return data->dict.url != NULL;
+}
+
+/**
+ * data buffer helper functions (documented in header).
+ */
+
+data_buffer_t data_buffer_create(size_t const capacity) {
+ data_buffer_t buffer = {};
+
+ buffer.data = (uint8_t*)malloc(capacity);
+ if (buffer.data == NULL)
+ return buffer;
+ buffer.capacity = capacity;
+ return buffer;
+}
+
+data_buffer_t data_buffer_read(char const* filename) {
+ data_buffer_t buffer = {};
+
+ uint64_t const size = UTIL_getFileSize(filename);
+ if (size == UTIL_FILESIZE_UNKNOWN) {
+ fprintf(stderr, "unknown size for %s\n", filename);
+ return buffer;
+ }
+
+ buffer.data = (uint8_t*)malloc(size);
+ if (buffer.data == NULL) {
+ fprintf(stderr, "malloc failed\n");
+ return buffer;
+ }
+ buffer.capacity = size;
+
+ FILE* file = fopen(filename, "rb");
+ if (file == NULL) {
+ fprintf(stderr, "file null\n");
+ goto err;
+ }
+ buffer.size = fread(buffer.data, 1, buffer.capacity, file);
+ fclose(file);
+ if (buffer.size != buffer.capacity) {
+ fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
+ goto err;
+ }
+
+ return buffer;
+err:
+ free(buffer.data);
+ memset(&buffer, 0, sizeof(buffer));
+ return buffer;
+}
+
+data_buffer_t data_buffer_get_data(data_t const* data) {
+ data_buffer_t const kEmptyBuffer = {};
+
+ if (data->type != data_type_file)
+ return kEmptyBuffer;
+
+ return data_buffer_read(data->data.path);
+}
+
+data_buffer_t data_buffer_get_dict(data_t const* data) {
+ data_buffer_t const kEmptyBuffer = {};
+
+ if (!data_has_dict(data))
+ return kEmptyBuffer;
+
+ return data_buffer_read(data->dict.path);
+}
+
+int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
+ size_t const size =
+ buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
+ int const cmp = memcmp(buffer1.data, buffer2.data, size);
+ if (cmp != 0)
+ return cmp;
+ if (buffer1.size < buffer2.size)
+ return -1;
+ if (buffer1.size == buffer2.size)
+ return 0;
+ assert(buffer1.size > buffer2.size);
+ return 1;
+}
+
+void data_buffer_free(data_buffer_t buffer) {
+ free(buffer.data);
+}
+
+/**
+ * data filenames helpers.
+ */
+
+FileNamesTable* data_filenames_get(data_t const* data)
+{
+ char const* const path = data->data.path;
+ return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
+}
+
+/**
+ * data buffers helpers.
+ */
+
+data_buffers_t data_buffers_get(data_t const* data) {
+ data_buffers_t buffers = {.size = 0};
+ FileNamesTable* const filenames = data_filenames_get(data);
+ if (filenames == NULL) return buffers;
+ if (filenames->tableSize == 0) {
+ UTIL_freeFileNamesTable(filenames);
+ return buffers;
+ }
+
+ data_buffer_t* buffersPtr =
+ (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
+ if (buffersPtr == NULL) {
+ UTIL_freeFileNamesTable(filenames);
+ return buffers;
+ }
+ buffers.buffers = (data_buffer_t const*)buffersPtr;
+ buffers.size = filenames->tableSize;
+
+ for (size_t i = 0; i < filenames->tableSize; ++i) {
+ buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
+ if (buffersPtr[i].data == NULL) {
+ data_buffers_t const kEmptyBuffer = {};
+ data_buffers_free(buffers);
+ UTIL_freeFileNamesTable(filenames);
+ return kEmptyBuffer;
+ }
+ }
+
+ UTIL_freeFileNamesTable(filenames);
+ return buffers;
+}
+
+/**
+ * Frees the data buffers.
+ */
+void data_buffers_free(data_buffers_t buffers) {
+ free((data_buffer_t*)buffers.buffers);
+}
+
+/**
+ * Initialization and download functions.
+ */
+
+static char* g_data_dir = NULL;
+
+/* mkdir -p */
+static int ensure_directory_exists(char const* indir) {
+ char* const dir = strdup(indir);
+ char* end = dir;
+ int ret = 0;
+ if (dir == NULL) {
+ ret = EINVAL;
+ goto out;
+ }
+ do {
+ /* Find the next directory level. */
+ for (++end; *end != '\0' && *end != '/'; ++end)
+ ;
+ /* End the string there, make the directory, and restore the string. */
+ char const save = *end;
+ *end = '\0';
+ int const isdir = UTIL_isDirectory(dir);
+ ret = mkdir(dir, S_IRWXU);
+ *end = save;
+ /* Its okay if the directory already exists. */
+ if (ret == 0 || (errno == EEXIST && isdir))
+ continue;
+ ret = errno;
+ fprintf(stderr, "mkdir() failed\n");
+ goto out;
+ } while (*end != '\0');
+
+ ret = 0;
+out:
+ free(dir);
+ return ret;
+}
+
+/** Concatenate 3 strings into a new buffer. */
+static char* cat3(char const* str1, char const* str2, char const* str3) {
+ size_t const size1 = strlen(str1);
+ size_t const size2 = strlen(str2);
+ size_t const size3 = str3 == NULL ? 0 : strlen(str3);
+ size_t const size = size1 + size2 + size3 + 1;
+ char* const dst = (char*)malloc(size);
+ if (dst == NULL)
+ return NULL;
+ strcpy(dst, str1);
+ strcpy(dst + size1, str2);
+ if (str3 != NULL)
+ strcpy(dst + size1 + size2, str3);
+ assert(strlen(dst) == size1 + size2 + size3);
+ return dst;
+}
+
+static char* cat2(char const* str1, char const* str2) {
+ return cat3(str1, str2, NULL);
+}
+
+/**
+ * State needed by the curl callback.
+ * It takes data from curl, hashes it, and writes it to the file.
+ */
+typedef struct {
+ FILE* file;
+ XXH64_state_t xxhash64;
+ int error;
+} curl_data_t;
+
+/** Create the curl state. */
+static curl_data_t curl_data_create(
+ data_resource_t const* resource,
+ data_type_t type) {
+ curl_data_t cdata = {};
+
+ XXH64_reset(&cdata.xxhash64, 0);
+
+ assert(UTIL_isDirectory(g_data_dir));
+
+ if (type == data_type_file) {
+ /* Decompress the resource and store to the path. */
+ char* cmd = cat3("zstd -dqfo '", resource->path, "'");
+ if (cmd == NULL) {
+ cdata.error = ENOMEM;
+ return cdata;
+ }
+ cdata.file = popen(cmd, "w");
+ free(cmd);
+ } else {
+ /* Decompress and extract the resource to the cache directory. */
+ char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
+ if (cmd == NULL) {
+ cdata.error = ENOMEM;
+ return cdata;
+ }
+ cdata.file = popen(cmd, "w");
+ free(cmd);
+ }
+ if (cdata.file == NULL) {
+ cdata.error = errno;
+ }
+
+ return cdata;
+}
+
+/** Free the curl state. */
+static int curl_data_free(curl_data_t cdata) {
+ return pclose(cdata.file);
+}
+
+/** curl callback. Updates the hash, and writes to the file. */
+static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
+ curl_data_t* cdata = (curl_data_t*)ptr;
+ size_t const written = fwrite(data, size, count, cdata->file);
+ XXH64_update(&cdata->xxhash64, data, written * size);
+ return written;
+}
+
+static int curl_download_resource(
+ CURL* curl,
+ data_resource_t const* resource,
+ data_type_t type) {
+ curl_data_t cdata;
+ /* Download the data. */
+ if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
+ return EINVAL;
+ if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
+ return EINVAL;
+ cdata = curl_data_create(resource, type);
+ if (cdata.error != 0)
+ return cdata.error;
+ int const curl_err = curl_easy_perform(curl);
+ int const close_err = curl_data_free(cdata);
+ if (curl_err) {
+ fprintf(
+ stderr,
+ "downloading '%s' for '%s' failed\n",
+ resource->url,
+ resource->path);
+ return EIO;
+ }
+ if (close_err) {
+ fprintf(stderr, "writing data to '%s' failed\n", resource->path);
+ return EIO;
+ }
+ /* check that the file exists. */
+ if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
+ fprintf(stderr, "output file '%s' does not exist\n", resource->path);
+ return EIO;
+ }
+ if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
+ fprintf(
+ stderr, "output directory '%s' does not exist\n", resource->path);
+ return EIO;
+ }
+ /* Check that the hash matches. */
+ if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
+ fprintf(
+ stderr,
+ "checksum does not match: 0x%llxLL != 0x%llxLL\n",
+ (unsigned long long)XXH64_digest(&cdata.xxhash64),
+ (unsigned long long)resource->xxhash64);
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+/** Download a single data object. */
+static int curl_download_datum(CURL* curl, data_t const* data) {
+ int ret;
+ ret = curl_download_resource(curl, &data->data, data->type);
+ if (ret != 0)
+ return ret;
+ if (data_has_dict(data)) {
+ ret = curl_download_resource(curl, &data->dict, data_type_file);
+ if (ret != 0)
+ return ret;
+ }
+ return ret;
+}
+
+/** Download all the data. */
+static int curl_download_data(data_t const* const* data) {
+ if (curl_global_init(CURL_GLOBAL_ALL) != 0)
+ return EFAULT;
+
+ curl_data_t cdata = {};
+ CURL* curl = curl_easy_init();
+ int err = EFAULT;
+
+ if (curl == NULL)
+ return EFAULT;
+
+ if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
+ goto out;
+ if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
+ goto out;
+ if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
+ goto out;
+
+ assert(data != NULL);
+ for (; *data != NULL; ++data) {
+ if (curl_download_datum(curl, *data) != 0)
+ goto out;
+ }
+
+ err = 0;
+out:
+ curl_easy_cleanup(curl);
+ curl_global_cleanup();
+ return err;
+}
+
+/** Fill the path member variable of the data objects. */
+static int data_create_paths(data_t* const* data, char const* dir) {
+ size_t const dirlen = strlen(dir);
+ assert(data != NULL);
+ for (; *data != NULL; ++data) {
+ data_t* const datum = *data;
+ datum->data.path = cat3(dir, "/", datum->name);
+ if (datum->data.path == NULL)
+ return ENOMEM;
+ if (data_has_dict(datum)) {
+ datum->dict.path = cat2(datum->data.path, ".dict");
+ if (datum->dict.path == NULL)
+ return ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/** Free the path member variable of the data objects. */
+static void data_free_paths(data_t* const* data) {
+ assert(data != NULL);
+ for (; *data != NULL; ++data) {
+ data_t* datum = *data;
+ free((void*)datum->data.path);
+ free((void*)datum->dict.path);
+ datum->data.path = NULL;
+ datum->dict.path = NULL;
+ }
+}
+
+static char const kStampName[] = "STAMP";
+
+static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
+ if (!MEM_isLittleEndian())
+ data = MEM_swap64(data);
+ XXH64_update(state, &data, sizeof(data));
+}
+
+/** Hash the data to create the stamp. */
+static uint64_t stamp_hash(data_t const* const* data) {
+ XXH64_state_t state;
+
+ XXH64_reset(&state, 0);
+ assert(data != NULL);
+ for (; *data != NULL; ++data) {
+ data_t const* datum = *data;
+ /* We don't care about the URL that we fetch from. */
+ /* The path is derived from the name. */
+ XXH64_update(&state, datum->name, strlen(datum->name));
+ xxh_update_le(&state, datum->data.xxhash64);
+ xxh_update_le(&state, datum->dict.xxhash64);
+ xxh_update_le(&state, datum->type);
+ }
+ return XXH64_digest(&state);
+}
+
+/** Check if the stamp matches the stamp in the cache directory. */
+static int stamp_check(char const* dir, data_t const* const* data) {
+ char* stamp = cat3(dir, "/", kStampName);
+ uint64_t const expected = stamp_hash(data);
+ XXH64_canonical_t actual;
+ FILE* stampfile = NULL;
+ int matches = 0;
+
+ if (stamp == NULL)
+ goto out;
+ if (!UTIL_isRegularFile(stamp)) {
+ fprintf(stderr, "stamp does not exist: recreating the data cache\n");
+ goto out;
+ }
+
+ stampfile = fopen(stamp, "rb");
+ if (stampfile == NULL) {
+ fprintf(stderr, "could not open stamp: recreating the data cache\n");
+ goto out;
+ }
+
+ size_t b;
+ if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
+ fprintf(stderr, "invalid stamp: recreating the data cache\n");
+ goto out;
+ }
+
+ matches = (expected == XXH64_hashFromCanonical(&actual));
+ if (matches)
+ fprintf(stderr, "stamp matches: reusing the cached data\n");
+ else
+ fprintf(stderr, "stamp does not match: recreating the data cache\n");
+
+out:
+ free(stamp);
+ if (stampfile != NULL)
+ fclose(stampfile);
+ return matches;
+}
+
+/** On success write a new stamp, on failure delete the old stamp. */
+static int
+stamp_write(char const* dir, data_t const* const* data, int const data_err) {
+ char* stamp = cat3(dir, "/", kStampName);
+ FILE* stampfile = NULL;
+ int err = EIO;
+
+ if (stamp == NULL)
+ return ENOMEM;
+
+ if (data_err != 0) {
+ err = data_err;
+ goto out;
+ }
+ XXH64_canonical_t hash;
+
+ XXH64_canonicalFromHash(&hash, stamp_hash(data));
+
+ stampfile = fopen(stamp, "wb");
+ if (stampfile == NULL)
+ goto out;
+ if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
+ goto out;
+ err = 0;
+ fprintf(stderr, "stamped new data cache\n");
+out:
+ if (err != 0)
+ /* Ignore errors. */
+ unlink(stamp);
+ free(stamp);
+ if (stampfile != NULL)
+ fclose(stampfile);
+ return err;
+}
+
+int data_init(char const* dir) {
+ int err;
+
+ if (dir == NULL)
+ return EINVAL;
+
+ /* This must be first to simplify logic. */
+ err = ensure_directory_exists(dir);
+ if (err != 0)
+ return err;
+
+ /* Save the cache directory. */
+ g_data_dir = strdup(dir);
+ if (g_data_dir == NULL)
+ return ENOMEM;
+
+ err = data_create_paths(g_data, dir);
+ if (err != 0)
+ return err;
+
+ /* If the stamp matches then we are good to go.
+ * This must be called before any modifications to the data cache.
+ * After this point, we MUST call stamp_write() to update the STAMP,
+ * since we've updated the data cache.
+ */
+ if (stamp_check(dir, data))
+ return 0;
+
+ err = curl_download_data(data);
+ if (err != 0)
+ goto out;
+
+out:
+ /* This must be last, since it must know if data_init() succeeded. */
+ stamp_write(dir, data, err);
+ return err;
+}
+
+void data_finish(void) {
+ data_free_paths(g_data);
+ free(g_data_dir);
+ g_data_dir = NULL;
+}