diff options
Diffstat (limited to '')
-rw-r--r-- | src/zstd/tests/regression/data.c | 613 |
1 files changed, 613 insertions, 0 deletions
diff --git a/src/zstd/tests/regression/data.c b/src/zstd/tests/regression/data.c new file mode 100644 index 000000000..b75ac1192 --- /dev/null +++ b/src/zstd/tests/regression/data.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2016-2020, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "data.h" + +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> + +#include <sys/stat.h> + +#include <curl/curl.h> + +#include "mem.h" +#include "util.h" +#define XXH_STATIC_LINKING_ONLY +#include "xxhash.h" + +/** + * Data objects + */ + +#define REGRESSION_RELEASE(x) \ + "https://github.com/facebook/zstd/releases/download/regression-data/" x + +data_t silesia = { + .name = "silesia", + .type = data_type_dir, + .data = + { + .url = REGRESSION_RELEASE("silesia.tar.zst"), + .xxhash64 = 0x48a199f92f93e977LL, + }, +}; + +data_t silesia_tar = { + .name = "silesia.tar", + .type = data_type_file, + .data = + { + .url = REGRESSION_RELEASE("silesia.tar.zst"), + .xxhash64 = 0x48a199f92f93e977LL, + }, +}; + +data_t github = { + .name = "github", + .type = data_type_dir, + .data = + { + .url = REGRESSION_RELEASE("github.tar.zst"), + .xxhash64 = 0xa9b1b44b020df292LL, + }, + .dict = + { + .url = REGRESSION_RELEASE("github.dict.zst"), + .xxhash64 = 0x1eddc6f737d3cb53LL, + + }, +}; + +static data_t* g_data[] = { + &silesia, + &silesia_tar, + &github, + NULL, +}; + +data_t const* const* data = (data_t const* const*)g_data; + +/** + * data helpers. + */ + +int data_has_dict(data_t const* data) { + return data->dict.url != NULL; +} + +/** + * data buffer helper functions (documented in header). + */ + +data_buffer_t data_buffer_create(size_t const capacity) { + data_buffer_t buffer = {}; + + buffer.data = (uint8_t*)malloc(capacity); + if (buffer.data == NULL) + return buffer; + buffer.capacity = capacity; + return buffer; +} + +data_buffer_t data_buffer_read(char const* filename) { + data_buffer_t buffer = {}; + + uint64_t const size = UTIL_getFileSize(filename); + if (size == UTIL_FILESIZE_UNKNOWN) { + fprintf(stderr, "unknown size for %s\n", filename); + return buffer; + } + + buffer.data = (uint8_t*)malloc(size); + if (buffer.data == NULL) { + fprintf(stderr, "malloc failed\n"); + return buffer; + } + buffer.capacity = size; + + FILE* file = fopen(filename, "rb"); + if (file == NULL) { + fprintf(stderr, "file null\n"); + goto err; + } + buffer.size = fread(buffer.data, 1, buffer.capacity, file); + fclose(file); + if (buffer.size != buffer.capacity) { + fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity); + goto err; + } + + return buffer; +err: + free(buffer.data); + memset(&buffer, 0, sizeof(buffer)); + return buffer; +} + +data_buffer_t data_buffer_get_data(data_t const* data) { + data_buffer_t const kEmptyBuffer = {}; + + if (data->type != data_type_file) + return kEmptyBuffer; + + return data_buffer_read(data->data.path); +} + +data_buffer_t data_buffer_get_dict(data_t const* data) { + data_buffer_t const kEmptyBuffer = {}; + + if (!data_has_dict(data)) + return kEmptyBuffer; + + return data_buffer_read(data->dict.path); +} + +int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) { + size_t const size = + buffer1.size < buffer2.size ? buffer1.size : buffer2.size; + int const cmp = memcmp(buffer1.data, buffer2.data, size); + if (cmp != 0) + return cmp; + if (buffer1.size < buffer2.size) + return -1; + if (buffer1.size == buffer2.size) + return 0; + assert(buffer1.size > buffer2.size); + return 1; +} + +void data_buffer_free(data_buffer_t buffer) { + free(buffer.data); +} + +/** + * data filenames helpers. + */ + +FileNamesTable* data_filenames_get(data_t const* data) +{ + char const* const path = data->data.path; + return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ ); +} + +/** + * data buffers helpers. + */ + +data_buffers_t data_buffers_get(data_t const* data) { + data_buffers_t buffers = {.size = 0}; + FileNamesTable* const filenames = data_filenames_get(data); + if (filenames == NULL) return buffers; + if (filenames->tableSize == 0) { + UTIL_freeFileNamesTable(filenames); + return buffers; + } + + data_buffer_t* buffersPtr = + (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr)); + if (buffersPtr == NULL) { + UTIL_freeFileNamesTable(filenames); + return buffers; + } + buffers.buffers = (data_buffer_t const*)buffersPtr; + buffers.size = filenames->tableSize; + + for (size_t i = 0; i < filenames->tableSize; ++i) { + buffersPtr[i] = data_buffer_read(filenames->fileNames[i]); + if (buffersPtr[i].data == NULL) { + data_buffers_t const kEmptyBuffer = {}; + data_buffers_free(buffers); + UTIL_freeFileNamesTable(filenames); + return kEmptyBuffer; + } + } + + UTIL_freeFileNamesTable(filenames); + return buffers; +} + +/** + * Frees the data buffers. + */ +void data_buffers_free(data_buffers_t buffers) { + free((data_buffer_t*)buffers.buffers); +} + +/** + * Initialization and download functions. + */ + +static char* g_data_dir = NULL; + +/* mkdir -p */ +static int ensure_directory_exists(char const* indir) { + char* const dir = strdup(indir); + char* end = dir; + int ret = 0; + if (dir == NULL) { + ret = EINVAL; + goto out; + } + do { + /* Find the next directory level. */ + for (++end; *end != '\0' && *end != '/'; ++end) + ; + /* End the string there, make the directory, and restore the string. */ + char const save = *end; + *end = '\0'; + int const isdir = UTIL_isDirectory(dir); + ret = mkdir(dir, S_IRWXU); + *end = save; + /* Its okay if the directory already exists. */ + if (ret == 0 || (errno == EEXIST && isdir)) + continue; + ret = errno; + fprintf(stderr, "mkdir() failed\n"); + goto out; + } while (*end != '\0'); + + ret = 0; +out: + free(dir); + return ret; +} + +/** Concatenate 3 strings into a new buffer. */ +static char* cat3(char const* str1, char const* str2, char const* str3) { + size_t const size1 = strlen(str1); + size_t const size2 = strlen(str2); + size_t const size3 = str3 == NULL ? 0 : strlen(str3); + size_t const size = size1 + size2 + size3 + 1; + char* const dst = (char*)malloc(size); + if (dst == NULL) + return NULL; + strcpy(dst, str1); + strcpy(dst + size1, str2); + if (str3 != NULL) + strcpy(dst + size1 + size2, str3); + assert(strlen(dst) == size1 + size2 + size3); + return dst; +} + +static char* cat2(char const* str1, char const* str2) { + return cat3(str1, str2, NULL); +} + +/** + * State needed by the curl callback. + * It takes data from curl, hashes it, and writes it to the file. + */ +typedef struct { + FILE* file; + XXH64_state_t xxhash64; + int error; +} curl_data_t; + +/** Create the curl state. */ +static curl_data_t curl_data_create( + data_resource_t const* resource, + data_type_t type) { + curl_data_t cdata = {}; + + XXH64_reset(&cdata.xxhash64, 0); + + assert(UTIL_isDirectory(g_data_dir)); + + if (type == data_type_file) { + /* Decompress the resource and store to the path. */ + char* cmd = cat3("zstd -dqfo '", resource->path, "'"); + if (cmd == NULL) { + cdata.error = ENOMEM; + return cdata; + } + cdata.file = popen(cmd, "w"); + free(cmd); + } else { + /* Decompress and extract the resource to the cache directory. */ + char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'"); + if (cmd == NULL) { + cdata.error = ENOMEM; + return cdata; + } + cdata.file = popen(cmd, "w"); + free(cmd); + } + if (cdata.file == NULL) { + cdata.error = errno; + } + + return cdata; +} + +/** Free the curl state. */ +static int curl_data_free(curl_data_t cdata) { + return pclose(cdata.file); +} + +/** curl callback. Updates the hash, and writes to the file. */ +static size_t curl_write(void* data, size_t size, size_t count, void* ptr) { + curl_data_t* cdata = (curl_data_t*)ptr; + size_t const written = fwrite(data, size, count, cdata->file); + XXH64_update(&cdata->xxhash64, data, written * size); + return written; +} + +static int curl_download_resource( + CURL* curl, + data_resource_t const* resource, + data_type_t type) { + curl_data_t cdata; + /* Download the data. */ + if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0) + return EINVAL; + if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0) + return EINVAL; + cdata = curl_data_create(resource, type); + if (cdata.error != 0) + return cdata.error; + int const curl_err = curl_easy_perform(curl); + int const close_err = curl_data_free(cdata); + if (curl_err) { + fprintf( + stderr, + "downloading '%s' for '%s' failed\n", + resource->url, + resource->path); + return EIO; + } + if (close_err) { + fprintf(stderr, "writing data to '%s' failed\n", resource->path); + return EIO; + } + /* check that the file exists. */ + if (type == data_type_file && !UTIL_isRegularFile(resource->path)) { + fprintf(stderr, "output file '%s' does not exist\n", resource->path); + return EIO; + } + if (type == data_type_dir && !UTIL_isDirectory(resource->path)) { + fprintf( + stderr, "output directory '%s' does not exist\n", resource->path); + return EIO; + } + /* Check that the hash matches. */ + if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) { + fprintf( + stderr, + "checksum does not match: 0x%llxLL != 0x%llxLL\n", + (unsigned long long)XXH64_digest(&cdata.xxhash64), + (unsigned long long)resource->xxhash64); + return EINVAL; + } + + return 0; +} + +/** Download a single data object. */ +static int curl_download_datum(CURL* curl, data_t const* data) { + int ret; + ret = curl_download_resource(curl, &data->data, data->type); + if (ret != 0) + return ret; + if (data_has_dict(data)) { + ret = curl_download_resource(curl, &data->dict, data_type_file); + if (ret != 0) + return ret; + } + return ret; +} + +/** Download all the data. */ +static int curl_download_data(data_t const* const* data) { + if (curl_global_init(CURL_GLOBAL_ALL) != 0) + return EFAULT; + + curl_data_t cdata = {}; + CURL* curl = curl_easy_init(); + int err = EFAULT; + + if (curl == NULL) + return EFAULT; + + if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0) + goto out; + if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0) + goto out; + if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0) + goto out; + + assert(data != NULL); + for (; *data != NULL; ++data) { + if (curl_download_datum(curl, *data) != 0) + goto out; + } + + err = 0; +out: + curl_easy_cleanup(curl); + curl_global_cleanup(); + return err; +} + +/** Fill the path member variable of the data objects. */ +static int data_create_paths(data_t* const* data, char const* dir) { + size_t const dirlen = strlen(dir); + assert(data != NULL); + for (; *data != NULL; ++data) { + data_t* const datum = *data; + datum->data.path = cat3(dir, "/", datum->name); + if (datum->data.path == NULL) + return ENOMEM; + if (data_has_dict(datum)) { + datum->dict.path = cat2(datum->data.path, ".dict"); + if (datum->dict.path == NULL) + return ENOMEM; + } + } + return 0; +} + +/** Free the path member variable of the data objects. */ +static void data_free_paths(data_t* const* data) { + assert(data != NULL); + for (; *data != NULL; ++data) { + data_t* datum = *data; + free((void*)datum->data.path); + free((void*)datum->dict.path); + datum->data.path = NULL; + datum->dict.path = NULL; + } +} + +static char const kStampName[] = "STAMP"; + +static void xxh_update_le(XXH64_state_t* state, uint64_t data) { + if (!MEM_isLittleEndian()) + data = MEM_swap64(data); + XXH64_update(state, &data, sizeof(data)); +} + +/** Hash the data to create the stamp. */ +static uint64_t stamp_hash(data_t const* const* data) { + XXH64_state_t state; + + XXH64_reset(&state, 0); + assert(data != NULL); + for (; *data != NULL; ++data) { + data_t const* datum = *data; + /* We don't care about the URL that we fetch from. */ + /* The path is derived from the name. */ + XXH64_update(&state, datum->name, strlen(datum->name)); + xxh_update_le(&state, datum->data.xxhash64); + xxh_update_le(&state, datum->dict.xxhash64); + xxh_update_le(&state, datum->type); + } + return XXH64_digest(&state); +} + +/** Check if the stamp matches the stamp in the cache directory. */ +static int stamp_check(char const* dir, data_t const* const* data) { + char* stamp = cat3(dir, "/", kStampName); + uint64_t const expected = stamp_hash(data); + XXH64_canonical_t actual; + FILE* stampfile = NULL; + int matches = 0; + + if (stamp == NULL) + goto out; + if (!UTIL_isRegularFile(stamp)) { + fprintf(stderr, "stamp does not exist: recreating the data cache\n"); + goto out; + } + + stampfile = fopen(stamp, "rb"); + if (stampfile == NULL) { + fprintf(stderr, "could not open stamp: recreating the data cache\n"); + goto out; + } + + size_t b; + if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) { + fprintf(stderr, "invalid stamp: recreating the data cache\n"); + goto out; + } + + matches = (expected == XXH64_hashFromCanonical(&actual)); + if (matches) + fprintf(stderr, "stamp matches: reusing the cached data\n"); + else + fprintf(stderr, "stamp does not match: recreating the data cache\n"); + +out: + free(stamp); + if (stampfile != NULL) + fclose(stampfile); + return matches; +} + +/** On success write a new stamp, on failure delete the old stamp. */ +static int +stamp_write(char const* dir, data_t const* const* data, int const data_err) { + char* stamp = cat3(dir, "/", kStampName); + FILE* stampfile = NULL; + int err = EIO; + + if (stamp == NULL) + return ENOMEM; + + if (data_err != 0) { + err = data_err; + goto out; + } + XXH64_canonical_t hash; + + XXH64_canonicalFromHash(&hash, stamp_hash(data)); + + stampfile = fopen(stamp, "wb"); + if (stampfile == NULL) + goto out; + if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1) + goto out; + err = 0; + fprintf(stderr, "stamped new data cache\n"); +out: + if (err != 0) + /* Ignore errors. */ + unlink(stamp); + free(stamp); + if (stampfile != NULL) + fclose(stampfile); + return err; +} + +int data_init(char const* dir) { + int err; + + if (dir == NULL) + return EINVAL; + + /* This must be first to simplify logic. */ + err = ensure_directory_exists(dir); + if (err != 0) + return err; + + /* Save the cache directory. */ + g_data_dir = strdup(dir); + if (g_data_dir == NULL) + return ENOMEM; + + err = data_create_paths(g_data, dir); + if (err != 0) + return err; + + /* If the stamp matches then we are good to go. + * This must be called before any modifications to the data cache. + * After this point, we MUST call stamp_write() to update the STAMP, + * since we've updated the data cache. + */ + if (stamp_check(dir, data)) + return 0; + + err = curl_download_data(data); + if (err != 0) + goto out; + +out: + /* This must be last, since it must know if data_init() succeeded. */ + stamp_write(dir, data, err); + return err; +} + +void data_finish(void) { + data_free_paths(g_data); + free(g_data_dir); + g_data_dir = NULL; +} |