diff options
Diffstat (limited to 'src/zstd/contrib/seekable_format')
10 files changed, 1853 insertions, 0 deletions
diff --git a/src/zstd/contrib/seekable_format/examples/.gitignore b/src/zstd/contrib/seekable_format/examples/.gitignore new file mode 100644 index 00000000..df2f9ab0 --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/.gitignore @@ -0,0 +1,4 @@ +seekable_compression +seekable_decompression +parallel_processing +parallel_compression diff --git a/src/zstd/contrib/seekable_format/examples/Makefile b/src/zstd/contrib/seekable_format/examples/Makefile new file mode 100644 index 00000000..1847aa7e --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/Makefile @@ -0,0 +1,42 @@ +# ################################################################ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under both the BSD-style license (found in the +# LICENSE file in the root directory of this source tree) and the GPLv2 (found +# in the COPYING file in the root directory of this source tree). +# ################################################################ + +# This Makefile presumes libzstd is built, using `make` in / or /lib/ + +LDFLAGS += ../../../lib/libzstd.a +CPPFLAGS += -I../ -I../../../lib -I../../../lib/common + +CFLAGS ?= -O3 +CFLAGS += -g + +SEEKABLE_OBJS = ../zstdseek_compress.c ../zstdseek_decompress.c + +.PHONY: default all clean test + +default: all + +all: seekable_compression seekable_decompression parallel_processing + +seekable_compression : seekable_compression.c $(SEEKABLE_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +seekable_decompression : seekable_decompression.c $(SEEKABLE_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +parallel_processing : parallel_processing.c $(SEEKABLE_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -pthread + +parallel_compression : parallel_compression.c $(SEEKABLE_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -pthread + +clean: + @rm -f core *.o tmp* result* *.zst \ + seekable_compression seekable_decompression \ + 
parallel_processing parallel_compression + @echo Cleaning completed diff --git a/src/zstd/contrib/seekable_format/examples/parallel_compression.c b/src/zstd/contrib/seekable_format/examples/parallel_compression.c new file mode 100644 index 00000000..69644d2b --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/parallel_compression.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include <stdlib.h> // malloc, free, exit, atoi +#include <stdio.h> // fprintf, perror, feof, fopen, etc. +#include <string.h> // strlen, memset, strcat +#define ZSTD_STATIC_LINKING_ONLY +#include <zstd.h> // presumes zstd library is installed +#include <zstd_errors.h> +#if defined(WIN32) || defined(_WIN32) +# include <windows.h> +# define SLEEP(x) Sleep(x) +#else +# include <unistd.h> +# define SLEEP(x) usleep(x * 1000) +#endif + +#define XXH_NAMESPACE ZSTD_ +#include "xxhash.h" + +#include "pool.h" // use zstd thread pool for demo + +#include "zstd_seekable.h" + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc:"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* 
file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static void fseek_orDie(FILE* file, long int offset, int origin) +{ + if (!fseek(file, offset, origin)) { + if (!fflush(file)) return; + } + /* error */ + perror("fseek"); + exit(7); +} + +static long int ftell_orDie(FILE* file) +{ + long int off = ftell(file); + if (off != -1) return off; + /* error */ + perror("ftell"); + exit(8); +} + +struct job { + const void* src; + size_t srcSize; + void* dst; + size_t dstSize; + + unsigned checksum; + + int compressionLevel; + int done; +}; + +static void compressFrame(void* opaque) +{ + struct job* job = opaque; + + job->checksum = XXH64(job->src, job->srcSize, 0); + + size_t ret = ZSTD_compress(job->dst, job->dstSize, job->src, job->srcSize, job->compressionLevel); + if (ZSTD_isError(ret)) { + fprintf(stderr, "ZSTD_compress() error : %s \n", ZSTD_getErrorName(ret)); + exit(20); + } + + job->dstSize = ret; + job->done = 1; +} + +static void compressFile_orDie(const char* fname, const char* outName, int cLevel, unsigned frameSize, int nbThreads) +{ + POOL_ctx* pool = POOL_create(nbThreads, nbThreads); + if (pool == NULL) { fprintf(stderr, "POOL_create() error \n"); exit(9); } + + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = fopen_orDie(outName, "wb"); + + if (ZSTD_compressBound(frameSize) > 0xFFFFFFFFU) { fprintf(stderr, "Frame size too large \n"); exit(10); } + unsigned dstSize = ZSTD_compressBound(frameSize); + + + fseek_orDie(fin, 0, SEEK_END); + long int length = ftell_orDie(fin); + fseek_orDie(fin, 0, SEEK_SET); + + size_t numFrames = (length + frameSize - 1) / frameSize; + + struct job* jobs = malloc_orDie(sizeof(struct job) * numFrames); + + size_t i; + for(i = 0; i < numFrames; i++) { + 
void* in = malloc_orDie(frameSize); + void* out = malloc_orDie(dstSize); + + size_t inSize = fread_orDie(in, frameSize, fin); + + jobs[i].src = in; + jobs[i].srcSize = inSize; + jobs[i].dst = out; + jobs[i].dstSize = dstSize; + jobs[i].compressionLevel = cLevel; + jobs[i].done = 0; + POOL_add(pool, compressFrame, &jobs[i]); + } + + ZSTD_frameLog* fl = ZSTD_seekable_createFrameLog(1); + if (fl == NULL) { fprintf(stderr, "ZSTD_seekable_createFrameLog() failed \n"); exit(11); } + for (i = 0; i < numFrames; i++) { + while (!jobs[i].done) SLEEP(5); /* wake up every 5 milliseconds to check */ + fwrite_orDie(jobs[i].dst, jobs[i].dstSize, fout); + free((void*)jobs[i].src); + free(jobs[i].dst); + + size_t ret = ZSTD_seekable_logFrame(fl, jobs[i].dstSize, jobs[i].srcSize, jobs[i].checksum); + if (ZSTD_isError(ret)) { fprintf(stderr, "ZSTD_seekable_logFrame() error : %s \n", ZSTD_getErrorName(ret)); } + } + + { unsigned char seekTableBuff[1024]; + ZSTD_outBuffer out = {seekTableBuff, 1024, 0}; + while (ZSTD_seekable_writeSeekTable(fl, &out) != 0) { + fwrite_orDie(seekTableBuff, out.pos, fout); + out.pos = 0; + } + fwrite_orDie(seekTableBuff, out.pos, fout); + } + + ZSTD_seekable_freeFrameLog(fl); + free(jobs); + fclose_orDie(fout); + fclose_orDie(fin); +} + +static const char* createOutFilename_orDie(const char* filename) +{ + size_t const inL = strlen(filename); + size_t const outL = inL + 5; + void* outSpace = malloc_orDie(outL); + memset(outSpace, 0, outL); + strcat(outSpace, filename); + strcat(outSpace, ".zst"); + return (const char*)outSpace; +} + +int main(int argc, const char** argv) { + const char* const exeName = argv[0]; + if (argc!=4) { + printf("wrong arguments\n"); + printf("usage:\n"); + printf("%s FILE FRAME_SIZE NB_THREADS\n", exeName); + return 1; + } + + { const char* const inFileName = argv[1]; + unsigned const frameSize = (unsigned)atoi(argv[2]); + int const nbThreads = atoi(argv[3]); + + const char* const outFileName = 
createOutFilename_orDie(inFileName); + compressFile_orDie(inFileName, outFileName, 5, frameSize, nbThreads); + } + + return 0; +} diff --git a/src/zstd/contrib/seekable_format/examples/parallel_processing.c b/src/zstd/contrib/seekable_format/examples/parallel_processing.c new file mode 100644 index 00000000..da347763 --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/parallel_processing.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +/* + * A simple demo that sums up all the bytes in the file in parallel using + * seekable decompression and the zstd thread pool + */ + +#include <stdlib.h> // malloc, exit +#include <stdio.h> // fprintf, perror, feof +#include <string.h> // strerror +#include <errno.h> // errno +#define ZSTD_STATIC_LINKING_ONLY +#include <zstd.h> // presumes zstd library is installed +#include <zstd_errors.h> +#if defined(WIN32) || defined(_WIN32) +# include <windows.h> +# define SLEEP(x) Sleep(x) +#else +# include <unistd.h> +# define SLEEP(x) usleep(x * 1000) +#endif + +#include "pool.h" // use zstd thread pool for demo + +#include "zstd_seekable.h" + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc"); + exit(1); +} + +static void* realloc_orDie(void* ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr) return ptr; + /* error */ + perror("realloc"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static void fseek_orDie(FILE* file, long int offset, int origin) { + if (!fseek(file, offset, origin)) { + if (!fflush(file)) return; + } + /* error */ + perror("fseek"); + exit(7); +} + +struct sum_job { + const char* fname; + unsigned long long sum; + unsigned frameNb; + int done; +}; + +static void sumFrame(void* opaque) +{ + struct sum_job* job = (struct sum_job*)opaque; + job->done = 0; + + FILE* const fin = fopen_orDie(job->fname, "rb"); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + size_t const initResult = ZSTD_seekable_initFile(seekable, fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", 
ZSTD_getErrorName(initResult)); exit(11); } + + size_t const frameSize = ZSTD_seekable_getFrameDecompressedSize(seekable, job->frameNb); + unsigned char* data = malloc_orDie(frameSize); + + size_t result = ZSTD_seekable_decompressFrame(seekable, data, frameSize, job->frameNb); + if (ZSTD_isError(result)) { fprintf(stderr, "ZSTD_seekable_decompressFrame() error : %s \n", ZSTD_getErrorName(result)); exit(12); } + + unsigned long long sum = 0; + size_t i; + for (i = 0; i < frameSize; i++) { + sum += data[i]; + } + job->sum = sum; + job->done = 1; + + fclose(fin); + ZSTD_seekable_free(seekable); + free(data); +} + +static void sumFile_orDie(const char* fname, int nbThreads) +{ + POOL_ctx* pool = POOL_create(nbThreads, nbThreads); + if (pool == NULL) { fprintf(stderr, "POOL_create() error \n"); exit(9); } + + FILE* const fin = fopen_orDie(fname, "rb"); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + size_t const initResult = ZSTD_seekable_initFile(seekable, fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + size_t const numFrames = ZSTD_seekable_getNumFrames(seekable); + struct sum_job* jobs = (struct sum_job*)malloc(numFrames * sizeof(struct sum_job)); + + size_t i; + for (i = 0; i < numFrames; i++) { + jobs[i] = (struct sum_job){ fname, 0, i, 0 }; + POOL_add(pool, sumFrame, &jobs[i]); + } + + unsigned long long total = 0; + + for (i = 0; i < numFrames; i++) { + while (!jobs[i].done) SLEEP(5); /* wake up every 5 milliseconds to check */ + total += jobs[i].sum; + } + + printf("Sum: %llu\n", total); + + POOL_free(pool); + ZSTD_seekable_free(seekable); + fclose(fin); + free(jobs); +} + + +int main(int argc, const char** argv) +{ + const char* const exeName = argv[0]; + + if (argc!=3) { + fprintf(stderr, "wrong arguments\n"); + fprintf(stderr, "usage:\n"); + fprintf(stderr, "%s 
FILE NB_THREADS\n", exeName); + return 1; + } + + { + const char* const inFilename = argv[1]; + int const nbThreads = atoi(argv[2]); + sumFile_orDie(inFilename, nbThreads); + } + + return 0; +} diff --git a/src/zstd/contrib/seekable_format/examples/seekable_compression.c b/src/zstd/contrib/seekable_format/examples/seekable_compression.c new file mode 100644 index 00000000..9485bf26 --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/seekable_compression.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + +#include <stdlib.h> // malloc, free, exit, atoi +#include <stdio.h> // fprintf, perror, feof, fopen, etc. +#include <string.h> // strlen, memset, strcat +#define ZSTD_STATIC_LINKING_ONLY +#include <zstd.h> // presumes zstd library is installed + +#include "zstd_seekable.h" + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc:"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + 
perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static void compressFile_orDie(const char* fname, const char* outName, int cLevel, unsigned frameSize) +{ + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = fopen_orDie(outName, "wb"); + size_t const buffInSize = ZSTD_CStreamInSize(); /* can always read one full block */ + void* const buffIn = malloc_orDie(buffInSize); + size_t const buffOutSize = ZSTD_CStreamOutSize(); /* can always flush a full block */ + void* const buffOut = malloc_orDie(buffOutSize); + + ZSTD_seekable_CStream* const cstream = ZSTD_seekable_createCStream(); + if (cstream==NULL) { fprintf(stderr, "ZSTD_seekable_createCStream() error \n"); exit(10); } + size_t const initResult = ZSTD_seekable_initCStream(cstream, cLevel, 1, frameSize); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + size_t read, toRead = buffInSize; + while( (read = fread_orDie(buffIn, toRead, fin)) ) { + ZSTD_inBuffer input = { buffIn, read, 0 }; + while (input.pos < input.size) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + toRead = ZSTD_seekable_compressStream(cstream, &output , &input); /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */ + if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_seekable_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); } + if (toRead > buffInSize) toRead = buffInSize; /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/ + fwrite_orDie(buffOut, output.pos, fout); + } + } + + while (1) { + ZSTD_outBuffer output = { buffOut, buffOutSize, 0 }; + size_t const remainingToFlush = ZSTD_seekable_endStream(cstream, &output); /* close stream */ + if (ZSTD_isError(remainingToFlush)) { fprintf(stderr, "ZSTD_seekable_endStream() error : %s \n", 
ZSTD_getErrorName(remainingToFlush)); exit(13); } + fwrite_orDie(buffOut, output.pos, fout); + if (!remainingToFlush) break; + } + + ZSTD_seekable_freeCStream(cstream); + fclose_orDie(fout); + fclose_orDie(fin); + free(buffIn); + free(buffOut); +} + +static const char* createOutFilename_orDie(const char* filename) +{ + size_t const inL = strlen(filename); + size_t const outL = inL + 5; + void* outSpace = malloc_orDie(outL); + memset(outSpace, 0, outL); + strcat(outSpace, filename); + strcat(outSpace, ".zst"); + return (const char*)outSpace; +} + +int main(int argc, const char** argv) { + const char* const exeName = argv[0]; + if (argc!=3) { + printf("wrong arguments\n"); + printf("usage:\n"); + printf("%s FILE FRAME_SIZE\n", exeName); + return 1; + } + + { const char* const inFileName = argv[1]; + unsigned const frameSize = (unsigned)atoi(argv[2]); + + const char* const outFileName = createOutFilename_orDie(inFileName); + compressFile_orDie(inFileName, outFileName, 5, frameSize); + } + + return 0; +} diff --git a/src/zstd/contrib/seekable_format/examples/seekable_decompression.c b/src/zstd/contrib/seekable_format/examples/seekable_decompression.c new file mode 100644 index 00000000..9cd23292 --- /dev/null +++ b/src/zstd/contrib/seekable_format/examples/seekable_decompression.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ + + +#include <stdlib.h> // malloc, exit +#include <stdio.h> // fprintf, perror, feof +#include <string.h> // strerror +#include <errno.h> // errno +#define ZSTD_STATIC_LINKING_ONLY +#include <zstd.h> // presumes zstd library is installed +#include <zstd_errors.h> + +#include "zstd_seekable.h" + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +static void* malloc_orDie(size_t size) +{ + void* const buff = malloc(size); + if (buff) return buff; + /* error */ + perror("malloc"); + exit(1); +} + +static void* realloc_orDie(void* ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr) return ptr; + /* error */ + perror("realloc"); + exit(1); +} + +static FILE* fopen_orDie(const char *filename, const char *instruction) +{ + FILE* const inFile = fopen(filename, instruction); + if (inFile) return inFile; + /* error */ + perror(filename); + exit(3); +} + +static size_t fread_orDie(void* buffer, size_t sizeToRead, FILE* file) +{ + size_t const readSize = fread(buffer, 1, sizeToRead, file); + if (readSize == sizeToRead) return readSize; /* good */ + if (feof(file)) return readSize; /* good, reached end of file */ + /* error */ + perror("fread"); + exit(4); +} + +static size_t fwrite_orDie(const void* buffer, size_t sizeToWrite, FILE* file) +{ + size_t const writtenSize = fwrite(buffer, 1, sizeToWrite, file); + if (writtenSize == sizeToWrite) return sizeToWrite; /* good */ + /* error */ + perror("fwrite"); + exit(5); +} + +static size_t fclose_orDie(FILE* file) +{ + if (!fclose(file)) return 0; + /* error */ + perror("fclose"); + exit(6); +} + +static void fseek_orDie(FILE* file, long int offset, int origin) { + if (!fseek(file, offset, origin)) { + if (!fflush(file)) return; + } + /* error */ + perror("fseek"); + exit(7); +} + + +static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset) +{ + FILE* const fin = fopen_orDie(fname, "rb"); + FILE* const fout = stdout; + size_t const buffOutSize = ZSTD_DStreamOutSize(); /* Guarantee to successfully flush at least one complete compressed block in all circumstances. 
*/ + void* const buffOut = malloc_orDie(buffOutSize); + + ZSTD_seekable* const seekable = ZSTD_seekable_create(); + if (seekable==NULL) { fprintf(stderr, "ZSTD_seekable_create() error \n"); exit(10); } + + size_t const initResult = ZSTD_seekable_initFile(seekable, fin); + if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_seekable_init() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); } + + while (startOffset < endOffset) { + size_t const result = ZSTD_seekable_decompress(seekable, buffOut, MIN(endOffset - startOffset, buffOutSize), startOffset); + + if (ZSTD_isError(result)) { + fprintf(stderr, "ZSTD_seekable_decompress() error : %s \n", + ZSTD_getErrorName(result)); + exit(12); + } + fwrite_orDie(buffOut, result, fout); + startOffset += result; + } + + ZSTD_seekable_free(seekable); + fclose_orDie(fin); + fclose_orDie(fout); + free(buffOut); +} + + +int main(int argc, const char** argv) +{ + const char* const exeName = argv[0]; + + if (argc!=4) { + fprintf(stderr, "wrong arguments\n"); + fprintf(stderr, "usage:\n"); + fprintf(stderr, "%s FILE START END\n", exeName); + return 1; + } + + { + const char* const inFilename = argv[1]; + unsigned const startOffset = (unsigned) atoi(argv[2]); + unsigned const endOffset = (unsigned) atoi(argv[3]); + decompressFile_orDie(inFilename, startOffset, endOffset); + } + + return 0; +} diff --git a/src/zstd/contrib/seekable_format/zstd_seekable.h b/src/zstd/contrib/seekable_format/zstd_seekable.h new file mode 100644 index 00000000..438ac201 --- /dev/null +++ b/src/zstd/contrib/seekable_format/zstd_seekable.h @@ -0,0 +1,184 @@ +#ifndef SEEKABLE_H +#define SEEKABLE_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include <stdio.h> + +static const unsigned ZSTD_seekTableFooterSize = 9; + +#define ZSTD_SEEKABLE_MAGICNUMBER 0x8F92EAB1 + +#define ZSTD_SEEKABLE_MAXFRAMES 0x8000000U + +/* Limit the maximum size to avoid any potential issues storing the compressed size */ +#define 
ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE 0x80000000U + +/*-**************************************************************************** +* Seekable Format +* +* The seekable format splits the compressed data into a series of "frames", +* each compressed individually so that decompression of a section in the +* middle of an archive only requires zstd to decompress at most a frame's +* worth of extra data, instead of the entire archive. +******************************************************************************/ + +typedef struct ZSTD_seekable_CStream_s ZSTD_seekable_CStream; +typedef struct ZSTD_seekable_s ZSTD_seekable; + +/*-**************************************************************************** +* Seekable compression - HowTo +* A ZSTD_seekable_CStream object is required to track streaming operations. +* Use ZSTD_seekable_createCStream() and ZSTD_seekable_freeCStream() to create/ +* release resources. +* +* Streaming objects are reusable to avoid allocation and deallocation, +* to start a new compression operation call ZSTD_seekable_initCStream() on the +* compressor. +* +* Data streamed to the seekable compressor will automatically be split into +* frames of size `maxFrameSize` (provided in ZSTD_seekable_initCStream()), +* or if none is provided, will be cut off whenever ZSTD_seekable_endFrame() is +* called or when the default maximum frame size (2GB) is reached. +* +* Use ZSTD_seekable_initCStream() to initialize a ZSTD_seekable_CStream object +* for a new compression operation. +* `maxFrameSize` indicates the size at which to automatically start a new +* seekable frame. `maxFrameSize == 0` implies the default maximum size. +* `checksumFlag` indicates whether or not the seek table should include frame +* checksums on the uncompressed data for verification. +* @return : a size hint for input to provide for compression, or an error code +* checkable with ZSTD_isError() +* +* Use ZSTD_seekable_compressStream() repeatedly to consume the input stream. 
+* The function will automatically update both `pos` fields. +* Note that it may not consume the entire input, in which case `pos < size`, +* and it's up to the caller to present again remaining data. +* @return : a size hint, preferred nb of bytes to use as input for next +* function call or an error code, which can be tested using +* ZSTD_isError(). +* Note 1 : it's just a hint, to help latency a little, any other +* value will work fine. +* +* At any time, call ZSTD_seekable_endFrame() to end the current frame and +* start a new one. +* +* ZSTD_seekable_endStream() will end the current frame, and then write the seek +* table so that decompressors can efficiently find compressed frames. +* ZSTD_seekable_endStream() may return a number > 0 if it was unable to flush +* all the necessary data to `output`. In this case, it should be called again +* until all remaining data is flushed out and 0 is returned. +******************************************************************************/ + +/*===== Seekable compressor management =====*/ +ZSTDLIB_API ZSTD_seekable_CStream* ZSTD_seekable_createCStream(void); +ZSTDLIB_API size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs); + +/*===== Seekable compression functions =====*/ +ZSTDLIB_API size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs, int compressionLevel, int checksumFlag, unsigned maxFrameSize); +ZSTDLIB_API size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input); +ZSTDLIB_API size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); +ZSTDLIB_API size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output); + +/*= Raw seek table API + * These functions allow for the seek table to be constructed directly. + * This table can then be appended to a file of concatenated frames. 
+ * This allows the frames to be compressed independently, even in parallel, + * and compiled together afterward into a seekable archive. + * + * Use ZSTD_seekable_createFrameLog() to allocate and initialize a tracking + * structure. + * + * Call ZSTD_seekable_logFrame() once for each frame in the archive. + * checksum is optional, and will not be used if checksumFlag was 0 when the + * frame log was created. If present, it should be the least significant 32 + * bits of the XXH64 hash of the uncompressed data. + * + * Call ZSTD_seekable_writeSeekTable to serialize the data into a seek table. + * If the entire table was written, the return value will be 0. Otherwise, + * it will be equal to the number of bytes left to write. */ +typedef struct ZSTD_frameLog_s ZSTD_frameLog; +ZSTDLIB_API ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag); +ZSTDLIB_API size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl); +ZSTDLIB_API size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl, unsigned compressedSize, unsigned decompressedSize, unsigned checksum); +ZSTDLIB_API size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output); + +/*-**************************************************************************** +* Seekable decompression - HowTo +* A ZSTD_seekable object is required to track the seek table. +* +* Call ZSTD_seekable_init* to initialize a ZSTD_seekable object with the +* seek table provided in the input. +* There are three modes for ZSTD_seekable_init: +* - ZSTD_seekable_initBuff() : An in-memory API. The data contained in +* `src` should be the entire seekable file, including the seek table. +* `src` should be kept alive and unmodified until the ZSTD_seekable object +* is freed or reset. +* - ZSTD_seekable_initFile() : A simplified file API using stdio. fread and +* fseek will be used to access the required data for building the seek +* table and doing decompression operations. 
`src` should not be closed +* or modified until the ZSTD_seekable object is freed or reset. +* - ZSTD_seekable_initAdvanced() : A general API allowing the client to +* provide its own read and seek callbacks. +* + ZSTD_seekable_read() : read exactly `n` bytes into `buffer`. +* Premature EOF should be treated as an error. +* + ZSTD_seekable_seek() : seek the read head to `offset` from `origin`, +* where origin is either SEEK_SET (beginning of +* file), or SEEK_END (end of file). +* Both functions should return a non-negative value in case of success, and a +* negative value in case of failure. If implementing using this API and +* stdio, be careful with files larger than 4GB and fseek. All of these +* functions return an error code checkable with ZSTD_isError(). +* +* Call ZSTD_seekable_decompress to decompress `dstSize` bytes at decompressed +* offset `offset`. ZSTD_seekable_decompress may have to decompress the entire +* prefix of the frame before the desired data if it has not already processed +* this section. If ZSTD_seekable_decompress is called multiple times for a +* consecutive range of data, it will efficiently retain the decompressor object +* and avoid redecompressing frame prefixes. The return value is the number of +* bytes decompressed, or an error code checkable with ZSTD_isError(). +* +* The seek table access functions can be used to obtain the data contained +* in the seek table. If frameIndex is larger than the value returned by +* ZSTD_seekable_getNumFrames(), they will return error codes checkable with +* ZSTD_isError(). Note that since the offset access functions return +* unsigned long long instead of size_t, in this case they will instead return +* the value ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE. 
+******************************************************************************/ + +/*===== Seekable decompressor management =====*/ +ZSTDLIB_API ZSTD_seekable* ZSTD_seekable_create(void); +ZSTDLIB_API size_t ZSTD_seekable_free(ZSTD_seekable* zs); + +/*===== Seekable decompression functions =====*/ +ZSTDLIB_API size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src); +ZSTDLIB_API size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned long long offset); +ZSTDLIB_API size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex); + +#define ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE (0ULL-2) +/*===== Seek Table access functions =====*/ +ZSTDLIB_API unsigned ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs); +ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, unsigned frameIndex); +ZSTDLIB_API unsigned ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long offset); + +/*===== Seekable advanced I/O API =====*/ +typedef int(ZSTD_seekable_read)(void* opaque, void* buffer, size_t n); +typedef int(ZSTD_seekable_seek)(void* opaque, long long offset, int origin); +typedef struct { + void* opaque; + ZSTD_seekable_read* read; + ZSTD_seekable_seek* seek; +} ZSTD_seekable_customFile; +ZSTDLIB_API size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src); + +#if defined (__cplusplus) +} +#endif + +#endif diff --git a/src/zstd/contrib/seekable_format/zstd_seekable_compression_format.md 
b/src/zstd/contrib/seekable_format/zstd_seekable_compression_format.md new file mode 100644 index 00000000..bf3080f7 --- /dev/null +++ b/src/zstd/contrib/seekable_format/zstd_seekable_compression_format.md @@ -0,0 +1,116 @@ +# Zstandard Seekable Format + +### Notices + +Copyright (c) 2017-present Facebook, Inc. + +Permission is granted to copy and distribute this document +for any purpose and without charge, +including translations into other languages +and incorporation into compilations, +provided that the copyright notice and this notice are preserved, +and that any substantive changes or deletions from the original +are clearly marked. +Distribution of this document is unlimited. + +### Version +0.1.0 (11/04/17) + +## Introduction +This document defines a format for compressed data to be stored so that subranges of the data can be efficiently decompressed without requiring the entire document to be decompressed. +This is done by splitting up the input data into frames, +each of which is compressed independently, +and so can be decompressed independently. +Decompression then takes advantage of a provided 'seek table', which allows the decompressor to immediately jump to the desired data. This is done in a way that is compatible with the original Zstandard format by placing the seek table in a Zstandard skippable frame. + +### Overall conventions +In this document: +- square brackets i.e. `[` and `]` are used to indicate optional fields or parameters. +- the naming convention for identifiers is `Mixed_Case_With_Underscores` +- All numeric fields are little-endian unless specified otherwise + +## Format + +The format consists of a number of frames (Zstandard compressed frames and skippable frames), followed by a final skippable frame at the end containing the seek table. 
+ +### Seek Table Format +The structure of the seek table frame is as follows: + +|`Skippable_Magic_Number`|`Frame_Size`|`[Seek_Table_Entries]`|`Seek_Table_Footer`| +|------------------------|------------|----------------------|-------------------| +| 4 bytes | 4 bytes | 8-12 bytes each | 9 bytes | + +__`Skippable_Magic_Number`__ + +Value : 0x184D2A5E. +This is for compatibility with [Zstandard skippable frames]. +Since it is legal for other Zstandard skippable frames to use the same +magic number, it is not recommended for a decoder to recognize frames +solely on this. + +__`Frame_Size`__ + +The total size of the skippable frame, not including the `Skippable_Magic_Number` or `Frame_Size`. +This is for compatibility with [Zstandard skippable frames]. + +[Zstandard skippable frames]: https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#skippable-frames + +#### `Seek_Table_Footer` +The seek table footer format is as follows: + +|`Number_Of_Frames`|`Seek_Table_Descriptor`|`Seekable_Magic_Number`| +|------------------|-----------------------|-----------------------| +| 4 bytes | 1 byte | 4 bytes | + +__`Seekable_Magic_Number`__ + +Value : 0x8F92EAB1. +This value must be the last bytes present in the compressed file so that decoders +can efficiently find it and determine if there is an actual seek table present. + +__`Number_Of_Frames`__ + +The number of stored frames in the data. + +__`Seek_Table_Descriptor`__ + +A bitfield describing the format of the seek table. + +| Bit number | Field name | +| ---------- | ---------- | +| 7 | `Checksum_Flag` | +| 6-2 | `Reserved_Bits` | +| 1-0 | `Unused_Bits` | + +While only `Checksum_Flag` currently exists, there are 7 other bits in this field that can be used for future changes to the format, +for example the addition of inline dictionaries. + +__`Checksum_Flag`__ + +If the checksum flag is set, each of the seek table entries contains a 4 byte checksum of the uncompressed data contained in its frame. 
+ +`Reserved_Bits` are not currently used but may be used in the future for breaking changes, so a compliant decoder should ensure they are set to 0. `Unused_Bits` may be used in the future for non-breaking changes, so a compliant decoder should not interpret these bits. + +#### __`Seek_Table_Entries`__ + +`Seek_Table_Entries` consists of `Number_Of_Frames` (one for each frame in the data, not including the seek table frame) entries of the following form, in sequence: + +|`Compressed_Size`|`Decompressed_Size`|`[Checksum]`| +|-----------------|-------------------|------------| +| 4 bytes | 4 bytes | 4 bytes | + +__`Compressed_Size`__ + +The compressed size of the frame. +The cumulative sum of the `Compressed_Size` fields of frames `0` to `i` gives the offset in the compressed file of frame `i+1`. + +__`Decompressed_Size`__ + +The size of the decompressed data contained in the frame. For skippable or otherwise empty frames, this value is 0. + +__`Checksum`__ + +Only present if `Checksum_Flag` is set in the `Seek_Table_Descriptor`. Value : the least significant 32 bits of the XXH64 digest of the uncompressed data, stored in little-endian format. + +## Version Changes +- 0.1.0: initial version diff --git a/src/zstd/contrib/seekable_format/zstdseek_compress.c b/src/zstd/contrib/seekable_format/zstdseek_compress.c new file mode 100644 index 00000000..df207498 --- /dev/null +++ b/src/zstd/contrib/seekable_format/zstdseek_compress.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ */
+
+#include <stdlib.h> /* malloc, free */
+#include <string.h> /* memset, memcpy */
+
+#define XXH_STATIC_LINKING_ONLY
+#define XXH_NAMESPACE ZSTD_
+#include "xxhash.h"
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zstd_errors.h"
+#include "mem.h"
+#include "zstd_seekable.h"
+
+/* Evaluate `f` once; if the result is non-zero, return it from the enclosing
+ * function (used both for errors and for "more output space needed" hints). */
+#define CHECK_Z(f) { size_t const ret = (f); if (ret != 0) return ret; }
+
+#undef ERROR
+#define ERROR(name) ((size_t)-ZSTD_error_##name)
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/* One seek-table record: sizes and checksum of a single finished frame. */
+typedef struct {
+    U32 cSize;
+    U32 dSize;
+    U32 checksum;
+} framelogEntry_t;
+
+/* Growable array of frame records plus the cursor used while the seek table
+ * is being streamed out.
+ * Note: the struct is deliberately declared without a trailing declarator;
+ * `} framelog_t;` would define a global object, not a type. */
+struct ZSTD_frameLog_s {
+    framelogEntry_t* entries;  /* heap array holding `capacity` slots */
+    U32 size;                  /* number of slots in use */
+    U32 capacity;
+
+    int checksumFlag;          /* non-zero: entries carry a checksum */
+
+    /* for use when streaming out the seek table */
+    U32 seekTablePos;          /* bytes of the seek table emitted so far */
+    U32 seekTableIndex;        /* next entry to emit */
+};
+
+/* State of a seekable compression stream. */
+struct ZSTD_seekable_CStream_s {
+    ZSTD_CStream* cstream;
+    ZSTD_frameLog framelog;
+
+    /* compressed / decompressed byte counts of the frame in progress */
+    U32 frameCSize;
+    U32 frameDSize;
+
+    XXH64_state_t xxhState;
+
+    U32 maxFrameSize;
+
+    int writingSeekTable;
+};
+
+/* Allocate the initial entry array of `fl`.
+ * Only `entries` and `capacity` are written; the caller initialises the rest.
+ * @return 0 on success, ERROR(memory_allocation) otherwise. */
+size_t ZSTD_seekable_frameLog_allocVec(ZSTD_frameLog* fl)
+{
+    /* allocate some initial space */
+    size_t const FRAMELOG_STARTING_CAPACITY = 16;
+    fl->entries = (framelogEntry_t*)malloc(
+            sizeof(framelogEntry_t) * FRAMELOG_STARTING_CAPACITY);
+    if (fl->entries == NULL) return ERROR(memory_allocation);
+    fl->capacity = FRAMELOG_STARTING_CAPACITY;
+
+    return 0;
+}
+
+/* Release the entry array of `fl` (the frame log object itself is not freed).
+ * Accepts NULL.  Always returns 0. */
+size_t ZSTD_seekable_frameLog_freeVec(ZSTD_frameLog* fl)
+{
+    if (fl != NULL) free(fl->entries);
+    return 0;
+}
+
+/* Create an empty, heap-allocated frame log.
+ * @return the new frame log, or NULL if an allocation fails. */
+ZSTD_frameLog* ZSTD_seekable_createFrameLog(int checksumFlag)
+{
+    ZSTD_frameLog* fl = malloc(sizeof(ZSTD_frameLog));
+    if (fl == NULL) return NULL;
+
+    if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(fl))) {
+        free(fl);
+        return NULL;
+    }
+
+    fl->checksumFlag = checksumFlag;
+    fl->seekTablePos = 0;
+    fl->seekTableIndex = 0;
+    fl->size = 0;
+
+    return fl;
+}
+
+/* Destroy a frame log obtained from ZSTD_seekable_createFrameLog().
+ * Accepts NULL.  Always returns 0. */
+size_t ZSTD_seekable_freeFrameLog(ZSTD_frameLog* fl)
+{
+    ZSTD_seekable_frameLog_freeVec(fl);
+    free(fl);
+    return 0;
+}
+
+/* Allocate a seekable compression stream together with its zstd stream and
+ * frame-log storage.
+ * @return the new stream, or NULL if any allocation fails. */
+ZSTD_seekable_CStream* ZSTD_seekable_createCStream(void)
+{
+    ZSTD_seekable_CStream* zcs = malloc(sizeof(ZSTD_seekable_CStream));
+
+    if (zcs == NULL) return NULL;
+
+    memset(zcs, 0, sizeof(*zcs));
+
+    zcs->cstream = ZSTD_createCStream();
+    if (zcs->cstream == NULL) goto failed1;
+
+    if (ZSTD_isError(ZSTD_seekable_frameLog_allocVec(&zcs->framelog))) goto failed2;
+
+    return zcs;
+
+failed2:
+    ZSTD_freeCStream(zcs->cstream);
+failed1:
+    free(zcs);
+    return NULL;
+}
+
+/* Release a stream created by ZSTD_seekable_createCStream().
+ * Accepts NULL.  Always returns 0. */
+size_t ZSTD_seekable_freeCStream(ZSTD_seekable_CStream* zcs)
+{
+    if (zcs == NULL) return 0; /* support free on null */
+    ZSTD_freeCStream(zcs->cstream);
+    ZSTD_seekable_frameLog_freeVec(&zcs->framelog);
+    free(zcs);
+
+    return 0;
+}
+
+/* (Re)start a seekable compression session.
+ * maxFrameSize == 0 selects ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE; larger
+ * values are rejected with ERROR(compressionParameter_unsupported).
+ * @return the result of ZSTD_initCStream(), or an error code. */
+size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs,
+                                 int compressionLevel,
+                                 int checksumFlag,
+                                 U32 maxFrameSize)
+{
+    zcs->framelog.size = 0;
+    zcs->frameCSize = 0;
+    zcs->frameDSize = 0;
+
+    /* make sure maxFrameSize has a reasonable value */
+    if (maxFrameSize > ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE) {
+        return ERROR(compressionParameter_unsupported);
+    }
+
+    zcs->maxFrameSize = maxFrameSize
+                                ? maxFrameSize
+                                : ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE;
+
+    zcs->framelog.checksumFlag = checksumFlag;
+    if (zcs->framelog.checksumFlag) {
+        XXH64_reset(&zcs->xxhState, 0);
+    }
+
+    zcs->framelog.seekTablePos = 0;
+    zcs->framelog.seekTableIndex = 0;
+    zcs->writingSeekTable = 0;
+
+    return ZSTD_initCStream(zcs->cstream, compressionLevel);
+}
+
+/* Append one frame record to `fl`, doubling the array when it is full.
+ * @return 0 on success, ERROR(frameIndex_tooLarge) once
+ *         ZSTD_SEEKABLE_MAXFRAMES records exist, or ERROR(memory_allocation). */
+size_t ZSTD_seekable_logFrame(ZSTD_frameLog* fl,
+                              unsigned compressedSize,
+                              unsigned decompressedSize,
+                              unsigned checksum)
+{
+    if (fl->size == ZSTD_SEEKABLE_MAXFRAMES)
+        return ERROR(frameIndex_tooLarge);
+
+    /* grow the buffer if required */
+    if (fl->size == fl->capacity) {
+        /* exponential size increase for constant amortized runtime */
+        size_t const newCapacity = fl->capacity * 2;
+        framelogEntry_t* const newEntries = realloc(fl->entries,
+                sizeof(framelogEntry_t) * newCapacity);
+
+        if (newEntries == NULL) return ERROR(memory_allocation);
+
+        fl->entries = newEntries;
+        fl->capacity = newCapacity;
+    }
+
+    fl->entries[fl->size] = (framelogEntry_t){
+            compressedSize, decompressedSize, checksum
+    };
+    fl->size++;
+
+    return 0;
+}
+
+/* Finish the frame in progress, record it in the frame log and prepare the
+ * stream for the next frame.
+ * @return 0 when the frame is complete, the non-zero value of
+ *         ZSTD_endStream() when `output` must be flushed and the call
+ *         repeated, or an error code. */
+size_t ZSTD_seekable_endFrame(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output)
+{
+    size_t const prevOutPos = output->pos;
+    /* end the frame */
+    size_t ret = ZSTD_endStream(zcs->cstream, output);
+
+    zcs->frameCSize += output->pos - prevOutPos;
+
+    /* need to flush before doing the rest */
+    if (ret) return ret;
+
+    /* frame done */
+
+    /* store the frame data for later */
+    ret = ZSTD_seekable_logFrame(
+            &zcs->framelog, zcs->frameCSize, zcs->frameDSize,
+            zcs->framelog.checksumFlag
+                    ? XXH64_digest(&zcs->xxhState) & 0xFFFFFFFFU
+                    : 0);
+    if (ret) return ret;
+
+    /* reset for the next frame */
+    zcs->frameCSize = 0;
+    zcs->frameDSize = 0;
+
+    /* a failed reset would otherwise only surface on the next compress call */
+    {   size_t const resetResult = ZSTD_resetCStream(zcs->cstream, 0);
+        if (ZSTD_isError(resetResult)) return resetResult;
+    }
+    if (zcs->framelog.checksumFlag)
+        XXH64_reset(&zcs->xxhState, 0);
+
+    return 0;
+}
+
+/* Compress input into the current frame, never letting a frame exceed
+ * maxFrameSize decompressed bytes; a frame that reaches the limit is ended.
+ * @return a hint of how many more input bytes fit in the current frame
+ *         (maxFrameSize right after a frame was ended), or an error code. */
+size_t ZSTD_seekable_compressStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    const BYTE* const inBase = (const BYTE*) input->src + input->pos;
+    size_t inLen = input->size - input->pos;
+
+    inLen = MIN(inLen, (size_t)(zcs->maxFrameSize - zcs->frameDSize));
+
+    /* if we haven't finished flushing the last frame, don't start writing a new one */
+    if (inLen > 0) {
+        ZSTD_inBuffer inTmp = { inBase, inLen, 0 };
+        size_t const prevOutPos = output->pos;
+
+        size_t const ret = ZSTD_compressStream(zcs->cstream, output, &inTmp);
+
+        if (zcs->framelog.checksumFlag) {
+            XXH64_update(&zcs->xxhState, inBase, inTmp.pos);
+        }
+
+        zcs->frameCSize += output->pos - prevOutPos;
+        zcs->frameDSize += inTmp.pos;
+
+        input->pos += inTmp.pos;
+
+        if (ZSTD_isError(ret)) return ret;
+    }
+
+    if (zcs->maxFrameSize == zcs->frameDSize) {
+        /* log the frame and start over */
+        size_t const ret = ZSTD_seekable_endFrame(zcs, output);
+        if (ZSTD_isError(ret)) return ret;
+
+        /* get the client ready for the next frame */
+        return (size_t)zcs->maxFrameSize;
+    }
+
+    return (size_t)(zcs->maxFrameSize - zcs->frameDSize);
+}
+
+/* Total size in bytes of the seek table frame: skippable header, one record
+ * per logged frame (8 bytes, or 12 with checksums) and the footer. */
+static inline size_t ZSTD_seekable_seekTableSize(const ZSTD_frameLog* fl)
+{
+    size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0);
+    size_t const seekTableLen = ZSTD_skippableHeaderSize +
+                                sizePerFrame * fl->size +
+                                ZSTD_seekTableFooterSize;
+
+    return seekTableLen;
+}
+
+/* Emit the little-endian 32-bit `value` that lives at byte `offset` of the
+ * seek table, resuming mid-word if an earlier call ran out of output space.
+ * @return 0 once the word is completely written (or was written before),
+ *         otherwise the number of seek table bytes still to be written. */
+static inline size_t ZSTD_stwrite32(ZSTD_frameLog* fl,
+                                    ZSTD_outBuffer* output, U32 const value,
+                                    U32 const offset)
+{
+    if (fl->seekTablePos < offset + 4) {
+        BYTE tmp[4]; /* so that we can work with buffers too small to write a whole word to */
+        size_t const lenWrite =
+                MIN(output->size - output->pos, offset + 4 - fl->seekTablePos);
+        MEM_writeLE32(tmp, value);
+        memcpy((BYTE*)output->dst + output->pos,
+               tmp + (fl->seekTablePos - offset), lenWrite);
+        output->pos += lenWrite;
+        fl->seekTablePos += (U32)lenWrite;
+
+        /* Stop only if the word is still incomplete.  Testing `lenWrite < 4`
+         * would also stop after a resumed write that finished the word. */
+        if (fl->seekTablePos < offset + 4)
+            return ZSTD_seekable_seekTableSize(fl) - fl->seekTablePos;
+    }
+    return 0;
+}
+
+/* Stream the seek table frame for `fl` into `output`.
+ * @return 0 when the whole table has been written, the number of bytes still
+ *         to be written when `output` is full, or an error code. */
+size_t ZSTD_seekable_writeSeekTable(ZSTD_frameLog* fl, ZSTD_outBuffer* output)
+{
+    /* seekTableIndex: the current index in the table and
+     * seekTablePos:   the amount of the table written so far
+     *
+     * This function is written this way so that if it has to return early
+     * because of a small buffer, it can keep going where it left off.
+     */
+
+    size_t const sizePerFrame = 8 + (fl->checksumFlag?4:0);
+    size_t const seekTableLen = ZSTD_seekable_seekTableSize(fl);
+
+    CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_MAGIC_SKIPPABLE_START | 0xE, 0));
+    CHECK_Z(ZSTD_stwrite32(fl, output, seekTableLen - ZSTD_skippableHeaderSize,
+                           4));
+
+    while (fl->seekTableIndex < fl->size) {
+        CHECK_Z(ZSTD_stwrite32(fl, output,
+                               fl->entries[fl->seekTableIndex].cSize,
+                               ZSTD_skippableHeaderSize +
+                                       sizePerFrame * fl->seekTableIndex + 0));
+
+        CHECK_Z(ZSTD_stwrite32(fl, output,
+                               fl->entries[fl->seekTableIndex].dSize,
+                               ZSTD_skippableHeaderSize +
+                                       sizePerFrame * fl->seekTableIndex + 4));
+
+        if (fl->checksumFlag) {
+            CHECK_Z(ZSTD_stwrite32(
+                    fl, output, fl->entries[fl->seekTableIndex].checksum,
+                    ZSTD_skippableHeaderSize +
+                            sizePerFrame * fl->seekTableIndex + 8));
+        }
+
+        fl->seekTableIndex++;
+    }
+
+    CHECK_Z(ZSTD_stwrite32(fl, output, fl->size,
+                           seekTableLen - ZSTD_seekTableFooterSize));
+
+    if (output->size - output->pos < 1) return seekTableLen - fl->seekTablePos;
+    if (fl->seekTablePos < seekTableLen - 4) {
+        BYTE sfd = 0;
+        /* Normalise to 0/1 first: every other use treats any non-zero flag as
+         * "checksums present", but e.g. 2 << 7 does not fit in a byte. */
+        sfd |= (BYTE)((fl->checksumFlag ? 1 : 0) << 7);
+
+        ((BYTE*)output->dst)[output->pos] = sfd;
+        output->pos++;
+        fl->seekTablePos++;
+    }
+
+    CHECK_Z(ZSTD_stwrite32(fl, output, ZSTD_SEEKABLE_MAGICNUMBER,
+                           seekTableLen - 4));
+
+    if (fl->seekTablePos != seekTableLen) return ERROR(GENERIC);
+    return 0;
+}
+
+/* End the stream: finish the frame in progress (if any), then write the seek
+ * table.
+ * @return 0 when everything has been written, a size hint when `output` must
+ *         be flushed and the call repeated, or an error code. */
+size_t ZSTD_seekable_endStream(ZSTD_seekable_CStream* zcs, ZSTD_outBuffer* output)
+{
+    if (!zcs->writingSeekTable && zcs->frameDSize) {
+        const size_t endFrame = ZSTD_seekable_endFrame(zcs, output);
+        if (ZSTD_isError(endFrame)) return endFrame;
+        /* return an accurate size hint */
+        if (endFrame) return endFrame + ZSTD_seekable_seekTableSize(&zcs->framelog);
+    }
+
+    zcs->writingSeekTable = 1;
+
+    return ZSTD_seekable_writeSeekTable(&zcs->framelog, output);
+}
diff --git a/src/zstd/contrib/seekable_format/zstdseek_decompress.c b/src/zstd/contrib/seekable_format/zstdseek_decompress.c
new file mode 100644
index 00000000..d740e16b
--- /dev/null
+++ b/src/zstd/contrib/seekable_format/zstdseek_decompress.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* *********************************************************
+*  Turn on Large Files support (>4GB) for 32-bit Linux/Unix
+***********************************************************/
+#if !defined(__64BIT__) || defined(__MINGW32__)       /* No point defining Large file for 64 bit but MinGW-w64 requires it */
+#  if !defined(_FILE_OFFSET_BITS)
+#    define _FILE_OFFSET_BITS 64                      /* turn off_t into a 64-bit type for ftello, fseeko */
+#  endif
+#  if !defined(_LARGEFILE_SOURCE)                     /* obsolete macro, replaced with _FILE_OFFSET_BITS */
+#    define _LARGEFILE_SOURCE 1                       /* Large File Support extension (LFS) - fseeko, ftello */
+#  endif
+#  if defined(_AIX) || defined(__hpux)
+#    define _LARGE_FILES                              /* Large file support on 32-bits AIX and HP-UX */
+#  endif
+#endif
+
+/* Standard headers come after the large-file macros (which must precede any
+ * system include) and before LONG_SEEK, whose fallback definition uses FILE. */
+#include <stdlib.h> /* malloc, free */
+#include <stdio.h>  /* FILE* */
+#include <string.h> /* memset, memcpy, memmove */
+
+/* ************************************************************
+* Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW
+***************************************************************/
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+#   define LONG_SEEK _fseeki64
+#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
+#  define LONG_SEEK fseeko
+#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__)
+#   define LONG_SEEK fseeko64
+#elif defined(_WIN32) && !defined(__DJGPP__)
+#   include <windows.h>
+    static int LONG_SEEK(FILE* file, __int64 offset, int origin) {
+        LARGE_INTEGER off;
+        DWORD method;
+        off.QuadPart = offset;
+        if (origin == SEEK_END)
+            method = FILE_END;
+        else if (origin == SEEK_CUR)
+            method = FILE_CURRENT;
+        else
+            method = FILE_BEGIN;
+
+        if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method))
+            return 0;
+        else
+            return -1;
+    }
+#else
+#   define LONG_SEEK fseek
+#endif
+
+#define XXH_STATIC_LINKING_ONLY
+#define XXH_NAMESPACE ZSTD_
+#include "xxhash.h"
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zstd_errors.h"
+#include "mem.h"
+#include "zstd_seekable.h"
+
+#undef ERROR
+#define ERROR(name) ((size_t)-ZSTD_error_##name)
+
+/* Evaluate the I/O callback expression `f` once; a negative result makes the
+ * enclosing function return ERROR(seekableIO). */
+#define CHECK_IO(f) { int const errcod = (f); if (errcod < 0) return ERROR(seekableIO); }
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/* Special-case callbacks for FILE* and in-memory modes, so that we can treat
+ * them the same way as the advanced API.
+ * Their signatures must match the ZSTD_seekable_read / ZSTD_seekable_seek
+ * typedefs exactly, hence `long long` rather than S64 for the offset. */
+
+/* Read exactly `n` bytes from the FILE* in `opaque`; a short read is an error. */
+static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n)
+{
+    size_t const result = fread(buffer, 1, n, (FILE*)opaque);
+    if (result != n) {
+        return -1;
+    }
+    return 0;
+}
+
+/* Reposition the FILE* in `opaque`; returns 0 on success, non-zero on failure. */
+static int ZSTD_seekable_seek_FILE(void* opaque, long long offset, int origin)
+{
+    int const ret = LONG_SEEK((FILE*)opaque, offset, origin);
+    if (ret) return ret;
+    return fflush((FILE*)opaque);
+}
+
+/* Cursor over a caller-owned memory buffer; invariant: pos <= size. */
+typedef struct {
+    const void *ptr;
+    size_t size;
+    size_t pos;
+} buffWrapper_t;
+
+/* Copy exactly `n` bytes from the in-memory source and advance the cursor.
+ * Returns -1, copying nothing, if fewer than `n` bytes remain. */
+static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n)
+{
+    buffWrapper_t* buff = (buffWrapper_t*) opaque;
+    /* written as a subtraction so that a huge `n` cannot wrap the test */
+    if (n > buff->size - buff->pos) return -1;
+    memcpy(buffer, (const BYTE*)buff->ptr + buff->pos, n);
+    buff->pos += n;
+    return 0;
+}
+
+/* Move the cursor of the in-memory source with fseek() semantics: `offset` is
+ * added to the position selected by `origin` (so SEEK_END takes offsets <= 0).
+ * Returns -1 for an unknown origin or a target outside [0, size]. */
+static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
+{
+    buffWrapper_t* buff = (buffWrapper_t*) opaque;
+    unsigned long long base;
+    unsigned long long newOffset;
+    switch (origin) {
+    case SEEK_SET:
+        base = 0;
+        break;
+    case SEEK_CUR:
+        base = (unsigned long long)buff->pos;
+        break;
+    case SEEK_END:
+        base = (unsigned long long)buff->size;
+        break;
+    default:
+        return -1;
+    }
+    if (offset < 0) {
+        /* magnitude computed in unsigned arithmetic: safe even for LLONG_MIN */
+        unsigned long long const back = 0ULL - (unsigned long long)offset;
+        if (back > base) return -1;   /* would land before the start */
+        newOffset = base - back;
+    } else {
+        newOffset = base + (unsigned long long)offset;
+        if (newOffset < base) return -1;   /* wrapped around */
+    }
+    if (newOffset > buff->size) {
+        return -1;
+    }
+    buff->pos = (size_t)newOffset;
+    return 0;
+}
+
+/* Cumulative offsets of one frame, plus its checksum when the table has them. */
+typedef struct {
+    U64 cOffset;
+    U64 dOffset;
+    U32 checksum;
+} seekEntry_t;
+
+/* In-memory seek table.  `entries` holds tableLen + 1 elements: the extra one
+ * carries the end offsets so that sizes can be computed as differences. */
+typedef struct {
+    seekEntry_t* entries;
+    size_t tableLen;
+
+    int checksumFlag;
+} seekTable_t;
+
+#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_ABSOLUTEMAX
+
+struct ZSTD_seekable_s {
+    ZSTD_DStream* dstream;
+    seekTable_t seekTable;
+    ZSTD_seekable_customFile src;
+
+    U64 decompressedOffset;
+    U32 curFrame;
+
+    BYTE inBuff[SEEKABLE_BUFF_SIZE];  /* need to do our own input buffering */
+    BYTE outBuff[SEEKABLE_BUFF_SIZE]; /* so we can efficiently decompress the
+                                         starts of chunks before we get to the
+                                         desired section */
+    ZSTD_inBuffer in; /* maintain continuity across ZSTD_seekable_decompress operations */
+    buffWrapper_t buffWrapper; /* for `src.opaque` in in-memory mode */
+
+    XXH64_state_t xxhState;
+};
+
+/* Allocate a seekable decompressor.
+ * @return the new object, or NULL if an allocation fails. */
+ZSTD_seekable* ZSTD_seekable_create(void)
+{
+    ZSTD_seekable* zs = malloc(sizeof(ZSTD_seekable));
+
+    if (zs == NULL) return NULL;
+
+    /* zero everything; in particular seekTable.entries must start out NULL
+     * because it is passed to free() later */
+    memset(zs, 0, sizeof(*zs));
+
+    zs->dstream = ZSTD_createDStream();
+    if (zs->dstream == NULL) {
+        free(zs);
+        return NULL;
+    }
+
+    return zs;
+}
+
+/* Release a decompressor created by ZSTD_seekable_create().
+ * Accepts NULL.  Always returns 0. */
+size_t ZSTD_seekable_free(ZSTD_seekable* zs)
+{
+    if (zs == NULL) return 0; /* support free on null */
+    ZSTD_freeDStream(zs->dstream);
+    free(zs->seekTable.entries);
+    free(zs);
+
+    return 0;
+}
+
+/** ZSTD_seekable_offsetToFrameIndex() :
+ *  Performs a binary search to find the last frame with a decompressed offset
+ *  <= pos
+ *  @return : the frame's index, or the number of frames if `pos` is at or
+ *            past the end of the decompressed data */
+unsigned ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long pos)
+{
+    U32 lo = 0;
+    U32 hi = (U32)zs->seekTable.tableLen;
+
+    if (pos >= zs->seekTable.entries[zs->seekTable.tableLen].dOffset) {
+        return (unsigned)zs->seekTable.tableLen;
+    }
+
+    while (lo + 1 < hi) {
+        U32 const mid = lo + ((hi - lo) >> 1);
+        if (zs->seekTable.entries[mid].dOffset <= pos) {
+            lo = mid;
+        } else {
+            hi = mid;
+        }
+    }
+    return lo;
+}
+
+/* Number of frames described by the seek table. */
+unsigned ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs)
+{
+    return (unsigned)zs->seekTable.tableLen;
+}
+
+/* Offset of frame `frameIndex` in the compressed data, or
+ * ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE if there is no such frame. */
+unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex)
+{
+    if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
+    return zs->seekTable.entries[frameIndex].cOffset;
+}
+
+/* Offset of frame `frameIndex` in the decompressed data, or
+ * ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE if there is no such frame. */
+unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, unsigned frameIndex)
+{
+    if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE;
+    return zs->seekTable.entries[frameIndex].dOffset;
+}
+
+/* Compressed size of frame `frameIndex`, or ERROR(frameIndex_tooLarge). */
+size_t ZSTD_seekable_getFrameCompressedSize(ZSTD_seekable* const zs, unsigned frameIndex)
+{
+    if (frameIndex >= zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge);
+    return (size_t)(zs->seekTable.entries[frameIndex + 1].cOffset -
+                    zs->seekTable.entries[frameIndex].cOffset);
+}
+
+/* Decompressed size of frame `frameIndex`, or ERROR(frameIndex_tooLarge). */
+size_t ZSTD_seekable_getFrameDecompressedSize(ZSTD_seekable* const zs, unsigned frameIndex)
+{
+    /* `>=`, not `>`: index tableLen would read entries[tableLen + 1] */
+    if (frameIndex >= zs->seekTable.tableLen) return ERROR(frameIndex_tooLarge);
+    return (size_t)(zs->seekTable.entries[frameIndex + 1].dOffset -
+                    zs->seekTable.entries[frameIndex].dOffset);
+}
+
+/* Read and validate the seek table at the end of `zs->src` and store it in
+ * `zs->seekTable` as cumulative offsets, replacing any previous table.
+ * @return 0 on success or an error code; on error the previous table (if
+ *         any) is left untouched. */
+static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs)
+{
+    int checksumFlag;
+    ZSTD_seekable_customFile src = zs->src;
+    /* read the footer, fixed size */
+    CHECK_IO(src.seek(src.opaque, -(int)ZSTD_seekTableFooterSize, SEEK_END));
+    CHECK_IO(src.read(src.opaque, zs->inBuff, ZSTD_seekTableFooterSize));
+
+    if (MEM_readLE32(zs->inBuff + 5) != ZSTD_SEEKABLE_MAGICNUMBER) {
+        return ERROR(prefix_unknown);
+    }
+
+    {   BYTE const sfd = zs->inBuff[4];
+        checksumFlag = sfd >> 7;
+
+        /* check reserved bits (bits 6-2 of the descriptor byte itself) */
+        if ((sfd >> 2) & 0x1f) {
+            return ERROR(corruption_detected);
+        }
+    }
+
+    {   U32 const numFrames = MEM_readLE32(zs->inBuff);
+        U32 const sizePerEntry = 8 + (checksumFlag?4:0);
+        U32 tableSize;
+        U32 frameSize;
+        U32 remaining;
+
+        /* The count comes from the file.  The compressor never logs more
+         * than ZSTD_SEEKABLE_MAXFRAMES frames, so anything larger is
+         * corrupt; rejecting it also keeps the size arithmetic bounded. */
+        if (numFrames > ZSTD_SEEKABLE_MAXFRAMES) {
+            return ERROR(corruption_detected);
+        }
+
+        tableSize = sizePerEntry * numFrames;
+        frameSize = tableSize + ZSTD_seekTableFooterSize + ZSTD_skippableHeaderSize;
+        remaining = frameSize - ZSTD_seekTableFooterSize; /* don't need to re-read footer */
+
+        {
+            U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
+
+            CHECK_IO(src.seek(src.opaque, -(long long)frameSize, SEEK_END));
+            CHECK_IO(src.read(src.opaque, zs->inBuff, toRead));
+
+            remaining -= toRead;
+        }
+
+        if (MEM_readLE32(zs->inBuff) != (ZSTD_MAGIC_SKIPPABLE_START | 0xE)) {
+            return ERROR(prefix_unknown);
+        }
+        if (MEM_readLE32(zs->inBuff+4) + ZSTD_skippableHeaderSize != frameSize) {
+            return ERROR(prefix_unknown);
+        }
+
+        {   /* Allocate an extra entry at the end so that we can do size
+             * computations on the last element without special case */
+            seekEntry_t* entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * ((size_t)numFrames + 1));
+
+            U32 idx = 0;
+            U32 pos = (U32)ZSTD_skippableHeaderSize;  /* entries start right after the skippable header */
+
+            U64 cOffset = 0;
+            U64 dOffset = 0;
+
+            if (!entries) {
+                return ERROR(memory_allocation);
+            }
+
+            /* compute cumulative positions */
+            for (; idx < numFrames; idx++) {
+                if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
+                    /* `offset` unread bytes are kept at the front, so only
+                     * the rest of the buffer may be refilled */
+                    U32 const offset = SEEKABLE_BUFF_SIZE - pos;
+                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset);
+                    memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
+                    if (src.read(src.opaque, zs->inBuff+offset, toRead) < 0) {
+                        free(entries);
+                        return ERROR(seekableIO);
+                    }
+                    remaining -= toRead;
+                    pos = 0;
+                }
+                entries[idx].cOffset = cOffset;
+                entries[idx].dOffset = dOffset;
+
+                cOffset += MEM_readLE32(zs->inBuff + pos);
+                pos += 4;
+                dOffset += MEM_readLE32(zs->inBuff + pos);
+                pos += 4;
+                if (checksumFlag) {
+                    entries[idx].checksum = MEM_readLE32(zs->inBuff + pos);
+                    pos += 4;
+                }
+            }
+            entries[numFrames].cOffset = cOffset;
+            entries[numFrames].dOffset = dOffset;
+
+            /* release the table of a previous init, if any (NULL after create) */
+            free(zs->seekTable.entries);
+            zs->seekTable.entries = entries;
+            zs->seekTable.tableLen = numFrames;
+            zs->seekTable.checksumFlag = checksumFlag;
+            return 0;
+        }
+    }
+}
+
+/* Initialise `zs` to read seekable data from a memory buffer.  The buffer is
+ * not copied.  @return 0 or an error code. */
+size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize)
+{
+    zs->buffWrapper = (buffWrapper_t){src, srcSize, 0};
+    {   ZSTD_seekable_customFile srcFile = {&zs->buffWrapper,
+                                            &ZSTD_seekable_read_buff,
+                                            &ZSTD_seekable_seek_buff};
+        return ZSTD_seekable_initAdvanced(zs, srcFile); }
+}
+
+/* Initialise `zs` to read seekable data from `src`.  @return 0 or an error code. */
+size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src)
+{
+    ZSTD_seekable_customFile srcFile = {src, &ZSTD_seekable_read_FILE,
+                                        &ZSTD_seekable_seek_FILE};
+    return ZSTD_seekable_initAdvanced(zs, srcFile);
+}
+
+/* Initialise `zs` with caller-supplied read/seek callbacks: loads the seek
+ * table and resets the decompression state.  @return 0 or an error code. */
+size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src)
+{
+    zs->src = src;
+
+    {   const size_t seekTableInit = ZSTD_seekable_loadSeekTable(zs);
+        if (ZSTD_isError(seekTableInit)) return seekTableInit; }
+
+    /* impossible values: the first decompress call always repositions */
+    zs->decompressedOffset = (U64)-1;
+    zs->curFrame = (U32)-1;
+
+    {   const size_t dstreamInit = ZSTD_initDStream(zs->dstream);
+        if (ZSTD_isError(dstreamInit)) return dstreamInit; }
+    return 0;
+}
+
+/* Decompress `len` bytes starting at decompressed offset `offset` into `dst`.
+ * A request that extends past the end of the data is shortened to the bytes
+ * that exist; a request starting at or past the end yields 0.
+ * @return the number of bytes written to `dst`, or an error code. */
+size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long long offset)
+{
+    unsigned long long const eos = zs->seekTable.entries[zs->seekTable.tableLen].dOffset;
+    U32 targetFrame;
+
+    /* Without this clamp the loop below would walk into the seek table frame
+     * and compare against the checksum of the sentinel entry. */
+    if (offset >= eos) return 0;
+    if (len > eos - offset) len = (size_t)(eos - offset);
+
+    targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset);
+    do {
+        /* check if we can continue from a previous decompress job */
+        if (targetFrame != zs->curFrame || offset != zs->decompressedOffset) {
+            zs->decompressedOffset = zs->seekTable.entries[targetFrame].dOffset;
+            zs->curFrame = targetFrame;
+
+            CHECK_IO(zs->src.seek(zs->src.opaque,
+                                  (long long)zs->seekTable.entries[targetFrame].cOffset,
+                                  SEEK_SET));
+            zs->in = (ZSTD_inBuffer){zs->inBuff, 0, 0};
+            XXH64_reset(&zs->xxhState, 0);
+            ZSTD_resetDStream(zs->dstream);
+        }
+
+        while (zs->decompressedOffset < offset + len) {
+            size_t toRead;
+            ZSTD_outBuffer outTmp;
+            size_t prevOutPos;
+            if (zs->decompressedOffset < offset) {
+                /* dummy decompressions until we get to the target offset */
+                outTmp = (ZSTD_outBuffer){zs->outBuff, MIN(SEEKABLE_BUFF_SIZE, offset - zs->decompressedOffset), 0};
+            } else {
+                outTmp = (ZSTD_outBuffer){dst, len, zs->decompressedOffset - offset};
+            }
+
+            prevOutPos = outTmp.pos;
+            toRead = ZSTD_decompressStream(zs->dstream, &outTmp, &zs->in);
+            if (ZSTD_isError(toRead)) {
+                return toRead;
+            }
+
+            if (zs->seekTable.checksumFlag) {
+                XXH64_update(&zs->xxhState, (BYTE*)outTmp.dst + prevOutPos,
+                             outTmp.pos - prevOutPos);
+            }
+            zs->decompressedOffset += outTmp.pos - prevOutPos;
+
+            if (toRead == 0) {
+                /* frame complete */
+
+                /* verify checksum */
+                if (zs->seekTable.checksumFlag &&
+                    (XXH64_digest(&zs->xxhState) & 0xFFFFFFFFU) !=
+                            zs->seekTable.entries[targetFrame].checksum) {
+                    return ERROR(corruption_detected);
+                }
+
+                if (zs->decompressedOffset < offset + len) {
+                    /* go back to the start and force a reset of the stream */
+                    targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, zs->decompressedOffset);
+                }
+                break;
+            }
+
+            /* read in more data if we're done with this buffer */
+            if (zs->in.pos == zs->in.size) {
+                toRead = MIN(toRead, SEEKABLE_BUFF_SIZE);
+                CHECK_IO(zs->src.read(zs->src.opaque, zs->inBuff, toRead));
+                zs->in.size = toRead;
+                zs->in.pos = 0;
+            }
+        }
+    } while (zs->decompressedOffset != offset + len);
+
+    return len;
+}
+
+/* Decompress the whole frame `frameIndex` into `dst`.
+ * @return the number of bytes written, ERROR(frameIndex_tooLarge),
+ *         ERROR(dstSize_tooSmall), or a decompression error code. */
+size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex)
+{
+    if (frameIndex >= zs->seekTable.tableLen) {
+        return ERROR(frameIndex_tooLarge);
+    }
+
+    {
+        size_t const decompressedSize =
+                zs->seekTable.entries[frameIndex + 1].dOffset -
+                zs->seekTable.entries[frameIndex].dOffset;
+        if (dstSize < decompressedSize) {
+            return ERROR(dstSize_tooSmall);
+        }
+        return ZSTD_seekable_decompress(
+                zs, dst, decompressedSize,
+                zs->seekTable.entries[frameIndex].dOffset);
+    }
+}
|