summaryrefslogtreecommitdiffstats
path: root/src/zstd/contrib/match_finders/zstd_edist.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/zstd/contrib/match_finders/zstd_edist.c')
-rw-r--r--src/zstd/contrib/match_finders/zstd_edist.c558
1 files changed, 558 insertions, 0 deletions
diff --git a/src/zstd/contrib/match_finders/zstd_edist.c b/src/zstd/contrib/match_finders/zstd_edist.c
new file mode 100644
index 000000000..aab545fd3
--- /dev/null
+++ b/src/zstd/contrib/match_finders/zstd_edist.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+
+/* Currently relies on qsort when combining contiguous matches. This can probabily
+ * be avoided but would require changes to the algorithm. The qsort is far from
+ * the bottleneck in this algorithm even for medium sized files so it's probably
+ * not worth trying to address */
+#include <stdlib.h>
+#include <assert.h>
+
+#include "zstd_edist.h"
+#include "mem.h"
+
+/*-*************************************
+* Constants
+***************************************/
+
+/* Just a sential for the entires of the diagnomal matrix */
+#define ZSTD_EDIST_DIAG_MAX (S32)(1 << 30)
+
+/* How large should a snake be to be considered a 'big' snake.
+ * For an explanation of what a 'snake' is with respect to the
+ * edit distance matrix, see the linked paper in zstd_edist.h */
+#define ZSTD_EDIST_SNAKE_THRESH 20
+
+/* After how many iterations should we start to use the heuristic
+ * based on 'big' snakes */
+#define ZSTD_EDIST_SNAKE_ITER_THRESH 200
+
+/* After how many iterations should be just give up and take
+ * the best availabe edit script for this round */
+#define ZSTD_EDIST_EXPENSIVE_THRESH 1024
+
+/*-*************************************
+* Structures
+***************************************/
+
+typedef struct {
+ U32 dictIdx;
+ U32 srcIdx;
+ U32 matchLength;
+} ZSTD_eDist_match;
+
+typedef struct {
+ const BYTE* dict;
+ const BYTE* src;
+ size_t dictSize;
+ size_t srcSize;
+ S32* forwardDiag; /* Entires of the forward diagonal stored here */
+ S32* backwardDiag; /* Entires of the backward diagonal stored here.
+ * Note: this buffer and the 'forwardDiag' buffer
+ * are contiguous. See the ZSTD_eDist_genSequences */
+ ZSTD_eDist_match* matches; /* Accumulate matches of length 1 in this buffer.
+ * In a subsequence post-processing step, we combine
+ * contiguous matches. */
+ U32 nbMatches;
+} ZSTD_eDist_state;
+
+typedef struct {
+ S32 dictMid; /* The mid diagonal for the dictionary */
+ S32 srcMid; /* The mid diagonal for the source */
+ int lowUseHeuristics; /* Should we use heuristics for the low part */
+ int highUseHeuristics; /* Should we use heuristics for the high part */
+} ZSTD_eDist_partition;
+
+/*-*************************************
+* Internal
+***************************************/
+
+static void ZSTD_eDist_diag(ZSTD_eDist_state* state,
+ ZSTD_eDist_partition* partition,
+ S32 dictLow, S32 dictHigh, S32 srcLow,
+ S32 srcHigh, int useHeuristics)
+{
+ S32* const forwardDiag = state->forwardDiag;
+ S32* const backwardDiag = state->backwardDiag;
+ const BYTE* const dict = state->dict;
+ const BYTE* const src = state->src;
+
+ S32 const diagMin = dictLow - srcHigh;
+ S32 const diagMax = dictHigh - srcLow;
+ S32 const forwardMid = dictLow - srcLow;
+ S32 const backwardMid = dictHigh - srcHigh;
+
+ S32 forwardMin = forwardMid;
+ S32 forwardMax = forwardMid;
+ S32 backwardMin = backwardMid;
+ S32 backwardMax = backwardMid;
+ int odd = (forwardMid - backwardMid) & 1;
+ U32 iterations;
+
+ forwardDiag[forwardMid] = dictLow;
+ backwardDiag[backwardMid] = dictHigh;
+
+ /* Main loop for updating diag entries. Unless useHeuristics is
+ * set to false, this loop will run until it finds the minimal
+ * edit script */
+ for (iterations = 1;;iterations++) {
+ S32 diag;
+ int bigSnake = 0;
+
+ if (forwardMin > diagMin) {
+ forwardMin--;
+ forwardDiag[forwardMin - 1] = -1;
+ } else {
+ forwardMin++;
+ }
+
+ if (forwardMax < diagMax) {
+ forwardMax++;
+ forwardDiag[forwardMax + 1] = -1;
+ } else {
+ forwardMax--;
+ }
+
+ for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
+ S32 dictIdx;
+ S32 srcIdx;
+ S32 low = forwardDiag[diag - 1];
+ S32 high = forwardDiag[diag + 1];
+ S32 dictIdx0 = low < high ? high : low + 1;
+
+ for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
+ dictIdx < dictHigh && srcIdx < srcHigh && dict[dictIdx] == src[srcIdx];
+ dictIdx++, srcIdx++) continue;
+
+ if (dictIdx - dictIdx0 > ZSTD_EDIST_SNAKE_THRESH)
+ bigSnake = 1;
+
+ forwardDiag[diag] = dictIdx;
+
+ if (odd && backwardMin <= diag && diag <= backwardMax && backwardDiag[diag] <= dictIdx) {
+ partition->dictMid = dictIdx;
+ partition->srcMid = srcIdx;
+ partition->lowUseHeuristics = 0;
+ partition->highUseHeuristics = 0;
+ return;
+ }
+ }
+
+ if (backwardMin > diagMin) {
+ backwardMin--;
+ backwardDiag[backwardMin - 1] = ZSTD_EDIST_DIAG_MAX;
+ } else {
+ backwardMin++;
+ }
+
+ if (backwardMax < diagMax) {
+ backwardMax++;
+ backwardDiag[backwardMax + 1] = ZSTD_EDIST_DIAG_MAX;
+ } else {
+ backwardMax--;
+ }
+
+
+ for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
+ S32 dictIdx;
+ S32 srcIdx;
+ S32 low = backwardDiag[diag - 1];
+ S32 high = backwardDiag[diag + 1];
+ S32 dictIdx0 = low < high ? low : high - 1;
+
+ for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag;
+ dictLow < dictIdx && srcLow < srcIdx && dict[dictIdx - 1] == src[srcIdx - 1];
+ dictIdx--, srcIdx--) continue;
+
+ if (dictIdx0 - dictIdx > ZSTD_EDIST_SNAKE_THRESH)
+ bigSnake = 1;
+
+ backwardDiag[diag] = dictIdx;
+
+ if (!odd && forwardMin <= diag && diag <= forwardMax && dictIdx <= forwardDiag[diag]) {
+ partition->dictMid = dictIdx;
+ partition->srcMid = srcIdx;
+ partition->lowUseHeuristics = 0;
+ partition->highUseHeuristics = 0;
+ return;
+ }
+ }
+
+ if (!useHeuristics)
+ continue;
+
+ /* Everything under this point is a heuritic. Using these will
+ * substantially speed up the match finding. In some cases, taking
+ * the total match finding time from several minutes to seconds.
+ * Of course, the caveat is that the edit script found may no longer
+ * be optimal */
+
+ /* Big snake heuristic */
+ if (iterations > ZSTD_EDIST_SNAKE_ITER_THRESH && bigSnake) {
+ {
+ S32 best = 0;
+
+ for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
+ S32 diagDiag = diag - forwardMid;
+ S32 dictIdx = forwardDiag[diag];
+ S32 srcIdx = dictIdx - diag;
+ S32 v = (dictIdx - dictLow) * 2 - diagDiag;
+
+ if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
+ if (v > best
+ && dictLow + ZSTD_EDIST_SNAKE_THRESH <= dictIdx && dictIdx <= dictHigh
+ && srcLow + ZSTD_EDIST_SNAKE_THRESH <= srcIdx && srcIdx <= srcHigh) {
+ S32 k;
+ for (k = 1; dict[dictIdx - k] == src[srcIdx - k]; k++) {
+ if (k == ZSTD_EDIST_SNAKE_THRESH) {
+ best = v;
+ partition->dictMid = dictIdx;
+ partition->srcMid = srcIdx;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (best > 0) {
+ partition->lowUseHeuristics = 0;
+ partition->highUseHeuristics = 1;
+ return;
+ }
+ }
+
+ {
+ S32 best = 0;
+
+ for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
+ S32 diagDiag = diag - backwardMid;
+ S32 dictIdx = backwardDiag[diag];
+ S32 srcIdx = dictIdx - diag;
+ S32 v = (dictHigh - dictIdx) * 2 + diagDiag;
+
+ if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) {
+ if (v > best
+ && dictLow < dictIdx && dictIdx <= dictHigh - ZSTD_EDIST_SNAKE_THRESH
+ && srcLow < srcIdx && srcIdx <= srcHigh - ZSTD_EDIST_SNAKE_THRESH) {
+ int k;
+ for (k = 0; dict[dictIdx + k] == src[srcIdx + k]; k++) {
+ if (k == ZSTD_EDIST_SNAKE_THRESH - 1) {
+ best = v;
+ partition->dictMid = dictIdx;
+ partition->srcMid = srcIdx;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (best > 0) {
+ partition->lowUseHeuristics = 1;
+ partition->highUseHeuristics = 0;
+ return;
+ }
+ }
+ }
+
+ /* More general 'too expensive' heuristic */
+ if (iterations >= ZSTD_EDIST_EXPENSIVE_THRESH) {
+ S32 forwardDictSrcBest;
+ S32 forwardDictBest = 0;
+ S32 backwardDictSrcBest;
+ S32 backwardDictBest = 0;
+
+ forwardDictSrcBest = -1;
+ for (diag = forwardMax; diag >= forwardMin; diag -= 2) {
+ S32 dictIdx = MIN(forwardDiag[diag], dictHigh);
+ S32 srcIdx = dictIdx - diag;
+
+ if (srcHigh < srcIdx) {
+ dictIdx = srcHigh + diag;
+ srcIdx = srcHigh;
+ }
+
+ if (forwardDictSrcBest < dictIdx + srcIdx) {
+ forwardDictSrcBest = dictIdx + srcIdx;
+ forwardDictBest = dictIdx;
+ }
+ }
+
+ backwardDictSrcBest = ZSTD_EDIST_DIAG_MAX;
+ for (diag = backwardMax; diag >= backwardMin; diag -= 2) {
+ S32 dictIdx = MAX(dictLow, backwardDiag[diag]);
+ S32 srcIdx = dictIdx - diag;
+
+ if (srcIdx < srcLow) {
+ dictIdx = srcLow + diag;
+ srcIdx = srcLow;
+ }
+
+ if (dictIdx + srcIdx < backwardDictSrcBest) {
+ backwardDictSrcBest = dictIdx + srcIdx;
+ backwardDictBest = dictIdx;
+ }
+ }
+
+ if ((dictHigh + srcHigh) - backwardDictSrcBest < forwardDictSrcBest - (dictLow + srcLow)) {
+ partition->dictMid = forwardDictBest;
+ partition->srcMid = forwardDictSrcBest - forwardDictBest;
+ partition->lowUseHeuristics = 0;
+ partition->highUseHeuristics = 1;
+ } else {
+ partition->dictMid = backwardDictBest;
+ partition->srcMid = backwardDictSrcBest - backwardDictBest;
+ partition->lowUseHeuristics = 1;
+ partition->highUseHeuristics = 0;
+ }
+ return;
+ }
+ }
+}
+
+static void ZSTD_eDist_insertMatch(ZSTD_eDist_state* state,
+ S32 const dictIdx, S32 const srcIdx)
+{
+ state->matches[state->nbMatches].dictIdx = dictIdx;
+ state->matches[state->nbMatches].srcIdx = srcIdx;
+ state->matches[state->nbMatches].matchLength = 1;
+ state->nbMatches++;
+}
+
+static int ZSTD_eDist_compare(ZSTD_eDist_state* state,
+ S32 dictLow, S32 dictHigh, S32 srcLow,
+ S32 srcHigh, int useHeuristics)
+{
+ const BYTE* const dict = state->dict;
+ const BYTE* const src = state->src;
+
+ /* Found matches while traversing from the low end */
+ while (dictLow < dictHigh && srcLow < srcHigh && dict[dictLow] == src[srcLow]) {
+ ZSTD_eDist_insertMatch(state, dictLow, srcLow);
+ dictLow++;
+ srcLow++;
+ }
+
+ /* Found matches while traversing from the high end */
+ while (dictLow < dictHigh && srcLow < srcHigh && dict[dictHigh - 1] == src[srcHigh - 1]) {
+ ZSTD_eDist_insertMatch(state, dictHigh - 1, srcHigh - 1);
+ dictHigh--;
+ srcHigh--;
+ }
+
+ /* If the low and high end end up touching. If we wanted to make
+ * note of the differences like most diffing algorithms do, we would
+ * do so here. In our case, we're only concerned with matches
+ * Note: if you wanted to find the edit distance of the algorithm,
+ * you could just accumulate the cost for an insertion/deletion
+ * below. */
+ if (dictLow == dictHigh) {
+ while (srcLow < srcHigh) {
+ /* Reaching this point means inserting src[srcLow] into
+ * the current position of dict */
+ srcLow++;
+ }
+ } else if (srcLow == srcHigh) {
+ while (dictLow < dictHigh) {
+ /* Reaching this point means deleteing dict[dictLow] from
+ * the current positino of dict */
+ dictLow++;
+ }
+ } else {
+ ZSTD_eDist_partition partition;
+ partition.dictMid = 0;
+ partition.srcMid = 0;
+ ZSTD_eDist_diag(state, &partition, dictLow, dictHigh,
+ srcLow, srcHigh, useHeuristics);
+ if (ZSTD_eDist_compare(state, dictLow, partition.dictMid,
+ srcLow, partition.srcMid, partition.lowUseHeuristics))
+ return 1;
+ if (ZSTD_eDist_compare(state, partition.dictMid, dictHigh,
+ partition.srcMid, srcHigh, partition.highUseHeuristics))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int ZSTD_eDist_matchComp(const void* p, const void* q)
+{
+ S32 const l = ((ZSTD_eDist_match*)p)->srcIdx;
+ S32 const r = ((ZSTD_eDist_match*)q)->srcIdx;
+ return (l - r);
+}
+
+/* The matches from the approach above will all be of the form
+ * (dictIdx, srcIdx, 1). this method combines contiguous matches
+ * of length MINMATCH or greater. Matches less than MINMATCH
+ * are discarded */
+static void ZSTD_eDist_combineMatches(ZSTD_eDist_state* state)
+{
+ /* Create a new buffer to put the combined matches into
+ * and memcpy to state->matches after */
+ ZSTD_eDist_match* combinedMatches =
+ ZSTD_malloc(state->nbMatches * sizeof(ZSTD_eDist_match),
+ ZSTD_defaultCMem);
+
+ U32 nbCombinedMatches = 1;
+ size_t i;
+
+ /* Make sure that the srcIdx and dictIdx are in sorted order.
+ * The combination step won't work otherwise */
+ qsort(state->matches, state->nbMatches, sizeof(ZSTD_eDist_match), ZSTD_eDist_matchComp);
+
+ memcpy(combinedMatches, state->matches, sizeof(ZSTD_eDist_match));
+ for (i = 1; i < state->nbMatches; i++) {
+ ZSTD_eDist_match const match = state->matches[i];
+ ZSTD_eDist_match const combinedMatch =
+ combinedMatches[nbCombinedMatches - 1];
+ if (combinedMatch.srcIdx + combinedMatch.matchLength == match.srcIdx &&
+ combinedMatch.dictIdx + combinedMatch.matchLength == match.dictIdx) {
+ combinedMatches[nbCombinedMatches - 1].matchLength++;
+ } else {
+ /* Discard matches that are less than MINMATCH */
+ if (combinedMatches[nbCombinedMatches - 1].matchLength < MINMATCH) {
+ nbCombinedMatches--;
+ }
+
+ memcpy(combinedMatches + nbCombinedMatches,
+ state->matches + i, sizeof(ZSTD_eDist_match));
+ nbCombinedMatches++;
+ }
+ }
+ memcpy(state->matches, combinedMatches, nbCombinedMatches * sizeof(ZSTD_eDist_match));
+ state->nbMatches = nbCombinedMatches;
+ ZSTD_free(combinedMatches, ZSTD_defaultCMem);
+}
+
+static size_t ZSTD_eDist_convertMatchesToSequences(ZSTD_Sequence* sequences,
+ ZSTD_eDist_state* state)
+{
+ const ZSTD_eDist_match* matches = state->matches;
+ size_t const nbMatches = state->nbMatches;
+ size_t const dictSize = state->dictSize;
+ size_t nbSequences = 0;
+ size_t i;
+ for (i = 0; i < nbMatches; i++) {
+ ZSTD_eDist_match const match = matches[i];
+ U32 const litLength = !i ? match.srcIdx :
+ match.srcIdx - (matches[i - 1].srcIdx + matches[i - 1].matchLength);
+ U32 const offset = (match.srcIdx + dictSize) - match.dictIdx;
+ U32 const matchLength = match.matchLength;
+ sequences[nbSequences].offset = offset;
+ sequences[nbSequences].litLength = litLength;
+ sequences[nbSequences].matchLength = matchLength;
+ nbSequences++;
+ }
+ return nbSequences;
+}
+
+/*-*************************************
+* Interal utils
+***************************************/
+
+static size_t ZSTD_eDist_hamingDist(const BYTE* const a,
+ const BYTE* const b, size_t n)
+{
+ size_t i;
+ size_t dist = 0;
+ for (i = 0; i < n; i++)
+ dist += a[i] != b[i];
+ return dist;
+}
+
+/* This is a pretty naive recursive implementation that should only
+ * be used for quick tests obviously. Don't try and run this on a
+ * GB file or something. There are faster implementations. Use those
+ * if you need to run it for large files. */
+static size_t ZSTD_eDist_levenshteinDist(const BYTE* const s,
+ size_t const sn, const BYTE* const t,
+ size_t const tn)
+{
+ size_t a, b, c;
+
+ if (!sn)
+ return tn;
+ if (!tn)
+ return sn;
+
+ if (s[sn - 1] == t[tn - 1])
+ return ZSTD_eDist_levenshteinDist(
+ s, sn - 1, t, tn - 1);
+
+ a = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn - 1);
+ b = ZSTD_eDist_levenshteinDist(s, sn, t, tn - 1);
+ c = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn);
+
+ if (a > b)
+ a = b;
+ if (a > c)
+ a = c;
+
+ return a + 1;
+}
+
+static void ZSTD_eDist_validateMatches(ZSTD_eDist_match* matches,
+ size_t const nbMatches, const BYTE* const dict,
+ size_t const dictSize, const BYTE* const src,
+ size_t const srcSize)
+{
+ size_t i;
+ for (i = 0; i < nbMatches; i++) {
+ ZSTD_eDist_match match = matches[i];
+ U32 const dictIdx = match.dictIdx;
+ U32 const srcIdx = match.srcIdx;
+ U32 const matchLength = match.matchLength;
+
+ assert(dictIdx + matchLength < dictSize);
+ assert(srcIdx + matchLength < srcSize);
+ assert(!memcmp(dict + dictIdx, src + srcIdx, matchLength));
+ }
+}
+
+/*-*************************************
+* API
+***************************************/
+
+size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences,
+ const void* dict, size_t dictSize,
+ const void* src, size_t srcSize,
+ int useHeuristics)
+{
+ size_t const nbDiags = dictSize + srcSize + 3;
+ S32* buffer = ZSTD_malloc(nbDiags * 2 * sizeof(S32), ZSTD_defaultCMem);
+ ZSTD_eDist_state state;
+ size_t nbSequences = 0;
+
+ state.dict = (const BYTE*)dict;
+ state.src = (const BYTE*)src;
+ state.dictSize = dictSize;
+ state.srcSize = srcSize;
+ state.forwardDiag = buffer;
+ state.backwardDiag = buffer + nbDiags;
+ state.forwardDiag += srcSize + 1;
+ state.backwardDiag += srcSize + 1;
+ state.matches = ZSTD_malloc(srcSize * sizeof(ZSTD_eDist_match), ZSTD_defaultCMem);
+ state.nbMatches = 0;
+
+ ZSTD_eDist_compare(&state, 0, dictSize, 0, srcSize, 1);
+ ZSTD_eDist_combineMatches(&state);
+ nbSequences = ZSTD_eDist_convertMatchesToSequences(sequences, &state);
+
+ ZSTD_free(buffer, ZSTD_defaultCMem);
+ ZSTD_free(state.matches, ZSTD_defaultCMem);
+
+ return nbSequences;
+}