1 files changed, 381 insertions, 0 deletions
diff --git a/comm/mailnews/db/gloda/modules/SuffixTree.jsm b/comm/mailnews/db/gloda/modules/SuffixTree.jsm
new file mode 100644
index 0000000000..239993e180
--- /dev/null
+++ b/comm/mailnews/db/gloda/modules/SuffixTree.jsm
@@ -0,0 +1,381 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"];
+
+/**
+ * Given a list of strings and a corresponding map of items that those strings
+ *  correspond to, build a suffix tree.
+ */
+function MultiSuffixTree(aStrings, aItems) {
+  if (aStrings.length != aItems.length) {
+    throw new Error("Array lengths need to be the same.");
+  }
+
+  let s = "";
+  let offsetsToItems = [];
+  let lastLength = 0;
+  for (let i = 0; i < aStrings.length; i++) {
+    s += aStrings[i];
+    offsetsToItems.push(lastLength, s.length, aItems[i]);
+    lastLength = s.length;
+  }
+
+  this._construct(s);
+  this._offsetsToItems = offsetsToItems;
+  this._numItems = aItems.length;
+}
+
+/**
+ * @class
+ */
+function State(aStartIndex, aEndIndex, aSuffix) {
+  this.start = aStartIndex;
+  this.end = aEndIndex;
+  this.suffix = aSuffix;
+}
+
+/**
+ * Since objects are basically hash-tables anyways, we simply create an
+ *  attribute whose name is the first letter of the edge string.  (So, the
+ *  edge string can conceptually be a multi-letter string, but since we would
+ *  split it were there any ambiguity, it's okay to just use the single letter.)
+ *  This avoids having to update the attribute name or worry about tripping our
+ *  implementation up.
+ */
+State.prototype = {
+  get isExplicit() {
+    // our end is not inclusive...
+    return this.end <= this.start;
+  },
+  get isImplicit() {
+    // our end is not inclusive...
+    return this.end > this.start;
+  },
+
+  get length() {
+    return this.end - this.start;
+  },
+
+  toString() {
+    return (
+      "[Start: " +
+      this.start +
+      " End: " +
+      this.end +
+      (this.suffix ? " non-null suffix]" : " null suffix]")
+    );
+  },
+};
+
+/**
+ * Suffix tree implemented using Ukkonen's algorithm.
+ *
+ * @class
+ */
+function SuffixTree(aStr) {
+  this._construct(aStr);
+}
+
+/**
+ * States are
+ */
+SuffixTree.prototype = {
+  /**
+   * Find all items matching the provided substring.
+   */
+  findMatches(aSubstring) {
+    let results = [];
+    let state = this._root;
+    let index = 0;
+    let end = aSubstring.length;
+    while (index < end) {
+      state = state[aSubstring[index]];
+      // bail if there was no edge
+      if (state === undefined) {
+        return results;
+      }
+      // bail if the portion of the edge we traversed is not equal to that
+      //  portion of our pattern
+      let actualTraverseLength = Math.min(state.length, end - index);
+      if (
+        this._str.substring(state.start, state.start + actualTraverseLength) !=
+        aSubstring.substring(index, index + actualTraverseLength)
+      ) {
+        return results;
+      }
+      index += state.length;
+    }
+
+    // state should now be the node which itself and all its children match...
+    // The delta is to adjust us to the offset of the last letter of our match;
+    //  the edge we traversed to get here may have found us traversing more
+    //  than we wanted.
+    // index - end captures the over-shoot of the edge traversal,
+    // index - end + 1 captures the fact that we want to find the last letter
+    //  that matched, not just the first letter beyond it
+    // However, if this state is a leaf node (end == 'infinity'), then 'end'
+    //  isn't describing an edge at all and we want to avoid accounting for it.
+    let delta;
+    /*
+    if (state.end != this._infinity)
+      //delta = index - end + 1;
+      delta = end - (index - state.length);
+    else */
+    delta = index - state.length - end + 1;
+
+    this._resultGather(state, results, {}, end, delta, true);
+    return results;
+  },
+
+  _resultGather(
+    aState,
+    aResults,
+    aPresence,
+    aPatLength,
+    aDelta,
+    alreadyAdjusted
+  ) {
+    // find the item that this state originated from based on the state's
+    //  start character.  offsetToItem holds [string start index, string end
+    //  index (exclusive), item reference].  So we want to binary search to
+    //  find the string whose start/end index contains the state's start index.
+    let low = 0;
+    let high = this._numItems - 1;
+    let mid, stringStart, stringEnd;
+
+    let patternLast = aState.start - aDelta;
+    while (low <= high) {
+      mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
+      stringStart = this._offsetsToItems[mid * 3];
+      let startDelta = stringStart - patternLast;
+      stringEnd = this._offsetsToItems[mid * 3 + 1];
+      let endDelta = stringEnd - patternLast;
+      if (startDelta > 0) {
+        high = mid - 1;
+      } else if (endDelta <= 0) {
+        low = mid + 1;
+      } else {
+        break;
+      }
+    }
+
+    // - The match occurred completely inside a source string.  Success.
+    // - The match spans more than one source strings, and is therefore not
+    //   a match.
+
+    // at this point, we have located the origin string that corresponds to the
+    //  start index of this state.
+    // - The match terminated with the end of the preceding string, and does
+    //   not match us at all.  We, and potentially our children, are merely
+    //   serving as a unique terminal.
+    // - The
+
+    let patternFirst = patternLast - (aPatLength - 1);
+
+    if (patternFirst >= stringStart) {
+      if (!(stringStart in aPresence)) {
+        aPresence[stringStart] = true;
+        aResults.push(this._offsetsToItems[mid * 3 + 2]);
+      }
+    }
+
+    // bail if we had it coming OR
+    // if the result terminates at/part-way through this state, meaning any
+    //  of its children are not going to be actual results, just hangers
+    //  on.
+    /*
+    if (bail || (end <= aState.end)) {
+dump("  bailing! (bail was: " + bail + ")\n");
+      return;
+    }
+*/
+    // process our children...
+    for (let key in aState) {
+      // edges have attributes of length 1...
+      if (key.length == 1) {
+        let statePrime = aState[key];
+        this._resultGather(
+          statePrime,
+          aResults,
+          aPresence,
+          aPatLength,
+          aDelta + aState.length, // (alreadyAdjusted ? 0 : aState.length),
+          false
+        );
+      }
+    }
+  },
+
+  /**
+   * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
+   *  which means no work to do and we return immediately) follow that state
+   *  (and then the successive states)'s transitions until we run out of
+   *  transitions.  This happens either when we find an explicit state, or
+   *  find ourselves partially along an edge (conceptually speaking).  In
+   *  the partial case, we return the state prior to the edge traversal.
+   * (The information about the 'edge' is contained on its target State;
+   *  we can do this because a state is only referenced by one other state.)
+   */
+  _canonize(aState, aStart, aEnd) {
+    if (aEnd <= aStart) {
+      return [aState, aStart];
+    }
+
+    let statePrime;
+    // we treat an aState of null as 'bottom', which has transitions for every
+    //  letter in the alphabet to 'root'.  rather than create all those
+    //  transitions, we special-case here.
+    if (aState === null) {
+      statePrime = this._root;
+    } else {
+      statePrime = aState[this._str[aStart]];
+    }
+    while (statePrime.length <= aEnd - aStart) {
+      // (no 1 adjustment required)
+      aStart += statePrime.length;
+      aState = statePrime;
+      if (aStart < aEnd) {
+        statePrime = aState[this._str[aStart]];
+      }
+    }
+    return [aState, aStart];
+  },
+
+  /**
+   * Given a reference 'pair' whose state may or may not be explicit (and for
+   *  which we will perform the required splitting to make it explicit), test
+   *  whether it already possesses a transition corresponding to the provided
+   *  character.
+   *
+   * @returns A list of: whether we had to make it explicit, the (potentially)
+   *    new explicit state.
+   */
+  _testAndSplit(aState, aStart, aEnd, aChar) {
+    if (aStart < aEnd) {
+      // it's not explicit
+      let statePrime = aState[this._str[aStart]];
+      let length = aEnd - aStart;
+      if (aChar == this._str[statePrime.start + length]) {
+        return [true, aState];
+      }
+
+      // do splitting... aState -> rState -> statePrime
+      let rState = new State(statePrime.start, statePrime.start + length);
+      aState[this._str[statePrime.start]] = rState;
+      statePrime.start += length;
+      rState[this._str[statePrime.start]] = statePrime;
+      return [false, rState];
+    }
+
+    // it's already explicit
+    if (aState === null) {
+      // bottom case... shouldn't happen, but hey.
+      return [true, aState];
+    }
+    return [aChar in aState, aState];
+  },
+
+  _update(aState, aStart, aIndex) {
+    let oldR = this._root;
+    let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
+    // because of the way we store the 'end' value as a one-past form, we do
+    //  not need to subtract 1 off of aIndex.
+    let [endPoint, rState] = this._testAndSplit(
+      aState,
+      aStart,
+      aIndex, // no -1
+      textAtIndex
+    );
+    while (!endPoint) {
+      let rPrime = new State(aIndex, this._infinity);
+      rState[textAtIndex] = rPrime;
+      if (oldR !== this._root) {
+        oldR.suffix = rState;
+      }
+      oldR = rState;
+      [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
+      [endPoint, rState] = this._testAndSplit(
+        aState,
+        aStart,
+        aIndex, // no -1
+        textAtIndex
+      );
+    }
+    if (oldR !== this._root) {
+      oldR.suffix = aState;
+    }
+
+    return [aState, aStart];
+  },
+
+  _construct(aStr) {
+    this._str = aStr;
+    // just needs to be longer than the string.
+    this._infinity = aStr.length + 1;
+
+    // this._bottom = new State(0, -1, null);
+    this._root = new State(-1, 0, null); // null === bottom
+    let state = this._root;
+    let start = 0;
+
+    for (let i = 0; i < aStr.length; i++) {
+      [state, start] = this._update(state, start, i); // treat as flowing -1...
+      [state, start] = this._canonize(state, start, i + 1); // 1-length string
+    }
+  },
+
+  dump(aState, aIndent, aKey) {
+    if (aState === undefined) {
+      aState = this._root;
+    }
+    if (aIndent === undefined) {
+      aIndent = "";
+      aKey = ".";
+    }
+
+    if (aState.isImplicit) {
+      let snip;
+      if (aState.length > 10) {
+        snip =
+          this._str.slice(
+            aState.start,
+            Math.min(aState.start + 10, this._str.length)
+          ) + "...";
+      } else {
+        snip = this._str.slice(
+          aState.start,
+          Math.min(aState.end, this._str.length)
+        );
+      }
+      dump(
+        aIndent +
+          aKey +
+          ":" +
+          snip +
+          "(" +
+          aState.start +
+          ":" +
+          aState.end +
+          ")\n"
+      );
+    } else {
+      dump(
+        aIndent +
+          aKey +
+          ": (explicit:" +
+          aState.start +
+          ":" +
+          aState.end +
+          ")\n"
+      );
+    }
+    let nextIndent = aIndent + "  ";
+    let keys = Object.keys(aState).filter(c => c.length == 1);
+    for (let key of keys) {
+      this.dump(aState[key], nextIndent, key);
+    }
+  },
+};
+MultiSuffixTree.prototype = SuffixTree.prototype;