summaryrefslogtreecommitdiffstats
path: root/comm/mailnews/db/gloda/modules/SuffixTree.jsm
diff options
context:
space:
mode:
Diffstat (limited to 'comm/mailnews/db/gloda/modules/SuffixTree.jsm')
-rw-r--r--comm/mailnews/db/gloda/modules/SuffixTree.jsm381
1 files changed, 381 insertions, 0 deletions
diff --git a/comm/mailnews/db/gloda/modules/SuffixTree.jsm b/comm/mailnews/db/gloda/modules/SuffixTree.jsm
new file mode 100644
index 0000000000..239993e180
--- /dev/null
+++ b/comm/mailnews/db/gloda/modules/SuffixTree.jsm
@@ -0,0 +1,381 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"];
+
+/**
+ * Given a list of strings and a corresponding map of items that those strings
+ * correspond to, build a suffix tree.
+ */
+function MultiSuffixTree(aStrings, aItems) {
+ if (aStrings.length != aItems.length) {
+ throw new Error("Array lengths need to be the same.");
+ }
+
+ let s = "";
+ let offsetsToItems = [];
+ let lastLength = 0;
+ for (let i = 0; i < aStrings.length; i++) {
+ s += aStrings[i];
+ offsetsToItems.push(lastLength, s.length, aItems[i]);
+ lastLength = s.length;
+ }
+
+ this._construct(s);
+ this._offsetsToItems = offsetsToItems;
+ this._numItems = aItems.length;
+}
+
+/**
+ * @class
+ */
+function State(aStartIndex, aEndIndex, aSuffix) {
+ this.start = aStartIndex;
+ this.end = aEndIndex;
+ this.suffix = aSuffix;
+}
+
+/**
+ * Since objects are basically hash-tables anyways, we simply create an
+ * attribute whose name is the first letter of the edge string. (So, the
+ * edge string can conceptually be a multi-letter string, but since we would
+ * split it were there any ambiguity, it's okay to just use the single letter.)
+ * This avoids having to update the attribute name or worry about tripping our
+ * implementation up.
+ */
+State.prototype = {
+ get isExplicit() {
+ // our end is not inclusive...
+ return this.end <= this.start;
+ },
+ get isImplicit() {
+ // our end is not inclusive...
+ return this.end > this.start;
+ },
+
+ get length() {
+ return this.end - this.start;
+ },
+
+ toString() {
+ return (
+ "[Start: " +
+ this.start +
+ " End: " +
+ this.end +
+ (this.suffix ? " non-null suffix]" : " null suffix]")
+ );
+ },
+};
+
+/**
+ * Suffix tree implemented using Ukkonen's algorithm.
+ *
+ * @class
+ */
+function SuffixTree(aStr) {
+ this._construct(aStr);
+}
+
+/**
+ * States are
+ */
+SuffixTree.prototype = {
+ /**
+ * Find all items matching the provided substring.
+ */
+ findMatches(aSubstring) {
+ let results = [];
+ let state = this._root;
+ let index = 0;
+ let end = aSubstring.length;
+ while (index < end) {
+ state = state[aSubstring[index]];
+ // bail if there was no edge
+ if (state === undefined) {
+ return results;
+ }
+ // bail if the portion of the edge we traversed is not equal to that
+ // portion of our pattern
+ let actualTraverseLength = Math.min(state.length, end - index);
+ if (
+ this._str.substring(state.start, state.start + actualTraverseLength) !=
+ aSubstring.substring(index, index + actualTraverseLength)
+ ) {
+ return results;
+ }
+ index += state.length;
+ }
+
+ // state should now be the node which itself and all its children match...
+ // The delta is to adjust us to the offset of the last letter of our match;
+ // the edge we traversed to get here may have found us traversing more
+ // than we wanted.
+ // index - end captures the over-shoot of the edge traversal,
+ // index - end + 1 captures the fact that we want to find the last letter
+ // that matched, not just the first letter beyond it
+ // However, if this state is a leaf node (end == 'infinity'), then 'end'
+ // isn't describing an edge at all and we want to avoid accounting for it.
+ let delta;
+ /*
+ if (state.end != this._infinity)
+ //delta = index - end + 1;
+ delta = end - (index - state.length);
+ else */
+ delta = index - state.length - end + 1;
+
+ this._resultGather(state, results, {}, end, delta, true);
+ return results;
+ },
+
+ _resultGather(
+ aState,
+ aResults,
+ aPresence,
+ aPatLength,
+ aDelta,
+ alreadyAdjusted
+ ) {
+ // find the item that this state originated from based on the state's
+ // start character. offsetToItem holds [string start index, string end
+ // index (exclusive), item reference]. So we want to binary search to
+ // find the string whose start/end index contains the state's start index.
+ let low = 0;
+ let high = this._numItems - 1;
+ let mid, stringStart, stringEnd;
+
+ let patternLast = aState.start - aDelta;
+ while (low <= high) {
+ mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
+ stringStart = this._offsetsToItems[mid * 3];
+ let startDelta = stringStart - patternLast;
+ stringEnd = this._offsetsToItems[mid * 3 + 1];
+ let endDelta = stringEnd - patternLast;
+ if (startDelta > 0) {
+ high = mid - 1;
+ } else if (endDelta <= 0) {
+ low = mid + 1;
+ } else {
+ break;
+ }
+ }
+
+ // - The match occurred completely inside a source string. Success.
+ // - The match spans more than one source strings, and is therefore not
+ // a match.
+
+ // at this point, we have located the origin string that corresponds to the
+ // start index of this state.
+ // - The match terminated with the end of the preceding string, and does
+ // not match us at all. We, and potentially our children, are merely
+ // serving as a unique terminal.
+ // - The
+
+ let patternFirst = patternLast - (aPatLength - 1);
+
+ if (patternFirst >= stringStart) {
+ if (!(stringStart in aPresence)) {
+ aPresence[stringStart] = true;
+ aResults.push(this._offsetsToItems[mid * 3 + 2]);
+ }
+ }
+
+ // bail if we had it coming OR
+ // if the result terminates at/part-way through this state, meaning any
+ // of its children are not going to be actual results, just hangers
+ // on.
+ /*
+ if (bail || (end <= aState.end)) {
+dump(" bailing! (bail was: " + bail + ")\n");
+ return;
+ }
+*/
+ // process our children...
+ for (let key in aState) {
+ // edges have attributes of length 1...
+ if (key.length == 1) {
+ let statePrime = aState[key];
+ this._resultGather(
+ statePrime,
+ aResults,
+ aPresence,
+ aPatLength,
+ aDelta + aState.length, // (alreadyAdjusted ? 0 : aState.length),
+ false
+ );
+ }
+ }
+ },
+
+ /**
+ * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
+ * which means no work to do and we return immediately) follow that state
+ * (and then the successive states)'s transitions until we run out of
+ * transitions. This happens either when we find an explicit state, or
+ * find ourselves partially along an edge (conceptually speaking). In
+ * the partial case, we return the state prior to the edge traversal.
+ * (The information about the 'edge' is contained on its target State;
+ * we can do this because a state is only referenced by one other state.)
+ */
+ _canonize(aState, aStart, aEnd) {
+ if (aEnd <= aStart) {
+ return [aState, aStart];
+ }
+
+ let statePrime;
+ // we treat an aState of null as 'bottom', which has transitions for every
+ // letter in the alphabet to 'root'. rather than create all those
+ // transitions, we special-case here.
+ if (aState === null) {
+ statePrime = this._root;
+ } else {
+ statePrime = aState[this._str[aStart]];
+ }
+ while (statePrime.length <= aEnd - aStart) {
+ // (no 1 adjustment required)
+ aStart += statePrime.length;
+ aState = statePrime;
+ if (aStart < aEnd) {
+ statePrime = aState[this._str[aStart]];
+ }
+ }
+ return [aState, aStart];
+ },
+
+ /**
+ * Given a reference 'pair' whose state may or may not be explicit (and for
+ * which we will perform the required splitting to make it explicit), test
+ * whether it already possesses a transition corresponding to the provided
+ * character.
+ *
+ * @returns A list of: whether we had to make it explicit, the (potentially)
+ * new explicit state.
+ */
+ _testAndSplit(aState, aStart, aEnd, aChar) {
+ if (aStart < aEnd) {
+ // it's not explicit
+ let statePrime = aState[this._str[aStart]];
+ let length = aEnd - aStart;
+ if (aChar == this._str[statePrime.start + length]) {
+ return [true, aState];
+ }
+
+ // do splitting... aState -> rState -> statePrime
+ let rState = new State(statePrime.start, statePrime.start + length);
+ aState[this._str[statePrime.start]] = rState;
+ statePrime.start += length;
+ rState[this._str[statePrime.start]] = statePrime;
+ return [false, rState];
+ }
+
+ // it's already explicit
+ if (aState === null) {
+ // bottom case... shouldn't happen, but hey.
+ return [true, aState];
+ }
+ return [aChar in aState, aState];
+ },
+
+ _update(aState, aStart, aIndex) {
+ let oldR = this._root;
+ let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
+ // because of the way we store the 'end' value as a one-past form, we do
+ // not need to subtract 1 off of aIndex.
+ let [endPoint, rState] = this._testAndSplit(
+ aState,
+ aStart,
+ aIndex, // no -1
+ textAtIndex
+ );
+ while (!endPoint) {
+ let rPrime = new State(aIndex, this._infinity);
+ rState[textAtIndex] = rPrime;
+ if (oldR !== this._root) {
+ oldR.suffix = rState;
+ }
+ oldR = rState;
+ [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
+ [endPoint, rState] = this._testAndSplit(
+ aState,
+ aStart,
+ aIndex, // no -1
+ textAtIndex
+ );
+ }
+ if (oldR !== this._root) {
+ oldR.suffix = aState;
+ }
+
+ return [aState, aStart];
+ },
+
+ _construct(aStr) {
+ this._str = aStr;
+ // just needs to be longer than the string.
+ this._infinity = aStr.length + 1;
+
+ // this._bottom = new State(0, -1, null);
+ this._root = new State(-1, 0, null); // null === bottom
+ let state = this._root;
+ let start = 0;
+
+ for (let i = 0; i < aStr.length; i++) {
+ [state, start] = this._update(state, start, i); // treat as flowing -1...
+ [state, start] = this._canonize(state, start, i + 1); // 1-length string
+ }
+ },
+
+ dump(aState, aIndent, aKey) {
+ if (aState === undefined) {
+ aState = this._root;
+ }
+ if (aIndent === undefined) {
+ aIndent = "";
+ aKey = ".";
+ }
+
+ if (aState.isImplicit) {
+ let snip;
+ if (aState.length > 10) {
+ snip =
+ this._str.slice(
+ aState.start,
+ Math.min(aState.start + 10, this._str.length)
+ ) + "...";
+ } else {
+ snip = this._str.slice(
+ aState.start,
+ Math.min(aState.end, this._str.length)
+ );
+ }
+ dump(
+ aIndent +
+ aKey +
+ ":" +
+ snip +
+ "(" +
+ aState.start +
+ ":" +
+ aState.end +
+ ")\n"
+ );
+ } else {
+ dump(
+ aIndent +
+ aKey +
+ ": (explicit:" +
+ aState.start +
+ ":" +
+ aState.end +
+ ")\n"
+ );
+ }
+ let nextIndent = aIndent + " ";
+ let keys = Object.keys(aState).filter(c => c.length == 1);
+ for (let key of keys) {
+ this.dump(aState[key], nextIndent, key);
+ }
+ },
+};
+MultiSuffixTree.prototype = SuffixTree.prototype;