diff options
Diffstat (limited to 'comm/mailnews/db/gloda/modules/SuffixTree.jsm')
-rw-r--r-- | comm/mailnews/db/gloda/modules/SuffixTree.jsm | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/comm/mailnews/db/gloda/modules/SuffixTree.jsm b/comm/mailnews/db/gloda/modules/SuffixTree.jsm new file mode 100644 index 0000000000..239993e180 --- /dev/null +++ b/comm/mailnews/db/gloda/modules/SuffixTree.jsm @@ -0,0 +1,381 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"]; + +/** + * Given a list of strings and a corresponding map of items that those strings + * correspond to, build a suffix tree. + */ +function MultiSuffixTree(aStrings, aItems) { + if (aStrings.length != aItems.length) { + throw new Error("Array lengths need to be the same."); + } + + let s = ""; + let offsetsToItems = []; + let lastLength = 0; + for (let i = 0; i < aStrings.length; i++) { + s += aStrings[i]; + offsetsToItems.push(lastLength, s.length, aItems[i]); + lastLength = s.length; + } + + this._construct(s); + this._offsetsToItems = offsetsToItems; + this._numItems = aItems.length; +} + +/** + * @class + */ +function State(aStartIndex, aEndIndex, aSuffix) { + this.start = aStartIndex; + this.end = aEndIndex; + this.suffix = aSuffix; +} + +/** + * Since objects are basically hash-tables anyways, we simply create an + * attribute whose name is the first letter of the edge string. (So, the + * edge string can conceptually be a multi-letter string, but since we would + * split it were there any ambiguity, it's okay to just use the single letter.) + * This avoids having to update the attribute name or worry about tripping our + * implementation up. + */ +State.prototype = { + get isExplicit() { + // our end is not inclusive... + return this.end <= this.start; + }, + get isImplicit() { + // our end is not inclusive... + return this.end > this.start; + }, + + get length() { + return this.end - this.start; + }, + + toString() { + return ( + "[Start: " + + this.start + + " End: " + + this.end + + (this.suffix ? " non-null suffix]" : " null suffix]") + ); + }, +}; + +/** + * Suffix tree implemented using Ukkonen's algorithm. + * + * @class + */ +function SuffixTree(aStr) { + this._construct(aStr); +} + +/** + * States are + */ +SuffixTree.prototype = { + /** + * Find all items matching the provided substring. + */ + findMatches(aSubstring) { + let results = []; + let state = this._root; + let index = 0; + let end = aSubstring.length; + while (index < end) { + state = state[aSubstring[index]]; + // bail if there was no edge + if (state === undefined) { + return results; + } + // bail if the portion of the edge we traversed is not equal to that + // portion of our pattern + let actualTraverseLength = Math.min(state.length, end - index); + if ( + this._str.substring(state.start, state.start + actualTraverseLength) != + aSubstring.substring(index, index + actualTraverseLength) + ) { + return results; + } + index += state.length; + } + + // state should now be the node which itself and all its children match... + // The delta is to adjust us to the offset of the last letter of our match; + // the edge we traversed to get here may have found us traversing more + // than we wanted. + // index - end captures the over-shoot of the edge traversal, + // index - end + 1 captures the fact that we want to find the last letter + // that matched, not just the first letter beyond it + // However, if this state is a leaf node (end == 'infinity'), then 'end' + // isn't describing an edge at all and we want to avoid accounting for it. + let delta; + /* + if (state.end != this._infinity) + //delta = index - end + 1; + delta = end - (index - state.length); + else */ + delta = index - state.length - end + 1; + + this._resultGather(state, results, {}, end, delta, true); + return results; + }, + + _resultGather( + aState, + aResults, + aPresence, + aPatLength, + aDelta, + alreadyAdjusted + ) { + // find the item that this state originated from based on the state's + // start character. offsetToItem holds [string start index, string end + // index (exclusive), item reference]. So we want to binary search to + // find the string whose start/end index contains the state's start index. + let low = 0; + let high = this._numItems - 1; + let mid, stringStart, stringEnd; + + let patternLast = aState.start - aDelta; + while (low <= high) { + mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums + stringStart = this._offsetsToItems[mid * 3]; + let startDelta = stringStart - patternLast; + stringEnd = this._offsetsToItems[mid * 3 + 1]; + let endDelta = stringEnd - patternLast; + if (startDelta > 0) { + high = mid - 1; + } else if (endDelta <= 0) { + low = mid + 1; + } else { + break; + } + } + + // - The match occurred completely inside a source string. Success. + // - The match spans more than one source strings, and is therefore not + // a match. + + // at this point, we have located the origin string that corresponds to the + // start index of this state. + // - The match terminated with the end of the preceding string, and does + // not match us at all. We, and potentially our children, are merely + // serving as a unique terminal. + // - The + + let patternFirst = patternLast - (aPatLength - 1); + + if (patternFirst >= stringStart) { + if (!(stringStart in aPresence)) { + aPresence[stringStart] = true; + aResults.push(this._offsetsToItems[mid * 3 + 2]); + } + } + + // bail if we had it coming OR + // if the result terminates at/part-way through this state, meaning any + // of its children are not going to be actual results, just hangers + // on. + /* + if (bail || (end <= aState.end)) { +dump(" bailing! (bail was: " + bail + ")\n"); + return; + } +*/ + // process our children... + for (let key in aState) { + // edges have attributes of length 1... + if (key.length == 1) { + let statePrime = aState[key]; + this._resultGather( + statePrime, + aResults, + aPresence, + aPatLength, + aDelta + aState.length, // (alreadyAdjusted ? 0 : aState.length), + false + ); + } + } + }, + + /** + * Given a reference 'pair' of a state and a string (may be 'empty'=explicit, + * which means no work to do and we return immediately) follow that state + * (and then the successive states)'s transitions until we run out of + * transitions. This happens either when we find an explicit state, or + * find ourselves partially along an edge (conceptually speaking). In + * the partial case, we return the state prior to the edge traversal. + * (The information about the 'edge' is contained on its target State; + * we can do this because a state is only referenced by one other state.) + */ + _canonize(aState, aStart, aEnd) { + if (aEnd <= aStart) { + return [aState, aStart]; + } + + let statePrime; + // we treat an aState of null as 'bottom', which has transitions for every + // letter in the alphabet to 'root'. rather than create all those + // transitions, we special-case here. + if (aState === null) { + statePrime = this._root; + } else { + statePrime = aState[this._str[aStart]]; + } + while (statePrime.length <= aEnd - aStart) { + // (no 1 adjustment required) + aStart += statePrime.length; + aState = statePrime; + if (aStart < aEnd) { + statePrime = aState[this._str[aStart]]; + } + } + return [aState, aStart]; + }, + + /** + * Given a reference 'pair' whose state may or may not be explicit (and for + * which we will perform the required splitting to make it explicit), test + * whether it already possesses a transition corresponding to the provided + * character. + * + * @returns A list of: whether we had to make it explicit, the (potentially) + * new explicit state. + */ + _testAndSplit(aState, aStart, aEnd, aChar) { + if (aStart < aEnd) { + // it's not explicit + let statePrime = aState[this._str[aStart]]; + let length = aEnd - aStart; + if (aChar == this._str[statePrime.start + length]) { + return [true, aState]; + } + + // do splitting... aState -> rState -> statePrime + let rState = new State(statePrime.start, statePrime.start + length); + aState[this._str[statePrime.start]] = rState; + statePrime.start += length; + rState[this._str[statePrime.start]] = statePrime; + return [false, rState]; + } + + // it's already explicit + if (aState === null) { + // bottom case... shouldn't happen, but hey. + return [true, aState]; + } + return [aChar in aState, aState]; + }, + + _update(aState, aStart, aIndex) { + let oldR = this._root; + let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...) + // because of the way we store the 'end' value as a one-past form, we do + // not need to subtract 1 off of aIndex. + let [endPoint, rState] = this._testAndSplit( + aState, + aStart, + aIndex, // no -1 + textAtIndex + ); + while (!endPoint) { + let rPrime = new State(aIndex, this._infinity); + rState[textAtIndex] = rPrime; + if (oldR !== this._root) { + oldR.suffix = rState; + } + oldR = rState; + [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1 + [endPoint, rState] = this._testAndSplit( + aState, + aStart, + aIndex, // no -1 + textAtIndex + ); + } + if (oldR !== this._root) { + oldR.suffix = aState; + } + + return [aState, aStart]; + }, + + _construct(aStr) { + this._str = aStr; + // just needs to be longer than the string. + this._infinity = aStr.length + 1; + + // this._bottom = new State(0, -1, null); + this._root = new State(-1, 0, null); // null === bottom + let state = this._root; + let start = 0; + + for (let i = 0; i < aStr.length; i++) { + [state, start] = this._update(state, start, i); // treat as flowing -1... + [state, start] = this._canonize(state, start, i + 1); // 1-length string + } + }, + + dump(aState, aIndent, aKey) { + if (aState === undefined) { + aState = this._root; + } + if (aIndent === undefined) { + aIndent = ""; + aKey = "."; + } + + if (aState.isImplicit) { + let snip; + if (aState.length > 10) { + snip = + this._str.slice( + aState.start, + Math.min(aState.start + 10, this._str.length) + ) + "..."; + } else { + snip = this._str.slice( + aState.start, + Math.min(aState.end, this._str.length) + ); + } + dump( + aIndent + + aKey + + ":" + + snip + + "(" + + aState.start + + ":" + + aState.end + + ")\n" + ); + } else { + dump( + aIndent + + aKey + + ": (explicit:" + + aState.start + + ":" + + aState.end + + ")\n" + ); + } + let nextIndent = aIndent + " "; + let keys = Object.keys(aState).filter(c => c.length == 1); + for (let key of keys) { + this.dump(aState[key], nextIndent, key); + } + }, +}; +MultiSuffixTree.prototype = SuffixTree.prototype; |