summaryrefslogtreecommitdiffstats
path: root/web/server/h2o/libh2o/misc/oktavia/src/sais.jsx
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--web/server/h2o/libh2o/misc/oktavia/src/sais.jsx250
1 files changed, 250 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/misc/oktavia/src/sais.jsx b/web/server/h2o/libh2o/misc/oktavia/src/sais.jsx
new file mode 100644
index 00000000..9d8fa8fb
--- /dev/null
+++ b/web/server/h2o/libh2o/misc/oktavia/src/sais.jsx
@@ -0,0 +1,250 @@
+/* Original source code:
+ * G. Nong, S. Zhang and W. H. Chan, Two Efficient Algorithms for Linear Time Suffix Array Construction, IEEE Transactions on Computers, To Appear
+ * http://www.cs.sysu.edu.cn/nong/index.files/Two%20Efficient%20Algorithms%20for%20Linear%20Suffix%20Array%20Construction.pdf
+ */
+
+import "bit-vector.jsx";
+
+class OArray
+{
+ var offset : int;
+ var array : int[];
+
+ function constructor (array : int[])
+ {
+ this.array = array;
+ this.offset = 0;
+ }
+
+ function constructor (array : int[], offset : int)
+ {
+ this.array = array;
+ this.offset = offset;
+ }
+
+ function get (index : int) : int
+ {
+ return this.array[index + this.offset];
+ }
+
+ function set (index : int, value : int) : void
+ {
+ this.array[index + this.offset] = value;
+ }
+
+ function isS (index : int) : boolean
+ {
+ return this.array[index + this.offset] < this.array[index + this.offset + 1];
+ }
+
+ function compare (index1 : int, index2 : int) : boolean
+ {
+ return this.array[index1 + this.offset] == this.array[index2 + this.offset];
+ }
+}
+
+
+class SAIS
+{
+ static function _isLMS (t : BitVector, i : int) : boolean
+ {
+ return i > 0 && t.get(i) && !t.get(i - 1);
+ }
+
+ // find the start or end of each bucket
+ static function _getBuckets(s : OArray, bkt : int[], n : int, K : int, end : boolean) : void
+ {
+ var sum = 0;
+ for (var i = 0; i <= K; i++)
+ {
+ bkt[i] = 0; // clear all buckets
+ }
+ for (var i = 0; i < n; i++)
+ {
+ bkt[s.get(i)]++; // compute the size of each bucket
+ }
+ for (var i = 0; i <= K; i++)
+ {
+ sum += bkt[i];
+ bkt[i] = end ? sum : sum - bkt[i];
+ }
+ }
+
+ // compute SAl
+ static function _induceSAl(t : BitVector, SA : int[], s : OArray, bkt : int[], n : int, K : int, end : boolean) : void
+ {
+ SAIS._getBuckets(s, bkt, n, K, end); // find starts of buckets
+ for (var i = 0; i < n; i++)
+ {
+ var j = SA[i] - 1;
+ if (j >= 0 && !t.get(j))
+ {
+ SA[bkt[s.get(j)]++] = j;
+ }
+ }
+ }
+
+ // compute SAs
+ static function _induceSAs(t : BitVector, SA : int[], s : OArray, bkt : int[], n : int, K : int, end : boolean) : void
+ {
+ SAIS._getBuckets(s, bkt, n, K, end); // find ends of buckets
+ for (var i = n - 1; i >= 0; i--)
+ {
+ var j = SA[i] - 1;
+ if (j >=0 && t.get(j))
+ {
+ SA[--bkt[s.get(j)]] = j;
+ }
+ }
+ }
+
+ // find the suffix array SA of s[0..n-1] in {1..K}^n
+ // require s[n-1]=0 (the sentinel!), n>=2
+ // use a working space (excluding s and SA) of at most 2.25n+O(1) for a constant alphabet
+
+ static function make(source : string) : int[]
+ {
+ var charCodes = [] : int[];
+ charCodes.length = source.length;
+ var maxCode = 0;
+ for (var i = 0; i < source.length; i++)
+ {
+ var code = source.charCodeAt(i);
+ charCodes[i] = code;
+ maxCode = (code > maxCode) ? code : maxCode;
+ }
+ var SA = [] : int[];
+ SA.length = source.length;
+ var s = new OArray(charCodes);
+ SAIS._make(s, SA, source.length, maxCode);
+ return SA;
+ }
+
+ static function _make(s : OArray, SA : int[], n : int, K : int) : void
+ {
+ // Classify the type of each character
+ var t = new BitVector();
+ t.set(n - 2, false);
+ t.set(n - 1, true); // the sentinel must be in s1, important!!!
+ for (var i = n - 3; i >= 0; i--)
+ {
+ t.set(i, (s.isS(i) || (s.compare(i, i + 1) && t.get(i + 1))));
+ }
+
+ // stage 1: reduce the problem by at least 1/2
+ // sort all the S-substrings
+ var bkt = [] : int[];
+ bkt.length = K + 1;
+ SAIS._getBuckets(s, bkt, n, K, true); // find ends of buckets
+ for (var i = 0; i < n; i++)
+ {
+ SA[i] = -1;
+ }
+ for (var i = 1; i < n; i++)
+ {
+ if (SAIS._isLMS(t, i))
+ {
+ SA[--bkt[s.get(i)]] = i;
+ }
+ }
+ SAIS._induceSAl(t, SA, s, bkt, n, K, false);
+ SAIS._induceSAs(t, SA, s, bkt, n, K, true);
+ // compact all the sorted substrings into the first n1 items of SA
+ // 2*n1 must be not larger than n (proveable)
+ var n1 = 0;
+ for (var i = 0; i < n; i++)
+ {
+ if (SAIS._isLMS(t, SA[i]))
+ {
+ SA[n1++] = SA[i];
+ }
+ }
+
+ // find the lexicographic names of all substrings
+ for (var i = n1; i < n; i++)
+ {
+ SA[i]=-1; // init the name array buffer
+ }
+ var name = 0;
+ var prev = -1;
+ for (i = 0; i < n1; i++)
+ {
+ var pos = SA[i];
+ var diff = false;
+ for (var d = 0; d < n; d++)
+ {
+ if (prev == -1 || !s.compare(pos + d, prev + d) || t.get(pos + d) != t.get(prev + d))
+ {
+ diff = true;
+ break;
+ }
+ else if (d > 0 && (SAIS._isLMS(t, pos+d) || SAIS._isLMS(t, prev + d)))
+ {
+ break;
+ }
+ }
+ if (diff)
+ {
+ name++;
+ prev = pos;
+ }
+ pos = (pos % 2 == 0) ? pos / 2 : (pos - 1) / 2;
+ SA[n1 + pos] = name - 1;
+ }
+ for (var i = n - 1, j = n - 1; i >= n1; i--)
+ {
+ if (SA[i] >= 0)
+ {
+ SA[j--] = SA[i];
+ }
+ }
+
+ // stage 2: solve the reduced problem
+ // recurse if names are not yet unique
+ var SA1 = SA;
+ var s1 = new OArray(SA, n - n1);
+
+ if (name < n1)
+ {
+ SAIS._make(s1, SA1, n1, name - 1);
+ }
+ else
+ {
+ // generate the suffix array of s1 directly
+ for (i = 0; i < n1; i++)
+ {
+ SA1[s1.get(i)] = i;
+ }
+ }
+
+ // stage 3: induce the result for the original problem
+
+ bkt = [] : int[];
+ bkt.length = K + 1;
+ // put all left-most S characters into their buckets
+ SAIS._getBuckets(s, bkt, n, K, true); // find ends of buckets
+ for (i = 1, j = 0; i < n; i++)
+ {
+ if (SAIS._isLMS(t, i))
+ {
+ s1.set(j++, i); // get p1
+ }
+ }
+ for (i = 0; i < n1; i++)
+ {
+ SA1[i] = s1.get(SA1[i]); // get index in s
+ }
+ for (i = n1; i < n; i++)
+ {
+ SA[i] = -1; // init SA[n1..n-1]
+ }
+ for (i = n1 - 1; i >= 0; i--)
+ {
+ j = SA[i];
+ SA[i] = -1;
+ SA[--bkt[s.get(j)]] = j;
+ }
+ SAIS._induceSAl(t, SA, s, bkt, n, K, false);
+ SAIS._induceSAs(t, SA, s, bkt, n, K, true);
+ }
+}