author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:15:26 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:15:26 +0000
commit | 82539ad8d59729fb45b0bb0edda8f2bddb719eb1 (patch) |
tree | 58f0b58e6f44f0e04d4a6373132cf426fa835fa7 /src/index/suffixarray |
parent | Initial commit. (diff) |
download | golang-1.17-82539ad8d59729fb45b0bb0edda8f2bddb719eb1.tar.xz, golang-1.17-82539ad8d59729fb45b0bb0edda8f2bddb719eb1.zip |
Adding upstream version 1.17.13.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/index/suffixarray')
-rw-r--r-- | src/index/suffixarray/example_test.go | 22
-rw-r--r-- | src/index/suffixarray/gen.go | 93
-rw-r--r-- | src/index/suffixarray/sais.go | 899
-rw-r--r-- | src/index/suffixarray/sais2.go | 1741
-rw-r--r-- | src/index/suffixarray/suffixarray.go | 385
-rw-r--r-- | src/index/suffixarray/suffixarray_test.go | 615
6 files changed, 3755 insertions(+), 0 deletions(-)
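For orientation, here is a minimal, runnable sketch of how the package added by this commit is used. It mirrors the ExampleIndex_Lookup test introduced in example_test.go below; the explanatory comments and the Printf formatting are illustrative additions, not part of the patch.

```go
package main

import (
	"fmt"
	"index/suffixarray"
)

func main() {
	// Build the suffix array once over the full text.
	index := suffixarray.New([]byte("banana"))

	// Lookup returns the byte offsets of matches for "ana",
	// in no particular order; -1 means no limit on the number of results.
	offsets := index.Lookup([]byte("ana"), -1)
	for _, off := range offsets {
		fmt.Printf("found %q at byte offset %d\n", "ana", off)
	}
}
```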
diff --git a/src/index/suffixarray/example_test.go b/src/index/suffixarray/example_test.go new file mode 100644 index 0000000..ea10bfd --- /dev/null +++ b/src/index/suffixarray/example_test.go @@ -0,0 +1,22 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package suffixarray_test + +import ( + "fmt" + "index/suffixarray" +) + +func ExampleIndex_Lookup() { + index := suffixarray.New([]byte("banana")) + offsets := index.Lookup([]byte("ana"), -1) + for _, off := range offsets { + fmt.Println(off) + } + + // Unordered output: + // 1 + // 3 +} diff --git a/src/index/suffixarray/gen.go b/src/index/suffixarray/gen.go new file mode 100644 index 0000000..3bc9b1e --- /dev/null +++ b/src/index/suffixarray/gen.go @@ -0,0 +1,93 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore +// +build ignore + +// Gen generates sais2.go by duplicating functions in sais.go +// using different input types. +// See the comment at the top of sais.go for details. +package main + +import ( + "bytes" + "log" + "os" + "strings" +) + +func main() { + log.SetPrefix("gen: ") + log.SetFlags(0) + + data, err := os.ReadFile("sais.go") + if err != nil { + log.Fatal(err) + } + + x := bytes.Index(data, []byte("\n\n")) + if x < 0 { + log.Fatal("cannot find blank line after copyright comment") + } + + var buf bytes.Buffer + buf.Write(data[:x]) + buf.WriteString("\n\n// Code generated by go generate; DO NOT EDIT.\n\npackage suffixarray\n") + + for { + x := bytes.Index(data, []byte("\nfunc ")) + if x < 0 { + break + } + data = data[x:] + p := bytes.IndexByte(data, '(') + if p < 0 { + p = len(data) + } + name := string(data[len("\nfunc "):p]) + + x = bytes.Index(data, []byte("\n}\n")) + if x < 0 { + log.Fatalf("cannot find end of func %s", name) + } + fn := string(data[:x+len("\n}\n")]) + data = data[x+len("\n}"):] + + if strings.HasSuffix(name, "_32") { + buf.WriteString(fix32.Replace(fn)) + } + if strings.HasSuffix(name, "_8_32") { + // x_8_32 -> x_8_64 done above + fn = fix8_32.Replace(stripByteOnly(fn)) + buf.WriteString(fn) + buf.WriteString(fix32.Replace(fn)) + } + } + + if err := os.WriteFile("sais2.go", buf.Bytes(), 0666); err != nil { + log.Fatal(err) + } +} + +var fix32 = strings.NewReplacer( + "32", "64", + "int32", "int64", +) + +var fix8_32 = strings.NewReplacer( + "_8_32", "_32", + "byte", "int32", +) + +func stripByteOnly(s string) string { + lines := strings.SplitAfter(s, "\n") + w := 0 + for _, line := range lines { + if !strings.Contains(line, "256") && !strings.Contains(line, "byte-only") { + lines[w] = line + w++ + } + } + return strings.Join(lines[:w], "") +} diff --git a/src/index/suffixarray/sais.go b/src/index/suffixarray/sais.go new file mode 100644 index 0000000..b4496d2 --- /dev/null +++ b/src/index/suffixarray/sais.go @@ -0,0 +1,899 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Suffix array construction by induced sorting (SAIS). +// See Ge Nong, Sen Zhang, and Wai Hong Chen, +// "Two Efficient Algorithms for Linear Time Suffix Array Construction", +// especially section 3 (https://ieeexplore.ieee.org/document/5582081). +// See also http://zork.net/~st/jottings/sais.html. 
+// +// With optimizations inspired by Yuta Mori's sais-lite +// (https://sites.google.com/site/yuta256/sais). +// +// And with other new optimizations. + +// Many of these functions are parameterized by the sizes of +// the types they operate on. The generator gen.go makes +// copies of these functions for use with other sizes. +// Specifically: +// +// - A function with a name ending in _8_32 takes []byte and []int32 arguments +// and is duplicated into _32_32, _8_64, and _64_64 forms. +// The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64. +// Any lines in the function body that contain the text "byte-only" or "256" +// are stripped when creating _32_32 and _64_64 forms. +// (Those lines are typically 8-bit-specific optimizations.) +// +// - A function with a name ending only in _32 operates on []int32 +// and is duplicated into a _64 form. (Note that it may still take a []byte, +// but there is no need for a version of the function in which the []byte +// is widened to a full integer array.) + +// The overall runtime of this code is linear in the input size: +// it runs a sequence of linear passes to reduce the problem to +// a subproblem at most half as big, invokes itself recursively, +// and then runs a sequence of linear passes to turn the answer +// for the subproblem into the answer for the original problem. +// This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N). +// +// The outline of the code, with the forward and backward scans +// through O(N)-sized arrays called out, is: +// +// sais_I_N +// placeLMS_I_B +// bucketMax_I_B +// freq_I_B +// <scan +text> (1) +// <scan +freq> (2) +// <scan -text, random bucket> (3) +// induceSubL_I_B +// bucketMin_I_B +// freq_I_B +// <scan +text, often optimized away> (4) +// <scan +freq> (5) +// <scan +sa, random text, random bucket> (6) +// induceSubS_I_B +// bucketMax_I_B +// freq_I_B +// <scan +text, often optimized away> (7) +// <scan +freq> (8) +// <scan -sa, random text, random bucket> (9) +// assignID_I_B +// <scan +sa, random text substrings> (10) +// map_B +// <scan -sa> (11) +// recurse_B +// (recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller) +// unmap_I_B +// <scan -text> (12) +// <scan +sa> (13) +// expand_I_B +// bucketMax_I_B +// freq_I_B +// <scan +text, often optimized away> (14) +// <scan +freq> (15) +// <scan -sa, random text, random bucket> (16) +// induceL_I_B +// bucketMin_I_B +// freq_I_B +// <scan +text, often optimized away> (17) +// <scan +freq> (18) +// <scan +sa, random text, random bucket> (19) +// induceS_I_B +// bucketMax_I_B +// freq_I_B +// <scan +text, often optimized away> (20) +// <scan +freq> (21) +// <scan -sa, random text, random bucket> (22) +// +// Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B). +// +// The outline shows there are in general 22 scans through +// O(N)-sized arrays for a given level of the recursion. +// In the top level, operating on 8-bit input text, +// the six freq scans are fixed size (256) instead of potentially +// input-sized. Also, the frequency is counted once and cached +// whenever there is room to do so (there is nearly always room in general, +// and always room at the top level), which eliminates all but +// the first freq_I_B text scans (that is, 5 of the 6). +// So the top level of the recursion only does 22 - 6 - 5 = 11 +// input-sized scans and a typical level does 16 scans. 
+// +// The linear scans do not cost anywhere near as much as +// the random accesses to the text made during a few of +// the scans (specifically #6, #9, #16, #19, #22 marked above). +// In real texts, there is not much but some locality to +// the accesses, due to the repetitive structure of the text +// (the same reason Burrows-Wheeler compression is so effective). +// For random inputs, there is no locality, which makes those +// accesses even more expensive, especially once the text +// no longer fits in cache. +// For example, running on 50 MB of Go source code, induceSubL_8_32 +// (which runs only once, at the top level of the recursion) +// takes 0.44s, while on 50 MB of random input, it takes 2.55s. +// Nearly all the relative slowdown is explained by the text access: +// +// c0, c1 := text[k-1], text[k] +// +// That line runs for 0.23s on the Go text and 2.02s on random text. + +//go:generate go run gen.go + +package suffixarray + +// text_32 returns the suffix array for the input text. +// It requires that len(text) fit in an int32 +// and that the caller zero sa. +func text_32(text []byte, sa []int32) { + if int(int32(len(text))) != len(text) || len(text) != len(sa) { + panic("suffixarray: misuse of text_32") + } + sais_8_32(text, 256, sa, make([]int32, 2*256)) +} + +// sais_8_32 computes the suffix array of text. +// The text must contain only values in [0, textMax). +// The suffix array is stored in sa, which the caller +// must ensure is already zeroed. +// The caller must also provide temporary space tmp +// with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax +// then the algorithm runs a little faster. +// If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return. +func sais_8_32(text []byte, textMax int, sa, tmp []int32) { + if len(sa) != len(text) || len(tmp) < int(textMax) { + panic("suffixarray: misuse of sais_8_32") + } + + // Trivial base cases. Sorting 0 or 1 things is easy. + if len(text) == 0 { + return + } + if len(text) == 1 { + sa[0] = 0 + return + } + + // Establish slices indexed by text character + // holding character frequency and bucket-sort offsets. + // If there's only enough tmp for one slice, + // we make it the bucket offsets and recompute + // the character frequency each time we need it. + var freq, bucket []int32 + if len(tmp) >= 2*textMax { + freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] + freq[0] = -1 // mark as uninitialized + } else { + freq, bucket = nil, tmp[:textMax] + } + + // The SAIS algorithm. + // Each of these calls makes one scan through sa. + // See the individual functions for documentation + // about each's role in the algorithm. + numLMS := placeLMS_8_32(text, sa, freq, bucket) + if numLMS <= 1 { + // 0 or 1 items are already sorted. Do nothing. + } else { + induceSubL_8_32(text, sa, freq, bucket) + induceSubS_8_32(text, sa, freq, bucket) + length_8_32(text, sa, numLMS) + maxID := assignID_8_32(text, sa, numLMS) + if maxID < numLMS { + map_32(sa, numLMS) + recurse_32(sa, tmp, numLMS, maxID) + unmap_8_32(text, sa, numLMS) + } else { + // If maxID == numLMS, then each LMS-substring + // is unique, so the relative ordering of two LMS-suffixes + // is determined by just the leading LMS-substring. + // That is, the LMS-suffix sort order matches the + // (simpler) LMS-substring sort order. + // Copy the original LMS-substring order into the + // suffix array destination. 
+ copy(sa, sa[len(sa)-numLMS:]) + } + expand_8_32(text, freq, bucket, sa, numLMS) + } + induceL_8_32(text, sa, freq, bucket) + induceS_8_32(text, sa, freq, bucket) + + // Mark for caller that we overwrote tmp. + tmp[0] = -1 +} + +// freq_8_32 returns the character frequencies +// for text, as a slice indexed by character value. +// If freq is nil, freq_8_32 uses and returns bucket. +// If freq is non-nil, freq_8_32 assumes that freq[0] >= 0 +// means the frequencies are already computed. +// If the frequency data is overwritten or uninitialized, +// the caller must set freq[0] = -1 to force recomputation +// the next time it is needed. +func freq_8_32(text []byte, freq, bucket []int32) []int32 { + if freq != nil && freq[0] >= 0 { + return freq // already computed + } + if freq == nil { + freq = bucket + } + + freq = freq[:256] // eliminate bounds check for freq[c] below + for i := range freq { + freq[i] = 0 + } + for _, c := range text { + freq[c]++ + } + return freq +} + +// bucketMin_8_32 stores into bucket[c] the minimum index +// in the bucket for character c in a bucket-sort of text. +func bucketMin_8_32(text []byte, freq, bucket []int32) { + freq = freq_8_32(text, freq, bucket) + freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below + bucket = bucket[:256] // eliminate bounds check for bucket[i] below + total := int32(0) + for i, n := range freq { + bucket[i] = total + total += n + } +} + +// bucketMax_8_32 stores into bucket[c] the maximum index +// in the bucket for character c in a bucket-sort of text. +// The bucket indexes for c are [min, max). +// That is, max is one past the final index in that bucket. +func bucketMax_8_32(text []byte, freq, bucket []int32) { + freq = freq_8_32(text, freq, bucket) + freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below + bucket = bucket[:256] // eliminate bounds check for bucket[i] below + total := int32(0) + for i, n := range freq { + total += n + bucket[i] = total + } +} + +// The SAIS algorithm proceeds in a sequence of scans through sa. +// Each of the following functions implements one scan, +// and the functions appear here in the order they execute in the algorithm. + +// placeLMS_8_32 places into sa the indexes of the +// final characters of the LMS substrings of text, +// sorted into the rightmost ends of their correct buckets +// in the suffix array. +// +// The imaginary sentinel character at the end of the text +// is the final character of the final LMS substring, but there +// is no bucket for the imaginary sentinel character, +// which has a smaller value than any real character. +// The caller must therefore pretend that sa[-1] == len(text). +// +// The text indexes of LMS-substring characters are always ≥ 1 +// (the first LMS-substring must be preceded by one or more L-type +// characters that are not part of any LMS-substring), +// so using 0 as a “not present” suffix array entry is safe, +// both in this function and in most later functions +// (until induceL_8_32 below). +func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int { + bucketMax_8_32(text, freq, bucket) + + numLMS := 0 + lastB := int32(-1) + bucket = bucket[:256] // eliminate bounds check for bucket[c1] below + + // The next stanza of code (until the blank line) loop backward + // over text, stopping to execute a code body at each position i + // such that text[i] is an L-character and text[i+1] is an S-character. + // That is, i+1 is the position of the start of an LMS-substring. 
+ // These could be hoisted out into a function with a callback, + // but at a significant speed cost. Instead, we just write these + // seven lines a few times in this source file. The copies below + // refer back to the pattern established by this original as the + // "LMS-substring iterator". + // + // In every scan through the text, c0, c1 are successive characters of text. + // In this backward scan, c0 == text[i] and c1 == text[i+1]. + // By scanning backward, we can keep track of whether the current + // position is type-S or type-L according to the usual definition: + // + // - position len(text) is type S with text[len(text)] == -1 (the sentinel) + // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. + // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. + // + // The backward scan lets us maintain the current type, + // update it when we see c0 != c1, and otherwise leave it alone. + // We want to identify all S positions with a preceding L. + // Position len(text) is one such position by definition, but we have + // nowhere to write it down, so we eliminate it by untruthfully + // setting isTypeS = false at the start of the loop. + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Bucket the index i+1 for the start of an LMS-substring. + b := bucket[c1] - 1 + bucket[c1] = b + sa[b] = int32(i + 1) + lastB = b + numLMS++ + } + } + + // We recorded the LMS-substring starts but really want the ends. + // Luckily, with two differences, the start indexes and the end indexes are the same. + // The first difference is that the rightmost LMS-substring's end index is len(text), + // so the caller must pretend that sa[-1] == len(text), as noted above. + // The second difference is that the first leftmost LMS-substring start index + // does not end an earlier LMS-substring, so as an optimization we can omit + // that leftmost LMS-substring start index (the last one we wrote). + // + // Exception: if numLMS <= 1, the caller is not going to bother with + // the recursion at all and will treat the result as containing LMS-substring starts. + // In that case, we don't remove the final entry. + if numLMS > 1 { + sa[lastB] = 0 + } + return numLMS +} + +// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings +// into sa, assuming that the final characters of the LMS-substrings +// are already inserted into sa, sorted by final character, and at the +// right (not left) end of the corresponding character bucket. +// Each LMS-substring has the form (as a regexp) /S+L+S/: +// one or more S-type, one or more L-type, final S-type. +// induceSubL_8_32 leaves behind only the leftmost L-type text +// index for each LMS-substring. That is, it removes the final S-type +// indexes that are present on entry, and it inserts but then removes +// the interior L-type indexes too. +// (Only the leftmost L-type index is needed by induceSubS_8_32.) +func induceSubL_8_32(text []byte, sa, freq, bucket []int32) { + // Initialize positions for left side of character buckets. + bucketMin_8_32(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 
+ // Because j-1 is type L, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type L from type S. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type S. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type S, at which point it must stop. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing + // only the indexes of the leftmost L-type indexes for each LMS-substring. + // + // The suffix array sa therefore serves simultaneously as input, output, + // and a miraculously well-tailored work queue. + + // placeLMS_8_32 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index: + // we're processing suffixes in sorted order + // and accessing buckets indexed by the + // byte before the sorted order, which still + // has very good locality. + // Invariant: b is cached, possibly dirty copy of bucket[cB]. + cB := c1 + b := bucket[cB] + sa[b] = int32(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + if j < 0 { + // Leave discovered type-S index for caller. + sa[i] = int32(-j) + continue + } + sa[i] = 0 + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + k := j - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int32(k) + b++ + } +} + +// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings +// into sa, assuming that the leftmost L-type text indexes are already +// inserted into sa, sorted by LMS-substring suffix, and at the +// left end of the corresponding character bucket. +// Each LMS-substring has the form (as a regexp) /S+L+S/: +// one or more S-type, one or more L-type, final S-type. +// induceSubS_8_32 leaves behind only the leftmost S-type text +// index for each LMS-substring, in sorted order, at the right end of sa. +// That is, it removes the L-type indexes that are present on entry, +// and it inserts but then removes the interior S-type indexes too, +// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:]. +// (Only the LMS-substring start indexes are processed by the recursion.) +func induceSubS_8_32(text []byte, sa, freq, bucket []int32) { + // Initialize positions for right side of character buckets. 
+ bucketMax_8_32(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // Analogous to induceSubL_8_32 above, + // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. + // Because j-1 is type S, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type S from type L. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type L. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type L, at which point it must stop. + // That index (preceded by one of type L) is an LMS-substring start. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, + // so that the loop finishes with the top of sa containing exactly + // the LMS-substring start indexes, sorted by LMS-substring. + + // Cache recently used bucket index: + cB := byte(0) + b := bucket[cB] + + top := len(sa) + for i := len(sa) - 1; i >= 0; i-- { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + sa[i] = 0 + if j < 0 { + // Leave discovered LMS-substring start index for caller. + top-- + sa[top] = int32(-j) + continue + } + + // Index j was on work queue, meaning k := j-1 is S-type, + // so we can now place k correctly into sa. + // If k-1 is S-type, queue k for processing later in this loop. + // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. + k := j - 1 + c1 := text[k] + c0 := text[k-1] + if c0 > c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + b-- + sa[b] = int32(k) + } +} + +// length_8_32 computes and records the length of each LMS-substring in text. +// The length of the LMS-substring at index j is stored at sa[j/2], +// avoiding the LMS-substring indexes already stored in the top half of sa. +// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.) +// There are two exceptions, made for optimizations in name_8_32 below. +// +// First, the final LMS-substring is recorded as having length 0, which is otherwise +// impossible, instead of giving it a length that includes the implicit sentinel. +// This ensures the final LMS-substring has length unequal to all others +// and therefore can be detected as different without text comparison +// (it is unequal because it is the only one that ends in the implicit sentinel, +// and the text comparison would be problematic since the implicit sentinel +// is not actually present at text[len(text)]). +// +// Second, to avoid text comparison entirely, if an LMS-substring is very short, +// sa[j/2] records its actual text instead of its length, so that if two such +// substrings have matching “length,” the text need not be read at all. +// The definition of “very short” is that the text bytes must pack into an uint32, +// and the unsigned encoding e must be ≥ len(text), so that it can be +// distinguished from a valid length. 
+func length_8_32(text []byte, sa []int32, numLMS int) { + end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) + + // The encoding of N text bytes into a “length” word + // adds 1 to each byte, packs them into the bottom + // N*8 bits of a word, and then bitwise inverts the result. + // That is, the text sequence A B C (hex 41 42 43) + // encodes as ^uint32(0x42_43_44). + // LMS-substrings can never start or end with 0xFF. + // Adding 1 ensures the encoded byte sequence never + // starts or ends with 0x00, so that present bytes can be + // distinguished from zero-padding in the top bits, + // so the length need not be separately encoded. + // Inverting the bytes increases the chance that a + // 4-byte encoding will still be ≥ len(text). + // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) + // then the high bit of the inversion will be set, + // making it clearly not a valid length (it would be a negative one). + // + // cx holds the pre-inverted encoding (the packed incremented bytes). + cx := uint32(0) // byte-only + + // This stanza (until the blank line) is the "LMS-substring iterator", + // described in placeLMS_8_32 above, with one line added to maintain cx. + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + cx = cx<<8 | uint32(c1+1) // byte-only + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Index j = i+1 is the start of an LMS-substring. + // Compute length or encoded text to store in sa[j/2]. + j := i + 1 + var code int32 + if end == 0 { + code = 0 + } else { + code = int32(end - j) + if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only + code = int32(^cx) // byte-only + } // byte-only + } + sa[j>>1] = code + end = j + 1 + cx = uint32(c1 + 1) // byte-only + } + } +} + +// assignID_8_32 assigns a dense ID numbering to the +// set of LMS-substrings respecting string ordering and equality, +// returning the maximum assigned ID. +// For example given the input "ababab", the LMS-substrings +// are "aba", "aba", and "ab", renumbered as 2 2 1. +// sa[len(sa)-numLMS:] holds the LMS-substring indexes +// sorted in string order, so to assign numbers we can +// consider each in turn, removing adjacent duplicates. +// The new ID for the LMS-substring at index j is written to sa[j/2], +// overwriting the length previously stored there (by length_8_32 above). +func assignID_8_32(text []byte, sa []int32, numLMS int) int { + id := 0 + lastLen := int32(-1) // impossible + lastPos := int32(0) + for _, j := range sa[len(sa)-numLMS:] { + // Is the LMS-substring at index j new, or is it the same as the last one we saw? + n := sa[j/2] + if n != lastLen { + goto New + } + if uint32(n) >= uint32(len(text)) { + // “Length” is really encoded full text, and they match. + goto Same + } + { + // Compare actual texts. + n := int(n) + this := text[j:][:n] + last := text[lastPos:][:n] + for i := 0; i < n; i++ { + if this[i] != last[i] { + goto New + } + } + goto Same + } + New: + id++ + lastPos = j + lastLen = n + Same: + sa[j/2] = int32(id) + } + return id +} + +// map_32 maps the LMS-substrings in text to their new IDs, +// producing the subproblem for the recursion. +// The mapping itself was mostly applied by assignID_8_32: +// sa[i] is either 0, the ID for the LMS-substring at index 2*i, +// or the ID for the LMS-substring at index 2*i+1. 
+// To produce the subproblem we need only remove the zeros +// and change ID into ID-1 (our IDs start at 1, but text chars start at 0). +// +// map_32 packs the result, which is the input to the recursion, +// into the top of sa, so that the recursion result can be stored +// in the bottom of sa, which sets up for expand_8_32 well. +func map_32(sa []int32, numLMS int) { + w := len(sa) + for i := len(sa) / 2; i >= 0; i-- { + j := sa[i] + if j > 0 { + w-- + sa[w] = j - 1 + } + } +} + +// recurse_32 calls sais_32 recursively to solve the subproblem we've built. +// The subproblem is at the right end of sa, the suffix array result will be +// written at the left end of sa, and the middle of sa is available for use as +// temporary frequency and bucket storage. +func recurse_32(sa, oldTmp []int32, numLMS, maxID int) { + dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:] + + // Set up temporary space for recursive call. + // We must pass sais_32 a tmp buffer wiith at least maxID entries. + // + // The subproblem is guaranteed to have length at most len(sa)/2, + // so that sa can hold both the subproblem and its suffix array. + // Nearly all the time, however, the subproblem has length < len(sa)/3, + // in which case there is a subproblem-sized middle of sa that + // we can reuse for temporary space (saTmp). + // When recurse_32 is called from sais_8_32, oldTmp is length 512 + // (from text_32), and saTmp will typically be much larger, so we'll use saTmp. + // When deeper recursions come back to recurse_32, now oldTmp is + // the saTmp from the top-most recursion, it is typically larger than + // the current saTmp (because the current sa gets smaller and smaller + // as the recursion gets deeper), and we keep reusing that top-most + // large saTmp instead of the offered smaller ones. + // + // Why is the subproblem length so often just under len(sa)/3? + // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation. + // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern + // in the input, perfect alternation of larger and smaller input bytes. + // Real text doesn't do that. If each L-type index is randomly followed + // by either an L-type or S-type index, then half the substrings will + // be of the form SLS, but the other half will be longer. Of that half, + // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on. + // Not counting the final S in each (which overlaps the first S in the next), + // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3. + // The space we need is further reduced by the fact that many of the + // short patterns like SLS will often be the same character sequences + // repeated throughout the text, reducing maxID relative to numLMS. + // + // For short inputs, the averages may not run in our favor, but then we + // can often fall back to using the length-512 tmp available in the + // top-most call. (Also a short allocation would not be a big deal.) + // + // For pathological inputs, we fall back to allocating a new tmp of length + // max(maxID, numLMS/2). This level of the recursion needs maxID, + // and all deeper levels of the recursion will need no more than numLMS/2, + // so this one allocation is guaranteed to suffice for the entire stack + // of recursive calls. + tmp := oldTmp + if len(tmp) < len(saTmp) { + tmp = saTmp + } + if len(tmp) < numLMS { + // TestSAIS/forcealloc reaches this code. 
+ n := maxID + if n < numLMS/2 { + n = numLMS / 2 + } + tmp = make([]int32, n) + } + + // sais_32 requires that the caller arrange to clear dst, + // because in general the caller may know dst is + // freshly-allocated and already cleared. But this one is not. + for i := range dst { + dst[i] = 0 + } + sais_32(text, maxID, dst, tmp) +} + +// unmap_8_32 unmaps the subproblem back to the original. +// sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore. +// sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers. +// The key part is that if the list says K that means the K'th substring. +// We can replace sa[:numLMS] with the indexes of the LMS-substrings. +// Then if the list says K it really means sa[K]. +// Having mapped the list back to LMS-substring indexes, +// we can place those into the right buckets. +func unmap_8_32(text []byte, sa []int32, numLMS int) { + unmap := sa[len(sa)-numLMS:] + j := len(unmap) + + // "LMS-substring iterator" (see placeLMS_8_32 above). + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Populate inverse map. + j-- + unmap[j] = int32(i + 1) + } + } + + // Apply inverse map to subproblem suffix array. + sa = sa[:numLMS] + for i := 0; i < len(sa); i++ { + sa[i] = unmap[sa[i]] + } +} + +// expand_8_32 distributes the compacted, sorted LMS-suffix indexes +// from sa[:numLMS] into the tops of the appropriate buckets in sa, +// preserving the sorted order and making room for the L-type indexes +// to be slotted into the sorted sequence by induceL_8_32. +func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) { + bucketMax_8_32(text, freq, bucket) + bucket = bucket[:256] // eliminate bound check for bucket[c] below + + // Loop backward through sa, always tracking + // the next index to populate from sa[:numLMS]. + // When we get to one, populate it. + // Zero the rest of the slots; they have dead values in them. + x := numLMS - 1 + saX := sa[x] + c := text[saX] + b := bucket[c] - 1 + bucket[c] = b + + for i := len(sa) - 1; i >= 0; i-- { + if i != int(b) { + sa[i] = 0 + continue + } + sa[i] = saX + + // Load next entry to put down (if any). + if x > 0 { + x-- + saX = sa[x] // TODO bounds check + c = text[saX] + b = bucket[c] - 1 + bucket[c] = b + } + } +} + +// induceL_8_32 inserts L-type text indexes into sa, +// assuming that the leftmost S-type indexes are inserted +// into sa, in sorted order, in the right bucket halves. +// It leaves all the L-type indexes in sa, but the +// leftmost L-type indexes are negated, to mark them +// for processing by induceS_8_32. +func induceL_8_32(text []byte, sa, freq, bucket []int32) { + // Initialize positions for left side of character buckets. + bucketMin_8_32(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // This scan is similar to the one in induceSubL_8_32 above. + // That one arranges to clear all but the leftmost L-type indexes. + // This scan leaves all the L-type indexes and the original S-type + // indexes, but it negates the positive leftmost L-type indexes + // (the ones that induceS_8_32 needs to process). + + // expand_8_32 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. 
+ k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index. + cB := c1 + b := bucket[cB] + sa[b] = int32(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j <= 0 { + // Skip empty or negated entry (including negated zero). + continue + } + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + // If k is zero, k-1 doesn't exist, so we only need to leave it + // for the caller. The caller can't tell the difference between + // an empty slot and a non-empty zero, but there's no need + // to distinguish them anyway: the final suffix array will end up + // with one zero somewhere, and that will be a real zero. + k := j - 1 + c1 := text[k] + if k > 0 { + if c0 := text[k-1]; c0 < c1 { + k = -k + } + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int32(k) + b++ + } +} + +func induceS_8_32(text []byte, sa, freq, bucket []int32) { + // Initialize positions for right side of character buckets. + bucketMax_8_32(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + cB := byte(0) + b := bucket[cB] + + for i := len(sa) - 1; i >= 0; i-- { + j := int(sa[i]) + if j >= 0 { + // Skip non-flagged entry. + // (This loop can't see an empty entry; 0 means the real zero index.) + continue + } + + // Negative j is a work queue entry; rewrite to positive j for final suffix array. + j = -j + sa[i] = int32(j) + + // Index j was on work queue (encoded as -j but now decoded), + // meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is S-type, queue -k for processing later in this loop. + // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. + // If k is zero, k-1 doesn't exist, so we only need to leave it + // for the caller. + k := j - 1 + c1 := text[k] + if k > 0 { + if c0 := text[k-1]; c0 <= c1 { + k = -k + } + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + b-- + sa[b] = int32(k) + } +} diff --git a/src/index/suffixarray/sais2.go b/src/index/suffixarray/sais2.go new file mode 100644 index 0000000..f124702 --- /dev/null +++ b/src/index/suffixarray/sais2.go @@ -0,0 +1,1741 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code generated by go generate; DO NOT EDIT. + +package suffixarray + +func text_64(text []byte, sa []int64) { + if int(int64(len(text))) != len(text) || len(text) != len(sa) { + panic("suffixarray: misuse of text_64") + } + sais_8_64(text, 256, sa, make([]int64, 2*256)) +} + +func sais_8_64(text []byte, textMax int, sa, tmp []int64) { + if len(sa) != len(text) || len(tmp) < int(textMax) { + panic("suffixarray: misuse of sais_8_64") + } + + // Trivial base cases. Sorting 0 or 1 things is easy. + if len(text) == 0 { + return + } + if len(text) == 1 { + sa[0] = 0 + return + } + + // Establish slices indexed by text character + // holding character frequency and bucket-sort offsets. + // If there's only enough tmp for one slice, + // we make it the bucket offsets and recompute + // the character frequency each time we need it. 
+ var freq, bucket []int64 + if len(tmp) >= 2*textMax { + freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] + freq[0] = -1 // mark as uninitialized + } else { + freq, bucket = nil, tmp[:textMax] + } + + // The SAIS algorithm. + // Each of these calls makes one scan through sa. + // See the individual functions for documentation + // about each's role in the algorithm. + numLMS := placeLMS_8_64(text, sa, freq, bucket) + if numLMS <= 1 { + // 0 or 1 items are already sorted. Do nothing. + } else { + induceSubL_8_64(text, sa, freq, bucket) + induceSubS_8_64(text, sa, freq, bucket) + length_8_64(text, sa, numLMS) + maxID := assignID_8_64(text, sa, numLMS) + if maxID < numLMS { + map_64(sa, numLMS) + recurse_64(sa, tmp, numLMS, maxID) + unmap_8_64(text, sa, numLMS) + } else { + // If maxID == numLMS, then each LMS-substring + // is unique, so the relative ordering of two LMS-suffixes + // is determined by just the leading LMS-substring. + // That is, the LMS-suffix sort order matches the + // (simpler) LMS-substring sort order. + // Copy the original LMS-substring order into the + // suffix array destination. + copy(sa, sa[len(sa)-numLMS:]) + } + expand_8_64(text, freq, bucket, sa, numLMS) + } + induceL_8_64(text, sa, freq, bucket) + induceS_8_64(text, sa, freq, bucket) + + // Mark for caller that we overwrote tmp. + tmp[0] = -1 +} + +func sais_32(text []int32, textMax int, sa, tmp []int32) { + if len(sa) != len(text) || len(tmp) < int(textMax) { + panic("suffixarray: misuse of sais_32") + } + + // Trivial base cases. Sorting 0 or 1 things is easy. + if len(text) == 0 { + return + } + if len(text) == 1 { + sa[0] = 0 + return + } + + // Establish slices indexed by text character + // holding character frequency and bucket-sort offsets. + // If there's only enough tmp for one slice, + // we make it the bucket offsets and recompute + // the character frequency each time we need it. + var freq, bucket []int32 + if len(tmp) >= 2*textMax { + freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] + freq[0] = -1 // mark as uninitialized + } else { + freq, bucket = nil, tmp[:textMax] + } + + // The SAIS algorithm. + // Each of these calls makes one scan through sa. + // See the individual functions for documentation + // about each's role in the algorithm. + numLMS := placeLMS_32(text, sa, freq, bucket) + if numLMS <= 1 { + // 0 or 1 items are already sorted. Do nothing. + } else { + induceSubL_32(text, sa, freq, bucket) + induceSubS_32(text, sa, freq, bucket) + length_32(text, sa, numLMS) + maxID := assignID_32(text, sa, numLMS) + if maxID < numLMS { + map_32(sa, numLMS) + recurse_32(sa, tmp, numLMS, maxID) + unmap_32(text, sa, numLMS) + } else { + // If maxID == numLMS, then each LMS-substring + // is unique, so the relative ordering of two LMS-suffixes + // is determined by just the leading LMS-substring. + // That is, the LMS-suffix sort order matches the + // (simpler) LMS-substring sort order. + // Copy the original LMS-substring order into the + // suffix array destination. + copy(sa, sa[len(sa)-numLMS:]) + } + expand_32(text, freq, bucket, sa, numLMS) + } + induceL_32(text, sa, freq, bucket) + induceS_32(text, sa, freq, bucket) + + // Mark for caller that we overwrote tmp. + tmp[0] = -1 +} + +func sais_64(text []int64, textMax int, sa, tmp []int64) { + if len(sa) != len(text) || len(tmp) < int(textMax) { + panic("suffixarray: misuse of sais_64") + } + + // Trivial base cases. Sorting 0 or 1 things is easy. 
+ if len(text) == 0 { + return + } + if len(text) == 1 { + sa[0] = 0 + return + } + + // Establish slices indexed by text character + // holding character frequency and bucket-sort offsets. + // If there's only enough tmp for one slice, + // we make it the bucket offsets and recompute + // the character frequency each time we need it. + var freq, bucket []int64 + if len(tmp) >= 2*textMax { + freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] + freq[0] = -1 // mark as uninitialized + } else { + freq, bucket = nil, tmp[:textMax] + } + + // The SAIS algorithm. + // Each of these calls makes one scan through sa. + // See the individual functions for documentation + // about each's role in the algorithm. + numLMS := placeLMS_64(text, sa, freq, bucket) + if numLMS <= 1 { + // 0 or 1 items are already sorted. Do nothing. + } else { + induceSubL_64(text, sa, freq, bucket) + induceSubS_64(text, sa, freq, bucket) + length_64(text, sa, numLMS) + maxID := assignID_64(text, sa, numLMS) + if maxID < numLMS { + map_64(sa, numLMS) + recurse_64(sa, tmp, numLMS, maxID) + unmap_64(text, sa, numLMS) + } else { + // If maxID == numLMS, then each LMS-substring + // is unique, so the relative ordering of two LMS-suffixes + // is determined by just the leading LMS-substring. + // That is, the LMS-suffix sort order matches the + // (simpler) LMS-substring sort order. + // Copy the original LMS-substring order into the + // suffix array destination. + copy(sa, sa[len(sa)-numLMS:]) + } + expand_64(text, freq, bucket, sa, numLMS) + } + induceL_64(text, sa, freq, bucket) + induceS_64(text, sa, freq, bucket) + + // Mark for caller that we overwrote tmp. + tmp[0] = -1 +} + +func freq_8_64(text []byte, freq, bucket []int64) []int64 { + if freq != nil && freq[0] >= 0 { + return freq // already computed + } + if freq == nil { + freq = bucket + } + + freq = freq[:256] // eliminate bounds check for freq[c] below + for i := range freq { + freq[i] = 0 + } + for _, c := range text { + freq[c]++ + } + return freq +} + +func freq_32(text []int32, freq, bucket []int32) []int32 { + if freq != nil && freq[0] >= 0 { + return freq // already computed + } + if freq == nil { + freq = bucket + } + + for i := range freq { + freq[i] = 0 + } + for _, c := range text { + freq[c]++ + } + return freq +} + +func freq_64(text []int64, freq, bucket []int64) []int64 { + if freq != nil && freq[0] >= 0 { + return freq // already computed + } + if freq == nil { + freq = bucket + } + + for i := range freq { + freq[i] = 0 + } + for _, c := range text { + freq[c]++ + } + return freq +} + +func bucketMin_8_64(text []byte, freq, bucket []int64) { + freq = freq_8_64(text, freq, bucket) + freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below + bucket = bucket[:256] // eliminate bounds check for bucket[i] below + total := int64(0) + for i, n := range freq { + bucket[i] = total + total += n + } +} + +func bucketMin_32(text []int32, freq, bucket []int32) { + freq = freq_32(text, freq, bucket) + total := int32(0) + for i, n := range freq { + bucket[i] = total + total += n + } +} + +func bucketMin_64(text []int64, freq, bucket []int64) { + freq = freq_64(text, freq, bucket) + total := int64(0) + for i, n := range freq { + bucket[i] = total + total += n + } +} + +func bucketMax_8_64(text []byte, freq, bucket []int64) { + freq = freq_8_64(text, freq, bucket) + freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below + bucket = bucket[:256] // eliminate bounds check for bucket[i] below + total := int64(0) + for i, n := range freq { + 
total += n + bucket[i] = total + } +} + +func bucketMax_32(text []int32, freq, bucket []int32) { + freq = freq_32(text, freq, bucket) + total := int32(0) + for i, n := range freq { + total += n + bucket[i] = total + } +} + +func bucketMax_64(text []int64, freq, bucket []int64) { + freq = freq_64(text, freq, bucket) + total := int64(0) + for i, n := range freq { + total += n + bucket[i] = total + } +} + +func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int { + bucketMax_8_64(text, freq, bucket) + + numLMS := 0 + lastB := int64(-1) + bucket = bucket[:256] // eliminate bounds check for bucket[c1] below + + // The next stanza of code (until the blank line) loop backward + // over text, stopping to execute a code body at each position i + // such that text[i] is an L-character and text[i+1] is an S-character. + // That is, i+1 is the position of the start of an LMS-substring. + // These could be hoisted out into a function with a callback, + // but at a significant speed cost. Instead, we just write these + // seven lines a few times in this source file. The copies below + // refer back to the pattern established by this original as the + // "LMS-substring iterator". + // + // In every scan through the text, c0, c1 are successive characters of text. + // In this backward scan, c0 == text[i] and c1 == text[i+1]. + // By scanning backward, we can keep track of whether the current + // position is type-S or type-L according to the usual definition: + // + // - position len(text) is type S with text[len(text)] == -1 (the sentinel) + // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. + // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. + // + // The backward scan lets us maintain the current type, + // update it when we see c0 != c1, and otherwise leave it alone. + // We want to identify all S positions with a preceding L. + // Position len(text) is one such position by definition, but we have + // nowhere to write it down, so we eliminate it by untruthfully + // setting isTypeS = false at the start of the loop. + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Bucket the index i+1 for the start of an LMS-substring. + b := bucket[c1] - 1 + bucket[c1] = b + sa[b] = int64(i + 1) + lastB = b + numLMS++ + } + } + + // We recorded the LMS-substring starts but really want the ends. + // Luckily, with two differences, the start indexes and the end indexes are the same. + // The first difference is that the rightmost LMS-substring's end index is len(text), + // so the caller must pretend that sa[-1] == len(text), as noted above. + // The second difference is that the first leftmost LMS-substring start index + // does not end an earlier LMS-substring, so as an optimization we can omit + // that leftmost LMS-substring start index (the last one we wrote). + // + // Exception: if numLMS <= 1, the caller is not going to bother with + // the recursion at all and will treat the result as containing LMS-substring starts. + // In that case, we don't remove the final entry. 
+ if numLMS > 1 { + sa[lastB] = 0 + } + return numLMS +} + +func placeLMS_32(text []int32, sa, freq, bucket []int32) int { + bucketMax_32(text, freq, bucket) + + numLMS := 0 + lastB := int32(-1) + + // The next stanza of code (until the blank line) loop backward + // over text, stopping to execute a code body at each position i + // such that text[i] is an L-character and text[i+1] is an S-character. + // That is, i+1 is the position of the start of an LMS-substring. + // These could be hoisted out into a function with a callback, + // but at a significant speed cost. Instead, we just write these + // seven lines a few times in this source file. The copies below + // refer back to the pattern established by this original as the + // "LMS-substring iterator". + // + // In every scan through the text, c0, c1 are successive characters of text. + // In this backward scan, c0 == text[i] and c1 == text[i+1]. + // By scanning backward, we can keep track of whether the current + // position is type-S or type-L according to the usual definition: + // + // - position len(text) is type S with text[len(text)] == -1 (the sentinel) + // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. + // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. + // + // The backward scan lets us maintain the current type, + // update it when we see c0 != c1, and otherwise leave it alone. + // We want to identify all S positions with a preceding L. + // Position len(text) is one such position by definition, but we have + // nowhere to write it down, so we eliminate it by untruthfully + // setting isTypeS = false at the start of the loop. + c0, c1, isTypeS := int32(0), int32(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Bucket the index i+1 for the start of an LMS-substring. + b := bucket[c1] - 1 + bucket[c1] = b + sa[b] = int32(i + 1) + lastB = b + numLMS++ + } + } + + // We recorded the LMS-substring starts but really want the ends. + // Luckily, with two differences, the start indexes and the end indexes are the same. + // The first difference is that the rightmost LMS-substring's end index is len(text), + // so the caller must pretend that sa[-1] == len(text), as noted above. + // The second difference is that the first leftmost LMS-substring start index + // does not end an earlier LMS-substring, so as an optimization we can omit + // that leftmost LMS-substring start index (the last one we wrote). + // + // Exception: if numLMS <= 1, the caller is not going to bother with + // the recursion at all and will treat the result as containing LMS-substring starts. + // In that case, we don't remove the final entry. + if numLMS > 1 { + sa[lastB] = 0 + } + return numLMS +} + +func placeLMS_64(text []int64, sa, freq, bucket []int64) int { + bucketMax_64(text, freq, bucket) + + numLMS := 0 + lastB := int64(-1) + + // The next stanza of code (until the blank line) loop backward + // over text, stopping to execute a code body at each position i + // such that text[i] is an L-character and text[i+1] is an S-character. + // That is, i+1 is the position of the start of an LMS-substring. + // These could be hoisted out into a function with a callback, + // but at a significant speed cost. Instead, we just write these + // seven lines a few times in this source file. 
The copies below + // refer back to the pattern established by this original as the + // "LMS-substring iterator". + // + // In every scan through the text, c0, c1 are successive characters of text. + // In this backward scan, c0 == text[i] and c1 == text[i+1]. + // By scanning backward, we can keep track of whether the current + // position is type-S or type-L according to the usual definition: + // + // - position len(text) is type S with text[len(text)] == -1 (the sentinel) + // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. + // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. + // + // The backward scan lets us maintain the current type, + // update it when we see c0 != c1, and otherwise leave it alone. + // We want to identify all S positions with a preceding L. + // Position len(text) is one such position by definition, but we have + // nowhere to write it down, so we eliminate it by untruthfully + // setting isTypeS = false at the start of the loop. + c0, c1, isTypeS := int64(0), int64(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Bucket the index i+1 for the start of an LMS-substring. + b := bucket[c1] - 1 + bucket[c1] = b + sa[b] = int64(i + 1) + lastB = b + numLMS++ + } + } + + // We recorded the LMS-substring starts but really want the ends. + // Luckily, with two differences, the start indexes and the end indexes are the same. + // The first difference is that the rightmost LMS-substring's end index is len(text), + // so the caller must pretend that sa[-1] == len(text), as noted above. + // The second difference is that the first leftmost LMS-substring start index + // does not end an earlier LMS-substring, so as an optimization we can omit + // that leftmost LMS-substring start index (the last one we wrote). + // + // Exception: if numLMS <= 1, the caller is not going to bother with + // the recursion at all and will treat the result as containing LMS-substring starts. + // In that case, we don't remove the final entry. + if numLMS > 1 { + sa[lastB] = 0 + } + return numLMS +} + +func induceSubL_8_64(text []byte, sa, freq, bucket []int64) { + // Initialize positions for left side of character buckets. + bucketMin_8_64(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. + // Because j-1 is type L, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type L from type S. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type S. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type S, at which point it must stop. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing + // only the indexes of the leftmost L-type indexes for each LMS-substring. 
+ // + // The suffix array sa therefore serves simultaneously as input, output, + // and a miraculously well-tailored work queue. + + // placeLMS_8_64 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index: + // we're processing suffixes in sorted order + // and accessing buckets indexed by the + // byte before the sorted order, which still + // has very good locality. + // Invariant: b is cached, possibly dirty copy of bucket[cB]. + cB := c1 + b := bucket[cB] + sa[b] = int64(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + if j < 0 { + // Leave discovered type-S index for caller. + sa[i] = int64(-j) + continue + } + sa[i] = 0 + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + k := j - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int64(k) + b++ + } +} + +func induceSubL_32(text []int32, sa, freq, bucket []int32) { + // Initialize positions for left side of character buckets. + bucketMin_32(text, freq, bucket) + + // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. + // Because j-1 is type L, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type L from type S. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type S. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type S, at which point it must stop. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing + // only the indexes of the leftmost L-type indexes for each LMS-substring. + // + // The suffix array sa therefore serves simultaneously as input, output, + // and a miraculously well-tailored work queue. + + // placeLMS_32 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index: + // we're processing suffixes in sorted order + // and accessing buckets indexed by the + // int32 before the sorted order, which still + // has very good locality. + // Invariant: b is cached, possibly dirty copy of bucket[cB]. + cB := c1 + b := bucket[cB] + sa[b] = int32(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. 
+ continue + } + if j < 0 { + // Leave discovered type-S index for caller. + sa[i] = int32(-j) + continue + } + sa[i] = 0 + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + k := j - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int32(k) + b++ + } +} + +func induceSubL_64(text []int64, sa, freq, bucket []int64) { + // Initialize positions for left side of character buckets. + bucketMin_64(text, freq, bucket) + + // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. + // Because j-1 is type L, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type L from type S. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type S. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type S, at which point it must stop. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing + // only the indexes of the leftmost L-type indexes for each LMS-substring. + // + // The suffix array sa therefore serves simultaneously as input, output, + // and a miraculously well-tailored work queue. + + // placeLMS_64 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index: + // we're processing suffixes in sorted order + // and accessing buckets indexed by the + // int64 before the sorted order, which still + // has very good locality. + // Invariant: b is cached, possibly dirty copy of bucket[cB]. + cB := c1 + b := bucket[cB] + sa[b] = int64(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + if j < 0 { + // Leave discovered type-S index for caller. + sa[i] = int64(-j) + continue + } + sa[i] = 0 + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + k := j - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int64(k) + b++ + } +} + +func induceSubS_8_64(text []byte, sa, freq, bucket []int64) { + // Initialize positions for right side of character buckets. 
+ bucketMax_8_64(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // Analogous to induceSubL_8_64 above, + // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. + // Because j-1 is type S, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type S from type L. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type L. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type L, at which point it must stop. + // That index (preceded by one of type L) is an LMS-substring start. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, + // so that the loop finishes with the top of sa containing exactly + // the LMS-substring start indexes, sorted by LMS-substring. + + // Cache recently used bucket index: + cB := byte(0) + b := bucket[cB] + + top := len(sa) + for i := len(sa) - 1; i >= 0; i-- { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + sa[i] = 0 + if j < 0 { + // Leave discovered LMS-substring start index for caller. + top-- + sa[top] = int64(-j) + continue + } + + // Index j was on work queue, meaning k := j-1 is S-type, + // so we can now place k correctly into sa. + // If k-1 is S-type, queue k for processing later in this loop. + // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. + k := j - 1 + c1 := text[k] + c0 := text[k-1] + if c0 > c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + b-- + sa[b] = int64(k) + } +} + +func induceSubS_32(text []int32, sa, freq, bucket []int32) { + // Initialize positions for right side of character buckets. + bucketMax_32(text, freq, bucket) + + // Analogous to induceSubL_32 above, + // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. + // Because j-1 is type S, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type S from type L. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type L. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type L, at which point it must stop. + // That index (preceded by one of type L) is an LMS-substring start. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, + // so that the loop finishes with the top of sa containing exactly + // the LMS-substring start indexes, sorted by LMS-substring. 
+ + // Cache recently used bucket index: + cB := int32(0) + b := bucket[cB] + + top := len(sa) + for i := len(sa) - 1; i >= 0; i-- { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + sa[i] = 0 + if j < 0 { + // Leave discovered LMS-substring start index for caller. + top-- + sa[top] = int32(-j) + continue + } + + // Index j was on work queue, meaning k := j-1 is S-type, + // so we can now place k correctly into sa. + // If k-1 is S-type, queue k for processing later in this loop. + // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. + k := j - 1 + c1 := text[k] + c0 := text[k-1] + if c0 > c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + b-- + sa[b] = int32(k) + } +} + +func induceSubS_64(text []int64, sa, freq, bucket []int64) { + // Initialize positions for right side of character buckets. + bucketMax_64(text, freq, bucket) + + // Analogous to induceSubL_64 above, + // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly + // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. + // Because j-1 is type S, inserting it into sa now will sort it correctly. + // But we want to distinguish a j-1 with j-2 of type S from type L. + // We can process the former but want to leave the latter for the caller. + // We record the difference by negating j-1 if it is preceded by type L. + // Either way, the insertion (into the text[j-1] bucket) is guaranteed to + // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have + // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, + // and so on, in sorted but not necessarily adjacent order, until it finds + // one preceded by an index of type L, at which point it must stop. + // That index (preceded by one of type L) is an LMS-substring start. + // + // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, + // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, + // so that the loop finishes with the top of sa containing exactly + // the LMS-substring start indexes, sorted by LMS-substring. + + // Cache recently used bucket index: + cB := int64(0) + b := bucket[cB] + + top := len(sa) + for i := len(sa) - 1; i >= 0; i-- { + j := int(sa[i]) + if j == 0 { + // Skip empty entry. + continue + } + sa[i] = 0 + if j < 0 { + // Leave discovered LMS-substring start index for caller. + top-- + sa[top] = int64(-j) + continue + } + + // Index j was on work queue, meaning k := j-1 is S-type, + // so we can now place k correctly into sa. + // If k-1 is S-type, queue k for processing later in this loop. + // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. + k := j - 1 + c1 := text[k] + c0 := text[k-1] + if c0 > c1 { + k = -k + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + b-- + sa[b] = int64(k) + } +} + +func length_8_64(text []byte, sa []int64, numLMS int) { + end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) + + // The encoding of N text bytes into a “length” word + // adds 1 to each byte, packs them into the bottom + // N*8 bits of a word, and then bitwise inverts the result. + // That is, the text sequence A B C (hex 41 42 43) + // encodes as ^uint64(0x42_43_44). + // LMS-substrings can never start or end with 0xFF. 
+ // Adding 1 ensures the encoded byte sequence never + // starts or ends with 0x00, so that present bytes can be + // distinguished from zero-padding in the top bits, + // so the length need not be separately encoded. + // Inverting the bytes increases the chance that a + // 4-byte encoding will still be ≥ len(text). + // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) + // then the high bit of the inversion will be set, + // making it clearly not a valid length (it would be a negative one). + // + // cx holds the pre-inverted encoding (the packed incremented bytes). + cx := uint64(0) // byte-only + + // This stanza (until the blank line) is the "LMS-substring iterator", + // described in placeLMS_8_64 above, with one line added to maintain cx. + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + cx = cx<<8 | uint64(c1+1) // byte-only + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Index j = i+1 is the start of an LMS-substring. + // Compute length or encoded text to store in sa[j/2]. + j := i + 1 + var code int64 + if end == 0 { + code = 0 + } else { + code = int64(end - j) + if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only + code = int64(^cx) // byte-only + } // byte-only + } + sa[j>>1] = code + end = j + 1 + cx = uint64(c1 + 1) // byte-only + } + } +} + +func length_32(text []int32, sa []int32, numLMS int) { + end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) + + // The encoding of N text int32s into a “length” word + // adds 1 to each int32, packs them into the bottom + // N*8 bits of a word, and then bitwise inverts the result. + // That is, the text sequence A B C (hex 41 42 43) + // encodes as ^uint32(0x42_43_44). + // LMS-substrings can never start or end with 0xFF. + // Adding 1 ensures the encoded int32 sequence never + // starts or ends with 0x00, so that present int32s can be + // distinguished from zero-padding in the top bits, + // so the length need not be separately encoded. + // Inverting the int32s increases the chance that a + // 4-int32 encoding will still be ≥ len(text). + // In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F) + // then the high bit of the inversion will be set, + // making it clearly not a valid length (it would be a negative one). + // + // cx holds the pre-inverted encoding (the packed incremented int32s). + + // This stanza (until the blank line) is the "LMS-substring iterator", + // described in placeLMS_32 above, with one line added to maintain cx. + c0, c1, isTypeS := int32(0), int32(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Index j = i+1 is the start of an LMS-substring. + // Compute length or encoded text to store in sa[j/2]. + j := i + 1 + var code int32 + if end == 0 { + code = 0 + } else { + code = int32(end - j) + } + sa[j>>1] = code + end = j + 1 + } + } +} + +func length_64(text []int64, sa []int64, numLMS int) { + end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) + + // The encoding of N text int64s into a “length” word + // adds 1 to each int64, packs them into the bottom + // N*8 bits of a word, and then bitwise inverts the result. + // That is, the text sequence A B C (hex 41 42 43) + // encodes as ^uint64(0x42_43_44). + // LMS-substrings can never start or end with 0xFF. 
+ // Adding 1 ensures the encoded int64 sequence never + // starts or ends with 0x00, so that present int64s can be + // distinguished from zero-padding in the top bits, + // so the length need not be separately encoded. + // Inverting the int64s increases the chance that a + // 4-int64 encoding will still be ≥ len(text). + // In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F) + // then the high bit of the inversion will be set, + // making it clearly not a valid length (it would be a negative one). + // + // cx holds the pre-inverted encoding (the packed incremented int64s). + + // This stanza (until the blank line) is the "LMS-substring iterator", + // described in placeLMS_64 above, with one line added to maintain cx. + c0, c1, isTypeS := int64(0), int64(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Index j = i+1 is the start of an LMS-substring. + // Compute length or encoded text to store in sa[j/2]. + j := i + 1 + var code int64 + if end == 0 { + code = 0 + } else { + code = int64(end - j) + } + sa[j>>1] = code + end = j + 1 + } + } +} + +func assignID_8_64(text []byte, sa []int64, numLMS int) int { + id := 0 + lastLen := int64(-1) // impossible + lastPos := int64(0) + for _, j := range sa[len(sa)-numLMS:] { + // Is the LMS-substring at index j new, or is it the same as the last one we saw? + n := sa[j/2] + if n != lastLen { + goto New + } + if uint64(n) >= uint64(len(text)) { + // “Length” is really encoded full text, and they match. + goto Same + } + { + // Compare actual texts. + n := int(n) + this := text[j:][:n] + last := text[lastPos:][:n] + for i := 0; i < n; i++ { + if this[i] != last[i] { + goto New + } + } + goto Same + } + New: + id++ + lastPos = j + lastLen = n + Same: + sa[j/2] = int64(id) + } + return id +} + +func assignID_32(text []int32, sa []int32, numLMS int) int { + id := 0 + lastLen := int32(-1) // impossible + lastPos := int32(0) + for _, j := range sa[len(sa)-numLMS:] { + // Is the LMS-substring at index j new, or is it the same as the last one we saw? + n := sa[j/2] + if n != lastLen { + goto New + } + if uint32(n) >= uint32(len(text)) { + // “Length” is really encoded full text, and they match. + goto Same + } + { + // Compare actual texts. + n := int(n) + this := text[j:][:n] + last := text[lastPos:][:n] + for i := 0; i < n; i++ { + if this[i] != last[i] { + goto New + } + } + goto Same + } + New: + id++ + lastPos = j + lastLen = n + Same: + sa[j/2] = int32(id) + } + return id +} + +func assignID_64(text []int64, sa []int64, numLMS int) int { + id := 0 + lastLen := int64(-1) // impossible + lastPos := int64(0) + for _, j := range sa[len(sa)-numLMS:] { + // Is the LMS-substring at index j new, or is it the same as the last one we saw? + n := sa[j/2] + if n != lastLen { + goto New + } + if uint64(n) >= uint64(len(text)) { + // “Length” is really encoded full text, and they match. + goto Same + } + { + // Compare actual texts. 
+ n := int(n)
+ this := text[j:][:n]
+ last := text[lastPos:][:n]
+ for i := 0; i < n; i++ {
+ if this[i] != last[i] {
+ goto New
+ }
+ }
+ goto Same
+ }
+ New:
+ id++
+ lastPos = j
+ lastLen = n
+ Same:
+ sa[j/2] = int64(id)
+ }
+ return id
+}
+
+func map_64(sa []int64, numLMS int) {
+ w := len(sa)
+ for i := len(sa) / 2; i >= 0; i-- {
+ j := sa[i]
+ if j > 0 {
+ w--
+ sa[w] = j - 1
+ }
+ }
+}
+
+func recurse_64(sa, oldTmp []int64, numLMS, maxID int) {
+ dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
+
+ // Set up temporary space for recursive call.
+ // We must pass sais_64 a tmp buffer with at least maxID entries.
+ //
+ // The subproblem is guaranteed to have length at most len(sa)/2,
+ // so that sa can hold both the subproblem and its suffix array.
+ // Nearly all the time, however, the subproblem has length < len(sa)/3,
+ // in which case there is a subproblem-sized middle of sa that
+ // we can reuse for temporary space (saTmp).
+ // When recurse_64 is called from sais_8_64, oldTmp is length 512
+ // (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
+ // When deeper recursions come back to recurse_64, now oldTmp is
+ // the saTmp from the top-most recursion, it is typically larger than
+ // the current saTmp (because the current sa gets smaller and smaller
+ // as the recursion gets deeper), and we keep reusing that top-most
+ // large saTmp instead of the offered smaller ones.
+ //
+ // Why is the subproblem length so often just under len(sa)/3?
+ // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
+ // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
+ // in the input, perfect alternation of larger and smaller input bytes.
+ // Real text doesn't do that. If each L-type index is randomly followed
+ // by either an L-type or S-type index, then half the substrings will
+ // be of the form SLS, but the other half will be longer. Of that half,
+ // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
+ // Not counting the final S in each (which overlaps the first S in the next),
+ // this works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
+ // The space we need is further reduced by the fact that many of the
+ // short patterns like SLS will often be the same character sequences
+ // repeated throughout the text, reducing maxID relative to numLMS.
+ //
+ // For short inputs, the averages may not run in our favor, but then we
+ // can often fall back to using the length-512 tmp available in the
+ // top-most call. (Also a short allocation would not be a big deal.)
+ //
+ // For pathological inputs, we fall back to allocating a new tmp of length
+ // max(maxID, numLMS/2). This level of the recursion needs maxID,
+ // and all deeper levels of the recursion will need no more than numLMS/2,
+ // so this one allocation is guaranteed to suffice for the entire stack
+ // of recursive calls.
+ tmp := oldTmp
+ if len(tmp) < len(saTmp) {
+ tmp = saTmp
+ }
+ if len(tmp) < numLMS {
+ // TestSAIS/forcealloc reaches this code.
+ n := maxID
+ if n < numLMS/2 {
+ n = numLMS / 2
+ }
+ tmp = make([]int64, n)
+ }
+
+ // sais_64 requires that the caller arrange to clear dst,
+ // because in general the caller may know dst is
+ // freshly-allocated and already cleared. But this one is not.
+ for i := range dst { + dst[i] = 0 + } + sais_64(text, maxID, dst, tmp) +} + +func unmap_8_64(text []byte, sa []int64, numLMS int) { + unmap := sa[len(sa)-numLMS:] + j := len(unmap) + + // "LMS-substring iterator" (see placeLMS_8_64 above). + c0, c1, isTypeS := byte(0), byte(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Populate inverse map. + j-- + unmap[j] = int64(i + 1) + } + } + + // Apply inverse map to subproblem suffix array. + sa = sa[:numLMS] + for i := 0; i < len(sa); i++ { + sa[i] = unmap[sa[i]] + } +} + +func unmap_32(text []int32, sa []int32, numLMS int) { + unmap := sa[len(sa)-numLMS:] + j := len(unmap) + + // "LMS-substring iterator" (see placeLMS_32 above). + c0, c1, isTypeS := int32(0), int32(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Populate inverse map. + j-- + unmap[j] = int32(i + 1) + } + } + + // Apply inverse map to subproblem suffix array. + sa = sa[:numLMS] + for i := 0; i < len(sa); i++ { + sa[i] = unmap[sa[i]] + } +} + +func unmap_64(text []int64, sa []int64, numLMS int) { + unmap := sa[len(sa)-numLMS:] + j := len(unmap) + + // "LMS-substring iterator" (see placeLMS_64 above). + c0, c1, isTypeS := int64(0), int64(0), false + for i := len(text) - 1; i >= 0; i-- { + c0, c1 = text[i], c0 + if c0 < c1 { + isTypeS = true + } else if c0 > c1 && isTypeS { + isTypeS = false + + // Populate inverse map. + j-- + unmap[j] = int64(i + 1) + } + } + + // Apply inverse map to subproblem suffix array. + sa = sa[:numLMS] + for i := 0; i < len(sa); i++ { + sa[i] = unmap[sa[i]] + } +} + +func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) { + bucketMax_8_64(text, freq, bucket) + bucket = bucket[:256] // eliminate bound check for bucket[c] below + + // Loop backward through sa, always tracking + // the next index to populate from sa[:numLMS]. + // When we get to one, populate it. + // Zero the rest of the slots; they have dead values in them. + x := numLMS - 1 + saX := sa[x] + c := text[saX] + b := bucket[c] - 1 + bucket[c] = b + + for i := len(sa) - 1; i >= 0; i-- { + if i != int(b) { + sa[i] = 0 + continue + } + sa[i] = saX + + // Load next entry to put down (if any). + if x > 0 { + x-- + saX = sa[x] // TODO bounds check + c = text[saX] + b = bucket[c] - 1 + bucket[c] = b + } + } +} + +func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) { + bucketMax_32(text, freq, bucket) + + // Loop backward through sa, always tracking + // the next index to populate from sa[:numLMS]. + // When we get to one, populate it. + // Zero the rest of the slots; they have dead values in them. + x := numLMS - 1 + saX := sa[x] + c := text[saX] + b := bucket[c] - 1 + bucket[c] = b + + for i := len(sa) - 1; i >= 0; i-- { + if i != int(b) { + sa[i] = 0 + continue + } + sa[i] = saX + + // Load next entry to put down (if any). + if x > 0 { + x-- + saX = sa[x] // TODO bounds check + c = text[saX] + b = bucket[c] - 1 + bucket[c] = b + } + } +} + +func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) { + bucketMax_64(text, freq, bucket) + + // Loop backward through sa, always tracking + // the next index to populate from sa[:numLMS]. + // When we get to one, populate it. + // Zero the rest of the slots; they have dead values in them. 
+ x := numLMS - 1 + saX := sa[x] + c := text[saX] + b := bucket[c] - 1 + bucket[c] = b + + for i := len(sa) - 1; i >= 0; i-- { + if i != int(b) { + sa[i] = 0 + continue + } + sa[i] = saX + + // Load next entry to put down (if any). + if x > 0 { + x-- + saX = sa[x] // TODO bounds check + c = text[saX] + b = bucket[c] - 1 + bucket[c] = b + } + } +} + +func induceL_8_64(text []byte, sa, freq, bucket []int64) { + // Initialize positions for left side of character buckets. + bucketMin_8_64(text, freq, bucket) + bucket = bucket[:256] // eliminate bounds check for bucket[cB] below + + // This scan is similar to the one in induceSubL_8_64 above. + // That one arranges to clear all but the leftmost L-type indexes. + // This scan leaves all the L-type indexes and the original S-type + // indexes, but it negates the positive leftmost L-type indexes + // (the ones that induceS_8_64 needs to process). + + // expand_8_64 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index. + cB := c1 + b := bucket[cB] + sa[b] = int64(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j <= 0 { + // Skip empty or negated entry (including negated zero). + continue + } + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. + // If k is zero, k-1 doesn't exist, so we only need to leave it + // for the caller. The caller can't tell the difference between + // an empty slot and a non-empty zero, but there's no need + // to distinguish them anyway: the final suffix array will end up + // with one zero somewhere, and that will be a real zero. + k := j - 1 + c1 := text[k] + if k > 0 { + if c0 := text[k-1]; c0 < c1 { + k = -k + } + } + + if cB != c1 { + bucket[cB] = b + cB = c1 + b = bucket[cB] + } + sa[b] = int64(k) + b++ + } +} + +func induceL_32(text []int32, sa, freq, bucket []int32) { + // Initialize positions for left side of character buckets. + bucketMin_32(text, freq, bucket) + + // This scan is similar to the one in induceSubL_32 above. + // That one arranges to clear all but the leftmost L-type indexes. + // This scan leaves all the L-type indexes and the original S-type + // indexes, but it negates the positive leftmost L-type indexes + // (the ones that induceS_32 needs to process). + + // expand_32 left out the implicit entry sa[-1] == len(text), + // corresponding to the identified type-L index len(text)-1. + // Process it before the left-to-right scan of sa proper. + // See body in loop for commentary. + k := len(text) - 1 + c0, c1 := text[k-1], text[k] + if c0 < c1 { + k = -k + } + + // Cache recently used bucket index. + cB := c1 + b := bucket[cB] + sa[b] = int32(k) + b++ + + for i := 0; i < len(sa); i++ { + j := int(sa[i]) + if j <= 0 { + // Skip empty or negated entry (including negated zero). + continue + } + + // Index j was on work queue, meaning k := j-1 is L-type, + // so we can now place k correctly into sa. + // If k-1 is L-type, queue k for processing later in this loop. + // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 
+ // If k is zero, k-1 doesn't exist, so we only need to leave it
+ // for the caller. The caller can't tell the difference between
+ // an empty slot and a non-empty zero, but there's no need
+ // to distinguish them anyway: the final suffix array will end up
+ // with one zero somewhere, and that will be a real zero.
+ k := j - 1
+ c1 := text[k]
+ if k > 0 {
+ if c0 := text[k-1]; c0 < c1 {
+ k = -k
+ }
+ }
+
+ if cB != c1 {
+ bucket[cB] = b
+ cB = c1
+ b = bucket[cB]
+ }
+ sa[b] = int32(k)
+ b++
+ }
+}
+
+func induceL_64(text []int64, sa, freq, bucket []int64) {
+ // Initialize positions for left side of character buckets.
+ bucketMin_64(text, freq, bucket)
+
+ // This scan is similar to the one in induceSubL_64 above.
+ // That one arranges to clear all but the leftmost L-type indexes.
+ // This scan leaves all the L-type indexes and the original S-type
+ // indexes, but it negates the positive leftmost L-type indexes
+ // (the ones that induceS_64 needs to process).
+
+ // expand_64 left out the implicit entry sa[-1] == len(text),
+ // corresponding to the identified type-L index len(text)-1.
+ // Process it before the left-to-right scan of sa proper.
+ // See body in loop for commentary.
+ k := len(text) - 1
+ c0, c1 := text[k-1], text[k]
+ if c0 < c1 {
+ k = -k
+ }
+
+ // Cache recently used bucket index.
+ cB := c1
+ b := bucket[cB]
+ sa[b] = int64(k)
+ b++
+
+ for i := 0; i < len(sa); i++ {
+ j := int(sa[i])
+ if j <= 0 {
+ // Skip empty or negated entry (including negated zero).
+ continue
+ }
+
+ // Index j was on work queue, meaning k := j-1 is L-type,
+ // so we can now place k correctly into sa.
+ // If k-1 is L-type, queue k for processing later in this loop.
+ // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
+ // If k is zero, k-1 doesn't exist, so we only need to leave it
+ // for the caller. The caller can't tell the difference between
+ // an empty slot and a non-empty zero, but there's no need
+ // to distinguish them anyway: the final suffix array will end up
+ // with one zero somewhere, and that will be a real zero.
+ k := j - 1
+ c1 := text[k]
+ if k > 0 {
+ if c0 := text[k-1]; c0 < c1 {
+ k = -k
+ }
+ }
+
+ if cB != c1 {
+ bucket[cB] = b
+ cB = c1
+ b = bucket[cB]
+ }
+ sa[b] = int64(k)
+ b++
+ }
+}
+
+func induceS_8_64(text []byte, sa, freq, bucket []int64) {
+ // Initialize positions for right side of character buckets.
+ bucketMax_8_64(text, freq, bucket)
+ bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
+
+ cB := byte(0)
+ b := bucket[cB]
+
+ for i := len(sa) - 1; i >= 0; i-- {
+ j := int(sa[i])
+ if j >= 0 {
+ // Skip non-flagged entry.
+ // (This loop can't see an empty entry; 0 means the real zero index.)
+ continue
+ }
+
+ // Negative j is a work queue entry; rewrite to positive j for final suffix array.
+ j = -j
+ sa[i] = int64(j)
+
+ // Index j was on work queue (encoded as -j but now decoded),
+ // meaning k := j-1 is S-type,
+ // so we can now place k correctly into sa.
+ // If k-1 is S-type, queue -k for processing later in this loop.
+ // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
+ // If k is zero, k-1 doesn't exist, so we only need to leave it
+ // for the caller.
+ k := j - 1
+ c1 := text[k]
+ if k > 0 {
+ if c0 := text[k-1]; c0 <= c1 {
+ k = -k
+ }
+ }
+
+ if cB != c1 {
+ bucket[cB] = b
+ cB = c1
+ b = bucket[cB]
+ }
+ b--
+ sa[b] = int64(k)
+ }
+}
+
+func induceS_32(text []int32, sa, freq, bucket []int32) {
+ // Initialize positions for right side of character buckets.
+ bucketMax_32(text, freq, bucket)
+
+ cB := int32(0)
+ b := bucket[cB]
+
+ for i := len(sa) - 1; i >= 0; i-- {
+ j := int(sa[i])
+ if j >= 0 {
+ // Skip non-flagged entry.
+ // (This loop can't see an empty entry; 0 means the real zero index.)
+ continue
+ }
+
+ // Negative j is a work queue entry; rewrite to positive j for final suffix array.
+ j = -j
+ sa[i] = int32(j)
+
+ // Index j was on work queue (encoded as -j but now decoded),
+ // meaning k := j-1 is S-type,
+ // so we can now place k correctly into sa.
+ // If k-1 is S-type, queue -k for processing later in this loop.
+ // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
+ // If k is zero, k-1 doesn't exist, so we only need to leave it
+ // for the caller.
+ k := j - 1
+ c1 := text[k]
+ if k > 0 {
+ if c0 := text[k-1]; c0 <= c1 {
+ k = -k
+ }
+ }
+
+ if cB != c1 {
+ bucket[cB] = b
+ cB = c1
+ b = bucket[cB]
+ }
+ b--
+ sa[b] = int32(k)
+ }
+}
+
+func induceS_64(text []int64, sa, freq, bucket []int64) {
+ // Initialize positions for right side of character buckets.
+ bucketMax_64(text, freq, bucket)
+
+ cB := int64(0)
+ b := bucket[cB]
+
+ for i := len(sa) - 1; i >= 0; i-- {
+ j := int(sa[i])
+ if j >= 0 {
+ // Skip non-flagged entry.
+ // (This loop can't see an empty entry; 0 means the real zero index.)
+ continue
+ }
+
+ // Negative j is a work queue entry; rewrite to positive j for final suffix array.
+ j = -j
+ sa[i] = int64(j)
+
+ // Index j was on work queue (encoded as -j but now decoded),
+ // meaning k := j-1 is S-type,
+ // so we can now place k correctly into sa.
+ // If k-1 is S-type, queue -k for processing later in this loop.
+ // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
+ // If k is zero, k-1 doesn't exist, so we only need to leave it
+ // for the caller.
+ k := j - 1
+ c1 := text[k]
+ if k > 0 {
+ if c0 := text[k-1]; c0 <= c1 {
+ k = -k
+ }
+ }
+
+ if cB != c1 {
+ bucket[cB] = b
+ cB = c1
+ b = bucket[cB]
+ }
+ b--
+ sa[b] = int64(k)
+ }
+}
diff --git a/src/index/suffixarray/suffixarray.go b/src/index/suffixarray/suffixarray.go
new file mode 100644
index 0000000..9c169e7
--- /dev/null
+++ b/src/index/suffixarray/suffixarray.go
@@ -0,0 +1,385 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package suffixarray implements substring search in logarithmic time using
+// an in-memory suffix array.
+//
+// Example use:
+//
+// // create index for some data
+// index := suffixarray.New(data)
+//
+// // lookup byte slice s
+// offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
+// offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
+//
+package suffixarray
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "io"
+ "math"
+ "regexp"
+ "sort"
+)
+
+// Can change for testing
+var maxData32 int = realMaxData32
+
+const realMaxData32 = math.MaxInt32
+
+// Index implements a suffix array for fast substring search.
+type Index struct {
+ data []byte
+ sa ints // suffix array for data; sa.len() == len(data)
+}
+
+// An ints is either an []int32 or an []int64.
+// That is, one of them is empty, and one is the real data.
+// The int64 form is used when len(data) > maxData32 +type ints struct { + int32 []int32 + int64 []int64 +} + +func (a *ints) len() int { + return len(a.int32) + len(a.int64) +} + +func (a *ints) get(i int) int64 { + if a.int32 != nil { + return int64(a.int32[i]) + } + return a.int64[i] +} + +func (a *ints) set(i int, v int64) { + if a.int32 != nil { + a.int32[i] = int32(v) + } else { + a.int64[i] = v + } +} + +func (a *ints) slice(i, j int) ints { + if a.int32 != nil { + return ints{a.int32[i:j], nil} + } + return ints{nil, a.int64[i:j]} +} + +// New creates a new Index for data. +// Index creation time is O(N) for N = len(data). +func New(data []byte) *Index { + ix := &Index{data: data} + if len(data) <= maxData32 { + ix.sa.int32 = make([]int32, len(data)) + text_32(data, ix.sa.int32) + } else { + ix.sa.int64 = make([]int64, len(data)) + text_64(data, ix.sa.int64) + } + return ix +} + +// writeInt writes an int x to w using buf to buffer the write. +func writeInt(w io.Writer, buf []byte, x int) error { + binary.PutVarint(buf, int64(x)) + _, err := w.Write(buf[0:binary.MaxVarintLen64]) + return err +} + +// readInt reads an int x from r using buf to buffer the read and returns x. +func readInt(r io.Reader, buf []byte) (int64, error) { + _, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error + x, _ := binary.Varint(buf) + return x, err +} + +// writeSlice writes data[:n] to w and returns n. +// It uses buf to buffer the write. +func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) { + // encode as many elements as fit into buf + p := binary.MaxVarintLen64 + m := data.len() + for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ { + p += binary.PutUvarint(buf[p:], uint64(data.get(n))) + } + + // update buffer size + binary.PutVarint(buf, int64(p)) + + // write buffer + _, err = w.Write(buf[0:p]) + return +} + +var errTooBig = errors.New("suffixarray: data too large") + +// readSlice reads data[:n] from r and returns n. +// It uses buf to buffer the read. +func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) { + // read buffer size + var size64 int64 + size64, err = readInt(r, buf) + if err != nil { + return + } + if int64(int(size64)) != size64 || int(size64) < 0 { + // We never write chunks this big anyway. + return 0, errTooBig + } + size := int(size64) + + // read buffer w/o the size + if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil { + return + } + + // decode as many elements as present in buf + for p := binary.MaxVarintLen64; p < size; n++ { + x, w := binary.Uvarint(buf[p:]) + data.set(n, int64(x)) + p += w + } + + return +} + +const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore + +// Read reads the index from r into x; x must not be nil. 
+func (x *Index) Read(r io.Reader) error { + // buffer for all reads + buf := make([]byte, bufSize) + + // read length + n64, err := readInt(r, buf) + if err != nil { + return err + } + if int64(int(n64)) != n64 || int(n64) < 0 { + return errTooBig + } + n := int(n64) + + // allocate space + if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 { + // new data is significantly smaller or larger than + // existing buffers - allocate new ones + x.data = make([]byte, n) + x.sa.int32 = nil + x.sa.int64 = nil + if n <= maxData32 { + x.sa.int32 = make([]int32, n) + } else { + x.sa.int64 = make([]int64, n) + } + } else { + // re-use existing buffers + x.data = x.data[0:n] + x.sa = x.sa.slice(0, n) + } + + // read data + if _, err := io.ReadFull(r, x.data); err != nil { + return err + } + + // read index + sa := x.sa + for sa.len() > 0 { + n, err := readSlice(r, buf, sa) + if err != nil { + return err + } + sa = sa.slice(n, sa.len()) + } + return nil +} + +// Write writes the index x to w. +func (x *Index) Write(w io.Writer) error { + // buffer for all writes + buf := make([]byte, bufSize) + + // write length + if err := writeInt(w, buf, len(x.data)); err != nil { + return err + } + + // write data + if _, err := w.Write(x.data); err != nil { + return err + } + + // write index + sa := x.sa + for sa.len() > 0 { + n, err := writeSlice(w, buf, sa) + if err != nil { + return err + } + sa = sa.slice(n, sa.len()) + } + return nil +} + +// Bytes returns the data over which the index was created. +// It must not be modified. +// +func (x *Index) Bytes() []byte { + return x.data +} + +func (x *Index) at(i int) []byte { + return x.data[x.sa.get(i):] +} + +// lookupAll returns a slice into the matching region of the index. +// The runtime is O(log(N)*len(s)). +func (x *Index) lookupAll(s []byte) ints { + // find matching suffix index range [i:j] + // find the first index where s would be the prefix + i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 }) + // starting at i, find the first index at which s is not a prefix + j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) }) + return x.sa.slice(i, j) +} + +// Lookup returns an unsorted list of at most n indices where the byte string s +// occurs in the indexed data. If n < 0, all occurrences are returned. +// The result is nil if s is empty, s is not found, or n == 0. +// Lookup time is O(log(N)*len(s) + len(result)) where N is the +// size of the indexed data. +// +func (x *Index) Lookup(s []byte, n int) (result []int) { + if len(s) > 0 && n != 0 { + matches := x.lookupAll(s) + count := matches.len() + if n < 0 || count < n { + n = count + } + // 0 <= n <= count + if n > 0 { + result = make([]int, n) + if matches.int32 != nil { + for i := range result { + result[i] = int(matches.int32[i]) + } + } else { + for i := range result { + result[i] = int(matches.int64[i]) + } + } + } + } + return +} + +// FindAllIndex returns a sorted list of non-overlapping matches of the +// regular expression r, where a match is a pair of indices specifying +// the matched slice of x.Bytes(). If n < 0, all matches are returned +// in successive order. Otherwise, at most n matches are returned and +// they may not be successive. The result is nil if there are no matches, +// or if n == 0. 
+// +func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) { + // a non-empty literal prefix is used to determine possible + // match start indices with Lookup + prefix, complete := r.LiteralPrefix() + lit := []byte(prefix) + + // worst-case scenario: no literal prefix + if prefix == "" { + return r.FindAllIndex(x.data, n) + } + + // if regexp is a literal just use Lookup and convert its + // result into match pairs + if complete { + // Lookup returns indices that may belong to overlapping matches. + // After eliminating them, we may end up with fewer than n matches. + // If we don't have enough at the end, redo the search with an + // increased value n1, but only if Lookup returned all the requested + // indices in the first place (if it returned fewer than that then + // there cannot be more). + for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { + indices := x.Lookup(lit, n1) + if len(indices) == 0 { + return + } + sort.Ints(indices) + pairs := make([]int, 2*len(indices)) + result = make([][]int, len(indices)) + count := 0 + prev := 0 + for _, i := range indices { + if count == n { + break + } + // ignore indices leading to overlapping matches + if prev <= i { + j := 2 * count + pairs[j+0] = i + pairs[j+1] = i + len(lit) + result[count] = pairs[j : j+2] + count++ + prev = i + len(lit) + } + } + result = result[0:count] + if len(result) >= n || len(indices) != n1 { + // found all matches or there's no chance to find more + // (n and n1 can be negative) + break + } + } + if len(result) == 0 { + result = nil + } + return + } + + // regexp has a non-empty literal prefix; Lookup(lit) computes + // the indices of possible complete matches; use these as starting + // points for anchored searches + // (regexp "^" matches beginning of input, not beginning of line) + r = regexp.MustCompile("^" + r.String()) // compiles because r compiled + + // same comment about Lookup applies here as in the loop above + for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { + indices := x.Lookup(lit, n1) + if len(indices) == 0 { + return + } + sort.Ints(indices) + result = result[0:0] + prev := 0 + for _, i := range indices { + if len(result) == n { + break + } + m := r.FindIndex(x.data[i:]) // anchored search - will not run off + // ignore indices leading to overlapping matches + if m != nil && prev <= i { + m[0] = i // correct m + m[1] += i + result = append(result, m) + prev = m[1] + } + } + if len(result) >= n || len(indices) != n1 { + // found all matches or there's no chance to find more + // (n and n1 can be negative) + break + } + } + if len(result) == 0 { + result = nil + } + return +} diff --git a/src/index/suffixarray/suffixarray_test.go b/src/index/suffixarray/suffixarray_test.go new file mode 100644 index 0000000..44c5041 --- /dev/null +++ b/src/index/suffixarray/suffixarray_test.go @@ -0,0 +1,615 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package suffixarray + +import ( + "bytes" + "fmt" + "io/fs" + "math/rand" + "os" + "path/filepath" + "regexp" + "sort" + "strings" + "testing" +) + +type testCase struct { + name string // name of test case + source string // source to index + patterns []string // patterns to lookup +} + +var testCases = []testCase{ + { + "empty string", + "", + []string{ + "", + "foo", + "(foo)", + ".*", + "a*", + }, + }, + + { + "all a's", + "aaaaaaaaaa", // 10 a's + []string{ + "", + "a", + "aa", + "aaa", + "aaaa", + "aaaaa", + "aaaaaa", + "aaaaaaa", + "aaaaaaaa", + "aaaaaaaaa", + "aaaaaaaaaa", + "aaaaaaaaaaa", // 11 a's + ".", + ".*", + "a+", + "aa+", + "aaaa[b]?", + "aaa*", + }, + }, + + { + "abc", + "abc", + []string{ + "a", + "b", + "c", + "ab", + "bc", + "abc", + "a.c", + "a(b|c)", + "abc?", + }, + }, + + { + "barbara*3", + "barbarabarbarabarbara", + []string{ + "a", + "bar", + "rab", + "arab", + "barbar", + "bara?bar", + }, + }, + + { + "typing drill", + "Now is the time for all good men to come to the aid of their country.", + []string{ + "Now", + "the time", + "to come the aid", + "is the time for all good men to come to the aid of their", + "to (come|the)?", + }, + }, + + { + "godoc simulation", + "package main\n\nimport(\n \"rand\"\n ", + []string{}, + }, +} + +// find all occurrences of s in source; report at most n occurrences +func find(src, s string, n int) []int { + var res []int + if s != "" && n != 0 { + // find at most n occurrences of s in src + for i := -1; n < 0 || len(res) < n; { + j := strings.Index(src[i+1:], s) + if j < 0 { + break + } + i += j + 1 + res = append(res, i) + } + } + return res +} + +func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) { + res := x.Lookup([]byte(s), n) + exp := find(tc.source, s, n) + + // check that the lengths match + if len(res) != len(exp) { + t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res)) + } + + // if n >= 0 the number of results is limited --- unless n >= all results, + // we may obtain different positions from the Index and from find (because + // Index may not find the results in the same order as find) => in general + // we cannot simply check that the res and exp lists are equal + + // check that each result is in fact a correct match and there are no duplicates + sort.Ints(res) + for i, r := range res { + if r < 0 || len(tc.source) <= r { + t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source)) + } else if !strings.HasPrefix(tc.source[r:], s) { + t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r) + } + if i > 0 && res[i-1] == r { + t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r) + } + } + + if n < 0 { + // all results computed - sorted res and exp must be equal + for i, r := range res { + e := exp[i] + if r != e { + t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r) + } + } + } +} + +func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) { + res := x.FindAllIndex(rx, n) + exp := rx.FindAllStringIndex(tc.source, n) + + // check that the lengths match + if len(res) != len(exp) { + t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res)) + } + + // if n >= 0 the number of results is limited --- unless n >= all results, + // we may obtain different positions from the Index and from regexp 
(because + // Index may not find the results in the same order as regexp) => in general + // we cannot simply check that the res and exp lists are equal + + // check that each result is in fact a correct match and the result is sorted + for i, r := range res { + if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] { + t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1]) + } else if !rx.MatchString(tc.source[r[0]:r[1]]) { + t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1]) + } + } + + if n < 0 { + // all results computed - sorted res and exp must be equal + for i, r := range res { + e := exp[i] + if r[0] != e[0] || r[1] != e[1] { + t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]", + tc.name, rx, i, e[0], e[1], r[0], r[1]) + } + } + } +} + +func testLookups(t *testing.T, tc *testCase, x *Index, n int) { + for _, pat := range tc.patterns { + testLookup(t, tc, x, pat, n) + if rx, err := regexp.Compile(pat); err == nil { + testFindAllIndex(t, tc, x, rx, n) + } + } +} + +// index is used to hide the sort.Interface +type index Index + +func (x *index) Len() int { return x.sa.len() } +func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } +func (x *index) Swap(i, j int) { + if x.sa.int32 != nil { + x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i] + } else { + x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i] + } +} + +func (x *index) at(i int) []byte { + return x.data[x.sa.get(i):] +} + +func testConstruction(t *testing.T, tc *testCase, x *Index) { + if !sort.IsSorted((*index)(x)) { + t.Errorf("failed testConstruction %s", tc.name) + } +} + +func equal(x, y *Index) bool { + if !bytes.Equal(x.data, y.data) { + return false + } + if x.sa.len() != y.sa.len() { + return false + } + n := x.sa.len() + for i := 0; i < n; i++ { + if x.sa.get(i) != y.sa.get(i) { + return false + } + } + return true +} + +// returns the serialized index size +func testSaveRestore(t *testing.T, tc *testCase, x *Index) int { + var buf bytes.Buffer + if err := x.Write(&buf); err != nil { + t.Errorf("failed writing index %s (%s)", tc.name, err) + } + size := buf.Len() + var y Index + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + old := maxData32 + defer func() { + maxData32 = old + }() + // Reread as forced 32. + y = Index{} + maxData32 = realMaxData32 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + // Reread as forced 64. 
+ y = Index{} + maxData32 = -1 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + return size +} + +func testIndex(t *testing.T) { + for _, tc := range testCases { + x := New([]byte(tc.source)) + testConstruction(t, &tc, x) + testSaveRestore(t, &tc, x) + testLookups(t, &tc, x, 0) + testLookups(t, &tc, x, 1) + testLookups(t, &tc, x, 10) + testLookups(t, &tc, x, 2e9) + testLookups(t, &tc, x, -1) + } +} + +func TestIndex32(t *testing.T) { + testIndex(t) +} + +func TestIndex64(t *testing.T) { + maxData32 = -1 + defer func() { + maxData32 = realMaxData32 + }() + testIndex(t) +} + +func TestNew32(t *testing.T) { + test(t, func(x []byte) []int { + sa := make([]int32, len(x)) + text_32(x, sa) + out := make([]int, len(sa)) + for i, v := range sa { + out[i] = int(v) + } + return out + }) +} + +func TestNew64(t *testing.T) { + test(t, func(x []byte) []int { + sa := make([]int64, len(x)) + text_64(x, sa) + out := make([]int, len(sa)) + for i, v := range sa { + out[i] = int(v) + } + return out + }) +} + +// test tests an arbitrary suffix array construction function. +// Generates many inputs, builds and checks suffix arrays. +func test(t *testing.T, build func([]byte) []int) { + t.Run("ababab...", func(t *testing.T) { + // Very repetitive input has numLMS = len(x)/2-1 + // at top level, the largest it can be. + // But maxID is only two (aba and ab$). + size := 100000 + if testing.Short() { + size = 10000 + } + x := make([]byte, size) + for i := range x { + x[i] = "ab"[i%2] + } + testSA(t, x, build) + }) + + t.Run("forcealloc", func(t *testing.T) { + // Construct a pathological input that forces + // recurse_32 to allocate a new temporary buffer. + // The input must have more than N/3 LMS-substrings, + // which we arrange by repeating an SLSLSLSLSLSL pattern + // like ababab... above, but then we must also arrange + // for a large number of distinct LMS-substrings. + // We use this pattern: + // 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ... + // This gives approximately 2¹⁵ distinct LMS-substrings. + // We need to repeat at least one substring, though, + // or else the recursion can be bypassed entirely. + x := make([]byte, 100000, 100001) + lo := byte(1) + hi := byte(255) + for i := range x { + if i%2 == 0 { + x[i] = lo + } else { + x[i] = hi + hi-- + if hi <= lo { + lo++ + if lo == 0 { + lo = 1 + } + hi = 255 + } + } + } + x[:cap(x)][len(x)] = 0 // for sais.New + testSA(t, x, build) + }) + + t.Run("exhaustive2", func(t *testing.T) { + // All inputs over {0,1} up to length 21. + // Runs in about 10 seconds on my laptop. + x := make([]byte, 30) + numFail := 0 + for n := 0; n <= 21; n++ { + if n > 12 && testing.Short() { + break + } + x[n] = 0 // for sais.New + testRec(t, x[:n], 0, 2, &numFail, build) + } + }) + + t.Run("exhaustive3", func(t *testing.T) { + // All inputs over {0,1,2} up to length 14. + // Runs in about 10 seconds on my laptop. + x := make([]byte, 30) + numFail := 0 + for n := 0; n <= 14; n++ { + if n > 8 && testing.Short() { + break + } + x[n] = 0 // for sais.New + testRec(t, x[:n], 0, 3, &numFail, build) + } + }) +} + +// testRec fills x[i:] with all possible combinations of values in [1,max] +// and then calls testSA(t, x, build) for each one. 
+func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) { + if i < len(x) { + for x[i] = 1; x[i] <= byte(max); x[i]++ { + testRec(t, x, i+1, max, numFail, build) + } + return + } + + if !testSA(t, x, build) { + *numFail++ + if *numFail >= 10 { + t.Errorf("stopping after %d failures", *numFail) + t.FailNow() + } + } +} + +// testSA tests the suffix array build function on the input x. +// It constructs the suffix array and then checks that it is correct. +func testSA(t *testing.T, x []byte, build func([]byte) []int) bool { + defer func() { + if e := recover(); e != nil { + t.Logf("build %v", x) + panic(e) + } + }() + sa := build(x) + if len(sa) != len(x) { + t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x)) + return false + } + for i := 0; i+1 < len(sa); i++ { + if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) { + t.Errorf("build %s: sa out of range: %v\n", x, sa) + return false + } + if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 { + t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1]) + return false + } + } + + return true +} + +var ( + benchdata = make([]byte, 1e6) + benchrand = make([]byte, 1e6) +) + +// Of all possible inputs, the random bytes have the least amount of substring +// repetition, and the repeated bytes have the most. For most algorithms, +// the running time of every input will be between these two. +func benchmarkNew(b *testing.B, random bool) { + b.ReportAllocs() + b.StopTimer() + data := benchdata + if random { + data = benchrand + if data[0] == 0 { + for i := range data { + data[i] = byte(rand.Intn(256)) + } + } + } + b.StartTimer() + b.SetBytes(int64(len(data))) + for i := 0; i < b.N; i++ { + New(data) + } +} + +func makeText(name string) ([]byte, error) { + var data []byte + switch name { + case "opticks": + var err error + data, err = os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") + if err != nil { + return nil, err + } + case "go": + err := filepath.WalkDir("../..", func(path string, info fs.DirEntry, err error) error { + if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() { + file, err := os.ReadFile(path) + if err != nil { + return err + } + data = append(data, file...) 
+ } + return nil + }) + if err != nil { + return nil, err + } + case "zero": + data = make([]byte, 50e6) + case "rand": + data = make([]byte, 50e6) + for i := range data { + data[i] = byte(rand.Intn(256)) + } + } + return data, nil +} + +func setBits(bits int) (cleanup func()) { + if bits == 32 { + maxData32 = realMaxData32 + } else { + maxData32 = -1 // force use of 64-bit code + } + return func() { + maxData32 = realMaxData32 + } +} + +func BenchmarkNew(b *testing.B) { + for _, text := range []string{"opticks", "go", "zero", "rand"} { + b.Run("text="+text, func(b *testing.B) { + data, err := makeText(text) + if err != nil { + b.Fatal(err) + } + if testing.Short() && len(data) > 5e6 { + data = data[:5e6] + } + for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} { + if len(data) < size { + continue + } + data := data[:size] + name := fmt.Sprintf("%dK", size/1e3) + if size >= 1e6 { + name = fmt.Sprintf("%dM", size/1e6) + } + b.Run("size="+name, func(b *testing.B) { + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.SetBytes(int64(len(data))) + b.ReportAllocs() + for i := 0; i < b.N; i++ { + New(data) + } + }) + } + }) + } + }) + } +} + +func BenchmarkSaveRestore(b *testing.B) { + r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence + data := make([]byte, 1<<20) // 1MB of data to index + for i := range data { + data[i] = byte(r.Intn(256)) + } + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.StopTimer() + x := New(data) + size := testSaveRestore(nil, nil, x) // verify correctness + buf := bytes.NewBuffer(make([]byte, size)) // avoid growing + b.SetBytes(int64(size)) + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + buf.Reset() + if err := x.Write(buf); err != nil { + b.Fatal(err) + } + var y Index + if err := y.Read(buf); err != nil { + b.Fatal(err) + } + } + }) + } +} |
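
Editor's addendum (not part of the upstream change): the induced-sorting code above relies throughout on the type-S / type-L / LMS vocabulary defined in the placeLMS comments. The short standalone sketch below restates only that backward "LMS-substring iterator" scan so the definitions can be checked by hand; the function name lmsStarts and the sample input "abracadabra" are the editor's choices, not part of the library. Like the library loop, it deliberately skips the sentinel position len(text).

package main

import "fmt"

// lmsStarts mirrors the backward "LMS-substring iterator" scan used by
// placeLMS_8_64 and its copies: it tracks whether the current position is
// type S or type L and records every type-S position whose left neighbor
// is type L (an LMS-substring start).
func lmsStarts(text []byte) []int {
	var starts []int
	c0, c1, isTypeS := byte(0), byte(0), false
	for i := len(text) - 1; i >= 0; i-- {
		c0, c1 = text[i], c0
		if c0 < c1 {
			isTypeS = true // text[i] < text[i+1], so position i is type S
		} else if c0 > c1 && isTypeS {
			isTypeS = false // position i is type L, so position i+1 starts an LMS-substring
			starts = append(starts, i+1)
		}
	}
	return starts
}

func main() {
	// The scan runs right to left, so the LMS positions come out in
	// decreasing order; for "abracadabra" this prints [7 5 3].
	fmt.Println(lmsStarts([]byte("abracadabra")))
}

Positions 3, 5, and 7 are the interior 'a's that follow a larger character, which is exactly the set of indexes placeLMS_8_64 buckets before the recursive sort.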
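Editor's addendum (not part of the upstream change): besides New and Lookup, the suffixarray.go file in this diff adds a serialization pair (Index.Write / Index.Read) and regexp search via FindAllIndex. The following is a minimal round-trip sketch assuming nothing beyond that exported API; the sample text, the pattern, and the use of a bytes.Buffer as the destination are arbitrary illustrations.

package main

import (
	"bytes"
	"fmt"
	"index/suffixarray"
	"regexp"
)

func main() {
	data := []byte("to be or not to be")
	idx := suffixarray.New(data)

	// Serialize the index, then restore it into a fresh Index value.
	var buf bytes.Buffer
	if err := idx.Write(&buf); err != nil {
		panic(err)
	}
	var restored suffixarray.Index
	if err := restored.Read(&buf); err != nil {
		panic(err)
	}

	// FindAllIndex reports sorted, non-overlapping matches of the regexp.
	re := regexp.MustCompile(`to \w+`)
	for _, m := range restored.FindAllIndex(re, -1) {
		fmt.Printf("%d-%d %q\n", m[0], m[1], data[m[0]:m[1]])
	}
}

Because Read restores both the data and the suffix array, the restored index answers queries without re-running the SAIS construction, and FindAllIndex uses the pattern's literal prefix ("to ") with Lookup to seed anchored regexp searches, as described in its comments above.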