diff options
Diffstat (limited to 'modules/indexer/code/bleve/bleve.go')
-rw-r--r-- | modules/indexer/code/bleve/bleve.go | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go new file mode 100644 index 00000000..66724a34 --- /dev/null +++ b/modules/indexer/code/bleve/bleve.go @@ -0,0 +1,352 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package bleve + +import ( + "bufio" + "context" + "fmt" + "io" + "strconv" + "strings" + "time" + + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/charset" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/indexer/code/internal" + indexer_internal "code.gitea.io/gitea/modules/indexer/internal" + inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" + "code.gitea.io/gitea/modules/typesniffer" + + "github.com/blevesearch/bleve/v2" + analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" + analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" + "github.com/blevesearch/bleve/v2/analysis/token/camelcase" + "github.com/blevesearch/bleve/v2/analysis/token/lowercase" + "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search/query" + "github.com/go-enry/go-enry/v2" +) + +const ( + unicodeNormalizeName = "unicodeNormalize" + maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 + // see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311 + maxFuzziness = 2 +) + +func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { + return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{ + "type": unicodenorm.Name, + "form": unicodenorm.NFC, + }) +} + +// RepoIndexerData data stored in the repo indexer +type RepoIndexerData struct { + RepoID int64 + CommitID string + Content string + Language string + UpdatedAt time.Time +} + +// Type returns the document type, for bleve's mapping.Classifier interface. +func (d *RepoIndexerData) Type() string { + return repoIndexerDocType +} + +const ( + repoIndexerAnalyzer = "repoIndexerAnalyzer" + repoIndexerDocType = "repoIndexerDocType" + repoIndexerLatestVersion = 6 +) + +// generateBleveIndexMapping generates a bleve index mapping for the repo indexer +func generateBleveIndexMapping() (mapping.IndexMapping, error) { + docMapping := bleve.NewDocumentMapping() + numericFieldMapping := bleve.NewNumericFieldMapping() + numericFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping) + + textFieldMapping := bleve.NewTextFieldMapping() + textFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("Content", textFieldMapping) + + termFieldMapping := bleve.NewTextFieldMapping() + termFieldMapping.IncludeInAll = false + termFieldMapping.Analyzer = analyzer_keyword.Name + docMapping.AddFieldMappingsAt("Language", termFieldMapping) + docMapping.AddFieldMappingsAt("CommitID", termFieldMapping) + + timeFieldMapping := bleve.NewDateTimeFieldMapping() + timeFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) + + mapping := bleve.NewIndexMapping() + if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { + return nil, err + } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ + "type": analyzer_custom.Name, + "char_filters": []string{}, + "tokenizer": unicode.Name, + "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, + }); err != nil { + return nil, err + } + mapping.DefaultAnalyzer = repoIndexerAnalyzer + mapping.AddDocumentMapping(repoIndexerDocType, docMapping) + mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) + + return mapping, nil +} + +var _ internal.Indexer = &Indexer{} + +// Indexer represents a bleve indexer implementation +type Indexer struct { + inner *inner_bleve.Indexer + indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much +} + +// NewIndexer creates a new bleve local indexer +func NewIndexer(indexDir string) *Indexer { + inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) + return &Indexer{ + Indexer: inner, + inner: inner, + } +} + +func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string, + update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch, +) error { + // Ignore vendored files in code search + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { + return nil + } + + size := update.Size + + var err error + if !update.Sized { + var stdout string + stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + if err != nil { + return err + } + if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { + return fmt.Errorf("misformatted git cat-file output: %w", err) + } + } + + if size > setting.Indexer.MaxIndexerFileSize { + return b.addDelete(update.Filename, repo, batch) + } + + if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { + return err + } + + _, _, size, err = git.ReadBatchLine(batchReader) + if err != nil { + return err + } + + fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) + if err != nil { + return err + } else if !typesniffer.DetectContentType(fileContents).IsText() { + // FIXME: UTF-16 files will probably fail here + return nil + } + + if _, err = batchReader.Discard(1); err != nil { + return err + } + id := internal.FilenameIndexerID(repo.ID, update.Filename) + return batch.Index(id, &RepoIndexerData{ + RepoID: repo.ID, + CommitID: commitSha, + Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + Language: analyze.GetCodeLanguage(update.Filename, fileContents), + UpdatedAt: time.Now().UTC(), + }) +} + +func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error { + id := internal.FilenameIndexerID(repo.ID, filename) + return batch.Delete(id) +} + +// Index indexes the data +func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { + batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) + if len(changes.Updates) > 0 { + // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first! + if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil { + log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err) + return err + } + + batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath()) + defer cancel() + + for _, update := range changes.Updates { + if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil { + return err + } + } + cancel() + } + for _, filename := range changes.RemovedFilenames { + if err := b.addDelete(filename, repo, batch); err != nil { + return err + } + } + return batch.Flush() +} + +// Delete deletes indexes by ids +func (b *Indexer) Delete(_ context.Context, repoID int64) error { + query := inner_bleve.NumericEqualityQuery(repoID, "RepoID") + searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false) + result, err := b.inner.Indexer.Search(searchRequest) + if err != nil { + return err + } + batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) + for _, hit := range result.Hits { + if err = batch.Delete(hit.ID); err != nil { + return err + } + } + return batch.Flush() +} + +// Search searches for files in the specified repo. +// Returns the matching file-paths +func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { + var ( + indexerQuery query.Query + keywordQuery query.Query + ) + + phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) + phraseQuery.FieldVal = "Content" + phraseQuery.Analyzer = repoIndexerAnalyzer + keywordQuery = phraseQuery + if opts.IsKeywordFuzzy { + phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator) + } + + if len(opts.RepoIDs) > 0 { + repoQueries := make([]query.Query, 0, len(opts.RepoIDs)) + for _, repoID := range opts.RepoIDs { + repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID")) + } + + indexerQuery = bleve.NewConjunctionQuery( + bleve.NewDisjunctionQuery(repoQueries...), + keywordQuery, + ) + } else { + indexerQuery = keywordQuery + } + + // Save for reuse without language filter + facetQuery := indexerQuery + if len(opts.Language) > 0 { + languageQuery := bleve.NewMatchQuery(opts.Language) + languageQuery.FieldVal = "Language" + languageQuery.Analyzer = analyzer_keyword.Name + + indexerQuery = bleve.NewConjunctionQuery( + indexerQuery, + languageQuery, + ) + } + + from, pageSize := opts.GetSkipTake() + searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) + searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + searchRequest.IncludeLocations = true + + if len(opts.Language) == 0 { + searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + } + + result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest) + if err != nil { + return 0, nil, nil, err + } + + total := int64(result.Total) + + searchResults := make([]*internal.SearchResult, len(result.Hits)) + for i, hit := range result.Hits { + startIndex, endIndex := -1, -1 + for _, locations := range hit.Locations["Content"] { + location := locations[0] + locationStart := int(location.Start) + locationEnd := int(location.End) + if startIndex < 0 || locationStart < startIndex { + startIndex = locationStart + } + if endIndex < 0 || locationEnd > endIndex { + endIndex = locationEnd + } + } + language := hit.Fields["Language"].(string) + var updatedUnix timeutil.TimeStamp + if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { + updatedUnix = timeutil.TimeStamp(t.Unix()) + } + searchResults[i] = &internal.SearchResult{ + RepoID: int64(hit.Fields["RepoID"].(float64)), + StartIndex: startIndex, + EndIndex: endIndex, + Filename: internal.FilenameOfIndexerID(hit.ID), + Content: hit.Fields["Content"].(string), + CommitID: hit.Fields["CommitID"].(string), + UpdatedUnix: updatedUnix, + Language: language, + Color: enry.GetColor(language), + } + } + + searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10) + if len(opts.Language) > 0 { + // Use separate query to go get all language counts + facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false) + facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + facetRequest.IncludeLocations = true + facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + + if result, err = b.inner.Indexer.Search(facetRequest); err != nil { + return 0, nil, nil, err + } + } + languagesFacet := result.Facets["languages"] + for _, term := range languagesFacet.Terms.Terms() { + if len(term.Term) == 0 { + continue + } + searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{ + Language: term.Term, + Color: enry.GetColor(term.Term), + Count: term.Count, + }) + } + return total, searchResults, searchResultLanguages, nil +} |