1
0
Fork 0
forked from forgejo/forgejo

Support elastic search for code search (#10273)

* Support elastic search for code search

* Finished elastic search implementation and add some tests

* Enable test on drone and added docs

* Add new fields to elastic search

* Fix bug

* remove unused changes

* Use indexer alias to keep the gitea indexer version

* Improve codes

* Some code improvements

* The real indexer name changed to xxx.v1

Co-authored-by: zeripath <art27@cantab.net>
This commit is contained in:
Lunny Xiao 2020-08-31 00:08:01 +08:00 committed by GitHub
parent d257485bc0
commit 9bc69ff26e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 694 additions and 164 deletions

View file

@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}
// openIndexer open the index at the specified path, checking for metadata
// openBleveIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return addDelete(update.Filename, repo, batch)
}
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 5
)
// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
// createBleveIndexer create a bleve repo indexer if one does not already exist
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
return indexer, nil
}
func filenameIndexerID(repoID int64, filename string) string {
return indexerID(repoID) + "_" + filename
}
func filenameOfIndexerID(indexerID string) string {
index := strings.IndexByte(indexerID, '_')
if index == -1 {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
return indexerID[index+1:]
}
var (
_ Indexer = &BleveIndexer{}
)
@ -230,10 +178,51 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
return indexer, created, err
}
func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}
// init init the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
@ -241,7 +230,7 @@ func (b *BleveIndexer) init() (bool, error) {
return false, nil
}
b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
}
// Index indexes the data
func (b *BleveIndexer) Index(repoID int64) error {
repo, err := models.GetRepositoryByID(repoID)
if err != nil {
return err
}
sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}
func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(sha, update, repo, batch); err != nil {
if err := b.addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
return batch.Flush()
}
// Delete deletes indexes by ids