forked from forgejo/forgejo
Support elastic search for code search (#10273)
* Support elastic search for code search * Finished elastic search implementation and add some tests * Enable test on drone and added docs * Add new fields to elastic search * Fix bug * remove unused changes * Use indexer alias to keep the gitea indexer version * Improve codes * Some code improvements * The real indexer name changed to xxx.v1 Co-authored-by: zeripath <art27@cantab.net>
This commit is contained in:
parent
d257485bc0
commit
9bc69ff26e
14 changed files with 694 additions and 164 deletions
|
@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
|||
})
|
||||
}
|
||||
|
||||
// openIndexer open the index at the specified path, checking for metadata
|
||||
// openBleveIndexer open the index at the specified path, checking for metadata
|
||||
// updates and bleve version updates. If index needs to be created (or
|
||||
// re-created), returns (nil, nil)
|
||||
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
|
||||
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
|
||||
_, err := os.Stat(path)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
|
@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
|
|||
return repoIndexerDocType
|
||||
}
|
||||
|
||||
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
|
||||
// Ignore vendored files in code search
|
||||
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
|
||||
return nil
|
||||
}
|
||||
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
|
||||
RunInDir(repo.RepoPath())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
|
||||
return fmt.Errorf("Misformatted git cat-file output: %v", err)
|
||||
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
|
||||
return addDelete(update.Filename, repo, batch)
|
||||
}
|
||||
|
||||
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
|
||||
RunInDirBytes(repo.RepoPath())
|
||||
if err != nil {
|
||||
return err
|
||||
} else if !base.IsTextFile(fileContents) {
|
||||
// FIXME: UTF-16 files will probably fail here
|
||||
return nil
|
||||
}
|
||||
|
||||
id := filenameIndexerID(repo.ID, update.Filename)
|
||||
return batch.Index(id, &RepoIndexerData{
|
||||
RepoID: repo.ID,
|
||||
CommitID: commitSha,
|
||||
Content: string(charset.ToUTF8DropErrors(fileContents)),
|
||||
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
|
||||
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
|
||||
id := filenameIndexerID(repo.ID, filename)
|
||||
return batch.Delete(id)
|
||||
}
|
||||
|
||||
const (
|
||||
repoIndexerAnalyzer = "repoIndexerAnalyzer"
|
||||
repoIndexerDocType = "repoIndexerDocType"
|
||||
repoIndexerLatestVersion = 5
|
||||
)
|
||||
|
||||
// createRepoIndexer create a repo indexer if one does not already exist
|
||||
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
|
||||
// createBleveIndexer create a bleve repo indexer if one does not already exist
|
||||
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
|
||||
docMapping := bleve.NewDocumentMapping()
|
||||
numericFieldMapping := bleve.NewNumericFieldMapping()
|
||||
numericFieldMapping.IncludeInAll = false
|
||||
|
@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
|
|||
return indexer, nil
|
||||
}
|
||||
|
||||
func filenameIndexerID(repoID int64, filename string) string {
|
||||
return indexerID(repoID) + "_" + filename
|
||||
}
|
||||
|
||||
func filenameOfIndexerID(indexerID string) string {
|
||||
index := strings.IndexByte(indexerID, '_')
|
||||
if index == -1 {
|
||||
log.Error("Unexpected ID in repo indexer: %s", indexerID)
|
||||
}
|
||||
return indexerID[index+1:]
|
||||
}
|
||||
|
||||
var (
|
||||
_ Indexer = &BleveIndexer{}
|
||||
)
|
||||
|
@ -230,10 +178,51 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
|
|||
return indexer, created, err
|
||||
}
|
||||
|
||||
func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
|
||||
// Ignore vendored files in code search
|
||||
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
|
||||
return nil
|
||||
}
|
||||
|
||||
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
|
||||
RunInDir(repo.RepoPath())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
|
||||
return fmt.Errorf("Misformatted git cat-file output: %v", err)
|
||||
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
|
||||
return b.addDelete(update.Filename, repo, batch)
|
||||
}
|
||||
|
||||
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
|
||||
RunInDirBytes(repo.RepoPath())
|
||||
if err != nil {
|
||||
return err
|
||||
} else if !base.IsTextFile(fileContents) {
|
||||
// FIXME: UTF-16 files will probably fail here
|
||||
return nil
|
||||
}
|
||||
|
||||
id := filenameIndexerID(repo.ID, update.Filename)
|
||||
return batch.Index(id, &RepoIndexerData{
|
||||
RepoID: repo.ID,
|
||||
CommitID: commitSha,
|
||||
Content: string(charset.ToUTF8DropErrors(fileContents)),
|
||||
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
|
||||
func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
|
||||
id := filenameIndexerID(repo.ID, filename)
|
||||
return batch.Delete(id)
|
||||
}
|
||||
|
||||
// init init the indexer
|
||||
func (b *BleveIndexer) init() (bool, error) {
|
||||
var err error
|
||||
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
|
||||
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
@ -241,7 +230,7 @@ func (b *BleveIndexer) init() (bool, error) {
|
|||
return false, nil
|
||||
}
|
||||
|
||||
b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
|
||||
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
|
|||
}
|
||||
|
||||
// Index indexes the data
|
||||
func (b *BleveIndexer) Index(repoID int64) error {
|
||||
repo, err := models.GetRepositoryByID(repoID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sha, err := getDefaultBranchSha(repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
changes, err := getRepoChanges(repo, sha)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if changes == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
|
||||
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
|
||||
for _, update := range changes.Updates {
|
||||
if err := addUpdate(sha, update, repo, batch); err != nil {
|
||||
if err := b.addUpdate(sha, update, repo, batch); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, filename := range changes.RemovedFilenames {
|
||||
if err := addDelete(filename, repo, batch); err != nil {
|
||||
if err := b.addDelete(filename, repo, batch); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err = batch.Flush(); err != nil {
|
||||
return err
|
||||
}
|
||||
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
|
||||
return batch.Flush()
|
||||
}
|
||||
|
||||
// Delete deletes indexes by ids
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue