forked from forgejo/forgejo
Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revision
This commit is contained in:
parent
11e316654e
commit
a380cfd8e0
161 changed files with 9911 additions and 4233 deletions
19
vendor/github.com/blevesearch/bleve/index/analysis.go
generated
vendored
19
vendor/github.com/blevesearch/bleve/index/analysis.go
generated
vendored
|
@ -15,10 +15,20 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeAnalysisResult int
|
||||
|
||||
func init() {
|
||||
var ar AnalysisResult
|
||||
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size())
|
||||
}
|
||||
|
||||
type IndexRow interface {
|
||||
KeySize() int
|
||||
KeyTo([]byte) (int, error)
|
||||
|
@ -39,6 +49,15 @@ type AnalysisResult struct {
|
|||
Length []int
|
||||
}
|
||||
|
||||
func (a *AnalysisResult) Size() int {
|
||||
rv := reflectStaticSizeAnalysisResult
|
||||
for _, analyzedI := range a.Analyzed {
|
||||
rv += analyzedI.Size()
|
||||
}
|
||||
rv += len(a.Length) * size.SizeOfInt
|
||||
return rv
|
||||
}
|
||||
|
||||
type AnalysisWork struct {
|
||||
i Index
|
||||
d *document.Document
|
||||
|
|
124
vendor/github.com/blevesearch/bleve/index/index.go
generated
vendored
124
vendor/github.com/blevesearch/bleve/index/index.go
generated
vendored
|
@ -18,11 +18,23 @@ import (
|
|||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermFieldDoc int
|
||||
var reflectStaticSizeTermFieldVector int
|
||||
|
||||
func init() {
|
||||
var tfd TermFieldDoc
|
||||
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
|
||||
var tfv TermFieldVector
|
||||
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
|
||||
}
|
||||
|
||||
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
|
||||
|
||||
type Index interface {
|
||||
|
@ -68,6 +80,8 @@ type IndexReader interface {
|
|||
Document(id string) (*document.Document, error)
|
||||
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
|
||||
|
||||
DocValueReader(fields []string) (DocValueReader, error)
|
||||
|
||||
Fields() ([]string, error)
|
||||
|
||||
GetInternal(key []byte) ([]byte, error)
|
||||
|
@ -84,6 +98,29 @@ type IndexReader interface {
|
|||
Close() error
|
||||
}
|
||||
|
||||
// The Regexp interface defines the subset of the regexp.Regexp API
|
||||
// methods that are used by bleve indexes, allowing callers to pass in
|
||||
// alternate implementations.
|
||||
type Regexp interface {
|
||||
FindStringIndex(s string) (loc []int)
|
||||
|
||||
LiteralPrefix() (prefix string, complete bool)
|
||||
|
||||
String() string
|
||||
}
|
||||
|
||||
type IndexReaderRegexp interface {
|
||||
FieldDictRegexp(field string, regex string) (FieldDict, error)
|
||||
}
|
||||
|
||||
type IndexReaderFuzzy interface {
|
||||
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
|
||||
}
|
||||
|
||||
type IndexReaderOnly interface {
|
||||
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
|
||||
}
|
||||
|
||||
// FieldTerms contains the terms used by a document, keyed by field
|
||||
type FieldTerms map[string][]string
|
||||
|
||||
|
@ -115,6 +152,11 @@ type TermFieldVector struct {
|
|||
End uint64
|
||||
}
|
||||
|
||||
func (tfv *TermFieldVector) Size() int {
|
||||
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
|
||||
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
// IndexInternalID is an opaque document identifier interal to the index impl
|
||||
type IndexInternalID []byte
|
||||
|
||||
|
@ -134,14 +176,27 @@ type TermFieldDoc struct {
|
|||
Vectors []*TermFieldVector
|
||||
}
|
||||
|
||||
func (tfd *TermFieldDoc) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
|
||||
len(tfd.Term) + len(tfd.ID)
|
||||
|
||||
for _, entry := range tfd.Vectors {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
// Reset allows an already allocated TermFieldDoc to be reused
|
||||
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
|
||||
// remember the []byte used for the ID
|
||||
id := tfd.ID
|
||||
vectors := tfd.Vectors
|
||||
// idiom to copy over from empty TermFieldDoc (0 allocations)
|
||||
*tfd = TermFieldDoc{}
|
||||
// reuse the []byte already allocated (and reset len to 0)
|
||||
tfd.ID = id[:0]
|
||||
tfd.Vectors = vectors[:0]
|
||||
return tfd
|
||||
}
|
||||
|
||||
|
@ -161,6 +216,8 @@ type TermFieldReader interface {
|
|||
// Count returns the number of documents contains the term in this field.
|
||||
Count() uint64
|
||||
Close() error
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type DictEntry struct {
|
||||
|
@ -185,12 +242,18 @@ type DocIDReader interface {
|
|||
// will start there instead. If ID is greater than or equal to the end of
|
||||
// the range, Next() call will return io.EOF.
|
||||
Advance(ID IndexInternalID) (IndexInternalID, error)
|
||||
|
||||
Size() int
|
||||
|
||||
Close() error
|
||||
}
|
||||
|
||||
type BatchCallback func(error)
|
||||
|
||||
type Batch struct {
|
||||
IndexOps map[string]*document.Document
|
||||
InternalOps map[string][]byte
|
||||
IndexOps map[string]*document.Document
|
||||
InternalOps map[string][]byte
|
||||
persistedCallback BatchCallback
|
||||
}
|
||||
|
||||
func NewBatch() *Batch {
|
||||
|
@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) {
|
|||
b.InternalOps[string(key)] = nil
|
||||
}
|
||||
|
||||
func (b *Batch) SetPersistedCallback(f BatchCallback) {
|
||||
b.persistedCallback = f
|
||||
}
|
||||
|
||||
func (b *Batch) PersistedCallback() BatchCallback {
|
||||
return b.persistedCallback
|
||||
}
|
||||
|
||||
func (b *Batch) String() string {
|
||||
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
|
||||
for k, v := range b.IndexOps {
|
||||
|
@ -238,4 +309,53 @@ func (b *Batch) String() string {
|
|||
func (b *Batch) Reset() {
|
||||
b.IndexOps = make(map[string]*document.Document)
|
||||
b.InternalOps = make(map[string][]byte)
|
||||
b.persistedCallback = nil
|
||||
}
|
||||
|
||||
func (b *Batch) Merge(o *Batch) {
|
||||
for k, v := range o.IndexOps {
|
||||
b.IndexOps[k] = v
|
||||
}
|
||||
for k, v := range o.InternalOps {
|
||||
b.InternalOps[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Batch) TotalDocSize() int {
|
||||
var s int
|
||||
for k, v := range b.IndexOps {
|
||||
if v != nil {
|
||||
s += v.Size() + size.SizeOfString
|
||||
}
|
||||
s += len(k)
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// Optimizable represents an optional interface that implementable by
|
||||
// optimizable resources (e.g., TermFieldReaders, Searchers). These
|
||||
// optimizable resources are provided the same OptimizableContext
|
||||
// instance, so that they can coordinate via dynamic interface
|
||||
// casting.
|
||||
type Optimizable interface {
|
||||
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
|
||||
}
|
||||
|
||||
// Represents a result of optimization -- see the Finish() method.
|
||||
type Optimized interface{}
|
||||
|
||||
type OptimizableContext interface {
|
||||
// Once all the optimzable resources have been provided the same
|
||||
// OptimizableContext instance, the optimization preparations are
|
||||
// finished or completed via the Finish() method.
|
||||
//
|
||||
// Depending on the optimization being performed, the Finish()
|
||||
// method might return a non-nil Optimized instance. For example,
|
||||
// the Optimized instance might represent an optimized
|
||||
// TermFieldReader instance.
|
||||
Finish() (Optimized, error)
|
||||
}
|
||||
|
||||
type DocValueReader interface {
|
||||
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
|
||||
}
|
||||
|
|
272
vendor/github.com/blevesearch/bleve/index/scorch/introducer.go
generated
vendored
272
vendor/github.com/blevesearch/bleve/index/scorch/introducer.go
generated
vendored
|
@ -19,7 +19,9 @@ import (
|
|||
"sync/atomic"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
)
|
||||
|
||||
type segmentIntroduction struct {
|
||||
|
@ -29,8 +31,14 @@ type segmentIntroduction struct {
|
|||
ids []string
|
||||
internal map[string][]byte
|
||||
|
||||
applied chan error
|
||||
persisted chan error
|
||||
applied chan error
|
||||
persisted chan error
|
||||
persistedCallback index.BatchCallback
|
||||
}
|
||||
|
||||
type persistIntroduction struct {
|
||||
persisted map[uint64]segment.Segment
|
||||
applied notificationChan
|
||||
}
|
||||
|
||||
type epochWatcher struct {
|
||||
|
@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() {
|
|||
var epochWatchers []*epochWatcher
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
@ -64,6 +74,9 @@ OUTER:
|
|||
continue OUTER
|
||||
}
|
||||
|
||||
case persist := <-s.persists:
|
||||
s.introducePersist(persist)
|
||||
|
||||
case revertTo := <-s.revertToSnapshots:
|
||||
err := s.revertToSnapshot(revertTo)
|
||||
if err != nil {
|
||||
|
@ -92,32 +105,38 @@ OUTER:
|
|||
}
|
||||
|
||||
func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||
// acquire lock
|
||||
s.rootLock.Lock()
|
||||
atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1)
|
||||
|
||||
nsegs := len(s.root.segment)
|
||||
s.rootLock.RLock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
nsegs := len(root.segment)
|
||||
|
||||
// prepare new index snapshot
|
||||
newSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
segment: make([]*SegmentSnapshot, 0, nsegs+1),
|
||||
offsets: make([]uint64, 0, nsegs+1),
|
||||
internal: make(map[string][]byte, len(s.root.internal)),
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
internal: make(map[string][]byte, len(root.internal)),
|
||||
refs: 1,
|
||||
creator: "introduceSegment",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
// iterate through current segments
|
||||
var running uint64
|
||||
for i := range s.root.segment {
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i := range root.segment {
|
||||
// see if optimistic work included this segment
|
||||
delta, ok := next.obsoletes[s.root.segment[i].id]
|
||||
delta, ok := next.obsoletes[root.segment[i].id]
|
||||
if !ok {
|
||||
var err error
|
||||
delta, err = s.root.segment[i].segment.DocNumbers(next.ids)
|
||||
delta, err = root.segment[i].segment.DocNumbers(next.ids)
|
||||
if err != nil {
|
||||
s.rootLock.Unlock()
|
||||
next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
|
||||
close(next.applied)
|
||||
_ = newSnapshot.DecRef()
|
||||
|
@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
}
|
||||
|
||||
newss := &SegmentSnapshot{
|
||||
id: s.root.segment[i].id,
|
||||
segment: s.root.segment[i].segment,
|
||||
cachedDocs: s.root.segment[i].cachedDocs,
|
||||
id: root.segment[i].id,
|
||||
segment: root.segment[i].segment,
|
||||
cachedDocs: root.segment[i].cachedDocs,
|
||||
creator: root.segment[i].creator,
|
||||
}
|
||||
|
||||
// apply new obsoletions
|
||||
if s.root.segment[i].deleted == nil {
|
||||
if root.segment[i].deleted == nil {
|
||||
newss.deleted = delta
|
||||
} else {
|
||||
newss.deleted = roaring.Or(s.root.segment[i].deleted, delta)
|
||||
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
|
||||
}
|
||||
if newss.deleted.IsEmpty() {
|
||||
newss.deleted = nil
|
||||
}
|
||||
|
||||
// check for live size before copying
|
||||
if newss.LiveSize() > 0 {
|
||||
newSnapshot.segment = append(newSnapshot.segment, newss)
|
||||
s.root.segment[i].segment.AddRef()
|
||||
root.segment[i].segment.AddRef()
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
running += s.root.segment[i].Count()
|
||||
running += newss.segment.Count()
|
||||
}
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
// append new segment, if any, to end of the new index snapshot
|
||||
if next.data != nil {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: next.id,
|
||||
segment: next.data, // take ownership of next.data's ref-count
|
||||
cachedDocs: &cachedDocs{cache: nil},
|
||||
creator: "introduceSegment",
|
||||
}
|
||||
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
|
||||
// increment numItemsIntroduced which tracks the number of items
|
||||
// queued for persistence.
|
||||
atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1)
|
||||
}
|
||||
// copy old values
|
||||
for key, oldVal := range s.root.internal {
|
||||
for key, oldVal := range root.internal {
|
||||
newSnapshot.internal[key] = oldVal
|
||||
}
|
||||
// set new values and apply deletes
|
||||
|
@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
delete(newSnapshot.internal, key)
|
||||
}
|
||||
}
|
||||
|
||||
newSnapshot.updateSize()
|
||||
s.rootLock.Lock()
|
||||
if next.persisted != nil {
|
||||
s.rootPersisted = append(s.rootPersisted, next.persisted)
|
||||
}
|
||||
if next.persistedCallback != nil {
|
||||
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback)
|
||||
}
|
||||
// swap in new index snapshot
|
||||
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||
// acquire lock
|
||||
func (s *Scorch) introducePersist(persist *persistIntroduction) {
|
||||
atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1)
|
||||
|
||||
s.rootLock.Lock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
nextSnapshotEpoch := s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
s.rootLock.Unlock()
|
||||
|
||||
// prepare new index snapshot
|
||||
currSize := len(s.root.segment)
|
||||
newSize := currSize + 1 - len(nextMerge.old)
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
// empty segments deletion
|
||||
if nextMerge.new == nil {
|
||||
newSize--
|
||||
newIndexSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
epoch: nextSnapshotEpoch,
|
||||
segment: make([]*SegmentSnapshot, len(root.segment)),
|
||||
offsets: make([]uint64, len(root.offsets)),
|
||||
internal: make(map[string][]byte, len(root.internal)),
|
||||
refs: 1,
|
||||
creator: "introducePersist",
|
||||
}
|
||||
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i, segmentSnapshot := range root.segment {
|
||||
// see if this segment has been replaced
|
||||
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: segmentSnapshot.id,
|
||||
segment: replacement,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
creator: "introducePersist",
|
||||
}
|
||||
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
||||
delete(persist.persisted, segmentSnapshot.id)
|
||||
|
||||
// update items persisted incase of a new segment snapshot
|
||||
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||
fileSegments++
|
||||
} else {
|
||||
newIndexSnapshot.segment[i] = root.segment[i]
|
||||
newIndexSnapshot.segment[i].segment.AddRef()
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
newIndexSnapshot.offsets[i] = root.offsets[i]
|
||||
}
|
||||
|
||||
for k, v := range root.internal {
|
||||
newIndexSnapshot.internal[k] = v
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
newIndexSnapshot.updateSize()
|
||||
s.rootLock.Lock()
|
||||
rootPrev := s.root
|
||||
s.root = newIndexSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
}
|
||||
|
||||
close(persist.applied)
|
||||
}
|
||||
|
||||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
|
||||
|
||||
s.rootLock.RLock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
newSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
segment: make([]*SegmentSnapshot, 0, newSize),
|
||||
offsets: make([]uint64, 0, newSize),
|
||||
internal: s.root.internal,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
internal: root.internal,
|
||||
refs: 1,
|
||||
creator: "introduceMerge",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
// iterate through current segments
|
||||
newSegmentDeleted := roaring.NewBitmap()
|
||||
var running uint64
|
||||
for i := range s.root.segment {
|
||||
segmentID := s.root.segment[i].id
|
||||
var running, docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i := range root.segment {
|
||||
segmentID := root.segment[i].id
|
||||
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
|
||||
// this segment is going away, see if anything else was deleted since we started the merge
|
||||
if segSnapAtMerge != nil && s.root.segment[i].deleted != nil {
|
||||
if segSnapAtMerge != nil && root.segment[i].deleted != nil {
|
||||
// assume all these deletes are new
|
||||
deletedSince := s.root.segment[i].deleted
|
||||
deletedSince := root.segment[i].deleted
|
||||
// if we already knew about some of them, remove
|
||||
if segSnapAtMerge.deleted != nil {
|
||||
deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted)
|
||||
deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted)
|
||||
}
|
||||
deletedSinceItr := deletedSince.Iterator()
|
||||
for deletedSinceItr.HasNext() {
|
||||
|
@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
// segments left behind in old map after processing
|
||||
// the root segments would be the obsolete segment set
|
||||
delete(nextMerge.old, segmentID)
|
||||
|
||||
} else if s.root.segment[i].LiveSize() > 0 {
|
||||
} else if root.segment[i].LiveSize() > 0 {
|
||||
// this segment is staying
|
||||
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
|
||||
id: s.root.segment[i].id,
|
||||
segment: s.root.segment[i].segment,
|
||||
deleted: s.root.segment[i].deleted,
|
||||
cachedDocs: s.root.segment[i].cachedDocs,
|
||||
id: root.segment[i].id,
|
||||
segment: root.segment[i].segment,
|
||||
deleted: root.segment[i].deleted,
|
||||
cachedDocs: root.segment[i].cachedDocs,
|
||||
creator: root.segment[i].creator,
|
||||
})
|
||||
s.root.segment[i].segment.AddRef()
|
||||
root.segment[i].segment.AddRef()
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
running += s.root.segment[i].Count()
|
||||
running += root.segment[i].segment.Count()
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In case where all the docs in the newly merged segment getting
|
||||
// deleted by the time we reach here, can skip the introduction.
|
||||
if nextMerge.new != nil &&
|
||||
|
@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
|
||||
deleted: newSegmentDeleted,
|
||||
cachedDocs: &cachedDocs{cache: nil},
|
||||
creator: "introduceMerge",
|
||||
})
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
|
||||
|
||||
switch nextMerge.new.(type) {
|
||||
case *zap.SegmentBase:
|
||||
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality()
|
||||
memSegments++
|
||||
case *zap.Segment:
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
|
||||
|
||||
// swap in new segment
|
||||
newSnapshot.updateSize()
|
||||
|
||||
s.rootLock.Lock()
|
||||
// swap in new index snapshot
|
||||
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
}
|
||||
|
||||
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
|
||||
|
||||
if revertTo.snapshot == nil {
|
||||
err := fmt.Errorf("Cannot revert to a nil snapshot")
|
||||
revertTo.applied <- err
|
||||
|
@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
internal: revertTo.snapshot.internal,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
refs: 1,
|
||||
creator: "revertToSnapshot",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
// iterate through segments
|
||||
for i, segmentSnapshot := range revertTo.snapshot.segment {
|
||||
newSnapshot.segment[i] = &SegmentSnapshot{
|
||||
|
@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
segment: segmentSnapshot.segment,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
creator: segmentSnapshot.creator,
|
||||
}
|
||||
newSnapshot.segment[i].segment.AddRef()
|
||||
|
||||
// remove segment from ineligibleForRemoval map
|
||||
filename := zapFileName(segmentSnapshot.id)
|
||||
delete(s.ineligibleForRemoval, filename)
|
||||
|
||||
if isMemorySegment(segmentSnapshot) {
|
||||
docsToPersistCount += segmentSnapshot.Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
if revertTo.persisted != nil {
|
||||
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
|
||||
}
|
||||
|
||||
newSnapshot.updateSize()
|
||||
|
||||
// swap in new snapshot
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
func isMemorySegment(s *SegmentSnapshot) bool {
|
||||
switch s.segment.(type) {
|
||||
case *zap.SegmentBase:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
|
146
vendor/github.com/blevesearch/bleve/index/scorch/merge.go
generated
vendored
146
vendor/github.com/blevesearch/bleve/index/scorch/merge.go
generated
vendored
|
@ -15,9 +15,7 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
|
||||
"fmt"
|
||||
"os"
|
||||
"sync/atomic"
|
||||
|
@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() {
|
|||
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
||||
default:
|
||||
// check to see if there is a new snapshot to persist
|
||||
s.rootLock.RLock()
|
||||
s.rootLock.Lock()
|
||||
ourSnapshot := s.root
|
||||
ourSnapshot.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
|
||||
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if ourSnapshot.epoch != lastEpochMergePlanned {
|
||||
startTime := time.Now()
|
||||
|
@ -57,12 +59,21 @@ OUTER:
|
|||
// lets get started
|
||||
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
|
||||
if err != nil {
|
||||
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
|
||||
if err == segment.ErrClosed {
|
||||
// index has been closed
|
||||
_ = ourSnapshot.DecRef()
|
||||
break OUTER
|
||||
}
|
||||
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
|
||||
_ = ourSnapshot.DecRef()
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1)
|
||||
continue OUTER
|
||||
}
|
||||
lastEpochMergePlanned = ourSnapshot.epoch
|
||||
|
||||
atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch)
|
||||
|
||||
s.fireEvent(EventKindMergerProgress, time.Since(startTime))
|
||||
}
|
||||
_ = ourSnapshot.DecRef()
|
||||
|
@ -88,7 +99,10 @@ OUTER:
|
|||
case <-ew.notifyCh:
|
||||
}
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1)
|
||||
}
|
||||
|
||||
s.asyncTasks.Done()
|
||||
}
|
||||
|
||||
|
@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
|
|||
if err != nil {
|
||||
return &mergePlannerOptions, err
|
||||
}
|
||||
|
||||
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return &mergePlannerOptions, nil
|
||||
}
|
||||
|
@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
}
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
|
||||
|
||||
// give this list to the planner
|
||||
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
|
||||
if err != nil {
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
|
||||
return fmt.Errorf("merge planning err: %v", err)
|
||||
}
|
||||
if resultMergePlan == nil {
|
||||
// nothing to do
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
|
||||
return nil
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
|
||||
|
||||
// process tasks in serial for now
|
||||
var notifications []chan *IndexSnapshot
|
||||
for _, task := range resultMergePlan.Tasks {
|
||||
if len(task.Segments) == 0 {
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
|
||||
|
||||
oldMap := make(map[uint64]*SegmentSnapshot)
|
||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
|
||||
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
|
||||
|
||||
for _, planSegment := range task.Segments {
|
||||
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
|
||||
oldMap[segSnapshot.id] = segSnapshot
|
||||
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
|
||||
if segSnapshot.LiveSize() == 0 {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
|
||||
oldMap[segSnapshot.id] = nil
|
||||
} else {
|
||||
segmentsToMerge = append(segmentsToMerge, zapSeg)
|
||||
|
@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
}
|
||||
|
||||
var oldNewDocNums map[uint64][]uint64
|
||||
var segment segment.Segment
|
||||
var seg segment.Segment
|
||||
if len(segmentsToMerge) > 0 {
|
||||
filename := zapFileName(newSegmentID)
|
||||
s.markIneligibleForRemoval(filename)
|
||||
path := s.path + string(os.PathSeparator) + filename
|
||||
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024)
|
||||
|
||||
fileMergeZapStartTime := time.Now()
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
|
||||
newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path,
|
||||
DefaultChunkFactor, s.closeCh, s)
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
|
||||
|
||||
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime {
|
||||
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
s.unmarkIneligibleForRemoval(filename)
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||
if err == segment.ErrClosed {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("merging failed: %v", err)
|
||||
}
|
||||
segment, err = zap.Open(path)
|
||||
|
||||
seg, err = zap.Open(path)
|
||||
if err != nil {
|
||||
s.unmarkIneligibleForRemoval(filename)
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||
return err
|
||||
}
|
||||
oldNewDocNums = make(map[uint64][]uint64)
|
||||
for i, segNewDocNums := range newDocNums {
|
||||
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge)))
|
||||
}
|
||||
|
||||
sm := &segmentMerge{
|
||||
id: newSegmentID,
|
||||
old: oldMap,
|
||||
oldNewDocNums: oldNewDocNums,
|
||||
new: segment,
|
||||
new: seg,
|
||||
notify: make(chan *IndexSnapshot, 1),
|
||||
}
|
||||
notifications = append(notifications, sm.notify)
|
||||
|
@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
// give it to the introducer
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
_ = segment.Close()
|
||||
return nil
|
||||
_ = seg.Close()
|
||||
return segment.ErrClosed
|
||||
case s.merges <- sm:
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
|
||||
}
|
||||
|
||||
for _, notification := range notifications {
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return nil
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1)
|
||||
return segment.ErrClosed
|
||||
case newSnapshot := <-notification:
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
|
||||
if newSnapshot != nil {
|
||||
_ = newSnapshot.DecRef()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -219,44 +279,48 @@ type segmentMerge struct {
|
|||
// into the root
|
||||
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
||||
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
|
||||
chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) {
|
||||
var br bytes.Buffer
|
||||
chunkFactor uint32) (*IndexSnapshot, uint64, error) {
|
||||
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
|
||||
|
||||
cr := zap.NewCountHashWriter(&br)
|
||||
memMergeZapStartTime := time.Now()
|
||||
|
||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, dictLocs, fieldsInv, fieldsMap, err :=
|
||||
zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
}
|
||||
|
||||
sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
||||
fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, dictLocs)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
}
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1)
|
||||
|
||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||
|
||||
filename := zapFileName(newSegmentID)
|
||||
path := s.path + string(os.PathSeparator) + filename
|
||||
err = zap.PersistSegmentBase(sb, path)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
|
||||
newDocNums, _, err :=
|
||||
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s)
|
||||
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
|
||||
|
||||
memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime {
|
||||
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime)
|
||||
}
|
||||
|
||||
segment, err := zap.Open(path)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
seg, err := zap.Open(path)
|
||||
if err != nil {
|
||||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// update persisted stats
|
||||
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())
|
||||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||
|
||||
sm := &segmentMerge{
|
||||
id: newSegmentID,
|
||||
old: make(map[uint64]*SegmentSnapshot),
|
||||
oldNewDocNums: make(map[uint64][]uint64),
|
||||
new: segment,
|
||||
new: seg,
|
||||
notify: make(chan *IndexSnapshot, 1),
|
||||
}
|
||||
|
||||
|
@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
|||
|
||||
select { // send to introducer
|
||||
case <-s.closeCh:
|
||||
_ = segment.DecRef()
|
||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
||||
_ = seg.DecRef()
|
||||
return nil, 0, segment.ErrClosed
|
||||
case s.merges <- sm:
|
||||
}
|
||||
|
||||
select { // wait for introduction to complete
|
||||
case <-s.closeCh:
|
||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
||||
return nil, 0, segment.ErrClosed
|
||||
case newSnapshot := <-sm.notify:
|
||||
return numDocs, newSnapshot, newSegmentID, nil
|
||||
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
|
||||
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
|
||||
return newSnapshot, newSegmentID, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten)
|
||||
}
|
||||
|
|
23
vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go
generated
vendored
23
vendor/github.com/blevesearch/bleve/index/scorch/mergeplan/merge_plan.go
generated
vendored
|
@ -18,6 +18,7 @@
|
|||
package mergeplan
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 {
|
|||
return o.FloorSegmentSize
|
||||
}
|
||||
|
||||
// Suggested default options.
|
||||
// MaxSegmentSizeLimit represents the maximum size of a segment,
|
||||
// this limit comes with hit-1 optimisation/max encoding limit uint31.
|
||||
const MaxSegmentSizeLimit = 1<<31 - 1
|
||||
|
||||
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment
|
||||
// exceeds the MaxSegmentSizeLimit
|
||||
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit")
|
||||
|
||||
// DefaultMergePlanOptions suggests the default options.
|
||||
var DefaultMergePlanOptions = MergePlanOptions{
|
||||
MaxSegmentsPerTier: 10,
|
||||
MaxSegmentSize: 5000000,
|
||||
|
@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
|
|||
if len(roster) > 0 {
|
||||
rosterScore := scoreSegments(roster, o)
|
||||
|
||||
if len(bestRoster) <= 0 || rosterScore < bestRosterScore {
|
||||
if len(bestRoster) == 0 || rosterScore < bestRosterScore {
|
||||
bestRoster = roster
|
||||
bestRosterScore = rosterScore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(bestRoster) <= 0 {
|
||||
if len(bestRoster) == 0 {
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
|
@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan)
|
|||
|
||||
return strings.Join(rv, "\n")
|
||||
}
|
||||
|
||||
// ValidateMergePlannerOptions validates the merge planner options
|
||||
func ValidateMergePlannerOptions(options *MergePlanOptions) error {
|
||||
if options.MaxSegmentSize > MaxSegmentSizeLimit {
|
||||
return ErrMaxSegmentSizeTooLarge
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
420
vendor/github.com/blevesearch/bleve/index/scorch/optimize.go
generated
vendored
Normal file
420
vendor/github.com/blevesearch/bleve/index/scorch/optimize.go
generated
vendored
Normal file
|
@ -0,0 +1,420 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
)
|
||||
|
||||
var OptimizeConjunction = true
|
||||
var OptimizeConjunctionUnadorned = true
|
||||
var OptimizeDisjunctionUnadorned = true
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if OptimizeConjunction && kind == "conjunction" {
|
||||
return s.optimizeConjunction(octx)
|
||||
}
|
||||
|
||||
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
|
||||
return s.optimizeConjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
|
||||
return s.optimizeDisjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunction)
|
||||
if !ok {
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunction struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr0.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr1.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
|
||||
|
||||
for _, tfr := range o.tfrs[2:] {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm.And(itr.ActualBM)
|
||||
}
|
||||
|
||||
// in this conjunction optimization, the postings iterators
|
||||
// will all share the same AND'ed together actual bitmap. The
|
||||
// regular conjunction searcher machinery will still be used,
|
||||
// but the underlying bitmap will be smaller.
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if ok && itr.ActualBM != nil {
|
||||
itr.ActualBM = bm
|
||||
itr.Actual = bm.Iterator()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" conjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
|
||||
var OptimizeTFRConjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned conjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps AND'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRConjunctionUnadornedTerm,
|
||||
field: OptimizeTFRConjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
OUTER:
|
||||
for i := range o.snapshot.segment {
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
var docNum1HitLast uint64
|
||||
var docNum1HitLastOk bool
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
|
||||
// An empty postings iterator means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
// We optimize zap postings iterators only.
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// If the postings iterator is "1-hit" optimized, then we
|
||||
// can perform several optimizations up-front here.
|
||||
docNum1Hit, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
if docNum1Hit == zap.DocNum1HitFinished {
|
||||
// An empty docNum here means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
|
||||
// The docNum1Hit doesn't match the previous
|
||||
// docNum1HitLast, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
docNum1HitLast = docNum1Hit
|
||||
docNum1HitLastOk = true
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM == nil {
|
||||
// An empty actual bitmap means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Collect the actual bitmap for more processing later.
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
|
||||
if docNum1HitLastOk {
|
||||
// We reach here if all the 1-hit optimized posting
|
||||
// iterators had the same 1-hit docNum, so we can check if
|
||||
// our collected actual bitmaps also have that docNum.
|
||||
for _, bm := range actualBMs {
|
||||
if !bm.Contains(uint32(docNum1HitLast)) {
|
||||
// The docNum1Hit isn't in one of our actual
|
||||
// bitmaps, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
||||
// The actual bitmaps and docNum1Hits all contain or have
|
||||
// the same 1-hit docNum, so that's our AND'ed result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
|
||||
docNum1HitLast, zap.NormBits1Hit, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 0 {
|
||||
// If we've collected no actual bitmaps at this point,
|
||||
// then the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 1 {
|
||||
// If we've only 1 actual bitmap, then that's our result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
actualBMs[0], false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Else, AND together our collected bitmaps as our result.
|
||||
bm := roaring.And(actualBMs[0], actualBMs[1])
|
||||
|
||||
for _, actualBM := range actualBMs[2:] {
|
||||
bm.And(actualBM)
|
||||
}
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" disjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRDisjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
|
||||
var OptimizeTFRDisjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned disjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps OR'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
var cMax uint64
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
c := itr.ActualBM.GetCardinality()
|
||||
if cMax < c {
|
||||
cMax = c
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic to skip the optimization if all the constituent
|
||||
// bitmaps are too small, where the processing & resource
|
||||
// overhead to create the OR'ed bitmap outweighs the benefit.
|
||||
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRDisjunctionUnadornedTerm,
|
||||
field: OptimizeTFRDisjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var docNums []uint32 // Collected docNum's from 1-hit posting lists.
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
docNums = docNums[:0]
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
docNum, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
docNums = append(docNums, uint32(docNum))
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
}
|
||||
|
||||
var bm *roaring.Bitmap
|
||||
if len(actualBMs) > 2 {
|
||||
bm = roaring.HeapOr(actualBMs...)
|
||||
} else if len(actualBMs) == 2 {
|
||||
bm = roaring.Or(actualBMs[0], actualBMs[1])
|
||||
} else if len(actualBMs) == 1 {
|
||||
bm = actualBMs[0].Clone()
|
||||
}
|
||||
|
||||
if bm == nil {
|
||||
bm = roaring.New()
|
||||
}
|
||||
|
||||
bm.AddMany(docNums)
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
287
vendor/github.com/blevesearch/bleve/index/scorch/persister.go
generated
vendored
287
vendor/github.com/blevesearch/bleve/index/scorch/persister.go
generated
vendored
|
@ -16,9 +16,12 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
@ -27,23 +30,57 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
var DefaultChunkFactor uint32 = 1024
|
||||
|
||||
// Arbitrary number, need to make it configurable.
|
||||
// Lower values like 10/making persister really slow
|
||||
// doesn't work well as it is creating more files to
|
||||
// persist for in next persist iteration and spikes the # FDs.
|
||||
// Ideal value should let persister also proceed at
|
||||
// an optimum pace so that the merger can skip
|
||||
// many intermediate snapshots.
|
||||
// This needs to be based on empirical data.
|
||||
// TODO - may need to revisit this approach/value.
|
||||
var epochDistance = uint64(5)
|
||||
// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct
|
||||
// persistence of segments with the default safe batch option.
|
||||
// If the default safe batch option results in high number of
|
||||
// files on disk, then users may initialise this configuration parameter
|
||||
// with higher values so that the persister will nap a bit within it's
|
||||
// work loop to favour better in-memory merging of segments to result
|
||||
// in fewer segment files on disk. But that may come with an indexing
|
||||
// performance overhead.
|
||||
// Unsafe batch users are advised to override this to higher value
|
||||
// for better performance especially with high data density.
|
||||
var DefaultPersisterNapTimeMSec int = 0 // ms
|
||||
|
||||
// DefaultPersisterNapUnderNumFiles helps in controlling the pace of
|
||||
// persister. At times of a slow merger progress with heavy file merging
|
||||
// operations, its better to pace down the persister for letting the merger
|
||||
// to catch up within a range defined by this parameter.
|
||||
// Fewer files on disk (as per the merge plan) would result in keeping the
|
||||
// file handle usage under limit, faster disk merger and a healthier index.
|
||||
// Its been observed that such a loosely sync'ed introducer-persister-merger
|
||||
// trio results in better overall performance.
|
||||
var DefaultPersisterNapUnderNumFiles int = 1000
|
||||
|
||||
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
|
||||
|
||||
type persisterOptions struct {
|
||||
// PersisterNapTimeMSec controls the wait/delay injected into
|
||||
// persistence workloop to improve the chances for
|
||||
// a healthier and heavier in-memory merging
|
||||
PersisterNapTimeMSec int
|
||||
|
||||
// PersisterNapTimeMSec > 0, and the number of files is less than
|
||||
// PersisterNapUnderNumFiles, then the persister will sleep
|
||||
// PersisterNapTimeMSec amount of time to improve the chances for
|
||||
// a healthier and heavier in-memory merging
|
||||
PersisterNapUnderNumFiles int
|
||||
|
||||
// MemoryPressurePauseThreshold let persister to have a better leeway
|
||||
// for prudently performing the memory merge of segments on a memory
|
||||
// pressure situation. Here the config value is an upper threshold
|
||||
// for the number of paused application threads. The default value would
|
||||
// be a very high number to always favour the merging of memory segments.
|
||||
MemoryPressurePauseThreshold uint64
|
||||
}
|
||||
|
||||
type notificationChan chan struct{}
|
||||
|
||||
|
@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() {
|
|||
var persistWatchers []*epochWatcher
|
||||
var lastPersistedEpoch, lastMergedEpoch uint64
|
||||
var ew *epochWatcher
|
||||
po, err := s.parsePersisterOptions()
|
||||
if err != nil {
|
||||
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
|
||||
s.asyncTasks.Done()
|
||||
return
|
||||
}
|
||||
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
@ -65,11 +111,13 @@ OUTER:
|
|||
if ew != nil && ew.epoch > lastMergedEpoch {
|
||||
lastMergedEpoch = ew.epoch
|
||||
}
|
||||
persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
||||
&lastMergedEpoch, persistWatchers)
|
||||
|
||||
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
||||
lastMergedEpoch, persistWatchers, po)
|
||||
|
||||
var ourSnapshot *IndexSnapshot
|
||||
var ourPersisted []chan error
|
||||
var ourPersistedCallbacks []index.BatchCallback
|
||||
|
||||
// check to see if there is a new snapshot to persist
|
||||
s.rootLock.Lock()
|
||||
|
@ -78,13 +126,17 @@ OUTER:
|
|||
ourSnapshot.AddRef()
|
||||
ourPersisted = s.rootPersisted
|
||||
s.rootPersisted = nil
|
||||
ourPersistedCallbacks = s.persistedCallbacks
|
||||
s.persistedCallbacks = nil
|
||||
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
|
||||
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
|
||||
}
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if ourSnapshot != nil {
|
||||
startTime := time.Now()
|
||||
|
||||
err := s.persistSnapshot(ourSnapshot)
|
||||
err := s.persistSnapshot(ourSnapshot, po)
|
||||
for _, ch := range ourPersisted {
|
||||
if err != nil {
|
||||
ch <- err
|
||||
|
@ -92,10 +144,22 @@ OUTER:
|
|||
close(ch)
|
||||
}
|
||||
if err != nil {
|
||||
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
|
||||
if err == segment.ErrClosed {
|
||||
// index has been closed
|
||||
_ = ourSnapshot.DecRef()
|
||||
break OUTER
|
||||
}
|
||||
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
|
||||
_ = ourSnapshot.DecRef()
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
|
||||
continue OUTER
|
||||
}
|
||||
for i := range ourPersistedCallbacks {
|
||||
ourPersistedCallbacks[i](err)
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
|
||||
|
||||
lastPersistedEpoch = ourSnapshot.epoch
|
||||
for _, ew := range persistWatchers {
|
||||
|
@ -115,6 +179,8 @@ OUTER:
|
|||
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
|
||||
|
||||
if changed {
|
||||
s.removeOldData()
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
@ -133,17 +199,21 @@ OUTER:
|
|||
|
||||
s.removeOldData() // might as well cleanup while waiting
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
case <-w.notifyCh:
|
||||
// woken up, next loop should pick up work
|
||||
continue OUTER
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
|
||||
case ew = <-s.persisterNotifier:
|
||||
// if the watchers are already caught up then let them wait,
|
||||
// else let them continue to do the catch up
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,38 +230,95 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
|
|||
return watchersNext
|
||||
}
|
||||
|
||||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64,
|
||||
persistWatchers []*epochWatcher) []*epochWatcher {
|
||||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
|
||||
persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) {
|
||||
|
||||
// first, let the watchers proceed if they lag behind
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
|
||||
// check the merger lag by counting the segment files on disk,
|
||||
// On finding fewer files on disk, persister takes a short pause
|
||||
// for sufficient in-memory segments to pile up for the next
|
||||
// memory merge cum persist loop.
|
||||
// On finding too many files on disk, persister pause until the merger
|
||||
// catches up to reduce the segment file count under the threshold.
|
||||
// But if there is memory pressure, then skip this sleep maneuvers.
|
||||
numFilesOnDisk, _ := s.diskFileStats()
|
||||
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
|
||||
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
|
||||
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
|
||||
|
||||
case ew := <-s.persisterNotifier:
|
||||
// unblock the merger in meantime
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
lastMergedEpoch = ew.epoch
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
|
||||
}
|
||||
return lastMergedEpoch, persistWatchers
|
||||
}
|
||||
|
||||
OUTER:
|
||||
// check for slow merger and await until the merger catch up
|
||||
for lastPersistedEpoch > *lastMergedEpoch+epochDistance {
|
||||
for po.PersisterNapUnderNumFiles > 0 &&
|
||||
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
|
||||
lastMergedEpoch < lastPersistedEpoch {
|
||||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
case ew := <-s.persisterNotifier:
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
*lastMergedEpoch = ew.epoch
|
||||
lastMergedEpoch = ew.epoch
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
|
||||
|
||||
// let the watchers proceed if they lag behind
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
|
||||
numFilesOnDisk, _ = s.diskFileStats()
|
||||
}
|
||||
|
||||
return persistWatchers
|
||||
return lastMergedEpoch, persistWatchers
|
||||
}
|
||||
|
||||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
|
||||
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
|
||||
po := persisterOptions{
|
||||
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
|
||||
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
|
||||
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
|
||||
}
|
||||
if persisted {
|
||||
return nil
|
||||
if v, ok := s.config["scorchPersisterOptions"]; ok {
|
||||
b, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return &po, err
|
||||
}
|
||||
|
||||
err = json.Unmarshal(b, &po)
|
||||
if err != nil {
|
||||
return &po, err
|
||||
}
|
||||
}
|
||||
return &po, nil
|
||||
}
|
||||
|
||||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
|
||||
po *persisterOptions) error {
|
||||
// Perform in-memory segment merging only when the memory pressure is
|
||||
// below the configured threshold, else the persister performs the
|
||||
// direct persistence of segments.
|
||||
if s.paused() < po.MemoryPressurePauseThreshold {
|
||||
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if persisted {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return s.persistSnapshotDirect(snapshot)
|
||||
|
@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
|||
return false, nil
|
||||
}
|
||||
|
||||
_, newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
||||
newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
||||
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
|
||||
if err != nil {
|
||||
return false, err
|
||||
|
@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
|||
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
|
||||
internal: snapshot.internal,
|
||||
epoch: snapshot.epoch,
|
||||
creator: "persistSnapshotMaybeMerge",
|
||||
}
|
||||
|
||||
// copy to the equiv the segments that weren't replaced
|
||||
|
@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
|||
return err
|
||||
}
|
||||
|
||||
// persist meta values
|
||||
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
buf := make([]byte, binary.MaxVarintLen32)
|
||||
binary.BigEndian.PutUint32(buf, zap.Version)
|
||||
err = metaBucket.Put([]byte("version"), buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// persist internal values
|
||||
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
|
||||
if err != nil {
|
||||
|
@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
|||
}
|
||||
}
|
||||
|
||||
s.rootLock.Lock()
|
||||
newIndexSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
segment: make([]*SegmentSnapshot, len(s.root.segment)),
|
||||
offsets: make([]uint64, len(s.root.offsets)),
|
||||
internal: make(map[string][]byte, len(s.root.internal)),
|
||||
refs: 1,
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
for i, segmentSnapshot := range s.root.segment {
|
||||
// see if this segment has been replaced
|
||||
if replacement, ok := newSegments[segmentSnapshot.id]; ok {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: segmentSnapshot.id,
|
||||
segment: replacement,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
}
|
||||
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
||||
delete(newSegments, segmentSnapshot.id)
|
||||
// update items persisted incase of a new segment snapshot
|
||||
atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count())
|
||||
} else {
|
||||
newIndexSnapshot.segment[i] = s.root.segment[i]
|
||||
newIndexSnapshot.segment[i].segment.AddRef()
|
||||
}
|
||||
newIndexSnapshot.offsets[i] = s.root.offsets[i]
|
||||
}
|
||||
for k, v := range s.root.internal {
|
||||
newIndexSnapshot.internal[k] = v
|
||||
persist := &persistIntroduction{
|
||||
persisted: newSegments,
|
||||
applied: make(notificationChan),
|
||||
}
|
||||
|
||||
rootPrev := s.root
|
||||
s.root = newIndexSnapshot
|
||||
s.rootLock.Unlock()
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return segment.ErrClosed
|
||||
case s.persists <- persist:
|
||||
}
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return segment.ErrClosed
|
||||
case <-persist.applied:
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'}
|
|||
var boltPathKey = []byte{'p'}
|
||||
var boltDeletedKey = []byte{'d'}
|
||||
var boltInternalKey = []byte{'i'}
|
||||
var boltMetaDataKey = []byte{'m'}
|
||||
|
||||
func (s *Scorch) loadFromBolt() error {
|
||||
return s.rootBolt.View(func(tx *bolt.Tx) error {
|
||||
|
@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error {
|
|||
continue
|
||||
}
|
||||
if foundRoot {
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
snapshot := snapshots.Bucket(k)
|
||||
if snapshot == nil {
|
||||
log.Printf("snapshot key, but bucket missing %x, continuing", k)
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
indexSnapshot, err := s.loadSnapshot(snapshot)
|
||||
if err != nil {
|
||||
log.Printf("unable to load snapshot, %v, continuing", err)
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
indexSnapshot.epoch = snapshotEpoch
|
||||
|
@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error {
|
|||
return err
|
||||
}
|
||||
s.nextSegmentID++
|
||||
s.nextSnapshotEpoch = snapshotEpoch + 1
|
||||
s.rootLock.Lock()
|
||||
if s.root != nil {
|
||||
_ = s.root.DecRef()
|
||||
}
|
||||
s.nextSnapshotEpoch = snapshotEpoch + 1
|
||||
rootPrev := s.root
|
||||
s.root = indexSnapshot
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
}
|
||||
|
||||
foundRoot = true
|
||||
}
|
||||
return nil
|
||||
|
@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
|||
snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
|
||||
snapshot := snapshots.Bucket(snapshotKey)
|
||||
if snapshot == nil {
|
||||
return nil
|
||||
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
|
||||
}
|
||||
rv, err = s.loadSnapshot(snapshot)
|
||||
return err
|
||||
|
@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
|||
}
|
||||
|
||||
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||
|
||||
rv := &IndexSnapshot{
|
||||
parent: s,
|
||||
internal: make(map[string][]byte),
|
||||
refs: 1,
|
||||
creator: "loadSnapshot",
|
||||
}
|
||||
|
||||
var running uint64
|
||||
c := snapshot.Cursor()
|
||||
for k, _ := c.First(); k != nil; k, _ = c.Next() {
|
||||
|
@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
|||
_ = rv.DecRef()
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
} else if k[0] != boltMetaDataKey[0] {
|
||||
segmentBucket := snapshot.Bucket(k)
|
||||
if segmentBucket == nil {
|
||||
_ = rv.DecRef()
|
||||
|
@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
|||
running += segmentSnapshot.segment.Count()
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
|
@ -604,7 +731,9 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
|
|||
_ = segment.Close()
|
||||
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
|
||||
}
|
||||
rv.deleted = deletedBitmap
|
||||
if !deletedBitmap.IsEmpty() {
|
||||
rv.deleted = deletedBitmap
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
|
@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
|||
return 0, err
|
||||
}
|
||||
|
||||
if len(persistedEpochs) <= NumSnapshotsToKeep {
|
||||
if len(persistedEpochs) <= s.numSnapshotsToKeep {
|
||||
// we need to keep everything
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// make a map of epochs to protect from deletion
|
||||
protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep)
|
||||
for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] {
|
||||
protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
|
||||
for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
|
||||
protectedEpochs[epoch] = struct{}{}
|
||||
}
|
||||
|
||||
|
@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
|||
s.eligibleForRemoval = newEligible
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if len(epochsToRemove) <= 0 {
|
||||
if len(epochsToRemove) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
|
|
110
vendor/github.com/blevesearch/bleve/index/scorch/reader.go
generated
vendored
110
vendor/github.com/blevesearch/bleve/index/scorch/reader.go
generated
vendored
|
@ -1,110 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
|
||||
}
|
||||
|
||||
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
|
||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
|
||||
}
|
||||
|
||||
// DocIDReader returns an iterator over all doc ids
|
||||
// The caller must close returned instance to release associated resources.
|
||||
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderAll()
|
||||
}
|
||||
|
||||
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderOnly(ids)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
|
||||
return r.root.FieldDict(field)
|
||||
}
|
||||
|
||||
// FieldDictRange is currently defined to include the start and end terms
|
||||
func (r *Reader) FieldDictRange(field string, startTerm []byte,
|
||||
endTerm []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictRange(field, startTerm, endTerm)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDictPrefix(field string,
|
||||
termPrefix []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictPrefix(field, termPrefix)
|
||||
}
|
||||
|
||||
func (r *Reader) Document(id string) (*document.Document, error) {
|
||||
return r.root.Document(id)
|
||||
}
|
||||
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
return r.root.DocumentVisitFieldTerms(id, fields, visitor)
|
||||
}
|
||||
|
||||
func (r *Reader) Fields() ([]string, error) {
|
||||
return r.root.Fields()
|
||||
}
|
||||
|
||||
func (r *Reader) GetInternal(key []byte) ([]byte, error) {
|
||||
return r.root.GetInternal(key)
|
||||
}
|
||||
|
||||
func (r *Reader) DocCount() (uint64, error) {
|
||||
return r.root.DocCount()
|
||||
}
|
||||
|
||||
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
|
||||
return r.root.ExternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
|
||||
return r.root.InternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) DumpAll() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpDoc(id string) chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpFields() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) Close() error {
|
||||
return r.root.DecRef()
|
||||
}
|
267
vendor/github.com/blevesearch/bleve/index/scorch/scorch.go
generated
vendored
267
vendor/github.com/blevesearch/bleve/index/scorch/scorch.go
generated
vendored
|
@ -17,6 +17,7 @@ package scorch
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
@ -27,23 +28,24 @@ import (
|
|||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
const Name = "scorch"
|
||||
|
||||
const Version uint8 = 1
|
||||
const Version uint8 = 2
|
||||
|
||||
var ErrClosed = fmt.Errorf("scorch closed")
|
||||
|
||||
type Scorch struct {
|
||||
readOnly bool
|
||||
version uint8
|
||||
config map[string]interface{}
|
||||
analysisQueue *index.AnalysisQueue
|
||||
stats *Stats
|
||||
stats Stats
|
||||
nextSegmentID uint64
|
||||
path string
|
||||
|
||||
|
@ -52,12 +54,15 @@ type Scorch struct {
|
|||
rootLock sync.RWMutex
|
||||
root *IndexSnapshot // holds 1 ref-count on the root
|
||||
rootPersisted []chan error // closed when root is persisted
|
||||
persistedCallbacks []index.BatchCallback
|
||||
nextSnapshotEpoch uint64
|
||||
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
|
||||
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
|
||||
|
||||
numSnapshotsToKeep int
|
||||
closeCh chan struct{}
|
||||
introductions chan *segmentIntroduction
|
||||
persists chan *persistIntroduction
|
||||
merges chan *segmentMerge
|
||||
introducerNotifier chan *epochWatcher
|
||||
revertToSnapshots chan *snapshotReversion
|
||||
|
@ -67,6 +72,23 @@ type Scorch struct {
|
|||
|
||||
onEvent func(event Event)
|
||||
onAsyncError func(err error)
|
||||
|
||||
iStats internalStats
|
||||
|
||||
pauseLock sync.RWMutex
|
||||
|
||||
pauseCount uint64
|
||||
}
|
||||
|
||||
type internalStats struct {
|
||||
persistEpoch uint64
|
||||
persistSnapshotSize uint64
|
||||
mergeEpoch uint64
|
||||
mergeSnapshotSize uint64
|
||||
newSegBufBytesAdded uint64
|
||||
newSegBufBytesRemoved uint64
|
||||
analysisBytesAdded uint64
|
||||
analysisBytesRemoved uint64
|
||||
}
|
||||
|
||||
func NewScorch(storeName string,
|
||||
|
@ -80,8 +102,7 @@ func NewScorch(storeName string,
|
|||
closeCh: make(chan struct{}),
|
||||
ineligibleForRemoval: map[string]bool{},
|
||||
}
|
||||
rv.stats = &Stats{i: rv}
|
||||
rv.root = &IndexSnapshot{parent: rv, refs: 1}
|
||||
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
|
||||
ro, ok := config["read_only"].(bool)
|
||||
if ok {
|
||||
rv.readOnly = ro
|
||||
|
@ -101,9 +122,30 @@ func NewScorch(storeName string,
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *Scorch) paused() uint64 {
|
||||
s.pauseLock.Lock()
|
||||
pc := s.pauseCount
|
||||
s.pauseLock.Unlock()
|
||||
return pc
|
||||
}
|
||||
|
||||
func (s *Scorch) incrPause() {
|
||||
s.pauseLock.Lock()
|
||||
s.pauseCount++
|
||||
s.pauseLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) decrPause() {
|
||||
s.pauseLock.Lock()
|
||||
s.pauseCount--
|
||||
s.pauseLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
|
||||
if s.onEvent != nil {
|
||||
s.incrPause()
|
||||
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
||||
s.decrPause()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) {
|
|||
if s.onAsyncError != nil {
|
||||
s.onAsyncError(err)
|
||||
}
|
||||
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||
}
|
||||
|
||||
func (s *Scorch) Open() error {
|
||||
|
@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error {
|
|||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment)))
|
||||
|
||||
s.introductions = make(chan *segmentIntroduction)
|
||||
s.persists = make(chan *persistIntroduction)
|
||||
s.merges = make(chan *segmentMerge)
|
||||
s.introducerNotifier = make(chan *epochWatcher, 1)
|
||||
s.revertToSnapshots = make(chan *snapshotReversion)
|
||||
|
@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error {
|
|||
}
|
||||
}
|
||||
|
||||
s.numSnapshotsToKeep = NumSnapshotsToKeep
|
||||
if v, ok := s.config["numSnapshotsToKeep"]; ok {
|
||||
var t int
|
||||
if t, err = parseToInteger(v); err != nil {
|
||||
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err)
|
||||
}
|
||||
if t > 0 {
|
||||
s.numSnapshotsToKeep = t
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -255,65 +312,83 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
|
|||
|
||||
// FIXME could sort ids list concurrent with analysis?
|
||||
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
aw := index.NewAnalysisWork(s, doc, resultChan)
|
||||
// put the work on the queue
|
||||
s.analysisQueue.Queue(aw)
|
||||
if len(batch.IndexOps) > 0 {
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
aw := index.NewAnalysisWork(s, doc, resultChan)
|
||||
// put the work on the queue
|
||||
s.analysisQueue.Queue(aw)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}()
|
||||
}
|
||||
|
||||
// wait for analysis result
|
||||
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
|
||||
var itemsDeQueued uint64
|
||||
var totalAnalysisSize int
|
||||
for itemsDeQueued < numUpdates {
|
||||
result := <-resultChan
|
||||
resultSize := result.Size()
|
||||
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
|
||||
totalAnalysisSize += resultSize
|
||||
analysisResults[itemsDeQueued] = result
|
||||
itemsDeQueued++
|
||||
}
|
||||
close(resultChan)
|
||||
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
|
||||
|
||||
atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start)))
|
||||
atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start)))
|
||||
|
||||
indexStart := time.Now()
|
||||
|
||||
// notify handlers that we're about to introduce a segment
|
||||
s.fireEvent(EventKindBatchIntroductionStart, 0)
|
||||
|
||||
var newSegment segment.Segment
|
||||
var bufBytes uint64
|
||||
if len(analysisResults) > 0 {
|
||||
newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor)
|
||||
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
|
||||
} else {
|
||||
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
|
||||
}
|
||||
|
||||
err = s.prepareSegment(newSegment, ids, batch.InternalOps)
|
||||
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
|
||||
if err != nil {
|
||||
if newSegment != nil {
|
||||
_ = newSegment.Close()
|
||||
}
|
||||
atomic.AddUint64(&s.stats.errors, 1)
|
||||
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||
} else {
|
||||
atomic.AddUint64(&s.stats.updates, numUpdates)
|
||||
atomic.AddUint64(&s.stats.deletes, numDeletes)
|
||||
atomic.AddUint64(&s.stats.batches, 1)
|
||||
atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes)
|
||||
atomic.AddUint64(&s.stats.TotUpdates, numUpdates)
|
||||
atomic.AddUint64(&s.stats.TotDeletes, numDeletes)
|
||||
atomic.AddUint64(&s.stats.TotBatches, 1)
|
||||
atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes)
|
||||
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart)))
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||
internalOps map[string][]byte) error {
|
||||
internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
|
||||
|
||||
// new introduction
|
||||
introduction := &segmentIntroduction{
|
||||
id: atomic.AddUint64(&s.nextSegmentID, 1),
|
||||
data: newSegment,
|
||||
ids: ids,
|
||||
obsoletes: make(map[uint64]*roaring.Bitmap),
|
||||
internal: internalOps,
|
||||
applied: make(chan error),
|
||||
id: atomic.AddUint64(&s.nextSegmentID, 1),
|
||||
data: newSegment,
|
||||
ids: ids,
|
||||
obsoletes: make(map[uint64]*roaring.Bitmap),
|
||||
internal: internalOps,
|
||||
applied: make(chan error),
|
||||
persistedCallback: persistedCallback,
|
||||
}
|
||||
|
||||
if !s.unsafeBatch {
|
||||
|
@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
for _, seg := range root.segment {
|
||||
delta, err := seg.segment.DocNumbers(ids)
|
||||
if err != nil {
|
||||
|
@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
introduction.obsoletes[seg.id] = delta
|
||||
}
|
||||
|
||||
_ = root.DecRef()
|
||||
introStartTime := time.Now()
|
||||
|
||||
s.introductions <- introduction
|
||||
|
||||
|
@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
err = <-introduction.persisted
|
||||
}
|
||||
|
||||
introTime := uint64(time.Since(introStartTime))
|
||||
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime {
|
||||
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error {
|
|||
// Reader returns a low-level accessor on the index data. Close it to
|
||||
// release associated resources.
|
||||
func (s *Scorch) Reader() (index.IndexReader, error) {
|
||||
return s.currentSnapshot(), nil
|
||||
}
|
||||
|
||||
func (s *Scorch) currentSnapshot() *IndexSnapshot {
|
||||
s.rootLock.RLock()
|
||||
rv := &Reader{root: s.root}
|
||||
rv.root.AddRef()
|
||||
rv := s.root
|
||||
if rv != nil {
|
||||
rv.AddRef()
|
||||
}
|
||||
s.rootLock.RUnlock()
|
||||
return rv, nil
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *Scorch) Stats() json.Marshaler {
|
||||
return s.stats
|
||||
return &s.stats
|
||||
}
|
||||
|
||||
func (s *Scorch) diskFileStats() (uint64, uint64) {
|
||||
var numFilesOnDisk, numBytesUsedDisk uint64
|
||||
if s.path != "" {
|
||||
finfos, err := ioutil.ReadDir(s.path)
|
||||
if err == nil {
|
||||
for _, finfo := range finfos {
|
||||
if !finfo.IsDir() {
|
||||
numBytesUsedDisk += uint64(finfo.Size())
|
||||
numFilesOnDisk++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return numFilesOnDisk, numBytesUsedDisk
|
||||
}
|
||||
|
||||
func (s *Scorch) StatsMap() map[string]interface{} {
|
||||
m, _ := s.stats.statsMap()
|
||||
m := s.stats.ToMap()
|
||||
|
||||
numFilesOnDisk, numBytesUsedDisk := s.diskFileStats()
|
||||
|
||||
m["CurOnDiskBytes"] = numBytesUsedDisk
|
||||
m["CurOnDiskFiles"] = numFilesOnDisk
|
||||
|
||||
// TODO: consider one day removing these backwards compatible
|
||||
// names for apps using the old names
|
||||
m["updates"] = m["TotUpdates"]
|
||||
m["deletes"] = m["TotDeletes"]
|
||||
m["batches"] = m["TotBatches"]
|
||||
m["errors"] = m["TotOnErrors"]
|
||||
m["analysis_time"] = m["TotAnalysisTime"]
|
||||
m["index_time"] = m["TotIndexTime"]
|
||||
m["term_searchers_started"] = m["TotTermSearchersStarted"]
|
||||
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
|
||||
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
|
||||
m["num_items_introduced"] = m["TotIntroducedItems"]
|
||||
m["num_items_persisted"] = m["TotPersistedItems"]
|
||||
m["num_recs_to_persist"] = m["TotItemsToPersist"]
|
||||
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
|
||||
m["num_files_on_disk"] = m["CurOnDiskFiles"]
|
||||
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
|
||||
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]
|
||||
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"]
|
||||
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
|
||||
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
|
@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
|
|||
rv.Analyzed[i] = tokenFreqs
|
||||
rv.Length[i] = fieldLength
|
||||
|
||||
if len(d.CompositeFields) > 0 {
|
||||
if len(d.CompositeFields) > 0 && field.Name() != "_id" {
|
||||
// see if any of the composite fields need this
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
|
||||
|
@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
|
|||
s.rootLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) MemoryUsed() uint64 {
|
||||
var memUsed uint64
|
||||
s.rootLock.RLock()
|
||||
if s.root != nil {
|
||||
for _, segmentSnapshot := range s.root.segment {
|
||||
memUsed += 8 /* size of id -> uint64 */ +
|
||||
segmentSnapshot.segment.SizeInBytes()
|
||||
if segmentSnapshot.deleted != nil {
|
||||
memUsed += segmentSnapshot.deleted.GetSizeInBytes()
|
||||
}
|
||||
memUsed += segmentSnapshot.cachedDocs.sizeInBytes()
|
||||
}
|
||||
func (s *Scorch) MemoryUsed() (memUsed uint64) {
|
||||
indexSnapshot := s.currentSnapshot()
|
||||
if indexSnapshot == nil {
|
||||
return
|
||||
}
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() {
|
||||
_ = indexSnapshot.Close()
|
||||
}()
|
||||
|
||||
// Account for current root snapshot overhead
|
||||
memUsed += uint64(indexSnapshot.Size())
|
||||
|
||||
// Account for snapshot that the persister may be working on
|
||||
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch)
|
||||
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize)
|
||||
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch {
|
||||
// the snapshot that the persister is working on isn't the same as
|
||||
// the current snapshot
|
||||
memUsed += persistSnapshotSize
|
||||
}
|
||||
|
||||
// Account for snapshot that the merger may be working on
|
||||
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch)
|
||||
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize)
|
||||
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch {
|
||||
// the snapshot that the merger is working on isn't the same as
|
||||
// the current snapshot
|
||||
memUsed += mergeSnapshotSize
|
||||
}
|
||||
|
||||
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) -
|
||||
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved))
|
||||
|
||||
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) -
|
||||
atomic.LoadUint64(&s.iStats.analysisBytesRemoved))
|
||||
|
||||
return memUsed
|
||||
}
|
||||
|
||||
|
@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) {
|
|||
func init() {
|
||||
registry.RegisterIndexType(Name, NewScorch)
|
||||
}
|
||||
|
||||
func parseToInteger(i interface{}) (int, error) {
|
||||
switch v := i.(type) {
|
||||
case float64:
|
||||
return int(v), nil
|
||||
case int:
|
||||
return v, nil
|
||||
|
||||
default:
|
||||
return 0, fmt.Errorf("expects int or float64 value")
|
||||
}
|
||||
}
|
||||
|
|
40
vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go
generated
vendored
40
vendor/github.com/blevesearch/bleve/index/scorch/segment/empty.go
generated
vendored
|
@ -17,6 +17,7 @@ package segment
|
|||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
)
|
||||
|
||||
type EmptySegment struct{}
|
||||
|
@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) DocID(num uint64) ([]byte, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) Count() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) Size() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func (e *EmptySegment) AddRef() {
|
||||
}
|
||||
|
||||
|
@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error {
|
|||
|
||||
type EmptyDictionary struct{}
|
||||
|
||||
func (e *EmptyDictionary) PostingsList(term string,
|
||||
except *roaring.Bitmap) (PostingsList, error) {
|
||||
func (e *EmptyDictionary) PostingsList(term []byte,
|
||||
except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) {
|
||||
return &EmptyPostingsList{}, nil
|
||||
}
|
||||
|
||||
|
@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
|
|||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
|
||||
includeCount bool) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
type EmptyDictionaryIterator struct{}
|
||||
|
||||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type EmptyPostingsList struct{}
|
||||
|
||||
func (e *EmptyPostingsList) Iterator() PostingsIterator {
|
||||
func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool,
|
||||
prealloc PostingsIterator) PostingsIterator {
|
||||
return &EmptyPostingsIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsList) Size() int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsList) Count() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{}
|
|||
func (e *EmptyPostingsIterator) Next() (Posting, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsIterator) Size() int {
|
||||
return 0
|
||||
}
|
||||
|
||||
var AnEmptyPostingsIterator = &EmptyPostingsIterator{}
|
||||
|
|
321
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go
generated
vendored
321
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/build.go
generated
vendored
|
@ -1,321 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
|
||||
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
|
||||
s := New()
|
||||
|
||||
// ensure that _id field get fieldID 0
|
||||
s.getOrDefineField("_id")
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory
|
||||
s.initializeDict(results)
|
||||
|
||||
// walk each doc
|
||||
for _, result := range results {
|
||||
s.processDocument(result)
|
||||
}
|
||||
|
||||
// go back and sort the dictKeys
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
// compute memory usage of segment
|
||||
s.updateSizeInBytes()
|
||||
|
||||
// professional debugging
|
||||
//
|
||||
// log.Printf("fields: %v\n", s.FieldsMap)
|
||||
// log.Printf("fieldsInv: %v\n", s.FieldsInv)
|
||||
// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
|
||||
// log.Printf("dicts: %v\n", s.Dicts)
|
||||
// log.Printf("dict keys: %v\n", s.DictKeys)
|
||||
// for i, posting := range s.Postings {
|
||||
// log.Printf("posting %d: %v\n", i, posting)
|
||||
// }
|
||||
// for i, freq := range s.Freqs {
|
||||
// log.Printf("freq %d: %v\n", i, freq)
|
||||
// }
|
||||
// for i, norm := range s.Norms {
|
||||
// log.Printf("norm %d: %v\n", i, norm)
|
||||
// }
|
||||
// for i, field := range s.Locfields {
|
||||
// log.Printf("field %d: %v\n", i, field)
|
||||
// }
|
||||
// for i, start := range s.Locstarts {
|
||||
// log.Printf("start %d: %v\n", i, start)
|
||||
// }
|
||||
// for i, end := range s.Locends {
|
||||
// log.Printf("end %d: %v\n", i, end)
|
||||
// }
|
||||
// for i, pos := range s.Locpos {
|
||||
// log.Printf("pos %d: %v\n", i, pos)
|
||||
// }
|
||||
// for i, apos := range s.Locarraypos {
|
||||
// log.Printf("apos %d: %v\n", i, apos)
|
||||
// }
|
||||
// log.Printf("stored: %v\n", s.Stored)
|
||||
// log.Printf("stored types: %v\n", s.StoredTypes)
|
||||
// log.Printf("stored pos: %v\n", s.StoredPos)
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory for postings
|
||||
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
|
||||
var numPostingsLists int
|
||||
|
||||
numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
|
||||
var numTokenFrequencies int
|
||||
var totLocs int
|
||||
|
||||
// initial scan for all fieldID's to sort them
|
||||
for _, result := range results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := s.Dicts[fieldID][term]
|
||||
if !exists {
|
||||
numPostingsLists++
|
||||
pidPlus1 = uint64(numPostingsLists)
|
||||
s.Dicts[fieldID][term] = pidPlus1
|
||||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
|
||||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
|
||||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
|
||||
}
|
||||
pid := pidPlus1 - 1
|
||||
numTermsPerPostingsList[pid] += 1
|
||||
numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
numTokenFrequencies += len(tfs)
|
||||
}
|
||||
|
||||
for _, result := range results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
s.Postings = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.Postings[i] = roaring.New()
|
||||
}
|
||||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.PostingsLocs[i] = roaring.New()
|
||||
}
|
||||
|
||||
// Preallocate big, contiguous backing arrays.
|
||||
auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
|
||||
uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
|
||||
float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms.
|
||||
uint16Backing := make([]uint16, totLocs) // For sub-Locfields.
|
||||
|
||||
// Point top-level slices to the backing arrays.
|
||||
s.Freqs = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Norms = make([][]float32, numPostingsLists)
|
||||
|
||||
s.Locfields = make([][]uint16, numPostingsLists)
|
||||
|
||||
s.Locstarts = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locends = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locpos = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locarraypos = make([][][]uint64, numPostingsLists)
|
||||
|
||||
// Point sub-slices to the backing arrays.
|
||||
for pid, numTerms := range numTermsPerPostingsList {
|
||||
s.Freqs[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numTerms:]
|
||||
|
||||
s.Norms[pid] = float32Backing[0:0]
|
||||
float32Backing = float32Backing[numTerms:]
|
||||
}
|
||||
|
||||
for pid, numLocs := range numLocsPerPostingsList {
|
||||
s.Locfields[pid] = uint16Backing[0:0]
|
||||
uint16Backing = uint16Backing[numLocs:]
|
||||
|
||||
s.Locstarts[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locends[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locpos[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locarraypos[pid] = auint64Backing[0:0]
|
||||
auint64Backing = auint64Backing[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) processDocument(result *index.AnalysisResult) {
|
||||
// used to collate information across fields
|
||||
docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
|
||||
fieldLens := make(map[uint16]int, len(s.FieldsMap))
|
||||
|
||||
docNum := uint64(s.addDocument())
|
||||
|
||||
processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[field] += l
|
||||
if existingFreqs, ok := docMap[field]; ok {
|
||||
existingFreqs.MergeAll(name, tf)
|
||||
} else {
|
||||
docMap[field] = tf
|
||||
}
|
||||
}
|
||||
|
||||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
|
||||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
|
||||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
|
||||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l, tf := field.Analyze()
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
if field.Options().IsStored() {
|
||||
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
|
||||
}
|
||||
|
||||
if field.Options().IncludeDocValues() {
|
||||
s.DocValueFields[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
// now that its been rolled up into docMap, walk that
|
||||
for fieldID, tokenFrequencies := range docMap {
|
||||
for term, tokenFreq := range tokenFrequencies {
|
||||
pid := s.Dicts[fieldID][term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.AddInt(int(docNum))
|
||||
s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
|
||||
s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
||||
locationBS := s.PostingsLocs[pid]
|
||||
if len(tokenFreq.Locations) > 0 {
|
||||
locationBS.AddInt(int(docNum))
|
||||
for _, loc := range tokenFreq.Locations {
|
||||
var locf = fieldID
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
s.Locfields[pid] = append(s.Locfields[pid], locf)
|
||||
s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
|
||||
s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
|
||||
s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
|
||||
} else {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) getOrDefineField(name string) int {
|
||||
fieldIDPlus1, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[name] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, name)
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
||||
}
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
func (s *Segment) addDocument() int {
|
||||
docNum := len(s.Stored)
|
||||
s.Stored = append(s.Stored, map[uint16][][]byte{})
|
||||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
|
||||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
|
||||
return docNum
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
103
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go
generated
vendored
103
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/dict.go
generated
vendored
|
@ -1,103 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// Dictionary is the in-memory representation of the term dictionary
|
||||
type Dictionary struct {
|
||||
segment *Segment
|
||||
field string
|
||||
fieldID uint16
|
||||
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
|
||||
func (d *Dictionary) PostingsList(term string,
|
||||
except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
return &PostingsList{
|
||||
dictionary: d,
|
||||
term: term,
|
||||
postingsID: d.segment.Dicts[d.fieldID][term],
|
||||
except: except,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this dictionary
|
||||
func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
}
|
||||
|
||||
// PrefixIterator returns an iterator which only visits terms having the
|
||||
// the specified prefix
|
||||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
prefix: prefix,
|
||||
offset: offset,
|
||||
}
|
||||
}
|
||||
|
||||
// RangeIterator returns an iterator which only visits terms between the
|
||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
end: end,
|
||||
}
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
prefix string
|
||||
end string
|
||||
offset int
|
||||
|
||||
dictEntry index.DictEntry // reused across Next()'s
|
||||
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
|
||||
return nil, nil
|
||||
}
|
||||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
|
||||
// check prefix
|
||||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
|
||||
return nil, nil
|
||||
}
|
||||
// check end (bleve.index API demands inclusive end)
|
||||
if d.end != "" && next > d.end {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
d.offset++
|
||||
postingID := d.d.segment.Dicts[d.d.fieldID][next]
|
||||
d.dictEntry.Term = next
|
||||
d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
|
||||
return &d.dictEntry, nil
|
||||
}
|
178
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go
generated
vendored
178
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/posting.go
generated
vendored
|
@ -1,178 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// PostingsList is an in-memory represenation of a postings list
|
||||
type PostingsList struct {
|
||||
dictionary *Dictionary
|
||||
term string
|
||||
postingsID uint64
|
||||
except *roaring.Bitmap
|
||||
}
|
||||
|
||||
// Count returns the number of items on this postings list
|
||||
func (p *PostingsList) Count() uint64 {
|
||||
var rv uint64
|
||||
if p.postingsID > 0 {
|
||||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
|
||||
if p.except != nil {
|
||||
except := p.except.GetCardinality()
|
||||
if except > rv {
|
||||
// avoid underflow
|
||||
except = rv
|
||||
}
|
||||
rv -= except
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this postings list
|
||||
func (p *PostingsList) Iterator() segment.PostingsIterator {
|
||||
rv := &PostingsIterator{
|
||||
postings: p,
|
||||
}
|
||||
if p.postingsID > 0 {
|
||||
allbits := p.dictionary.segment.Postings[p.postingsID-1]
|
||||
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
|
||||
rv.all = allbits.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := allbits.Clone()
|
||||
allExcept.AndNot(p.except)
|
||||
rv.actual = allExcept.Iterator()
|
||||
} else {
|
||||
rv.actual = allbits.Iterator()
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// PostingsIterator provides a way to iterate through the postings list
|
||||
type PostingsIterator struct {
|
||||
postings *PostingsList
|
||||
all roaring.IntIterable
|
||||
locations *roaring.Bitmap
|
||||
offset int
|
||||
locoffset int
|
||||
actual roaring.IntIterable
|
||||
}
|
||||
|
||||
// Next returns the next posting on the postings list, or nil at the end
|
||||
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
||||
if i.actual == nil || !i.actual.HasNext() {
|
||||
return nil, nil
|
||||
}
|
||||
n := i.actual.Next()
|
||||
allN := i.all.Next()
|
||||
|
||||
// n is the next actual hit (excluding some postings)
|
||||
// allN is the next hit in the full postings
|
||||
// if they don't match, adjust offsets to factor in item we're skipping over
|
||||
// incr the all iterator, and check again
|
||||
for allN != n {
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
allN = i.all.Next()
|
||||
}
|
||||
rv := &Posting{
|
||||
iterator: i,
|
||||
docNum: uint64(n),
|
||||
offset: i.offset,
|
||||
locoffset: i.locoffset,
|
||||
hasLoc: i.locations.Contains(n),
|
||||
}
|
||||
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Posting is a single entry in a postings list
|
||||
type Posting struct {
|
||||
iterator *PostingsIterator
|
||||
docNum uint64
|
||||
offset int
|
||||
locoffset int
|
||||
hasLoc bool
|
||||
}
|
||||
|
||||
// Number returns the document number of this posting in this segment
|
||||
func (p *Posting) Number() uint64 {
|
||||
return p.docNum
|
||||
}
|
||||
|
||||
// Frequency returns the frequence of occurance of this term in this doc/field
|
||||
func (p *Posting) Frequency() uint64 {
|
||||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
|
||||
}
|
||||
|
||||
// Norm returns the normalization factor for this posting
|
||||
func (p *Posting) Norm() float64 {
|
||||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
|
||||
}
|
||||
|
||||
// Locations returns the location information for each occurance
|
||||
func (p *Posting) Locations() []segment.Location {
|
||||
if !p.hasLoc {
|
||||
return nil
|
||||
}
|
||||
freq := int(p.Frequency())
|
||||
rv := make([]segment.Location, freq)
|
||||
for i := 0; i < freq; i++ {
|
||||
rv[i] = &Location{
|
||||
p: p,
|
||||
offset: p.locoffset + i,
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Location represents the location of a single occurance
|
||||
type Location struct {
|
||||
p *Posting
|
||||
offset int
|
||||
}
|
||||
|
||||
// Field returns the name of the field (useful in composite fields to know
|
||||
// which original field the value came from)
|
||||
func (l *Location) Field() string {
|
||||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
||||
}
|
||||
|
||||
// Start returns the start byte offset of this occurance
|
||||
func (l *Location) Start() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// End returns the end byte offset of this occurance
|
||||
func (l *Location) End() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// Pos returns the 1-based phrase position of this occurance
|
||||
func (l *Location) Pos() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// ArrayPositions returns the array position vector associated with this occurance
|
||||
func (l *Location) ArrayPositions() []uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
289
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go
generated
vendored
289
vendor/github.com/blevesearch/bleve/index/scorch/segment/mem/segment.go
generated
vendored
|
@ -1,289 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// _id field is always guaranteed to have fieldID of 0
|
||||
const idFieldID uint16 = 0
|
||||
|
||||
// KNOWN ISSUES
|
||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
||||
// at the segment level, based on the first definition of a
|
||||
// field we see. in normal bleve usage this is fine, all
|
||||
// instances of a field definition will be the same. however,
|
||||
// advanced users may violate this and provide unique field
|
||||
// definitions with each document. this segment does not
|
||||
// support this usage.
|
||||
|
||||
// TODO
|
||||
// - need better testing of multiple docs, iterating freqs, locations and
|
||||
// and verifying the correct results are returned
|
||||
|
||||
// Segment is an in memory implementation of scorch.Segment
|
||||
type Segment struct {
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Postings list
|
||||
// postings list id -> bitmap by docNum
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// Postings list has locations
|
||||
PostingsLocs []*roaring.Bitmap
|
||||
|
||||
// Term frequencies
|
||||
// postings list id -> Freqs (one for each hit in bitmap)
|
||||
Freqs [][]uint64
|
||||
|
||||
// Field norms
|
||||
// postings list id -> Norms (one for each hit in bitmap)
|
||||
Norms [][]float32
|
||||
|
||||
// Field/start/end/pos/locarraypos
|
||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
||||
Locfields [][]uint16
|
||||
Locstarts [][]uint64
|
||||
Locends [][]uint64
|
||||
Locpos [][]uint64
|
||||
Locarraypos [][][]uint64
|
||||
|
||||
// Stored field values
|
||||
// docNum -> field id -> slice of values (each value []byte)
|
||||
Stored []map[uint16][][]byte
|
||||
|
||||
// Stored field types
|
||||
// docNum -> field id -> slice of types (each type byte)
|
||||
StoredTypes []map[uint16][]byte
|
||||
|
||||
// Stored field array positions
|
||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
||||
StoredPos []map[uint16][][]uint64
|
||||
|
||||
// For storing the docValue persisted fields
|
||||
DocValueFields map[uint16]bool
|
||||
|
||||
// Footprint of the segment, updated when analyzed document mutations
|
||||
// are added into the segment
|
||||
sizeInBytes uint64
|
||||
}
|
||||
|
||||
// New builds a new empty Segment
|
||||
func New() *Segment {
|
||||
return &Segment{
|
||||
FieldsMap: map[string]uint16{},
|
||||
DocValueFields: map[uint16]bool{},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) updateSizeInBytes() {
|
||||
var sizeInBytes uint64
|
||||
|
||||
// FieldsMap, FieldsInv
|
||||
for k, _ := range s.FieldsMap {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
2 /* size of uint16 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
|
||||
// Dicts, DictKeys
|
||||
for _, entry := range s.Dicts {
|
||||
for k, _ := range entry {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
8 /* size of uint64 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Postings, PostingsLocs
|
||||
for i := 0; i < len(s.Postings); i++ {
|
||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
|
||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Freqs, Norms
|
||||
for i := 0; i < len(s.Freqs); i++ {
|
||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
|
||||
len(s.Norms[i])*4 /* size of float32 */) +
|
||||
(segment.SizeOfSlice * 2)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Location data
|
||||
for i := 0; i < len(s.Locfields); i++ {
|
||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
|
||||
len(s.Locstarts[i])*8 /* size of uint64 */ +
|
||||
len(s.Locends[i])*8 /* size of uint64 */ +
|
||||
len(s.Locpos[i])*8 /* size of uint64 */)
|
||||
|
||||
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
|
||||
// Stored data
|
||||
for i := 0; i < len(s.Stored); i++ {
|
||||
for _, v := range s.Stored[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredTypes[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredPos[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfMap * 3)
|
||||
}
|
||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfSlice * 3)
|
||||
|
||||
// DocValueFields
|
||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
|
||||
segment.SizeOfMap
|
||||
|
||||
// SizeInBytes
|
||||
sizeInBytes += uint64(8)
|
||||
|
||||
s.sizeInBytes = sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) SizeInBytes() uint64 {
|
||||
return s.sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) AddRef() {
|
||||
}
|
||||
|
||||
func (s *Segment) DecRef() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fields returns the field names used in this segment
|
||||
func (s *Segment) Fields() []string {
|
||||
return s.FieldsInv
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
// ensure document number exists
|
||||
if int(num) > len(s.Stored)-1 {
|
||||
return nil
|
||||
}
|
||||
docFields := s.Stored[int(num)]
|
||||
st := s.StoredTypes[int(num)]
|
||||
sp := s.StoredPos[int(num)]
|
||||
for field, values := range docFields {
|
||||
for i, value := range values {
|
||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
||||
if !keepGoing {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Segment) getField(name string) (int, error) {
|
||||
fieldID, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("no field named %s", name)
|
||||
}
|
||||
return int(fieldID - 1), nil
|
||||
}
|
||||
|
||||
// Dictionary returns the term dictionary for the specified field
|
||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
fieldID, err := s.getField(field)
|
||||
if err != nil {
|
||||
// no such field, return empty dictionary
|
||||
return &segment.EmptyDictionary{}, nil
|
||||
}
|
||||
return &Dictionary{
|
||||
segment: s,
|
||||
field: field,
|
||||
fieldID: uint16(fieldID),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment
|
||||
// (this has no notion of deleted docs)
|
||||
func (s *Segment) Count() uint64 {
|
||||
return uint64(len(s.Stored))
|
||||
}
|
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
||||
rv := roaring.New()
|
||||
|
||||
// guard against empty segment
|
||||
if len(s.FieldsMap) > 0 {
|
||||
idDictionary := s.Dicts[idFieldID]
|
||||
|
||||
for _, id := range ids {
|
||||
postingID := idDictionary[id]
|
||||
if postingID > 0 {
|
||||
rv.Or(s.Postings[postingID-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Close releases all resources associated with this segment
|
||||
func (s *Segment) Close() error {
|
||||
return nil
|
||||
}
|
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
|
@ -0,0 +1,75 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package segment
|
||||
|
||||
import (
|
||||
"regexp/syntax"
|
||||
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
|
||||
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
|
||||
|
||||
parsed, err := syntax.Parse(pattern, syntax.Perl)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
prefix := LiteralPrefix(parsed)
|
||||
if prefix != "" {
|
||||
prefixBeg := []byte(prefix)
|
||||
prefixEnd := IncrementBytes(prefixBeg)
|
||||
return re, prefixBeg, prefixEnd, nil
|
||||
}
|
||||
|
||||
return re, nil, nil, nil
|
||||
}
|
||||
|
||||
// Returns the literal prefix given the parse tree for a regexp
|
||||
func LiteralPrefix(s *syntax.Regexp) string {
|
||||
// traverse the left-most branch in the parse tree as long as the
|
||||
// node represents a concatenation
|
||||
for s != nil && s.Op == syntax.OpConcat {
|
||||
if len(s.Sub) < 1 {
|
||||
return ""
|
||||
}
|
||||
|
||||
s = s.Sub[0]
|
||||
}
|
||||
|
||||
if s.Op == syntax.OpLiteral {
|
||||
return string(s.Rune)
|
||||
}
|
||||
|
||||
return "" // no literal prefix
|
||||
}
|
||||
|
||||
func IncrementBytes(in []byte) []byte {
|
||||
rv := make([]byte, len(in))
|
||||
copy(rv, in)
|
||||
for i := len(rv) - 1; i >= 0; i-- {
|
||||
rv[i] = rv[i] + 1
|
||||
if rv[i] != 0 {
|
||||
return rv // didn't overflow, so stop
|
||||
}
|
||||
}
|
||||
return nil // overflowed
|
||||
}
|
43
vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go
generated
vendored
43
vendor/github.com/blevesearch/bleve/index/scorch/segment/segment.go
generated
vendored
|
@ -15,15 +15,14 @@
|
|||
package segment
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
)
|
||||
|
||||
// Overhead from go data structures when deployed on a 64-bit system.
|
||||
const SizeOfMap uint64 = 8
|
||||
const SizeOfPointer uint64 = 8
|
||||
const SizeOfSlice uint64 = 24
|
||||
const SizeOfString uint64 = 16
|
||||
var ErrClosed = fmt.Errorf("index closed")
|
||||
|
||||
// DocumentFieldValueVisitor defines a callback to be visited for each
|
||||
// stored field value. The return value determines if the visitor
|
||||
|
@ -34,6 +33,9 @@ type Segment interface {
|
|||
Dictionary(field string) (TermDictionary, error)
|
||||
|
||||
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
|
||||
|
||||
DocID(num uint64) ([]byte, error)
|
||||
|
||||
Count() uint64
|
||||
|
||||
DocNumbers([]string) (*roaring.Bitmap, error)
|
||||
|
@ -42,18 +44,21 @@ type Segment interface {
|
|||
|
||||
Close() error
|
||||
|
||||
SizeInBytes() uint64
|
||||
Size() int
|
||||
|
||||
AddRef()
|
||||
DecRef() error
|
||||
}
|
||||
|
||||
type TermDictionary interface {
|
||||
PostingsList(term string, except *roaring.Bitmap) (PostingsList, error)
|
||||
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
|
||||
|
||||
Iterator() DictionaryIterator
|
||||
PrefixIterator(prefix string) DictionaryIterator
|
||||
RangeIterator(start, end string) DictionaryIterator
|
||||
AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
|
||||
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
|
||||
}
|
||||
|
||||
type DictionaryIterator interface {
|
||||
|
@ -61,7 +66,9 @@ type DictionaryIterator interface {
|
|||
}
|
||||
|
||||
type PostingsList interface {
|
||||
Iterator() PostingsIterator
|
||||
Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator
|
||||
|
||||
Size() int
|
||||
|
||||
Count() uint64
|
||||
|
||||
|
@ -77,6 +84,14 @@ type PostingsIterator interface {
|
|||
// implementations may return a shared instance to reduce memory
|
||||
// allocations.
|
||||
Next() (Posting, error)
|
||||
|
||||
// Advance will return the posting with the specified doc number
|
||||
// or if there is no such posting, the next posting.
|
||||
// Callers MUST NOT attempt to pass a docNum that is less than or
|
||||
// equal to the currently visited posting doc Num.
|
||||
Advance(docNum uint64) (Posting, error)
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type Posting interface {
|
||||
|
@ -86,6 +101,8 @@ type Posting interface {
|
|||
Norm() float64
|
||||
|
||||
Locations() []Location
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type Location interface {
|
||||
|
@ -94,6 +111,7 @@ type Location interface {
|
|||
End() uint64
|
||||
Pos() uint64
|
||||
ArrayPositions() []uint64
|
||||
Size() int
|
||||
}
|
||||
|
||||
// DocumentFieldTermVisitable is implemented by various scorch segment
|
||||
|
@ -101,10 +119,17 @@ type Location interface {
|
|||
// postings or other indexed values.
|
||||
type DocumentFieldTermVisitable interface {
|
||||
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error
|
||||
visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error)
|
||||
|
||||
// VisitableDocValueFields implementation should return
|
||||
// the list of fields which are document value persisted and
|
||||
// therefore visitable by the above VisitDocumentFieldTerms method.
|
||||
VisitableDocValueFields() ([]string, error)
|
||||
}
|
||||
|
||||
type DocVisitState interface {
|
||||
}
|
||||
|
||||
type StatsReporter interface {
|
||||
ReportBytesWritten(bytesWritten uint64)
|
||||
}
|
||||
|
|
542
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go
generated
vendored
542
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/build.go
generated
vendored
|
@ -16,19 +16,13 @@ package zap
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"os"
|
||||
"sort"
|
||||
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
const version uint32 = 3
|
||||
const Version uint32 = 11
|
||||
|
||||
const Type string = "zap"
|
||||
|
||||
const fieldNotUninverted = math.MaxUint64
|
||||
|
||||
|
@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// PersistSegment takes the in-memory segment and persists it to
|
||||
// the specified path in the zap file format.
|
||||
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
|
||||
flag := os.O_RDWR | os.O_CREATE
|
||||
|
||||
f, err := os.OpenFile(path, flag, 0600)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cleanup := func() {
|
||||
_ = f.Close()
|
||||
_ = os.Remove(path)
|
||||
}
|
||||
|
||||
// buffer the output
|
||||
br := bufio.NewWriter(f)
|
||||
|
||||
// wrap it for counting (tracking offsets)
|
||||
cr := NewCountHashWriter(br)
|
||||
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
|
||||
persistBase(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
||||
chunkFactor, cr.Sum32(), cr)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = br.Flush()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = f.Sync()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = f.Close()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
||||
dictLocs []uint64, err error) {
|
||||
docValueOffset = uint64(fieldNotUninverted)
|
||||
|
||||
if len(memSegment.Stored) > 0 {
|
||||
storedIndexOffset, err = persistStored(memSegment, cr)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
postingsListLocs, err := persistPostingsLocs(memSegment, cr)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
} else {
|
||||
dictLocs = make([]uint64, len(memSegment.FieldsInv))
|
||||
}
|
||||
|
||||
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
||||
dictLocs, nil
|
||||
}
|
||||
|
||||
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
|
||||
var curr int
|
||||
var metaBuf bytes.Buffer
|
||||
var data, compressed []byte
|
||||
|
||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
||||
|
||||
docNumOffsets := make(map[int]uint64, len(memSegment.Stored))
|
||||
|
||||
for docNum, storedValues := range memSegment.Stored {
|
||||
if docNum != 0 {
|
||||
// reset buffer if necessary
|
||||
curr = 0
|
||||
metaBuf.Reset()
|
||||
data = data[:0]
|
||||
compressed = compressed[:0]
|
||||
}
|
||||
|
||||
st := memSegment.StoredTypes[docNum]
|
||||
sp := memSegment.StoredPos[docNum]
|
||||
|
||||
// encode fields in order
|
||||
for fieldID := range memSegment.FieldsInv {
|
||||
if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
|
||||
stf := st[uint16(fieldID)]
|
||||
spf := sp[uint16(fieldID)]
|
||||
|
||||
var err2 error
|
||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
||||
if err2 != nil {
|
||||
return 0, err2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metaEncoder.Close()
|
||||
metaBytes := metaBuf.Bytes()
|
||||
|
||||
// compress the data
|
||||
compressed = snappy.Encode(compressed, data)
|
||||
|
||||
// record where we're about to start writing
|
||||
docNumOffsets[docNum] = uint64(w.Count())
|
||||
|
||||
// write out the meta len and compressed data len
|
||||
_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// now write the meta
|
||||
_, err = w.Write(metaBytes)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// now write the compressed data
|
||||
_, err = w.Write(compressed)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
// return value is the start of the stored index
|
||||
rv := uint64(w.Count())
|
||||
// now write out the stored doc index
|
||||
for docNum := range memSegment.Stored {
|
||||
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistStoredFieldValues(fieldID int,
|
||||
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
|
||||
curr int, metaEncoder *govarint.Base128Encoder, data []byte) (
|
||||
curr int, metaEncode varintEncoder, data []byte) (
|
||||
int, []byte, error) {
|
||||
for i := 0; i < len(storedFieldValues); i++ {
|
||||
// encode field
|
||||
_, err := metaEncoder.PutU64(uint64(fieldID))
|
||||
_, err := metaEncode(uint64(fieldID))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode type
|
||||
_, err = metaEncoder.PutU64(uint64(stf[i]))
|
||||
_, err = metaEncode(uint64(stf[i]))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode start offset
|
||||
_, err = metaEncoder.PutU64(uint64(curr))
|
||||
_, err = metaEncode(uint64(curr))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// end len
|
||||
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
|
||||
_, err = metaEncode(uint64(len(storedFieldValues[i])))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode number of array pos
|
||||
_, err = metaEncoder.PutU64(uint64(len(spf[i])))
|
||||
_, err = metaEncode(uint64(len(spf[i])))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode all array positions
|
||||
for _, pos := range spf[i] {
|
||||
_, err = metaEncoder.PutU64(pos)
|
||||
_, err = metaEncode(pos)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int,
|
|||
return curr, data, nil
|
||||
}
|
||||
|
||||
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
|
||||
var freqOffsets, locOfffsets []uint64
|
||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
for postingID := range memSegment.Postings {
|
||||
if postingID != 0 {
|
||||
tfEncoder.Reset()
|
||||
}
|
||||
freqs := memSegment.Freqs[postingID]
|
||||
norms := memSegment.Norms[postingID]
|
||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
||||
var offset int
|
||||
for postingsListItr.HasNext() {
|
||||
|
||||
docNum := uint64(postingsListItr.Next())
|
||||
|
||||
// put freq
|
||||
err := tfEncoder.Add(docNum, freqs[offset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put norm
|
||||
norm := norms[offset]
|
||||
normBits := math.Float32bits(norm)
|
||||
err = tfEncoder.Add(docNum, uint64(normBits))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
offset++
|
||||
}
|
||||
|
||||
// record where this postings freq info starts
|
||||
freqOffsets = append(freqOffsets, uint64(w.Count()))
|
||||
|
||||
tfEncoder.Close()
|
||||
_, err := tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// now do it again for the locations
|
||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
for postingID := range memSegment.Postings {
|
||||
if postingID != 0 {
|
||||
locEncoder.Reset()
|
||||
}
|
||||
freqs := memSegment.Freqs[postingID]
|
||||
locfields := memSegment.Locfields[postingID]
|
||||
locpos := memSegment.Locpos[postingID]
|
||||
locstarts := memSegment.Locstarts[postingID]
|
||||
locends := memSegment.Locends[postingID]
|
||||
locarraypos := memSegment.Locarraypos[postingID]
|
||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
||||
var offset int
|
||||
var locOffset int
|
||||
for postingsListItr.HasNext() {
|
||||
docNum := uint64(postingsListItr.Next())
|
||||
for i := 0; i < int(freqs[offset]); i++ {
|
||||
if len(locfields) > 0 {
|
||||
// put field
|
||||
err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put pos
|
||||
err = locEncoder.Add(docNum, locpos[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put start
|
||||
err = locEncoder.Add(docNum, locstarts[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put end
|
||||
err = locEncoder.Add(docNum, locends[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put the number of array positions to follow
|
||||
num := len(locarraypos[locOffset])
|
||||
err = locEncoder.Add(docNum, uint64(num))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put each array position
|
||||
for _, pos := range locarraypos[locOffset] {
|
||||
err = locEncoder.Add(docNum, pos)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
locOffset++
|
||||
}
|
||||
offset++
|
||||
}
|
||||
|
||||
// record where this postings loc info starts
|
||||
locOfffsets = append(locOfffsets, uint64(w.Count()))
|
||||
locEncoder.Close()
|
||||
_, err := locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
return freqOffsets, locOfffsets, nil
|
||||
}
|
||||
|
||||
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
|
||||
rv = make([]uint64, 0, len(memSegment.PostingsLocs))
|
||||
var reuseBuf bytes.Buffer
|
||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
||||
for postingID := range memSegment.PostingsLocs {
|
||||
// record where we start this posting loc
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
// write out the length and bitmap
|
||||
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
|
||||
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
|
||||
rv = make([]uint64, 0, len(memSegment.Postings))
|
||||
var reuseBuf bytes.Buffer
|
||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
||||
for postingID := range memSegment.Postings {
|
||||
// record where we start this posting list
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
|
||||
// write out the term info, loc info, and loc posting list offset
|
||||
_, err = writeUvarints(w, freqOffsets[postingID],
|
||||
locOffsets[postingID], postingsListLocs[postingID])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the length and bitmap
|
||||
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
|
||||
rv := make([]uint64, 0, len(memSegment.DictKeys))
|
||||
|
||||
varintBuf := make([]byte, binary.MaxVarintLen64)
|
||||
|
||||
var buffer bytes.Buffer
|
||||
for fieldID, fieldTerms := range memSegment.DictKeys {
|
||||
if fieldID != 0 {
|
||||
buffer.Reset()
|
||||
}
|
||||
|
||||
// start a new vellum for this field
|
||||
builder, err := vellum.New(&buffer, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dict := memSegment.Dicts[fieldID]
|
||||
// now walk the dictionary in order of fieldTerms (already sorted)
|
||||
for _, fieldTerm := range fieldTerms {
|
||||
postingID := dict[fieldTerm] - 1
|
||||
postingsAddr := postingsLocs[postingID]
|
||||
err = builder.Insert([]byte(fieldTerm), postingsAddr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
err = builder.Close()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// record where this dictionary starts
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
|
||||
vellumData := buffer.Bytes()
|
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
|
||||
_, err = w.Write(varintBuf[:n])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write this vellum to disk
|
||||
_, err = w.Write(vellumData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
type docIDRange []uint64
|
||||
|
||||
func (a docIDRange) Len() int { return len(a) }
|
||||
func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] }
|
||||
|
||||
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
||||
chunkFactor uint32) (map[uint16]uint64, error) {
|
||||
fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
|
||||
for fieldID := range memSegment.DocValueFields {
|
||||
field := memSegment.FieldsInv[fieldID]
|
||||
docTermMap := make(map[uint64][]byte, 0)
|
||||
dict, err := memSegment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dictItr := dict.Iterator()
|
||||
next, err := dictItr.Next()
|
||||
for err == nil && next != nil {
|
||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
||||
if err1 != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
postingsItr := postings.Iterator()
|
||||
nextPosting, err2 := postingsItr.Next()
|
||||
for err2 == nil && nextPosting != nil {
|
||||
docNum := nextPosting.Number()
|
||||
docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...)
|
||||
docTermMap[docNum] = append(docTermMap[docNum], termSeparator)
|
||||
nextPosting, err2 = postingsItr.Next()
|
||||
}
|
||||
if err2 != nil {
|
||||
return nil, err2
|
||||
}
|
||||
|
||||
next, err = dictItr.Next()
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// sort wrt to docIDs
|
||||
var docNumbers docIDRange
|
||||
for k := range docTermMap {
|
||||
docNumbers = append(docNumbers, k)
|
||||
}
|
||||
sort.Sort(docNumbers)
|
||||
|
||||
for _, docNum := range docNumbers {
|
||||
err = fdvEncoder.Add(docNum, docTermMap[docNum])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
fieldChunkOffsets[fieldID] = uint64(w.Count())
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// persist the doc value details for this field
|
||||
_, err = fdvEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// reseting encoder for the next field
|
||||
fdvEncoder.Reset()
|
||||
}
|
||||
|
||||
return fieldChunkOffsets, nil
|
||||
}
|
||||
|
||||
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
||||
chunkFactor uint32) (uint64, error) {
|
||||
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
fieldDocValuesOffset := uint64(w.Count())
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
offset := uint64(0)
|
||||
ok := true
|
||||
for fieldID := range memSegment.FieldsInv {
|
||||
// if the field isn't configured for docValue, then mark
|
||||
// the offset accordingly
|
||||
if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok {
|
||||
offset = fieldNotUninverted
|
||||
}
|
||||
n := binary.PutUvarint(buf, uint64(offset))
|
||||
_, err := w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return fieldDocValuesOffset, nil
|
||||
}
|
||||
|
||||
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
|
||||
var br bytes.Buffer
|
||||
|
||||
cr := NewCountHashWriter(&br)
|
||||
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
|
||||
persistBase(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
||||
memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
|
||||
storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs)
|
||||
}
|
||||
|
||||
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
||||
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
|
||||
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
|
||||
|
@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
|||
fieldsIndexOffset: fieldsIndexOffset,
|
||||
docValueOffset: docValueOffset,
|
||||
dictLocs: dictLocs,
|
||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
||||
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||
}
|
||||
sb.updateSize()
|
||||
|
||||
err := sb.loadDvIterators()
|
||||
err := sb.loadDvReaders()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
133
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go
generated
vendored
133
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/contentcoder.go
generated
vendored
|
@ -18,41 +18,56 @@ import (
|
|||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"reflect"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var reflectStaticSizeMetaData int
|
||||
|
||||
func init() {
|
||||
var md MetaData
|
||||
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size())
|
||||
}
|
||||
|
||||
var termSeparator byte = 0xff
|
||||
var termSeparatorSplitSlice = []byte{termSeparator}
|
||||
|
||||
type chunkedContentCoder struct {
|
||||
final []byte
|
||||
chunkSize uint64
|
||||
currChunk uint64
|
||||
chunkLens []uint64
|
||||
final []byte
|
||||
chunkSize uint64
|
||||
currChunk uint64
|
||||
chunkLens []uint64
|
||||
|
||||
w io.Writer
|
||||
progressiveWrite bool
|
||||
|
||||
chunkMetaBuf bytes.Buffer
|
||||
chunkBuf bytes.Buffer
|
||||
|
||||
chunkMeta []MetaData
|
||||
|
||||
compressed []byte // temp buf for snappy compression
|
||||
}
|
||||
|
||||
// MetaData represents the data information inside a
|
||||
// chunk.
|
||||
type MetaData struct {
|
||||
DocNum uint64 // docNum of the data inside the chunk
|
||||
DocDvLoc uint64 // starting offset for a given docid
|
||||
DocDvLen uint64 // length of data inside the chunk for the given docid
|
||||
DocNum uint64 // docNum of the data inside the chunk
|
||||
DocDvOffset uint64 // offset of data inside the chunk for the given docid
|
||||
}
|
||||
|
||||
// newChunkedContentCoder returns a new chunk content coder which
|
||||
// packs data into chunks based on the provided chunkSize
|
||||
func newChunkedContentCoder(chunkSize uint64,
|
||||
maxDocNum uint64) *chunkedContentCoder {
|
||||
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
|
||||
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
|
||||
total := maxDocNum/chunkSize + 1
|
||||
rv := &chunkedContentCoder{
|
||||
chunkSize: chunkSize,
|
||||
chunkLens: make([]uint64, total),
|
||||
chunkMeta: make([]MetaData, 0, total),
|
||||
chunkSize: chunkSize,
|
||||
chunkLens: make([]uint64, total),
|
||||
chunkMeta: make([]MetaData, 0, total),
|
||||
w: w,
|
||||
progressiveWrite: progressiveWrite,
|
||||
}
|
||||
|
||||
return rv
|
||||
|
@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error {
|
|||
|
||||
// write out the metaData slice
|
||||
for _, meta := range c.chunkMeta {
|
||||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
|
||||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error {
|
|||
metaData := c.chunkMetaBuf.Bytes()
|
||||
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
|
||||
// write the compressed data to the final data
|
||||
compressedData := snappy.Encode(nil, c.chunkBuf.Bytes())
|
||||
c.final = append(c.final, compressedData...)
|
||||
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
|
||||
c.final = append(c.final, c.compressed...)
|
||||
|
||||
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
|
||||
|
||||
if c.progressiveWrite {
|
||||
_, err := c.w.Write(c.final)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.final = c.final[:0]
|
||||
}
|
||||
|
||||
c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData))
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
|||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
// mark the starting offset for this doc
|
||||
// get the starting offset for this doc
|
||||
dvOffset := c.chunkBuf.Len()
|
||||
dvSize, err := c.chunkBuf.Write(vals)
|
||||
if err != nil {
|
||||
|
@ -130,38 +154,77 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
|||
}
|
||||
|
||||
c.chunkMeta = append(c.chunkMeta, MetaData{
|
||||
DocNum: docNum,
|
||||
DocDvLoc: uint64(dvOffset),
|
||||
DocDvLen: uint64(dvSize),
|
||||
DocNum: docNum,
|
||||
DocDvOffset: uint64(dvOffset + dvSize),
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write commits all the encoded chunked contents to the provided writer.
|
||||
func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
|
||||
//
|
||||
// | ..... data ..... | chunk offsets (varints)
|
||||
// | position of chunk offsets (uint64) | number of offsets (uint64) |
|
||||
//
|
||||
func (c *chunkedContentCoder) Write() (int, error) {
|
||||
var tw int
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
// write out the number of chunks
|
||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
||||
nw, err := w.Write(buf[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
// write out the chunk lens
|
||||
for _, chunkLen := range c.chunkLens {
|
||||
n := binary.PutUvarint(buf, uint64(chunkLen))
|
||||
nw, err = w.Write(buf[:n])
|
||||
|
||||
if c.final != nil {
|
||||
// write out the data section first
|
||||
nw, err := c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
}
|
||||
// write out the data
|
||||
nw, err = w.Write(c.final)
|
||||
|
||||
chunkOffsetsStart := uint64(tw)
|
||||
|
||||
if cap(c.final) < binary.MaxVarintLen64 {
|
||||
c.final = make([]byte, binary.MaxVarintLen64)
|
||||
} else {
|
||||
c.final = c.final[0:binary.MaxVarintLen64]
|
||||
}
|
||||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||
// write out the chunk offsets
|
||||
for _, chunkOffset := range chunkOffsets {
|
||||
n := binary.PutUvarint(c.final, chunkOffset)
|
||||
nw, err := c.w.Write(c.final[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
}
|
||||
|
||||
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
|
||||
|
||||
c.final = c.final[0:8]
|
||||
// write out the length of chunk offsets
|
||||
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
|
||||
nw, err := c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
// write out the number of chunks
|
||||
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
|
||||
nw, err = c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
c.final = c.final[:0]
|
||||
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
// ReadDocValueBoundary elicits the start, end offsets from a
|
||||
// metaData header slice
|
||||
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
|
||||
var start uint64
|
||||
if chunk > 0 {
|
||||
start = metaHeaders[chunk-1].DocDvOffset
|
||||
}
|
||||
return start, metaHeaders[chunk].DocDvOffset
|
||||
}
|
||||
|
|
10
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go
generated
vendored
10
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/count.go
generated
vendored
|
@ -17,6 +17,8 @@ package zap
|
|||
import (
|
||||
"hash/crc32"
|
||||
"io"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// CountHashWriter is a wrapper around a Writer which counts the number of
|
||||
|
@ -25,6 +27,7 @@ type CountHashWriter struct {
|
|||
w io.Writer
|
||||
crc uint32
|
||||
n int
|
||||
s segment.StatsReporter
|
||||
}
|
||||
|
||||
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
|
||||
|
@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter {
|
|||
return &CountHashWriter{w: w}
|
||||
}
|
||||
|
||||
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter {
|
||||
return &CountHashWriter{w: w, s: s}
|
||||
}
|
||||
|
||||
// Write writes the provided bytes to the wrapped writer and counts the bytes
|
||||
func (c *CountHashWriter) Write(b []byte) (int, error) {
|
||||
n, err := c.w.Write(b)
|
||||
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
|
||||
c.n += n
|
||||
if c.s != nil {
|
||||
c.s.ReportBytesWritten(uint64(n))
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
|
|
151
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go
generated
vendored
151
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/dict.go
generated
vendored
|
@ -15,38 +15,51 @@
|
|||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
// Dictionary is the zap representation of the term dictionary
|
||||
type Dictionary struct {
|
||||
sb *SegmentBase
|
||||
field string
|
||||
fieldID uint16
|
||||
fst *vellum.FST
|
||||
sb *SegmentBase
|
||||
field string
|
||||
fieldID uint16
|
||||
fst *vellum.FST
|
||||
fstReader *vellum.Reader
|
||||
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
|
||||
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
return d.postingsList([]byte(term), except, nil)
|
||||
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
|
||||
prealloc segment.PostingsList) (segment.PostingsList, error) {
|
||||
var preallocPL *PostingsList
|
||||
pl, ok := prealloc.(*PostingsList)
|
||||
if ok && pl != nil {
|
||||
preallocPL = pl
|
||||
}
|
||||
return d.postingsList(term, except, preallocPL)
|
||||
}
|
||||
|
||||
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
|
||||
if d.fst == nil {
|
||||
if d.fstReader == nil {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
return emptyPostingsList, nil
|
||||
}
|
||||
return d.postingsListInit(rv, except), nil
|
||||
}
|
||||
|
||||
postingsOffset, exists, err := d.fst.Get(term)
|
||||
postingsOffset, exists, err := d.fstReader.Get(term)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vellum err: %v", err)
|
||||
}
|
||||
if !exists {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
return emptyPostingsList, nil
|
||||
}
|
||||
return d.postingsListInit(rv, except), nil
|
||||
}
|
||||
|
||||
|
@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
|
|||
}
|
||||
|
||||
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
|
||||
if rv == nil {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
rv = &PostingsList{}
|
||||
} else {
|
||||
postings := rv.postings
|
||||
if postings != nil {
|
||||
postings.Clear()
|
||||
}
|
||||
|
||||
*rv = PostingsList{} // clear the struct
|
||||
|
||||
rv.postings = postings
|
||||
}
|
||||
rv.sb = d.sb
|
||||
rv.except = except
|
||||
|
@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
|||
itr, err := d.fst.Iterator(nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
|||
d: d,
|
||||
}
|
||||
|
||||
kBeg := []byte(prefix)
|
||||
kEnd := segment.IncrementBytes(kBeg)
|
||||
|
||||
if d.fst != nil {
|
||||
r, err := regexp.New(prefix + ".*")
|
||||
itr, err := d.fst.Iterator(kBeg, kEnd)
|
||||
if err == nil {
|
||||
itr, err := d.fst.Search(r, nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
|||
itr, err := d.fst.Iterator([]byte(start), endBytes)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// AutomatonIterator returns an iterator which only visits terms
|
||||
// having the the vellum automaton and start/end key range
|
||||
func (d *Dictionary) AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
|
||||
includeCount bool) segment.DictionaryIterator {
|
||||
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
omitCount: !includeCount,
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
builder, err := vellum.New(&buf, nil)
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
for _, term := range onlyTerms {
|
||||
err = builder.Insert(term, 0)
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
}
|
||||
err = builder.Close()
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
|
||||
onlyFST, err := vellum.Load(buf.Bytes())
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
|
||||
itr, err := d.fst.Search(onlyFST, nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
itr vellum.Iterator
|
||||
err error
|
||||
tmp PostingsList
|
||||
d *Dictionary
|
||||
itr vellum.Iterator
|
||||
err error
|
||||
tmp PostingsList
|
||||
entry index.DictEntry
|
||||
omitCount bool
|
||||
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||
return nil, nil
|
||||
} else if i.err != nil {
|
||||
if i.err != nil && i.err != vellum.ErrIteratorDone {
|
||||
return nil, i.err
|
||||
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||
return nil, nil
|
||||
}
|
||||
term, postingsOffset := i.itr.Current()
|
||||
i.err = i.tmp.read(postingsOffset, i.d)
|
||||
if i.err != nil {
|
||||
return nil, i.err
|
||||
}
|
||||
rv := &index.DictEntry{
|
||||
Term: string(term),
|
||||
Count: i.tmp.Count(),
|
||||
i.entry.Term = string(term)
|
||||
if !i.omitCount {
|
||||
i.err = i.tmp.read(postingsOffset, i.d)
|
||||
if i.err != nil {
|
||||
return nil, i.err
|
||||
}
|
||||
i.entry.Count = i.tmp.Count()
|
||||
}
|
||||
i.err = i.itr.Next()
|
||||
return rv, nil
|
||||
return &i.entry, nil
|
||||
}
|
||||
|
|
262
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go
generated
vendored
262
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go
generated
vendored
|
@ -19,93 +19,129 @@ import (
|
|||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
type docValueIterator struct {
|
||||
var reflectStaticSizedocValueReader int
|
||||
|
||||
func init() {
|
||||
var dvi docValueReader
|
||||
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
|
||||
}
|
||||
|
||||
type docNumTermsVisitor func(docNum uint64, terms []byte) error
|
||||
|
||||
type docVisitState struct {
|
||||
dvrs map[uint16]*docValueReader
|
||||
segment *Segment
|
||||
}
|
||||
|
||||
type docValueReader struct {
|
||||
field string
|
||||
curChunkNum uint64
|
||||
numChunks uint64
|
||||
chunkLens []uint64
|
||||
chunkOffsets []uint64
|
||||
dvDataLoc uint64
|
||||
curChunkHeader []MetaData
|
||||
curChunkData []byte // compressed data cache
|
||||
uncompressed []byte // temp buf for snappy decompression
|
||||
}
|
||||
|
||||
func (di *docValueIterator) sizeInBytes() uint64 {
|
||||
// curChunkNum, numChunks, dvDataLoc --> uint64
|
||||
sizeInBytes := 24
|
||||
|
||||
// field
|
||||
sizeInBytes += (len(di.field) + int(segment.SizeOfString))
|
||||
|
||||
// chunkLens, curChunkHeader
|
||||
sizeInBytes += len(di.chunkLens)*8 +
|
||||
len(di.curChunkHeader)*24 +
|
||||
int(segment.SizeOfSlice*2) /* overhead from slices */
|
||||
|
||||
// curChunkData is mmap'ed, not included
|
||||
|
||||
return uint64(sizeInBytes)
|
||||
func (di *docValueReader) size() int {
|
||||
return reflectStaticSizedocValueReader + size.SizeOfPtr +
|
||||
len(di.field) +
|
||||
len(di.chunkOffsets)*size.SizeOfUint64 +
|
||||
len(di.curChunkHeader)*reflectStaticSizeMetaData +
|
||||
len(di.curChunkData)
|
||||
}
|
||||
|
||||
func (di *docValueIterator) fieldName() string {
|
||||
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
|
||||
if rv == nil {
|
||||
rv = &docValueReader{}
|
||||
}
|
||||
|
||||
rv.field = di.field
|
||||
rv.curChunkNum = math.MaxUint64
|
||||
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
|
||||
rv.dvDataLoc = di.dvDataLoc
|
||||
rv.curChunkHeader = rv.curChunkHeader[:0]
|
||||
rv.curChunkData = nil
|
||||
rv.uncompressed = rv.uncompressed[:0]
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (di *docValueReader) fieldName() string {
|
||||
return di.field
|
||||
}
|
||||
|
||||
func (di *docValueIterator) curChunkNumber() uint64 {
|
||||
func (di *docValueReader) curChunkNumber() uint64 {
|
||||
return di.curChunkNum
|
||||
}
|
||||
|
||||
func (s *SegmentBase) loadFieldDocValueIterator(field string,
|
||||
fieldDvLoc uint64) (*docValueIterator, error) {
|
||||
func (s *SegmentBase) loadFieldDocValueReader(field string,
|
||||
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
|
||||
// get the docValue offset for the given fields
|
||||
if fieldDvLoc == fieldNotUninverted {
|
||||
return nil, fmt.Errorf("loadFieldDocValueIterator: "+
|
||||
if fieldDvLocStart == fieldNotUninverted {
|
||||
return nil, fmt.Errorf("loadFieldDocValueReader: "+
|
||||
"no docValues found for field: %s", field)
|
||||
}
|
||||
|
||||
// read the number of chunks, chunk lengths
|
||||
var offset, clen uint64
|
||||
numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
return nil, fmt.Errorf("failed to read the field "+
|
||||
"doc values for field %s", field)
|
||||
}
|
||||
offset += uint64(read)
|
||||
// read the number of chunks, and chunk offsets position
|
||||
var numChunks, chunkOffsetsPosition uint64
|
||||
|
||||
fdvIter := &docValueIterator{
|
||||
curChunkNum: math.MaxUint64,
|
||||
field: field,
|
||||
chunkLens: make([]uint64, int(numChunks)),
|
||||
if fieldDvLocEnd-fieldDvLocStart > 16 {
|
||||
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
|
||||
// read the length of chunk offsets
|
||||
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
|
||||
// acquire position of chunk offsets
|
||||
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
|
||||
}
|
||||
|
||||
fdvIter := &docValueReader{
|
||||
curChunkNum: math.MaxUint64,
|
||||
field: field,
|
||||
chunkOffsets: make([]uint64, int(numChunks)),
|
||||
}
|
||||
|
||||
// read the chunk offsets
|
||||
var offset uint64
|
||||
for i := 0; i < int(numChunks); i++ {
|
||||
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
|
||||
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
return nil, fmt.Errorf("corrupted chunk length during segment load")
|
||||
return nil, fmt.Errorf("corrupted chunk offset during segment load")
|
||||
}
|
||||
fdvIter.chunkLens[i] = clen
|
||||
fdvIter.chunkOffsets[i] = loc
|
||||
offset += uint64(read)
|
||||
}
|
||||
|
||||
fdvIter.dvDataLoc = fieldDvLoc + offset
|
||||
// set the data offset
|
||||
fdvIter.dvDataLoc = fieldDvLocStart
|
||||
|
||||
return fdvIter, nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) loadDvChunk(chunkNumber,
|
||||
localDocNum uint64, s *SegmentBase) error {
|
||||
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
|
||||
// advance to the chunk where the docValues
|
||||
// reside for the given docNum
|
||||
destChunkDataLoc := di.dvDataLoc
|
||||
for i := 0; i < int(chunkNumber); i++ {
|
||||
destChunkDataLoc += di.chunkLens[i]
|
||||
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
|
||||
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
|
||||
if start >= end {
|
||||
di.curChunkHeader = di.curChunkHeader[:0]
|
||||
di.curChunkData = nil
|
||||
di.curChunkNum = chunkNumber
|
||||
di.uncompressed = di.uncompressed[:0]
|
||||
return nil
|
||||
}
|
||||
|
||||
curChunkSize := di.chunkLens[chunkNumber]
|
||||
destChunkDataLoc += start
|
||||
curChunkEnd += end
|
||||
|
||||
// read the number of docs reside in the chunk
|
||||
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
|
@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
|
|||
chunkMetaLoc := destChunkDataLoc + uint64(read)
|
||||
|
||||
offset := uint64(0)
|
||||
di.curChunkHeader = make([]MetaData, int(numDocs))
|
||||
if cap(di.curChunkHeader) < int(numDocs) {
|
||||
di.curChunkHeader = make([]MetaData, int(numDocs))
|
||||
} else {
|
||||
di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
|
||||
}
|
||||
for i := 0; i < int(numDocs); i++ {
|
||||
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
}
|
||||
|
||||
compressedDataLoc := chunkMetaLoc + offset
|
||||
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
|
||||
dataLength := curChunkEnd - compressedDataLoc
|
||||
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
|
||||
di.curChunkNum = chunkNumber
|
||||
di.uncompressed = di.uncompressed[:0]
|
||||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) visitDocValues(docNum uint64,
|
||||
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
|
||||
for i := 0; i < len(di.chunkOffsets); i++ {
|
||||
err := di.loadDvChunk(uint64(i), s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// uncompress the already loaded data
|
||||
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
di.uncompressed = uncompressed
|
||||
|
||||
start := uint64(0)
|
||||
for _, entry := range di.curChunkHeader {
|
||||
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start = entry.DocDvOffset
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueReader) visitDocValues(docNum uint64,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
// binary search the term locations for the docNum
|
||||
start, length := di.getDocValueLocs(docNum)
|
||||
if start == math.MaxUint64 || length == math.MaxUint64 {
|
||||
start, end := di.getDocValueLocs(docNum)
|
||||
if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
|
||||
return nil
|
||||
}
|
||||
// uncompress the already loaded data
|
||||
uncompressed, err := snappy.Decode(nil, di.curChunkData)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
var uncompressed []byte
|
||||
var err error
|
||||
// use the uncompressed copy if available
|
||||
if len(di.uncompressed) > 0 {
|
||||
uncompressed = di.uncompressed
|
||||
} else {
|
||||
// uncompress the already loaded data
|
||||
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
di.uncompressed = uncompressed
|
||||
}
|
||||
|
||||
// pick the terms for the given docNum
|
||||
uncompressed = uncompressed[start : start+length]
|
||||
uncompressed = uncompressed[start:end]
|
||||
for {
|
||||
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
|
@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
|
|||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
||||
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
||||
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
|
||||
return di.curChunkHeader[i].DocNum >= docNum
|
||||
})
|
||||
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
|
||||
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
|
||||
return ReadDocValueBoundary(i, di.curChunkHeader)
|
||||
}
|
||||
return math.MaxUint64, math.MaxUint64
|
||||
}
|
||||
|
||||
// VisitDocumentFieldTerms is an implementation of the
|
||||
// DocumentFieldTermVisitable interface
|
||||
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
fieldIDPlus1 := uint16(0)
|
||||
ok := true
|
||||
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
|
||||
segment.DocVisitState, error) {
|
||||
dvs, ok := dvsIn.(*docVisitState)
|
||||
if !ok || dvs == nil {
|
||||
dvs = &docVisitState{}
|
||||
} else {
|
||||
if dvs.segment != s {
|
||||
dvs.segment = s
|
||||
dvs.dvrs = nil
|
||||
}
|
||||
}
|
||||
|
||||
var fieldIDPlus1 uint16
|
||||
if dvs.dvrs == nil {
|
||||
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
|
||||
for _, field := range fields {
|
||||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||
continue
|
||||
}
|
||||
fieldID := fieldIDPlus1 - 1
|
||||
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
|
||||
dvIter != nil {
|
||||
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find the chunkNumber where the docValues are stored
|
||||
docInChunk := localDocNum / uint64(s.chunkFactor)
|
||||
var dvr *docValueReader
|
||||
for _, field := range fields {
|
||||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||
continue
|
||||
}
|
||||
// find the chunkNumber where the docValues are stored
|
||||
docInChunk := localDocNum / uint64(s.chunkFactor)
|
||||
|
||||
if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists &&
|
||||
dvIter != nil {
|
||||
fieldID := fieldIDPlus1 - 1
|
||||
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
|
||||
// check if the chunk is already loaded
|
||||
if docInChunk != dvIter.curChunkNumber() {
|
||||
err := dvIter.loadDvChunk(docInChunk, localDocNum, s)
|
||||
if docInChunk != dvr.curChunkNumber() {
|
||||
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
|
||||
if err != nil {
|
||||
continue
|
||||
return dvs, err
|
||||
}
|
||||
}
|
||||
|
||||
_ = dvIter.visitDocValues(localDocNum, visitor)
|
||||
_ = dvr.visitDocValues(localDocNum, visitor)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return dvs, nil
|
||||
}
|
||||
|
||||
// VisitableDocValueFields returns the list of fields with
|
||||
// persisted doc value terms ready to be visitable using the
|
||||
// VisitDocumentFieldTerms method.
|
||||
func (s *Segment) VisitableDocValueFields() ([]string, error) {
|
||||
var rv []string
|
||||
for fieldID, field := range s.fieldsInv {
|
||||
if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok &&
|
||||
dvIter != nil {
|
||||
rv = append(rv, field)
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
return s.fieldDvNames, nil
|
||||
}
|
||||
|
|
16
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go
generated
vendored
16
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/enumerator.go
generated
vendored
|
@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
|
|||
for i, itr := range rv.itrs {
|
||||
rv.currKs[i], rv.currVs[i] = itr.Current()
|
||||
}
|
||||
rv.updateMatches()
|
||||
if rv.lowK == nil {
|
||||
rv.updateMatches(false)
|
||||
if rv.lowK == nil && len(rv.lowIdxs) == 0 {
|
||||
return rv, vellum.ErrIteratorDone
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// updateMatches maintains the low key matches based on the currKs
|
||||
func (m *enumerator) updateMatches() {
|
||||
func (m *enumerator) updateMatches(skipEmptyKey bool) {
|
||||
m.lowK = nil
|
||||
m.lowIdxs = m.lowIdxs[:0]
|
||||
m.lowCurr = 0
|
||||
|
||||
for i, key := range m.currKs {
|
||||
if key == nil {
|
||||
if (key == nil && m.currVs[i] == 0) || // in case of empty iterator
|
||||
(len(key) == 0 && skipEmptyKey) { // skip empty keys
|
||||
continue
|
||||
}
|
||||
|
||||
cmp := bytes.Compare(key, m.lowK)
|
||||
if cmp < 0 || m.lowK == nil {
|
||||
if cmp < 0 || len(m.lowIdxs) == 0 {
|
||||
// reached a new low
|
||||
m.lowK = key
|
||||
m.lowIdxs = m.lowIdxs[:0]
|
||||
|
@ -102,9 +103,10 @@ func (m *enumerator) Next() error {
|
|||
}
|
||||
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
|
||||
}
|
||||
m.updateMatches()
|
||||
// can skip any empty keys encountered at this point
|
||||
m.updateMatches(true)
|
||||
}
|
||||
if m.lowK == nil {
|
||||
if m.lowK == nil && len(m.lowIdxs) == 0 {
|
||||
return vellum.ErrIteratorDone
|
||||
}
|
||||
return nil
|
||||
|
|
83
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go
generated
vendored
83
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/intcoder.go
generated
vendored
|
@ -18,16 +18,12 @@ import (
|
|||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
|
||||
"github.com/Smerity/govarint"
|
||||
)
|
||||
|
||||
type chunkedIntCoder struct {
|
||||
final []byte
|
||||
maxDocNum uint64
|
||||
chunkSize uint64
|
||||
chunkBuf bytes.Buffer
|
||||
encoder *govarint.Base128Encoder
|
||||
chunkLens []uint64
|
||||
currChunk uint64
|
||||
|
||||
|
@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
|
|||
total := maxDocNum/chunkSize + 1
|
||||
rv := &chunkedIntCoder{
|
||||
chunkSize: chunkSize,
|
||||
maxDocNum: maxDocNum,
|
||||
chunkLens: make([]uint64, total),
|
||||
final: make([]byte, 0, 64),
|
||||
}
|
||||
rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)
|
||||
|
||||
return rv
|
||||
}
|
||||
|
@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
|||
chunk := docNum / c.chunkSize
|
||||
if chunk != c.currChunk {
|
||||
// starting a new chunk
|
||||
if c.encoder != nil {
|
||||
// close out last
|
||||
c.Close()
|
||||
c.chunkBuf.Reset()
|
||||
}
|
||||
c.Close()
|
||||
c.chunkBuf.Reset()
|
||||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
if len(c.buf) < binary.MaxVarintLen64 {
|
||||
c.buf = make([]byte, binary.MaxVarintLen64)
|
||||
}
|
||||
|
||||
for _, val := range vals {
|
||||
_, err := c.encoder.PutU64(val)
|
||||
wb := binary.PutUvarint(c.buf, val)
|
||||
_, err := c.chunkBuf.Write(c.buf[:wb])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
|
||||
chunk := docNum / c.chunkSize
|
||||
if chunk != c.currChunk {
|
||||
// starting a new chunk
|
||||
c.Close()
|
||||
c.chunkBuf.Reset()
|
||||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
_, err := c.chunkBuf.Write(buf)
|
||||
return err
|
||||
}
|
||||
|
||||
// Close indicates you are done calling Add() this allows the final chunk
|
||||
// to be encoded.
|
||||
func (c *chunkedIntCoder) Close() {
|
||||
c.encoder.Close()
|
||||
encodingBytes := c.chunkBuf.Bytes()
|
||||
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
|
||||
c.final = append(c.final, encodingBytes...)
|
||||
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
|
||||
}
|
||||
|
||||
// Write commits all the encoded chunked integers to the provided writer.
|
||||
|
@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
|||
}
|
||||
buf := c.buf
|
||||
|
||||
// write out the number of chunks & each chunkLen
|
||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
||||
for _, chunkLen := range c.chunkLens {
|
||||
n += binary.PutUvarint(buf[n:], uint64(chunkLen))
|
||||
// convert the chunk lengths into chunk offsets
|
||||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||
|
||||
// write out the number of chunks & each chunk offsets
|
||||
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
|
||||
for _, chunkOffset := range chunkOffsets {
|
||||
n += binary.PutUvarint(buf[n:], chunkOffset)
|
||||
}
|
||||
|
||||
tw, err := w.Write(buf[:n])
|
||||
|
@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
|||
}
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
func (c *chunkedIntCoder) FinalSize() int {
|
||||
return len(c.final)
|
||||
}
|
||||
|
||||
// modifyLengthsToEndOffsets converts the chunk length array
|
||||
// to a chunk offset array. The readChunkBoundary
|
||||
// will figure out the start and end of every chunk from
|
||||
// these offsets. Starting offset of i'th index is stored
|
||||
// in i-1'th position except for 0'th index and ending offset
|
||||
// is stored at i'th index position.
|
||||
// For 0'th element, starting position is always zero.
|
||||
// eg:
|
||||
// Lens -> 5 5 5 5 => 5 10 15 20
|
||||
// Lens -> 0 5 0 5 => 0 5 5 10
|
||||
// Lens -> 0 0 0 5 => 0 0 0 5
|
||||
// Lens -> 5 0 0 0 => 5 5 5 5
|
||||
// Lens -> 0 5 0 0 => 0 5 5 5
|
||||
// Lens -> 0 0 5 0 => 0 0 5 5
|
||||
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
|
||||
var runningOffset uint64
|
||||
var index, i int
|
||||
for i = 1; i <= len(lengths); i++ {
|
||||
runningOffset += lengths[i-1]
|
||||
lengths[index] = runningOffset
|
||||
index++
|
||||
}
|
||||
return lengths
|
||||
}
|
||||
|
||||
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
|
||||
var start uint64
|
||||
if chunk > 0 {
|
||||
start = offsets[chunk-1]
|
||||
}
|
||||
return start, offsets[chunk]
|
||||
}
|
||||
|
|
576
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go
generated
vendored
576
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/merge.go
generated
vendored
|
@ -24,11 +24,13 @@ import (
|
|||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
seg "github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var DefaultFileMergerBufferSize = 1024 * 1024
|
||||
|
||||
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
||||
|
||||
// Merge takes a slice of zap segments and bit masks describing which
|
||||
|
@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
|||
// remaining data. This new segment is built at the specified path,
|
||||
// with the provided chunkFactor.
|
||||
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
||||
chunkFactor uint32) ([][]uint64, error) {
|
||||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||
[][]uint64, uint64, error) {
|
||||
segmentBases := make([]*SegmentBase, len(segments))
|
||||
for segmenti, segment := range segments {
|
||||
segmentBases[segmenti] = &segment.SegmentBase
|
||||
}
|
||||
|
||||
return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s)
|
||||
}
|
||||
|
||||
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
|
||||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||
[][]uint64, uint64, error) {
|
||||
flag := os.O_RDWR | os.O_CREATE
|
||||
|
||||
f, err := os.OpenFile(path, flag, 0600)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
cleanup := func() {
|
||||
|
@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
|||
_ = os.Remove(path)
|
||||
}
|
||||
|
||||
segmentBases := make([]*SegmentBase, len(segments))
|
||||
for segmenti, segment := range segments {
|
||||
segmentBases[segmenti] = &segment.SegmentBase
|
||||
}
|
||||
|
||||
// buffer the output
|
||||
br := bufio.NewWriter(f)
|
||||
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
|
||||
|
||||
// wrap it for counting (tracking offsets)
|
||||
cr := NewCountHashWriter(br)
|
||||
cr := NewCountHashWriterWithStatsReporter(br, s)
|
||||
|
||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
|
||||
MergeToWriter(segmentBases, drops, chunkFactor, cr)
|
||||
MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, chunkFactor, cr.Sum32(), cr)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = br.Flush()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = f.Sync()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = f.Close()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return newDocNums, nil
|
||||
return newDocNums, uint64(cr.Count()), nil
|
||||
}
|
||||
|
||||
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||
chunkFactor uint32, cr *CountHashWriter) (
|
||||
chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) (
|
||||
newDocNums [][]uint64,
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
||||
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
|
||||
|
@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
fieldsMap = mapFields(fieldsInv)
|
||||
|
||||
numDocs = computeNewDocCount(segments, drops)
|
||||
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed
|
||||
}
|
||||
|
||||
if numDocs > 0 {
|
||||
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
||||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr)
|
||||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
}
|
||||
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
|
||||
newDocNums, numDocs, chunkFactor, cr)
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
|
||||
fieldsInv, fieldsMap, fieldsSame,
|
||||
newDocNums, numDocs, chunkFactor, cr, closeCh)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
}
|
||||
|
@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
|
|||
}
|
||||
|
||||
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64,
|
||||
newSegDocCount uint64, chunkFactor uint32,
|
||||
w *CountHashWriter) ([]uint64, uint64, error) {
|
||||
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
|
||||
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
|
||||
w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) {
|
||||
|
||||
var bufReuse bytes.Buffer
|
||||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
||||
var bufLoc []uint64
|
||||
|
||||
|
@ -168,28 +182,22 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
var postItr *PostingsIterator
|
||||
|
||||
rv := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocs := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocsStart := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
|
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
|
||||
// docTermMap is keyed by docNum, where the array impl provides
|
||||
// better memory usage behavior than a sparse-friendlier hashmap
|
||||
// for when docs have much structural similarity (i.e., every doc
|
||||
// has a given field)
|
||||
var docTermMap [][]byte
|
||||
|
||||
var vellumBuf bytes.Buffer
|
||||
newVellum, err := vellum.New(&vellumBuf, nil)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
newRoaring := roaring.NewBitmap()
|
||||
|
||||
// for each field
|
||||
for fieldID, fieldName := range fieldsInv {
|
||||
if fieldID != 0 {
|
||||
vellumBuf.Reset()
|
||||
}
|
||||
newVellum, err := vellum.New(&vellumBuf, nil)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// collect FST iterators from all active segments for this field
|
||||
var newDocNums [][]uint64
|
||||
|
@ -197,7 +205,15 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
var dicts []*Dictionary
|
||||
var itrs []vellum.Iterator
|
||||
|
||||
var segmentsInFocus []*SegmentBase
|
||||
|
||||
for segmentI, segment := range segments {
|
||||
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
dict, err2 := segment.dictionary(fieldName)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
|
@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
}
|
||||
if itr != nil {
|
||||
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
|
||||
drops = append(drops, dropsIn[segmentI])
|
||||
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
|
||||
drops = append(drops, dropsIn[segmentI])
|
||||
} else {
|
||||
drops = append(drops, nil)
|
||||
}
|
||||
dicts = append(dicts, dict)
|
||||
itrs = append(itrs, itr)
|
||||
segmentsInFocus = append(segmentsInFocus, segment)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if uint64(cap(docTermMap)) < newSegDocCount {
|
||||
docTermMap = make([][]byte, newSegDocCount)
|
||||
} else {
|
||||
docTermMap = docTermMap[0:newSegDocCount]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
var prevTerm []byte
|
||||
|
||||
newRoaring := roaring.NewBitmap()
|
||||
newRoaringLocs := roaring.NewBitmap()
|
||||
newRoaring.Clear()
|
||||
|
||||
var lastDocNum, lastFreq, lastNorm uint64
|
||||
|
||||
// determines whether to use "1-hit" encoding optimization
|
||||
// when a term appears in only 1 doc, with no loc info,
|
||||
// has freq of 1, and the docNum fits into 31-bits
|
||||
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
|
||||
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
|
||||
docNum := uint64(newRoaring.Minimum())
|
||||
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
|
||||
return true, docNum, lastNorm
|
||||
}
|
||||
}
|
||||
return false, 0, 0
|
||||
}
|
||||
|
||||
finishTerm := func(term []byte) error {
|
||||
if term == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
if newRoaring.GetCardinality() > 0 {
|
||||
// this field/term actually has hits in the new segment, lets write it down
|
||||
freqOffset := uint64(w.Count())
|
||||
_, err := tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
locOffset := uint64(w.Count())
|
||||
_, err = locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
postingLocOffset := uint64(w.Count())
|
||||
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
postingOffset := uint64(w.Count())
|
||||
postingsOffset, err := writePostings(newRoaring,
|
||||
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// write out the start of the term info
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the start of the loc info
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the start of the posting locs
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = newVellum.Insert(term, postingOffset)
|
||||
if postingsOffset > 0 {
|
||||
err = newVellum.Insert(term, postingsOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
newRoaring = roaring.NewBitmap()
|
||||
newRoaringLocs = roaring.NewBitmap()
|
||||
newRoaring.Clear()
|
||||
|
||||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
|
||||
lastDocNum = 0
|
||||
lastFreq = 0
|
||||
lastNorm = 0
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -301,66 +291,39 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
term, itrI, postingsOffset := enumerator.Current()
|
||||
|
||||
if !bytes.Equal(prevTerm, term) {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
// if the term changed, write out the info collected
|
||||
// for the previous term
|
||||
err2 := finishTerm(prevTerm)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
}
|
||||
}
|
||||
|
||||
var err2 error
|
||||
postings, err2 = dicts[itrI].postingsListFromOffset(
|
||||
postingsOffset, drops[itrI], postings)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
}
|
||||
|
||||
newDocNumsI := newDocNums[itrI]
|
||||
|
||||
postItr = postings.iterator(postItr)
|
||||
next, err2 := postItr.Next()
|
||||
for next != nil && err2 == nil {
|
||||
hitNewDocNum := newDocNumsI[next.Number()]
|
||||
if hitNewDocNum == docDropped {
|
||||
return nil, 0, fmt.Errorf("see hit with dropped doc num")
|
||||
}
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
// encode norm bits
|
||||
norm := next.Norm()
|
||||
normBits := math.Float32bits(float32(norm))
|
||||
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
|
||||
err = finishTerm(prevTerm)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
locs := next.Locations()
|
||||
if len(locs) > 0 {
|
||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
||||
for _, loc := range locs {
|
||||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
|
||||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
|
||||
}
|
||||
args := bufLoc[0:5]
|
||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||
args[1] = loc.Pos()
|
||||
args[2] = loc.Start()
|
||||
args[3] = loc.End()
|
||||
args[4] = uint64(len(loc.ArrayPositions()))
|
||||
args = append(args, loc.ArrayPositions()...)
|
||||
err = locEncoder.Add(hitNewDocNum, args...)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docTermMap[hitNewDocNum] =
|
||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
||||
|
||||
next, err2 = postItr.Next()
|
||||
}
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
|
||||
postings, err = dicts[itrI].postingsListFromOffset(
|
||||
postingsOffset, drops[itrI], postings)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
postItr = postings.iterator(true, true, true, postItr)
|
||||
|
||||
if fieldsSame {
|
||||
// can optimize by copying freq/norm/loc bytes directly
|
||||
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
|
||||
term, postItr, newDocNums[itrI], newRoaring,
|
||||
tfEncoder, locEncoder)
|
||||
} else {
|
||||
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
|
||||
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
|
||||
tfEncoder, locEncoder, bufLoc)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
|
||||
|
@ -368,7 +331,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
|
||||
err = enumerator.Next()
|
||||
}
|
||||
if err != nil && err != vellum.ErrIteratorDone {
|
||||
if err != vellum.ErrIteratorDone {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
|
@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
|
||||
rv[fieldID] = dictOffset
|
||||
|
||||
// get the field doc value offset (start)
|
||||
fieldDvLocsStart[fieldID] = uint64(w.Count())
|
||||
|
||||
// update the field doc values
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true)
|
||||
|
||||
fdvReadersAvailable := false
|
||||
var dvIterClone *docValueReader
|
||||
for segmentI, segment := range segmentsInFocus {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
|
||||
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
|
||||
dvIter != nil {
|
||||
fdvReadersAvailable = true
|
||||
dvIterClone = dvIter.cloneInto(dvIterClone)
|
||||
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
|
||||
if newDocNums[segmentI][docNum] == docDropped {
|
||||
return nil
|
||||
}
|
||||
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
|
||||
if fdvReadersAvailable {
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// persist the doc value details for this field
|
||||
_, err = fdvEncoder.Write()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// get the field doc value offset (end)
|
||||
fieldDvLocsEnd[fieldID] = uint64(w.Count())
|
||||
} else {
|
||||
fieldDvLocsStart[fieldID] = fieldNotUninverted
|
||||
fieldDvLocsEnd[fieldID] = fieldNotUninverted
|
||||
}
|
||||
|
||||
// get the field doc value offset
|
||||
fieldDvLocs[fieldID] = uint64(w.Count())
|
||||
|
||||
// persist the doc value details for this field
|
||||
_, err = fdvEncoder.Write(w)
|
||||
// reset vellum buffer and vellum builder
|
||||
vellumBuf.Reset()
|
||||
err = newVellum.Reset(&vellumBuf)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
fieldDvLocsOffset := uint64(w.Count())
|
||||
|
||||
buf := bufMaxVarintLen64
|
||||
for _, offset := range fieldDvLocs {
|
||||
n := binary.PutUvarint(buf, uint64(offset))
|
||||
for i := 0; i < len(fieldDvLocsStart); i++ {
|
||||
n := binary.PutUvarint(buf, fieldDvLocsStart[i])
|
||||
_, err := w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, fieldDvLocsOffset, nil
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
|
||||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
|
||||
next, err := postItr.Next()
|
||||
for next != nil && err == nil {
|
||||
hitNewDocNum := newDocNums[next.Number()]
|
||||
if hitNewDocNum == docDropped {
|
||||
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
|
||||
}
|
||||
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
|
||||
nextFreq := next.Frequency()
|
||||
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
|
||||
|
||||
locs := next.Locations()
|
||||
|
||||
err = tfEncoder.Add(hitNewDocNum,
|
||||
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
if len(locs) > 0 {
|
||||
numBytesLocs := 0
|
||||
for _, loc := range locs {
|
||||
ap := loc.ArrayPositions()
|
||||
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
|
||||
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap)
|
||||
}
|
||||
|
||||
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
for _, loc := range locs {
|
||||
ap := loc.ArrayPositions()
|
||||
if cap(bufLoc) < 5+len(ap) {
|
||||
bufLoc = make([]uint64, 0, 5+len(ap))
|
||||
}
|
||||
args := bufLoc[0:5]
|
||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||
args[1] = loc.Pos()
|
||||
args[2] = loc.Start()
|
||||
args[3] = loc.End()
|
||||
args[4] = uint64(len(ap))
|
||||
args = append(args, ap...)
|
||||
err = locEncoder.Add(hitNewDocNum, args...)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
next, err = postItr.Next()
|
||||
}
|
||||
|
||||
return lastDocNum, lastFreq, lastNorm, bufLoc, err
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) (
|
||||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) {
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err :=
|
||||
postItr.nextBytes()
|
||||
for err == nil && len(nextFreqNormBytes) > 0 {
|
||||
hitNewDocNum := newDocNums[nextDocNum]
|
||||
if hitNewDocNum == docDropped {
|
||||
return 0, 0, 0, fmt.Errorf("see hit with dropped doc num")
|
||||
}
|
||||
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes)
|
||||
if err != nil {
|
||||
return 0, 0, 0, err
|
||||
}
|
||||
|
||||
if len(nextLocBytes) > 0 {
|
||||
err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes)
|
||||
if err != nil {
|
||||
return 0, 0, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err =
|
||||
postItr.nextBytes()
|
||||
}
|
||||
|
||||
return lastDocNum, lastFreq, lastNorm, err
|
||||
}
|
||||
|
||||
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
|
||||
use1HitEncoding func(uint64) (bool, uint64, uint64),
|
||||
w *CountHashWriter, bufMaxVarintLen64 []byte) (
|
||||
offset uint64, err error) {
|
||||
termCardinality := postings.GetCardinality()
|
||||
if termCardinality <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
if use1HitEncoding != nil {
|
||||
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
|
||||
if encodeAs1Hit {
|
||||
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil
|
||||
}
|
||||
}
|
||||
|
||||
tfOffset := uint64(w.Count())
|
||||
_, err = tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
locOffset := uint64(w.Count())
|
||||
_, err = locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
postingsOffset := uint64(w.Count())
|
||||
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return postingsOffset, nil
|
||||
}
|
||||
|
||||
type varintEncoder func(uint64) (int, error)
|
||||
|
||||
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
|
||||
w *CountHashWriter) (uint64, [][]uint64, error) {
|
||||
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
|
||||
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
||||
|
||||
var newDocNum uint64
|
||||
|
||||
var curr int
|
||||
var metaBuf bytes.Buffer
|
||||
var data, compressed []byte
|
||||
|
||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
||||
var metaBuf bytes.Buffer
|
||||
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||
metaEncode := func(val uint64) (int, error) {
|
||||
wb := binary.PutUvarint(varBuf, val)
|
||||
return metaBuf.Write(varBuf[:wb])
|
||||
}
|
||||
|
||||
vals := make([][][]byte, len(fieldsInv))
|
||||
typs := make([][]byte, len(fieldsInv))
|
||||
poss := make([][][]uint64, len(fieldsInv))
|
||||
|
||||
var posBuf []uint64
|
||||
|
||||
docNumOffsets := make([]uint64, newSegDocCount)
|
||||
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
defer visitDocumentCtxPool.Put(vdc)
|
||||
|
||||
// for each segment
|
||||
for segI, segment := range segments {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return 0, nil, seg.ErrClosed
|
||||
}
|
||||
|
||||
segNewDocNums := make([]uint64, segment.numDocs)
|
||||
|
||||
dropsI := drops[segI]
|
||||
|
@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
curr = 0
|
||||
metaBuf.Reset()
|
||||
data = data[:0]
|
||||
compressed = compressed[:0]
|
||||
|
||||
posTemp := posBuf
|
||||
|
||||
// collect all the data
|
||||
for i := 0; i < len(fieldsInv); i++ {
|
||||
|
@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
typs[i] = typs[i][:0]
|
||||
poss[i] = poss[i][:0]
|
||||
}
|
||||
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
fieldID := int(fieldsMap[field]) - 1
|
||||
vals[fieldID] = append(vals[fieldID], value)
|
||||
typs[fieldID] = append(typs[fieldID], typ)
|
||||
poss[fieldID] = append(poss[fieldID], pos)
|
||||
|
||||
// copy array positions to preserve them beyond the scope of this callback
|
||||
var curPos []uint64
|
||||
if len(pos) > 0 {
|
||||
if cap(posTemp) < len(pos) {
|
||||
posBuf = make([]uint64, len(pos)*len(fieldsInv))
|
||||
posTemp = posBuf
|
||||
}
|
||||
curPos = posTemp[0:len(pos)]
|
||||
copy(curPos, pos)
|
||||
posTemp = posTemp[len(pos):]
|
||||
}
|
||||
poss[fieldID] = append(poss[fieldID], curPos)
|
||||
|
||||
return true
|
||||
})
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// now walk the fields in order
|
||||
for fieldID := range fieldsInv {
|
||||
storedFieldValues := vals[int(fieldID)]
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := vals[uint16(0)][0]
|
||||
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
stf := typs[int(fieldID)]
|
||||
spf := poss[int(fieldID)]
|
||||
// now walk the non-"_id" fields in order
|
||||
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
|
||||
storedFieldValues := vals[fieldID]
|
||||
|
||||
stf := typs[fieldID]
|
||||
spf := poss[fieldID]
|
||||
|
||||
var err2 error
|
||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
||||
storedFieldValues, stf, spf, curr, metaEncode, data)
|
||||
if err2 != nil {
|
||||
return 0, nil, err2
|
||||
}
|
||||
}
|
||||
|
||||
metaEncoder.Close()
|
||||
metaBytes := metaBuf.Bytes()
|
||||
|
||||
compressed = snappy.Encode(compressed, data)
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||
|
||||
// record where we're about to start writing
|
||||
docNumOffsets[newDocNum] = uint64(w.Count())
|
||||
|
||||
// write out the meta len and compressed data len
|
||||
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
||||
_, err = writeUvarints(w,
|
||||
uint64(len(metaBytes)),
|
||||
uint64(len(idFieldVal)+len(compressed)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// now write the _id field val (counted as part of the 'compressed' data)
|
||||
_, err = w.Write(idFieldVal)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// now write the compressed data
|
||||
_, err = w.Write(compressed)
|
||||
if err != nil {
|
||||
|
@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) {
|
|||
|
||||
return fieldsSame, rv
|
||||
}
|
||||
|
||||
func isClosed(closeCh chan struct{}) bool {
|
||||
select {
|
||||
case <-closeCh:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
|
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
|
@ -0,0 +1,826 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var NewSegmentBufferNumResultsBump int = 100
|
||||
var NewSegmentBufferNumResultsFactor float64 = 1.0
|
||||
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
|
||||
|
||||
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||
// SegmentBase from analysis results
|
||||
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||
chunkFactor uint32) (*SegmentBase, uint64, error) {
|
||||
s := interimPool.Get().(*interim)
|
||||
|
||||
var br bytes.Buffer
|
||||
if s.lastNumDocs > 0 {
|
||||
// use previous results to initialize the buf with an estimate
|
||||
// size, but note that the interim instance comes from a
|
||||
// global interimPool, so multiple scorch instances indexing
|
||||
// different docs can lead to low quality estimates
|
||||
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
|
||||
NewSegmentBufferNumResultsFactor)
|
||||
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
|
||||
NewSegmentBufferAvgBytesPerDocFactor)
|
||||
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
|
||||
}
|
||||
|
||||
s.results = results
|
||||
s.chunkFactor = chunkFactor
|
||||
s.w = NewCountHashWriter(&br)
|
||||
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
||||
err := s.convert()
|
||||
if err != nil {
|
||||
return nil, uint64(0), err
|
||||
}
|
||||
|
||||
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
|
||||
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
||||
|
||||
if err == nil && s.reset() == nil {
|
||||
s.lastNumDocs = len(results)
|
||||
s.lastOutSize = len(br.Bytes())
|
||||
interimPool.Put(s)
|
||||
}
|
||||
|
||||
return sb, uint64(len(br.Bytes())), err
|
||||
}
|
||||
|
||||
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
|
||||
|
||||
// interim holds temporary working data used while converting from
|
||||
// analysis results to a zap-encoded segment
|
||||
type interim struct {
|
||||
results []*index.AnalysisResult
|
||||
|
||||
chunkFactor uint32
|
||||
|
||||
w *CountHashWriter
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Fields whose IncludeDocValues is true
|
||||
// field id -> bool
|
||||
IncludeDocValues []bool
|
||||
|
||||
// postings id -> bitmap of docNums
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// postings id -> freq/norm's, one for each docNum in postings
|
||||
FreqNorms [][]interimFreqNorm
|
||||
freqNormsBacking []interimFreqNorm
|
||||
|
||||
// postings id -> locs, one for each freq
|
||||
Locs [][]interimLoc
|
||||
locsBacking []interimLoc
|
||||
|
||||
numTermsPerPostingsList []int // key is postings list id
|
||||
numLocsPerPostingsList []int // key is postings list id
|
||||
|
||||
builder *vellum.Builder
|
||||
builderBuf bytes.Buffer
|
||||
|
||||
metaBuf bytes.Buffer
|
||||
|
||||
tmp0 []byte
|
||||
tmp1 []byte
|
||||
|
||||
lastNumDocs int
|
||||
lastOutSize int
|
||||
}
|
||||
|
||||
func (s *interim) reset() (err error) {
|
||||
s.results = nil
|
||||
s.chunkFactor = 0
|
||||
s.w = nil
|
||||
s.FieldsMap = nil
|
||||
s.FieldsInv = nil
|
||||
for i := range s.Dicts {
|
||||
s.Dicts[i] = nil
|
||||
}
|
||||
s.Dicts = s.Dicts[:0]
|
||||
for i := range s.DictKeys {
|
||||
s.DictKeys[i] = s.DictKeys[i][:0]
|
||||
}
|
||||
s.DictKeys = s.DictKeys[:0]
|
||||
for i := range s.IncludeDocValues {
|
||||
s.IncludeDocValues[i] = false
|
||||
}
|
||||
s.IncludeDocValues = s.IncludeDocValues[:0]
|
||||
for _, idn := range s.Postings {
|
||||
idn.Clear()
|
||||
}
|
||||
s.Postings = s.Postings[:0]
|
||||
s.FreqNorms = s.FreqNorms[:0]
|
||||
for i := range s.freqNormsBacking {
|
||||
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||
}
|
||||
s.freqNormsBacking = s.freqNormsBacking[:0]
|
||||
s.Locs = s.Locs[:0]
|
||||
for i := range s.locsBacking {
|
||||
s.locsBacking[i] = interimLoc{}
|
||||
}
|
||||
s.locsBacking = s.locsBacking[:0]
|
||||
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
|
||||
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
|
||||
s.builderBuf.Reset()
|
||||
if s.builder != nil {
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
}
|
||||
s.metaBuf.Reset()
|
||||
s.tmp0 = s.tmp0[:0]
|
||||
s.tmp1 = s.tmp1[:0]
|
||||
s.lastNumDocs = 0
|
||||
s.lastOutSize = 0
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *interim) grabBuf(size int) []byte {
|
||||
buf := s.tmp0
|
||||
if cap(buf) < size {
|
||||
buf = make([]byte, size)
|
||||
s.tmp0 = buf
|
||||
}
|
||||
return buf[0:size]
|
||||
}
|
||||
|
||||
type interimStoredField struct {
|
||||
vals [][]byte
|
||||
typs []byte
|
||||
arrayposs [][]uint64 // array positions
|
||||
}
|
||||
|
||||
type interimFreqNorm struct {
|
||||
freq uint64
|
||||
norm float32
|
||||
numLocs int
|
||||
}
|
||||
|
||||
type interimLoc struct {
|
||||
fieldID uint16
|
||||
pos uint64
|
||||
start uint64
|
||||
end uint64
|
||||
arrayposs []uint64
|
||||
}
|
||||
|
||||
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||
s.FieldsMap = map[string]uint16{}
|
||||
|
||||
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||
|
||||
for _, result := range s.results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
|
||||
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
|
||||
} else {
|
||||
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
s.prepareDicts()
|
||||
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
s.processDocuments()
|
||||
|
||||
storedIndexOffset, err := s.writeStoredFields()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
var fdvIndexOffset uint64
|
||||
var dictOffsets []uint64
|
||||
|
||||
if len(s.results) > 0 {
|
||||
fdvIndexOffset, dictOffsets, err = s.writeDicts()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
} else {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func (s *interim) getOrDefineField(fieldName string) int {
|
||||
fieldIDPlus1, exists := s.FieldsMap[fieldName]
|
||||
if !exists {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[fieldName] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, fieldName)
|
||||
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
|
||||
n := len(s.DictKeys)
|
||||
if n < cap(s.DictKeys) {
|
||||
s.DictKeys = s.DictKeys[:n+1]
|
||||
s.DictKeys[n] = s.DictKeys[n][:0]
|
||||
} else {
|
||||
s.DictKeys = append(s.DictKeys, []string(nil))
|
||||
}
|
||||
}
|
||||
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
// fill Dicts and DictKeys from analysis results
|
||||
func (s *interim) prepareDicts() {
|
||||
var pidNext int
|
||||
|
||||
var totTFs int
|
||||
var totLocs int
|
||||
|
||||
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
dict := s.Dicts[fieldID]
|
||||
dictKeys := s.DictKeys[fieldID]
|
||||
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := dict[term]
|
||||
if !exists {
|
||||
pidNext++
|
||||
pidPlus1 = uint64(pidNext)
|
||||
|
||||
dict[term] = pidPlus1
|
||||
dictKeys = append(dictKeys, term)
|
||||
|
||||
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
|
||||
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
|
||||
}
|
||||
|
||||
pid := pidPlus1 - 1
|
||||
|
||||
s.numTermsPerPostingsList[pid] += 1
|
||||
s.numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
|
||||
totTFs += len(tfs)
|
||||
|
||||
s.DictKeys[fieldID] = dictKeys
|
||||
}
|
||||
|
||||
for _, result := range s.results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
numPostingsLists := pidNext
|
||||
|
||||
if cap(s.Postings) >= numPostingsLists {
|
||||
s.Postings = s.Postings[:numPostingsLists]
|
||||
} else {
|
||||
postings := make([]*roaring.Bitmap, numPostingsLists)
|
||||
copy(postings, s.Postings[:cap(s.Postings)])
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
if postings[i] == nil {
|
||||
postings[i] = roaring.New()
|
||||
}
|
||||
}
|
||||
s.Postings = postings
|
||||
}
|
||||
|
||||
if cap(s.FreqNorms) >= numPostingsLists {
|
||||
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||
} else {
|
||||
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.freqNormsBacking) >= totTFs {
|
||||
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
|
||||
} else {
|
||||
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
|
||||
}
|
||||
|
||||
freqNormsBacking := s.freqNormsBacking
|
||||
for pid, numTerms := range s.numTermsPerPostingsList {
|
||||
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
||||
freqNormsBacking = freqNormsBacking[numTerms:]
|
||||
}
|
||||
|
||||
if cap(s.Locs) >= numPostingsLists {
|
||||
s.Locs = s.Locs[:numPostingsLists]
|
||||
} else {
|
||||
s.Locs = make([][]interimLoc, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.locsBacking) >= totLocs {
|
||||
s.locsBacking = s.locsBacking[:totLocs]
|
||||
} else {
|
||||
s.locsBacking = make([]interimLoc, totLocs)
|
||||
}
|
||||
|
||||
locsBacking := s.locsBacking
|
||||
for pid, numLocs := range s.numLocsPerPostingsList {
|
||||
s.Locs[pid] = locsBacking[0:0]
|
||||
locsBacking = locsBacking[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocuments() {
|
||||
numFields := len(s.FieldsInv)
|
||||
reuseFieldLens := make([]int, numFields)
|
||||
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||
reuseFieldLens[i] = 0
|
||||
reuseFieldTFs[i] = nil
|
||||
}
|
||||
|
||||
s.processDocument(uint64(docNum), result,
|
||||
reuseFieldLens, reuseFieldTFs)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocument(docNum uint64,
|
||||
result *index.AnalysisResult,
|
||||
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
|
||||
visitField := func(fieldID uint16, fieldName string,
|
||||
ln int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[fieldID] += ln
|
||||
|
||||
existingFreqs := fieldTFs[fieldID]
|
||||
if existingFreqs != nil {
|
||||
existingFreqs.MergeAll(fieldName, tf)
|
||||
} else {
|
||||
fieldTFs[fieldID] = tf
|
||||
}
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln, tf := field.Analyze()
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// now that it's been rolled up into fieldTFs, walk that
|
||||
for fieldID, tfs := range fieldTFs {
|
||||
dict := s.Dicts[fieldID]
|
||||
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
|
||||
|
||||
for term, tf := range tfs {
|
||||
pid := dict[term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.Add(uint32(docNum))
|
||||
|
||||
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
||||
interimFreqNorm{
|
||||
freq: uint64(tf.Frequency()),
|
||||
norm: norm,
|
||||
numLocs: len(tf.Locations),
|
||||
})
|
||||
|
||||
if len(tf.Locations) > 0 {
|
||||
locs := s.Locs[pid]
|
||||
|
||||
for _, loc := range tf.Locations {
|
||||
var locf = uint16(fieldID)
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
var arrayposs []uint64
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
arrayposs = loc.ArrayPositions
|
||||
}
|
||||
locs = append(locs, interimLoc{
|
||||
fieldID: locf,
|
||||
pos: uint64(loc.Position),
|
||||
start: uint64(loc.Start),
|
||||
end: uint64(loc.End),
|
||||
arrayposs: arrayposs,
|
||||
})
|
||||
}
|
||||
|
||||
s.Locs[pid] = locs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) writeStoredFields() (
|
||||
storedIndexOffset uint64, err error) {
|
||||
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||
metaEncode := func(val uint64) (int, error) {
|
||||
wb := binary.PutUvarint(varBuf, val)
|
||||
return s.metaBuf.Write(varBuf[:wb])
|
||||
}
|
||||
|
||||
data, compressed := s.tmp0[:0], s.tmp1[:0]
|
||||
defer func() { s.tmp0, s.tmp1 = data, compressed }()
|
||||
|
||||
// keyed by docNum
|
||||
docStoredOffsets := make([]uint64, len(s.results))
|
||||
|
||||
// keyed by fieldID, for the current doc in the loop
|
||||
docStoredFields := map[uint16]interimStoredField{}
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for fieldID := range docStoredFields { // reset for next doc
|
||||
delete(docStoredFields, fieldID)
|
||||
}
|
||||
|
||||
for _, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
|
||||
opts := field.Options()
|
||||
|
||||
if opts.IsStored() {
|
||||
isf := docStoredFields[fieldID]
|
||||
isf.vals = append(isf.vals, field.Value())
|
||||
isf.typs = append(isf.typs, encodeFieldType(field))
|
||||
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
|
||||
docStoredFields[fieldID] = isf
|
||||
}
|
||||
|
||||
if opts.IncludeDocValues() {
|
||||
s.IncludeDocValues[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
var curr int
|
||||
|
||||
s.metaBuf.Reset()
|
||||
data = data[:0]
|
||||
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := docStoredFields[uint16(0)].vals[0]
|
||||
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// handle non-"_id" fields
|
||||
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
|
||||
isf, exists := docStoredFields[uint16(fieldID)]
|
||||
if exists {
|
||||
curr, data, err = persistStoredFieldValues(
|
||||
fieldID, isf.vals, isf.typs, isf.arrayposs,
|
||||
curr, metaEncode, data)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metaBytes := s.metaBuf.Bytes()
|
||||
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||
|
||||
docStoredOffsets[docNum] = uint64(s.w.Count())
|
||||
|
||||
_, err := writeUvarints(s.w,
|
||||
uint64(len(metaBytes)),
|
||||
uint64(len(idFieldVal)+len(compressed)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(metaBytes)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(idFieldVal)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(compressed)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
storedIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for _, docStoredOffset := range docStoredOffsets {
|
||||
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return storedIndexOffset, nil
|
||||
}
|
||||
|
||||
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
|
||||
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
|
||||
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
|
||||
|
||||
buf := s.grabBuf(binary.MaxVarintLen64)
|
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
|
||||
|
||||
var docTermMap [][]byte
|
||||
|
||||
if s.builder == nil {
|
||||
s.builder, err = vellum.New(&s.builderBuf, nil)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for fieldID, terms := range s.DictKeys {
|
||||
if cap(docTermMap) < len(s.results) {
|
||||
docTermMap = make([][]byte, len(s.results))
|
||||
} else {
|
||||
docTermMap = docTermMap[0:len(s.results)]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
dict := s.Dicts[fieldID]
|
||||
|
||||
for _, term := range terms { // terms are already sorted
|
||||
pid := dict[term] - 1
|
||||
|
||||
postingsBS := s.Postings[pid]
|
||||
|
||||
freqNorms := s.FreqNorms[pid]
|
||||
freqNormOffset := 0
|
||||
|
||||
locs := s.Locs[pid]
|
||||
locOffset := 0
|
||||
|
||||
postingsItr := postingsBS.Iterator()
|
||||
for postingsItr.HasNext() {
|
||||
docNum := uint64(postingsItr.Next())
|
||||
|
||||
freqNorm := freqNorms[freqNormOffset]
|
||||
|
||||
err = tfEncoder.Add(docNum,
|
||||
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
|
||||
uint64(math.Float32bits(freqNorm.norm)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if freqNorm.numLocs > 0 {
|
||||
numBytesLocs := 0
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
numBytesLocs += totalUvarintBytes(
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)), loc.arrayposs)
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, uint64(numBytesLocs))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
err = locEncoder.Add(docNum,
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, loc.arrayposs...)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
locOffset += freqNorm.numLocs
|
||||
}
|
||||
|
||||
freqNormOffset++
|
||||
|
||||
docTermMap[docNum] = append(
|
||||
append(docTermMap[docNum], term...),
|
||||
termSeparator)
|
||||
}
|
||||
|
||||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
postingsOffset, err :=
|
||||
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if postingsOffset > uint64(0) {
|
||||
err = s.builder.Insert([]byte(term), postingsOffset)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
}
|
||||
|
||||
err = s.builder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// record where this dictionary starts
|
||||
dictOffsets[fieldID] = uint64(s.w.Count())
|
||||
|
||||
vellumData := s.builderBuf.Bytes()
|
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(buf, uint64(len(vellumData)))
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write this vellum to disk
|
||||
_, err = s.w.Write(vellumData)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// reset vellum for reuse
|
||||
s.builderBuf.Reset()
|
||||
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write the field doc values
|
||||
if s.IncludeDocValues[fieldID] {
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
|
||||
|
||||
_, err = fdvEncoder.Write()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
|
||||
|
||||
fdvEncoder.Reset()
|
||||
} else {
|
||||
fdvOffsetsStart[fieldID] = fieldNotUninverted
|
||||
fdvOffsetsEnd[fieldID] = fieldNotUninverted
|
||||
}
|
||||
}
|
||||
|
||||
fdvIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for i := 0; i < len(fdvOffsetsStart); i++ {
|
||||
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
|
||||
_, err := s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
||||
|
||||
// returns the total # of bytes needed to encode the given uint64's
|
||||
// into binary.PutUVarint() encoding
|
||||
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
|
||||
n = numUvarintBytes(a)
|
||||
n += numUvarintBytes(b)
|
||||
n += numUvarintBytes(c)
|
||||
n += numUvarintBytes(d)
|
||||
n += numUvarintBytes(e)
|
||||
for _, v := range more {
|
||||
n += numUvarintBytes(v)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
|
||||
func numUvarintBytes(x uint64) (n int) {
|
||||
for x >= 0x80 {
|
||||
x >>= 7
|
||||
n++
|
||||
}
|
||||
return n + 1
|
||||
}
|
829
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go
generated
vendored
829
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/posting.go
generated
vendored
File diff suppressed because it is too large
Load diff
232
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go
generated
vendored
232
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/segment.go
generated
vendored
|
@ -20,16 +20,24 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"reflect"
|
||||
"sync"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/couchbase/vellum"
|
||||
mmap "github.com/edsrzf/mmap-go"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var reflectStaticSizeSegmentBase int
|
||||
|
||||
func init() {
|
||||
var sb SegmentBase
|
||||
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size())
|
||||
}
|
||||
|
||||
// Open returns a zap impl of a segment
|
||||
func Open(path string) (segment.Segment, error) {
|
||||
f, err := os.Open(path)
|
||||
|
@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) {
|
|||
SegmentBase: SegmentBase{
|
||||
mem: mm[0 : len(mm)-FooterSize],
|
||||
fieldsMap: make(map[string]uint16),
|
||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
||||
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||
},
|
||||
f: f,
|
||||
mm: mm,
|
||||
path: path,
|
||||
refs: 1,
|
||||
}
|
||||
rv.SegmentBase.updateSize()
|
||||
|
||||
err = rv.loadConfig()
|
||||
if err != nil {
|
||||
|
@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
err = rv.loadDvIterators()
|
||||
err = rv.loadDvReaders()
|
||||
if err != nil {
|
||||
_ = rv.Close()
|
||||
return nil, err
|
||||
|
@ -89,7 +98,39 @@ type SegmentBase struct {
|
|||
fieldsIndexOffset uint64
|
||||
docValueOffset uint64
|
||||
dictLocs []uint64
|
||||
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field
|
||||
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
|
||||
fieldDvNames []string // field names cached in fieldDvReaders
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) Size() int {
|
||||
return int(sb.size)
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) updateSize() {
|
||||
sizeInBytes := reflectStaticSizeSegmentBase +
|
||||
cap(sb.mem)
|
||||
|
||||
// fieldsMap
|
||||
for k, _ := range sb.fieldsMap {
|
||||
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
|
||||
}
|
||||
|
||||
// fieldsInv, dictLocs
|
||||
for _, entry := range sb.fieldsInv {
|
||||
sizeInBytes += len(entry) + size.SizeOfString
|
||||
}
|
||||
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
|
||||
|
||||
// fieldDvReaders
|
||||
for _, v := range sb.fieldDvReaders {
|
||||
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
|
||||
if v != nil {
|
||||
sizeInBytes += v.size()
|
||||
}
|
||||
}
|
||||
|
||||
sb.size = uint64(sizeInBytes)
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) AddRef() {}
|
||||
|
@ -111,56 +152,19 @@ type Segment struct {
|
|||
refs int64
|
||||
}
|
||||
|
||||
func (s *Segment) SizeInBytes() uint64 {
|
||||
func (s *Segment) Size() int {
|
||||
// 8 /* size of file pointer */
|
||||
// 4 /* size of version -> uint32 */
|
||||
// 4 /* size of crc -> uint32 */
|
||||
sizeOfUints := 16
|
||||
|
||||
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
|
||||
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
|
||||
|
||||
// mutex, refs -> int64
|
||||
sizeInBytes += 16
|
||||
|
||||
// do not include the mmap'ed part
|
||||
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem))
|
||||
}
|
||||
|
||||
func (s *SegmentBase) SizeInBytes() uint64 {
|
||||
// 4 /* size of memCRC -> uint32 */
|
||||
// 4 /* size of chunkFactor -> uint32 */
|
||||
// 8 /* size of numDocs -> uint64 */
|
||||
// 8 /* size of storedIndexOffset -> uint64 */
|
||||
// 8 /* size of fieldsIndexOffset -> uint64 */
|
||||
// 8 /* size of docValueOffset -> uint64 */
|
||||
sizeInBytes := 40
|
||||
|
||||
sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
|
||||
|
||||
// fieldsMap
|
||||
for k, _ := range s.fieldsMap {
|
||||
sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */
|
||||
}
|
||||
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
|
||||
|
||||
// fieldsInv, dictLocs
|
||||
for _, entry := range s.fieldsInv {
|
||||
sizeInBytes += (len(entry) + int(segment.SizeOfString))
|
||||
}
|
||||
sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
|
||||
sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
|
||||
|
||||
// fieldDvIterMap
|
||||
sizeInBytes += len(s.fieldDvIterMap) *
|
||||
int(segment.SizeOfPointer+2 /* size of uint16 */)
|
||||
for _, entry := range s.fieldDvIterMap {
|
||||
if entry != nil {
|
||||
sizeInBytes += int(entry.sizeInBytes())
|
||||
}
|
||||
}
|
||||
sizeInBytes += int(segment.SizeOfMap)
|
||||
|
||||
return uint64(sizeInBytes)
|
||||
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
|
||||
}
|
||||
|
||||
func (s *Segment) AddRef() {
|
||||
|
@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error {
|
|||
|
||||
verOffset := crcOffset - 4
|
||||
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
|
||||
if s.version != version {
|
||||
if s.version != Version {
|
||||
return fmt.Errorf("unsupported version %d", s.version)
|
||||
}
|
||||
|
||||
|
@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error {
|
|||
}
|
||||
|
||||
func (s *SegmentBase) loadFields() error {
|
||||
// NOTE for now we assume the fields index immediately preceeds
|
||||
// NOTE for now we assume the fields index immediately precedes
|
||||
// the footer, and if this changes, need to adjust accordingly (or
|
||||
// store explicit length), where s.mem was sliced from s.mm in Open().
|
||||
fieldsIndexEnd := uint64(len(s.mem))
|
||||
|
@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
|||
if err != nil {
|
||||
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
|
||||
}
|
||||
rv.fstReader, err = rv.fst.Reader()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
// visitDocumentCtx holds data structures that are reusable across
|
||||
// multiple VisitDocument() calls to avoid memory allocations
|
||||
type visitDocumentCtx struct {
|
||||
buf []byte
|
||||
reader bytes.Reader
|
||||
arrayPos []uint64
|
||||
}
|
||||
|
||||
var visitDocumentCtxPool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
reuse := &visitDocumentCtx{}
|
||||
return reuse
|
||||
},
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
defer visitDocumentCtxPool.Put(vdc)
|
||||
return s.visitDocument(vdc, num, visitor)
|
||||
}
|
||||
|
||||
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64,
|
||||
visitor segment.DocumentFieldValueVisitor) error {
|
||||
// first make sure this is a valid number in this segment
|
||||
if num < s.numDocs {
|
||||
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||
uncompressed, err := snappy.Decode(nil, compressed)
|
||||
|
||||
vdc.reader.Reset(meta)
|
||||
|
||||
// handle _id field special case
|
||||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
idFieldVal := compressed[:idFieldValLen]
|
||||
|
||||
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
|
||||
if !keepGoing {
|
||||
visitDocumentCtxPool.Put(vdc)
|
||||
return nil
|
||||
}
|
||||
|
||||
// handle non-"_id" fields
|
||||
compressed = compressed[idFieldValLen:]
|
||||
|
||||
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// now decode meta and process
|
||||
reader := bytes.NewReader(meta)
|
||||
decoder := govarint.NewU64Base128Decoder(reader)
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
field, err := decoder.GetU64()
|
||||
field, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
typ, err := decoder.GetU64()
|
||||
typ, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
offset, err := decoder.GetU64()
|
||||
offset, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
l, err := decoder.GetU64()
|
||||
l, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
numap, err := decoder.GetU64()
|
||||
numap, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var arrayPos []uint64
|
||||
if numap > 0 {
|
||||
arrayPos = make([]uint64, numap)
|
||||
if cap(vdc.arrayPos) < int(numap) {
|
||||
vdc.arrayPos = make([]uint64, numap)
|
||||
}
|
||||
arrayPos = vdc.arrayPos[:numap]
|
||||
for i := 0; i < int(numap); i++ {
|
||||
ap, err := decoder.GetU64()
|
||||
ap, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal
|
|||
value := uncompressed[offset : offset+l]
|
||||
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
|
||||
}
|
||||
|
||||
vdc.buf = uncompressed
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DocID returns the value of the _id field for the given docNum
|
||||
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
|
||||
if num >= s.numDocs {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
|
||||
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||
|
||||
vdc.reader.Reset(meta)
|
||||
|
||||
// handle _id field special case
|
||||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
idFieldVal := compressed[:idFieldValLen]
|
||||
|
||||
visitDocumentCtxPool.Put(vdc)
|
||||
|
||||
return idFieldVal, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment.
|
||||
func (s *SegmentBase) Count() uint64 {
|
||||
return s.numDocs
|
||||
|
@ -343,15 +417,26 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
var postings *PostingsList
|
||||
postingsList := emptyPostingsList
|
||||
|
||||
sMax, err := idDict.fst.GetMaxKey()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sMaxStr := string(sMax)
|
||||
filteredIds := make([]string, 0, len(ids))
|
||||
for _, id := range ids {
|
||||
postings, err = idDict.postingsList([]byte(id), nil, postings)
|
||||
if id <= sMaxStr {
|
||||
filteredIds = append(filteredIds, id)
|
||||
}
|
||||
}
|
||||
|
||||
for _, id := range filteredIds {
|
||||
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if postings.postings != nil {
|
||||
rv.Or(postings.postings)
|
||||
}
|
||||
postingsList.OrInto(rv)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) {
|
|||
return s.dictLocs[fieldIDPlus1-1], nil
|
||||
}
|
||||
|
||||
func (s *SegmentBase) loadDvIterators() error {
|
||||
func (s *SegmentBase) loadDvReaders() error {
|
||||
if s.docValueOffset == fieldNotUninverted {
|
||||
return nil
|
||||
}
|
||||
|
||||
var read uint64
|
||||
for fieldID, field := range s.fieldsInv {
|
||||
fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
var fieldLocStart, fieldLocEnd uint64
|
||||
var n int
|
||||
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
if n <= 0 {
|
||||
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
|
||||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
|
||||
}
|
||||
s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
|
||||
read += uint64(n)
|
||||
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
if n <= 0 {
|
||||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
|
||||
}
|
||||
read += uint64(n)
|
||||
|
||||
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
|
||||
if fieldDvReader != nil {
|
||||
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
|
||||
s.fieldDvNames = append(s.fieldDvNames, field)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
22
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go
generated
vendored
22
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/write.go
generated
vendored
|
@ -15,7 +15,6 @@
|
|||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
|
||||
|
@ -25,28 +24,29 @@ import (
|
|||
// writes out the length of the roaring bitmap in bytes as varint
|
||||
// then writes out the roaring bitmap itself
|
||||
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
|
||||
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) {
|
||||
reuseBuf.Reset()
|
||||
|
||||
// write out postings list to memory so we know the len
|
||||
postingsListLen, err := r.WriteTo(reuseBuf)
|
||||
reuseBufVarint []byte) (int, error) {
|
||||
buf, err := r.ToBytes()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
var tw int
|
||||
// write out the length of this postings list
|
||||
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen))
|
||||
|
||||
// write out the length
|
||||
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
|
||||
nw, err := w.Write(reuseBufVarint[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
// write out the postings list itself
|
||||
nw, err = w.Write(reuseBuf.Bytes())
|
||||
|
||||
// write out the roaring bytes
|
||||
nw, err = w.Write(buf)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
|
@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset
|
|||
return err
|
||||
}
|
||||
// write out 32-bit version
|
||||
err = binary.Write(w, binary.BigEndian, version)
|
||||
err = binary.Write(w, binary.BigEndian, Version)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
446
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go
generated
vendored
446
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index.go
generated
vendored
|
@ -15,10 +15,10 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/heap"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
@ -27,8 +27,13 @@ import (
|
|||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
lev2 "github.com/couchbase/vellum/levenshtein2"
|
||||
)
|
||||
|
||||
// re usable, threadsafe levenshtein builders
|
||||
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder
|
||||
|
||||
type asynchSegmentResult struct {
|
||||
dictItr segment.DictionaryIterator
|
||||
|
||||
|
@ -40,15 +45,36 @@ type asynchSegmentResult struct {
|
|||
err error
|
||||
}
|
||||
|
||||
var reflectStaticSizeIndexSnapshot int
|
||||
|
||||
func init() {
|
||||
var is interface{} = IndexSnapshot{}
|
||||
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
|
||||
var err error
|
||||
lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
|
||||
}
|
||||
lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
type IndexSnapshot struct {
|
||||
parent *Scorch
|
||||
segment []*SegmentSnapshot
|
||||
offsets []uint64
|
||||
internal map[string][]byte
|
||||
epoch uint64
|
||||
size uint64
|
||||
creator string
|
||||
|
||||
m sync.Mutex // Protects the fields that follow.
|
||||
refs int64
|
||||
|
||||
m2 sync.Mutex // Protects the fields that follow.
|
||||
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
|
||||
|
@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) {
|
|||
return err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Close() error {
|
||||
return i.DecRef()
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Size() int {
|
||||
return int(i.size)
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) updateSize() {
|
||||
i.size += uint64(reflectStaticSizeIndexSnapshot)
|
||||
for _, s := range i.segment {
|
||||
i.size += uint64(s.Size())
|
||||
}
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
|
||||
|
||||
results := make(chan *asynchSegmentResult)
|
||||
for index, segment := range i.segment {
|
||||
go func(index int, segment *SegmentSnapshot) {
|
||||
dict, err := segment.Dictionary(field)
|
||||
dict, err := segment.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
results <- &asynchSegmentResult{err: err}
|
||||
} else {
|
||||
|
@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
|
|||
if next != nil {
|
||||
rv.cursors = append(rv.cursors, &segmentDictCursor{
|
||||
itr: asr.dictItr,
|
||||
curr: next,
|
||||
curr: *next,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
|
|||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictRegexp(field string,
|
||||
termRegex string) (index.FieldDict, error) {
|
||||
// TODO: potential optimization where the literal prefix represents the,
|
||||
// entire regexp, allowing us to use PrefixIterator(prefixTerm)?
|
||||
|
||||
a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) getLevAutomaton(term string,
|
||||
fuzziness uint8) (vellum.Automaton, error) {
|
||||
if fuzziness == 1 {
|
||||
return lb1.BuildDfa(term, fuzziness)
|
||||
} else if fuzziness == 2 {
|
||||
return lb2.BuildDfa(term, fuzziness)
|
||||
}
|
||||
return nil, fmt.Errorf("fuzziness exceeds the max limit")
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictFuzzy(field string,
|
||||
term string, fuzziness int, prefix string) (index.FieldDict, error) {
|
||||
a, err := i.getLevAutomaton(term, uint8(fuzziness))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var prefixBeg, prefixEnd []byte
|
||||
if prefix != "" {
|
||||
prefixBeg = []byte(prefix)
|
||||
prefixEnd = segment.IncrementBytes(prefixBeg)
|
||||
}
|
||||
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictOnly(field string,
|
||||
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.OnlyIterator(onlyTerms, includeCount)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
results := make(chan *asynchSegmentResult)
|
||||
for index, segment := range i.segment {
|
||||
|
@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
|
|||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
|
||||
rv = document.NewDocument(id)
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool {
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool {
|
||||
if name == "_id" {
|
||||
return true
|
||||
}
|
||||
|
||||
// copy value, array positions to preserve them beyond the scope of this callback
|
||||
value := append([]byte(nil), val...)
|
||||
arrayPos := append([]uint64(nil), pos...)
|
||||
|
||||
switch typ {
|
||||
case 't':
|
||||
rv.AddField(document.NewTextField(name, pos, value))
|
||||
rv.AddField(document.NewTextField(name, arrayPos, value))
|
||||
case 'n':
|
||||
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value))
|
||||
case 'd':
|
||||
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value))
|
||||
case 'b':
|
||||
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value))
|
||||
case 'g':
|
||||
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value))
|
||||
}
|
||||
|
||||
return true
|
||||
|
@ -307,24 +403,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
|
|||
}
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
|
||||
var found bool
|
||||
var rv string
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
if field == "_id" {
|
||||
found = true
|
||||
rv = string(value)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
v, err := i.segment[segmentIndex].DocID(localDocNum)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if found {
|
||||
return rv, nil
|
||||
if v == nil {
|
||||
return "", fmt.Errorf("document number %d not found", docNum)
|
||||
}
|
||||
return "", fmt.Errorf("document number %d not found", docNum)
|
||||
|
||||
return string(v), nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
|
||||
|
@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
|
|||
|
||||
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
|
||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||
rv := i.allocTermFieldReaderDicts(field)
|
||||
|
||||
rv := &IndexSnapshotTermFieldReader{
|
||||
term: term,
|
||||
field: field,
|
||||
snapshot: i,
|
||||
postings: make([]segment.PostingsList, len(i.segment)),
|
||||
iterators: make([]segment.PostingsIterator, len(i.segment)),
|
||||
includeFreq: includeFreq,
|
||||
includeNorm: includeNorm,
|
||||
includeTermVectors: includeTermVectors,
|
||||
rv.term = term
|
||||
rv.field = field
|
||||
rv.snapshot = i
|
||||
if rv.postings == nil {
|
||||
rv.postings = make([]segment.PostingsList, len(i.segment))
|
||||
}
|
||||
for i, segment := range i.segment {
|
||||
dict, err := segment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if rv.iterators == nil {
|
||||
rv.iterators = make([]segment.PostingsIterator, len(i.segment))
|
||||
}
|
||||
rv.segmentOffset = 0
|
||||
rv.includeFreq = includeFreq
|
||||
rv.includeNorm = includeNorm
|
||||
rv.includeTermVectors = includeTermVectors
|
||||
rv.currPosting = nil
|
||||
rv.currID = rv.currID[:0]
|
||||
|
||||
if rv.dicts == nil {
|
||||
rv.dicts = make([]segment.TermDictionary, len(i.segment))
|
||||
for i, segment := range i.segment {
|
||||
dict, err := segment.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv.dicts[i] = dict
|
||||
}
|
||||
pl, err := dict.PostingsList(string(term), nil)
|
||||
}
|
||||
|
||||
for i, segment := range i.segment {
|
||||
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv.postings[i] = pl
|
||||
rv.iterators[i] = pl.Iterator()
|
||||
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])
|
||||
}
|
||||
atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1))
|
||||
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) {
|
||||
i.m2.Lock()
|
||||
if i.fieldTFRs != nil {
|
||||
tfrs := i.fieldTFRs[field]
|
||||
last := len(tfrs) - 1
|
||||
if last >= 0 {
|
||||
tfr = tfrs[last]
|
||||
tfrs[last] = nil
|
||||
i.fieldTFRs[field] = tfrs[:last]
|
||||
i.m2.Unlock()
|
||||
return
|
||||
}
|
||||
}
|
||||
i.m2.Unlock()
|
||||
return &IndexSnapshotTermFieldReader{}
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
|
||||
i.parent.rootLock.RLock()
|
||||
obsolete := i.parent.root != i
|
||||
i.parent.rootLock.RUnlock()
|
||||
if obsolete {
|
||||
// if we're not the current root (mutations happened), don't bother recycling
|
||||
return
|
||||
}
|
||||
|
||||
i.m2.Lock()
|
||||
if i.fieldTFRs == nil {
|
||||
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{}
|
||||
}
|
||||
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr)
|
||||
i.m2.Unlock()
|
||||
}
|
||||
|
||||
func docNumberToBytes(buf []byte, in uint64) []byte {
|
||||
if len(buf) != 8 {
|
||||
if cap(buf) >= 8 {
|
||||
|
@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte {
|
|||
}
|
||||
|
||||
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
|
||||
var res uint64
|
||||
err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
if len(in) != 8 {
|
||||
return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
|
||||
}
|
||||
return res, nil
|
||||
return binary.BigEndian.Uint64(in), nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor) error {
|
||||
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil)
|
||||
return err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor,
|
||||
dvs segment.DocVisitState) (segment.DocVisitState, error) {
|
||||
docNum, err := docInternalToNumber(id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(i.segment) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
_, dvs, err = i.documentVisitFieldTermsOnSegment(
|
||||
segmentIndex, localDocNum, fields, nil, visitor, dvs)
|
||||
|
||||
return dvs, err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) documentVisitFieldTermsOnSegment(
|
||||
segmentIndex int, localDocNum uint64, fields []string, cFields []string,
|
||||
visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) (
|
||||
cFieldsOut []string, dvsOut segment.DocVisitState, err error) {
|
||||
ss := i.segment[segmentIndex]
|
||||
|
||||
var vFields []string // fields that are visitable via the segment
|
||||
|
||||
ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable)
|
||||
if ssvOk && ssv != nil {
|
||||
vFields, err = ssv.VisitableDocValueFields()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var errCh chan error
|
||||
|
||||
// cFields represents the fields that we'll need from the
|
||||
// cachedDocs, and might be optionally be provided by the caller,
|
||||
// if the caller happens to know we're on the same segmentIndex
|
||||
// from a previous invocation
|
||||
if cFields == nil {
|
||||
cFields = subtractStrings(fields, vFields)
|
||||
|
||||
if !ss.cachedDocs.hasFields(cFields) {
|
||||
errCh = make(chan error, 1)
|
||||
|
||||
go func() {
|
||||
err := ss.cachedDocs.prepareFields(cFields, ss)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
close(errCh)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
if ssvOk && ssv != nil && len(vFields) > 0 {
|
||||
dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if errCh != nil {
|
||||
err = <-errCh
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if len(cFields) > 0 {
|
||||
ss.cachedDocs.visitDoc(localDocNum, cFields, visitor)
|
||||
}
|
||||
|
||||
return cFields, dvs, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocValueReader(fields []string) (
|
||||
index.DocValueReader, error) {
|
||||
return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil
|
||||
}
|
||||
|
||||
type DocValueReader struct {
|
||||
i *IndexSnapshot
|
||||
fields []string
|
||||
dvs segment.DocVisitState
|
||||
|
||||
currSegmentIndex int
|
||||
currCachedFields []string
|
||||
}
|
||||
|
||||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||
visitor index.DocumentFieldTermVisitor) (err error) {
|
||||
docNum, err := docInternalToNumber(id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(i.segment) {
|
||||
|
||||
segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(dvr.i.segment) {
|
||||
return nil
|
||||
}
|
||||
|
||||
ss := i.segment[segmentIndex]
|
||||
|
||||
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok {
|
||||
// get the list of doc value persisted fields
|
||||
pFields, err := zaps.VisitableDocValueFields()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// assort the fields for which terms look up have to
|
||||
// be performed runtime
|
||||
dvPendingFields := extractDvPendingFields(fields, pFields)
|
||||
if len(dvPendingFields) == 0 {
|
||||
// all fields are doc value persisted
|
||||
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
||||
}
|
||||
|
||||
// concurrently trigger the runtime doc value preparations for
|
||||
// pending fields as well as the visit of the persisted doc values
|
||||
errCh := make(chan error, 1)
|
||||
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
}()
|
||||
|
||||
// visit the persisted dv while the cache preparation is in progress
|
||||
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// err out if fieldCache preparation failed
|
||||
err = <-errCh
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor)
|
||||
return nil
|
||||
if dvr.currSegmentIndex != segmentIndex {
|
||||
dvr.currSegmentIndex = segmentIndex
|
||||
dvr.currCachedFields = nil
|
||||
}
|
||||
|
||||
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor)
|
||||
dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment(
|
||||
dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error {
|
||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
||||
if err != nil {
|
||||
return err
|
||||
func (i *IndexSnapshot) DumpAll() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DumpFields() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
// subtractStrings returns set a minus elements of set b.
|
||||
func subtractStrings(a, b []string) []string {
|
||||
if len(b) == 0 {
|
||||
return a
|
||||
}
|
||||
|
||||
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor)
|
||||
return nil
|
||||
}
|
||||
|
||||
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string,
|
||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) {
|
||||
|
||||
for _, field := range fields {
|
||||
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
|
||||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
||||
for {
|
||||
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
break
|
||||
}
|
||||
visitor(field, tlist[0:i])
|
||||
tlist = tlist[i+1:]
|
||||
}
|
||||
rv := make([]string, 0, len(a))
|
||||
OUTER:
|
||||
for _, as := range a {
|
||||
for _, bs := range b {
|
||||
if as == bs {
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func extractDvPendingFields(requestedFields, persistedFields []string) []string {
|
||||
removeMap := map[string]struct{}{}
|
||||
for _, str := range persistedFields {
|
||||
removeMap[str] = struct{}{}
|
||||
}
|
||||
|
||||
rv := make([]string, 0, len(requestedFields))
|
||||
for _, s := range requestedFields {
|
||||
if _, ok := removeMap[s]; !ok {
|
||||
rv = append(rv, s)
|
||||
}
|
||||
rv = append(rv, as)
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
|
17
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go
generated
vendored
17
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_dict.go
generated
vendored
|
@ -23,12 +23,13 @@ import (
|
|||
|
||||
type segmentDictCursor struct {
|
||||
itr segment.DictionaryIterator
|
||||
curr *index.DictEntry
|
||||
curr index.DictEntry
|
||||
}
|
||||
|
||||
type IndexSnapshotFieldDict struct {
|
||||
snapshot *IndexSnapshot
|
||||
cursors []*segmentDictCursor
|
||||
entry index.DictEntry
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
|
||||
|
@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} {
|
|||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
||||
if len(i.cursors) <= 0 {
|
||||
if len(i.cursors) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rv := i.cursors[0].curr
|
||||
i.entry = i.cursors[0].curr
|
||||
next, err := i.cursors[0].itr.Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
|||
heap.Pop(i)
|
||||
} else {
|
||||
// modified heap, fix it
|
||||
i.cursors[0].curr = next
|
||||
i.cursors[0].curr = *next
|
||||
heap.Fix(i, 0)
|
||||
}
|
||||
// look for any other entries with the exact same term
|
||||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term {
|
||||
rv.Count += i.cursors[0].curr.Count
|
||||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
|
||||
i.entry.Count += i.cursors[0].curr.Count
|
||||
next, err := i.cursors[0].itr.Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
|||
heap.Pop(i)
|
||||
} else {
|
||||
// modified heap, fix it
|
||||
i.cursors[0].curr = next
|
||||
i.cursors[0].curr = *next
|
||||
heap.Fix(i, 0)
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
return &i.entry, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Close() error {
|
||||
|
|
13
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go
generated
vendored
13
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_doc.go
generated
vendored
|
@ -16,17 +16,30 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"reflect"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexSnapshotDocIDReader int
|
||||
|
||||
func init() {
|
||||
var isdr IndexSnapshotDocIDReader
|
||||
reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size())
|
||||
}
|
||||
|
||||
type IndexSnapshotDocIDReader struct {
|
||||
snapshot *IndexSnapshot
|
||||
iterators []roaring.IntIterable
|
||||
segmentOffset int
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotDocIDReader) Size() int {
|
||||
return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
|
||||
for i.segmentOffset < len(i.iterators) {
|
||||
if !i.iterators[i.segmentOffset].HasNext() {
|
||||
|
|
85
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go
generated
vendored
85
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_index_tfr.go
generated
vendored
|
@ -16,16 +16,27 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexSnapshotTermFieldReader int
|
||||
|
||||
func init() {
|
||||
var istfr IndexSnapshotTermFieldReader
|
||||
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size())
|
||||
}
|
||||
|
||||
type IndexSnapshotTermFieldReader struct {
|
||||
term []byte
|
||||
field string
|
||||
snapshot *IndexSnapshot
|
||||
dicts []segment.TermDictionary
|
||||
postings []segment.PostingsList
|
||||
iterators []segment.PostingsIterator
|
||||
segmentOffset int
|
||||
|
@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct {
|
|||
currID index.IndexInternalID
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr +
|
||||
len(i.term) +
|
||||
len(i.field) +
|
||||
len(i.currID)
|
||||
|
||||
for _, entry := range i.postings {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range i.iterators {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
if i.currPosting != nil {
|
||||
sizeInBytes += i.currPosting.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
|
||||
rv := preAlloced
|
||||
if rv == nil {
|
||||
rv = &index.TermFieldDoc{}
|
||||
}
|
||||
// find the next hit
|
||||
for i.segmentOffset < len(i.postings) {
|
||||
for i.segmentOffset < len(i.iterators) {
|
||||
next, err := i.iterators[i.segmentOffset].Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin
|
|||
}
|
||||
if i.includeTermVectors {
|
||||
locs := next.Locations()
|
||||
rv.Vectors = make([]*index.TermFieldVector, len(locs))
|
||||
if cap(rv.Vectors) < len(locs) {
|
||||
rv.Vectors = make([]*index.TermFieldVector, len(locs))
|
||||
backing := make([]index.TermFieldVector, len(locs))
|
||||
for i := range backing {
|
||||
rv.Vectors[i] = &backing[i]
|
||||
}
|
||||
}
|
||||
rv.Vectors = rv.Vectors[:len(locs)]
|
||||
for i, loc := range locs {
|
||||
rv.Vectors[i] = &index.TermFieldVector{
|
||||
*rv.Vectors[i] = index.TermFieldVector{
|
||||
Start: loc.Start(),
|
||||
End: loc.End(),
|
||||
Pos: loc.Pos(),
|
||||
|
@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
|
|||
}
|
||||
*i = *(i2.(*IndexSnapshotTermFieldReader))
|
||||
}
|
||||
// FIXME do something better
|
||||
next, err := i.Next(preAlloced)
|
||||
num, err := docInternalToNumber(ID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
|
||||
}
|
||||
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
|
||||
if segIndex >= len(i.snapshot.segment) {
|
||||
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
|
||||
segIndex, len(i.snapshot.segment))
|
||||
}
|
||||
// skip directly to the target segment
|
||||
i.segmentOffset = segIndex
|
||||
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if next == nil {
|
||||
return nil, nil
|
||||
// we jumped directly to the segment that should have contained it
|
||||
// but it wasn't there, so reuse Next() which should correctly
|
||||
// get the next hit after it (we moved i.segmentOffset)
|
||||
return i.Next(preAlloced)
|
||||
}
|
||||
for bytes.Compare(next.ID, ID) < 0 {
|
||||
next, err = i.Next(preAlloced)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if next == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if preAlloced == nil {
|
||||
preAlloced = &index.TermFieldDoc{}
|
||||
}
|
||||
return next, nil
|
||||
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
|
||||
i.snapshot.offsets[segIndex])
|
||||
i.postingToTermFieldDoc(next, preAlloced)
|
||||
i.currID = preAlloced.ID
|
||||
i.currPosting = next
|
||||
return preAlloced, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
||||
|
@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
|||
|
||||
func (i *IndexSnapshotTermFieldReader) Close() error {
|
||||
if i.snapshot != nil {
|
||||
atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1))
|
||||
atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1))
|
||||
i.snapshot.recycleTermFieldReader(i)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
2
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go
generated
vendored
2
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_rollback.go
generated
vendored
|
@ -19,7 +19,7 @@ import (
|
|||
"log"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type RollbackPoint struct {
|
||||
|
|
148
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go
generated
vendored
148
vendor/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go
generated
vendored
|
@ -15,42 +15,25 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var TermSeparator byte = 0xff
|
||||
|
||||
var TermSeparatorSplitSlice = []byte{TermSeparator}
|
||||
|
||||
type SegmentDictionarySnapshot struct {
|
||||
s *SegmentSnapshot
|
||||
d segment.TermDictionary
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
|
||||
return s.d.PostingsList(term, s.s.deleted)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
|
||||
return s.d.Iterator()
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
return s.d.PrefixIterator(prefix)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
return s.d.RangeIterator(start, end)
|
||||
}
|
||||
|
||||
type SegmentSnapshot struct {
|
||||
id uint64
|
||||
segment segment.Segment
|
||||
deleted *roaring.Bitmap
|
||||
creator string
|
||||
|
||||
cachedDocs *cachedDocs
|
||||
}
|
||||
|
@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel
|
|||
return s.segment.VisitDocument(num, visitor)
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Count() uint64 {
|
||||
func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
|
||||
return s.segment.DocID(num)
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Count() uint64 {
|
||||
rv := s.segment.Count()
|
||||
if s.deleted != nil {
|
||||
rv -= s.deleted.GetCardinality()
|
||||
|
@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 {
|
|||
return rv
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
d, err := s.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &SegmentDictionarySnapshot{
|
||||
s: s,
|
||||
d: d,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
||||
rv, err := s.segment.DocNumbers(docIDs)
|
||||
if err != nil {
|
||||
|
@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
// DocNumbersLive returns bitsit containing doc numbers for all live docs
|
||||
// DocNumbersLive returns a bitmap containing doc numbers for all live docs
|
||||
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
|
||||
rv := roaring.NewBitmap()
|
||||
rv.AddRange(0, s.segment.Count())
|
||||
|
@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string {
|
|||
return s.segment.Fields()
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Size() (rv int) {
|
||||
rv = s.segment.Size()
|
||||
if s.deleted != nil {
|
||||
rv += int(s.deleted.GetSizeInBytes())
|
||||
}
|
||||
rv += s.cachedDocs.Size()
|
||||
return
|
||||
}
|
||||
|
||||
type cachedFieldDocs struct {
|
||||
m sync.Mutex
|
||||
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
|
||||
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
|
||||
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
||||
defer close(cfd.readyCh)
|
||||
func (cfd *cachedFieldDocs) Size() int {
|
||||
var rv int
|
||||
cfd.m.Lock()
|
||||
for _, entry := range cfd.docs {
|
||||
rv += 8 /* size of uint64 */ + len(entry)
|
||||
}
|
||||
cfd.m.Unlock()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
|
||||
cfd.m.Lock()
|
||||
defer func() {
|
||||
close(cfd.readyCh)
|
||||
cfd.m.Unlock()
|
||||
}()
|
||||
|
||||
cfd.size += uint64(size.SizeOfUint64) /* size field */
|
||||
dict, err := ss.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
cfd.err = err
|
||||
return
|
||||
}
|
||||
|
||||
var postings segment.PostingsList
|
||||
var postingsItr segment.PostingsIterator
|
||||
|
||||
dictItr := dict.Iterator()
|
||||
next, err := dictItr.Next()
|
||||
for err == nil && next != nil {
|
||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
||||
var err1 error
|
||||
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
|
||||
if err1 != nil {
|
||||
cfd.err = err1
|
||||
return
|
||||
}
|
||||
|
||||
postingsItr := postings.Iterator()
|
||||
cfd.size += uint64(size.SizeOfUint64) /* map key */
|
||||
postingsItr = postings.Iterator(false, false, false, postingsItr)
|
||||
nextPosting, err2 := postingsItr.Next()
|
||||
for err2 == nil && nextPosting != nil {
|
||||
docNum := nextPosting.Number()
|
||||
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
|
||||
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
|
||||
cfd.size += uint64(len(next.Term) + 1) // map value
|
||||
nextPosting, err2 = postingsItr.Next()
|
||||
}
|
||||
|
||||
|
@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
|||
type cachedDocs struct {
|
||||
m sync.Mutex // As the cache is asynchronously prepared, need a lock
|
||||
cache map[string]*cachedFieldDocs // Keyed by field
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
|
||||
c.m.Lock()
|
||||
|
||||
if c.cache == nil {
|
||||
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
|
||||
}
|
||||
|
@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
|||
docs: make(map[uint64][]byte),
|
||||
}
|
||||
|
||||
go c.cache[field].prepareFields(field, ss)
|
||||
go c.cache[field].prepareField(field, ss)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
|||
c.m.Lock()
|
||||
}
|
||||
|
||||
c.updateSizeLOCKED()
|
||||
|
||||
c.m.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *cachedDocs) sizeInBytes() uint64 {
|
||||
sizeInBytes := 0
|
||||
// hasFields returns true if the cache has all the given fields
|
||||
func (c *cachedDocs) hasFields(fields []string) bool {
|
||||
c.m.Lock()
|
||||
for k, v := range c.cache { // cachedFieldDocs
|
||||
sizeInBytes += len(k)
|
||||
if v != nil {
|
||||
for _, entry := range v.docs { // docs
|
||||
sizeInBytes += 8 /* size of uint64 */ + len(entry)
|
||||
}
|
||||
for _, field := range fields {
|
||||
if _, exists := c.cache[field]; !exists {
|
||||
c.m.Unlock()
|
||||
return false // found a field not in cache
|
||||
}
|
||||
}
|
||||
c.m.Unlock()
|
||||
return uint64(sizeInBytes)
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *cachedDocs) Size() int {
|
||||
return int(atomic.LoadUint64(&c.size))
|
||||
}
|
||||
|
||||
func (c *cachedDocs) updateSizeLOCKED() {
|
||||
sizeInBytes := 0
|
||||
for k, v := range c.cache { // cachedFieldDocs
|
||||
sizeInBytes += len(k)
|
||||
if v != nil {
|
||||
sizeInBytes += v.Size()
|
||||
}
|
||||
}
|
||||
atomic.StoreUint64(&c.size, uint64(sizeInBytes))
|
||||
}
|
||||
|
||||
func (c *cachedDocs) visitDoc(localDocNum uint64,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor) {
|
||||
c.m.Lock()
|
||||
|
||||
for _, field := range fields {
|
||||
if cachedFieldDocs, exists := c.cache[field]; exists {
|
||||
c.m.Unlock()
|
||||
<-cachedFieldDocs.readyCh
|
||||
c.m.Lock()
|
||||
|
||||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
||||
for {
|
||||
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
break
|
||||
}
|
||||
visitor(field, tlist[0:i])
|
||||
tlist = tlist[i+1:]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.m.Unlock()
|
||||
}
|
||||
|
|
156
vendor/github.com/blevesearch/bleve/index/scorch/stats.go
generated
vendored
156
vendor/github.com/blevesearch/bleve/index/scorch/stats.go
generated
vendored
|
@ -16,63 +16,125 @@ package scorch
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"reflect"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Stats tracks statistics about the index
|
||||
// Stats tracks statistics about the index, fields that are
|
||||
// prefixed like CurXxxx are gauges (can go up and down),
|
||||
// and fields that are prefixed like TotXxxx are monotonically
|
||||
// increasing counters.
|
||||
type Stats struct {
|
||||
updates, deletes, batches, errors uint64
|
||||
analysisTime, indexTime uint64
|
||||
termSearchersStarted uint64
|
||||
termSearchersFinished uint64
|
||||
numPlainTextBytesIndexed uint64
|
||||
numItemsIntroduced uint64
|
||||
numItemsPersisted uint64
|
||||
i *Scorch
|
||||
TotUpdates uint64
|
||||
TotDeletes uint64
|
||||
|
||||
TotBatches uint64
|
||||
TotBatchesEmpty uint64
|
||||
TotBatchIntroTime uint64
|
||||
MaxBatchIntroTime uint64
|
||||
|
||||
CurRootEpoch uint64
|
||||
LastPersistedEpoch uint64
|
||||
LastMergedEpoch uint64
|
||||
|
||||
TotOnErrors uint64
|
||||
|
||||
TotAnalysisTime uint64
|
||||
TotIndexTime uint64
|
||||
|
||||
TotIndexedPlainTextBytes uint64
|
||||
|
||||
TotTermSearchersStarted uint64
|
||||
TotTermSearchersFinished uint64
|
||||
|
||||
TotIntroduceLoop uint64
|
||||
TotIntroduceSegmentBeg uint64
|
||||
TotIntroduceSegmentEnd uint64
|
||||
TotIntroducePersistBeg uint64
|
||||
TotIntroducePersistEnd uint64
|
||||
TotIntroduceMergeBeg uint64
|
||||
TotIntroduceMergeEnd uint64
|
||||
TotIntroduceRevertBeg uint64
|
||||
TotIntroduceRevertEnd uint64
|
||||
|
||||
TotIntroducedItems uint64
|
||||
TotIntroducedSegmentsBatch uint64
|
||||
TotIntroducedSegmentsMerge uint64
|
||||
|
||||
TotPersistLoopBeg uint64
|
||||
TotPersistLoopErr uint64
|
||||
TotPersistLoopProgress uint64
|
||||
TotPersistLoopWait uint64
|
||||
TotPersistLoopWaitNotified uint64
|
||||
TotPersistLoopEnd uint64
|
||||
|
||||
TotPersistedItems uint64
|
||||
TotItemsToPersist uint64
|
||||
TotPersistedSegments uint64
|
||||
|
||||
TotPersisterSlowMergerPause uint64
|
||||
TotPersisterSlowMergerResume uint64
|
||||
|
||||
TotPersisterNapPauseCompleted uint64
|
||||
TotPersisterMergerNapBreak uint64
|
||||
|
||||
TotFileMergeLoopBeg uint64
|
||||
TotFileMergeLoopErr uint64
|
||||
TotFileMergeLoopEnd uint64
|
||||
|
||||
TotFileMergePlan uint64
|
||||
TotFileMergePlanErr uint64
|
||||
TotFileMergePlanNone uint64
|
||||
TotFileMergePlanOk uint64
|
||||
|
||||
TotFileMergePlanTasks uint64
|
||||
TotFileMergePlanTasksDone uint64
|
||||
TotFileMergePlanTasksErr uint64
|
||||
TotFileMergePlanTasksSegments uint64
|
||||
TotFileMergePlanTasksSegmentsEmpty uint64
|
||||
|
||||
TotFileMergeSegmentsEmpty uint64
|
||||
TotFileMergeSegments uint64
|
||||
TotFileSegmentsAtRoot uint64
|
||||
TotFileMergeWrittenBytes uint64
|
||||
|
||||
TotFileMergeZapBeg uint64
|
||||
TotFileMergeZapEnd uint64
|
||||
TotFileMergeZapTime uint64
|
||||
MaxFileMergeZapTime uint64
|
||||
|
||||
TotFileMergeIntroductions uint64
|
||||
TotFileMergeIntroductionsDone uint64
|
||||
TotFileMergeIntroductionsSkipped uint64
|
||||
|
||||
TotMemMergeBeg uint64
|
||||
TotMemMergeErr uint64
|
||||
TotMemMergeDone uint64
|
||||
TotMemMergeZapBeg uint64
|
||||
TotMemMergeZapEnd uint64
|
||||
TotMemMergeZapTime uint64
|
||||
MaxMemMergeZapTime uint64
|
||||
TotMemMergeSegments uint64
|
||||
TotMemorySegmentsAtRoot uint64
|
||||
}
|
||||
|
||||
func (s *Stats) statsMap() (map[string]interface{}, error) {
|
||||
// atomically populates the returned map
|
||||
func (s *Stats) ToMap() map[string]interface{} {
|
||||
m := map[string]interface{}{}
|
||||
m["updates"] = atomic.LoadUint64(&s.updates)
|
||||
m["deletes"] = atomic.LoadUint64(&s.deletes)
|
||||
m["batches"] = atomic.LoadUint64(&s.batches)
|
||||
m["errors"] = atomic.LoadUint64(&s.errors)
|
||||
m["analysis_time"] = atomic.LoadUint64(&s.analysisTime)
|
||||
m["index_time"] = atomic.LoadUint64(&s.indexTime)
|
||||
m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted)
|
||||
m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished)
|
||||
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed)
|
||||
m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced)
|
||||
m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted)
|
||||
|
||||
if s.i.path != "" {
|
||||
finfos, err := ioutil.ReadDir(s.i.path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
sve := reflect.ValueOf(s).Elem()
|
||||
svet := sve.Type()
|
||||
for i := 0; i < svet.NumField(); i++ {
|
||||
svef := sve.Field(i)
|
||||
if svef.CanAddr() {
|
||||
svefp := svef.Addr().Interface()
|
||||
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64))
|
||||
}
|
||||
|
||||
var numFilesOnDisk, numBytesUsedDisk uint64
|
||||
|
||||
for _, finfo := range finfos {
|
||||
if !finfo.IsDir() {
|
||||
numBytesUsedDisk += uint64(finfo.Size())
|
||||
numFilesOnDisk++
|
||||
}
|
||||
}
|
||||
|
||||
m["num_bytes_used_disk"] = numBytesUsedDisk
|
||||
m["num_files_on_disk"] = numFilesOnDisk
|
||||
}
|
||||
|
||||
return m, nil
|
||||
return m
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler
|
||||
// MarshalJSON implements json.Marshaler, and in contrast to standard
|
||||
// json marshaling provides atomic safety
|
||||
func (s *Stats) MarshalJSON() ([]byte, error) {
|
||||
m, err := s.statsMap()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return json.Marshal(m)
|
||||
return json.Marshal(s.ToMap())
|
||||
}
|
||||
|
|
2
vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go
generated
vendored
2
vendor/github.com/blevesearch/bleve/index/store/boltdb/iterator.go
generated
vendored
|
@ -17,7 +17,7 @@ package boltdb
|
|||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type Iterator struct {
|
||||
|
|
2
vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go
generated
vendored
2
vendor/github.com/blevesearch/bleve/index/store/boltdb/reader.go
generated
vendored
|
@ -16,7 +16,7 @@ package boltdb
|
|||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
|
|
8
vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go
generated
vendored
8
vendor/github.com/blevesearch/bleve/index/store/boltdb/store.go
generated
vendored
|
@ -30,7 +30,7 @@ import (
|
|||
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore,
|
|||
bo.ReadOnly = ro
|
||||
}
|
||||
|
||||
if initialMmapSize, ok := config["initialMmapSize"].(int); ok {
|
||||
bo.InitialMmapSize = initialMmapSize
|
||||
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok {
|
||||
bo.InitialMmapSize = int(initialMmapSize)
|
||||
}
|
||||
|
||||
db, err := bolt.Open(path, 0600, bo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
23
vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go
generated
vendored
23
vendor/github.com/blevesearch/bleve/index/upsidedown/index_reader.go
generated
vendored
|
@ -15,11 +15,20 @@
|
|||
package upsidedown
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexReader int
|
||||
|
||||
func init() {
|
||||
var ir IndexReader
|
||||
reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size())
|
||||
}
|
||||
|
||||
type IndexReader struct {
|
||||
index *UpsideDownCouch
|
||||
kvreader store.KVReader
|
||||
|
@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte {
|
|||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) {
|
||||
return &DocValueReader{i: i, fields: fields}, nil
|
||||
}
|
||||
|
||||
type DocValueReader struct {
|
||||
i *IndexReader
|
||||
fields []string
|
||||
}
|
||||
|
||||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor)
|
||||
}
|
||||
|
|
39
vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go
generated
vendored
39
vendor/github.com/blevesearch/bleve/index/upsidedown/reader.go
generated
vendored
|
@ -16,13 +16,27 @@ package upsidedown
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeUpsideDownCouchTermFieldReader int
|
||||
var reflectStaticSizeUpsideDownCouchDocIDReader int
|
||||
|
||||
func init() {
|
||||
var tfr UpsideDownCouchTermFieldReader
|
||||
reflectStaticSizeUpsideDownCouchTermFieldReader =
|
||||
int(reflect.TypeOf(tfr).Size())
|
||||
var cdr UpsideDownCouchDocIDReader
|
||||
reflectStaticSizeUpsideDownCouchDocIDReader =
|
||||
int(reflect.TypeOf(cdr).Size())
|
||||
}
|
||||
|
||||
type UpsideDownCouchTermFieldReader struct {
|
||||
count uint64
|
||||
indexReader *IndexReader
|
||||
|
@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct {
|
|||
includeTermVectors bool
|
||||
}
|
||||
|
||||
func (r *UpsideDownCouchTermFieldReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr +
|
||||
len(r.term) +
|
||||
r.tfrPrealloc.Size() +
|
||||
len(r.keyBuf)
|
||||
|
||||
if r.tfrNext != nil {
|
||||
sizeInBytes += r.tfrNext.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
|
||||
bufNeeded := termFrequencyRowKeySize(term, nil)
|
||||
if bufNeeded < dictionaryRowKeySize(term) {
|
||||
|
@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct {
|
|||
onlyMode bool
|
||||
}
|
||||
|
||||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
||||
func (r *UpsideDownCouchDocIDReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader +
|
||||
reflectStaticSizeIndexReader + size.SizeOfPtr
|
||||
|
||||
for _, entry := range r.only {
|
||||
sizeInBytes += size.SizeOfString + len(entry)
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
||||
startBytes := []byte{0x0}
|
||||
endBytes := []byte{0xff}
|
||||
|
||||
|
|
31
vendor/github.com/blevesearch/bleve/index/upsidedown/row.go
generated
vendored
31
vendor/github.com/blevesearch/bleve/index/upsidedown/row.go
generated
vendored
|
@ -20,10 +20,22 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/golang/protobuf/proto"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermFrequencyRow int
|
||||
var reflectStaticSizeTermVector int
|
||||
|
||||
func init() {
|
||||
var tfr TermFrequencyRow
|
||||
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
|
||||
var tv TermVector
|
||||
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
|
||||
}
|
||||
|
||||
const ByteSeparator byte = 0xff
|
||||
|
||||
type UpsideDownCouchRowStream chan UpsideDownCouchRow
|
||||
|
@ -358,6 +370,11 @@ type TermVector struct {
|
|||
end uint64
|
||||
}
|
||||
|
||||
func (tv *TermVector) Size() int {
|
||||
return reflectStaticSizeTermVector + size.SizeOfPtr +
|
||||
len(tv.arrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
func (tv *TermVector) String() string {
|
||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
||||
}
|
||||
|
@ -371,6 +388,18 @@ type TermFrequencyRow struct {
|
|||
field uint16
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermFrequencyRow +
|
||||
len(tfr.term) +
|
||||
len(tfr.doc)
|
||||
|
||||
for _, entry := range tfr.vectors {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Term() []byte {
|
||||
return tfr.term
|
||||
}
|
||||
|
@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error {
|
|||
|
||||
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
|
||||
tfr.doc = key[3+len(term)+1:]
|
||||
if len(tfr.doc) <= 0 {
|
||||
if len(tfr.doc) == 0 {
|
||||
return fmt.Errorf("invalid term frequency key, empty docid")
|
||||
}
|
||||
|
||||
|
|
25
vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go
generated
vendored
25
vendor/github.com/blevesearch/bleve/index/upsidedown/upsidedown.go
generated
vendored
|
@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
|
|||
}
|
||||
|
||||
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
||||
if len(in) <= 0 {
|
||||
if len(in) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -810,15 +810,17 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
|||
}
|
||||
}
|
||||
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
aw := index.NewAnalysisWork(udc, doc, resultChan)
|
||||
// put the work on the queue
|
||||
udc.analysisQueue.Queue(aw)
|
||||
if len(batch.IndexOps) > 0 {
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
aw := index.NewAnalysisWork(udc, doc, resultChan)
|
||||
// put the work on the queue
|
||||
udc.analysisQueue.Queue(aw)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}()
|
||||
}
|
||||
|
||||
// retrieve back index rows concurrent with analysis
|
||||
docBackIndexRowErr := error(nil)
|
||||
|
@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
|||
} else {
|
||||
atomic.AddUint64(&udc.stats.errors, 1)
|
||||
}
|
||||
|
||||
persistedCallback := batch.PersistedCallback()
|
||||
if persistedCallback != nil {
|
||||
persistedCallback(err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue