1
0
Fork 0
forked from forgejo/forgejo

Upgrade blevesearch to v0.8.1 (#9177)

For #1441

a91b427b59
This commit is contained in:
Mura Li 2019-11-27 17:23:33 +08:00 committed by Lauris BH
parent b50dee5a61
commit 9591185c8f
180 changed files with 43400 additions and 41105 deletions

View file

@ -121,6 +121,10 @@ type IndexReaderOnly interface {
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
// IndexReaderContains is implemented by index readers that can hand out a
// FieldDictContains for a field, i.e. a dictionary view that supports
// key membership checks via Contains.
type IndexReaderContains interface {
// FieldDictContains returns a FieldDictContains for the named field.
FieldDictContains(field string) (FieldDictContains, error)
}
// FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string
@ -230,6 +234,10 @@ type FieldDict interface {
Close() error
}
// FieldDictContains is the interface for field dictionaries that support
// checking whether a given key is present.
type FieldDictContains interface {
// Contains reports whether key is present, or an error if the check failed.
Contains(key []byte) (bool, error)
}
// DocIDReader is the interface exposing enumeration of documents identifiers.
// Close the reader to release associated resources.
type DocIDReader interface {

View file

@ -376,6 +376,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
fileSegments++
}
}
}
// before the newMerge introduction, need to clean the newly
@ -392,7 +393,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
}
}
}
// In case where all the docs in the newly merged segment getting
// deleted by the time we reach here, can skip the introduction.
if nextMerge.new != nil &&
@ -424,7 +424,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
newSnapshot.updateSize()
s.rootLock.Lock()
// swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
@ -502,7 +501,6 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
}
newSnapshot.updateSize()
// swap in new snapshot
rootPrev := s.root
s.root = newSnapshot

View file

@ -18,6 +18,7 @@ import (
"encoding/json"
"fmt"
"os"
"strings"
"sync/atomic"
"time"
@ -151,13 +152,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
return nil
}
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
// process tasks in serial for now
var notifications []chan *IndexSnapshot
var filenames []string
for _, task := range resultMergePlan.Tasks {
if len(task.Segments) == 0 {
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
@ -182,6 +183,12 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
segmentsToMerge = append(segmentsToMerge, zapSeg)
docsToDrop = append(docsToDrop, segSnapshot.deleted)
}
// track the files getting merged for unsetting the
// removal ineligibility. This helps to unflip files
// even with fast merger, slow persister work flows.
path := zapSeg.Path()
filenames = append(filenames,
strings.TrimPrefix(path, s.path+string(os.PathSeparator)))
}
}
}
@ -221,6 +228,11 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
return err
}
err = zap.ValidateMerge(segmentsToMerge, nil, docsToDrop, seg.(*zap.Segment))
if err != nil {
s.unmarkIneligibleForRemoval(filename)
return fmt.Errorf("merge validation failed: %v", err)
}
oldNewDocNums = make(map[uint64][]uint64)
for i, segNewDocNums := range newDocNums {
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
@ -263,6 +275,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
}
}
// once all the newly merged segment introductions are done,
// it's safe to unflip the removal ineligibility for the replaced
// older segments
for _, f := range filenames {
s.unmarkIneligibleForRemoval(f)
}
return nil
}
@ -311,6 +330,10 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
}
err = zap.ValidateMerge(nil, sbs, sbsDrops, seg.(*zap.Segment))
if err != nil {
return nil, 0, fmt.Errorf("in-memory merge validation failed: %v", err)
}
// update persisted stats
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())

View file

@ -90,6 +90,9 @@ func (s *Scorch) persisterLoop() {
var persistWatchers []*epochWatcher
var lastPersistedEpoch, lastMergedEpoch uint64
var ew *epochWatcher
var unpersistedCallbacks []index.BatchCallback
po, err := s.parsePersisterOptions()
if err != nil {
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
@ -111,7 +114,6 @@ OUTER:
if ew != nil && ew.epoch > lastMergedEpoch {
lastMergedEpoch = ew.epoch
}
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
lastMergedEpoch, persistWatchers, po)
@ -150,11 +152,25 @@ OUTER:
_ = ourSnapshot.DecRef()
break OUTER
}
// save this current snapshot's persistedCallbacks, to invoke during
// the retry attempt
unpersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
_ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
continue OUTER
}
if unpersistedCallbacks != nil {
// in the event of this being a retry attempt for persisting a snapshot
// that had earlier failed, prepend the persistedCallbacks associated
// with earlier segment(s) to the latest persistedCallbacks
ourPersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
unpersistedCallbacks = nil
}
for i := range ourPersistedCallbacks {
ourPersistedCallbacks[i](err)
}
@ -179,7 +195,6 @@ OUTER:
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
if changed {
s.removeOldData()
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
continue OUTER
}
@ -230,20 +245,19 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
return watchersNext
}
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) {
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64,
lastMergedEpoch uint64, persistWatchers []*epochWatcher,
po *persisterOptions) (uint64, []*epochWatcher) {
// first, let the watchers proceed if they lag behind
// First, let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
// check the merger lag by counting the segment files on disk,
// Check the merger lag by counting the segment files on disk,
numFilesOnDisk, _ := s.diskFileStats()
// On finding fewer files on disk, persister takes a short pause
// for sufficient in-memory segments to pile up for the next
// memory merge cum persist loop.
// On finding too many files on disk, persister pause until the merger
// catches up to reduce the segment file count under the threshold.
// But if there is memory pressure, then skip this sleep maneuvers.
numFilesOnDisk, _ := s.diskFileStats()
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
select {
@ -261,6 +275,17 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastM
return lastMergedEpoch, persistWatchers
}
// Finding too many files on disk could be due to two reasons.
// 1. Too many older snapshots awaiting the clean up.
// 2. The merger could be lagging behind on merging the disk files.
if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) {
s.removeOldData()
numFilesOnDisk, _ = s.diskFileStats()
}
// The persister pauses until the merger catches up and reduces the
// segment file count to below the threshold.
// But if there is memory pressure, then skip these sleep maneuvers.
OUTER:
for po.PersisterNapUnderNumFiles > 0 &&
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
@ -661,13 +686,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
}
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
rv := &IndexSnapshot{
parent: s,
internal: make(map[string][]byte),
refs: 1,
creator: "loadSnapshot",
}
var running uint64
c := snapshot.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() {
@ -703,7 +728,6 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
running += segmentSnapshot.segment.Count()
}
}
return rv, nil
}
@ -750,12 +774,11 @@ func (s *Scorch) removeOldData() {
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err))
}
atomic.AddUint64(&s.stats.TotSnapshotsRemovedFromMetaStore, uint64(removed))
if removed > 0 {
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
}

View file

@ -41,12 +41,14 @@ const Version uint8 = 2
var ErrClosed = fmt.Errorf("scorch closed")
type Scorch struct {
nextSegmentID uint64
stats Stats
iStats internalStats
readOnly bool
version uint8
config map[string]interface{}
analysisQueue *index.AnalysisQueue
stats Stats
nextSegmentID uint64
path string
unsafeBatch bool
@ -73,8 +75,6 @@ type Scorch struct {
onEvent func(event Event)
onAsyncError func(err error)
iStats internalStats
pauseLock sync.RWMutex
pauseCount uint64
@ -312,7 +312,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
// FIXME could sort ids list concurrent with analysis?
if len(batch.IndexOps) > 0 {
if numUpdates > 0 {
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
@ -490,6 +490,9 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["CurOnDiskBytes"] = numBytesUsedDisk
m["CurOnDiskFiles"] = numFilesOnDisk
s.rootLock.RLock()
m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval))
s.rootLock.RUnlock()
// TODO: consider one day removing these backwards compatible
// names for apps using the old names
m["updates"] = m["TotUpdates"]

View file

@ -91,12 +91,20 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
return &EmptyDictionaryIterator{}
}
// Contains always reports false with a nil error, regardless of key.
func (e *EmptyDictionary) Contains(key []byte) (bool, error) {
return false, nil
}
type EmptyDictionaryIterator struct{}
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
return nil, nil
}
// Contains always reports false with a nil error, regardless of key.
func (e *EmptyDictionaryIterator) Contains(key []byte) (bool, error) {
return false, nil
}
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
return nil, nil
}

View file

@ -19,7 +19,10 @@
package segment
import "fmt"
import (
"errors"
"fmt"
)
const (
MaxVarintSize = 9
@ -92,3 +95,82 @@ func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) {
}
return b[length:], v, nil
}
// ------------------------------------------------------------
// MemUvarintReader decodes a sequence of unsigned varints directly from an
// in-memory byte slice, tracking the read position itself.
type MemUvarintReader struct {
	C int // index of next byte to read from S
	S []byte
}

// NewMemUvarintReader returns a reader positioned at the start of s.
func NewMemUvarintReader(s []byte) *MemUvarintReader {
	return &MemUvarintReader{S: s}
}

// Len returns the number of unread bytes.
func (r *MemUvarintReader) Len() int {
	n := len(r.S) - r.C
	if n < 0 {
		return 0
	}
	return n
}

// ErrMemUvarintReaderOverflow is returned by ReadUvarint when the encoded
// value does not fit in a uint64.
var ErrMemUvarintReaderOverflow = errors.New("MemUvarintReader overflow")

// ErrMemUvarintReaderEOF is returned by ReadUvarint when the underlying
// slice ends before a complete varint has been read.
var ErrMemUvarintReaderEOF = errors.New("MemUvarintReader unexpected end of data")

// ReadUvarint reads an encoded uint64. The original code this was
// based on is at encoding/binary/ReadUvarint().
//
// It returns ErrMemUvarintReaderOverflow if the encoded value overflows a
// uint64, and ErrMemUvarintReaderEOF (instead of panicking with an index
// out of range) if the data is exhausted before the varint terminates.
func (r *MemUvarintReader) ReadUvarint() (uint64, error) {
	var x uint64
	var s uint
	var C = r.C
	var S = r.S

	for {
		// The unsigned comparison rejects both a position past the end
		// of the slice and a negative position in a single check.
		if uint(C) >= uint(len(S)) {
			r.C = C
			return 0, ErrMemUvarintReaderEOF
		}

		b := S[C]
		C++

		if b < 0x80 {
			r.C = C

			// why 63? The original code had an 'i += 1' loop var and
			// checked for i > 9 || i == 9 ...; but, we no longer
			// check for the i var, but instead check here for s,
			// which is incremented by 7. So, 7*9 == 63.
			//
			// why the "extra" >= check? The normal case is that s <
			// 63, so we check this single >= guard first so that we
			// hit the normal, nil-error return pathway sooner.
			if s >= 63 && (s > 63 || s == 63 && b > 1) {
				return 0, ErrMemUvarintReaderOverflow
			}

			return x | uint64(b)<<s, nil
		}

		x |= uint64(b&0x7f) << s
		s += 7
	}
}

// SkipUvarint skips ahead one encoded uint64. If the data ends before the
// varint terminates, the reader is left positioned at the end of the data.
func (r *MemUvarintReader) SkipUvarint() {
	for uint(r.C) < uint(len(r.S)) {
		b := r.S[r.C]
		r.C++

		if b < 0x80 {
			return
		}
	}
}

// SkipBytes skips a count number of bytes.
func (r *MemUvarintReader) SkipBytes(count int) {
	r.C = r.C + count
}

// Reset rewinds the reader to position zero and points it at s.
func (r *MemUvarintReader) Reset(s []byte) {
	r.C = 0
	r.S = s
}

View file

@ -55,7 +55,7 @@ func LiteralPrefix(s *syntax.Regexp) string {
s = s.Sub[0]
}
if s.Op == syntax.OpLiteral {
if s.Op == syntax.OpLiteral && (s.Flags&syntax.FoldCase == 0) {
return string(s.Rune)
}

View file

@ -59,6 +59,8 @@ type TermDictionary interface {
AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
Contains(key []byte) (bool, error)
}
type DictionaryIterator interface {

View file

@ -16,6 +16,7 @@ package zap
import (
"bufio"
"github.com/couchbase/vellum"
"math"
"os"
)
@ -137,6 +138,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
docValueOffset: docValueOffset,
dictLocs: dictLocs,
fieldDvReaders: make(map[uint16]*docValueReader),
fieldFSTs: make(map[uint16]*vellum.FST),
}
sb.updateSize()

View file

@ -95,6 +95,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
return rv
}
// Contains reports whether key is present in this dictionary.
//
// A dictionary whose FST was never loaded (d.fst is nil) holds no terms, so
// it reports false rather than dereferencing the nil FST.
func (d *Dictionary) Contains(key []byte) (bool, error) {
	if d.fst == nil {
		return false, nil
	}
	return d.fst.Contains(key)
}
// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
rv := &DictionaryIterator{
@ -143,11 +147,14 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
}
// need to increment the end position to be inclusive
endBytes := []byte(end)
if endBytes[len(endBytes)-1] < 0xff {
endBytes[len(endBytes)-1]++
} else {
endBytes = append(endBytes, 0xff)
var endBytes []byte
if len(end) > 0 {
endBytes = []byte(end)
if endBytes[len(endBytes)-1] < 0xff {
endBytes[len(endBytes)-1]++
} else {
endBytes = append(endBytes, 0xff)
}
}
if d.fst != nil {

View file

@ -39,7 +39,7 @@ type docNumTermsVisitor func(docNum uint64, terms []byte) error
type docVisitState struct {
dvrs map[uint16]*docValueReader
segment *Segment
segment *SegmentBase
}
type docValueReader struct {
@ -88,8 +88,8 @@ func (s *SegmentBase) loadFieldDocValueReader(field string,
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
// get the docValue offset for the given fields
if fieldDvLocStart == fieldNotUninverted {
return nil, fmt.Errorf("loadFieldDocValueReader: "+
"no docValues found for field: %s", field)
// no docValues found, nothing to do
return nil, nil
}
// read the number of chunks, and chunk offsets position
@ -101,6 +101,8 @@ func (s *SegmentBase) loadFieldDocValueReader(field string,
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
// acquire position of chunk offsets
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
} else {
return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart)
}
fdvIter := &docValueReader{
@ -250,7 +252,7 @@ func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
// VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
segment.DocVisitState, error) {
dvs, ok := dvsIn.(*docVisitState)
@ -289,7 +291,7 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
// check if the chunk is already loaded
if docInChunk != dvr.curChunkNumber() {
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
err := dvr.loadDvChunk(docInChunk, s)
if err != nil {
return dvs, err
}
@ -304,6 +306,6 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
// VisitableDocValueFields returns the list of fields with
// persisted doc value terms ready to be visitable using the
// VisitDocumentFieldTerms method.
func (s *Segment) VisitableDocValueFields() ([]string, error) {
func (s *SegmentBase) VisitableDocValueFields() ([]string, error) {
return s.fieldDvNames, nil
}

View file

@ -31,6 +31,14 @@ import (
var DefaultFileMergerBufferSize = 1024 * 1024
// ValidateMerge can be set by applications to perform additional checks
// on a new segment produced by a merge, by default this does nothing.
// Caller should provide EITHER segments or memSegments, but not both.
// A non-nil error return signals that validation of newSegment failed.
// This API is experimental and may be removed at any time.
var ValidateMerge = func(segments []*Segment, memSegments []*SegmentBase, drops []*roaring.Bitmap, newSegment *Segment) error {
// default implementation: accept every merge result
return nil
}
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
// Merge takes a slice of zap segments and bit masks describing which

View file

@ -33,6 +33,14 @@ var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
// ValidateDocFields can be set by applications to perform additional checks
// on fields in a document being added to a new segment, by default it does
// nothing.
// A non-nil error return signals that the field was rejected.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field document.Field) error {
// default implementation: accept every field
return nil
}
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
// SegmentBase from analysis results
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
@ -521,6 +529,11 @@ func (s *interim) writeStoredFields() (
if opts.IncludeDocValues() {
s.IncludeDocValues[fieldID] = true
}
err := ValidateDocFields(field)
if err != nil {
return 0, err
}
}
var curr int

View file

@ -15,10 +15,8 @@
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
"reflect"
@ -192,7 +190,7 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
}
rv.postings = p
rv.includeFreqNorm = includeFreq || includeNorm
rv.includeFreqNorm = includeFreq || includeNorm || includeLocs
rv.includeLocs = includeLocs
if p.normBits1Hit != 0 {
@ -264,18 +262,17 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
var n uint64
var n, e uint64
if p.normBits1Hit != 0 {
n = 1
if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) {
e = 1
}
} else if p.postings != nil {
n = p.postings.GetCardinality()
}
var e uint64
if p.except != nil {
e = p.except.GetCardinality()
}
if n <= e {
return 0
if p.except != nil {
e = p.postings.AndCardinality(p.except)
}
}
return n - e
}
@ -327,16 +324,16 @@ func (rv *PostingsList) init1Hit(fstVal uint64) error {
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
postings *PostingsList
all roaring.IntIterable
Actual roaring.IntIterable
all roaring.IntPeekable
Actual roaring.IntPeekable
ActualBM *roaring.Bitmap
currChunk uint32
currChunkFreqNorm []byte
currChunkLoc []byte
freqNormReader *bytes.Reader
locReader *bytes.Reader
freqNormReader *segment.MemUvarintReader
locReader *segment.MemUvarintReader
freqChunkOffsets []uint64
freqChunkStart uint64
@ -387,7 +384,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
end += e
i.currChunkFreqNorm = i.postings.sb.mem[start:end]
if i.freqNormReader == nil {
i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm)
i.freqNormReader = segment.NewMemUvarintReader(i.currChunkFreqNorm)
} else {
i.freqNormReader.Reset(i.currChunkFreqNorm)
}
@ -405,7 +402,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
end += e
i.currChunkLoc = i.postings.sb.mem[start:end]
if i.locReader == nil {
i.locReader = bytes.NewReader(i.currChunkLoc)
i.locReader = segment.NewMemUvarintReader(i.currChunkLoc)
} else {
i.locReader.Reset(i.currChunkLoc)
}
@ -420,18 +417,34 @@ func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
return 1, i.normBits1Hit, false, nil
}
freqHasLocs, err := binary.ReadUvarint(i.freqNormReader)
freqHasLocs, err := i.freqNormReader.ReadUvarint()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
}
freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
normBits, err := binary.ReadUvarint(i.freqNormReader)
normBits, err := i.freqNormReader.ReadUvarint()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, hasLocs, err
return freq, normBits, hasLocs, nil
}
// skipFreqNormReadHasLocs consumes one freq/norm entry from the freq-norm
// reader without decoding the frequency or norm, and reports only whether
// that entry is flagged as having locations.
func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) {
	// When normBits1Hit is set, nothing is read from the freq-norm reader
	// and the posting is reported as having no locations.
	if i.normBits1Hit != 0 {
		return false, nil
	}

	encoded, err := i.freqNormReader.ReadUvarint()
	if err != nil {
		return false, fmt.Errorf("error reading freqHasLocs: %v", err)
	}

	// The norm bits are not needed here, so step over them undecoded.
	i.freqNormReader.SkipUvarint()

	// The lowest bit carries the hasLocs flag; see decodeFreqHasLocs().
	hasLocs := encoded&0x01 != 0
	return hasLocs, nil
}
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
@ -449,58 +462,53 @@ func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
}
// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
// location.
func (i *PostingsIterator) readLocation(l *Location) error {
// read off field
fieldID, err := binary.ReadUvarint(i.locReader)
fieldID, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location field: %v", err)
}
// read off pos
pos, err := binary.ReadUvarint(i.locReader)
pos, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location pos: %v", err)
}
// read off start
start, err := binary.ReadUvarint(i.locReader)
start, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location start: %v", err)
}
// read off end
end, err := binary.ReadUvarint(i.locReader)
end, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location end: %v", err)
}
// read off num array pos
numArrayPos, err := binary.ReadUvarint(i.locReader)
numArrayPos, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err)
}
// group these together for less branching
if l != nil {
l.field = i.postings.sb.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if cap(l.ap) < int(numArrayPos) {
l.ap = make([]uint64, int(numArrayPos))
} else {
l.ap = l.ap[:int(numArrayPos)]
}
l.field = i.postings.sb.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if cap(l.ap) < int(numArrayPos) {
l.ap = make([]uint64, int(numArrayPos))
} else {
l.ap = l.ap[:int(numArrayPos)]
}
// read off array positions
for k := 0; k < int(numArrayPos); k++ {
ap, err := binary.ReadUvarint(i.locReader)
ap, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading array position: %v", err)
}
if l != nil {
l.ap[k] = ap
}
l.ap[k] = ap
}
return nil
@ -557,7 +565,7 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err
}
rv.locs = i.nextSegmentLocs[:0]
numLocsBytes, err := binary.ReadUvarint(i.locReader)
numLocsBytes, err := i.locReader.ReadUvarint()
if err != nil {
return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
}
@ -613,17 +621,14 @@ func (i *PostingsIterator) nextBytes() (
if hasLocs {
startLoc := len(i.currChunkLoc) - i.locReader.Len()
numLocsBytes, err := binary.ReadUvarint(i.locReader)
numLocsBytes, err := i.locReader.ReadUvarint()
if err != nil {
return 0, 0, 0, nil, nil,
fmt.Errorf("error reading location nextBytes numLocs: %v", err)
}
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return 0, 0, 0, nil, nil, err
}
i.locReader.SkipBytes(int(numLocsBytes))
endLoc := len(i.currChunkLoc) - i.locReader.Len()
bytesLoc = i.currChunkLoc[startLoc:endLoc]
@ -657,14 +662,14 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool,
return i.nextDocNumAtOrAfterClean(atOrAfter)
}
n := i.Actual.Next()
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
}
if uint64(n) < atOrAfter {
i.Actual.AdvanceIfNeeded(uint32(atOrAfter))
if !i.Actual.HasNext() {
// couldn't find anything
return 0, false, nil
}
n := i.Actual.Next()
allN := i.all.Next()
nChunk := n / i.postings.sb.chunkFactor
@ -701,23 +706,20 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool,
// no deletions) where the all bitmap is the same as the actual bitmap
func (i *PostingsIterator) nextDocNumAtOrAfterClean(
atOrAfter uint64) (uint64, bool, error) {
n := i.Actual.Next()
if !i.includeFreqNorm {
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
}
i.Actual.AdvanceIfNeeded(uint32(atOrAfter))
if uint64(n) < atOrAfter {
if !i.Actual.HasNext() {
return 0, false, nil // couldn't find anything
}
return uint64(n), true, nil
return uint64(i.Actual.Next()), true, nil
}
// freq-norm's needed, so maintain freq-norm chunk reader
sameChunkNexts := 0 // # of times we called Next() in the same chunk
n := i.Actual.Next()
nChunk := n / i.postings.sb.chunkFactor
for uint64(n) < atOrAfter && i.Actual.HasNext() {
@ -764,22 +766,19 @@ func (i *PostingsIterator) currChunkNext(nChunk uint32) error {
}
// read off freq/offsets even though we don't care about them
_, _, hasLocs, err := i.readFreqNormHasLocs()
hasLocs, err := i.skipFreqNormReadHasLocs()
if err != nil {
return err
}
if i.includeLocs && hasLocs {
numLocsBytes, err := binary.ReadUvarint(i.locReader)
numLocsBytes, err := i.locReader.ReadUvarint()
if err != nil {
return fmt.Errorf("error reading location numLocsBytes: %v", err)
}
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return err
}
i.locReader.SkipBytes(int(numLocsBytes))
}
return nil

View file

@ -20,8 +20,8 @@ import (
"fmt"
"io"
"os"
"reflect"
"sync"
"unsafe"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
@ -35,7 +35,7 @@ var reflectStaticSizeSegmentBase int
func init() {
var sb SegmentBase
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size())
reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb))
}
// Open returns a zap impl of a segment
@ -56,6 +56,7 @@ func Open(path string) (segment.Segment, error) {
mem: mm[0 : len(mm)-FooterSize],
fieldsMap: make(map[string]uint16),
fieldDvReaders: make(map[uint16]*docValueReader),
fieldFSTs: make(map[uint16]*vellum.FST),
},
f: f,
mm: mm,
@ -101,6 +102,9 @@ type SegmentBase struct {
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
fieldDvNames []string // field names cached in fieldDvReaders
size uint64
m sync.Mutex
fieldFSTs map[uint16]*vellum.FST
}
func (sb *SegmentBase) Size() int {
@ -258,19 +262,27 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
dictStart := sb.dictLocs[rv.fieldID]
if dictStart > 0 {
// read the length of the vellum data
vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
if fstBytes != nil {
var ok bool
sb.m.Lock()
if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok {
// read the length of the vellum data
vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
rv.fst, err = vellum.Load(fstBytes)
if err != nil {
sb.m.Unlock()
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
}
rv.fstReader, err = rv.fst.Reader()
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
}
sb.fieldFSTs[rv.fieldID] = rv.fst
}
sb.m.Unlock()
rv.fstReader, err = rv.fst.Reader()
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
}
}
}
@ -527,7 +539,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) {
}
func (s *SegmentBase) loadDvReaders() error {
if s.docValueOffset == fieldNotUninverted {
if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 {
return nil
}
@ -546,7 +558,10 @@ func (s *SegmentBase) loadDvReaders() error {
}
read += uint64(n)
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
if err != nil {
return err
}
if fieldDvReader != nil {
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
s.fieldDvNames = append(s.fieldDvNames, field)

View file

@ -28,13 +28,14 @@ import (
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
lev2 "github.com/couchbase/vellum/levenshtein2"
lev "github.com/couchbase/vellum/levenshtein"
)
// re usable, threadsafe levenshtein builders
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder
var lb1, lb2 *lev.LevenshteinAutomatonBuilder
type asynchSegmentResult struct {
dict segment.TermDictionary
dictItr segment.DictionaryIterator
index int
@ -51,11 +52,11 @@ func init() {
var is interface{} = IndexSnapshot{}
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
var err error
lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true)
lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
}
lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true)
lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
}
@ -126,7 +127,9 @@ func (i *IndexSnapshot) updateSize() {
}
}
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string,
makeItr func(i segment.TermDictionary) segment.DictionaryIterator,
randomLookup bool) (*IndexSnapshotFieldDict, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
@ -135,7 +138,11 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if err != nil {
results <- &asynchSegmentResult{err: err}
} else {
results <- &asynchSegmentResult{dictItr: makeItr(dict)}
if randomLookup {
results <- &asynchSegmentResult{dict: dict}
} else {
results <- &asynchSegmentResult{dictItr: makeItr(dict)}
}
}
}(index, segment)
}
@ -150,14 +157,20 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if asr.err != nil && err == nil {
err = asr.err
} else {
next, err2 := asr.dictItr.Next()
if err2 != nil && err == nil {
err = err2
}
if next != nil {
if !randomLookup {
next, err2 := asr.dictItr.Next()
if err2 != nil && err == nil {
err = err2
}
if next != nil {
rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr,
curr: *next,
})
}
} else {
rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr,
curr: *next,
dict: asr.dict,
})
}
}
@ -166,8 +179,11 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if err != nil {
return nil, err
}
// prepare heap
heap.Init(rv)
if !randomLookup {
// prepare heap
heap.Init(rv)
}
return rv, nil
}
@ -175,21 +191,21 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.Iterator()
})
}, false)
}
func (i *IndexSnapshot) FieldDictRange(field string, startTerm []byte,
endTerm []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.RangeIterator(string(startTerm), string(endTerm))
})
}, false)
}
func (i *IndexSnapshot) FieldDictPrefix(field string,
termPrefix []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.PrefixIterator(string(termPrefix))
})
}, false)
}
func (i *IndexSnapshot) FieldDictRegexp(field string,
@ -204,7 +220,7 @@ func (i *IndexSnapshot) FieldDictRegexp(field string,
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
})
}, false)
}
func (i *IndexSnapshot) getLevAutomaton(term string,
@ -232,14 +248,18 @@ func (i *IndexSnapshot) FieldDictFuzzy(field string,
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
})
}, false)
}
func (i *IndexSnapshot) FieldDictOnly(field string,
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.OnlyIterator(onlyTerms, includeCount)
})
}, false)
}
// FieldDictContains returns a dictionary for field that supports membership
// checks (Contains). It builds the field dictionary in random-lookup mode, so
// no iterator factory is supplied.
//
// On failure it returns a literal nil interface together with the error,
// rather than an interface value wrapping a nil *IndexSnapshotFieldDict, so
// that a caller comparing the result against nil sees the expected answer.
func (i *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) {
	rv, err := i.newIndexSnapshotFieldDict(field, nil, true)
	if err != nil {
		return nil, err
	}
	return rv, nil
}
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {

View file

@ -22,6 +22,7 @@ import (
)
// segmentDictCursor is the per-segment state that an IndexSnapshotFieldDict
// keeps in its cursors slice, one entry per contributing segment.
type segmentDictCursor struct {
// dict is the segment's term dictionary; Contains queries it directly.
// NOTE(review): appears to be populated only for dictionaries built in
// random-lookup mode — confirm against newIndexSnapshotFieldDict.
dict segment.TermDictionary
// itr is the segment's dictionary iterator, used when the field dictionary
// is built for iteration.
itr segment.DictionaryIterator
// curr is the entry the cursor is positioned on; it is seeded from the first
// itr.Next() call when the cursor is created.
curr index.DictEntry
}
@ -91,3 +92,17 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
func (i *IndexSnapshotFieldDict) Close() error {
return nil
}
// Contains reports whether key is present in the term dictionary of at least
// one segment tracked by this field dictionary.
//
// Every cursor's dictionary is consulted in order and the search stops at the
// first hit. A lookup error in one segment does not abort the search, because
// another segment may still contain the key. If no segment reports a hit, the
// first lookup error encountered (if any) is returned instead of being
// silently discarded, so callers can tell "not present" from "could not check".
//
// With no cursors the result is (false, nil).
func (i *IndexSnapshotFieldDict) Contains(key []byte) (bool, error) {
	var firstErr error
	for _, cursor := range i.cursors {
		found, err := cursor.dict.Contains(key)
		if found {
			return true, nil
		}
		if err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return false, firstErr
}

View file

@ -183,9 +183,9 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
}
type cachedDocs struct {
size uint64
m sync.Mutex // As the cache is asynchronously prepared, need a lock
cache map[string]*cachedFieldDocs // Keyed by field
size uint64
}
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {

View file

@ -107,6 +107,9 @@ type Stats struct {
TotFileMergeIntroductionsDone uint64
TotFileMergeIntroductionsSkipped uint64
CurFilesIneligibleForRemoval uint64
TotSnapshotsRemovedFromMetaStore uint64
TotMemMergeBeg uint64
TotMemMergeErr uint64
TotMemMergeDone uint64

View file

@ -415,7 +415,6 @@ func (udc *UpsideDownCouch) Close() error {
func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
// do analysis before acquiring write lock
analysisStart := time.Now()
numPlainTextBytes := doc.NumPlainTextBytes()
resultChan := make(chan *index.AnalysisResult)
aw := index.NewAnalysisWork(udc, doc, resultChan)
@ -452,6 +451,11 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
return
}
return udc.UpdateWithAnalysis(doc, result, backIndexRow)
}
func (udc *UpsideDownCouch) UpdateWithAnalysis(doc *document.Document,
result *index.AnalysisResult, backIndexRow *BackIndexRow) (err error) {
// start a writer for this update
indexStart := time.Now()
var kvwriter store.KVWriter
@ -490,7 +494,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart)))
if err == nil {
atomic.AddUint64(&udc.stats.updates, 1)
atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, numPlainTextBytes)
atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, doc.NumPlainTextBytes())
} else {
atomic.AddUint64(&udc.stats.errors, 1)
}
@ -797,6 +801,10 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
}
func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
persistedCallback := batch.PersistedCallback()
if persistedCallback != nil {
	// Arguments of a deferred call are evaluated when the defer statement
	// executes, at which point err is still nil. Deferring a closure instead
	// makes the callback observe the final value of the named result err.
	defer func() {
		persistedCallback(err)
	}()
}
analysisStart := time.Now()
resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps))
@ -810,7 +818,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
}
}
if len(batch.IndexOps) > 0 {
if numUpdates > 0 {
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
@ -961,10 +969,6 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
atomic.AddUint64(&udc.stats.errors, 1)
}
persistedCallback := batch.PersistedCallback()
if persistedCallback != nil {
persistedCallback(err)
}
return
}