Upgrade blevesearch dependency to v2.0.1 (#14346)

* Upgrade blevesearch dependency to v2.0.1 * Update rupture to v1.0.0 * Fix test
2021-01-18 03:21:14 +02:00 · 2021-01-18 03:21:14 +02:00 · f5abe2f563
commit f5abe2f563
parent 3aa53dc6bc
459 changed files with 7518 additions and 4211 deletions
--- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go
@ -0,0 +1,105 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lowercase implements a TokenFilter which converts
+// tokens to lower case according to unicode rules.
+package lowercase
+
+import (
+	"bytes"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// Name is the identifier under which the lower-casing
+// token filter is registered in the bleve registry.
+const Name = "to_lower"
+
+// LowerCaseFilter is a stateless token filter that lower-cases
+// the term of every token in a stream.
+type LowerCaseFilter struct{}
+
+// NewLowerCaseFilter returns a ready-to-use LowerCaseFilter.
+func NewLowerCaseFilter() *LowerCaseFilter {
+	return new(LowerCaseFilter)
+}
+
+// Filter replaces the Term of each token in input with its
+// lower-cased form, as computed by toLowerDeferredCopy, and
+// returns the same stream.
+func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for idx := range input {
+		tok := input[idx]
+		tok.Term = toLowerDeferredCopy(tok.Term)
+	}
+	return input
+}
+
+// LowerCaseFilterConstructor builds a LowerCaseFilter for the
+// registry. The filter has no options, so config and cache are
+// not consulted and the returned error is always nil.
+func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewLowerCaseFilter()
+	return filter, nil
+}
+
+// init registers the constructor under Name when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
+}
+
+// toLowerDeferredCopy lower-cases s according to unicode rules,
+// much like bytes.ToLower(), only it will reuse (overwrite)
+// the original byte array when possible and return a slice of it.
+//
+// It differs from bytes.ToLower() in one respect: an upper-case
+// greek Σ that is the last rune of s is converted to the final
+// form ς instead of σ.
+//
+// NOTE: because it's possible that the lower-case
+// form of a rune has a different utf-8 encoded
+// length, two special cases exist:
+//   - a narrower replacement shrinks the result, so every later
+//     rune is moved down to close the gap;
+//   - a wider replacement does not fit in place, so a new byte
+//     array is allocated and the remainder is produced by
+//     bytes.ToLower() (which does not apply the final-Σ rule).
+func toLowerDeferredCopy(s []byte) []byte {
+	// i is the read offset, j the write offset; j <= i always holds.
+	j := 0
+	for i := 0; i < len(s); {
+		wid := 1
+		r := rune(s[i])
+		if r >= utf8.RuneSelf {
+			r, wid = utf8.DecodeRune(s[i:])
+		}
+
+		l := unicode.ToLower(r)
+
+		// If the rune is already lowercased, keep its bytes.
+		if l == r {
+			// Once an earlier replacement was narrower than its
+			// source rune (e.g. K U+212A -> k), the write offset
+			// trails the read offset and the bytes must be moved
+			// down, otherwise stale bytes would be returned.
+			if j != i {
+				copy(s[j:], s[i:i+wid])
+			}
+			i += wid
+			j += wid
+			continue
+		}
+
+		// Handles the Unicode edge-case where the last
+		// rune in a word on the greek Σ needs to be converted
+		// differently.
+		if l == 'σ' && i+wid == len(s) {
+			l = 'ς'
+		}
+
+		lwid := utf8.RuneLen(l)
+		if lwid > wid {
+			// utf-8 encoded replacement is wider
+			// for now, punt and defer
+			// to bytes.ToLower() for the remainder
+			// only known to happen with chars
+			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
+			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
+			rest := bytes.ToLower(s[i:])
+			rv := make([]byte, j+len(rest))
+			copy(rv[:j], s[:j])
+			copy(rv[j:], rest)
+			return rv
+		}
+
+		// Replacement fits: it never reaches past the bytes already read.
+		utf8.EncodeRune(s[j:], l)
+		i += wid
+		j += lwid
+	}
+	return s[:j]
+}
--- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go
@ -0,0 +1,53 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package porter
+
+import (
+	"bytes"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+
+	"github.com/blevesearch/go-porterstemmer"
+)
+
+// Name is the identifier under which the porter stemmer
+// token filter is registered.
+const Name = "stemmer_porter"
+
+// PorterStemmer is a stateless token filter that stems token
+// terms using the go-porterstemmer package.
+type PorterStemmer struct{}
+
+// NewPorterStemmer returns a ready-to-use PorterStemmer.
+func NewPorterStemmer() *PorterStemmer {
+	return new(PorterStemmer)
+}
+
+// Filter stems the Term of every token in input via
+// porterstemmer.StemWithoutLowerCasing, except for tokens flagged
+// as KeyWord, which are left untouched. The same stream is returned.
+func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, tok := range input {
+		if tok.KeyWord {
+			// protected keyword: keep the term as-is
+			continue
+		}
+		stemmed := porterstemmer.StemWithoutLowerCasing(bytes.Runes(tok.Term))
+		tok.Term = analysis.BuildTermFromRunes(stemmed)
+	}
+	return input
+}
+
+// PorterStemmerConstructor builds a PorterStemmer for the
+// registry. The filter has no options, so config and cache are
+// not consulted and the returned error is always nil.
+func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stemmer := NewPorterStemmer()
+	return stemmer, nil
+}
+
+// init registers the constructor under Name when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(Name, PorterStemmerConstructor)
+}
--- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go
@ -0,0 +1,70 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package stop implements a TokenFilter removing tokens found in
+// a TokenMap.
+//
+// Its constructor takes the following arguments:
+//
+// "stop_token_map" (string): the name of the token map identifying tokens to
+// remove.
+package stop
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// Name is the identifier under which the stop-token filter
+// is registered.
+const Name = "stop_tokens"
+
+// StopTokensFilter is a token filter that drops every token
+// whose term is a key of its token map.
+type StopTokensFilter struct {
+	stopTokens analysis.TokenMap // terms to remove from streams
+}
+
+// NewStopTokensFilter returns a StopTokensFilter that removes the
+// tokens listed in stopTokens. The map is stored as given, not copied.
+func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
+	filter := new(StopTokensFilter)
+	filter.stopTokens = stopTokens
+	return filter
+}
+
+// Filter removes from input every token whose Term is present in
+// the filter's stop-token map. Surviving tokens keep their relative
+// order and are compacted into the front of input's own backing
+// array; the returned stream is that shortened prefix.
+func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	kept := input[:0]
+	for _, tok := range input {
+		if _, stop := f.stopTokens[string(tok.Term)]; stop {
+			continue
+		}
+		kept = append(kept, tok)
+	}
+	return kept
+}
+
+// StopTokensFilterConstructor builds a StopTokensFilter from config.
+//
+// config["stop_token_map"] must be a string naming a token map; the
+// map is resolved through cache.TokenMapNamed. An error is returned
+// when the key is missing or not a string, or when the lookup fails.
+func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	name, isString := config["stop_token_map"].(string)
+	if !isString {
+		return nil, fmt.Errorf("must specify stop_token_map")
+	}
+
+	tokenMap, lookupErr := cache.TokenMapNamed(name)
+	if lookupErr != nil {
+		return nil, fmt.Errorf("error building stop words filter: %v", lookupErr)
+	}
+
+	return NewStopTokensFilter(tokenMap), nil
+}
+
+// init registers the constructor under Name when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(Name, StopTokensFilterConstructor)
+}
--- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/unicodenorm/unicodenorm.go
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/unicodenorm/unicodenorm.go
@ -0,0 +1,79 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package unicodenorm
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+	"golang.org/x/text/unicode/norm"
+)
+
+// Name is the identifier under which the unicode normalization
+// token filter is registered.
+const Name = "normalize_unicode"
+
+// Names of the supported normalization forms, as accepted by
+// NewUnicodeNormalizeFilter.
+const (
+	NFC  = "nfc"
+	NFD  = "nfd"
+	NFKC = "nfkc"
+	NFKD = "nfkd"
+)
+
+// forms maps each form name to the matching norm.Form.
+var forms = map[string]norm.Form{
+	NFKD: norm.NFKD,
+	NFKC: norm.NFKC,
+	NFD:  norm.NFD,
+	NFC:  norm.NFC,
+}
+
+// UnicodeNormalizeFilter is a token filter that rewrites token
+// terms into a fixed unicode normalization form.
+type UnicodeNormalizeFilter struct {
+	form norm.Form // normalization form applied to every term
+}
+
+// NewUnicodeNormalizeFilter returns a filter for the form called
+// formName, which must be a key of forms (NFC, NFD, NFKC or NFKD).
+// Any other name yields an error.
+func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) {
+	if form, known := forms[formName]; known {
+		return &UnicodeNormalizeFilter{form: form}, nil
+	}
+	return nil, fmt.Errorf("no form named %s", formName)
+}
+
+// MustNewUnicodeNormalizeFilter is like NewUnicodeNormalizeFilter
+// but panics instead of returning an error.
+func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
+	filter, err := NewUnicodeNormalizeFilter(formName)
+	if err == nil {
+		return filter
+	}
+	panic(err)
+}
+
+// Filter replaces the Term of each token in input with the result
+// of normalizing it through the filter's norm.Form, and returns the
+// same stream.
+func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for idx := range input {
+		tok := input[idx]
+		tok.Term = s.form.Bytes(tok.Term)
+	}
+	return input
+}
+
+// UnicodeNormalizeFilterConstructor builds a UnicodeNormalizeFilter
+// from config.
+//
+// config["form"] must be a string naming the normalization form; it
+// is passed to NewUnicodeNormalizeFilter. An error is returned when
+// the key is missing or not a string, or when the form is unknown.
+func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	formName, ok := config["form"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify form")
+	}
+	filter, err := NewUnicodeNormalizeFilter(formName)
+	if err != nil {
+		// Return a literal nil: passing the nil *UnicodeNormalizeFilter
+		// through would yield a non-nil analysis.TokenFilter interface.
+		return nil, err
+	}
+	return filter, nil
+}
+
+// init registers the constructor under Name when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor)
+}