forked from forgejo/forgejo
Upgrade blevesearch dependency to v2.0.1 (#14346)
* Upgrade blevesearch dependency to v2.0.1 * Update rupture to v1.0.0 * Fix test
This commit is contained in:
parent
3aa53dc6bc
commit
f5abe2f563
459 changed files with 7518 additions and 4211 deletions
105
vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go
generated
vendored
Normal file
105
vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go
generated
vendored
Normal file
|
@ -0,0 +1,105 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package lowercase implements a TokenFilter which converts
|
||||
// tokens to lower case according to unicode rules.
|
||||
package lowercase
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name used to register LowerCaseFilter in the bleve registry
|
||||
const Name = "to_lower"
|
||||
|
||||
// LowerCaseFilter is a token filter that lower-cases the term of every
// token it is given. It carries no state.
type LowerCaseFilter struct{}
|
||||
|
||||
func NewLowerCaseFilter() *LowerCaseFilter {
|
||||
return &LowerCaseFilter{}
|
||||
}
|
||||
|
||||
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = toLowerDeferredCopy(token.Term)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewLowerCaseFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
|
||||
}
|
||||
|
||||
// toLowerDeferredCopy lower-cases s much like bytes.ToLower(), only it
// reuses (overwrites) the original byte array when possible.
//
// NOTE: it's possible that the lower-case form of a rune has a different
// utf-8 encoded length than the original rune:
//   - when it is wider, a new byte array is allocated and returned;
//   - when it is narrower, the remaining output is shifted down inside s
//     and the returned slice is shorter than s.
//
// Bytes that do not decode as valid utf-8 are passed through unchanged on
// the in-place path.
func toLowerDeferredCopy(s []byte) []byte {
	// i is the read offset, j the write offset. On the in-place path a
	// replacement is never wider than the original, so j <= i holds.
	j := 0
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}

		l := unicode.ToLower(r)

		// If the rune is already lowercased it needs no re-encoding.
		if l == r {
			// An earlier rune may have shrunk when lower-cased, leaving
			// the write offset behind the read offset; in that case the
			// unchanged bytes still have to be moved down. copy handles
			// the overlapping ranges correctly.
			if j != i {
				copy(s[j:], s[i:i+wid])
			}
			i += wid
			j += wid
			continue
		}

		// Handles the Unicode edge-case where the last
		// rune in a word on the greek Σ needs to be converted
		// differently.
		if l == 'σ' && i+2 == len(s) {
			l = 'ς'
		}

		lwid := utf8.RuneLen(l)
		if lwid > wid {
			// utf-8 encoded replacement is wider
			// for now, punt and defer
			// to bytes.ToLower() for the remainder
			// only known to happen with chars
			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			rest := bytes.ToLower(s[i:])
			rv := make([]byte, j+len(rest))
			copy(rv[:j], s[:j])
			copy(rv[j:], rest)
			return rv
		}

		utf8.EncodeRune(s[j:], l)
		i += wid
		j += lwid
	}
	return s[:j]
}
|
53
vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go
generated
vendored
Normal file
53
vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go
generated
vendored
Normal file
|
@ -0,0 +1,53 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package porter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/go-porterstemmer"
|
||||
)
|
||||
|
||||
// Name is the name under which the Porter stemmer token filter is registered.
const Name = "stemmer_porter"
|
||||
|
||||
// PorterStemmer is a token filter that stems token terms with the Porter
// stemmer. It carries no state.
type PorterStemmer struct{}
|
||||
|
||||
func NewPorterStemmer() *PorterStemmer {
|
||||
return &PorterStemmer{}
|
||||
}
|
||||
|
||||
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if it is not a protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
termRunes := bytes.Runes(token.Term)
|
||||
stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
|
||||
token.Term = analysis.BuildTermFromRunes(stemmedRunes)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewPorterStemmer(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(Name, PorterStemmerConstructor)
|
||||
}
|
70
vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go
generated
vendored
Normal file
70
vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go
generated
vendored
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package stop implements a TokenFilter removing tokens found in
|
||||
// a TokenMap.
|
||||
//
|
||||
// Its constructor takes the following arguments:
|
||||
//
|
||||
// "stop_token_map" (string): the name of the token map identifying tokens to
|
||||
// remove.
|
||||
package stop
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name under which the stop tokens filter is registered.
const Name = "stop_tokens"
|
||||
|
||||
type StopTokensFilter struct {
|
||||
stopTokens analysis.TokenMap
|
||||
}
|
||||
|
||||
func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
|
||||
return &StopTokensFilter{
|
||||
stopTokens: stopTokens,
|
||||
}
|
||||
}
|
||||
|
||||
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
j := 0
|
||||
for _, token := range input {
|
||||
_, isStopToken := f.stopTokens[string(token.Term)]
|
||||
if !isStopToken {
|
||||
input[j] = token
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
return input[:j]
|
||||
}
|
||||
|
||||
func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
stopTokenMapName, ok := config["stop_token_map"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify stop_token_map")
|
||||
}
|
||||
stopTokenMap, err := cache.TokenMapNamed(stopTokenMapName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building stop words filter: %v", err)
|
||||
}
|
||||
return NewStopTokensFilter(stopTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(Name, StopTokensFilterConstructor)
|
||||
}
|
79
vendor/github.com/blevesearch/bleve/v2/analysis/token/unicodenorm/unicodenorm.go
generated
vendored
Normal file
79
vendor/github.com/blevesearch/bleve/v2/analysis/token/unicodenorm/unicodenorm.go
generated
vendored
Normal file
|
@ -0,0 +1,79 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unicodenorm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// Name is the name under which the unicode normalization filter is registered.
const Name = "normalize_unicode"
|
||||
|
||||
// Names of the normalization forms accepted by NewUnicodeNormalizeFilter.
const (
	NFC  = "nfc"
	NFD  = "nfd"
	NFKC = "nfkc"
	NFKD = "nfkd"
)
|
||||
|
||||
var forms = map[string]norm.Form{
|
||||
NFC: norm.NFC,
|
||||
NFD: norm.NFD,
|
||||
NFKC: norm.NFKC,
|
||||
NFKD: norm.NFKD,
|
||||
}
|
||||
|
||||
type UnicodeNormalizeFilter struct {
|
||||
form norm.Form
|
||||
}
|
||||
|
||||
func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) {
|
||||
form, ok := forms[formName]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no form named %s", formName)
|
||||
}
|
||||
return &UnicodeNormalizeFilter{
|
||||
form: form,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
|
||||
filter, err := NewUnicodeNormalizeFilter(formName)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return filter
|
||||
}
|
||||
|
||||
func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = s.form.Bytes(token.Term)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
formVal, ok := config["form"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify form")
|
||||
}
|
||||
form := formVal
|
||||
return NewUnicodeNormalizeFilter(form)
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue