forked from forgejo/forgejo
parent
b50dee5a61
commit
9591185c8f
180 changed files with 43400 additions and 41105 deletions
203
vendor/github.com/couchbase/vellum/levenshtein/LICENSE
generated
vendored
Normal file
203
vendor/github.com/couchbase/vellum/levenshtein/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,203 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
33
vendor/github.com/couchbase/vellum/levenshtein/README.md
generated
vendored
Normal file
33
vendor/github.com/couchbase/vellum/levenshtein/README.md
generated
vendored
Normal file
|
@ -0,0 +1,33 @@
|
|||
# levenshtein
|
||||
levenshtein automaton
|
||||
|
||||
This package makes it fast and simple to build a finite determinic automaton that computes the levenshtein distance from a given string.
|
||||
|
||||
# Sample usage:
|
||||
|
||||
```
|
||||
// build a re-usable builder
|
||||
lb := NewLevenshteinAutomatonBuilder(2, false)
|
||||
|
||||
origTerm := "couchbasefts"
|
||||
dfa := lb.BuildDfa("couchbases", 2)
|
||||
ed := dfa.eval([]byte(origTerm))
|
||||
if ed.distance() != 2 {
|
||||
log.Errorf("expected distance 2, actual: %d", ed.distance())
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
This implementation is inspired by [blog post](https://fulmicoton.com/posts/levenshtein/) and is intended to be
|
||||
a port of original rust implementation: https://github.com/tantivy-search/levenshtein-automata
|
||||
|
||||
|
||||
Micro Benchmark Results against the current vellum/levenshtein is as below.
|
||||
|
||||
```
|
||||
BenchmarkNewEditDistance1-8 30000 52684 ns/op 89985 B/op 295 allocs/op
|
||||
BenchmarkOlderEditDistance1-8 10000 132931 ns/op 588892 B/op 363 allocs/op
|
||||
|
||||
BenchmarkNewEditDistance2-8 10000 199127 ns/op 377532 B/op 1019 allocs/op
|
||||
BenchmarkOlderEditDistance2-8 2000 988109 ns/op 4236609 B/op 1898 allocs/op
|
||||
```
|
125
vendor/github.com/couchbase/vellum/levenshtein/alphabet.go
generated
vendored
Normal file
125
vendor/github.com/couchbase/vellum/levenshtein/alphabet.go
generated
vendored
Normal file
|
@ -0,0 +1,125 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levenshtein
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type FullCharacteristicVector []uint32
|
||||
|
||||
func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 {
|
||||
bucketID := offset / 32
|
||||
align := offset - bucketID*32
|
||||
if align == 0 {
|
||||
return fcv[bucketID] & mask
|
||||
}
|
||||
left := fcv[bucketID] >> align
|
||||
right := fcv[bucketID+1] << (32 - align)
|
||||
return (left | right) & mask
|
||||
}
|
||||
|
||||
type tuple struct {
|
||||
char rune
|
||||
fcv FullCharacteristicVector
|
||||
}
|
||||
|
||||
type sortRunes []rune
|
||||
|
||||
func (s sortRunes) Less(i, j int) bool {
|
||||
return s[i] < s[j]
|
||||
}
|
||||
|
||||
func (s sortRunes) Swap(i, j int) {
|
||||
s[i], s[j] = s[j], s[i]
|
||||
}
|
||||
|
||||
func (s sortRunes) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func sortRune(r []rune) []rune {
|
||||
sort.Sort(sortRunes(r))
|
||||
return r
|
||||
}
|
||||
|
||||
type Alphabet struct {
|
||||
charset []tuple
|
||||
index uint32
|
||||
}
|
||||
|
||||
func (a *Alphabet) resetNext() {
|
||||
a.index = 0
|
||||
}
|
||||
|
||||
func (a *Alphabet) next() (rune, FullCharacteristicVector, error) {
|
||||
if int(a.index) >= len(a.charset) {
|
||||
return 0, nil, fmt.Errorf("eof")
|
||||
}
|
||||
|
||||
rv := a.charset[a.index]
|
||||
a.index++
|
||||
return rv.char, rv.fcv, nil
|
||||
}
|
||||
|
||||
func dedupe(in string) string {
|
||||
lookUp := make(map[rune]struct{}, len(in))
|
||||
var rv string
|
||||
for len(in) > 0 {
|
||||
r, size := utf8.DecodeRuneInString(in)
|
||||
in = in[size:]
|
||||
if _, ok := lookUp[r]; !ok {
|
||||
rv += string(r)
|
||||
lookUp[r] = struct{}{}
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func queryChars(qChars string) Alphabet {
|
||||
chars := dedupe(qChars)
|
||||
inChars := sortRune([]rune(chars))
|
||||
charsets := make([]tuple, 0, len(inChars))
|
||||
|
||||
for _, c := range inChars {
|
||||
tempChars := qChars
|
||||
var bits []uint32
|
||||
for len(tempChars) > 0 {
|
||||
var chunk string
|
||||
if len(tempChars) > 32 {
|
||||
chunk = tempChars[0:32]
|
||||
tempChars = tempChars[32:]
|
||||
} else {
|
||||
chunk = tempChars
|
||||
tempChars = tempChars[:0]
|
||||
}
|
||||
|
||||
chunkBits := uint32(0)
|
||||
bit := uint32(1)
|
||||
for _, chr := range chunk {
|
||||
if chr == c {
|
||||
chunkBits |= bit
|
||||
}
|
||||
bit <<= 1
|
||||
}
|
||||
bits = append(bits, chunkBits)
|
||||
}
|
||||
bits = append(bits, 0)
|
||||
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)})
|
||||
}
|
||||
return Alphabet{charset: charsets}
|
||||
}
|
250
vendor/github.com/couchbase/vellum/levenshtein/dfa.go
generated
vendored
Normal file
250
vendor/github.com/couchbase/vellum/levenshtein/dfa.go
generated
vendored
Normal file
|
@ -0,0 +1,250 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levenshtein
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
const SinkState = uint32(0)
|
||||
|
||||
type DFA struct {
|
||||
transitions [][256]uint32
|
||||
distances []Distance
|
||||
initState int
|
||||
ed uint8
|
||||
}
|
||||
|
||||
/// Returns the initial state
|
||||
func (d *DFA) initialState() int {
|
||||
return d.initState
|
||||
}
|
||||
|
||||
/// Returns the Levenshtein distance associated to the
|
||||
/// current state.
|
||||
func (d *DFA) distance(stateId int) Distance {
|
||||
return d.distances[stateId]
|
||||
}
|
||||
|
||||
/// Returns the number of states in the `DFA`.
|
||||
func (d *DFA) numStates() int {
|
||||
return len(d.transitions)
|
||||
}
|
||||
|
||||
/// Returns the destination state reached after consuming a given byte.
|
||||
func (d *DFA) transition(fromState int, b uint8) int {
|
||||
return int(d.transitions[fromState][b])
|
||||
}
|
||||
|
||||
func (d *DFA) eval(bytes []uint8) Distance {
|
||||
state := d.initialState()
|
||||
|
||||
for _, b := range bytes {
|
||||
state = d.transition(state, b)
|
||||
}
|
||||
|
||||
return d.distance(state)
|
||||
}
|
||||
|
||||
func (d *DFA) Start() int {
|
||||
return int(d.initialState())
|
||||
}
|
||||
|
||||
func (d *DFA) IsMatch(state int) bool {
|
||||
if _, ok := d.distance(state).(Exact); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (d *DFA) CanMatch(state int) bool {
|
||||
return state > 0 && state < d.numStates()
|
||||
}
|
||||
|
||||
func (d *DFA) Accept(state int, b byte) int {
|
||||
return int(d.transition(state, b))
|
||||
}
|
||||
|
||||
// WillAlwaysMatch returns if the specified state will always end in a
|
||||
// matching state.
|
||||
func (d *DFA) WillAlwaysMatch(state int) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func fill(dest []uint32, val uint32) {
|
||||
for i := range dest {
|
||||
dest[i] = val
|
||||
}
|
||||
}
|
||||
|
||||
func fillTransitions(dest *[256]uint32, val uint32) {
|
||||
for i := range dest {
|
||||
dest[i] = val
|
||||
}
|
||||
}
|
||||
|
||||
type Utf8DFAStateBuilder struct {
|
||||
dfaBuilder *Utf8DFABuilder
|
||||
stateID uint32
|
||||
defaultSuccessor []uint32
|
||||
}
|
||||
|
||||
func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8,
|
||||
toStateID uint32) {
|
||||
sb.dfaBuilder.transitions[fromStateID][b] = toStateID
|
||||
}
|
||||
|
||||
func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
|
||||
fromStateID := sb.stateID
|
||||
chars := []byte(string(in))
|
||||
lastByte := chars[len(chars)-1]
|
||||
|
||||
for i, ch := range chars[:len(chars)-1] {
|
||||
remNumBytes := len(chars) - i - 1
|
||||
defaultSuccessor := sb.defaultSuccessor[remNumBytes]
|
||||
intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch]
|
||||
|
||||
if intermediateStateID == defaultSuccessor {
|
||||
intermediateStateID = sb.dfaBuilder.allocate()
|
||||
fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID],
|
||||
sb.defaultSuccessor[remNumBytes-1])
|
||||
}
|
||||
|
||||
sb.addTransitionID(fromStateID, ch, intermediateStateID)
|
||||
fromStateID = intermediateStateID
|
||||
}
|
||||
|
||||
toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID))
|
||||
sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded)
|
||||
}
|
||||
|
||||
type Utf8StateId uint32
|
||||
|
||||
func original(stateId uint32) Utf8StateId {
|
||||
return predecessor(stateId, 0)
|
||||
}
|
||||
|
||||
func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
|
||||
return Utf8StateId(stateId*4 + uint32(numSteps))
|
||||
}
|
||||
|
||||
// Utf8DFABuilder makes it possible to define a DFA
|
||||
// that takes unicode character, and build a `DFA`
|
||||
// that operates on utf-8 encoded
|
||||
type Utf8DFABuilder struct {
|
||||
index []uint32
|
||||
distances []Distance
|
||||
transitions [][256]uint32
|
||||
initialState uint32
|
||||
numStates uint32
|
||||
maxNumStates uint32
|
||||
}
|
||||
|
||||
func withMaxStates(maxStates uint32) *Utf8DFABuilder {
|
||||
rv := &Utf8DFABuilder{
|
||||
index: make([]uint32, maxStates*2+100),
|
||||
distances: make([]Distance, 0, maxStates),
|
||||
transitions: make([][256]uint32, 0, maxStates),
|
||||
maxNumStates: maxStates,
|
||||
}
|
||||
|
||||
for i := range rv.index {
|
||||
rv.index[i] = math.MaxUint32
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (dfab *Utf8DFABuilder) allocate() uint32 {
|
||||
newState := dfab.numStates
|
||||
dfab.numStates++
|
||||
|
||||
dfab.distances = append(dfab.distances, Atleast{d: 255})
|
||||
dfab.transitions = append(dfab.transitions, [256]uint32{})
|
||||
|
||||
return newState
|
||||
}
|
||||
|
||||
func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
|
||||
if int(state) >= cap(dfab.index) {
|
||||
cloneIndex := make([]uint32, int(state)*2)
|
||||
copy(cloneIndex, dfab.index)
|
||||
dfab.index = cloneIndex
|
||||
}
|
||||
if dfab.index[state] != math.MaxUint32 {
|
||||
return dfab.index[state]
|
||||
}
|
||||
|
||||
nstate := dfab.allocate()
|
||||
dfab.index[state] = nstate
|
||||
|
||||
return nstate
|
||||
}
|
||||
|
||||
func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
|
||||
decodedID := dfab.getOrAllocate(original(iState))
|
||||
dfab.initialState = decodedID
|
||||
}
|
||||
|
||||
func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
|
||||
return &DFA{
|
||||
transitions: dfab.transitions,
|
||||
distances: dfab.distances,
|
||||
initState: int(dfab.initialState),
|
||||
ed: ed,
|
||||
}
|
||||
}
|
||||
|
||||
func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
|
||||
distance Distance) (*Utf8DFAStateBuilder, error) {
|
||||
if state > dfab.maxNumStates {
|
||||
return nil, fmt.Errorf("State id is larger than maxNumStates")
|
||||
}
|
||||
|
||||
stateID := dfab.getOrAllocate(original(state))
|
||||
dfab.distances[stateID] = distance
|
||||
|
||||
defaultSuccID := dfab.getOrAllocate(original(default_suc_orig))
|
||||
// creates a chain of states of predecessors of `default_suc_orig`.
|
||||
// Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]`
|
||||
// leads to the `default_suc_orig` state.
|
||||
predecessorStates := []uint32{defaultSuccID,
|
||||
defaultSuccID,
|
||||
defaultSuccID,
|
||||
defaultSuccID}
|
||||
|
||||
for numBytes := uint8(1); numBytes < 4; numBytes++ {
|
||||
predecessorState := predecessor(default_suc_orig, numBytes)
|
||||
predecessorStateID := dfab.getOrAllocate(predecessorState)
|
||||
predecessorStates[numBytes] = predecessorStateID
|
||||
succ := predecessorStates[numBytes-1]
|
||||
fillTransitions(&dfab.transitions[predecessorStateID], succ)
|
||||
}
|
||||
|
||||
// 1-byte encoded chars.
|
||||
fill(dfab.transitions[stateID][0:192], predecessorStates[0])
|
||||
// 2-bytes encoded chars.
|
||||
fill(dfab.transitions[stateID][192:224], predecessorStates[1])
|
||||
// 3-bytes encoded chars.
|
||||
fill(dfab.transitions[stateID][224:240], predecessorStates[2])
|
||||
// 4-bytes encoded chars.
|
||||
fill(dfab.transitions[stateID][240:256], predecessorStates[3])
|
||||
|
||||
return &Utf8DFAStateBuilder{
|
||||
dfaBuilder: dfab,
|
||||
stateID: stateID,
|
||||
defaultSuccessor: predecessorStates}, nil
|
||||
}
|
64
vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go
generated
vendored
Normal file
64
vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go
generated
vendored
Normal file
|
@ -0,0 +1,64 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levenshtein
|
||||
|
||||
import "fmt"
|
||||
|
||||
// StateLimit is the maximum number of states allowed
|
||||
const StateLimit = 10000
|
||||
|
||||
// ErrTooManyStates is returned if you attempt to build a Levenshtein
|
||||
// automaton which requires too many states.
|
||||
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
|
||||
StateLimit)
|
||||
|
||||
// LevenshteinAutomatonBuilder wraps a precomputed
|
||||
// datastructure that allows to produce small (but not minimal) DFA.
|
||||
type LevenshteinAutomatonBuilder struct {
|
||||
pDfa *ParametricDFA
|
||||
}
|
||||
|
||||
// NewLevenshteinAutomatonBuilder creates a
|
||||
// reusable, threadsafe Levenshtein automaton builder.
|
||||
// `maxDistance` - maximum distance considered by the automaton.
|
||||
// `transposition` - assign a distance of 1 for transposition
|
||||
//
|
||||
// Building this automaton builder is computationally intensive.
|
||||
// While it takes only a few milliseconds for `d=2`, it grows
|
||||
// exponentially with `d`. It is only reasonable to `d <= 5`.
|
||||
func NewLevenshteinAutomatonBuilder(maxDistance uint8,
|
||||
transposition bool) (*LevenshteinAutomatonBuilder, error) {
|
||||
lnfa := newLevenshtein(maxDistance, transposition)
|
||||
|
||||
pdfa, err := fromNfa(lnfa)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil
|
||||
}
|
||||
|
||||
// BuildDfa builds the levenshtein automaton for serving
|
||||
// queries with a given edit distance.
|
||||
func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string,
|
||||
fuzziness uint8) (*DFA, error) {
|
||||
return lab.pDfa.buildDfa(query, fuzziness, false)
|
||||
}
|
||||
|
||||
// MaxDistance returns the MaxEdit distance supported by the
|
||||
// LevenshteinAutomatonBuilder builder.
|
||||
func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 {
|
||||
return lab.pDfa.maxDistance
|
||||
}
|
292
vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go
generated
vendored
Normal file
292
vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go
generated
vendored
Normal file
|
@ -0,0 +1,292 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levenshtein
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
)
|
||||
|
||||
/// Levenshtein Distance computed by a Levenshtein Automaton.
|
||||
///
|
||||
/// Levenshtein automata can only compute the exact Levenshtein distance
|
||||
/// up to a given `max_distance`.
|
||||
///
|
||||
/// Over this distance, the automaton will invariably
|
||||
/// return `Distance::AtLeast(max_distance + 1)`.
|
||||
type Distance interface {
|
||||
distance() uint8
|
||||
}
|
||||
|
||||
type Exact struct {
|
||||
d uint8
|
||||
}
|
||||
|
||||
func (e Exact) distance() uint8 {
|
||||
return e.d
|
||||
}
|
||||
|
||||
type Atleast struct {
|
||||
d uint8
|
||||
}
|
||||
|
||||
func (a Atleast) distance() uint8 {
|
||||
return a.d
|
||||
}
|
||||
|
||||
func characteristicVector(query []rune, c rune) uint64 {
|
||||
chi := uint64(0)
|
||||
for i := 0; i < len(query); i++ {
|
||||
if query[i] == c {
|
||||
chi |= 1 << uint64(i)
|
||||
}
|
||||
}
|
||||
return chi
|
||||
}
|
||||
|
||||
type NFAState struct {
|
||||
Offset uint32
|
||||
Distance uint8
|
||||
InTranspose bool
|
||||
}
|
||||
|
||||
type NFAStates []NFAState
|
||||
|
||||
func (ns NFAStates) Len() int {
|
||||
return len(ns)
|
||||
}
|
||||
|
||||
func (ns NFAStates) Less(i, j int) bool {
|
||||
if ns[i].Offset != ns[j].Offset {
|
||||
return ns[i].Offset < ns[j].Offset
|
||||
}
|
||||
|
||||
if ns[i].Distance != ns[j].Distance {
|
||||
return ns[i].Distance < ns[j].Distance
|
||||
}
|
||||
|
||||
return !ns[i].InTranspose && ns[j].InTranspose
|
||||
}
|
||||
|
||||
func (ns NFAStates) Swap(i, j int) {
|
||||
ns[i], ns[j] = ns[j], ns[i]
|
||||
}
|
||||
|
||||
func (ns *NFAState) imply(other NFAState) bool {
|
||||
transposeImply := ns.InTranspose
|
||||
if !other.InTranspose {
|
||||
transposeImply = !other.InTranspose
|
||||
}
|
||||
|
||||
deltaOffset := ns.Offset - other.Offset
|
||||
if ns.Offset < other.Offset {
|
||||
deltaOffset = other.Offset - ns.Offset
|
||||
}
|
||||
|
||||
if transposeImply {
|
||||
return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset)
|
||||
}
|
||||
|
||||
return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset)
|
||||
}
|
||||
|
||||
type MultiState struct {
|
||||
states []NFAState
|
||||
}
|
||||
|
||||
func (ms *MultiState) States() []NFAState {
|
||||
return ms.states
|
||||
}
|
||||
|
||||
func (ms *MultiState) Clear() {
|
||||
ms.states = ms.states[:0]
|
||||
}
|
||||
|
||||
func newMultiState() *MultiState {
|
||||
return &MultiState{states: make([]NFAState, 0)}
|
||||
}
|
||||
|
||||
func (ms *MultiState) normalize() uint32 {
|
||||
minOffset := uint32(math.MaxUint32)
|
||||
|
||||
for _, s := range ms.states {
|
||||
if s.Offset < minOffset {
|
||||
minOffset = s.Offset
|
||||
}
|
||||
}
|
||||
if minOffset == uint32(math.MaxUint32) {
|
||||
minOffset = 0
|
||||
}
|
||||
|
||||
for i := 0; i < len(ms.states); i++ {
|
||||
ms.states[i].Offset -= minOffset
|
||||
}
|
||||
|
||||
sort.Sort(NFAStates(ms.states))
|
||||
|
||||
return minOffset
|
||||
}
|
||||
|
||||
func (ms *MultiState) addStates(nState NFAState) {
|
||||
|
||||
for _, s := range ms.states {
|
||||
if s.imply(nState) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
i := 0
|
||||
for i < len(ms.states) {
|
||||
if nState.imply(ms.states[i]) {
|
||||
ms.states = append(ms.states[:i], ms.states[i+1:]...)
|
||||
} else {
|
||||
i++
|
||||
}
|
||||
}
|
||||
ms.states = append(ms.states, nState)
|
||||
|
||||
}
|
||||
|
||||
func extractBit(bitset uint64, pos uint8) bool {
|
||||
shift := bitset >> pos
|
||||
bit := shift & 1
|
||||
return bit == uint64(1)
|
||||
}
|
||||
|
||||
func dist(left, right uint32) uint32 {
|
||||
if left > right {
|
||||
return left - right
|
||||
}
|
||||
return right - left
|
||||
}
|
||||
|
||||
type LevenshteinNFA struct {
|
||||
mDistance uint8
|
||||
damerau bool
|
||||
}
|
||||
|
||||
func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA {
|
||||
return &LevenshteinNFA{mDistance: maxD,
|
||||
damerau: transposition,
|
||||
}
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) maxDistance() uint8 {
|
||||
return la.mDistance
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) msDiameter() uint8 {
|
||||
return 2*la.mDistance + 1
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) initialStates() *MultiState {
|
||||
ms := MultiState{}
|
||||
nfaState := NFAState{}
|
||||
ms.addStates(nfaState)
|
||||
return &ms
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) multistateDistance(ms *MultiState,
|
||||
queryLen uint32) Distance {
|
||||
minDistance := Atleast{d: la.mDistance + 1}
|
||||
for _, s := range ms.states {
|
||||
t := s.Distance + uint8(dist(queryLen, s.Offset))
|
||||
if t <= uint8(la.mDistance) {
|
||||
if minDistance.distance() > t {
|
||||
minDistance.d = t
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if minDistance.distance() == la.mDistance+1 {
|
||||
return Atleast{d: la.mDistance + 1}
|
||||
}
|
||||
|
||||
return minDistance
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) simpleTransition(state NFAState,
|
||||
symbol uint64, ms *MultiState) {
|
||||
|
||||
if state.Distance < la.mDistance {
|
||||
// insertion
|
||||
ms.addStates(NFAState{Offset: state.Offset,
|
||||
Distance: state.Distance + 1,
|
||||
InTranspose: false})
|
||||
|
||||
// substitution
|
||||
ms.addStates(NFAState{Offset: state.Offset + 1,
|
||||
Distance: state.Distance + 1,
|
||||
InTranspose: false})
|
||||
|
||||
n := la.mDistance + 1 - state.Distance
|
||||
for d := uint8(1); d < n; d++ {
|
||||
if extractBit(symbol, d) {
|
||||
// for d > 0, as many deletion and character match
|
||||
ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d),
|
||||
Distance: state.Distance + d,
|
||||
InTranspose: false})
|
||||
}
|
||||
}
|
||||
|
||||
if la.damerau && extractBit(symbol, 1) {
|
||||
ms.addStates(NFAState{
|
||||
Offset: state.Offset,
|
||||
Distance: state.Distance + 1,
|
||||
InTranspose: true})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if extractBit(symbol, 0) {
|
||||
ms.addStates(NFAState{Offset: state.Offset + 1,
|
||||
Distance: state.Distance,
|
||||
InTranspose: false})
|
||||
}
|
||||
|
||||
if state.InTranspose && extractBit(symbol, 0) {
|
||||
ms.addStates(NFAState{Offset: state.Offset + 2,
|
||||
Distance: state.Distance,
|
||||
InTranspose: false})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) transition(cState *MultiState,
|
||||
dState *MultiState, scv uint64) {
|
||||
dState.Clear()
|
||||
mask := (uint64(1) << la.msDiameter()) - uint64(1)
|
||||
|
||||
for _, state := range cState.states {
|
||||
cv := (scv >> state.Offset) & mask
|
||||
la.simpleTransition(state, cv, dState)
|
||||
}
|
||||
|
||||
sort.Sort(NFAStates(dState.states))
|
||||
}
|
||||
|
||||
func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance {
|
||||
cState := la.initialStates()
|
||||
nState := newMultiState()
|
||||
|
||||
for _, i := range other {
|
||||
nState.Clear()
|
||||
chi := characteristicVector(query, i)
|
||||
la.transition(cState, nState, chi)
|
||||
cState, nState = nState, cState
|
||||
}
|
||||
|
||||
return la.multistateDistance(cState, uint32(len(query)))
|
||||
}
|
349
vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go
generated
vendored
Normal file
349
vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go
generated
vendored
Normal file
|
@ -0,0 +1,349 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levenshtein
|
||||
|
||||
import (
|
||||
"crypto/md5"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
type ParametricState struct {
|
||||
shapeID uint32
|
||||
offset uint32
|
||||
}
|
||||
|
||||
func newParametricState() ParametricState {
|
||||
return ParametricState{}
|
||||
}
|
||||
|
||||
func (ps *ParametricState) isDeadEnd() bool {
|
||||
return ps.shapeID == 0
|
||||
}
|
||||
|
||||
type Transition struct {
|
||||
destShapeID uint32
|
||||
deltaOffset uint32
|
||||
}
|
||||
|
||||
func (t *Transition) apply(state ParametricState) ParametricState {
|
||||
ps := ParametricState{
|
||||
shapeID: t.destShapeID}
|
||||
// don't need any offset if we are in the dead state,
|
||||
// this ensures we have only one dead state.
|
||||
if t.destShapeID != 0 {
|
||||
ps.offset = state.offset + t.deltaOffset
|
||||
}
|
||||
|
||||
return ps
|
||||
}
|
||||
|
||||
type ParametricStateIndex struct {
|
||||
stateIndex []uint32
|
||||
stateQueue []ParametricState
|
||||
numOffsets uint32
|
||||
}
|
||||
|
||||
func newParametricStateIndex(queryLen,
|
||||
numParamState uint32) ParametricStateIndex {
|
||||
numOffsets := queryLen + 1
|
||||
if numParamState == 0 {
|
||||
numParamState = numOffsets
|
||||
}
|
||||
maxNumStates := numParamState * numOffsets
|
||||
psi := ParametricStateIndex{
|
||||
stateIndex: make([]uint32, maxNumStates),
|
||||
stateQueue: make([]ParametricState, 0, 150),
|
||||
numOffsets: numOffsets,
|
||||
}
|
||||
|
||||
for i := uint32(0); i < maxNumStates; i++ {
|
||||
psi.stateIndex[i] = math.MaxUint32
|
||||
}
|
||||
return psi
|
||||
}
|
||||
|
||||
func (psi *ParametricStateIndex) numStates() int {
|
||||
return len(psi.stateQueue)
|
||||
}
|
||||
|
||||
func (psi *ParametricStateIndex) maxNumStates() int {
|
||||
return len(psi.stateIndex)
|
||||
}
|
||||
|
||||
func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
|
||||
return psi.stateQueue[stateID]
|
||||
}
|
||||
|
||||
func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
|
||||
bucket := ps.shapeID*psi.numOffsets + ps.offset
|
||||
if bucket < uint32(len(psi.stateIndex)) &&
|
||||
psi.stateIndex[bucket] != math.MaxUint32 {
|
||||
return psi.stateIndex[bucket]
|
||||
}
|
||||
nState := uint32(len(psi.stateQueue))
|
||||
psi.stateQueue = append(psi.stateQueue, ps)
|
||||
|
||||
psi.stateIndex[bucket] = nState
|
||||
return nState
|
||||
}
|
||||
|
||||
type ParametricDFA struct {
|
||||
distance []uint8
|
||||
transitions []Transition
|
||||
maxDistance uint8
|
||||
transitionStride uint32
|
||||
diameter uint32
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) initialState() ParametricState {
|
||||
return ParametricState{shapeID: 1}
|
||||
}
|
||||
|
||||
// Returns true iff whatever characters come afterward,
|
||||
// we will never reach a shorter distance
|
||||
func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
|
||||
if state.isDeadEnd() {
|
||||
return true
|
||||
}
|
||||
|
||||
remOffset := queryLen - state.offset
|
||||
if remOffset < pdfa.diameter {
|
||||
stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
|
||||
prefixDistance := stateDistances[remOffset]
|
||||
if prefixDistance > pdfa.maxDistance {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, d := range stateDistances {
|
||||
if d < prefixDistance {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) numStates() int {
|
||||
return len(pdfa.transitions) / int(pdfa.transitionStride)
|
||||
}
|
||||
|
||||
func min(x, y uint32) uint32 {
|
||||
if x < y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) transition(state ParametricState,
|
||||
chi uint32) Transition {
|
||||
return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) getDistance(state ParametricState,
|
||||
qLen uint32) Distance {
|
||||
remainingOffset := qLen - state.offset
|
||||
if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
|
||||
return Atleast{d: pdfa.maxDistance + 1}
|
||||
}
|
||||
dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
|
||||
if dist > pdfa.maxDistance {
|
||||
return Atleast{d: dist}
|
||||
}
|
||||
return Exact{d: dist}
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
|
||||
state := pdfa.initialState()
|
||||
leftChars := []rune(left)
|
||||
for _, chr := range []rune(right) {
|
||||
start := state.offset
|
||||
stop := min(start+pdfa.diameter, uint32(len(leftChars)))
|
||||
chi := characteristicVector(leftChars[start:stop], chr)
|
||||
transition := pdfa.transition(state, uint32(chi))
|
||||
state = transition.apply(state)
|
||||
if state.isDeadEnd() {
|
||||
return Atleast{d: pdfa.maxDistance + 1}
|
||||
}
|
||||
}
|
||||
return pdfa.getDistance(state, uint32(len(left)))
|
||||
}
|
||||
|
||||
func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
|
||||
prefix bool) (*DFA, error) {
|
||||
qLen := uint32(len([]rune(query)))
|
||||
alphabet := queryChars(query)
|
||||
|
||||
psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
|
||||
maxNumStates := psi.maxNumStates()
|
||||
deadEndStateID := psi.getOrAllocate(newParametricState())
|
||||
if deadEndStateID != 0 {
|
||||
return nil, fmt.Errorf("Invalid dead end state")
|
||||
}
|
||||
|
||||
initialStateID := psi.getOrAllocate(pdfa.initialState())
|
||||
dfaBuilder := withMaxStates(uint32(maxNumStates))
|
||||
mask := uint32((1 << pdfa.diameter) - 1)
|
||||
|
||||
var stateID int
|
||||
for stateID = 0; stateID < StateLimit; stateID++ {
|
||||
if stateID == psi.numStates() {
|
||||
break
|
||||
}
|
||||
state := psi.get(uint32(stateID))
|
||||
if prefix && pdfa.isPrefixSink(state, qLen) {
|
||||
distance := pdfa.getDistance(state, qLen)
|
||||
dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
|
||||
} else {
|
||||
transition := pdfa.transition(state, 0)
|
||||
defSuccessor := transition.apply(state)
|
||||
defSuccessorID := psi.getOrAllocate(defSuccessor)
|
||||
distance := pdfa.getDistance(state, qLen)
|
||||
stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
|
||||
}
|
||||
|
||||
alphabet.resetNext()
|
||||
chr, cv, err := alphabet.next()
|
||||
for err == nil {
|
||||
chi := cv.shiftAndMask(state.offset, mask)
|
||||
|
||||
transition := pdfa.transition(state, chi)
|
||||
|
||||
destState := transition.apply(state)
|
||||
|
||||
destStateID := psi.getOrAllocate(destState)
|
||||
|
||||
stateBuilder.addTransition(chr, destStateID)
|
||||
|
||||
chr, cv, err = alphabet.next()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if stateID == StateLimit {
|
||||
return nil, ErrTooManyStates
|
||||
}
|
||||
|
||||
dfaBuilder.setInitialState(initialStateID)
|
||||
return dfaBuilder.build(distance), nil
|
||||
}
|
||||
|
||||
func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
|
||||
lookUp := newHash()
|
||||
lookUp.getOrAllocate(*newMultiState())
|
||||
initialState := nfa.initialStates()
|
||||
lookUp.getOrAllocate(*initialState)
|
||||
|
||||
maxDistance := nfa.maxDistance()
|
||||
msDiameter := nfa.msDiameter()
|
||||
|
||||
numChi := 1 << msDiameter
|
||||
chiValues := make([]uint64, numChi)
|
||||
for i := 0; i < numChi; i++ {
|
||||
chiValues[i] = uint64(i)
|
||||
}
|
||||
|
||||
transitions := make([]Transition, 0, numChi*int(msDiameter))
|
||||
var stateID int
|
||||
for stateID = 0; stateID < StateLimit; stateID++ {
|
||||
if stateID == len(lookUp.items) {
|
||||
break
|
||||
}
|
||||
|
||||
for _, chi := range chiValues {
|
||||
destMs := newMultiState()
|
||||
|
||||
ms := lookUp.getFromID(stateID)
|
||||
|
||||
nfa.transition(ms, destMs, chi)
|
||||
|
||||
translation := destMs.normalize()
|
||||
|
||||
destID := lookUp.getOrAllocate(*destMs)
|
||||
|
||||
transitions = append(transitions, Transition{
|
||||
destShapeID: uint32(destID),
|
||||
deltaOffset: translation,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if stateID == StateLimit {
|
||||
return nil, ErrTooManyStates
|
||||
}
|
||||
|
||||
ns := len(lookUp.items)
|
||||
diameter := int(msDiameter)
|
||||
|
||||
distances := make([]uint8, 0, diameter*ns)
|
||||
for stateID := 0; stateID < ns; stateID++ {
|
||||
ms := lookUp.getFromID(stateID)
|
||||
for offset := 0; offset < diameter; offset++ {
|
||||
dist := nfa.multistateDistance(ms, uint32(offset))
|
||||
distances = append(distances, dist.distance())
|
||||
}
|
||||
}
|
||||
|
||||
return &ParametricDFA{
|
||||
diameter: uint32(msDiameter),
|
||||
transitions: transitions,
|
||||
maxDistance: maxDistance,
|
||||
transitionStride: uint32(numChi),
|
||||
distance: distances,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type hash struct {
|
||||
index map[[16]byte]int
|
||||
items []MultiState
|
||||
}
|
||||
|
||||
func newHash() *hash {
|
||||
return &hash{
|
||||
index: make(map[[16]byte]int, 100),
|
||||
items: make([]MultiState, 0, 100),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *hash) getOrAllocate(m MultiState) int {
|
||||
size := len(h.items)
|
||||
var exists bool
|
||||
var pos int
|
||||
md5 := getHash(&m)
|
||||
if pos, exists = h.index[md5]; !exists {
|
||||
h.index[md5] = size
|
||||
pos = size
|
||||
h.items = append(h.items, m)
|
||||
}
|
||||
return pos
|
||||
}
|
||||
|
||||
func (h *hash) getFromID(id int) *MultiState {
|
||||
return &h.items[id]
|
||||
}
|
||||
|
||||
func getHash(ms *MultiState) [16]byte {
|
||||
msBytes := []byte{}
|
||||
for _, state := range ms.states {
|
||||
jsonBytes, _ := json.Marshal(&state)
|
||||
msBytes = append(msBytes, jsonBytes...)
|
||||
}
|
||||
return md5.Sum(msBytes)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue