forked from forgejo/forgejo
Search bar for issues/pulls (#530)
This commit is contained in:
parent
8bc431952f
commit
833f8b94c2
195 changed files with 221830 additions and 60 deletions
202
vendor/github.com/blevesearch/segment/LICENSE
generated
vendored
Normal file
202
vendor/github.com/blevesearch/segment/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
92
vendor/github.com/blevesearch/segment/README.md
generated
vendored
Normal file
92
vendor/github.com/blevesearch/segment/README.md
generated
vendored
Normal file
|
@ -0,0 +1,92 @@
|
|||
# segment
|
||||
|
||||
A Go library for performing Unicode Text Segmentation
|
||||
as described in [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/)
|
||||
|
||||
## Features
|
||||
|
||||
* Currently only segmentation at Word Boundaries is supported.
|
||||
|
||||
## License
|
||||
|
||||
Apache License Version 2.0
|
||||
|
||||
## Usage
|
||||
|
||||
The functionality is exposed in two ways:
|
||||
|
||||
1. You can use a bufio.Scanner with the SplitWords implementation of SplitFunc.
|
||||
The SplitWords function will identify the appropriate word boundaries in the input
|
||||
text and the Scanner will return tokens at the appropriate place.
|
||||
|
||||
scanner := bufio.NewScanner(...)
|
||||
scanner.Split(segment.SplitWords)
|
||||
for scanner.Scan() {
|
||||
tokenBytes := scanner.Bytes()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
2. Sometimes you would also like information returned about the type of token.
|
||||
To do this we have introduced a new type named Segmenter. It works just like Scanner
|
||||
but additionally a token type is returned.
|
||||
|
||||
segmenter := segment.NewWordSegmenter(...)
|
||||
for segmenter.Segment() {
|
||||
tokenBytes := segmenter.Bytes()
|
||||
tokenType := segmenter.Type()
|
||||
}
|
||||
if err := segmenter.Err(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
## Choosing Implementation
|
||||
|
||||
By default segment does NOT use the fastest runtime implementation. The reason is that it adds approximately 5s to compilation time and may require more than 1GB of ram on the machine performing compilation.
|
||||
|
||||
However, you can choose to build with the fastest runtime implementation by passing the build tag as follows:
|
||||
|
||||
-tags 'prod'
|
||||
|
||||
## Generating Code
|
||||
|
||||
Several components in this package are generated.
|
||||
|
||||
1. Several Ragel rules files are generated from Unicode properties files.
|
||||
2. Ragel machine is generated from the Ragel rules.
|
||||
3. Test tables are generated from the Unicode test files.
|
||||
|
||||
All of these can be generated by running:
|
||||
|
||||
go generate
|
||||
|
||||
## Fuzzing
|
||||
|
||||
There is support for fuzzing the segment library with [go-fuzz](https://github.com/dvyukov/go-fuzz).
|
||||
|
||||
1. Install go-fuzz if you haven't already:
|
||||
|
||||
go get github.com/dvyukov/go-fuzz/go-fuzz
|
||||
go get github.com/dvyukov/go-fuzz/go-fuzz-build
|
||||
|
||||
2. Build the package with go-fuzz:
|
||||
|
||||
go-fuzz-build github.com/blevesearch/segment
|
||||
|
||||
3. Convert the Unicode provided test cases into the initial corpus for go-fuzz:
|
||||
|
||||
go test -v -run=TestGenerateWordSegmentFuzz -tags gofuzz_generate
|
||||
|
||||
4. Run go-fuzz:
|
||||
|
||||
go-fuzz -bin=segment-fuzz.zip -workdir=workdir
|
||||
|
||||
## Status
|
||||
|
||||
|
||||
[Build Status](https://travis-ci.org/blevesearch/segment)
|
||||
|
||||
[Coverage Status](https://coveralls.io/r/blevesearch/segment?branch=master)
|
||||
|
||||
[GoDoc](https://godoc.org/github.com/blevesearch/segment)
|
45
vendor/github.com/blevesearch/segment/doc.go
generated
vendored
Normal file
45
vendor/github.com/blevesearch/segment/doc.go
generated
vendored
Normal file
|
@ -0,0 +1,45 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
/*
|
||||
Package segment is a library for performing Unicode Text Segmentation
|
||||
as described in Unicode Standard Annex #29 http://www.unicode.org/reports/tr29/
|
||||
|
||||
Currently only segmentation at Word Boundaries is supported.
|
||||
|
||||
The functionality is exposed in two ways:
|
||||
|
||||
1. You can use a bufio.Scanner with the SplitWords implementation of SplitFunc.
|
||||
The SplitWords function will identify the appropriate word boundaries in the input
|
||||
text and the Scanner will return tokens at the appropriate place.
|
||||
|
||||
scanner := bufio.NewScanner(...)
|
||||
scanner.Split(segment.SplitWords)
|
||||
for scanner.Scan() {
|
||||
tokenBytes := scanner.Bytes()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
2. Sometimes you would also like information returned about the type of token.
|
||||
To do this we have introduced a new type named Segmenter. It works just like Scanner
|
||||
but additionally a token type is returned.
|
||||
|
||||
segmenter := segment.NewWordSegmenter(...)
|
||||
for segmenter.Segment() {
|
||||
tokenBytes := segmenter.Bytes()
|
||||
tokenType := segmenter.Type()
|
||||
}
|
||||
if err := segmenter.Err(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
*/
|
||||
package segment
|
219
vendor/github.com/blevesearch/segment/maketesttables.go
generated
vendored
Normal file
219
vendor/github.com/blevesearch/segment/maketesttables.go
generated
vendored
Normal file
|
@ -0,0 +1,219 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
var url = flag.String("url",
|
||||
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
|
||||
"URL of Unicode database directory")
|
||||
var verbose = flag.Bool("verbose",
|
||||
false,
|
||||
"write data to stdout as it is parsed")
|
||||
var localFiles = flag.Bool("local",
|
||||
false,
|
||||
"data files have been copied to the current directory; for debugging only")
|
||||
|
||||
var outputFile = flag.String("output",
|
||||
"",
|
||||
"output file for generated tables; default stdout")
|
||||
|
||||
var output *bufio.Writer
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
setupOutput()
|
||||
|
||||
graphemeTests := make([]test, 0)
|
||||
graphemeComments := make([]string, 0)
|
||||
graphemeTests, graphemeComments = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests, graphemeComments)
|
||||
wordTests := make([]test, 0)
|
||||
wordComments := make([]string, 0)
|
||||
wordTests, wordComments = loadUnicodeData("WordBreakTest.txt", wordTests, wordComments)
|
||||
sentenceTests := make([]test, 0)
|
||||
sentenceComments := make([]string, 0)
|
||||
sentenceTests, sentenceComments = loadUnicodeData("SentenceBreakTest.txt", sentenceTests, sentenceComments)
|
||||
|
||||
fmt.Fprintf(output, fileHeader, *url)
|
||||
generateTestTables("Grapheme", graphemeTests, graphemeComments)
|
||||
generateTestTables("Word", wordTests, wordComments)
|
||||
generateTestTables("Sentence", sentenceTests, sentenceComments)
|
||||
|
||||
flushOutput()
|
||||
}
|
||||
|
||||
// WordBreakProperty.txt has the form:
|
||||
// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
|
||||
// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
|
||||
func openReader(file string) (input io.ReadCloser) {
|
||||
if *localFiles {
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
input = f
|
||||
} else {
|
||||
path := *url + file
|
||||
resp, err := http.Get(path)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
log.Fatal("bad GET status for "+file, resp.Status)
|
||||
}
|
||||
input = resp.Body
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func loadUnicodeData(filename string, tests []test, comments []string) ([]test, []string) {
|
||||
f := openReader(filename)
|
||||
defer f.Close()
|
||||
bufioReader := bufio.NewReader(f)
|
||||
line, err := bufioReader.ReadString('\n')
|
||||
for err == nil {
|
||||
tests, comments = parseLine(line, tests, comments)
|
||||
line, err = bufioReader.ReadString('\n')
|
||||
}
|
||||
// if the err was EOF still need to process last value
|
||||
if err == io.EOF {
|
||||
tests, comments = parseLine(line, tests, comments)
|
||||
}
|
||||
return tests, comments
|
||||
}
|
||||
|
||||
// Markers used by the Unicode break test files.
const (
	comment = "#" // starts a comment that runs to the end of the line
	brk     = "÷" // a boundary is expected at this position
	nbrk    = "×" // no boundary is expected at this position
)

// test is the expected segmentation of one input: each element holds the
// UTF-8 bytes of one segment.
type test [][]byte

// parseLine parses one line of a Unicode break test file.
//
// Blank lines and lines holding only a comment are ignored. Otherwise the
// text before the first "#" is split at every "÷" into segments, each segment
// is split at "×" into hexadecimal code points, and the resulting test is
// appended to tests. The text after the "#" (empty when there is none) is
// appended to comments, so both slices always stay the same length.
//
// If a code point cannot be parsed the problem is logged and the whole line
// is dropped; tests and comments are returned unchanged.
func parseLine(line string, tests []test, comments []string) ([]test, []string) {
	line = strings.TrimSpace(line)
	if len(line) == 0 || strings.HasPrefix(line, comment) {
		return tests, comments
	}

	// Separate the trailing comment from the data. A line without any
	// comment marker gets an empty comment rather than a copy of itself.
	note := ""
	if idx := strings.Index(line, comment); idx >= 0 {
		note = strings.TrimSpace(line[idx+len(comment):])
		line = line[:idx]
	}

	t := make(test, 0)
	for _, piece := range strings.Split(line, brk) {
		piece = strings.TrimSpace(piece)
		if len(piece) == 0 {
			// Leading/trailing break markers produce empty pieces.
			continue
		}
		word := ""
		for _, codePoint := range strings.Split(piece, nbrk) {
			codePoint = strings.TrimSpace(codePoint)
			r, err := strconv.ParseInt(codePoint, 16, 32)
			if err != nil {
				// Report the text that failed to parse; r is zero here.
				log.Printf("err: %v for '%s'", err, codePoint)
				return tests, comments
			}
			// Convert through rune so the integer is encoded as a code
			// point; a direct string(int64) conversion is rejected by vet.
			word += string(rune(r))
		}
		t = append(t, []byte(word))
	}

	return append(tests, t), append(comments, note)
}
|
||||
|
||||
func generateTestTables(prefix string, tests []test, comments []string) {
|
||||
fmt.Fprintf(output, testHeader, prefix)
|
||||
for i, t := range tests {
|
||||
fmt.Fprintf(output, "\t\t{\n")
|
||||
fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
|
||||
fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
|
||||
fmt.Fprintf(output, "\t\t\tcomment: `%s`,\n", comments[i])
|
||||
fmt.Fprintf(output, "\t\t},\n")
|
||||
}
|
||||
fmt.Fprintf(output, "}\n")
|
||||
}
|
||||
|
||||
func generateTest(t test) string {
|
||||
rv := "[][]byte{"
|
||||
for _, te := range t {
|
||||
rv += fmt.Sprintf("%#v,", te)
|
||||
}
|
||||
rv += "}"
|
||||
return rv
|
||||
}
|
||||
|
||||
const fileHeader = `// Generated by running
|
||||
// maketesttables --url=%s
|
||||
// DO NOT EDIT
|
||||
|
||||
package segment
|
||||
`
|
||||
|
||||
const testHeader = `var unicode%sTests = []struct {
|
||||
input []byte
|
||||
output [][]byte
|
||||
comment string
|
||||
}{
|
||||
`
|
||||
|
||||
func setupOutput() {
|
||||
output = bufio.NewWriter(startGofmt())
|
||||
}
|
||||
|
||||
// startGofmt connects output to a gofmt process if -output is set.
|
||||
func startGofmt() io.Writer {
|
||||
if *outputFile == "" {
|
||||
return os.Stdout
|
||||
}
|
||||
stdout, err := os.Create(*outputFile)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// Pipe output to gofmt.
|
||||
gofmt := exec.Command("gofmt")
|
||||
fd, err := gofmt.StdinPipe()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
gofmt.Stdout = stdout
|
||||
gofmt.Stderr = os.Stderr
|
||||
err = gofmt.Start()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
return fd
|
||||
}
|
||||
|
||||
func flushOutput() {
|
||||
err := output.Flush()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
284
vendor/github.com/blevesearch/segment/segment.go
generated
vendored
Normal file
284
vendor/github.com/blevesearch/segment/segment.go
generated
vendored
Normal file
|
@ -0,0 +1,284 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package segment
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
)
|
||||
|
||||
// Autogenerate the following:
|
||||
// 1. Ragel rules from subset of Unicode script properties
|
||||
// 2. Ragel rules from Unicode word segmentation properties
|
||||
// 3. Ragel machine for word segmentation
|
||||
// 4. Test tables from Unicode
|
||||
//
|
||||
// Requires:
|
||||
// 1. Ruby (to generate ragel rules from unicode spec)
|
||||
// 2. Ragel (only v6.9 tested)
|
||||
// 3. sed (to rewrite build tags)
|
||||
//
|
||||
//go:generate ragel/unicode2ragel.rb -u http://www.unicode.org/Public/8.0.0/ucd/Scripts.txt -m SCRIPTS -p Hangul,Han,Hiragana -o ragel/uscript.rl
|
||||
//go:generate ragel/unicode2ragel.rb -u http://www.unicode.org/Public/8.0.0/ucd/auxiliary/WordBreakProperty.txt -m WB -p Double_Quote,Single_Quote,Hebrew_Letter,CR,LF,Newline,Extend,Format,Katakana,ALetter,MidLetter,MidNum,MidNumLet,Numeric,ExtendNumLet,Regional_Indicator -o ragel/uwb.rl
|
||||
//go:generate ragel -T1 -Z segment_words.rl -o segment_words.go
|
||||
//go:generate sed -i "" -e "s/BUILDTAGS/!prod/" segment_words.go
|
||||
//go:generate sed -i "" -e "s/RAGELFLAGS/-T1/" segment_words.go
|
||||
//go:generate ragel -G2 -Z segment_words.rl -o segment_words_prod.go
|
||||
//go:generate sed -i "" -e "s/BUILDTAGS/prod/" segment_words_prod.go
|
||||
//go:generate sed -i "" -e "s/RAGELFLAGS/-G2/" segment_words_prod.go
|
||||
//go:generate go run maketesttables.go -output tables_test.go
|
||||
|
||||
// NewWordSegmenter returns a new Segmenter to read from r.
|
||||
func NewWordSegmenter(r io.Reader) *Segmenter {
|
||||
return NewSegmenter(r)
|
||||
}
|
||||
|
||||
// NewWordSegmenterDirect returns a new Segmenter to work directly with buf.
|
||||
func NewWordSegmenterDirect(buf []byte) *Segmenter {
|
||||
return NewSegmenterDirect(buf)
|
||||
}
|
||||
|
||||
func SplitWords(data []byte, atEOF bool) (int, []byte, error) {
|
||||
advance, token, _, err := SegmentWords(data, atEOF)
|
||||
return advance, token, err
|
||||
}
|
||||
|
||||
func SegmentWords(data []byte, atEOF bool) (int, []byte, int, error) {
|
||||
vals := make([][]byte, 0, 1)
|
||||
types := make([]int, 0, 1)
|
||||
tokens, types, advance, err := segmentWords(data, 1, atEOF, vals, types)
|
||||
if len(tokens) > 0 {
|
||||
return advance, tokens[0], types[0], err
|
||||
}
|
||||
return advance, nil, 0, err
|
||||
}
|
||||
|
||||
func SegmentWordsDirect(data []byte, val [][]byte, types []int) ([][]byte, []int, int, error) {
|
||||
return segmentWords(data, -1, true, val, types)
|
||||
}
|
||||
|
||||
// *** Core Segmenter
|
||||
|
||||
const maxConsecutiveEmptyReads = 100
|
||||
|
||||
// NewSegmenter returns a new Segmenter to read from r.
|
||||
// Defaults to segment using SegmentWords
|
||||
func NewSegmenter(r io.Reader) *Segmenter {
|
||||
return &Segmenter{
|
||||
r: r,
|
||||
segment: SegmentWords,
|
||||
maxTokenSize: MaxScanTokenSize,
|
||||
buf: make([]byte, 4096), // Plausible starting size; needn't be large.
|
||||
}
|
||||
}
|
||||
|
||||
// NewSegmenterDirect returns a new Segmenter to work directly with buf.
|
||||
// Defaults to segment using SegmentWords
|
||||
func NewSegmenterDirect(buf []byte) *Segmenter {
|
||||
return &Segmenter{
|
||||
segment: SegmentWords,
|
||||
maxTokenSize: MaxScanTokenSize,
|
||||
buf: buf,
|
||||
start: 0,
|
||||
end: len(buf),
|
||||
err: io.EOF,
|
||||
}
|
||||
}
|
||||
|
||||
// Segmenter provides a convenient interface for reading data such as
|
||||
// a file of newline-delimited lines of text. Successive calls to
|
||||
// the Segment method will step through the 'tokens' of a file, skipping
|
||||
// the bytes between the tokens. The specification of a token is
|
||||
// defined by a split function of type SplitFunc; the default split
|
||||
// function breaks the input into lines with line termination stripped. Split
|
||||
// functions are defined in this package for scanning a file into
|
||||
// lines, bytes, UTF-8-encoded runes, and space-delimited words. The
|
||||
// client may instead provide a custom split function.
|
||||
//
|
||||
// Segmenting stops unrecoverably at EOF, the first I/O error, or a token too
|
||||
// large to fit in the buffer. When a scan stops, the reader may have
|
||||
// advanced arbitrarily far past the last token. Programs that need more
|
||||
// control over error handling or large tokens, or must run sequential scans
|
||||
// on a reader, should use bufio.Reader instead.
|
||||
//
|
||||
type Segmenter struct {
|
||||
r io.Reader // The reader provided by the client.
|
||||
segment SegmentFunc // The function to split the tokens.
|
||||
maxTokenSize int // Maximum size of a token; modified by tests.
|
||||
token []byte // Last token returned by split.
|
||||
buf []byte // Buffer used as argument to split.
|
||||
start int // First non-processed byte in buf.
|
||||
end int // End of data in buf.
|
||||
typ int // The token type
|
||||
err error // Sticky error.
|
||||
}
|
||||
|
||||
// SegmentFunc is the signature of the segmenting function used to tokenize
// the input. Its arguments are an initial substring of the remaining
// unprocessed data and a flag, atEOF, that reports whether the Reader has no
// more data to give. It returns the number of bytes to advance the input,
// the next token to return to the user, the type of that token, and an
// error, if any.
//
// If the data does not yet hold a complete token, a SegmentFunc can return
// (0, nil, 0, nil) to signal the Segmenter to read more data into the slice
// and try again with a longer slice starting at the same point in the input.
//
// If the returned error is non-nil, segmenting stops and the error is
// returned to the client.
//
// The function is never called with an empty data slice unless atEOF is
// true. If atEOF is true, however, data may be non-empty and, as always,
// holds unprocessed text.
type SegmentFunc func(data []byte, atEOF bool) (advance int, token []byte, segmentType int, err error)
|
||||
|
||||
// Errors returned by Segmenter.
|
||||
var (
|
||||
ErrTooLong = errors.New("bufio.Segmenter: token too long")
|
||||
ErrNegativeAdvance = errors.New("bufio.Segmenter: SplitFunc returns negative advance count")
|
||||
ErrAdvanceTooFar = errors.New("bufio.Segmenter: SplitFunc returns advance count beyond input")
|
||||
)
|
||||
|
||||
// MaxScanTokenSize is the maximum size used to buffer a token. The actual
// maximum token size may be smaller, as the buffer may also need to hold,
// for instance, a newline.
const MaxScanTokenSize = 64 << 10
|
||||
|
||||
// Err returns the first non-EOF error that was encountered by the Segmenter.
|
||||
func (s *Segmenter) Err() error {
|
||||
if s.err == io.EOF {
|
||||
return nil
|
||||
}
|
||||
return s.err
|
||||
}
|
||||
|
||||
// Type returns the segment type most recently reported by the segment
// function during a call to Segment.
func (s *Segmenter) Type() int {
|
||||
return s.typ
|
||||
}
|
||||
|
||||
// Bytes returns the most recent token generated by a call to Segment.
|
||||
// The returned slice is not a copy: its underlying array may be overwritten
|
||||
// by a subsequent call to Segment. It does no allocation.
|
||||
func (s *Segmenter) Bytes() []byte {
|
||||
return s.token
|
||||
}
|
||||
|
||||
// Text returns the most recent token generated by a call to Segment
|
||||
// as a newly allocated string holding a copy of its bytes.
|
||||
func (s *Segmenter) Text() string {
|
||||
return string(s.token)
|
||||
}
|
||||
|
||||
// Segment advances the Segmenter to the next token, which will then be
|
||||
// available through the Bytes or Text method. It returns false when the
|
||||
// scan stops, either by reaching the end of the input or an error.
|
||||
// After Segment returns false, the Err method will return any error that
|
||||
// occurred during scanning, except that if it was io.EOF, Err
|
||||
// will return nil.
|
||||
func (s *Segmenter) Segment() bool {
|
||||
// Loop until we have a token.
|
||||
for {
|
||||
// See if we can get a token with what we already have.
|
||||
if s.end > s.start {
|
||||
advance, token, typ, err := s.segment(s.buf[s.start:s.end], s.err != nil)
|
||||
if err != nil {
|
||||
s.setErr(err)
|
||||
return false
|
||||
}
|
||||
s.typ = typ
|
||||
if !s.advance(advance) {
|
||||
return false
|
||||
}
|
||||
s.token = token
|
||||
if token != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
// We cannot generate a token with what we are holding.
|
||||
// If we've already hit EOF or an I/O error, we are done.
|
||||
if s.err != nil {
|
||||
// Shut it down.
|
||||
s.start = 0
|
||||
s.end = 0
|
||||
return false
|
||||
}
|
||||
// Must read more data.
|
||||
// First, shift data to beginning of buffer if there's lots of empty space
|
||||
// or space is needed.
|
||||
if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
|
||||
copy(s.buf, s.buf[s.start:s.end])
|
||||
s.end -= s.start
|
||||
s.start = 0
|
||||
}
|
||||
// Is the buffer full? If so, resize.
|
||||
if s.end == len(s.buf) {
|
||||
if len(s.buf) >= s.maxTokenSize {
|
||||
s.setErr(ErrTooLong)
|
||||
return false
|
||||
}
|
||||
newSize := len(s.buf) * 2
|
||||
if newSize > s.maxTokenSize {
|
||||
newSize = s.maxTokenSize
|
||||
}
|
||||
newBuf := make([]byte, newSize)
|
||||
copy(newBuf, s.buf[s.start:s.end])
|
||||
s.buf = newBuf
|
||||
s.end -= s.start
|
||||
s.start = 0
|
||||
continue
|
||||
}
|
||||
// Finally we can read some input. Make sure we don't get stuck with
|
||||
// a misbehaving Reader. Officially we don't need to do this, but let's
|
||||
// be extra careful: Segmenter is for safe, simple jobs.
|
||||
for loop := 0; ; {
|
||||
n, err := s.r.Read(s.buf[s.end:len(s.buf)])
|
||||
s.end += n
|
||||
if err != nil {
|
||||
s.setErr(err)
|
||||
break
|
||||
}
|
||||
if n > 0 {
|
||||
break
|
||||
}
|
||||
loop++
|
||||
if loop > maxConsecutiveEmptyReads {
|
||||
s.setErr(io.ErrNoProgress)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// advance consumes n bytes of the buffer. It reports whether the advance was legal.
|
||||
func (s *Segmenter) advance(n int) bool {
|
||||
if n < 0 {
|
||||
s.setErr(ErrNegativeAdvance)
|
||||
return false
|
||||
}
|
||||
if n > s.end-s.start {
|
||||
s.setErr(ErrAdvanceTooFar)
|
||||
return false
|
||||
}
|
||||
s.start += n
|
||||
return true
|
||||
}
|
||||
|
||||
// setErr records the first error encountered.
|
||||
func (s *Segmenter) setErr(err error) {
|
||||
if s.err == nil || s.err == io.EOF {
|
||||
s.err = err
|
||||
}
|
||||
}
|
||||
|
||||
// SetSegmenter sets the segment function that Segment invokes on the buffered
|
||||
// input. If called, it must be called before Segment.
|
||||
func (s *Segmenter) SetSegmenter(segmenter SegmentFunc) {
|
||||
s.segment = segmenter
|
||||
}
|
22
vendor/github.com/blevesearch/segment/segment_fuzz.go
generated
vendored
Normal file
22
vendor/github.com/blevesearch/segment/segment_fuzz.go
generated
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build gofuzz
|
||||
|
||||
package segment
|
||||
|
||||
func Fuzz(data []byte) int {
|
||||
|
||||
vals := make([][]byte, 0, 10000)
|
||||
types := make([]int, 0, 10000)
|
||||
if _, _, _, err := SegmentWordsDirect(data, vals, types); err != nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
}
|
19542
vendor/github.com/blevesearch/segment/segment_words.go
generated
vendored
Normal file
19542
vendor/github.com/blevesearch/segment/segment_words.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
285
vendor/github.com/blevesearch/segment/segment_words.rl
generated
vendored
Normal file
285
vendor/github.com/blevesearch/segment/segment_words.rl
generated
vendored
Normal file
|
@ -0,0 +1,285 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build BUILDTAGS
|
||||
|
||||
package segment
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// RagelFlags holds the literal placeholder "RAGELFLAGS" in this template.
// NOTE(review): presumably replaced with the real ragel flags when the Go
// source is generated from this file - confirm against the build script.
var RagelFlags = "RAGELFLAGS"
|
||||
|
||||
// ParseError is returned by segmentWords when the scanner stops in a
// non-final state.
var ParseError = fmt.Errorf("unicode word segmentation parse error")
|
||||
|
||||
// Word Types
|
||||
// These values are appended to the types slice by the finish* actions in
// segmentWords, one per emitted token.
const (
|
||||
// None is emitted by finishNoneToken (regional indicators, CR/LF/newline,
// extend/format runs and any other input).
None = iota
|
||||
// Number is emitted by finishNumericToken.
Number
|
||||
// Letter is emitted by finishWordToken and finishHangulToken.
Letter
|
||||
// NOTE(review): no action visible in this file emits Kana; Katakana and
// Hiragana tokens are tagged Ideo - confirm this is intended.
Kana
|
||||
// Ideo is emitted by finishHanToken, finishHiraganaToken and
// finishKatakanaToken.
Ideo
|
||||
)
|
||||
|
||||
%%{
|
||||
machine s;
|
||||
write data;
|
||||
}%%
|
||||
|
||||
// segmentWords runs the Ragel scanner defined below over data. Each token an
// action emits is appended to val as a sub-slice of data, and its word type
// is appended to types. It returns the val and types slices, the number of
// leading bytes of data covered by the emitted tokens, and ParseError when
// the machine stops in a non-final state. When maxTokens is positive the
// function returns as soon as len(val) reaches maxTokens. The actions also
// return early, without emitting, in the situations commented on below.
func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
|
||||
cs, p, pe := 0, 0, len(data)
|
||||
// Capacity used if val or types has to be allocated here; a negative
// maxTokens falls back to 1000.
// NOTE(review): this local shadows the builtin cap for the rest of the function.
cap := maxTokens
|
||||
if cap < 0 {
|
||||
cap = 1000
|
||||
}
|
||||
if val == nil {
|
||||
val = make([][]byte, 0, cap)
|
||||
}
|
||||
if types == nil {
|
||||
types = make([]int, 0, cap)
|
||||
}
|
||||
|
||||
// added for scanner
|
||||
ts := 0
|
||||
te := 0
|
||||
act := 0
|
||||
eof := pe
|
||||
_ = ts // compiler not happy
|
||||
_ = te
|
||||
_ = act
|
||||
|
||||
// our state
|
||||
// startPos and endPos are the byte offsets of the first and last byte of the
// current token; totalConsumed is the offset just past the last emitted token.
startPos := 0
|
||||
endPos := 0
|
||||
totalConsumed := 0
|
||||
%%{
|
||||
|
||||
include SCRIPTS "ragel/uscript.rl";
|
||||
include WB "ragel/uwb.rl";
|
||||
|
||||
action startToken {
|
||||
startPos = p
|
||||
}
|
||||
|
||||
action endToken {
|
||||
endPos = p
|
||||
}
|
||||
|
||||
action finishNumericToken {
|
||||
// Unless atEOF is set, return what has been collected so far without
// emitting this token.
if !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, Number)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishHangulToken {
|
||||
// Do not emit when the token ends exactly at the end of data and atEOF is
// false, or when the bytes following it are not a valid UTF-8 encoding.
if endPos+1 == pe && !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, Letter)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishKatakanaToken {
|
||||
// Same end-of-data and invalid-encoding guards as finishHangulToken.
if endPos+1 == pe && !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
// NOTE(review): Katakana tokens are tagged Ideo here, not Kana - confirm this is intended.
types = append(types, Ideo)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishWordToken {
|
||||
// Unless atEOF is set, return what has been collected so far without
// emitting this token.
if !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, Letter)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishHanToken {
|
||||
// Same end-of-data and invalid-encoding guards as finishHangulToken.
if endPos+1 == pe && !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, Ideo)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishHiraganaToken {
|
||||
// Same end-of-data and invalid-encoding guards as finishHangulToken.
if endPos+1 == pe && !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, Ideo)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
action finishNoneToken {
|
||||
// Step forward from startPos one decoded rune at a time until past endPos,
// then move endPos (and the scanner position p) to the last byte reached.
lastPos := startPos
|
||||
for lastPos <= endPos {
|
||||
_, size := utf8.DecodeRune(data[lastPos:])
|
||||
lastPos += size
|
||||
}
|
||||
endPos = lastPos -1
|
||||
p = endPos
|
||||
|
||||
if endPos+1 == pe && !atEOF {
|
||||
return val, types, totalConsumed, nil
|
||||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
// otherwise, consume this as well
|
||||
val = append(val, data[startPos:endPos+1])
|
||||
types = append(types, None)
|
||||
totalConsumed = endPos+1
|
||||
if maxTokens > 0 && len(val) >= maxTokens {
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
||||
}
|
||||
|
||||
HangulEx = Hangul ( Extend | Format )*;
|
||||
HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
|
||||
NumericEx = Numeric ( Extend | Format )*;
|
||||
KatakanaEx = Katakana ( Extend | Format )*;
|
||||
MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
|
||||
MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
|
||||
ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
|
||||
HanEx = Han ( Extend | Format )*;
|
||||
HiraganaEx = Hiragana ( Extend | Format )*;
|
||||
SingleQuoteEx = Single_Quote ( Extend | Format )*;
|
||||
DoubleQuoteEx = Double_Quote ( Extend | Format )*;
|
||||
HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
|
||||
RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
|
||||
NLCRLF = Newline | CR | LF;
|
||||
OtherEx = ^(NLCRLF) ( Extend | Format )* ;
|
||||
|
||||
# UAX#29 WB8. Numeric × Numeric
|
||||
# WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
|
||||
# WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
|
||||
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||
#
|
||||
WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
|
||||
|
||||
# subset of the below for typing purposes only!
|
||||
WordHangul = ( HangulEx )+ >startToken @endToken;
|
||||
WordKatakana = ( KatakanaEx )+ >startToken @endToken;
|
||||
|
||||
# UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
|
||||
# WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
|
||||
# WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
|
||||
# WB7a. Hebrew_Letter × Single_Quote
|
||||
# WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
||||
# WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
||||
# WB9. (ALetter | Hebrew_Letter) × Numeric
|
||||
# WB10. Numeric × (ALetter | Hebrew_Letter)
|
||||
# WB13. Katakana × Katakana
|
||||
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||
#
|
||||
# Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
|
||||
#
|
||||
Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
|
||||
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
|
||||
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
|
||||
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
|
||||
|ExtendNumLetEx
|
||||
)+
|
||||
)
|
||||
(
|
||||
( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
|
||||
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
|
||||
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
|
||||
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
|
||||
)+
|
||||
)
|
||||
)* ExtendNumLetEx*) >startToken @endToken;
|
||||
|
||||
# UAX#29 WB14. Any ÷ Any
|
||||
WordHan = HanEx >startToken @endToken;
|
||||
WordHiragana = HiraganaEx >startToken @endToken;
|
||||
|
||||
WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
|
||||
|
||||
WordCRLF = (CR LF) >startToken @endToken;
|
||||
|
||||
WordCR = CR >startToken @endToken;
|
||||
|
||||
WordLF = LF >startToken @endToken;
|
||||
|
||||
WordNL = Newline >startToken @endToken;
|
||||
|
||||
WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
|
||||
|
||||
Other = OtherEx >startToken @endToken;
|
||||
|
||||
main := |*
|
||||
WordNumeric => finishNumericToken;
|
||||
WordHangul => finishHangulToken;
|
||||
WordKatakana => finishKatakanaToken;
|
||||
Word => finishWordToken;
|
||||
WordHan => finishHanToken;
|
||||
WordHiragana => finishHiraganaToken;
|
||||
WordRegional =>finishNoneToken;
|
||||
WordCRLF => finishNoneToken;
|
||||
WordCR => finishNoneToken;
|
||||
WordLF => finishNoneToken;
|
||||
WordNL => finishNoneToken;
|
||||
WordExt => finishNoneToken;
|
||||
Other => finishNoneToken;
|
||||
*|;
|
||||
|
||||
write init;
|
||||
write exec;
|
||||
}%%
|
||||
|
||||
// A state below s_first_final means the scanner did not stop in a final state.
if cs < s_first_final {
|
||||
return val, types, totalConsumed, ParseError
|
||||
}
|
||||
|
||||
return val, types, totalConsumed, nil
|
||||
}
|
173643
vendor/github.com/blevesearch/segment/segment_words_prod.go
generated
vendored
Normal file
173643
vendor/github.com/blevesearch/segment/segment_words_prod.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue