forked from forgejo/forgejo
Server-side syntax highlighting for all code (#12047)
* Server-side syntax highlighting for all code This PR does a few things: * Remove all traces of highlight.js * Use chroma library to provide fast syntax highlighting directly on the server * Provide syntax highlighting for diffs * Re-style both unified and split diffs views * Add custom syntax highlighting styling for both regular and arc-green Fixes #7729 Fixes #10157 Fixes #11825 Fixes #7728 Fixes #3872 Fixes #3682 And perhaps gets closer to #9553 * fix line marker * fix repo search * Fix single line select * properly load settings * npm uninstall highlight.js * review suggestion * code review * forgot to call function * fix test * Apply suggestions from code review suggestions from @silverwind thanks Co-authored-by: silverwind <me@silverwind.io> * code review * copy/paste error * Use const for highlight size limit * Update web_src/less/_repository.less Co-authored-by: Lauris BH <lauris@nix.lv> * update size limit to 1MB and other styling tweaks * fix highlighting for certain diff sections * fix test * add worker back as suggested Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: Lauris BH <lauris@nix.lv>
This commit is contained in:
parent
ce5f2b9845
commit
af7ffaa279
336 changed files with 37293 additions and 769 deletions
854
vendor/github.com/dlclark/regexp2/syntax/charclass.go
generated
vendored
Normal file
854
vendor/github.com/dlclark/regexp2/syntax/charclass.go
generated
vendored
Normal file
|
@ -0,0 +1,854 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"sort"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// CharSet combines start-end rune ranges and unicode categories representing a set of characters
type CharSet struct {
	ranges     []singleRange // inclusive [first,last] rune ranges
	categories []category    // unicode category/script/property names, plus the special " " / "W" markers
	sub        *CharSet      //optional subtractor
	negate     bool          // true: match the complement of ranges+categories
	anything   bool          // true: set matches every character (categories/ranges degenerate)
}

// category names a unicode category, script, or property (or one of the
// special spaceCategoryText/wordCategoryText markers), optionally negated.
type category struct {
	negate bool
	cat    string
}

// singleRange is an inclusive range of runes [first, last].
type singleRange struct {
	first rune
	last  rune
}
|
||||
|
||||
const (
	// Sentinel category names that stand in for "whitespace" and "word
	// character" instead of real unicode category names.
	spaceCategoryText = " "
	wordCategoryText  = "W"
)

var (
	// ECMA-mode tables encoded as alternating inclusive-start /
	// exclusive-end rune boundaries (decoded by getCharSetFromOldString).
	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
	ecmaWord  = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
	ecmaDigit = []rune{0x0030, 0x003a}
)
|
||||
|
||||
// Prebuilt character-class factories. Each value is a func returning a
// fresh *CharSet copy, so callers may mutate the result safely.
var (
	AnyClass          = getCharSetFromOldString([]rune{0}, false)
	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
	NoneClass         = getCharSetFromOldString(nil, false)
	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

	WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
	NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
	SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
	// NOTE(review): NotDigitClass negates the category (args false, true)
	// while the other Not* classes negate the whole set (true, false) —
	// presumably intentional (\D as negated-Nd category); confirm upstream.
	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
)
|
||||
|
||||
// unicodeCategories merges every known unicode script, general category,
// and property name into one lookup table, built once at package init.
var unicodeCategories = func() map[string]*unicode.RangeTable {
	merged := make(map[string]*unicode.RangeTable,
		len(unicode.Scripts)+len(unicode.Categories)+len(unicode.Properties))
	for _, src := range []map[string]*unicode.RangeTable{
		unicode.Scripts, unicode.Categories, unicode.Properties,
	} {
		for name, tbl := range src {
			merged[name] = tbl
		}
	}
	return merged
}()
|
||||
|
||||
func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
|
||||
if negateCat && negateSet {
|
||||
panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
|
||||
}
|
||||
|
||||
c := CharSet{negate: negateSet}
|
||||
|
||||
c.categories = make([]category, len(cats))
|
||||
for i, cat := range cats {
|
||||
c.categories[i] = category{cat: cat, negate: negateCat}
|
||||
}
|
||||
return func() *CharSet {
|
||||
//make a copy each time
|
||||
local := c
|
||||
//return that address
|
||||
return &local
|
||||
}
|
||||
}
|
||||
|
||||
// getCharSetFromOldString builds a CharSet factory from the legacy
// "boundary list" encoding: setText holds alternating inclusive-start /
// exclusive-end rune boundaries; an odd number of boundaries means the
// final range is open-ended to utf8.MaxRune. negate complements the set
// by shifting the boundaries. The returned closure hands out a fresh
// copy on every call.
func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
	c := CharSet{}
	if len(setText) > 0 {
		fillFirst := false
		l := len(setText)
		if negate {
			if setText[0] == 0 {
				// negating a set that starts at 0: drop the leading
				// boundary instead of prepending one
				setText = setText[1:]
			} else {
				// otherwise prepend an implicit [0, first) range
				l++
				fillFirst = true
			}
		}

		// an odd boundary count means one extra, open-ended range
		if l%2 == 0 {
			c.ranges = make([]singleRange, l/2)
		} else {
			c.ranges = make([]singleRange, l/2+1)
		}

		first := true
		if fillFirst {
			c.ranges[0] = singleRange{first: 0}
			first = false
		}

		i := 0
		for _, r := range setText {
			if first {
				// lower bound in a new range
				c.ranges[i] = singleRange{first: r}
				first = false
			} else {
				// boundaries are exclusive ends; store the inclusive last
				c.ranges[i].last = r - 1
				i++
				first = true
			}
		}
		// an unterminated final range extends to the maximum rune
		if !first {
			c.ranges[i].last = utf8.MaxRune
		}
	}

	return func() *CharSet {
		// shallow copy so callers can't mutate the shared template
		local := c
		return &local
	}
}
|
||||
|
||||
// Copy makes a deep copy to prevent accidental mutation of a set
|
||||
func (c CharSet) Copy() CharSet {
|
||||
ret := CharSet{
|
||||
anything: c.anything,
|
||||
negate: c.negate,
|
||||
}
|
||||
|
||||
ret.ranges = append(ret.ranges, c.ranges...)
|
||||
ret.categories = append(ret.categories, c.categories...)
|
||||
|
||||
if c.sub != nil {
|
||||
sub := c.sub.Copy()
|
||||
ret.sub = &sub
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// gets a human-readable description for a set string
|
||||
func (c CharSet) String() string {
|
||||
buf := &bytes.Buffer{}
|
||||
buf.WriteRune('[')
|
||||
|
||||
if c.IsNegated() {
|
||||
buf.WriteRune('^')
|
||||
}
|
||||
|
||||
for _, r := range c.ranges {
|
||||
|
||||
buf.WriteString(CharDescription(r.first))
|
||||
if r.first != r.last {
|
||||
if r.last-r.first != 1 {
|
||||
//groups that are 1 char apart skip the dash
|
||||
buf.WriteRune('-')
|
||||
}
|
||||
buf.WriteString(CharDescription(r.last))
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range c.categories {
|
||||
buf.WriteString(c.String())
|
||||
}
|
||||
|
||||
if c.sub != nil {
|
||||
buf.WriteRune('-')
|
||||
buf.WriteString(c.sub.String())
|
||||
}
|
||||
|
||||
buf.WriteRune(']')
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// mapHashFill converts a charset into a buffer for use in maps
|
||||
func (c CharSet) mapHashFill(buf *bytes.Buffer) {
|
||||
if c.negate {
|
||||
buf.WriteByte(0)
|
||||
} else {
|
||||
buf.WriteByte(1)
|
||||
}
|
||||
|
||||
binary.Write(buf, binary.LittleEndian, len(c.ranges))
|
||||
binary.Write(buf, binary.LittleEndian, len(c.categories))
|
||||
for _, r := range c.ranges {
|
||||
buf.WriteRune(r.first)
|
||||
buf.WriteRune(r.last)
|
||||
}
|
||||
for _, ct := range c.categories {
|
||||
buf.WriteString(ct.cat)
|
||||
if ct.negate {
|
||||
buf.WriteByte(1)
|
||||
} else {
|
||||
buf.WriteByte(0)
|
||||
}
|
||||
}
|
||||
|
||||
if c.sub != nil {
|
||||
c.sub.mapHashFill(buf)
|
||||
}
|
||||
}
|
||||
|
||||
// CharIn returns true if the rune is in our character set (either ranges or categories).
// It handles negations and subtracted sub-charsets.
func (c CharSet) CharIn(ch rune) bool {
	val := false
	// in s && !s.subtracted

	//check ranges
	for _, r := range c.ranges {
		if ch < r.first {
			continue
		}
		if ch <= r.last {
			val = true
			break
		}
	}

	//check categories if we haven't already found a range
	if !val && len(c.categories) > 0 {
		for _, ct := range c.categories {
			// special categories...then unicode
			if ct.cat == spaceCategoryText {
				if unicode.IsSpace(ch) {
					// we found a space so we're done
					// negate means this is a "bad" thing
					val = !ct.negate
					break
				} else if ct.negate {
					// not a space, and this category wanted non-spaces
					val = true
					break
				}
			} else if ct.cat == wordCategoryText {
				if IsWordChar(ch) {
					val = !ct.negate
					break
				} else if ct.negate {
					val = true
					break
				}
			} else if unicode.Is(unicodeCategories[ct.cat], ch) {
				// if we're in this unicode category then we're done
				// if negate=true on this category then we "failed" our test
				// otherwise we're good that we found it
				val = !ct.negate
				break
			} else if ct.negate {
				val = true
				break
			}
		}
	}

	// negate the whole char set
	if c.negate {
		val = !val
	}

	// get subtracted recurse
	if val && c.sub != nil {
		val = !c.sub.CharIn(ch)
	}

	//log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
	return val
}
|
||||
|
||||
func (c category) String() string {
|
||||
switch c.cat {
|
||||
case spaceCategoryText:
|
||||
if c.negate {
|
||||
return "\\S"
|
||||
}
|
||||
return "\\s"
|
||||
case wordCategoryText:
|
||||
if c.negate {
|
||||
return "\\W"
|
||||
}
|
||||
return "\\w"
|
||||
}
|
||||
if _, ok := unicodeCategories[c.cat]; ok {
|
||||
|
||||
if c.negate {
|
||||
return "\\P{" + c.cat + "}"
|
||||
}
|
||||
return "\\p{" + c.cat + "}"
|
||||
}
|
||||
return "Unknown category: " + c.cat
|
||||
}
|
||||
|
||||
// CharDescription Produces a human-readable description for a single character.
|
||||
func CharDescription(ch rune) string {
|
||||
/*if ch == '\\' {
|
||||
return "\\\\"
|
||||
}
|
||||
|
||||
if ch > ' ' && ch <= '~' {
|
||||
return string(ch)
|
||||
} else if ch == '\n' {
|
||||
return "\\n"
|
||||
} else if ch == ' ' {
|
||||
return "\\ "
|
||||
}*/
|
||||
|
||||
b := &bytes.Buffer{}
|
||||
escape(b, ch, false) //fmt.Sprintf("%U", ch)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// IsWordChar reports whether r is a word character.
//
// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
func IsWordChar(r rune) bool {
	// the two zero-width joiner characters are word chars by fiat
	if r == '\u200C' || r == '\u200D' {
		return true
	}
	//"L", "Mn", "Nd", "Pc"
	return unicode.In(r,
		unicode.Categories["L"], unicode.Categories["Mn"],
		unicode.Categories["Nd"], unicode.Categories["Pc"])
}
|
||||
|
||||
// IsECMAWordChar reports whether r is a word character under ECMA rules:
// categories L, Mn, Nd, Pc (no zero-width joiner special cases).
func IsECMAWordChar(r rune) bool {
	return unicode.In(r,
		unicode.Categories["L"], unicode.Categories["Mn"],
		unicode.Categories["Nd"], unicode.Categories["Pc"])
}
|
||||
|
||||
// SingletonChar will return the char from the first range without validation.
|
||||
// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
|
||||
func (c CharSet) SingletonChar() rune {
|
||||
return c.ranges[0].first
|
||||
}
|
||||
|
||||
func (c CharSet) IsSingleton() bool {
|
||||
return !c.negate && //negated is multiple chars
|
||||
len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
||||
c.sub == nil && // subtraction means we've got multiple chars
|
||||
c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
||||
}
|
||||
|
||||
func (c CharSet) IsSingletonInverse() bool {
|
||||
return c.negate && //same as above, but requires negated
|
||||
len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
||||
c.sub == nil && // subtraction means we've got multiple chars
|
||||
c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
||||
}
|
||||
|
||||
func (c CharSet) IsMergeable() bool {
|
||||
return !c.IsNegated() && !c.HasSubtraction()
|
||||
}
|
||||
|
||||
func (c CharSet) IsNegated() bool {
|
||||
return c.negate
|
||||
}
|
||||
|
||||
func (c CharSet) HasSubtraction() bool {
|
||||
return c.sub != nil
|
||||
}
|
||||
|
||||
func (c CharSet) IsEmpty() bool {
|
||||
return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
|
||||
}
|
||||
|
||||
func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMADigitClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMADigitClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: "Nd", negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *CharSet) addChar(ch rune) {
|
||||
c.addRange(ch, ch)
|
||||
}
|
||||
|
||||
func (c *CharSet) addSpace(ecma, negate bool) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMASpaceClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMASpaceClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: spaceCategoryText, negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *CharSet) addWord(ecma, negate bool) {
|
||||
if ecma {
|
||||
if negate {
|
||||
c.addRanges(NotECMAWordClass().ranges)
|
||||
} else {
|
||||
c.addRanges(ECMAWordClass().ranges)
|
||||
}
|
||||
} else {
|
||||
c.addCategories(category{cat: wordCategoryText, negate: negate})
|
||||
}
|
||||
}
|
||||
|
||||
// Add set ranges and categories into ours -- no deduping or anything
|
||||
func (c *CharSet) addSet(set CharSet) {
|
||||
if c.anything {
|
||||
return
|
||||
}
|
||||
if set.anything {
|
||||
c.makeAnything()
|
||||
return
|
||||
}
|
||||
// just append here to prevent double-canon
|
||||
c.ranges = append(c.ranges, set.ranges...)
|
||||
c.addCategories(set.categories...)
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
func (c *CharSet) makeAnything() {
|
||||
c.anything = true
|
||||
c.categories = []category{}
|
||||
c.ranges = AnyClass().ranges
|
||||
}
|
||||
|
||||
// addCategories merges cats into the set's category list. Duplicates are
// skipped; if the same category arrives with the opposite negation the
// whole set degenerates to "anything".
func (c *CharSet) addCategories(cats ...category) {
	// don't add dupes and remove positive+negative
	if c.anything {
		// if we've had a previous positive+negative group then
		// just return, we're as broad as we can get
		return
	}

	for _, ct := range cats {
		found := false
		for _, ct2 := range c.categories {
			if ct.cat == ct2.cat {
				if ct.negate != ct2.negate {
					// opposite negations...this means we just
					// take us as anything and move on
					c.makeAnything()
					return
				}
				found = true
				break
			}
		}

		if !found {
			c.categories = append(c.categories, ct)
		}
	}
}
|
||||
|
||||
// Merges new ranges to our own
func (c *CharSet) addRanges(ranges []singleRange) {
	if c.anything {
		return
	}
	c.ranges = append(c.ranges, ranges...)
	c.canonicalize()
}

// Merges everything but the new ranges into our own, i.e. adds the
// complement of ranges over [0, utf8.MaxRune].
func (c *CharSet) addNegativeRanges(ranges []singleRange) {
	if c.anything {
		return
	}

	// hi tracks the first rune not yet covered by an incoming range
	var hi rune

	// convert incoming ranges into opposites, assume they are in order
	for _, r := range ranges {
		if hi < r.first {
			// gap before this range becomes part of the complement
			c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
		}
		hi = r.last + 1
	}

	// tail: everything after the last incoming range
	if hi < utf8.MaxRune {
		c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
	}

	c.canonicalize()
}
|
||||
|
||||
func isValidUnicodeCat(catName string) bool {
|
||||
_, ok := unicodeCategories[catName]
|
||||
return ok
|
||||
}
|
||||
|
||||
// addCategory adds the named unicode category/script/property to the set,
// panicking on an unknown name. The pattern parameter is unused here —
// presumably kept for signature parity with callers; TODO confirm.
func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
	if !isValidUnicodeCat(categoryName) {
		// unknown unicode category, script, or property "blah"
		panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))

	}

	if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
		// when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
		c.addCategories(
			category{cat: "Ll", negate: negate},
			category{cat: "Lu", negate: negate},
			category{cat: "Lt", negate: negate})
	}
	// re-adding categoryName after the case-insensitive expansion above is
	// a no-op: addCategories skips duplicates with the same negation
	c.addCategories(category{cat: categoryName, negate: negate})
}
|
||||
|
||||
func (c *CharSet) addSubtraction(sub *CharSet) {
|
||||
c.sub = sub
|
||||
}
|
||||
|
||||
func (c *CharSet) addRange(chMin, chMax rune) {
|
||||
c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
|
||||
c.canonicalize()
|
||||
}
|
||||
|
||||
func (c *CharSet) addNamedASCII(name string, negate bool) bool {
|
||||
var rs []singleRange
|
||||
|
||||
switch name {
|
||||
case "alnum":
|
||||
rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
||||
case "alpha":
|
||||
rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
||||
case "ascii":
|
||||
rs = []singleRange{singleRange{0, 0x7f}}
|
||||
case "blank":
|
||||
rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
|
||||
case "cntrl":
|
||||
rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
|
||||
case "digit":
|
||||
c.addDigit(false, negate, "")
|
||||
case "graph":
|
||||
rs = []singleRange{singleRange{'!', '~'}}
|
||||
case "lower":
|
||||
rs = []singleRange{singleRange{'a', 'z'}}
|
||||
case "print":
|
||||
rs = []singleRange{singleRange{' ', '~'}}
|
||||
case "punct": //[!-/:-@[-`{-~]
|
||||
rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
|
||||
case "space":
|
||||
c.addSpace(true, negate)
|
||||
case "upper":
|
||||
rs = []singleRange{singleRange{'A', 'Z'}}
|
||||
case "word":
|
||||
c.addWord(true, negate)
|
||||
case "xdigit":
|
||||
rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
if len(rs) > 0 {
|
||||
if negate {
|
||||
c.addNegativeRanges(rs)
|
||||
} else {
|
||||
c.addRanges(rs)
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// singleRangeSorter implements sort.Interface to order ranges by their
// starting rune (used by canonicalize).
type singleRangeSorter []singleRange

func (p singleRangeSorter) Len() int           { return len(p) }
func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
func (p singleRangeSorter) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
|
||||
|
||||
// Logic to reduce a character class to a unique, sorted form.
func (c *CharSet) canonicalize() {
	var i, j int
	var last rune

	//
	// Find and eliminate overlapping or abutting ranges
	//

	if len(c.ranges) > 1 {
		sort.Sort(singleRangeSorter(c.ranges))

		done := false

		// i is the read index, j the write index; the inner loop extends
		// c.ranges[j] across every overlapping or abutting range
		for i, j = 1, 0; ; i++ {
			for last = c.ranges[j].last; ; i++ {
				if i == len(c.ranges) || last == utf8.MaxRune {
					done = true
					break
				}

				CurrentRange := c.ranges[i]
				if CurrentRange.first > last+1 {
					// gap found: the merged range at j is complete
					break
				}

				if last < CurrentRange.last {
					last = CurrentRange.last
				}
			}

			c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}

			j++

			if done {
				break
			}

			// start the next merged range from the first unmerged one
			if j < i {
				c.ranges[j] = c.ranges[i]
			}
		}

		// truncate to the j merged ranges (the appended tail is empty,
		// so this is equivalent to c.ranges = c.ranges[:j])
		c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
	}
}
|
||||
|
||||
// Adds to the class any lowercase versions of characters already
// in the class. Used for case-insensitivity.
// Note: single-char ranges are REPLACED by their lowercase form (matching
// is presumably done against lowercased input — TODO confirm against the
// matcher), while multi-char ranges are extended via addLowercaseRange.
func (c *CharSet) addLowercase() {
	if c.anything {
		return
	}
	toAdd := []singleRange{}
	for i := 0; i < len(c.ranges); i++ {
		r := c.ranges[i]
		if r.first == r.last {
			// single character: swap in its lowercase equivalent
			lower := unicode.ToLower(r.first)
			c.ranges[i] = singleRange{first: lower, last: lower}
		} else {
			// defer multi-char ranges; processing them appends to c.ranges
			toAdd = append(toAdd, r)
		}
	}

	for _, r := range toAdd {
		c.addLowercaseRange(r.first, r.last)
	}
	c.canonicalize()
}
|
||||
|
||||
/**************************************************************************
|
||||
Let U be the set of Unicode character values and let L be the lowercase
|
||||
function, mapping from U to U. To perform case insensitive matching of
|
||||
character sets, we need to be able to map an interval I in U, say
|
||||
|
||||
I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
|
||||
|
||||
to a set A such that A contains L(I) and A is contained in the union of
|
||||
I and L(I).
|
||||
|
||||
The table below partitions U into intervals on which L is non-decreasing.
|
||||
Thus, for any interval J = [a, b] contained in one of these intervals,
|
||||
L(J) is contained in [L(a), L(b)].
|
||||
|
||||
It is also true that for any such J, [L(a), L(b)] is contained in the
|
||||
union of J and L(J). This does not follow from L being non-decreasing on
|
||||
these intervals. It follows from the nature of the L on each interval.
|
||||
On each interval, L has one of the following forms:
|
||||
|
||||
(1) L(ch) = constant (LowercaseSet)
|
||||
(2) L(ch) = ch + offset (LowercaseAdd)
|
||||
(3) L(ch) = ch | 1 (LowercaseBor)
|
||||
(4) L(ch) = ch + (ch & 1) (LowercaseBad)
|
||||
|
||||
It is easy to verify that for any of these forms [L(a), L(b)] is
|
||||
contained in the union of [a, b] and L([a, b]).
|
||||
***************************************************************************/
|
||||
|
||||
// Lowercase-mapping operation codes used by lcTable rows (see the large
// commentary block above for why each per-interval form is safe).
const (
	LowercaseSet = 0 // Set to arg.
	LowercaseAdd = 1 // Add arg.
	LowercaseBor = 2 // Bitwise or with 1.
	LowercaseBad = 3 // Bitwise and with 1 and add original.
)

// lcMap maps the interval [chMin, chMax] to its lowercase image via op
// (one of the Lowercase* codes above) and its operand data.
type lcMap struct {
	chMin, chMax rune
	op, data     int32
}
|
||||
|
||||
var lcTable = []lcMap{
|
||||
lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
|
||||
lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
|
||||
lcMap{'\u0100', '\u012E', LowercaseBor, 0},
|
||||
lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
|
||||
lcMap{'\u0132', '\u0136', LowercaseBor, 0},
|
||||
lcMap{'\u0139', '\u0147', LowercaseBad, 0},
|
||||
lcMap{'\u014A', '\u0176', LowercaseBor, 0},
|
||||
lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
|
||||
lcMap{'\u0179', '\u017D', LowercaseBad, 0},
|
||||
lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
|
||||
lcMap{'\u0182', '\u0184', LowercaseBor, 0},
|
||||
lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
|
||||
lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
|
||||
lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
|
||||
lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
|
||||
lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
|
||||
lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
|
||||
lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
|
||||
lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
|
||||
lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
|
||||
lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
|
||||
lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
|
||||
lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
|
||||
lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
|
||||
lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
|
||||
lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
|
||||
lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
|
||||
lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
|
||||
lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
|
||||
lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
|
||||
lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
|
||||
lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
|
||||
lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
|
||||
lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
|
||||
lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
|
||||
lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
|
||||
lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
|
||||
lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
|
||||
lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
|
||||
lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
|
||||
lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
|
||||
lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
|
||||
lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
|
||||
lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
|
||||
lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
|
||||
lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
|
||||
lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
|
||||
lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
|
||||
lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
|
||||
lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
|
||||
lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
|
||||
lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
|
||||
lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
|
||||
lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
|
||||
lcMap{'\u0460', '\u0480', LowercaseBor, 0},
|
||||
lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
|
||||
lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
|
||||
lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
|
||||
lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
|
||||
lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
|
||||
lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
|
||||
lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
|
||||
lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
|
||||
lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
|
||||
lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
|
||||
lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
|
||||
lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
|
||||
lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
|
||||
lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
|
||||
lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
|
||||
lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
|
||||
lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
|
||||
lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
|
||||
lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
|
||||
lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
|
||||
lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
|
||||
lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
|
||||
lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
|
||||
lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
|
||||
lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
|
||||
lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
|
||||
lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
|
||||
lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
|
||||
lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
|
||||
lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
|
||||
lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
|
||||
lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
|
||||
}
|
||||
|
||||
func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
|
||||
var i, iMax, iMid int
|
||||
var chMinT, chMaxT rune
|
||||
var lc lcMap
|
||||
|
||||
for i, iMax = 0, len(lcTable); i < iMax; {
|
||||
iMid = (i + iMax) / 2
|
||||
if lcTable[iMid].chMax < chMin {
|
||||
i = iMid + 1
|
||||
} else {
|
||||
iMax = iMid
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < len(lcTable); i++ {
|
||||
lc = lcTable[i]
|
||||
if lc.chMin > chMax {
|
||||
return
|
||||
}
|
||||
chMinT = lc.chMin
|
||||
if chMinT < chMin {
|
||||
chMinT = chMin
|
||||
}
|
||||
|
||||
chMaxT = lc.chMax
|
||||
if chMaxT > chMax {
|
||||
chMaxT = chMax
|
||||
}
|
||||
|
||||
switch lc.op {
|
||||
case LowercaseSet:
|
||||
chMinT = rune(lc.data)
|
||||
chMaxT = rune(lc.data)
|
||||
break
|
||||
case LowercaseAdd:
|
||||
chMinT += lc.data
|
||||
chMaxT += lc.data
|
||||
break
|
||||
case LowercaseBor:
|
||||
chMinT |= 1
|
||||
chMaxT |= 1
|
||||
break
|
||||
case LowercaseBad:
|
||||
chMinT += (chMinT & 1)
|
||||
chMaxT += (chMaxT & 1)
|
||||
break
|
||||
}
|
||||
|
||||
if chMinT < chMin || chMaxT > chMax {
|
||||
c.addRange(chMinT, chMaxT)
|
||||
}
|
||||
}
|
||||
}
|
274
vendor/github.com/dlclark/regexp2/syntax/code.go
generated
vendored
Normal file
274
vendor/github.com/dlclark/regexp2/syntax/code.go
generated
vendored
Normal file
|
@ -0,0 +1,274 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
|
||||
|
||||
// File provides operator constants for use by the Builder and the Machine.
|
||||
|
||||
// Implementation notes:
|
||||
//
|
||||
// Regexps are built into RegexCodes, which contain an operation array,
|
||||
// a string table, and some constants.
|
||||
//
|
||||
// Each operation is one of the codes below, followed by the integer
|
||||
// operands specified for each op.
|
||||
//
|
||||
// Strings and sets are indices into a string table.
|
||||
|
||||
// InstOp is a regex VM opcode; the const block below lists the operations
// and their operands (per the file-header commentary above).
type InstOp int

const (
	// lef/back operands description

	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
	Notonerep        = 1 // lef,back char,min,max    .{n}
	Setrep           = 2 // lef,back set,min,max     [\d]{n}

	Oneloop    = 3 // lef,back char,min,max    a {,n}
	Notoneloop = 4 // lef,back char,min,max    .{,n}
	Setloop    = 5 // lef,back set,min,max     [\d]{,n}

	Onelazy    = 6 // lef,back char,min,max    a {,n}?
	Notonelazy = 7 // lef,back char,min,max    .{,n}?
	Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?

	One    = 9  // lef      char            a
	Notone = 10 // lef      char            [^a]
	Set    = 11 // lef      set             [a-z\s]  \w \s \d

	Multi = 12 // lef      string          abcd
	Ref   = 13 // lef      group           \#

	Bol         = 14 //                          ^
	Eol         = 15 //                          $
	Boundary    = 16 //                          \b
	Nonboundary = 17 //                          \B
	Beginning   = 18 //                          \A
	Start       = 19 //                          \G
	EndZ        = 20 //                          \Z
	End         = 21 //                          \Z (NOTE(review): comment duplicates EndZ; presumably \z — confirm)

	Nothing = 22 //                          Reject!

	// Primitive control structures

	Lazybranch      = 23 // back     jump            straight first
	Branchmark      = 24 // back     jump            branch first for loop
	Lazybranchmark  = 25 // back     jump            straight first for loop
	Nullcount       = 26 // back     val             set counter, null mark
	Setcount        = 27 // back     val             set counter, make mark
	Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
	Lazybranchcount = 29 // back     jump,limit      same, but straight first
	Nullmark        = 30 // back                     save position
	Setmark         = 31 // back                     save position
	Capturemark     = 32 // back     group           define group
	Getmark         = 33 // back                     recall position
	Setjump         = 34 // back                     save backtrack state
	Backjump        = 35 //                          zap back to saved state
	Forejump        = 36 //                          zap backtracking state
	Testref         = 37 //                          backtrack if ref undefined
	Goto            = 38 //          jump            just go

	Prune = 39 //                          prune it baby
	Stop  = 40 //                          done!

	ECMABoundary    = 41 //                          \b
	NonECMABoundary = 42 //                          \B

	// Modifiers for alternate modes

	Mask  = 63  // Mask to get unmodified ordinary operator
	Rtl   = 64  // bit to indicate that we're reverse scanning.
	Back  = 128 // bit to indicate that we're backtracking.
	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
	Ci    = 512 // bit to indicate that we're case-insensitive.
)
|
||||
|
||||
// Code is a compiled regular expression program: a flat instruction
// stream in Codes plus the side tables (strings, character sets) that
// the instruction operands index into.
type Code struct {
	Codes      []int       // the code (opcodes and operands interleaved)
	Strings    [][]rune    // string table, indexed by Multi operands
	Sets       []*CharSet  // character set table, indexed by Set* operands
	TrackCount int         // how many instructions use backtracking
	Caps       map[int]int // mapping of user group numbers -> impl group slots
	Capsize    int         // number of impl group slots
	FcPrefix   *Prefix     // the set of candidate first characters (may be null)
	BmPrefix   *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be null)
	Anchors    AnchorLoc   // the set of zero-length start anchors (RegexFCD.Bol, etc)
	RightToLeft bool       // true if right to left
}
|
||||
|
||||
func opcodeBacktracks(op InstOp) bool {
|
||||
op &= Mask
|
||||
|
||||
switch op {
|
||||
case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
|
||||
Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
|
||||
Forejump, Goto:
|
||||
return true
|
||||
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// opcodeSize returns the number of int slots (opcode plus operands)
// that the instruction occupies in Code.Codes. It panics on an
// unknown opcode, which indicates a compiler bug.
func opcodeSize(op InstOp) int {
	op &= Mask

	switch op {
	// zero-operand instructions (anchors, marks, control)
	case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
		End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
		return 1

	// one-operand instructions (a char, set index, string index, group or address)
	case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
		Prune, Set:
		return 2

	// two-operand instructions (char/set plus a repeat count or limit)
	case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
		Setlazy, Setrep, Setloop:
		return 3

	default:
		panic(fmt.Errorf("Unexpected op code: %v", op))
	}
}
|
||||
|
||||
// codeStr holds the display name of each opcode for dumps. It is
// indexed by InstOp&Mask, so the order here must exactly match the
// numeric values of the opcode constants.
var codeStr = []string{
	"Onerep", "Notonerep", "Setrep",
	"Oneloop", "Notoneloop", "Setloop",
	"Onelazy", "Notonelazy", "Setlazy",
	"One", "Notone", "Set",
	"Multi", "Ref",
	"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
	"Nothing",
	"Lazybranch", "Branchmark", "Lazybranchmark",
	"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
	"Nullmark", "Setmark", "Capturemark", "Getmark",
	"Setjump", "Backjump", "Forejump", "Testref", "Goto",
	"Prune", "Stop",
	"ECMABoundary", "NonECMABoundary",
}
|
||||
|
||||
func operatorDescription(op InstOp) string {
|
||||
desc := codeStr[op&Mask]
|
||||
if (op & Ci) != 0 {
|
||||
desc += "-Ci"
|
||||
}
|
||||
if (op & Rtl) != 0 {
|
||||
desc += "-Rtl"
|
||||
}
|
||||
if (op & Back) != 0 {
|
||||
desc += "-Back"
|
||||
}
|
||||
if (op & Back2) != 0 {
|
||||
desc += "-Back2"
|
||||
}
|
||||
|
||||
return desc
|
||||
}
|
||||
|
||||
// OpcodeDescription returns a human-readable string for the single
// instruction starting at the given offset in c.Codes: the offset, a
// "*" marker when the instruction backtracks, the operator name with
// modifier suffixes, and its decoded operands in parentheses.
func (c *Code) OpcodeDescription(offset int) string {
	buf := &bytes.Buffer{}

	op := InstOp(c.Codes[offset])
	fmt.Fprintf(buf, "%06d ", offset)

	// "*" flags instructions that record backtracking state.
	if opcodeBacktracks(op & Mask) {
		buf.WriteString("*")
	} else {
		buf.WriteString(" ")
	}
	buf.WriteString(operatorDescription(op))
	buf.WriteString("(")
	op &= Mask

	// First operand (Codes[offset+1]): meaning depends on the opcode.
	switch op {
	case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
		buf.WriteString("Ch = ")
		buf.WriteString(CharDescription(rune(c.Codes[offset+1])))

	case Set, Setrep, Setloop, Setlazy:
		buf.WriteString("Set = ")
		buf.WriteString(c.Sets[c.Codes[offset+1]].String())

	case Multi:
		fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))

	case Ref, Testref:
		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])

	case Capturemark:
		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
		// -1 means no balancing (un-capture) group is attached.
		if c.Codes[offset+2] != -1 {
			fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
		}

	case Nullcount, Setcount:
		fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])

	case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
		fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
	}

	// Second operand (Codes[offset+2]): repeat counts and branch limits,
	// where math.MaxInt32 encodes "unbounded".
	switch op {
	case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
		buf.WriteString(", Rep = ")
		if c.Codes[offset+2] == math.MaxInt32 {
			buf.WriteString("inf")
		} else {
			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
		}

	case Branchcount, Lazybranchcount:
		buf.WriteString(", Limit = ")
		if c.Codes[offset+2] == math.MaxInt32 {
			buf.WriteString("inf")
		} else {
			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
		}

	}

	buf.WriteString(")")

	return buf.String()
}
|
||||
|
||||
// Dump returns a multi-line, human-readable listing of the whole
// program: direction, first-char/prefix optimizations, anchors, the
// Boyer-Moore tables (if any), then one line per instruction.
func (c *Code) Dump() string {
	buf := &bytes.Buffer{}

	if c.RightToLeft {
		fmt.Fprintln(buf, "Direction: right-to-left")
	} else {
		fmt.Fprintln(buf, "Direction: left-to-right")
	}
	if c.FcPrefix == nil {
		fmt.Fprintln(buf, "Firstchars: n/a")
	} else {
		fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
	}

	if c.BmPrefix == nil {
		fmt.Fprintln(buf, "Prefix: n/a")
	} else {
		fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
	}

	fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
	fmt.Fprintln(buf)

	if c.BmPrefix != nil {
		fmt.Fprintln(buf, "BoyerMoore:")
		fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
	}
	// Step by each instruction's size so operands are not misread
	// as opcodes.
	for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
		fmt.Fprintln(buf, c.OpcodeDescription(i))
	}

	return buf.String()
}
|
94
vendor/github.com/dlclark/regexp2/syntax/escape.go
generated
vendored
Normal file
94
vendor/github.com/dlclark/regexp2/syntax/escape.go
generated
vendored
Normal file
|
@ -0,0 +1,94 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Escape returns the input with every regex metacharacter and
// non-printable rune backslash-escaped, so that the result matches
// the original text literally when used as a pattern.
func Escape(input string) string {
	b := &bytes.Buffer{}
	for _, r := range input {
		escape(b, r, false)
	}
	return b.String()
}

// meta is the set of printable runes that must be escaped to be taken
// literally in a pattern.
const meta = `\.+*?()|[]{}^$# `

// escape writes r to b, backslash-escaping it where needed. When force
// is true a printable rune is escaped even if it is not a
// metacharacter.
func escape(b *bytes.Buffer, r rune, force bool) {
	if unicode.IsPrint(r) {
		if strings.IndexRune(meta, r) >= 0 || force {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	switch r {
	case '\a':
		b.WriteString(`\a`)
	case '\f':
		b.WriteString(`\f`)
	case '\n':
		b.WriteString(`\n`)
	case '\r':
		b.WriteString(`\r`)
	case '\t':
		b.WriteString(`\t`)
	case '\v':
		b.WriteString(`\v`)
	default:
		if r < 0x100 {
			// Two-digit `\xhh` form, zero-padded.
			b.WriteString(`\x`)
			s := strconv.FormatInt(int64(r), 16)
			if len(s) == 1 {
				b.WriteRune('0')
			}
			b.WriteString(s)
			break
		}
		// `\uhhhh` escapes are read back as exactly four hex digits,
		// so pad to four; an unpadded value such as `\u378` would not
		// round-trip. (Runes above 0xFFFF still cannot be represented
		// in this form; they produce five digits as before.)
		b.WriteString(`\u`)
		s := strconv.FormatInt(int64(r), 16)
		for len(s) < 4 {
			s = "0" + s
		}
		b.WriteString(s)
	}
}
|
||||
|
||||
// Unescape reverses Escape: it decodes backslash escape sequences in
// input and returns the literal text. A trailing lone backslash yields
// ErrIllegalEndEscape; malformed escapes return the parser's error.
func Unescape(input string) (string, error) {
	idx := strings.IndexRune(input, '\\')
	// no slashes means no unescape needed
	if idx == -1 {
		return input, nil
	}

	// Everything before the first backslash is copied through verbatim.
	buf := bytes.NewBufferString(input[:idx])
	// get the runes for the rest of the string -- we're going full parser scan on this

	// The package's pattern parser does the actual escape decoding
	// (scanCharEscape); we drive it across the remainder of the input.
	p := parser{}
	p.setPattern(input[idx+1:])
	for {
		// Invariant at the top of the loop: the parser is positioned
		// just after a backslash.
		if p.rightMost() {
			return "", p.getErr(ErrIllegalEndEscape)
		}
		r, err := p.scanCharEscape()
		if err != nil {
			return "", err
		}
		buf.WriteRune(r)
		// are we done?
		if p.rightMost() {
			return buf.String(), nil
		}

		// Copy literal runes until the next backslash (or end of input).
		r = p.moveRightGetChar()
		for r != '\\' {
			buf.WriteRune(r)
			if p.rightMost() {
				// we're done, no more slashes
				return buf.String(), nil
			}
			// keep scanning until we get another slash
			r = p.moveRightGetChar()
		}
	}
}
|
20
vendor/github.com/dlclark/regexp2/syntax/fuzz.go
generated
vendored
Normal file
20
vendor/github.com/dlclark/regexp2/syntax/fuzz.go
generated
vendored
Normal file
|
@ -0,0 +1,20 @@
|
|||
// +build gofuzz
|
||||
|
||||
package syntax
|
||||
|
||||
// Fuzz is the input point for go-fuzz. It parses the input as a
// pattern and, when parsing succeeds, compiles the tree; a compile
// failure on a successfully parsed tree is treated as a bug (panic).
// Returns 1 for interesting inputs (valid patterns), 0 otherwise.
func Fuzz(data []byte) int {
	sdata := string(data)
	tree, err := Parse(sdata, RegexOptions(0))
	if err != nil {
		// invalid pattern: not interesting to the fuzzer
		return 0
	}

	// translate it to code
	_, err = Write(tree)
	if err != nil {
		panic(err)
	}

	return 1
}
|
2202
vendor/github.com/dlclark/regexp2/syntax/parser.go
generated
vendored
Normal file
2202
vendor/github.com/dlclark/regexp2/syntax/parser.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
896
vendor/github.com/dlclark/regexp2/syntax/prefix.go
generated
vendored
Normal file
896
vendor/github.com/dlclark/regexp2/syntax/prefix.go
generated
vendored
Normal file
|
@ -0,0 +1,896 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Prefix describes what a match must start with: either a literal
// leading string (PrefixStr) or a set of possible first characters
// (PrefixSet), optionally matched case-insensitively.
type Prefix struct {
	PrefixStr       []rune
	PrefixSet       CharSet
	CaseInsensitive bool
}
|
||||
|
||||
// getFirstCharsPrefix takes a RegexTree and computes the set of chars
// that can start a match. It returns nil when no useful set exists:
// the computation failed, the expression can match empty, or the set
// came out empty.
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
	s := regexFcd{
		fcStack:  make([]regexFc, 32),
		intStack: make([]int, 32),
	}
	fc := s.regexFCFromRegexTree(tree)

	if fc == nil || fc.nullable || fc.cc.IsEmpty() {
		return nil
	}
	fcSet := fc.getFirstChars()
	return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
}
|
||||
|
||||
// regexFcd is the working state for the first-char computation: two
// explicit stacks (so the tree walk needs no recursion) plus flags
// that steer the walk.
type regexFcd struct {
	intStack        []int     // child-index stack for the iterative tree walk
	intDepth        int       // number of live entries in intStack
	fcStack         []regexFc // partial first-char results, one per open node
	fcDepth         int       // number of live entries in fcStack
	skipAllChildren bool      // don't process any more children at the current level
	skipchild       bool      // don't process the current child.
	failed          bool      // set when a first-char set could not be computed
}
|
||||
|
||||
/*
 * The main FC computation. It does a shortcutted depth-first walk
 * through the tree and calls CalculateFC to emits code before
 * and after each child of an interior node, and at each leaf.
 *
 * Returns nil when the walk fails (s.failed) or produces no result.
 */
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
	curNode := tree.root
	curChild := 0

	for {
		if len(curNode.children) == 0 {
			// This is a leaf node
			s.calculateFC(curNode.t, curNode, 0)
		} else if curChild < len(curNode.children) && !s.skipAllChildren {
			// This is an interior node, and we have more children to analyze
			s.calculateFC(curNode.t|beforeChild, curNode, curChild)

			if !s.skipchild {
				curNode = curNode.children[curChild]
				// this stack is how we get a depth first walk of the tree.
				s.pushInt(curChild)
				curChild = 0
			} else {
				curChild++
				s.skipchild = false
			}
			continue
		}

		// This is an interior node where we've finished analyzing all the children, or
		// the end of a leaf node.
		s.skipAllChildren = false

		if s.intIsEmpty() {
			break
		}

		// Pop back up to the parent and fire its afterChild hook.
		curChild = s.popInt()
		curNode = curNode.next

		s.calculateFC(curNode.t|afterChild, curNode, curChild)
		if s.failed {
			return nil
		}

		curChild++
	}

	if s.fcIsEmpty() {
		return nil
	}

	return s.popFC()
}
|
||||
|
||||
// To avoid recursion, we use a simple integer stack.
// This is the push. The stack grows (doubling) when full.
func (s *regexFcd) pushInt(I int) {
	if s.intDepth >= len(s.intStack) {
		expanded := make([]int, s.intDepth*2)
		copy(expanded, s.intStack)
		s.intStack = expanded
	}

	s.intStack[s.intDepth] = I
	s.intDepth++
}

// True if the stack is empty.
func (s *regexFcd) intIsEmpty() bool {
	return s.intDepth == 0
}

// This is the pop.
func (s *regexFcd) popInt() int {
	s.intDepth--
	return s.intStack[s.intDepth]
}

// We also use a stack of RegexFC objects.
// This is the push. The stack grows (doubling) when full.
func (s *regexFcd) pushFC(fc regexFc) {
	if s.fcDepth >= len(s.fcStack) {
		expanded := make([]regexFc, s.fcDepth*2)
		copy(expanded, s.fcStack)
		s.fcStack = expanded
	}

	s.fcStack[s.fcDepth] = fc
	s.fcDepth++
}

// True if the stack is empty.
func (s *regexFcd) fcIsEmpty() bool {
	return s.fcDepth == 0
}

// This is the pop.
func (s *regexFcd) popFC() *regexFc {
	s.fcDepth--
	return &s.fcStack[s.fcDepth]
}

// This is the top (a pointer into the stack, so callers may mutate it in place).
func (s *regexFcd) topFC() *regexFc {
	return &s.fcStack[s.fcDepth-1]
}

// Called in Beforechild to prevent further processing of the current child
func (s *regexFcd) skipChild() {
	s.skipchild = true
}
|
||||
|
||||
// FC computation and shortcut cases for each node type.
// Called before/after each child of interior nodes and once per leaf;
// it pushes, pops and merges regexFc entries on the fc stack. On an
// unmergeable set it records failure in s.failed instead of returning
// an error. (The explicit breaks are no-ops kept from the C# origin.)
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
	//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
	ci := false
	rtl := false

	// Only leaf-ish node types (<= ntRef) carry case/direction options
	// that matter here.
	if nt <= ntRef {
		if (node.options & IgnoreCase) != 0 {
			ci = true
		}
		if (node.options & RightToLeft) != 0 {
			rtl = true
		}
	}

	switch nt {
	case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
		break

	case ntTestgroup | beforeChild:
		// Child 0 is the lookahead condition; it never consumes input.
		if CurIndex == 0 {
			s.skipChild()
		}
		break

	case ntEmpty:
		s.pushFC(regexFc{nullable: true})
		break

	case ntConcatenate | afterChild:
		// Concatenation: merge this child into the running result;
		// once the prefix can no longer match empty, later children
		// cannot contribute first chars.
		if CurIndex != 0 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, true)
		}

		fc := s.topFC()
		if !fc.nullable {
			s.skipAllChildren = true
		}
		break

	case ntTestgroup | afterChild:
		// Union the yes/no branches (children 1 and 2).
		if CurIndex > 1 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, false)
		}
		break

	case ntAlternate | afterChild, ntTestref | afterChild:
		// Alternation: union each branch into the running result.
		if CurIndex != 0 {
			child := s.popFC()
			cumul := s.topFC()

			s.failed = !cumul.addFC(*child, false)
		}
		break

	case ntLoop | afterChild, ntLazyloop | afterChild:
		// A loop with min count 0 can match empty.
		if node.m == 0 {
			fc := s.topFC()
			fc.nullable = true
		}
		break

	case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
		break

	case ntRequire | beforeChild, ntPrevent | beforeChild:
		// Lookarounds consume nothing: skip the body, contribute an
		// empty-matching placeholder.
		s.skipChild()
		s.pushFC(regexFc{nullable: true})
		break

	case ntRequire | afterChild, ntPrevent | afterChild:
		break

	case ntOne, ntNotone:
		s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
		break

	case ntOneloop, ntOnelazy:
		s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
		break

	case ntNotoneloop, ntNotonelazy:
		s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
		break

	case ntMulti:
		// First char of the literal; last char when scanning right-to-left.
		if len(node.str) == 0 {
			s.pushFC(regexFc{nullable: true})
		} else if !rtl {
			s.pushFC(newRegexFc(node.str[0], false, false, ci))
		} else {
			s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
		}
		break

	case ntSet:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
		break

	case ntSetloop, ntSetlazy:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
		break

	case ntRef:
		// A backreference could start with anything.
		s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
		break

	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
		// Zero-width assertions consume no characters.
		s.pushFC(regexFc{nullable: true})
		break

	default:
		panic(fmt.Sprintf("unexpected op code: %v", nt))
	}
}
|
||||
|
||||
// regexFc is a partial first-char result: the candidate character
// class, whether the corresponding subexpression can match empty, and
// whether matching is case-insensitive.
type regexFc struct {
	cc              CharSet
	nullable        bool
	caseInsensitive bool
}
|
||||
|
||||
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
|
||||
r := regexFc{
|
||||
caseInsensitive: caseInsensitive,
|
||||
nullable: nullable,
|
||||
}
|
||||
if not {
|
||||
if ch > 0 {
|
||||
r.cc.addRange('\x00', ch-1)
|
||||
}
|
||||
if ch < 0xFFFF {
|
||||
r.cc.addRange(ch+1, utf8.MaxRune)
|
||||
}
|
||||
} else {
|
||||
r.cc.addRange(ch, ch)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// getFirstChars returns the candidate first-char set, folding in
// lowercase variants first when matching case-insensitively.
// Note: it mutates r.cc in place before returning it.
func (r *regexFc) getFirstChars() CharSet {
	if r.caseInsensitive {
		r.cc.addLowercase()
	}

	return r.cc
}
|
||||
|
||||
// addFC merges another partial result into r. With concatenate true the
// two are sequential (fc only contributes while r can still match
// empty); otherwise they are alternatives (union). Returns false when
// either char set cannot be merged, which aborts the computation.
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
	if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
		return false
	}

	if concatenate {
		// Sequential: if r cannot match empty, fc's first chars are
		// unreachable — keep r unchanged (but report success).
		if !r.nullable {
			return true
		}

		if !fc.nullable {
			r.nullable = false
		}
	} else {
		// Alternative: nullable if either side is.
		if fc.nullable {
			r.nullable = true
		}
	}

	r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
	r.cc.addSet(fc.cc)

	return true
}
|
||||
|
||||
// This is a related computation: it takes a RegexTree and computes the
// leading substring if it sees one. It's quite trivial and gives up easily.
// It walks down the leftmost spine of the tree (skipping zero-width
// assertions) and returns nil as soon as anything non-literal appears.
func getPrefix(tree *RegexTree) *Prefix {
	var concatNode *regexNode
	nextChild := 0

	curNode := tree.root

	for {
		switch curNode.t {
		case ntConcatenate:
			// Remember the concat so we can step through its children.
			if len(curNode.children) > 0 {
				concatNode = curNode
				nextChild = 0
			}

		case ntGreedy, ntCapture:
			// Transparent wrappers: descend into the only child.
			curNode = curNode.children[0]
			concatNode = nil
			continue

		case ntOneloop, ntOnelazy:
			// a{m,n} guarantees at least m leading copies of a.
			if curNode.m > 0 {
				return &Prefix{
					PrefixStr:       repeat(curNode.ch, curNode.m),
					CaseInsensitive: (curNode.options & IgnoreCase) != 0,
				}
			}
			return nil

		case ntOne:
			return &Prefix{
				PrefixStr:       []rune{curNode.ch},
				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
			}

		case ntMulti:
			return &Prefix{
				PrefixStr:       curNode.str,
				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
			}

		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
			ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
			// Zero-width: contributes nothing, keep scanning.

		default:
			return nil
		}

		if concatNode == nil || nextChild >= len(concatNode.children) {
			return nil
		}

		curNode = concatNode.children[nextChild]
		nextChild++
	}
}
|
||||
|
||||
// repeat the rune r, c times... up to the max of MaxPrefixSize
|
||||
func repeat(r rune, c int) []rune {
|
||||
if c > MaxPrefixSize {
|
||||
c = MaxPrefixSize
|
||||
}
|
||||
|
||||
ret := make([]rune, c)
|
||||
|
||||
// binary growth using copy for speed
|
||||
ret[0] = r
|
||||
bp := 1
|
||||
for bp < len(ret) {
|
||||
copy(ret[bp:], ret[:bp])
|
||||
bp *= 2
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// BmPrefix precomputes the Boyer-Moore
// tables for fast string scanning. These tables allow
// you to scan for the first occurrence of a string within
// a large body of text without examining every character.
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined.
type BmPrefix struct {
	positive        []int   // good-suffix shift table, indexed by pattern position
	negativeASCII   []int   // bad-character shifts for runes < 128
	negativeUnicode [][]int // bad-character shifts for runes <= 0xffff, sharded by high byte
	pattern         []rune
	lowASCII        rune // lowest ASCII rune present in the pattern
	highASCII       rune // highest ASCII rune present in the pattern
	rightToLeft     bool
	caseInsensitive bool
}
|
||||
|
||||
// newBmPrefix builds the Boyer-Moore tables for pattern. The same code
// handles both scan directions by parameterizing the walk (beforefirst,
// last, bump). Returns nil when the pattern contains a rune above
// 0xffff, which this implementation does not support.
// NOTE: the pattern slice is mutated in place when caseInsensitive.
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {

	b := &BmPrefix{
		rightToLeft:     rightToLeft,
		caseInsensitive: caseInsensitive,
		pattern:         pattern,
	}

	if caseInsensitive {
		for i := 0; i < len(b.pattern); i++ {
			// We do the ToLower character by character for consistency. With surrogate chars, doing
			// a ToLower on the entire string could actually change the surrogate pair. This is more correct
			// linguistically, but since Regex doesn't support surrogates, it's more important to be
			// consistent.

			b.pattern[i] = unicode.ToLower(b.pattern[i])
		}
	}

	var beforefirst, last, bump int
	var scan, match int

	if !rightToLeft {
		beforefirst = -1
		last = len(b.pattern) - 1
		bump = 1
	} else {
		beforefirst = len(b.pattern)
		last = 0
		bump = -1
	}

	// PART I - the good-suffix shift table
	//
	// compute the positive requirement:
	// if char "i" is the first one from the right that doesn't match,
	// then we know the matcher can advance by _positive[i].
	//
	// This algorithm is a simplified variant of the standard
	// Boyer-Moore good suffix calculation.

	b.positive = make([]int, len(b.pattern))

	examine := last
	ch := b.pattern[examine]
	b.positive[examine] = bump
	examine -= bump

Outerloop:
	for {
		// find an internal char (examine) that matches the tail

		for {
			if examine == beforefirst {
				break Outerloop
			}
			if b.pattern[examine] == ch {
				break
			}
			examine -= bump
		}

		match = last
		scan = examine

		// find the length of the match
		for {
			if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
				// at the end of the match, note the difference in _positive
				// this is not the length of the match, but the distance from the internal match
				// to the tail suffix.
				if b.positive[match] == 0 {
					b.positive[match] = match - scan
				}

				// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));

				break
			}

			scan -= bump
			match -= bump
		}

		examine -= bump
	}

	match = last - bump

	// scan for the chars for which there are no shifts that yield a different candidate

	// The inside of the if statement used to say
	// "_positive[match] = last - beforefirst;"
	// This is slightly less aggressive in how much we skip, but at worst it
	// should mean a little more work rather than skipping a potential match.
	for match != beforefirst {
		if b.positive[match] == 0 {
			b.positive[match] = bump
		}

		match -= bump
	}

	// PART II - the bad-character shift table
	//
	// compute the negative requirement:
	// if char "ch" is the reject character when testing position "i",
	// we can slide up by _negative[ch];
	// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
	//
	// the lookup table is divided into ASCII and Unicode portions;
	// only those parts of the Unicode 16-bit code set that actually
	// appear in the string are in the table. (Maximum size with
	// Unicode is 65K; ASCII only case is 512 bytes.)

	b.negativeASCII = make([]int, 128)

	// last - beforefirst == pattern length: the "not in pattern" shift.
	for i := 0; i < len(b.negativeASCII); i++ {
		b.negativeASCII[i] = last - beforefirst
	}

	b.lowASCII = 127
	b.highASCII = 0

	for examine = last; examine != beforefirst; examine -= bump {
		ch = b.pattern[examine]

		switch {
		case ch < 128:
			if b.lowASCII > ch {
				b.lowASCII = ch
			}

			if b.highASCII < ch {
				b.highASCII = ch
			}

			// only record the first (closest-to-tail) occurrence
			if b.negativeASCII[ch] == last-beforefirst {
				b.negativeASCII[ch] = last - examine
			}
		case ch <= 0xffff:
			i, j := ch>>8, ch&0xFF

			if b.negativeUnicode == nil {
				b.negativeUnicode = make([][]int, 256)
			}

			if b.negativeUnicode[i] == nil {
				newarray := make([]int, 256)

				for k := 0; k < len(newarray); k++ {
					newarray[k] = last - beforefirst
				}

				if i == 0 {
					copy(newarray, b.negativeASCII)
					//TODO: this line needed?
					b.negativeASCII = newarray
				}

				b.negativeUnicode[i] = newarray
			}

			if b.negativeUnicode[i][j] == last-beforefirst {
				b.negativeUnicode[i][j] = last - examine
			}
		default:
			// we can't do the filter because this algo doesn't support
			// unicode chars >0xffff
			return nil
		}
	}

	return b
}
|
||||
|
||||
// String returns the (possibly lowercased) pattern being searched for.
func (b *BmPrefix) String() string {
	return string(b.pattern)
}
|
||||
|
||||
// Dump returns the contents of the filter as a human readable string:
// the pattern, the good-suffix (positive) table, and the entries of
// the ASCII bad-character table that differ from the default shift.
func (b *BmPrefix) Dump(indent string) string {
	buf := &bytes.Buffer{}

	fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
	for i := 0; i < len(b.positive); i++ {
		buf.WriteString(strconv.Itoa(b.positive[i]))
		buf.WriteRune(' ')
	}
	buf.WriteRune('\n')

	if b.negativeASCII != nil {
		buf.WriteString(indent)
		buf.WriteString("Negative table\n")
		for i := 0; i < len(b.negativeASCII); i++ {
			// len(pattern) is the default ("char absent") shift; only
			// print chars that actually occur in the pattern.
			if b.negativeASCII[i] != len(b.pattern) {
				fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
			}
		}
	}

	return buf.String()
}
|
||||
|
||||
// Scan uses the Boyer-Moore algorithm to find the first occurrence
// of the specified string within text, beginning at index, and
// constrained within beglimit and endlimit. Returns the start index
// of the match, or -1 if there is none.
//
// The direction and case-sensitivity of the match is determined
// by the arguments to the RegexBoyerMoore constructor.
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
	var (
		defadv, test, test2         int
		match, startmatch, endmatch int
		bump, advance               int
		chTest                      rune
		unicodeLookup               []int
	)

	// Parameterize for direction: test walks the candidate alignment
	// point, match walks the pattern from its scan-side end.
	if !b.rightToLeft {
		defadv = len(b.pattern)
		startmatch = len(b.pattern) - 1
		endmatch = 0
		test = index + defadv - 1
		bump = 1
	} else {
		defadv = -len(b.pattern)
		startmatch = 0
		endmatch = -defadv - 1
		test = index + defadv
		bump = -1
	}

	chMatch := b.pattern[startmatch]

	for {
		if test >= endlimit || test < beglimit {
			return -1
		}

		chTest = text[test]

		if b.caseInsensitive {
			chTest = unicode.ToLower(chTest)
		}

		if chTest != chMatch {
			// Mismatch on the alignment char: bad-character shift.
			if chTest < 128 {
				advance = b.negativeASCII[chTest]
			} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
				unicodeLookup = b.negativeUnicode[chTest>>8]
				if len(unicodeLookup) > 0 {
					advance = unicodeLookup[chTest&0xFF]
				} else {
					advance = defadv
				}
			} else {
				advance = defadv
			}

			test += advance
		} else { // if (chTest == chMatch)
			// Alignment char matched: verify the rest of the pattern.
			test2 = test
			match = startmatch

			for {
				if match == endmatch {
					// Whole pattern matched; report its start index.
					if b.rightToLeft {
						return test2 + 1
					} else {
						return test2
					}
				}

				match -= bump
				test2 -= bump

				chTest = text[test2]

				if b.caseInsensitive {
					chTest = unicode.ToLower(chTest)
				}

				if chTest != b.pattern[match] {
					// Mid-pattern mismatch: take the larger of the
					// good-suffix and bad-character shifts.
					advance = b.positive[match]
					if (chTest & 0xFF80) == 0 {
						test2 = (match - startmatch) + b.negativeASCII[chTest]
					} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
						unicodeLookup = b.negativeUnicode[chTest>>8]
						if len(unicodeLookup) > 0 {
							test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
						} else {
							test += advance
							break
						}
					} else {
						test += advance
						break
					}

					if b.rightToLeft {
						if test2 < advance {
							advance = test2
						}
					} else if test2 > advance {
						advance = test2
					}

					test += advance
					break
				}
			}
		}
	}
}
|
||||
|
||||
// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
|
||||
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
|
||||
if !b.rightToLeft {
|
||||
if index < beglimit || endlimit-index < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
return b.matchPattern(text, index)
|
||||
} else {
|
||||
if index > endlimit || index-beglimit < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
return b.matchPattern(text, index-len(b.pattern))
|
||||
}
|
||||
}
|
||||
|
||||
func (b *BmPrefix) matchPattern(text []rune, index int) bool {
|
||||
if len(text)-index < len(b.pattern) {
|
||||
return false
|
||||
}
|
||||
|
||||
if b.caseInsensitive {
|
||||
for i := 0; i < len(b.pattern); i++ {
|
||||
//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
|
||||
if unicode.ToLower(text[index+i]) != b.pattern[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
} else {
|
||||
for i := 0; i < len(b.pattern); i++ {
|
||||
if text[index+i] != b.pattern[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// AnchorLoc is a bit set of zero-width anchors found at the start of
// a pattern.
type AnchorLoc int16

// where the regex can be pegged
const (
	AnchorBeginning    AnchorLoc = 0x0001 // \A
	AnchorBol                    = 0x0002 // ^
	AnchorStart                  = 0x0004 // \G
	AnchorEol                    = 0x0008 // $
	AnchorEndZ                   = 0x0010 // \Z
	AnchorEnd                    = 0x0020 // \z-style end
	AnchorBoundary               = 0x0040 // \b
	AnchorECMABoundary           = 0x0080 // \b (ECMA semantics)
)
|
||||
|
||||
// getAnchors walks the leading spine of the tree (the same walk as
// getPrefix) and returns the anchor flag of the first anchor node
// found, or 0 when the pattern is not anchored.
func getAnchors(tree *RegexTree) AnchorLoc {

	var concatNode *regexNode
	nextChild, result := 0, AnchorLoc(0)

	curNode := tree.root

	for {
		switch curNode.t {
		case ntConcatenate:
			// Remember the concat so we can step through its children.
			if len(curNode.children) > 0 {
				concatNode = curNode
				nextChild = 0
			}

		case ntGreedy, ntCapture:
			// Transparent wrappers: descend into the only child.
			curNode = curNode.children[0]
			concatNode = nil
			continue

		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
			ntStart, ntEndZ, ntEnd:
			return result | anchorFromType(curNode.t)

		case ntEmpty, ntRequire, ntPrevent:
			// Zero-width, non-anchor: keep scanning.

		default:
			return result
		}

		if concatNode == nil || nextChild >= len(concatNode.children) {
			return result
		}

		curNode = concatNode.children[nextChild]
		nextChild++
	}
}
|
||||
|
||||
func anchorFromType(t nodeType) AnchorLoc {
|
||||
switch t {
|
||||
case ntBol:
|
||||
return AnchorBol
|
||||
case ntEol:
|
||||
return AnchorEol
|
||||
case ntBoundary:
|
||||
return AnchorBoundary
|
||||
case ntECMABoundary:
|
||||
return AnchorECMABoundary
|
||||
case ntBeginning:
|
||||
return AnchorBeginning
|
||||
case ntStart:
|
||||
return AnchorStart
|
||||
case ntEndZ:
|
||||
return AnchorEndZ
|
||||
case ntEnd:
|
||||
return AnchorEnd
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// anchorDescription returns a human-readable description of the anchors
|
||||
func (anchors AnchorLoc) String() string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
if 0 != (anchors & AnchorBeginning) {
|
||||
buf.WriteString(", Beginning")
|
||||
}
|
||||
if 0 != (anchors & AnchorStart) {
|
||||
buf.WriteString(", Start")
|
||||
}
|
||||
if 0 != (anchors & AnchorBol) {
|
||||
buf.WriteString(", Bol")
|
||||
}
|
||||
if 0 != (anchors & AnchorBoundary) {
|
||||
buf.WriteString(", Boundary")
|
||||
}
|
||||
if 0 != (anchors & AnchorECMABoundary) {
|
||||
buf.WriteString(", ECMABoundary")
|
||||
}
|
||||
if 0 != (anchors & AnchorEol) {
|
||||
buf.WriteString(", Eol")
|
||||
}
|
||||
if 0 != (anchors & AnchorEnd) {
|
||||
buf.WriteString(", End")
|
||||
}
|
||||
if 0 != (anchors & AnchorEndZ) {
|
||||
buf.WriteString(", EndZ")
|
||||
}
|
||||
|
||||
// trim off comma
|
||||
if buf.Len() >= 2 {
|
||||
return buf.String()[2:]
|
||||
}
|
||||
return "None"
|
||||
}
|
87
vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
generated
vendored
Normal file
87
vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
generated
vendored
Normal file
|
@ -0,0 +1,87 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
)
|
||||
|
||||
// ReplacerData is the parsed, reusable form of a replacement pattern:
// the original pattern text, its literal string segments, and the rules
// that interleave literals with capture-group references.
type ReplacerData struct {
	Rep     string   // the original replacement pattern text
	Strings []string // literal text segments referenced by Rules
	Rules   []int    // >= 0: index into Strings; < 0: special code or encoded capture slot
}
|
||||
|
||||
// Negative Rules entries: the four values below stand for portions of the
// match; capture slots are encoded below them as -replaceSpecials-1-slot
// (see NewReplacerData).
const (
	replaceSpecials     = 4 // number of special replacement codes
	replaceLeftPortion  = -1
	replaceRightPortion = -2
	replaceLastGroup    = -3
	replaceWholeString  = -4
)
|
||||
|
||||
// ErrReplacementError is a general error raised while parsing the
// replacement text.
var ErrReplacementError = errors.New("Replacement pattern error.")
|
||||
|
||||
// NewReplacerData will populate a reusable replacer data struct based on the given replacement string
// and the capture group data from a regexp. It parses rep into a flat
// concatenation, then folds adjacent literal runs into Strings entries and
// records capture references as negative Rules codes. It panics with
// ErrReplacementError if the parser yields an unexpected node shape.
func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) {
	p := parser{
		options:  op,
		caps:     caps,
		capsize:  capsize,
		capnames: capnames,
	}
	p.setPattern(rep)
	concat, err := p.scanReplacement()
	if err != nil {
		return nil, err
	}

	// the replacement scanner always yields a concatenation at the top
	if concat.t != ntConcatenate {
		panic(ErrReplacementError)
	}

	sb := &bytes.Buffer{}
	var (
		strings []string
		rules   []int
	)

	for _, child := range concat.children {
		switch child.t {
		case ntMulti:
			// literal run: accumulate into the current string segment
			child.writeStrToBuf(sb)

		case ntOne:
			// single literal character
			sb.WriteRune(child.ch)

		case ntRef:
			// flush any accumulated literal before the group reference
			if sb.Len() > 0 {
				rules = append(rules, len(strings))
				strings = append(strings, sb.String())
				sb.Reset()
			}
			slot := child.m

			// remap sparse capture numbers to dense slots when needed
			if len(caps) > 0 && slot >= 0 {
				slot = caps[slot]
			}

			// encode the slot below the special codes (see replace* consts)
			rules = append(rules, -replaceSpecials-1-slot)

		default:
			panic(ErrReplacementError)
		}
	}

	// flush any trailing literal segment
	if sb.Len() > 0 {
		rules = append(rules, len(strings))
		strings = append(strings, sb.String())
	}

	return &ReplacerData{
		Rep:     rep,
		Strings: strings,
		Rules:   rules,
	}, nil
}
|
654
vendor/github.com/dlclark/regexp2/syntax/tree.go
generated
vendored
Normal file
654
vendor/github.com/dlclark/regexp2/syntax/tree.go
generated
vendored
Normal file
|
@ -0,0 +1,654 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// RegexTree is the parsed form of a regular expression: the node tree
// plus the capture-group bookkeeping gathered during parsing.
type RegexTree struct {
	root       *regexNode
	caps       map[int]int // capture number -> dense slot (used when numbering is sparse)
	capnumlist []int       // sorted list of capture numbers actually used
	captop     int         // 1 + highest capture number seen
	Capnames   map[string]int
	Caplist    []string
	options    RegexOptions
}
|
||||
|
||||
// A regular expression is parsed into this node tree representation.
|
||||
|
||||
// Implementation notes:
|
||||
//
|
||||
// Since the node tree is a temporary data structure only used
|
||||
// during compilation of the regexp to integer codes, it's
|
||||
// designed for clarity and convenience rather than
|
||||
// space efficiency.
|
||||
//
|
||||
// RegexNodes are built into a tree, linked by the n.children list.
|
||||
// Each node also has a n.parent and n.ichild member indicating
|
||||
// its parent and which child # it is in its parent's list.
|
||||
//
|
||||
// RegexNodes come in as many types as there are constructs in
|
||||
// a regular expression, for example, "concatenate", "alternate",
|
||||
// "one", "rept", "group". There are also node types for basic
|
||||
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
|
||||
//
|
||||
// Because perl 5 allows "lookback" groups that scan backwards,
|
||||
// each node also gets a "direction". Normally the value of
|
||||
// boolean n.backward = false.
|
||||
//
|
||||
// During parsing, top-level nodes are also stacked onto a parse
|
||||
// stack (a stack of trees). For this purpose we have a n.next
|
||||
// pointer. [Note that to save a few bytes, we could overload the
|
||||
// n.parent pointer instead.]
|
||||
//
|
||||
// On the parse stack, each tree has a "role" - basically, the
|
||||
// nonterminal in the grammar that the parser has currently
|
||||
// assigned to the tree. That code is stored in n.role.
|
||||
//
|
||||
// Finally, some of the different kinds of nodes have data.
|
||||
// Two integers (for the looping constructs) are stored in
|
||||
// n.operands, and an object (either a string or a set)
|
||||
// is stored in n.data
|
||||
// regexNode is one node of the parse tree; which payload fields are
// meaningful depends on t (see the nodeType constants).
type regexNode struct {
	t        nodeType     // which construct this node represents
	children []*regexNode // child nodes, in match order
	str      []rune       // string payload (ntMulti)
	set      *CharSet     // character-class payload (ntSet and its loop variants)
	ch       rune         // single-character payload (ntOne and its loop variants)
	m        int          // first operand: min repeat count, or capture index
	n        int          // second operand: max repeat count, or capture "unindex"
	options  RegexOptions // option flags in effect for this node
	next     *regexNode   // parent link; also used as the parse-stack link
}
|
||||
|
||||
// nodeType identifies the construct a regexNode represents. The numeric
// values and their relative offsets matter: makeRep and reduceSet shift
// between the One/Notone/Set variants by adding type differences, and the
// leaf values line up with the matcher's instruction opcodes.
type nodeType int32

const (
	// The following are leaves, and correspond to primitive operations

	ntOnerep     nodeType = 0 // lef,back char,min,max    a {n}
	ntNotonerep           = 1 // lef,back char,min,max    .{n}
	ntSetrep              = 2 // lef,back set,min,max     [\d]{n}
	ntOneloop             = 3 // lef,back char,min,max    a {,n}
	ntNotoneloop          = 4 // lef,back char,min,max    .{,n}
	ntSetloop             = 5 // lef,back set,min,max     [\d]{,n}
	ntOnelazy             = 6 // lef,back char,min,max    a {,n}?
	ntNotonelazy          = 7 // lef,back char,min,max    .{,n}?
	ntSetlazy             = 8 // lef,back set,min,max     [\d]{,n}?
	ntOne                 = 9 // lef      char            a
	ntNotone              = 10 // lef      char            [^a]
	ntSet                 = 11 // lef      set             [a-z\s]  \w \s \d
	ntMulti               = 12 // lef      string          abcd
	ntRef                 = 13 // lef      group           \#
	ntBol                 = 14 //                          ^
	ntEol                 = 15 //                          $
	ntBoundary            = 16 //                          \b
	ntNonboundary         = 17 //                          \B
	ntBeginning           = 18 //                          \A
	ntStart               = 19 //                          \G
	ntEndZ                = 20 //                          \Z
	ntEnd                 = 21 //                          \z

	// Interior nodes do not correspond to primitive operations, but
	// control structures compositing other operations

	// Concat and alternate take n children, and can run forward or backwards

	ntNothing     = 22 //          []
	ntEmpty       = 23 //          ()
	ntAlternate   = 24 //          a|b
	ntConcatenate = 25 //          ab
	ntLoop        = 26 // m,x      * + ? {,}
	ntLazyloop    = 27 // m,x      *? +? ?? {,}?
	ntCapture     = 28 // n        ()
	ntGroup       = 29 //          (?:)
	ntRequire     = 30 //          (?=) (?<=)
	ntPrevent     = 31 //          (?!) (?<!)
	ntGreedy      = 32 //          (?>) (?<)
	ntTestref     = 33 //          (?(n) | )
	ntTestgroup   = 34 //          (?(...) | )

	ntECMABoundary    = 41 //                          \b
	ntNonECMABoundary = 42 //                          \B
)
|
||||
|
||||
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
ch: ch,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
str: str,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
set: set,
|
||||
}
|
||||
}
|
||||
|
||||
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
m: m,
|
||||
}
|
||||
}
|
||||
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
|
||||
return ®exNode{
|
||||
t: t,
|
||||
options: opt,
|
||||
m: m,
|
||||
n: n,
|
||||
}
|
||||
}
|
||||
|
||||
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
|
||||
for i := 0; i < len(n.str); i++ {
|
||||
buf.WriteRune(n.str[i])
|
||||
}
|
||||
}
|
||||
|
||||
// addChild appends child (after running its peephole reduction) to n's
// child list and points the child's parent link back at n.
func (n *regexNode) addChild(child *regexNode) {
	reduced := child.reduce()
	n.children = append(n.children, reduced)
	reduced.next = n
}
|
||||
|
||||
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
|
||||
newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
|
||||
n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
|
||||
}
|
||||
|
||||
// removes children including the start but not the end index;
// shifts the tail left in place, reusing the backing array.
func (n *regexNode) removeChildren(startIndex, endIndex int) {
	n.children = append(n.children[:startIndex], n.children[endIndex:]...)
}
|
||||
|
||||
// Pass type as OneLazy or OneLoop
// makeRep converts this One/Notone/Set leaf into the corresponding
// loop/lazy variant with bounds [min, max]. It relies on the fixed
// offsets between variants in the nodeType enumeration: adding
// (t - ntOne) maps One->t, Notone->t+1, Set->t+2.
func (n *regexNode) makeRep(t nodeType, min, max int) {
	n.t += (t - ntOne)
	n.m = min
	n.n = max
}
|
||||
|
||||
func (n *regexNode) reduce() *regexNode {
|
||||
switch n.t {
|
||||
case ntAlternate:
|
||||
return n.reduceAlternation()
|
||||
|
||||
case ntConcatenate:
|
||||
return n.reduceConcatenation()
|
||||
|
||||
case ntLoop, ntLazyloop:
|
||||
return n.reduceRep()
|
||||
|
||||
case ntGroup:
|
||||
return n.reduceGroup()
|
||||
|
||||
case ntSet, ntSetloop:
|
||||
return n.reduceSet()
|
||||
|
||||
default:
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
||||
// Basic optimization. Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
// a|b|c|def|g|h -> [a-c]|def|[gh]
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
//
// The loop compacts children in place: i scans the original list, j is
// the write cursor; merged/removed children leave a gap that is trimmed
// with removeChildren at the end.
func (n *regexNode) reduceAlternation() *regexNode {
	if len(n.children) == 0 {
		return newRegexNode(ntNothing, n.options)
	}

	wasLastSet := false
	lastNodeCannotMerge := false
	var optionsLast RegexOptions
	var i, j int

	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
		at := n.children[i]

		if j < i {
			n.children[j] = at
		}

		// single-iteration loop: "break" below acts as a structured goto
		for {
			if at.t == ntAlternate {
				// flatten a nested alternation: re-parent its children and
				// splice them in right after the current position
				for k := 0; k < len(at.children); k++ {
					at.children[k].next = n
				}
				n.insertChildren(i+1, at.children)

				j--
			} else if at.t == ntSet || at.t == ntOne {
				// Cannot merge sets if L or I options differ, or if either are negated.
				optionsAt := at.options & (RightToLeft | IgnoreCase)

				if at.t == ntSet {
					if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
						wasLastSet = true
						lastNodeCannotMerge = !at.set.IsMergeable()
						optionsLast = optionsAt
						break
					}
				} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
					wasLastSet = true
					lastNodeCannotMerge = false
					optionsLast = optionsAt
					break
				}

				// The last node was a Set or a One, we're a Set or One and our options are the same.
				// Merge the two nodes.
				j--
				prev := n.children[j]

				var prevCharClass *CharSet
				if prev.t == ntOne {
					prevCharClass = &CharSet{}
					prevCharClass.addChar(prev.ch)
				} else {
					prevCharClass = prev.set
				}

				if at.t == ntOne {
					prevCharClass.addChar(at.ch)
				} else {
					prevCharClass.addSet(*at.set)
				}

				prev.t = ntSet
				prev.set = prevCharClass
			} else if at.t == ntNothing {
				// drop impossible branches
				j--
			} else {
				wasLastSet = false
				lastNodeCannotMerge = false
			}
			break
		}
	}

	if j < i {
		n.removeChildren(j, i)
	}

	// collapse to Nothing/single child when possible
	return n.stripEnation(ntNothing)
}
|
||||
|
||||
// Basic optimization. Adjacent strings can be concatenated.
//
// (?:abc)(?:def) -> abcdef
//
// Like reduceAlternation, this compacts children in place with a read
// cursor i and a write cursor j, then trims the leftover gap.
func (n *regexNode) reduceConcatenation() *regexNode {
	// Eliminate empties and concat adjacent strings/chars

	var optionsLast RegexOptions
	var optionsAt RegexOptions
	var i, j int

	if len(n.children) == 0 {
		return newRegexNode(ntEmpty, n.options)
	}

	wasLastString := false

	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
		var at, prev *regexNode

		at = n.children[i]

		if j < i {
			n.children[j] = at
		}

		if at.t == ntConcatenate &&
			((at.options & RightToLeft) == (n.options & RightToLeft)) {
			// flatten a nested concatenation with the same direction
			for k := 0; k < len(at.children); k++ {
				at.children[k].next = n
			}

			//insert at.children at i+1 index in n.children
			n.insertChildren(i+1, at.children)

			j--
		} else if at.t == ntMulti || at.t == ntOne {
			// Cannot merge strings if L or I options differ
			optionsAt = at.options & (RightToLeft | IgnoreCase)

			if !wasLastString || optionsLast != optionsAt {
				wasLastString = true
				optionsLast = optionsAt
				continue
			}

			// merge this string/char into the previous string node
			j--
			prev = n.children[j]

			if prev.t == ntOne {
				prev.t = ntMulti
				prev.str = []rune{prev.ch}
			}

			if (optionsAt & RightToLeft) == 0 {
				if at.t == ntOne {
					prev.str = append(prev.str, at.ch)
				} else {
					prev.str = append(prev.str, at.str...)
				}
			} else {
				// right-to-left: new text goes in FRONT of the previous text
				if at.t == ntOne {
					// insert at the front by expanding our slice, copying the data over, and then setting the value
					prev.str = append(prev.str, 0)
					copy(prev.str[1:], prev.str)
					prev.str[0] = at.ch
				} else {
					//insert at the front...this one we'll make a new slice and copy both into it
					merge := make([]rune, len(prev.str)+len(at.str))
					copy(merge, at.str)
					copy(merge[len(at.str):], prev.str)
					prev.str = merge
				}
			}
		} else if at.t == ntEmpty {
			// drop zero-width empties
			j--
		} else {
			wasLastString = false
		}
	}

	if j < i {
		// remove indices j through i from the children
		n.removeChildren(j, i)
	}

	// collapse to Empty/single child when possible
	return n.stripEnation(ntEmpty)
}
|
||||
|
||||
// Nested repeaters just get multiplied with each other if they're not
// too lumpy
//
// Descends through same-flavor nested loops, multiplying bounds into the
// innermost repeatable child with saturation at MaxInt32.
func (n *regexNode) reduceRep() *regexNode {

	u := n
	t := n.t
	min := n.m
	max := n.n

	for {
		if len(u.children) == 0 {
			break
		}

		child := u.children[0]

		// multiply reps of the same type only
		if child.t != t {
			childType := child.t

			// a primitive loop leaf may still combine with a matching Loop/Lazyloop wrapper
			if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
				childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
				break
			}
		}

		// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
		// [but things like (a {2,})+ are not too lumpy...]
		if u.m == 0 && child.m > 1 || child.n < child.m*2 {
			break
		}

		u = child
		// multiply bounds, saturating at MaxInt32 to avoid overflow
		if u.m > 0 {
			if (math.MaxInt32-1)/u.m < min {
				u.m = math.MaxInt32
			} else {
				u.m = u.m * min
			}
		}
		if u.n > 0 {
			if (math.MaxInt32-1)/u.n < max {
				u.n = math.MaxInt32
			} else {
				u.n = u.n * max
			}
		}
	}

	if math.MaxInt32 == min {
		return newRegexNode(ntNothing, n.options)
	}
	return u

}
|
||||
|
||||
// Simple optimization. If a concatenation or alternation has only
|
||||
// one child strip out the intermediate node. If it has zero children,
|
||||
// turn it into an empty.
|
||||
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
|
||||
switch len(n.children) {
|
||||
case 0:
|
||||
return newRegexNode(emptyType, n.options)
|
||||
case 1:
|
||||
return n.children[0]
|
||||
default:
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
||||
func (n *regexNode) reduceGroup() *regexNode {
|
||||
u := n
|
||||
|
||||
for u.t == ntGroup {
|
||||
u = u.children[0]
|
||||
}
|
||||
|
||||
return u
|
||||
}
|
||||
|
||||
// Simple optimization. If a set is a singleton, an inverse singleton,
// or empty, it's transformed accordingly.
func (n *regexNode) reduceSet() *regexNode {
	// Extract empty-set, one and not-one case as special

	if n.set == nil {
		// an empty set can never match
		n.t = ntNothing
	} else if n.set.IsSingleton() {
		n.ch = n.set.SingletonChar()
		n.set = nil
		// type arithmetic preserves the loop/lazy flavor:
		// Set->One, Setloop->Oneloop, Setlazy->Onelazy
		n.t += (ntOne - ntSet)
	} else if n.set.IsSingletonInverse() {
		n.ch = n.set.SingletonChar()
		n.set = nil
		// same offset trick for the negated variants
		n.t += (ntNotone - ntSet)
	}

	return n
}
|
||||
|
||||
func (n *regexNode) reverseLeft() *regexNode {
|
||||
if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
|
||||
//reverse children order
|
||||
for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
|
||||
n.children[left], n.children[right] = n.children[right], n.children[left]
|
||||
}
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// makeQuantifier applies a {min,max} quantifier (lazy or greedy) to this
// node. {0,0} becomes Empty and {1,1} is the node itself; simple
// character/set leaves are converted in place into their rep variants,
// while anything else is wrapped in a Loop/Lazyloop node.
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
	if min == 0 && max == 0 {
		return newRegexNode(ntEmpty, n.options)
	}

	if min == 1 && max == 1 {
		return n
	}

	switch n.t {
	case ntOne, ntNotone, ntSet:
		// Oneloop/Onelazy act as base offsets; makeRep shifts
		// One/Notone/Set to the matching loop or lazy variant
		if lazy {
			n.makeRep(Onelazy, min, max)
		} else {
			n.makeRep(Oneloop, min, max)
		}
		return n

	default:
		var t nodeType
		if lazy {
			t = ntLazyloop
		} else {
			t = ntLoop
		}
		result := newRegexNodeMN(t, n.options, min, max)
		result.addChild(n)
		return result
	}
}
|
||||
|
||||
// debug functions
|
||||
|
||||
// typeStr maps a nodeType value to its display name for debug dumps;
// it is indexed directly by nodeType, so the "Unknown" entries pad the
// gap between ntTestgroup (34) and ntECMABoundary (41).
var typeStr = []string{
	"Onerep", "Notonerep", "Setrep",
	"Oneloop", "Notoneloop", "Setloop",
	"Onelazy", "Notonelazy", "Setlazy",
	"One", "Notone", "Set",
	"Multi", "Ref",
	"Bol", "Eol", "Boundary", "Nonboundary",
	"Beginning", "Start", "EndZ", "End",
	"Nothing", "Empty",
	"Alternate", "Concatenate",
	"Loop", "Lazyloop",
	"Capture", "Group", "Require", "Prevent", "Greedy",
	"Testref", "Testgroup",
	"Unknown", "Unknown", "Unknown",
	"Unknown", "Unknown", "Unknown",
	"ECMABoundary", "NonECMABoundary",
}
|
||||
|
||||
func (n *regexNode) description() string {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
buf.WriteString(typeStr[n.t])
|
||||
|
||||
if (n.options & ExplicitCapture) != 0 {
|
||||
buf.WriteString("-C")
|
||||
}
|
||||
if (n.options & IgnoreCase) != 0 {
|
||||
buf.WriteString("-I")
|
||||
}
|
||||
if (n.options & RightToLeft) != 0 {
|
||||
buf.WriteString("-L")
|
||||
}
|
||||
if (n.options & Multiline) != 0 {
|
||||
buf.WriteString("-M")
|
||||
}
|
||||
if (n.options & Singleline) != 0 {
|
||||
buf.WriteString("-S")
|
||||
}
|
||||
if (n.options & IgnorePatternWhitespace) != 0 {
|
||||
buf.WriteString("-X")
|
||||
}
|
||||
if (n.options & ECMAScript) != 0 {
|
||||
buf.WriteString("-E")
|
||||
}
|
||||
|
||||
switch n.t {
|
||||
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
|
||||
buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
|
||||
break
|
||||
case ntCapture:
|
||||
buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
|
||||
break
|
||||
case ntRef, ntTestref:
|
||||
buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
|
||||
break
|
||||
case ntMulti:
|
||||
fmt.Fprintf(buf, "(String = %s)", string(n.str))
|
||||
break
|
||||
case ntSet, ntSetloop, ntSetlazy:
|
||||
buf.WriteString("(Set = " + n.set.String() + ")")
|
||||
break
|
||||
}
|
||||
|
||||
switch n.t {
|
||||
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
|
||||
buf.WriteString("(Min = ")
|
||||
buf.WriteString(strconv.Itoa(n.m))
|
||||
buf.WriteString(", Max = ")
|
||||
if n.n == math.MaxInt32 {
|
||||
buf.WriteString("inf")
|
||||
} else {
|
||||
buf.WriteString(strconv.Itoa(n.n))
|
||||
}
|
||||
buf.WriteString(")")
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// padSpace supplies the indentation bytes used by dump; dump caps its
// depth at 32, so this must hold at least 32 spaces.
var padSpace = []byte("                                ")
|
||||
|
||||
// Dump returns a textual rendering of the whole parse tree for debugging.
func (t *RegexTree) Dump() string {
	return t.root.dump()
}
|
||||
|
||||
// dump renders the subtree rooted at n, one node per line, indented by
// depth. It walks iteratively with an explicit child-index stack and
// climbs back up through the parent (next) links.
func (n *regexNode) dump() string {
	var stack []int
	CurNode := n
	CurChild := 0

	buf := bytes.NewBufferString(CurNode.description())
	buf.WriteRune('\n')

	for {
		if CurNode.children != nil && CurChild < len(CurNode.children) {
			// descend: remember which child to resume with on the way up
			stack = append(stack, CurChild+1)
			CurNode = CurNode.children[CurChild]
			CurChild = 0

			// indentation is capped at 32 (the size of padSpace)
			Depth := len(stack)
			if Depth > 32 {
				Depth = 32
			}
			buf.Write(padSpace[:Depth])
			buf.WriteString(CurNode.description())
			buf.WriteRune('\n')
		} else {
			if len(stack) == 0 {
				break
			}

			// ascend to the parent and resume with the saved child index
			CurChild = stack[len(stack)-1]
			stack = stack[:len(stack)-1]
			CurNode = CurNode.next
		}
	}
	return buf.String()
}
|
500
vendor/github.com/dlclark/regexp2/syntax/writer.go
generated
vendored
Normal file
500
vendor/github.com/dlclark/regexp2/syntax/writer.go
generated
vendored
Normal file
|
@ -0,0 +1,500 @@
|
|||
package syntax
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
)
|
||||
|
||||
// Write compiles a parsed RegexTree into the integer Code the matcher
// executes. When the Debug option is set, the generated code is dumped
// to stdout.
func Write(tree *RegexTree) (*Code, error) {
	w := writer{
		intStack:   make([]int, 0, 32),
		emitted:    make([]int, 2),
		stringhash: make(map[string]int),
		sethash:    make(map[string]int),
	}

	code, err := w.codeFromTree(tree)

	if tree.options&Debug > 0 && code != nil {
		os.Stdout.WriteString(code.Dump())
		os.Stdout.WriteString("\n")
	}

	return code, err
}
|
||||
|
||||
// writer holds the state for turning a RegexTree into emitted integer
// code. codeFromTree runs it in two passes: a counting pass to size the
// output, then the real emission pass.
type writer struct {
	emitted []int // the emitted opcodes and operands

	intStack    []int          // explicit stack used instead of recursion during the walk
	curpos      int            // next write position in emitted
	stringhash  map[string]int // dedup: string payload -> stringtable index
	stringtable [][]rune       // interned string payloads
	sethash     map[string]int // dedup: charset hash -> settable index
	settable    []*CharSet     // interned character classes
	counting    bool           // true during the first (sizing) pass
	count       int            // total code size measured by the counting pass
	trackcount  int            // backtracking track slots required
	caps        map[int]int    // sparse capture number -> dense slot mapping (nil when dense)
}
|
||||
|
||||
const (
	// beforeChild/afterChild are flag bits ORed into a node's type when
	// emitFragment is called before (resp. after) visiting one of its
	// children, so interior nodes can emit code on both sides.
	beforeChild nodeType = 64
	afterChild           = 128
	//MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix
	MaxPrefixSize = 50
)
|
||||
|
||||
// The top level RegexCode generator. It does a depth-first walk
// through the tree and calls EmitFragment to emits code before
// and after each child of an interior node, and at each leaf.
//
// It runs two passes, first to count the size of the generated
// code, and second to generate the code.
//
// We should time it against the alternative, which is
// to just generate the code and grow the array as we go.
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
	var (
		curNode  *regexNode
		curChild int
		capsize  int
	)
	// construct sparse capnum mapping if some numbers are unused

	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
		// capture numbering is dense: no remapping needed
		capsize = tree.captop
		w.caps = nil
	} else {
		// remap each used capture number to a dense slot index
		capsize = len(tree.capnumlist)
		w.caps = tree.caps
		for i := 0; i < len(tree.capnumlist); i++ {
			w.caps[tree.capnumlist[i]] = i
		}
	}

	w.counting = true

	// pass 1 counts (w.counting == true); pass 2 emits for real
	for {
		if !w.counting {
			w.emitted = make([]int, w.count)
		}

		curNode = tree.root
		curChild = 0

		// slot 0 holds a Lazybranch patched below to jump past the end
		w.emit1(Lazybranch, 0)

		// iterative depth-first walk using the writer's int stack
		for {
			if len(curNode.children) == 0 {
				w.emitFragment(curNode.t, curNode, 0)
			} else if curChild < len(curNode.children) {
				w.emitFragment(curNode.t|beforeChild, curNode, curChild)

				curNode = curNode.children[curChild]

				w.pushInt(curChild)
				curChild = 0
				continue
			}

			if w.emptyStack() {
				break
			}

			curChild = w.popInt()
			curNode = curNode.next

			w.emitFragment(curNode.t|afterChild, curNode, curChild)
			curChild++
		}

		w.patchJump(0, w.curPos())
		w.emit(Stop)

		if !w.counting {
			break
		}

		w.counting = false
	}

	fcPrefix := getFirstCharsPrefix(tree)
	prefix := getPrefix(tree)
	rtl := (tree.options & RightToLeft) != 0

	var bmPrefix *BmPrefix
	//TODO: benchmark string prefixes
	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
		if len(prefix.PrefixStr) > MaxPrefixSize {
			// limit prefix changes to 10k
			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
		}
		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
	} else {
		bmPrefix = nil
	}

	return &Code{
		Codes:       w.emitted,
		Strings:     w.stringtable,
		Sets:        w.settable,
		TrackCount:  w.trackcount,
		Caps:        w.caps,
		Capsize:     capsize,
		FcPrefix:    fcPrefix,
		BmPrefix:    bmPrefix,
		Anchors:     getAnchors(tree),
		RightToLeft: rtl,
	}, nil
}
|
||||
|
||||
// The main RegexCode generator. It does a depth-first walk
// through the tree and calls EmitFragment to emits code before
// and after each child of an interior node, and at each leaf.
//
// nodetype is the node's type ORed with beforeChild/afterChild when the
// call brackets a child visit; curIndex is that child's index. Branch
// targets are fixed up later via the writer's int stack and patchJump.
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
	bits := InstOp(0)

	// leaf opcodes carry the RTL / case-insensitive flags inline
	if nodetype <= ntRef {
		if (node.options & RightToLeft) != 0 {
			bits |= Rtl
		}
		if (node.options & IgnoreCase) != 0 {
			bits |= Ci
		}
	}
	ntBits := nodeType(bits)

	switch nodetype {
	case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
		break

	case ntAlternate | beforeChild:
		// each non-final branch starts with a Lazybranch to the next branch
		if curIndex < len(node.children)-1 {
			w.pushInt(w.curPos())
			w.emit1(Lazybranch, 0)
		}

	case ntAlternate | afterChild:
		if curIndex < len(node.children)-1 {
			// jump over the remaining branches; patch the pending Lazybranch here
			lbPos := w.popInt()
			w.pushInt(w.curPos())
			w.emit1(Goto, 0)
			w.patchJump(lbPos, w.curPos())
		} else {
			// final branch: patch all pending Gotos to land here
			for i := 0; i < curIndex; i++ {
				w.patchJump(w.popInt(), w.curPos())
			}
		}
		break

	case ntTestref | beforeChild:
		if curIndex == 0 {
			w.emit(Setjump)
			w.pushInt(w.curPos())
			w.emit1(Lazybranch, 0)
			w.emit1(Testref, w.mapCapnum(node.m))
			w.emit(Forejump)
		}

	case ntTestref | afterChild:
		if curIndex == 0 {
			// end of the "yes" branch: jump past the "no" branch
			branchpos := w.popInt()
			w.pushInt(w.curPos())
			w.emit1(Goto, 0)
			w.patchJump(branchpos, w.curPos())
			w.emit(Forejump)
			if len(node.children) <= 1 {
				w.patchJump(w.popInt(), w.curPos())
			}
		} else if curIndex == 1 {
			w.patchJump(w.popInt(), w.curPos())
		}

	case ntTestgroup | beforeChild:
		if curIndex == 0 {
			w.emit(Setjump)
			w.emit(Setmark)
			w.pushInt(w.curPos())
			w.emit1(Lazybranch, 0)
		}

	case ntTestgroup | afterChild:
		if curIndex == 0 {
			// condition group matched
			w.emit(Getmark)
			w.emit(Forejump)
		} else if curIndex == 1 {
			// end of "yes" branch: jump past the "no" branch
			Branchpos := w.popInt()
			w.pushInt(w.curPos())
			w.emit1(Goto, 0)
			w.patchJump(Branchpos, w.curPos())
			w.emit(Getmark)
			w.emit(Forejump)
			if len(node.children) <= 2 {
				w.patchJump(w.popInt(), w.curPos())
			}
		} else if curIndex == 2 {
			w.patchJump(w.popInt(), w.curPos())
		}

	case ntLoop | beforeChild, ntLazyloop | beforeChild:

		// counted loops use Nullcount/Setcount; unbounded {0,}/{1,}
		// loops use the cheaper Nullmark/Setmark
		if node.n < math.MaxInt32 || node.m > 1 {
			if node.m == 0 {
				w.emit1(Nullcount, 0)
			} else {
				w.emit1(Setcount, 1-node.m)
			}
		} else if node.m == 0 {
			w.emit(Nullmark)
		} else {
			w.emit(Setmark)
		}

		// min == 0: allow skipping the body entirely
		if node.m == 0 {
			w.pushInt(w.curPos())
			w.emit1(Goto, 0)
		}
		w.pushInt(w.curPos())

	case ntLoop | afterChild, ntLazyloop | afterChild:

		startJumpPos := w.curPos()
		// lazy is the opcode offset between greedy and lazy variants
		lazy := (nodetype - (ntLoop | afterChild))

		if node.n < math.MaxInt32 || node.m > 1 {
			if node.n == math.MaxInt32 {
				w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
			} else {
				w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
			}
		} else {
			w.emit1(InstOp(Branchmark+lazy), w.popInt())
		}

		if node.m == 0 {
			w.patchJump(w.popInt(), startJumpPos)
		}

	case ntGroup | beforeChild, ntGroup | afterChild:

	case ntCapture | beforeChild:
		w.emit(Setmark)

	case ntCapture | afterChild:
		w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))

	case ntRequire | beforeChild:
		// NOTE: the following line causes lookahead/lookbehind to be
		// NON-BACKTRACKING. It can be commented out with (*)
		w.emit(Setjump)

		w.emit(Setmark)

	case ntRequire | afterChild:
		w.emit(Getmark)

		// NOTE: the following line causes lookahead/lookbehind to be
		// NON-BACKTRACKING. It can be commented out with (*)
		w.emit(Forejump)

	case ntPrevent | beforeChild:
		w.emit(Setjump)
		w.pushInt(w.curPos())
		w.emit1(Lazybranch, 0)

	case ntPrevent | afterChild:
		w.emit(Backjump)
		w.patchJump(w.popInt(), w.curPos())
		w.emit(Forejump)

	case ntGreedy | beforeChild:
		w.emit(Setjump)

	case ntGreedy | afterChild:
		w.emit(Forejump)

	case ntOne, ntNotone:
		w.emit1(InstOp(node.t|ntBits), int(node.ch))

	case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
		// split {m,n} into a fixed rep of m followed by a loop of n-m
		if node.m > 0 {
			if node.t == ntOneloop || node.t == ntOnelazy {
				w.emit2(Onerep|bits, int(node.ch), node.m)
			} else {
				w.emit2(Notonerep|bits, int(node.ch), node.m)
			}
		}
		if node.n > node.m {
			if node.n == math.MaxInt32 {
				w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
			} else {
				w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
			}
		}

	case ntSetloop, ntSetlazy:
		// same split as above, with a set operand instead of a char
		if node.m > 0 {
			w.emit2(Setrep|bits, w.setCode(node.set), node.m)
		}
		if node.n > node.m {
			if node.n == math.MaxInt32 {
				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
			} else {
				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
			}
		}

	case ntMulti:
		w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))

	case ntSet:
		w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))

	case ntRef:
		w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))

	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
		w.emit(InstOp(node.t))

	default:
		return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
	}

	return nil
}
|
||||
|
||||
// To avoid recursion, we use a simple integer stack.
|
||||
// This is the push.
|
||||
func (w *writer) pushInt(i int) {
|
||||
w.intStack = append(w.intStack, i)
|
||||
}
|
||||
|
||||
// Returns true if the stack is empty.
|
||||
func (w *writer) emptyStack() bool {
|
||||
return len(w.intStack) == 0
|
||||
}
|
||||
|
||||
// This is the pop.
|
||||
func (w *writer) popInt() int {
|
||||
//get our item
|
||||
idx := len(w.intStack) - 1
|
||||
i := w.intStack[idx]
|
||||
//trim our slice
|
||||
w.intStack = w.intStack[:idx]
|
||||
return i
|
||||
}
|
||||
|
||||
// Returns the current position in the emitted code.
|
||||
func (w *writer) curPos() int {
|
||||
return w.curpos
|
||||
}
|
||||
|
||||
// Fixes up a jump instruction at the specified offset
|
||||
// so that it jumps to the specified jumpDest.
|
||||
func (w *writer) patchJump(offset, jumpDest int) {
|
||||
w.emitted[offset+1] = jumpDest
|
||||
}
|
||||
|
||||
// Returns an index in the set table for a charset
|
||||
// uses a map to eliminate duplicates.
|
||||
func (w *writer) setCode(set *CharSet) int {
|
||||
if w.counting {
|
||||
return 0
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
set.mapHashFill(buf)
|
||||
hash := buf.String()
|
||||
i, ok := w.sethash[hash]
|
||||
if !ok {
|
||||
i = len(w.sethash)
|
||||
w.sethash[hash] = i
|
||||
w.settable = append(w.settable, set)
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// Returns an index in the string table for a string.
|
||||
// uses a map to eliminate duplicates.
|
||||
func (w *writer) stringCode(str []rune) int {
|
||||
if w.counting {
|
||||
return 0
|
||||
}
|
||||
|
||||
hash := string(str)
|
||||
i, ok := w.stringhash[hash]
|
||||
if !ok {
|
||||
i = len(w.stringhash)
|
||||
w.stringhash[hash] = i
|
||||
w.stringtable = append(w.stringtable, str)
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
// When generating code on a regex that uses a sparse set
|
||||
// of capture slots, we hash them to a dense set of indices
|
||||
// for an array of capture slots. Instead of doing the hash
|
||||
// at match time, it's done at compile time, here.
|
||||
func (w *writer) mapCapnum(capnum int) int {
|
||||
if capnum == -1 {
|
||||
return -1
|
||||
}
|
||||
|
||||
if w.caps != nil {
|
||||
return w.caps[capnum]
|
||||
}
|
||||
|
||||
return capnum
|
||||
}
|
||||
|
||||
// Emits a zero-argument operation. Note that the emit
|
||||
// functions all run in two modes: they can emit code, or
|
||||
// they can just count the size of the code.
|
||||
func (w *writer) emit(op InstOp) {
|
||||
if w.counting {
|
||||
w.count++
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
}
|
||||
|
||||
// Emits a one-argument operation.
|
||||
func (w *writer) emit1(op InstOp, opd1 int) {
|
||||
if w.counting {
|
||||
w.count += 2
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd1
|
||||
w.curpos++
|
||||
}
|
||||
|
||||
// Emits a two-argument operation.
|
||||
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
||||
if w.counting {
|
||||
w.count += 3
|
||||
if opcodeBacktracks(op) {
|
||||
w.trackcount++
|
||||
}
|
||||
return
|
||||
}
|
||||
w.emitted[w.curpos] = int(op)
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd1
|
||||
w.curpos++
|
||||
w.emitted[w.curpos] = opd2
|
||||
w.curpos++
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue