1
0
Fork 0
forked from forgejo/forgejo

Server-side syntax highlighting for all code (#12047)

* Server-side syntax highlighting for all code

This PR does a few things:

* Remove all traces of highlight.js
* Use the chroma library to provide fast syntax highlighting directly on the server
* Provide syntax highlighting for diffs
* Re-style both unified and split diffs views
* Add custom syntax highlighting styling for both regular and arc-green

Fixes #7729
Fixes #10157
Fixes #11825
Fixes #7728
Fixes #3872
Fixes #3682

And perhaps gets closer to #9553

* fix line marker

* fix repo search

* Fix single line select

* properly load settings

* npm uninstall highlight.js

* review suggestion

* code review

* forgot to call function

* fix test

* Apply suggestions from code review

suggestions from @silverwind thanks

Co-authored-by: silverwind <me@silverwind.io>

* code review

* copy/paste error

* Use const for highlight size limit

* Update web_src/less/_repository.less

Co-authored-by: Lauris BH <lauris@nix.lv>

* update size limit to 1MB and other styling tweaks

* fix highlighting for certain diff sections

* fix test

* add worker back as suggested

Co-authored-by: silverwind <me@silverwind.io>
Co-authored-by: Lauris BH <lauris@nix.lv>
This commit is contained in:
mrsdizzie 2020-06-30 17:34:03 -04:00 committed by GitHub
parent ce5f2b9845
commit af7ffaa279
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
336 changed files with 37293 additions and 769 deletions

854
vendor/github.com/dlclark/regexp2/syntax/charclass.go generated vendored Normal file
View file

@ -0,0 +1,854 @@
package syntax
import (
"bytes"
"encoding/binary"
"fmt"
"sort"
"unicode"
"unicode/utf8"
)
// CharSet combines start-end rune ranges and unicode categories representing a set of characters
type CharSet struct {
	// ranges holds inclusive [first, last] rune intervals.
	ranges []singleRange
	// categories holds named classes tested in addition to ranges.
	categories []category
	// sub, when non-nil, is subtracted from this set by CharIn.
	sub *CharSet
	// negate inverts the range/category result in CharIn;
	// anything is set by makeAnything once the set covers every rune.
	negate, anything bool
}

// category names one class (a unicode table key or one of the special
// space/word names) together with its own negation flag.
type category struct {
	negate bool
	cat    string
}

// singleRange is an inclusive interval of runes.
type singleRange struct {
	first, last rune
}
// Internal names of the two categories that are not looked up in the unicode
// tables: CharIn tests them with unicode.IsSpace and IsWordChar respectively.
const (
	spaceCategoryText = " "
	wordCategoryText = "W"
)

// ECMAScript class definitions in the boundary-list layout consumed by
// getCharSetFromOldString: entries alternate between the first rune of a range
// and the rune one past its end, so {0x0030, 0x003a} means '0' through '9'.
var (
	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
	ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
	ecmaDigit = []rune{0x0030, 0x003a}
)
// Predefined character classes. Each value is a factory function: calling it
// returns a pointer to a fresh struct copy of the prototype set, so callers
// can set fields on the result without touching the prototype. The copy is
// shallow, i.e. the ranges/categories slices still share their backing arrays.
//
// The ECMA* classes are built from explicit rune boundaries; the others are
// built from category names. Note that NotDigitClass negates the category
// ("not Nd") whereas NotWordClass and NotSpaceClass negate the whole set.
var (
	AnyClass = getCharSetFromOldString([]rune{0}, false)
	ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
	NoneClass = getCharSetFromOldString(nil, false)
	ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
	NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
	ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
	ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
	WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
	NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
	SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
	DigitClass = getCharSetFromCategoryString(false, false, "Nd")
	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
)
// unicodeCategories is a single lookup table over every name the unicode
// package exposes as a script, a general category or a property. Sources are
// merged in that order, so on a name clash the later source wins.
var unicodeCategories = func() map[string]*unicode.RangeTable {
	sources := []map[string]*unicode.RangeTable{
		unicode.Scripts,
		unicode.Categories,
		unicode.Properties,
	}
	merged := make(map[string]*unicode.RangeTable)
	for _, src := range sources {
		for name, table := range src {
			merged[name] = table
		}
	}
	return merged
}()
// getCharSetFromCategoryString builds a prototype set from category names and
// returns a factory that hands out a fresh struct copy on every call.
// Exactly one of negateSet / negateCat may be true; asking for both panics.
func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
	if negateSet && negateCat {
		panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
	}

	entries := make([]category, 0, len(cats))
	for _, name := range cats {
		entries = append(entries, category{cat: name, negate: negateCat})
	}
	proto := CharSet{negate: negateSet, categories: entries}

	return func() *CharSet {
		// copy the prototype so the caller gets its own struct
		clone := proto
		return &clone
	}
}
// getCharSetFromOldString builds a prototype set from a boundary list and
// returns a factory that hands out a fresh struct copy on every call.
//
// setText alternates between the first rune of a range and the rune one past
// its end; a trailing unmatched start runs to utf8.MaxRune. With negate the
// list is shifted by one boundary: either a leading 0 is dropped or an
// implicit range starting at 0 is prepended.
func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
	proto := CharSet{}

	if n := len(setText); n > 0 {
		prependZero := false
		if negate {
			if setText[0] == 0 {
				setText = setText[1:]
			} else {
				n++
				prependZero = true
			}
		}

		// one range per pair of boundaries, rounding up for a dangling start
		proto.ranges = make([]singleRange, (n+1)/2)

		// expectStart is true while the next boundary opens a new range
		expectStart := true
		if prependZero {
			proto.ranges[0] = singleRange{first: 0}
			expectStart = false
		}

		idx := 0
		for _, boundary := range setText {
			if expectStart {
				proto.ranges[idx] = singleRange{first: boundary}
			} else {
				// boundaries are exclusive, stored ranges inclusive
				proto.ranges[idx].last = boundary - 1
				idx++
			}
			expectStart = !expectStart
		}
		if !expectStart {
			proto.ranges[idx].last = utf8.MaxRune
		}
	}

	return func() *CharSet {
		clone := proto
		return &clone
	}
}
// Copy makes a deep copy to prevent accidental mutation of a set.
// Ranges and categories get their own backing arrays and the subtracted set,
// if any, is copied recursively.
func (c CharSet) Copy() CharSet {
	var subCopy *CharSet
	if c.sub != nil {
		inner := c.sub.Copy()
		subCopy = &inner
	}
	return CharSet{
		ranges:     append([]singleRange(nil), c.ranges...),
		categories: append([]category(nil), c.categories...),
		sub:        subCopy,
		negate:     c.negate,
		anything:   c.anything,
	}
}
// String renders the set in bracket notation: an optional '^', each range,
// each category, and finally "-[...]" for a subtracted set.
func (c CharSet) String() string {
	var out bytes.Buffer
	out.WriteRune('[')
	if c.IsNegated() {
		out.WriteRune('^')
	}

	for _, rg := range c.ranges {
		out.WriteString(CharDescription(rg.first))
		if rg.first == rg.last {
			continue
		}
		// two adjacent runes are printed back to back, without the dash
		if rg.last-rg.first != 1 {
			out.WriteRune('-')
		}
		out.WriteString(CharDescription(rg.last))
	}

	for _, ct := range c.categories {
		out.WriteString(ct.String())
	}

	if c.sub != nil {
		out.WriteRune('-')
		out.WriteString(c.sub.String())
	}

	out.WriteRune(']')
	return out.String()
}
// mapHashFill appends a byte encoding of the set to buf so that the buffer
// contents can be used as a map key.
//
// Layout: one byte for the set negation, the number of ranges and the number
// of categories as little-endian uint32, every range as two little-endian
// uint32 values, every category as its name followed by a negation byte, and
// finally the same encoding of the subtracted set when there is one.
//
// The counts and runes are written as fixed-width integers on purpose:
// binary.Write rejects a plain int (it is not a fixed-size type), which meant
// the counts were silently never written, and WriteRune folds surrogates and
// other invalid runes into U+FFFD, which made different ranges encode alike.
func (c CharSet) mapHashFill(buf *bytes.Buffer) {
	if c.negate {
		buf.WriteByte(0)
	} else {
		buf.WriteByte(1)
	}

	var scratch [4]byte
	putUint32 := func(v uint32) {
		binary.LittleEndian.PutUint32(scratch[:], v)
		buf.Write(scratch[:])
	}

	putUint32(uint32(len(c.ranges)))
	putUint32(uint32(len(c.categories)))
	for _, r := range c.ranges {
		putUint32(uint32(r.first))
		putUint32(uint32(r.last))
	}
	for _, ct := range c.categories {
		buf.WriteString(ct.cat)
		if ct.negate {
			buf.WriteByte(1)
		} else {
			buf.WriteByte(0)
		}
	}

	if c.sub != nil {
		c.sub.mapHashFill(buf)
	}
}
// CharIn returns true if the rune is in our character set (either ranges or categories).
// It handles negations and subtracted sub-charsets.
//
// The result is computed as: (inside any range OR accepted by any category),
// inverted when the whole set is negated, and then rejected when the
// subtracted set contains the rune.
//
// A category accepts the rune when membership differs from its negate flag.
// Every category is consulted until one accepts: a class is a union, so a
// negated category that happens to contain the rune must not stop the search
// (otherwise e.g. [\D\w] would reject '5' although \w matches it).
func (c CharSet) CharIn(ch rune) bool {
	val := false

	// check ranges
	for _, r := range c.ranges {
		if ch < r.first {
			continue
		}
		if ch <= r.last {
			val = true
			break
		}
	}

	// check categories if we haven't already found a range
	if !val {
		for _, ct := range c.categories {
			var inCat bool
			switch ct.cat {
			case spaceCategoryText:
				inCat = unicode.IsSpace(ch)
			case wordCategoryText:
				inCat = IsWordChar(ch)
			default:
				inCat = unicode.Is(unicodeCategories[ct.cat], ch)
			}
			if inCat != ct.negate {
				val = true
				break
			}
		}
	}

	// negate the whole char set
	if c.negate {
		val = !val
	}

	// a rune that is also in the subtracted set is not a member
	if val && c.sub != nil {
		val = !c.sub.CharIn(ch)
	}

	return val
}
// String renders the category in pattern syntax: \s \S \w \W for the two
// special categories, \p{Name} / \P{Name} for names found in
// unicodeCategories, and a diagnostic text for anything else.
func (c category) String() string {
	if c.cat == spaceCategoryText {
		if c.negate {
			return "\\S"
		}
		return "\\s"
	}
	if c.cat == wordCategoryText {
		if c.negate {
			return "\\W"
		}
		return "\\w"
	}

	if _, known := unicodeCategories[c.cat]; !known {
		return "Unknown category: " + c.cat
	}
	if c.negate {
		return "\\P{" + c.cat + "}"
	}
	return "\\p{" + c.cat + "}"
}
// CharDescription produces a human-readable description for a single
// character by running it through escape without forcing a backslash.
func CharDescription(ch rune) string {
	var out bytes.Buffer
	escape(&out, ch, false)
	return out.String()
}
// IsWordChar reports whether r counts as a word character.
//
// Following UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
// RL 1.4 Simple Word Boundaries, the accepted runes are those in the general
// categories L, Mn, Nd and Pc, plus U+200C ZERO WIDTH NON-JOINER and
// U+200D ZERO WIDTH JOINER.
func IsWordChar(r rune) bool {
	switch r {
	case '\u200C', '\u200D':
		return true
	}
	return unicode.In(r, unicode.L, unicode.Mn, unicode.Nd, unicode.Pc)
}
// IsECMAWordChar reports whether r is a word character under ECMAScript
// rules, i.e. one of the ASCII runes [0-9A-Za-z_].
//
// This is the same set that ecmaWord / ECMAWordClass describe, so that \b and
// \B in ECMAScript mode agree with \w. The previous body was a copy of the
// Unicode-aware test and treated letters such as 'é' as word characters even
// though the ECMAScript \w class does not contain them.
func IsECMAWordChar(r rune) bool {
	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
}
// SingletonChar will return the char from the first range without validation.
// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
func (c CharSet) SingletonChar() rune {
	lead := c.ranges[0]
	return lead.first
}

// IsSingleton reports whether the set is exactly one rune: not negated, no
// categories, no subtraction, and a single range whose ends coincide.
func (c CharSet) IsSingleton() bool {
	if c.negate || c.sub != nil {
		return false
	}
	if len(c.categories) != 0 || len(c.ranges) != 1 {
		return false
	}
	only := c.ranges[0]
	return only.first == only.last
}

// IsSingletonInverse is IsSingleton for a negated set: everything except
// exactly one rune.
func (c CharSet) IsSingletonInverse() bool {
	if !c.negate || c.sub != nil {
		return false
	}
	if len(c.categories) != 0 || len(c.ranges) != 1 {
		return false
	}
	only := c.ranges[0]
	return only.first == only.last
}
// IsMergeable reports whether the set is neither negated nor carries a
// subtracted set.
func (c CharSet) IsMergeable() bool {
	return !(c.IsNegated() || c.HasSubtraction())
}

// IsNegated reports whether the set as a whole is negated.
func (c CharSet) IsNegated() bool {
	return c.negate
}

// HasSubtraction reports whether a subtracted set is attached.
func (c CharSet) HasSubtraction() bool {
	return c.sub != nil
}

// IsEmpty reports whether the set has no ranges, no categories and no
// subtracted set.
func (c CharSet) IsEmpty() bool {
	if c.sub != nil {
		return false
	}
	return len(c.ranges) == 0 && len(c.categories) == 0
}
// addDigit adds the digit class. In ECMA mode the explicit ECMA digit ranges
// (or their complement) are merged in; otherwise the "Nd" category is added
// with the requested negation. pattern is not used.
func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
	switch {
	case !ecma:
		c.addCategories(category{cat: "Nd", negate: negate})
	case negate:
		c.addRanges(NotECMADigitClass().ranges)
	default:
		c.addRanges(ECMADigitClass().ranges)
	}
}

// addChar adds a single rune as a one-element range.
func (c *CharSet) addChar(ch rune) {
	c.addRange(ch, ch)
}

// addSpace adds the whitespace class: explicit ECMA ranges (or their
// complement) in ECMA mode, the special space category otherwise.
func (c *CharSet) addSpace(ecma, negate bool) {
	switch {
	case !ecma:
		c.addCategories(category{cat: spaceCategoryText, negate: negate})
	case negate:
		c.addRanges(NotECMASpaceClass().ranges)
	default:
		c.addRanges(ECMASpaceClass().ranges)
	}
}

// addWord adds the word class: explicit ECMA ranges (or their complement) in
// ECMA mode, the special word category otherwise.
func (c *CharSet) addWord(ecma, negate bool) {
	switch {
	case !ecma:
		c.addCategories(category{cat: wordCategoryText, negate: negate})
	case negate:
		c.addRanges(NotECMAWordClass().ranges)
	default:
		c.addRanges(ECMAWordClass().ranges)
	}
}
// addSet merges the ranges and categories of set into c. If c already matches
// anything nothing happens; if set matches anything, c becomes "anything".
// The negate flag and subtracted set of the argument are not looked at.
func (c *CharSet) addSet(set CharSet) {
	switch {
	case c.anything:
		return
	case set.anything:
		c.makeAnything()
		return
	}

	// append directly and canonicalize once at the end
	c.ranges = append(c.ranges, set.ranges...)
	c.addCategories(set.categories...)
	c.canonicalize()
}
// makeAnything turns the set into one that matches every rune: categories are
// cleared and the ranges are replaced by those of AnyClass.
func (c *CharSet) makeAnything() {
	c.ranges = AnyClass().ranges
	c.categories = []category{}
	c.anything = true
}

// addCategories adds each category that is not already present. Adding a
// category whose name is already present with the opposite negation means the
// union covers everything, so the set is switched to "anything" and the
// remaining arguments are ignored.
func (c *CharSet) addCategories(cats ...category) {
	if c.anything {
		// already as broad as it can get
		return
	}

nextCategory:
	for _, incoming := range cats {
		for _, existing := range c.categories {
			if existing.cat != incoming.cat {
				continue
			}
			if existing.negate != incoming.negate {
				// opposite negations of the same category
				c.makeAnything()
				return
			}
			// exact duplicate, nothing to add
			continue nextCategory
		}
		c.categories = append(c.categories, incoming)
	}
}
// addRanges merges the given ranges into the set and re-canonicalizes it.
// A set that already matches anything is left untouched.
func (c *CharSet) addRanges(ranges []singleRange) {
	if !c.anything {
		c.ranges = append(c.ranges, ranges...)
		c.canonicalize()
	}
}
// addNegativeRanges merges the complement of the given ranges into the set:
// every rune from 0 to utf8.MaxRune that lies in none of them.
//
// ranges must be sorted by their first rune. A set that already matches
// anything is left untouched.
func (c *CharSet) addNegativeRanges(ranges []singleRange) {
	if c.anything {
		return
	}

	// hi is the lowest rune not yet known to be covered by the input
	var hi rune
	for _, r := range ranges {
		if hi < r.first {
			c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
		}
		// only ever move forward, so a range nested inside an earlier one
		// cannot re-open a gap that is actually covered
		if next := r.last + 1; next > hi {
			hi = next
		}
	}

	// <= rather than <: when the input stops at MaxRune-1 the one-rune tail
	// [MaxRune, MaxRune] still belongs to the complement
	if hi <= utf8.MaxRune {
		c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
	}

	c.canonicalize()
}
// isValidUnicodeCat reports whether catName is a key of unicodeCategories.
func isValidUnicodeCat(catName string) bool {
	if _, known := unicodeCategories[catName]; known {
		return true
	}
	return false
}

// addCategory adds a unicode category, script or property by name and panics
// with an error value when the name is unknown. With caseInsensitive set, any
// of the cased-letter categories Ll, Lu and Lt pulls in all three.
// pattern is not used.
func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
	if !isValidUnicodeCat(categoryName) {
		panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
	}

	if caseInsensitive {
		switch categoryName {
		case "Ll", "Lu", "Lt":
			// ignoring case, the three cased-letter categories are interchangeable
			c.addCategories(
				category{cat: "Ll", negate: negate},
				category{cat: "Lu", negate: negate},
				category{cat: "Lt", negate: negate})
		}
	}

	c.addCategories(category{cat: categoryName, negate: negate})
}
// addSubtraction attaches sub as the set to subtract, replacing any previous one.
func (c *CharSet) addSubtraction(sub *CharSet) {
	c.sub = sub
}

// addRange adds the inclusive range [chMin, chMax] and re-canonicalizes the set.
func (c *CharSet) addRange(chMin, chMax rune) {
	added := singleRange{first: chMin, last: chMax}
	c.ranges = append(c.ranges, added)
	c.canonicalize()
}
// addNamedASCII adds the class with the given POSIX-style name (alnum, alpha,
// ascii, ...), or its complement when negate is set. It returns false, leaving
// the set unchanged, when the name is not recognized.
//
// "digit", "space" and "word" are delegated to addDigit, addSpace and addWord;
// all other names map to fixed ASCII ranges.
func (c *CharSet) addNamedASCII(name string, negate bool) bool {
	switch name {
	case "digit":
		c.addDigit(false, negate, "")
		return true
	case "space":
		c.addSpace(true, negate)
		return true
	case "word":
		c.addWord(true, negate)
		return true
	}

	var rs []singleRange
	switch name {
	case "alnum":
		rs = []singleRange{{'0', '9'}, {'A', 'Z'}, {'a', 'z'}}
	case "alpha":
		rs = []singleRange{{'A', 'Z'}, {'a', 'z'}}
	case "ascii":
		rs = []singleRange{{0, 0x7f}}
	case "blank":
		rs = []singleRange{{'\t', '\t'}, {' ', ' '}}
	case "cntrl":
		rs = []singleRange{{0, 0x1f}, {0x7f, 0x7f}}
	case "graph":
		rs = []singleRange{{'!', '~'}}
	case "lower":
		rs = []singleRange{{'a', 'z'}}
	case "print":
		rs = []singleRange{{' ', '~'}}
	case "punct": //[!-/:-@[-`{-~]
		rs = []singleRange{{'!', '/'}, {':', '@'}, {'[', '`'}, {'{', '~'}}
	case "upper":
		rs = []singleRange{{'A', 'Z'}}
	case "xdigit":
		rs = []singleRange{{'0', '9'}, {'A', 'F'}, {'a', 'f'}}
	default:
		return false
	}

	if negate {
		c.addNegativeRanges(rs)
	} else {
		c.addRanges(rs)
	}
	return true
}
// singleRangeSorter orders ranges by their first rune for sort.Sort.
type singleRangeSorter []singleRange

func (p singleRangeSorter) Len() int           { return len(p) }
func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
func (p singleRangeSorter) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// canonicalize reduces the ranges to a unique, sorted form: ranges are sorted
// by first rune and any that overlap or abut are merged into one. The merge is
// done in place at the front of the existing slice. Once a merged range
// reaches utf8.MaxRune everything after it is dropped.
func (c *CharSet) canonicalize() {
	if len(c.ranges) <= 1 {
		return
	}
	sort.Sort(singleRangeSorter(c.ranges))

	// merged aliases the front of c.ranges; it never grows past the element
	// currently being read, so writing through it is safe
	merged := c.ranges[:1]
	for _, next := range c.ranges[1:] {
		tail := &merged[len(merged)-1]
		if tail.last == utf8.MaxRune {
			break
		}
		if next.first > tail.last+1 {
			// a real gap: start a new output range
			merged = append(merged, next)
			continue
		}
		if tail.last < next.last {
			tail.last = next.last
		}
	}
	c.ranges = merged
}
// addLowercase makes the set usable for case-insensitive matching.
// Single-rune ranges are replaced in place by their lowercase form; for
// every wider range the lowercase counterparts are added through
// addLowercaseRange. A set that already matches anything is left untouched.
func (c *CharSet) addLowercase() {
	if c.anything {
		return
	}

	// collect the wide ranges first: addLowercaseRange modifies c.ranges
	var spans []singleRange
	for i := range c.ranges {
		cur := c.ranges[i]
		if cur.first != cur.last {
			spans = append(spans, cur)
			continue
		}
		lower := unicode.ToLower(cur.first)
		c.ranges[i] = singleRange{first: lower, last: lower}
	}

	for _, span := range spans {
		c.addLowercaseRange(span.first, span.last)
	}
	c.canonicalize()
}
/**************************************************************************
Let U be the set of Unicode character values and let L be the lowercase
function, mapping from U to U. To perform case insensitive matching of
character sets, we need to be able to map an interval I in U, say
I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
to a set A such that A contains L(I) and A is contained in the union of
I and L(I).
The table below partitions U into intervals on which L is non-decreasing.
Thus, for any interval J = [a, b] contained in one of these intervals,
L(J) is contained in [L(a), L(b)].
It is also true that for any such J, [L(a), L(b)] is contained in the
union of J and L(J). This does not follow from L being non-decreasing on
these intervals. It follows from the nature of the L on each interval.
On each interval, L has one of the following forms:
(1) L(ch) = constant (LowercaseSet)
(2) L(ch) = ch + offset (LowercaseAdd)
(3) L(ch) = ch | 1 (LowercaseBor)
(4) L(ch) = ch + (ch & 1) (LowercaseBad)
It is easy to verify that for any of these forms [L(a), L(b)] is
contained in the union of [a, b] and L([a, b]).
***************************************************************************/
// Operations an lcTable entry can apply to map a rune to lowercase.
const (
	LowercaseSet = iota // Set to arg.
	LowercaseAdd        // Add arg.
	LowercaseBor        // Bitwise or with 1.
	LowercaseBad        // Bitwise and with 1 and add original.
)

// lcMap describes how to lowercase the runes in [chMin, chMax]: op is one of
// the Lowercase* operations and data is its argument (unused by Bor and Bad).
type lcMap struct {
	chMin, chMax rune
	op, data     int32
}

// lcTable lists the lowercase mappings in ascending, non-overlapping rune
// order; addLowercaseRange relies on that order for its binary search.
var lcTable = []lcMap{
	{'\u0041', '\u005A', LowercaseAdd, 32},
	{'\u00C0', '\u00DE', LowercaseAdd, 32},
	{'\u0100', '\u012E', LowercaseBor, 0},
	{'\u0130', '\u0130', LowercaseSet, 0x0069},
	{'\u0132', '\u0136', LowercaseBor, 0},
	{'\u0139', '\u0147', LowercaseBad, 0},
	{'\u014A', '\u0176', LowercaseBor, 0},
	{'\u0178', '\u0178', LowercaseSet, 0x00FF},
	{'\u0179', '\u017D', LowercaseBad, 0},
	{'\u0181', '\u0181', LowercaseSet, 0x0253},
	{'\u0182', '\u0184', LowercaseBor, 0},
	{'\u0186', '\u0186', LowercaseSet, 0x0254},
	{'\u0187', '\u0187', LowercaseSet, 0x0188},
	{'\u0189', '\u018A', LowercaseAdd, 205},
	{'\u018B', '\u018B', LowercaseSet, 0x018C},
	{'\u018E', '\u018E', LowercaseSet, 0x01DD},
	{'\u018F', '\u018F', LowercaseSet, 0x0259},
	{'\u0190', '\u0190', LowercaseSet, 0x025B},
	{'\u0191', '\u0191', LowercaseSet, 0x0192},
	{'\u0193', '\u0193', LowercaseSet, 0x0260},
	{'\u0194', '\u0194', LowercaseSet, 0x0263},
	{'\u0196', '\u0196', LowercaseSet, 0x0269},
	{'\u0197', '\u0197', LowercaseSet, 0x0268},
	{'\u0198', '\u0198', LowercaseSet, 0x0199},
	{'\u019C', '\u019C', LowercaseSet, 0x026F},
	{'\u019D', '\u019D', LowercaseSet, 0x0272},
	{'\u019F', '\u019F', LowercaseSet, 0x0275},
	{'\u01A0', '\u01A4', LowercaseBor, 0},
	{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
	{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
	{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
	{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
	{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
	{'\u01B1', '\u01B2', LowercaseAdd, 217},
	{'\u01B3', '\u01B5', LowercaseBad, 0},
	{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
	{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
	{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
	{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
	{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
	{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
	{'\u01CD', '\u01DB', LowercaseBad, 0},
	{'\u01DE', '\u01EE', LowercaseBor, 0},
	{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
	{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
	{'\u01FA', '\u0216', LowercaseBor, 0},
	{'\u0386', '\u0386', LowercaseSet, 0x03AC},
	{'\u0388', '\u038A', LowercaseAdd, 37},
	{'\u038C', '\u038C', LowercaseSet, 0x03CC},
	{'\u038E', '\u038F', LowercaseAdd, 63},
	{'\u0391', '\u03AB', LowercaseAdd, 32},
	{'\u03E2', '\u03EE', LowercaseBor, 0},
	{'\u0401', '\u040F', LowercaseAdd, 80},
	{'\u0410', '\u042F', LowercaseAdd, 32},
	{'\u0460', '\u0480', LowercaseBor, 0},
	{'\u0490', '\u04BE', LowercaseBor, 0},
	{'\u04C1', '\u04C3', LowercaseBad, 0},
	{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
	{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
	{'\u04D0', '\u04EA', LowercaseBor, 0},
	{'\u04EE', '\u04F4', LowercaseBor, 0},
	{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
	{'\u0531', '\u0556', LowercaseAdd, 48},
	{'\u10A0', '\u10C5', LowercaseAdd, 48},
	{'\u1E00', '\u1EF8', LowercaseBor, 0},
	{'\u1F08', '\u1F0F', LowercaseAdd, -8},
	{'\u1F18', '\u1F1F', LowercaseAdd, -8},
	{'\u1F28', '\u1F2F', LowercaseAdd, -8},
	{'\u1F38', '\u1F3F', LowercaseAdd, -8},
	{'\u1F48', '\u1F4D', LowercaseAdd, -8},
	{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
	{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
	{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
	{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
	{'\u1F68', '\u1F6F', LowercaseAdd, -8},
	{'\u1F88', '\u1F8F', LowercaseAdd, -8},
	{'\u1F98', '\u1F9F', LowercaseAdd, -8},
	{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
	{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
	{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
	{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
	{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
	{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
	{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
	{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
	{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
	{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
	{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
	{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
	{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
	{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
	{'\u2160', '\u216F', LowercaseAdd, 16},
	{'\u24B6', '\u24D0', LowercaseAdd, 26},
	{'\uFF21', '\uFF3A', LowercaseAdd, 32},
}
// addLowercaseRange adds the lowercase images of the runes in [chMin, chMax].
//
// It locates the first lcTable entry that can overlap the interval, then for
// each overlapping entry clips the entry to the interval, applies the entry's
// operation to both ends, and adds the resulting range if it reaches outside
// [chMin, chMax].
func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
	// first entry whose upper end is not below chMin (lcTable is sorted)
	start := sort.Search(len(lcTable), func(k int) bool {
		return lcTable[k].chMax >= chMin
	})

	for _, entry := range lcTable[start:] {
		if entry.chMin > chMax {
			return
		}

		// clip the table entry to the requested interval
		lo, hi := entry.chMin, entry.chMax
		if lo < chMin {
			lo = chMin
		}
		if hi > chMax {
			hi = chMax
		}

		switch entry.op {
		case LowercaseSet:
			lo, hi = rune(entry.data), rune(entry.data)
		case LowercaseAdd:
			lo += entry.data
			hi += entry.data
		case LowercaseBor:
			lo |= 1
			hi |= 1
		case LowercaseBad:
			lo += lo & 1
			hi += hi & 1
		}

		if lo < chMin || hi > chMax {
			c.addRange(lo, hi)
		}
	}
}

274
vendor/github.com/dlclark/regexp2/syntax/code.go generated vendored Normal file
View file

@ -0,0 +1,274 @@
package syntax
import (
"bytes"
"fmt"
"math"
)
// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
// File provides operator constants for use by the Builder and the Machine.
// Implementation notes:
//
// Regexps are built into RegexCodes, which contain an operation array,
// a string table, and some constants.
//
// Each operation is one of the codes below, followed by the integer
// operands specified for each op.
//
// Strings and sets are indices into a string table.
// InstOp is an instruction opcode. The low bits (see Mask) select the
// operation; the Rtl, Back, Back2 and Ci bits declared at the end of the
// constant block may be OR-ed on top of it.
type InstOp int

// Only Onerep is declared with the InstOp type. Every other name carries an
// explicit value without a type and is therefore an untyped integer constant
// that converts implicitly where an InstOp (or an int) is expected.
const (
	// lef/back operands description
	Onerep InstOp = 0 // lef,back char,min,max a {n}
	Notonerep = 1 // lef,back char,min,max .{n}
	Setrep = 2 // lef,back set,min,max [\d]{n}
	Oneloop = 3 // lef,back char,min,max a {,n}
	Notoneloop = 4 // lef,back char,min,max .{,n}
	Setloop = 5 // lef,back set,min,max [\d]{,n}
	Onelazy = 6 // lef,back char,min,max a {,n}?
	Notonelazy = 7 // lef,back char,min,max .{,n}?
	Setlazy = 8 // lef,back set,min,max [\d]{,n}?
	One = 9 // lef char a
	Notone = 10 // lef char [^a]
	Set = 11 // lef set [a-z\s] \w \s \d
	Multi = 12 // lef string abcd
	Ref = 13 // lef group \#
	Bol = 14 // ^
	Eol = 15 // $
	Boundary = 16 // \b
	Nonboundary = 17 // \B
	Beginning = 18 // \A
	Start = 19 // \G
	EndZ = 20 // \Z
	End = 21 // \z (NOTE(review): this comment previously repeated \Z; presumably the lowercase anchor, confirm)
	Nothing = 22 // Reject!
	// Primitive control structures
	Lazybranch = 23 // back jump straight first
	Branchmark = 24 // back jump branch first for loop
	Lazybranchmark = 25 // back jump straight first for loop
	Nullcount = 26 // back val set counter, null mark
	Setcount = 27 // back val set counter, make mark
	Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
	Lazybranchcount = 29 // back jump,limit same, but straight first
	Nullmark = 30 // back save position
	Setmark = 31 // back save position
	Capturemark = 32 // back group define group
	Getmark = 33 // back recall position
	Setjump = 34 // back save backtrack state
	Backjump = 35 // zap back to saved state
	Forejump = 36 // zap backtracking state
	Testref = 37 // backtrack if ref undefined
	Goto = 38 // jump just go
	Prune = 39 // prune it baby
	Stop = 40 // done!
	ECMABoundary = 41 // \b
	NonECMABoundary = 42 // \B
	// Modifiers for alternate modes
	Mask = 63 // Mask to get unmodified ordinary operator
	Rtl = 64 // bit to indicate that we're reverse scanning.
	Back = 128 // bit to indicate that we're backtracking.
	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
	Ci = 512 // bit to indicate that we're case-insensitive.
)
// Code is a compiled regex program: the flat instruction stream in Codes plus
// the string and set tables its operands index into, and the metadata listed
// on the fields below. Dump and OpcodeDescription render it as text.
type Code struct {
	Codes []int // the code: opcodes interleaved with their integer operands
	Strings [][]rune // string table
	Sets []*CharSet //character set table
	TrackCount int // how many instructions use backtracking
	Caps map[int]int // mapping of user group numbers -> impl group slots
	Capsize int // number of impl group slots
	FcPrefix *Prefix // the set of candidate first characters (may be null)
	BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
	Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
	RightToLeft bool // true if right to left
}
// opcodeBacktracks reports whether the operation (modifier bits are masked
// off first) is one of those marked as backtracking.
func opcodeBacktracks(op InstOp) bool {
	switch op & Mask {
	case Oneloop, Notoneloop, Setloop,
		Onelazy, Notonelazy, Setlazy,
		Lazybranch, Branchmark, Lazybranchmark,
		Nullcount, Setcount, Branchcount, Lazybranchcount,
		Setmark, Capturemark, Getmark,
		Setjump, Backjump, Forejump, Goto:
		return true
	}
	return false
}
// opcodeSize returns how many slots of Code.Codes the instruction occupies:
// the opcode itself plus its operands. Modifier bits are masked off first.
// It panics with an error value on an opcode it does not know.
func opcodeSize(op InstOp) int {
	base := op & Mask
	switch base {
	case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary,
		Beginning, Start, EndZ, End,
		Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
		return 1

	case One, Notone, Multi, Ref, Testref, Goto,
		Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
		Prune, Set:
		return 2

	case Capturemark, Branchcount, Lazybranchcount,
		Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
		Setlazy, Setrep, Setloop:
		return 3
	}
	panic(fmt.Errorf("Unexpected op code: %v", base))
}
// codeStr maps an opcode value (with modifier bits removed) to its name; the
// order matches the numeric values of the opcode constants.
var codeStr = []string{
	"Onerep", "Notonerep", "Setrep", "Oneloop", "Notoneloop",
	"Setloop", "Onelazy", "Notonelazy", "Setlazy", "One",
	"Notone", "Set", "Multi", "Ref", "Bol",
	"Eol", "Boundary", "Nonboundary", "Beginning", "Start",
	"EndZ", "End", "Nothing", "Lazybranch", "Branchmark",
	"Lazybranchmark", "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
	"Nullmark", "Setmark", "Capturemark", "Getmark", "Setjump",
	"Backjump", "Forejump", "Testref", "Goto", "Prune",
	"Stop", "ECMABoundary", "NonECMABoundary",
}

// operatorDescription returns the opcode name followed by a suffix for every
// modifier bit that is set, in the order -Ci, -Rtl, -Back, -Back2.
func operatorDescription(op InstOp) string {
	desc := codeStr[op&Mask]

	modifiers := []struct {
		bit    InstOp
		suffix string
	}{
		{Ci, "-Ci"},
		{Rtl, "-Rtl"},
		{Back, "-Back"},
		{Back2, "-Back2"},
	}
	for _, m := range modifiers {
		if op&m.bit != 0 {
			desc += m.suffix
		}
	}
	return desc
}
// OpcodeDescription is a human readable string of the instruction at the
// given offset in c.Codes: the zero-padded offset, a '*' marker for
// backtracking operations, the operator name with modifier suffixes, and the
// operands in parentheses.
func (c *Code) OpcodeDescription(offset int) string {
	var out bytes.Buffer
	raw := InstOp(c.Codes[offset])

	fmt.Fprintf(&out, "%06d ", offset)

	marker := " "
	if opcodeBacktracks(raw & Mask) {
		marker = "*"
	}
	out.WriteString(marker)
	out.WriteString(operatorDescription(raw))
	out.WriteString("(")

	base := raw & Mask

	// first operand
	switch base {
	case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
		out.WriteString("Ch = ")
		out.WriteString(CharDescription(rune(c.Codes[offset+1])))

	case Set, Setrep, Setloop, Setlazy:
		out.WriteString("Set = ")
		out.WriteString(c.Sets[c.Codes[offset+1]].String())

	case Multi:
		fmt.Fprintf(&out, "String = %s", string(c.Strings[c.Codes[offset+1]]))

	case Ref, Testref:
		fmt.Fprintf(&out, "Index = %d", c.Codes[offset+1])

	case Capturemark:
		fmt.Fprintf(&out, "Index = %d", c.Codes[offset+1])
		if c.Codes[offset+2] != -1 {
			fmt.Fprintf(&out, ", Unindex = %d", c.Codes[offset+2])
		}

	case Nullcount, Setcount:
		fmt.Fprintf(&out, "Value = %d", c.Codes[offset+1])

	case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
		fmt.Fprintf(&out, "Addr = %d", c.Codes[offset+1])
	}

	// second operand: a count that prints as "inf" when it is math.MaxInt32
	countLabel := ""
	switch base {
	case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
		countLabel = ", Rep = "
	case Branchcount, Lazybranchcount:
		countLabel = ", Limit = "
	}
	if countLabel != "" {
		out.WriteString(countLabel)
		if c.Codes[offset+2] == math.MaxInt32 {
			out.WriteString("inf")
		} else {
			fmt.Fprintf(&out, "%d", c.Codes[offset+2])
		}
	}

	out.WriteString(")")
	return out.String()
}
// Dump renders the whole program as text: scan direction, first-character
// set, literal prefix, anchors, the Boyer-Moore table when there is one, and
// then one line per instruction.
func (c *Code) Dump() string {
	var out bytes.Buffer

	switch {
	case c.RightToLeft:
		fmt.Fprintln(&out, "Direction: right-to-left")
	default:
		fmt.Fprintln(&out, "Direction: left-to-right")
	}

	if fc := c.FcPrefix; fc != nil {
		fmt.Fprintf(&out, "Firstchars: %v\n", fc.PrefixSet.String())
	} else {
		fmt.Fprintln(&out, "Firstchars: n/a")
	}

	hasBm := c.BmPrefix != nil
	if hasBm {
		fmt.Fprintf(&out, "Prefix: %v\n", Escape(c.BmPrefix.String()))
	} else {
		fmt.Fprintln(&out, "Prefix: n/a")
	}

	fmt.Fprintf(&out, "Anchors: %v\n", c.Anchors)
	fmt.Fprintln(&out)

	if hasBm {
		fmt.Fprintln(&out, "BoyerMoore:")
		fmt.Fprintln(&out, c.BmPrefix.Dump(" "))
	}

	// instructions have variable width, so step by each one's own size
	for pc := 0; pc < len(c.Codes); {
		fmt.Fprintln(&out, c.OpcodeDescription(pc))
		pc += opcodeSize(InstOp(c.Codes[pc]))
	}
	return out.String()
}

94
vendor/github.com/dlclark/regexp2/syntax/escape.go generated vendored Normal file
View file

@ -0,0 +1,94 @@
package syntax
import (
"bytes"
"strconv"
"strings"
"unicode"
)
// Escape returns input with every rune passed through escape: regex
// metacharacters get a leading backslash and non-printable runes are replaced
// by an escape sequence.
func Escape(input string) string {
	b := &bytes.Buffer{}
	for _, r := range input {
		escape(b, r, false)
	}
	return b.String()
}

// meta lists the printable characters that are escaped with a backslash.
const meta = `\.+*?()|[]{}^$# `

// escape writes the escaped form of r to b.
//
// Printable runes are written as-is, preceded by a backslash when they are in
// meta or when force is set. The common control characters use their letter
// escapes (\a \f \n \r \t \v). Any other rune below 0x100 is written as \x
// followed by two hex digits, and the rest as \u followed by at least four
// hex digits.
//
// Both numeric forms are zero-padded. The \u form used to be written
// unpadded, so U+0600 came out as `\u600`, which is not a well-formed
// four-digit escape.
// NOTE(review): runes above 0xFFFF still produce more than four digits after
// \u; whether the parser accepts that is not visible from here.
func escape(b *bytes.Buffer, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.IndexRune(meta, r) >= 0 {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	switch r {
	case '\a':
		b.WriteString(`\a`)
	case '\f':
		b.WriteString(`\f`)
	case '\n':
		b.WriteString(`\n`)
	case '\r':
		b.WriteString(`\r`)
	case '\t':
		b.WriteString(`\t`)
	case '\v':
		b.WriteString(`\v`)
	default:
		prefix, width := `\u`, 4
		if r < 0x100 {
			prefix, width = `\x`, 2
		}
		digits := strconv.FormatInt(int64(r), 16)
		b.WriteString(prefix)
		if pad := width - len(digits); pad > 0 {
			b.WriteString(strings.Repeat("0", pad))
		}
		b.WriteString(digits)
	}
}
// Unescape resolves the backslash escapes in input and returns the result.
// A string without backslashes is returned unchanged. An escape the parser
// rejects, or a backslash at the very end of the input, yields an error and
// an empty string.
func Unescape(input string) (string, error) {
	firstSlash := strings.IndexRune(input, '\\')
	if firstSlash == -1 {
		// no slashes means no unescape needed
		return input, nil
	}

	// everything before the first backslash is literal
	out := bytes.NewBufferString(input[:firstSlash])

	// hand the remainder, starting right after that backslash, to the parser
	p := parser{}
	p.setPattern(input[firstSlash+1:])

	for {
		// the parser sits just past a backslash here
		if p.rightMost() {
			return "", p.getErr(ErrIllegalEndEscape)
		}
		decoded, err := p.scanCharEscape()
		if err != nil {
			return "", err
		}
		out.WriteRune(decoded)

		// copy literal runes through until the next backslash or the end
		for {
			if p.rightMost() {
				return out.String(), nil
			}
			next := p.moveRightGetChar()
			if next == '\\' {
				break
			}
			out.WriteRune(next)
		}
	}
}

20
vendor/github.com/dlclark/regexp2/syntax/fuzz.go generated vendored Normal file
View file

@ -0,0 +1,20 @@
// +build gofuzz
package syntax
// Fuzz is the input point for go-fuzz.
// It returns 0 when the input does not parse as a pattern and 1 when it
// parses and can be written out as code; a pattern that parses but cannot be
// written is reported by panicking.
func Fuzz(data []byte) int {
	tree, parseErr := Parse(string(data), RegexOptions(0))
	if parseErr != nil {
		return 0
	}

	// translate it to code
	if _, writeErr := Write(tree); writeErr != nil {
		panic(writeErr)
	}
	return 1
}

2202
vendor/github.com/dlclark/regexp2/syntax/parser.go generated vendored Normal file

File diff suppressed because it is too large Load diff

896
vendor/github.com/dlclark/regexp2/syntax/prefix.go generated vendored Normal file
View file

@ -0,0 +1,896 @@
package syntax
import (
"bytes"
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)
// Prefix describes what is known about the start of a match. getPrefix fills
// in PrefixStr (a literal leading string); getFirstCharsPrefix fills in
// PrefixSet (the set of possible first characters).
type Prefix struct {
// PrefixStr is the literal run of runes a match starts with.
PrefixStr []rune
// PrefixSet is the set of runes a match can start with.
PrefixSet CharSet
// CaseInsensitive reports whether the prefix came from an IgnoreCase node.
CaseInsensitive bool
}
// getFirstCharsPrefix computes the set of characters that can start a match
// of tree. It returns nil when the analysis fails, when the pattern can match
// the empty string, or when the resulting set is empty.
func getFirstCharsPrefix(tree *RegexTree) *Prefix {
	walker := regexFcd{
		fcStack:  make([]regexFc, 32),
		intStack: make([]int, 32),
	}

	first := walker.regexFCFromRegexTree(tree)
	if first == nil {
		return nil
	}
	if first.nullable || first.cc.IsEmpty() {
		return nil
	}

	set := first.getFirstChars()
	return &Prefix{
		PrefixSet:       set,
		CaseInsensitive: first.caseInsensitive,
	}
}
// regexFcd holds the working state of the first-character analysis: an
// explicit stack of child indexes that replaces recursion, a stack of partial
// regexFc results, and the flags calculateFC uses to steer the walk.
type regexFcd struct {
// intStack[:intDepth] holds the child index saved at each level of descent.
intStack []int
intDepth int
// fcStack[:fcDepth] holds the results pushed so far.
fcStack []regexFc
fcDepth int
skipAllChildren bool // don't process any more children at the current level
skipchild bool // don't process the current child.
// failed is set when two results could not be merged (addFC returned false).
failed bool
}
// regexFCFromRegexTree is the main FC computation. It does a shortcutted
// depth-first walk through the tree, calling calculateFC before and after
// each child of an interior node and once at each leaf.
//
// It returns the result left on the FC stack, or nil when a merge failed
// during the walk or the stack ended up empty.
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
curNode := tree.root
curChild := 0
for {
if len(curNode.children) == 0 {
// This is a leaf node
s.calculateFC(curNode.t, curNode, 0)
} else if curChild < len(curNode.children) && !s.skipAllChildren {
// This is an interior node, and we have more children to analyze
s.calculateFC(curNode.t|beforeChild, curNode, curChild)
if !s.skipchild {
curNode = curNode.children[curChild]
// this stack is how we get a depth first walk of the tree.
s.pushInt(curChild)
curChild = 0
} else {
// calculateFC asked to skip this child: move to its sibling
curChild++
s.skipchild = false
}
continue
}
// This is an interior node where we've finished analyzing all the children, or
// the end of a leaf node.
s.skipAllChildren = false
// an empty index stack means we are back at the root: the walk is over
if s.intIsEmpty() {
break
}
// climb to the parent (linked through next) and report the finished child
curChild = s.popInt()
curNode = curNode.next
s.calculateFC(curNode.t|afterChild, curNode, curChild)
if s.failed {
return nil
}
curChild++
}
if s.fcIsEmpty() {
return nil
}
return s.popFC()
}
// pushInt pushes I onto the integer stack that stands in for recursion.
//
// The backing slice doubles when it is full. It always grows by at least one
// slot, so a regexFcd whose stack was never pre-allocated still works instead
// of indexing into a zero-length slice.
func (s *regexFcd) pushInt(I int) {
	if s.intDepth >= len(s.intStack) {
		size := s.intDepth * 2
		if size <= s.intDepth {
			size = s.intDepth + 1
		}
		expanded := make([]int, size)
		copy(expanded, s.intStack)
		s.intStack = expanded
	}

	s.intStack[s.intDepth] = I
	s.intDepth++
}
// intIsEmpty reports whether the integer stack holds no entries.
func (s *regexFcd) intIsEmpty() bool {
	return !(s.intDepth != 0)
}
// popInt removes and returns the top of the integer stack. The caller must
// make sure the stack is not empty.
func (s *regexFcd) popInt() int {
	top := s.intDepth - 1
	s.intDepth = top
	return s.intStack[top]
}
// pushFC pushes fc onto the stack of partial first-character results.
//
// The backing slice doubles when it is full. It always grows by at least one
// slot, so a regexFcd whose stack was never pre-allocated still works instead
// of indexing into a zero-length slice.
func (s *regexFcd) pushFC(fc regexFc) {
	if s.fcDepth >= len(s.fcStack) {
		size := s.fcDepth * 2
		if size <= s.fcDepth {
			size = s.fcDepth + 1
		}
		expanded := make([]regexFc, size)
		copy(expanded, s.fcStack)
		s.fcStack = expanded
	}

	s.fcStack[s.fcDepth] = fc
	s.fcDepth++
}
// fcIsEmpty reports whether the regexFc stack holds no entries.
func (s *regexFcd) fcIsEmpty() bool {
	return !(s.fcDepth != 0)
}
// popFC removes the top regexFc from the stack and returns a pointer to its
// slot in the backing slice. The slot is reused by the next push, so the
// pointer should be consumed before pushing again.
func (s *regexFcd) popFC() *regexFc {
	top := s.fcDepth - 1
	s.fcDepth = top
	return &s.fcStack[top]
}
// topFC returns a pointer to the regexFc on top of the stack without removing
// it, so the caller can update it in place.
func (s *regexFcd) topFC() *regexFc {
	top := s.fcDepth - 1
	return &s.fcStack[top]
}
// skipChild is called while handling a beforeChild event to tell the walker
// not to descend into the current child.
func (s *regexFcd) skipChild() {
	const skip = true
	s.skipchild = skip
}
// calculateFC performs the first-character computation for one walk event.
//
// nt is the node type, OR-ed with beforeChild or afterChild for the events
// raised around a child of an interior node; CurIndex is the index of that
// child (0 for leaves). Leaves push a regexFc describing what they can start
// with; afterChild events fold the child's result into the one below it on
// the stack. It panics on a node type it does not know.
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
	// The option bits are only consulted for types up to ntRef.
	optionsApply := nt <= ntRef
	ignoreCase := optionsApply && (node.options&IgnoreCase) != 0
	rightToLeft := optionsApply && (node.options&RightToLeft) != 0

	switch nt {
	case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild,
		ntLoop | beforeChild, ntLazyloop | beforeChild,
		ntGroup | beforeChild, ntGroup | afterChild,
		ntCapture | beforeChild, ntCapture | afterChild,
		ntGreedy | beforeChild, ntGreedy | afterChild,
		ntRequire | afterChild, ntPrevent | afterChild:
		// nothing to compute for these events

	case ntTestgroup | beforeChild:
		// child 0 of a test group is skipped
		if CurIndex == 0 {
			s.skipChild()
		}

	case ntRequire | beforeChild, ntPrevent | beforeChild:
		// the child is not walked; a nullable result is pushed in its place
		s.skipChild()
		s.pushFC(regexFc{nullable: true})

	case ntConcatenate | afterChild:
		if CurIndex != 0 {
			child := s.popFC()
			s.failed = !s.topFC().addFC(*child, true)
		}
		// once the accumulated prefix cannot be empty, later children
		// cannot contribute first characters
		if !s.topFC().nullable {
			s.skipAllChildren = true
		}

	case ntTestgroup | afterChild:
		if CurIndex > 1 {
			child := s.popFC()
			s.failed = !s.topFC().addFC(*child, false)
		}

	case ntAlternate | afterChild, ntTestref | afterChild:
		if CurIndex != 0 {
			child := s.popFC()
			s.failed = !s.topFC().addFC(*child, false)
		}

	case ntLoop | afterChild, ntLazyloop | afterChild:
		// a loop with a zero minimum can match nothing at all
		if node.m == 0 {
			s.topFC().nullable = true
		}

	case ntOne, ntNotone:
		s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ignoreCase))

	case ntOneloop, ntOnelazy:
		s.pushFC(newRegexFc(node.ch, false, node.m == 0, ignoreCase))

	case ntNotoneloop, ntNotonelazy:
		s.pushFC(newRegexFc(node.ch, true, node.m == 0, ignoreCase))

	case ntMulti:
		switch {
		case len(node.str) == 0:
			s.pushFC(regexFc{nullable: true})
		case rightToLeft:
			s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ignoreCase))
		default:
			s.pushFC(newRegexFc(node.str[0], false, false, ignoreCase))
		}

	case ntSet:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ignoreCase})

	case ntSetloop, ntSetlazy:
		s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ignoreCase})

	case ntRef:
		s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})

	case ntEmpty, ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary,
		ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
		s.pushFC(regexFc{nullable: true})

	default:
		panic(fmt.Sprintf("unexpected op code: %v", nt))
	}
}
// regexFc is one partial result of the first-character analysis.
type regexFc struct {
// cc is the set of characters the sub-pattern can start with.
cc CharSet
// nullable is true when the sub-pattern can match without consuming a character.
nullable bool
// caseInsensitive is true when any contributing node had IgnoreCase set.
caseInsensitive bool
}
// newRegexFc builds the first-character result for a single-character node.
//
// When not is false the set contains only ch. When not is true the set
// contains every rune except ch, i.e. [0, ch-1] and [ch+1, utf8.MaxRune].
// nullable and caseInsensitive are stored unchanged.
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
	r := regexFc{
		caseInsensitive: caseInsensitive,
		nullable:        nullable,
	}

	if !not {
		r.cc.addRange(ch, ch)
		return r
	}

	if ch > 0 {
		r.cc.addRange('\x00', ch-1)
	}
	// The upper bound must be utf8.MaxRune, the same limit the range itself
	// uses. Comparing against 0xFFFF dropped every rune above ch from the set
	// whenever ch was outside the Basic Multilingual Plane.
	if ch < utf8.MaxRune {
		r.cc.addRange(ch+1, utf8.MaxRune)
	}
	return r
}
// getFirstChars returns the accumulated character set. For a case-insensitive
// result the set is first extended in place with addLowercase.
func (r *regexFc) getFirstChars() CharSet {
	if !r.caseInsensitive {
		return r.cc
	}
	r.cc.addLowercase()
	return r.cc
}
// addFC folds fc into r and reports whether that was possible; it returns
// false, leaving r untouched, when either set is not mergeable.
//
// With concatenate set, fc follows r in sequence: if r cannot be empty fc adds
// nothing, otherwise r stays nullable only if fc is nullable too. Without
// concatenate, fc is an alternative to r: r becomes nullable if fc is.
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
	if !(r.cc.IsMergeable() && fc.cc.IsMergeable()) {
		return false
	}

	switch {
	case !concatenate:
		r.nullable = r.nullable || fc.nullable
	case !r.nullable:
		// r always consumes a character, so what follows is irrelevant
		return true
	default:
		// r is nullable here, so the sequence is nullable iff fc is
		r.nullable = fc.nullable
	}

	if fc.caseInsensitive {
		r.caseInsensitive = true
	}
	r.cc.addSet(fc.cc)
	return true
}
// getPrefix is a related computation: it takes a RegexTree and computes the
// leading literal string if it sees one. It's quite trivial and gives up easily.
//
// It walks from the root through greedy and capture wrappers and along the
// children of a concatenation, stepping over the node types listed in the
// empty case below, and returns a Prefix for the first one/oneloop/multi node
// it reaches. It returns nil for any other node type, for a one-loop with a
// zero minimum, or when it runs out of nodes.
func getPrefix(tree *RegexTree) *Prefix {
var concatNode *regexNode
nextChild := 0
curNode := tree.root
for {
switch curNode.t {
case ntConcatenate:
// remember the concatenation so its children can be visited in order
if len(curNode.children) > 0 {
concatNode = curNode
nextChild = 0
}
case ntGreedy, ntCapture:
// look inside the wrapper; any enclosing concatenation is forgotten
curNode = curNode.children[0]
concatNode = nil
continue
case ntOneloop, ntOnelazy:
// only the mandatory repetitions are part of the prefix
if curNode.m > 0 {
return &Prefix{
PrefixStr: repeat(curNode.ch, curNode.m),
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
}
return nil
case ntOne:
return &Prefix{
PrefixStr: []rune{curNode.ch},
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
case ntMulti:
// note: PrefixStr shares the node's str slice, it is not a copy
return &Prefix{
PrefixStr: curNode.str,
CaseInsensitive: (curNode.options & IgnoreCase) != 0,
}
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
// intentionally empty: step over these and continue with the next sibling
default:
return nil
}
if concatNode == nil || nextChild >= len(concatNode.children) {
return nil
}
curNode = concatNode.children[nextChild]
nextChild++
}
}
// repeat returns a slice holding the rune r repeated c times, capped at
// MaxPrefixSize. It returns nil when c is zero or negative.
func repeat(r rune, c int) []rune {
	if c > MaxPrefixSize {
		c = MaxPrefixSize
	}
	// Guard before seeding: writing ret[0] into an empty slice panics, and
	// make panics on a negative length.
	if c <= 0 {
		return nil
	}

	ret := make([]rune, c)

	// binary growth using copy for speed: seed one element, then double the
	// filled region until the slice is full
	ret[0] = r
	for filled := 1; filled < len(ret); filled *= 2 {
		copy(ret[filled:], ret[:filled])
	}
	return ret
}
// BmPrefix precomputes the Boyer-Moore
// tables for fast string scanning. These tables allow
// you to scan for the first occurrence of a string within
// a large body of text without examining every character.
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined.
type BmPrefix struct {
// positive is the good-suffix shift table, indexed by pattern position.
positive []int
// negativeASCII is the bad-character shift table for runes below 128.
negativeASCII []int
// negativeUnicode is the bad-character table for runes up to 0xffff,
// indexed by high byte then low byte; rows are allocated on demand.
negativeUnicode [][]int
// pattern is the string searched for (lower-cased when caseInsensitive).
pattern []rune
// lowASCII and highASCII are the smallest and largest ASCII runes in pattern.
lowASCII rune
highASCII rune
rightToLeft bool
caseInsensitive bool
}
// newBmPrefix builds the Boyer-Moore shift tables for pattern.
//
// pattern is stored without copying and, when caseInsensitive is set, is
// lower-cased in place, so the caller's slice is modified. pattern must not
// be empty: its last (or, right-to-left, first) rune is read unconditionally.
// The function returns nil when pattern contains a rune above 0xffff, which
// the tables cannot represent.
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
b := &BmPrefix{
rightToLeft: rightToLeft,
caseInsensitive: caseInsensitive,
pattern: pattern,
}
if caseInsensitive {
for i := 0; i < len(b.pattern); i++ {
// We do the ToLower character by character for consistency. With surrogate chars, doing
// a ToLower on the entire string could actually change the surrogate pair. This is more correct
// linguistically, but since Regex doesn't support surrogates, it's more important to be
// consistent.
b.pattern[i] = unicode.ToLower(b.pattern[i])
}
}
// beforefirst is the index one step before the first pattern position in
// scan order, last is the final position, and bump is the scan direction.
var beforefirst, last, bump int
var scan, match int
if !rightToLeft {
beforefirst = -1
last = len(b.pattern) - 1
bump = 1
} else {
beforefirst = len(b.pattern)
last = 0
bump = -1
}
// PART I - the good-suffix shift table
//
// compute the positive requirement:
// if char "i" is the first one from the right that doesn't match,
// then we know the matcher can advance by _positive[i].
//
// This algorithm is a simplified variant of the standard
// Boyer-Moore good suffix calculation.
b.positive = make([]int, len(b.pattern))
examine := last
ch := b.pattern[examine]
b.positive[examine] = bump
examine -= bump
Outerloop:
for {
// find an internal char (examine) that matches the tail
for {
if examine == beforefirst {
break Outerloop
}
if b.pattern[examine] == ch {
break
}
examine -= bump
}
match = last
scan = examine
// find the length of the match
for {
if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
// at the end of the match, note the difference in _positive
// this is not the length of the match, but the distance from the internal match
// to the tail suffix.
if b.positive[match] == 0 {
b.positive[match] = match - scan
}
// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
break
}
scan -= bump
match -= bump
}
examine -= bump
}
match = last - bump
// scan for the chars for which there are no shifts that yield a different candidate
// The inside of the if statement used to say
// "_positive[match] = last - beforefirst;"
// This is slightly less aggressive in how much we skip, but at worst it
// should mean a little more work rather than skipping a potential match.
for match != beforefirst {
if b.positive[match] == 0 {
b.positive[match] = bump
}
match -= bump
}
// PART II - the bad-character shift table
//
// compute the negative requirement:
// if char "ch" is the reject character when testing position "i",
// we can slide up by _negative[ch];
// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
//
// the lookup table is divided into ASCII and Unicode portions;
// only those parts of the Unicode 16-bit code set that actually
// appear in the string are in the table. (Maximum size with
// Unicode is 65K; ASCII only case is 512 bytes.)
b.negativeASCII = make([]int, 128)
// last - beforefirst is the full pattern length (negated when scanning
// right to left); it is the default shift and doubles as the "unset" marker
for i := 0; i < len(b.negativeASCII); i++ {
b.negativeASCII[i] = last - beforefirst
}
b.lowASCII = 127
b.highASCII = 0
for examine = last; examine != beforefirst; examine -= bump {
ch = b.pattern[examine]
switch {
case ch < 128:
if b.lowASCII > ch {
b.lowASCII = ch
}
if b.highASCII < ch {
b.highASCII = ch
}
// only the first hit in scan order is recorded for each rune
if b.negativeASCII[ch] == last-beforefirst {
b.negativeASCII[ch] = last - examine
}
case ch <= 0xffff:
// i selects the 256-entry row (high byte), j the slot in it (low byte)
i, j := ch>>8, ch&0xFF
if b.negativeUnicode == nil {
b.negativeUnicode = make([][]int, 256)
}
if b.negativeUnicode[i] == nil {
newarray := make([]int, 256)
for k := 0; k < len(newarray); k++ {
newarray[k] = last - beforefirst
}
if i == 0 {
// row 0 covers the ASCII range: carry over what was recorded so
// far and make the ASCII table share the new 256-entry row
copy(newarray, b.negativeASCII)
//TODO: this line needed?
b.negativeASCII = newarray
}
b.negativeUnicode[i] = newarray
}
if b.negativeUnicode[i][j] == last-beforefirst {
b.negativeUnicode[i][j] = last - examine
}
default:
// we can't do the filter because this algo doesn't support
// unicode chars >0xffff
return nil
}
}
return b
}
// String returns the search pattern as a string (lower-cased if the prefix is
// case-insensitive, since that is how the pattern is stored).
func (b *BmPrefix) String() string {
	pattern := b.pattern
	return string(pattern)
}
// Dump returns the contents of the filter as a human readable string: the
// pattern, the positive shift table and, if present, every entry of the ASCII
// negative table whose shift differs from the pattern length. Each line is
// prefixed with indent.
func (b *BmPrefix) Dump(indent string) string {
	var out bytes.Buffer

	fmt.Fprintf(&out, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
	for _, shift := range b.positive {
		out.WriteString(strconv.Itoa(shift))
		out.WriteRune(' ')
	}
	out.WriteRune('\n')

	if b.negativeASCII == nil {
		return out.String()
	}

	out.WriteString(indent)
	out.WriteString("Negative table\n")
	for ch, shift := range b.negativeASCII {
		if shift == len(b.pattern) {
			continue
		}
		fmt.Fprintf(&out, "%s %s %s\n", indent, Escape(string(rune(ch))), strconv.Itoa(shift))
	}
	return out.String()
}
// Scan uses the Boyer-Moore algorithm to find the first occurrence
// of the pattern within text, beginning at index, and
// constrained within beglimit and endlimit.
//
// The direction and case-sensitivity of the match is determined
// by the arguments given to newBmPrefix.
//
// It returns the index at which the match was found (for a right-to-left
// scan, the index just past the match), or -1 when there is no match inside
// the limits.
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
	var (
		defadv, test, test2         int
		match, startmatch, endmatch int
		bump, advance               int
		chTest                      rune
		unicodeLookup               []int
	)

	// defadv is the default shift (the whole pattern length, signed by
	// direction); startmatch/endmatch are the pattern positions compared
	// first and last; test is the text position lined up with startmatch.
	if !b.rightToLeft {
		defadv = len(b.pattern)
		startmatch = len(b.pattern) - 1
		endmatch = 0
		test = index + defadv - 1
		bump = 1
	} else {
		defadv = -len(b.pattern)
		startmatch = 0
		endmatch = -defadv - 1
		test = index + defadv
		bump = -1
	}

	chMatch := b.pattern[startmatch]

	for {
		if test >= endlimit || test < beglimit {
			return -1
		}

		chTest = text[test]
		if b.caseInsensitive {
			chTest = unicode.ToLower(chTest)
		}

		if chTest != chMatch {
			// bad-character shift for the rune that was found instead
			if chTest < 128 {
				advance = b.negativeASCII[chTest]
			} else if chTest <= 0xffff && len(b.negativeUnicode) > 0 {
				// <= 0xffff mirrors the range newBmPrefix stores in the table
				unicodeLookup = b.negativeUnicode[chTest>>8]
				if len(unicodeLookup) > 0 {
					advance = unicodeLookup[chTest&0xFF]
				} else {
					advance = defadv
				}
			} else {
				advance = defadv
			}

			test += advance
		} else { // if (chTest == chMatch)
			test2 = test
			match = startmatch

			for {
				if match == endmatch {
					if b.rightToLeft {
						return test2 + 1
					}
					return test2
				}

				match -= bump
				test2 -= bump

				chTest = text[test2]
				if b.caseInsensitive {
					chTest = unicode.ToLower(chTest)
				}

				if chTest != b.pattern[match] {
					advance = b.positive[match]
					// A rune is 32 bits wide, so the ASCII test has to be a
					// plain range check. Masking with 0xFF80 only inspects
					// bits 7-15 and let runes such as U+10041 through, which
					// then indexed far past the end of negativeASCII.
					if chTest < 128 {
						test2 = (match - startmatch) + b.negativeASCII[chTest]
					} else if chTest <= 0xffff && len(b.negativeUnicode) > 0 {
						unicodeLookup = b.negativeUnicode[chTest>>8]
						if len(unicodeLookup) > 0 {
							test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
						} else {
							test += advance
							break
						}
					} else {
						test += advance
						break
					}

					// take whichever of the two shifts moves further in the
					// scan direction
					if b.rightToLeft {
						if test2 < advance {
							advance = test2
						}
					} else if test2 > advance {
						advance = test2
					}

					test += advance
					break
				}
			}
		}
	}
}
// IsMatch is the quick test used instead of Scan when a regex is anchored: it
// reports whether the pattern occurs exactly at index (ending at index when
// scanning right to left), provided it fits between beglimit and endlimit.
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
	need := len(b.pattern)

	if b.rightToLeft {
		if index > endlimit || index-beglimit < need {
			return false
		}
		return b.matchPattern(text, index-need)
	}

	if index < beglimit || endlimit-index < need {
		return false
	}
	return b.matchPattern(text, index)
}
// matchPattern reports whether text contains the pattern starting at index.
// For a case-insensitive prefix each text rune is lower-cased before the
// comparison; the pattern itself is already stored in lower case.
func (b *BmPrefix) matchPattern(text []rune, index int) bool {
	if len(text)-index < len(b.pattern) {
		return false
	}

	for i, want := range b.pattern {
		got := text[index+i]
		if b.caseInsensitive {
			got = unicode.ToLower(got)
		}
		if got != want {
			return false
		}
	}
	return true
}
// AnchorLoc is a bit set naming the positions a regex can be pegged to.
type AnchorLoc int16
// where the regex can be pegged
//
// NOTE(review): only AnchorBeginning is declared with the AnchorLoc type.
// Every other name has its own explicit value and no type, which makes it an
// untyped integer constant; it converts to AnchorLoc where one is expected
// but does not carry the type (or its String method) on its own.
const (
AnchorBeginning AnchorLoc = 0x0001
AnchorBol = 0x0002
AnchorStart = 0x0004
AnchorEol = 0x0008
AnchorEndZ = 0x0010
AnchorEnd = 0x0020
AnchorBoundary = 0x0040
AnchorECMABoundary = 0x0080
)
// getAnchors returns the anchor the pattern starts with, if any. It walks from
// the root through greedy and capture wrappers and along the children of a
// concatenation, stepping over empty and lookaround nodes, and returns the
// AnchorLoc of the first anchor node it reaches. Any other node type, or
// running out of nodes, yields 0.
func getAnchors(tree *RegexTree) AnchorLoc {
	var (
		concat *regexNode
		next   int
		found  AnchorLoc
	)

	for node := tree.root; ; {
		switch node.t {
		case ntConcatenate:
			if len(node.children) > 0 {
				concat, next = node, 0
			}

		case ntGreedy, ntCapture:
			// look inside the wrapper; the enclosing concatenation is dropped
			node, concat = node.children[0], nil
			continue

		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
			ntStart, ntEndZ, ntEnd:
			return found | anchorFromType(node.t)

		case ntEmpty, ntRequire, ntPrevent:
			// step over these and keep looking at the next sibling

		default:
			return found
		}

		if concat == nil || next >= len(concat.children) {
			return found
		}
		node = concat.children[next]
		next++
	}
}
// anchorFromType maps an anchor node type to its AnchorLoc flag. Node types
// that are not anchors map to 0.
func anchorFromType(t nodeType) AnchorLoc {
	var loc AnchorLoc
	switch t {
	case ntBol:
		loc = AnchorBol
	case ntEol:
		loc = AnchorEol
	case ntBoundary:
		loc = AnchorBoundary
	case ntECMABoundary:
		loc = AnchorECMABoundary
	case ntBeginning:
		loc = AnchorBeginning
	case ntStart:
		loc = AnchorStart
	case ntEndZ:
		loc = AnchorEndZ
	case ntEnd:
		loc = AnchorEnd
	}
	return loc
}
// String returns a human-readable, comma-separated list of the anchors that
// are set, or "None" when no known anchor bit is set.
func (anchors AnchorLoc) String() string {
	// Listed in output order; each label carries its separator, and the
	// leading one is trimmed at the end.
	labels := [...]struct {
		bit  AnchorLoc
		text string
	}{
		{AnchorBeginning, ", Beginning"},
		{AnchorStart, ", Start"},
		{AnchorBol, ", Bol"},
		{AnchorBoundary, ", Boundary"},
		{AnchorECMABoundary, ", ECMABoundary"},
		{AnchorEol, ", Eol"},
		{AnchorEnd, ", End"},
		{AnchorEndZ, ", EndZ"},
	}

	buf := &bytes.Buffer{}
	for _, label := range labels {
		if anchors&label.bit != 0 {
			buf.WriteString(label.text)
		}
	}

	if buf.Len() < 2 {
		return "None"
	}
	// trim off comma
	return buf.String()[2:]
}

View file

@ -0,0 +1,87 @@
package syntax
import (
"bytes"
"errors"
)
// ReplacerData is the pre-parsed form of a replacement string, as built by
// NewReplacerData.
type ReplacerData struct {
// Rep is the original replacement text.
Rep string
// Strings holds the literal pieces of the replacement, in order.
Strings []string
// Rules is the output sequence: a non-negative entry is an index into
// Strings; a negative entry encodes a group reference or special
// substitution as -replaceSpecials-1-slot.
Rules []int
}
// Constants used to encode entries of ReplacerData.Rules. replaceSpecials is
// the offset applied to every reference; the negative values are the slot
// numbers of the special substitutions named by the identifiers.
const (
replaceSpecials = 4
replaceLeftPortion = -1
replaceRightPortion = -2
replaceLastGroup = -3
replaceWholeString = -4
)
// ErrReplacementError is a general error raised while parsing the
// replacement text.
var ErrReplacementError = errors.New("Replacement pattern error.")
// NewReplacerData will populate a reusable replacer data struct based on the
// given replacement string and the capture group data from a regexp.
//
// rep is parsed as a replacement pattern; caps, capsize and capnames describe
// the capture groups of the regexp it will be used with, and op are that
// regexp's options. Runs of literal text become entries in Strings, and group
// references become negative entries in Rules.
//
// It returns the parser's error when rep cannot be scanned, and
// ErrReplacementError when the scanned replacement has an unexpected shape
// (previously this case panicked even though the function returns an error).
func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) {
	p := parser{
		options:  op,
		caps:     caps,
		capsize:  capsize,
		capnames: capnames,
	}
	p.setPattern(rep)
	concat, err := p.scanReplacement()
	if err != nil {
		return nil, err
	}

	if concat.t != ntConcatenate {
		return nil, ErrReplacementError
	}

	var (
		sb       bytes.Buffer
		literals []string
		rules    []int
	)

	// flush turns the literal text gathered so far into one Strings entry
	// and a rule pointing at it.
	flush := func() {
		if sb.Len() == 0 {
			return
		}
		rules = append(rules, len(literals))
		literals = append(literals, sb.String())
		sb.Reset()
	}

	for _, child := range concat.children {
		switch child.t {
		case ntMulti:
			child.writeStrToBuf(&sb)
		case ntOne:
			sb.WriteRune(child.ch)
		case ntRef:
			flush()
			slot := child.m

			// non-negative slots are group numbers and go through the
			// caps mapping; negative slots are the special substitutions
			if len(caps) > 0 && slot >= 0 {
				slot = caps[slot]
			}

			rules = append(rules, -replaceSpecials-1-slot)
		default:
			return nil, ErrReplacementError
		}
	}
	flush()

	return &ReplacerData{
		Rep:     rep,
		Strings: literals,
		Rules:   rules,
	}, nil
}

654
vendor/github.com/dlclark/regexp2/syntax/tree.go generated vendored Normal file
View file

@ -0,0 +1,654 @@
package syntax
import (
"bytes"
"fmt"
"math"
"strconv"
)
// RegexTree is a parsed regular expression: the root of the node tree plus
// the capture-group bookkeeping and options that go with it.
// NOTE(review): the capture fields are filled in outside this chunk; the
// descriptions below are inferred from their names and types — confirm
// against the parser.
type RegexTree struct {
// root is the top node of the parsed pattern.
root *regexNode
caps map[int]int
capnumlist []int
captop int
// Capnames presumably maps group names to group numbers.
Capnames map[string]int
Caplist []string
options RegexOptions
}
// regexNode is one node of the parsed tree for a regular expression.
// Implementation notes:
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// Nodes are built into a tree, linked by the n.children list.
// addChild also points each child's n.next at its parent, which is
// what the non-recursive tree walks use to climb back up.
//
// Nodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "loop", "group". There are also node types for basic
// peephole optimizations, e.g., "oneloop", "setlazy", etc.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (min and max for the looping constructs, or a group
// index) are stored in n.m and n.n; a single character is stored in
// n.ch, a string in n.str and a character set in n.set.
//
// NOTE(review): the comment this replaces was carried over from another
// implementation and also described parent/ichild/backward/role members
// that this struct does not have.
type regexNode struct {
t nodeType
children []*regexNode
str []rune
set *CharSet
ch rune
m int
n int
options RegexOptions
next *regexNode
}
// nodeType identifies the kind of a regexNode.
type nodeType int32
// NOTE(review): only ntOnerep is declared with the nodeType type; the other
// names have explicit values and no type, so they are untyped integer
// constants that convert to nodeType where one is expected.
const (
// The following are leaves, and correspond to primitive operations
ntOnerep nodeType = 0 // lef,back char,min,max a {n}
ntNotonerep = 1 // lef,back char,min,max .{n}
ntSetrep = 2 // lef,back set,min,max [\d]{n}
ntOneloop = 3 // lef,back char,min,max a {,n}
ntNotoneloop = 4 // lef,back char,min,max .{,n}
ntSetloop = 5 // lef,back set,min,max [\d]{,n}
ntOnelazy = 6 // lef,back char,min,max a {,n}?
ntNotonelazy = 7 // lef,back char,min,max .{,n}?
ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
ntOne = 9 // lef char a
ntNotone = 10 // lef char [^a]
ntSet = 11 // lef set [a-z\s] \w \s \d
ntMulti = 12 // lef string abcd
ntRef = 13 // lef group \#
ntBol = 14 // ^
ntEol = 15 // $
ntBoundary = 16 // \b
ntNonboundary = 17 // \B
ntBeginning = 18 // \A
ntStart = 19 // \G
ntEndZ = 20 // \Z
ntEnd = 21 // \z
// Interior nodes do not correspond to primitive operations, but
// control structures compositing other operations
// Concat and alternate take n children, and can run forward or backwards
ntNothing = 22 // []
ntEmpty = 23 // ()
ntAlternate = 24 // a|b
ntConcatenate = 25 // ab
ntLoop = 26 // m,x * + ? {,}
ntLazyloop = 27 // m,x *? +? ?? {,}?
ntCapture = 28 // n ()
ntGroup = 29 // (?:)
ntRequire = 30 // (?=) (?<=)
ntPrevent = 31 // (?!) (?<!)
ntGreedy = 32 // (?>)
ntTestref = 33 // (?(n) | )
ntTestgroup = 34 // (?(...) | )
// 35-40 are unused; typeStr pads them with "Unknown"
ntECMABoundary = 41 // \b (ECMAScript flavour)
ntNonECMABoundary = 42 // \B (ECMAScript flavour)
)
// newRegexNode returns a node of type t with options opt and no payload.
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	return node
}
// newRegexNodeCh returns a node of type t with options opt that carries the
// single character ch.
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.ch = ch
	return node
}
// newRegexNodeStr returns a node of type t with options opt that carries the
// string str. The slice is stored as given, not copied.
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.str = str
	return node
}
// newRegexNodeSet returns a node of type t with options opt that carries the
// character set. The pointer is stored as given, not copied.
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.set = set
	return node
}
// newRegexNodeM returns a node of type t with options opt and the integer
// operand m.
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.m = m
	return node
}
// newRegexNodeMN returns a node of type t with options opt and the two
// integer operands m and n.
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.m, node.n = m, n
	return node
}
// writeStrToBuf appends the node's string, rune by rune, to buf.
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
	for _, ch := range n.str {
		buf.WriteRune(ch)
	}
}
// addChild reduces child and appends the reduced node to n's children,
// pointing its next link back at n.
func (n *regexNode) addChild(child *regexNode) {
	node := child.reduce()
	node.next = n
	n.children = append(n.children, node)
}
// insertChildren splices nodes into n.children so that the first inserted
// node ends up at position afterIndex. A new backing slice is always built.
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
	head, tail := n.children[:afterIndex], n.children[afterIndex:]

	merged := make([]*regexNode, 0, len(n.children)+len(nodes))
	merged = append(merged, head...)
	merged = append(merged, nodes...)
	merged = append(merged, tail...)

	n.children = merged
}
// removeChildren removes the children in [startIndex, endIndex): the start
// index is included, the end index is not. The remaining tail is shifted down
// within the existing backing array.
func (n *regexNode) removeChildren(startIndex, endIndex int) {
	moved := copy(n.children[startIndex:], n.children[endIndex:])
	n.children = n.children[:startIndex+moved]
}
// makeRep turns a one/notone/set node into the matching repeated form by
// shifting its type by the distance between t and ntOne, and records the
// repetition bounds. Pass the one-lazy or one-loop type as t.
func (n *regexNode) makeRep(t nodeType, min, max int) {
	shift := t - ntOne
	n.t = n.t + shift
	n.m, n.n = min, max
}
// reduce applies the simplification that belongs to the node's type and
// returns the node to use in its place. Types without a reduction are
// returned unchanged.
func (n *regexNode) reduce() *regexNode {
	if n.t == ntAlternate {
		return n.reduceAlternation()
	}
	if n.t == ntConcatenate {
		return n.reduceConcatenation()
	}
	if n.t == ntLoop || n.t == ntLazyloop {
		return n.reduceRep()
	}
	if n.t == ntGroup {
		return n.reduceGroup()
	}
	if n.t == ntSet || n.t == ntSetloop {
		return n.reduceSet()
	}
	return n
}
// Basic optimization. Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
// a|b|c|def|g|h -> [a-c]|def|[gh]
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
//
// The children are compacted in place: i reads, j writes, and whatever is
// left between j and i at the end is removed. An alternation with no
// children reduces to a "nothing" node.
func (n *regexNode) reduceAlternation() *regexNode {
if len(n.children) == 0 {
return newRegexNode(ntNothing, n.options)
}
wasLastSet := false
lastNodeCannotMerge := false
var optionsLast RegexOptions
var i, j int
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
at := n.children[i]
if j < i {
n.children[j] = at
}
// single-pass loop: it exists only so that "break" can leave the
// if/else chain early
for {
if at.t == ntAlternate {
// flatten: re-parent the nested alternatives and splice them in right
// after this node, then drop this node by stepping j back
for k := 0; k < len(at.children); k++ {
at.children[k].next = n
}
n.insertChildren(i+1, at.children)
j--
} else if at.t == ntSet || at.t == ntOne {
// Cannot merge sets if L or I options differ, or if either are negated.
optionsAt := at.options & (RightToLeft | IgnoreCase)
if at.t == ntSet {
if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
wasLastSet = true
lastNodeCannotMerge = !at.set.IsMergeable()
optionsLast = optionsAt
break
}
} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
wasLastSet = true
lastNodeCannotMerge = false
optionsLast = optionsAt
break
}
// The last node was a Set or a One, we're a Set or One and our options are the same.
// Merge the two nodes.
j--
prev := n.children[j]
var prevCharClass *CharSet
if prev.t == ntOne {
prevCharClass = &CharSet{}
prevCharClass.addChar(prev.ch)
} else {
// note: the previous node's set is extended in place
prevCharClass = prev.set
}
if at.t == ntOne {
prevCharClass.addChar(at.ch)
} else {
prevCharClass.addSet(*at.set)
}
prev.t = ntSet
prev.set = prevCharClass
} else if at.t == ntNothing {
// an alternative that can never match is simply dropped
j--
} else {
wasLastSet = false
lastNodeCannotMerge = false
}
break
}
}
if j < i {
n.removeChildren(j, i)
}
return n.stripEnation(ntNothing)
}
// Basic optimization. Adjacent strings can be concatenated.
//
// (?:abc)(?:def) -> abcdef
//
// Nested concatenations with the same direction are flattened and empty
// nodes are dropped. The children are compacted in place: i reads, j writes,
// and whatever is left between j and i at the end is removed.
func (n *regexNode) reduceConcatenation() *regexNode {
// Eliminate empties and concat adjacent strings/chars
var optionsLast RegexOptions
var optionsAt RegexOptions
var i, j int
if len(n.children) == 0 {
return newRegexNode(ntEmpty, n.options)
}
wasLastString := false
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
var at, prev *regexNode
at = n.children[i]
if j < i {
n.children[j] = at
}
if at.t == ntConcatenate &&
((at.options & RightToLeft) == (n.options & RightToLeft)) {
// flatten: re-parent the nested children and drop this node
for k := 0; k < len(at.children); k++ {
at.children[k].next = n
}
//insert at.children at i+1 index in n.children
n.insertChildren(i+1, at.children)
j--
} else if at.t == ntMulti || at.t == ntOne {
// Cannot merge strings if L or I options differ
optionsAt = at.options & (RightToLeft | IgnoreCase)
if !wasLastString || optionsLast != optionsAt {
wasLastString = true
optionsLast = optionsAt
continue
}
// merge this node's text into the previous string node
j--
prev = n.children[j]
if prev.t == ntOne {
prev.t = ntMulti
prev.str = []rune{prev.ch}
}
if (optionsAt & RightToLeft) == 0 {
if at.t == ntOne {
prev.str = append(prev.str, at.ch)
} else {
prev.str = append(prev.str, at.str...)
}
} else {
// right-to-left: the new text goes in front of the previous text
if at.t == ntOne {
// insert at the front by expanding our slice, copying the data over, and then setting the value
prev.str = append(prev.str, 0)
copy(prev.str[1:], prev.str)
prev.str[0] = at.ch
} else {
//insert at the front...this one we'll make a new slice and copy both into it
merge := make([]rune, len(prev.str)+len(at.str))
copy(merge, at.str)
copy(merge[len(at.str):], prev.str)
prev.str = merge
}
}
} else if at.t == ntEmpty {
j--
} else {
wasLastString = false
}
}
if j < i {
// remove indices j through i from the children
n.removeChildren(j, i)
}
return n.stripEnation(ntEmpty)
}
// Nested repeaters just get multiplied with each other if they're not
// too lumpy
//
// Starting at n, the loop descends through first children for as long as
// the child is a repeater of a compatible kind, multiplying the child's
// bounds by n's bounds (saturating at math.MaxInt32). It returns the
// innermost node reached, or a "nothing" node when n's minimum is
// math.MaxInt32.
func (n *regexNode) reduceRep() *regexNode {
u := n
t := n.t
min := n.m
max := n.n
for {
if len(u.children) == 0 {
break
}
child := u.children[0]
// multiply reps of the same type only
if child.t != t {
childType := child.t
// a greedy loop may absorb a one/notone/set loop, and a lazy loop the
// lazy equivalents; any other combination stops the descent
if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
break
}
}
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
// [but things like (a {2,})+ are not too lumpy...]
if u.m == 0 && child.m > 1 || child.n < child.m*2 {
break
}
u = child
// NOTE(review): min and max are never updated inside the loop, so each
// level is multiplied by n's original bounds and the MaxInt32 check after
// the loop only ever sees n.m — confirm this is intended.
if u.m > 0 {
if (math.MaxInt32-1)/u.m < min {
u.m = math.MaxInt32
} else {
u.m = u.m * min
}
}
if u.n > 0 {
if (math.MaxInt32-1)/u.n < max {
u.n = math.MaxInt32
} else {
u.n = u.n * max
}
}
}
if math.MaxInt32 == min {
return newRegexNode(ntNothing, n.options)
}
return u
}
// stripEnation strips a needless concatenation or alternation node: with a
// single child that child is returned in its place, with no children a fresh
// node of emptyType is returned, otherwise n itself is returned.
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
	count := len(n.children)
	if count == 0 {
		return newRegexNode(emptyType, n.options)
	}
	if count == 1 {
		return n.children[0]
	}
	return n
}
// reduceGroup unwraps nested non-capturing groups and returns the first node
// on the first-child chain that is not a group.
func (n *regexNode) reduceGroup() *regexNode {
	inner := n
	for ; inner.t == ntGroup; inner = inner.children[0] {
	}
	return inner
}
// reduceSet simplifies a set node in place and returns it: a missing set
// becomes a "nothing" node, a singleton set becomes the matching one-character
// type and an inverse singleton the matching not-one type. The offset from
// ntSet is kept, so a set loop turns into the corresponding character loop.
func (n *regexNode) reduceSet() *regexNode {
	// toChar swaps the set payload for its single character and shifts the
	// node type by delta.
	toChar := func(delta nodeType) {
		n.ch = n.set.SingletonChar()
		n.set = nil
		n.t += delta
	}

	switch {
	case n.set == nil:
		n.t = ntNothing
	case n.set.IsSingleton():
		toChar(ntOne - ntSet)
	case n.set.IsSingletonInverse():
		toChar(ntNotone - ntSet)
	}
	return n
}
// reverseLeft reverses the order of the children, in place, when n is a
// right-to-left concatenation. Any other node is returned untouched.
func (n *regexNode) reverseLeft() *regexNode {
	if n.options&RightToLeft == 0 || n.t != ntConcatenate || len(n.children) == 0 {
		return n
	}

	kids := n.children
	for lo, hi := 0, len(kids)-1; lo < hi; {
		kids[lo], kids[hi] = kids[hi], kids[lo]
		lo++
		hi--
	}
	return n
}
// makeQuantifier applies the quantifier {min,max} (lazy or greedy) to n and
// returns the resulting node.
//
// {0,0} yields a fresh empty node and {1,1} yields n unchanged. A
// one/notone/set node is converted in place into its loop or lazy form; any
// other node is wrapped in a new loop (or lazy loop) node as its only child.
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
	if min == 0 && max == 0 {
		return newRegexNode(ntEmpty, n.options)
	}
	if min == 1 && max == 1 {
		return n
	}

	switch n.t {
	case ntOne, ntNotone, ntSet:
		// makeRep works on node types, so use the nt* constants here rather
		// than the opcode constants that happen to share their values.
		var rep nodeType = ntOneloop
		if lazy {
			rep = ntOnelazy
		}
		n.makeRep(rep, min, max)
		return n

	default:
		var t nodeType = ntLoop
		if lazy {
			t = ntLazyloop
		}
		result := newRegexNodeMN(t, n.options, min, max)
		result.addChild(n)
		return result
	}
}
// debug functions

// typeStr maps a nodeType value to its display name; the index is the
// numeric value of the type. The "Unknown" entries pad the gap between
// Testgroup (34) and ECMABoundary (41).
var typeStr = []string{
"Onerep", "Notonerep", "Setrep",
"Oneloop", "Notoneloop", "Setloop",
"Onelazy", "Notonelazy", "Setlazy",
"One", "Notone", "Set",
"Multi", "Ref",
"Bol", "Eol", "Boundary", "Nonboundary",
"Beginning", "Start", "EndZ", "End",
"Nothing", "Empty",
"Alternate", "Concatenate",
"Loop", "Lazyloop",
"Capture", "Group", "Require", "Prevent", "Greedy",
"Testref", "Testgroup",
"Unknown", "Unknown", "Unknown",
"Unknown", "Unknown", "Unknown",
"ECMABoundary", "NonECMABoundary",
}
// description returns a one-line, human-readable summary of the node for
// debug output. It consists of the node type name, one short suffix per
// option bit set on the node, a type-specific payload (character, capture
// indexes, string or set) and, for loop-like node types, the min/max
// repetition counts ("inf" when the max is math.MaxInt32).
func (n *regexNode) description() string {
	out := &bytes.Buffer{}
	out.WriteString(typeStr[n.t])

	// Append suffix only when the corresponding option bit is set.
	flag := func(on bool, suffix string) {
		if on {
			out.WriteString(suffix)
		}
	}
	flag(n.options&ExplicitCapture != 0, "-C")
	flag(n.options&IgnoreCase != 0, "-I")
	flag(n.options&RightToLeft != 0, "-L")
	flag(n.options&Multiline != 0, "-M")
	flag(n.options&Singleline != 0, "-S")
	flag(n.options&IgnorePatternWhitespace != 0, "-X")
	flag(n.options&ECMAScript != 0, "-E")

	// Type-specific payload.
	switch n.t {
	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
		out.WriteString("(Ch = ")
		out.WriteString(CharDescription(n.ch))
		out.WriteString(")")
	case ntCapture:
		out.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
	case ntRef, ntTestref:
		out.WriteString("(index = " + strconv.Itoa(n.m) + ")")
	case ntMulti:
		fmt.Fprintf(out, "(String = %s)", string(n.str))
	case ntSet, ntSetloop, ntSetlazy:
		out.WriteString("(Set = ")
		out.WriteString(n.set.String())
		out.WriteString(")")
	}

	// Repetition bounds for the loop-like node types.
	switch n.t {
	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
		upper := "inf"
		if n.n != math.MaxInt32 {
			upper = strconv.Itoa(n.n)
		}
		out.WriteString("(Min = " + strconv.Itoa(n.m) + ", Max = " + upper + ")")
	}

	return out.String()
}
// padSpace is the run of spaces dump uses to indent nested nodes. dump writes
// padSpace[:depth] with depth clamped to 32, so the slice must hold 32
// spaces; it is built explicitly so the length cannot be lost to whitespace
// mangling of a string literal.
var padSpace = bytes.Repeat([]byte(" "), 32)
// Dump returns a debug listing of the tree, produced by dumping its root
// node.
func (t *RegexTree) Dump() string {
	root := t.root
	return root.dump()
}
// dump returns a multi-line debug listing of n and everything below it: one
// description line per node, each child indented by one space per nesting
// level (at most 32 spaces).
//
// The walk is iterative. When it descends into a child it pushes the index
// of the next sibling to visit; when a node has no more children it pops
// that index and moves to the node's next field to continue from there.
func (n *regexNode) dump() string {
	var resume []int // per open ancestor: index of the next child to visit
	node, child := n, 0

	out := bytes.NewBufferString(node.description())
	out.WriteRune('\n')

	for {
		if node.children != nil && child < len(node.children) {
			// Descend into the next unvisited child and print it.
			resume = append(resume, child+1)
			node = node.children[child]
			child = 0

			indent := len(resume)
			if indent > 32 {
				indent = 32
			}
			out.Write(padSpace[:indent])
			out.WriteString(node.description())
			out.WriteRune('\n')
			continue
		}

		// No more children here: go back up, or stop at the start node.
		if len(resume) == 0 {
			break
		}
		last := len(resume) - 1
		child = resume[last]
		resume = resume[:last]
		node = node.next
	}

	return out.String()
}

500
vendor/github.com/dlclark/regexp2/syntax/writer.go generated vendored Normal file
View file

@ -0,0 +1,500 @@
package syntax
import (
"bytes"
"fmt"
"math"
"os"
)
// Write generates the Code for a parsed RegexTree using a fresh writer.
// When the tree has the Debug option set and code was produced, the code's
// dump followed by a newline is written to standard output. The code and
// error from code generation are returned as they are.
func Write(tree *RegexTree) (*Code, error) {
	w := writer{
		intStack:   make([]int, 0, 32),
		emitted:    make([]int, 2),
		stringhash: make(map[string]int),
		sethash:    make(map[string]int),
	}

	code, err := w.codeFromTree(tree)

	debugging := tree.options&Debug > 0
	if debugging && code != nil {
		os.Stdout.WriteString(code.Dump())
		os.Stdout.WriteString("\n")
	}

	return code, err
}
// writer holds the state used while turning a RegexTree into a Code value.
type writer struct {
// emitted is the generated code: opcodes and their operands stored as ints.
emitted []int
// intStack is the integer stack behind pushInt/popInt/emptyStack.
intStack []int
// curpos is the index in emitted where the next value will be written.
curpos int
// stringhash maps a string's text to its index in stringtable.
stringhash map[string]int
stringtable [][]rune
// sethash maps a set's hash string to its index in settable.
sethash map[string]int
settable []*CharSet
// counting is true while the emit functions only count sizes and write nothing.
counting bool
// count is the number of ints the generated code needs, summed while counting.
count int
// trackcount is the number of emitted operations for which opcodeBacktracks
// reports true, summed while counting.
trackcount int
// caps, when non-nil, maps a capture number to its dense slot index.
caps map[int]int
}
const (
// beforeChild and afterChild are flag bits that codeFromTree ORs into a
// node's type when it calls emitFragment just before, or just after,
// visiting one of the node's children.
beforeChild nodeType = 64
afterChild = 128
//MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix
MaxPrefixSize = 50
)
// codeFromTree is the top-level code generator. It does a depth-first walk
// through the tree and calls emitFragment to emit code before and after each
// child of an interior node, and at each leaf.
//
// It runs two passes: the first (w.counting == true) only counts the size of
// the generated code, the second allocates w.emitted with that size and
// generates the code.
//
// We should time it against the alternative, which is
// to just generate the code and grow the array as we go.
//
// The first error reported by emitFragment aborts generation and is returned
// with a nil *Code. Earlier versions discarded these errors and returned
// incomplete code with a nil error.
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
	var (
		curNode  *regexNode
		curChild int
		capsize  int
	)

	// construct sparse capnum mapping if some numbers are unused
	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
		capsize = tree.captop
		w.caps = nil
	} else {
		capsize = len(tree.capnumlist)
		// NOTE: this fills in tree.caps itself, not a copy.
		w.caps = tree.caps
		for i, capnum := range tree.capnumlist {
			w.caps[capnum] = i
		}
	}

	w.counting = true

	for {
		if !w.counting {
			// second pass: the first pass told us how much room is needed
			w.emitted = make([]int, w.count)
		}

		curNode = tree.root
		curChild = 0

		// Placeholder target; patched below to point at the final Stop.
		w.emit1(Lazybranch, 0)

		for {
			if len(curNode.children) == 0 {
				if err := w.emitFragment(curNode.t, curNode, 0); err != nil {
					return nil, err
				}
			} else if curChild < len(curNode.children) {
				if err := w.emitFragment(curNode.t|beforeChild, curNode, curChild); err != nil {
					return nil, err
				}

				curNode = curNode.children[curChild]
				w.pushInt(curChild)
				curChild = 0
				continue
			}

			if w.emptyStack() {
				break
			}

			// Done with this node: resume in curNode.next at the child
			// index saved when we descended.
			curChild = w.popInt()
			curNode = curNode.next

			if err := w.emitFragment(curNode.t|afterChild, curNode, curChild); err != nil {
				return nil, err
			}
			curChild++
		}

		w.patchJump(0, w.curPos())
		w.emit(Stop)

		if !w.counting {
			break
		}
		w.counting = false
	}

	fcPrefix := getFirstCharsPrefix(tree)
	prefix := getPrefix(tree)
	rtl := (tree.options & RightToLeft) != 0

	var bmPrefix *BmPrefix
	//TODO: benchmark string prefixes
	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
		if len(prefix.PrefixStr) > MaxPrefixSize {
			// cap the prefix length at MaxPrefixSize
			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
		}
		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
	}

	return &Code{
		Codes:       w.emitted,
		Strings:     w.stringtable,
		Sets:        w.settable,
		TrackCount:  w.trackcount,
		Caps:        w.caps,
		Capsize:     capsize,
		FcPrefix:    fcPrefix,
		BmPrefix:    bmPrefix,
		Anchors:     getAnchors(tree),
		RightToLeft: rtl,
	}, nil
}
// emitFragment emits the code for a single visit of node during the tree
// walk done by codeFromTree.
//
// nodetype is node.t, optionally OR'ed with beforeChild or afterChild when
// the walk is about to enter, or has just left, the child at index curIndex.
// Leaf nodes are passed with neither flag set.
//
// Jump targets that are not known yet are emitted as 0; the position of the
// jump instruction is pushed on the int stack and fixed up later through
// patchJump.
//
// It returns an error when nodetype is not one of the handled cases.
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
// The Rtl and Ci modifier bits are only computed for node types up to ntRef.
bits := InstOp(0)
if nodetype <= ntRef {
if (node.options & RightToLeft) != 0 {
bits |= Rtl
}
if (node.options & IgnoreCase) != 0 {
bits |= Ci
}
}
ntBits := nodeType(bits)
switch nodetype {
case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
// nothing to emit
break
case ntAlternate | beforeChild:
// Every branch except the last starts with a Lazybranch whose target is
// patched once the branch has been emitted.
if curIndex < len(node.children)-1 {
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
}
case ntAlternate | afterChild:
if curIndex < len(node.children)-1 {
// Not the last branch: emit a Goto with a placeholder target (its
// position stays on the stack) and point this branch's Lazybranch at the
// code that follows.
lbPos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(lbPos, w.curPos())
} else {
// Last branch: patch one pending jump per earlier branch to the current
// position.
for i := 0; i < curIndex; i++ {
w.patchJump(w.popInt(), w.curPos())
}
}
break
case ntTestref | beforeChild:
if curIndex == 0 {
w.emit(Setjump)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
w.emit1(Testref, w.mapCapnum(node.m))
w.emit(Forejump)
}
case ntTestref | afterChild:
if curIndex == 0 {
branchpos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(branchpos, w.curPos())
w.emit(Forejump)
// With a single child there is no second branch, so the Goto is patched
// right away.
if len(node.children) <= 1 {
w.patchJump(w.popInt(), w.curPos())
}
} else if curIndex == 1 {
w.patchJump(w.popInt(), w.curPos())
}
case ntTestgroup | beforeChild:
if curIndex == 0 {
w.emit(Setjump)
w.emit(Setmark)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
}
case ntTestgroup | afterChild:
if curIndex == 0 {
w.emit(Getmark)
w.emit(Forejump)
} else if curIndex == 1 {
Branchpos := w.popInt()
w.pushInt(w.curPos())
w.emit1(Goto, 0)
w.patchJump(Branchpos, w.curPos())
w.emit(Getmark)
w.emit(Forejump)
// With only two children there is no third branch, so the Goto is
// patched right away.
if len(node.children) <= 2 {
w.patchJump(w.popInt(), w.curPos())
}
} else if curIndex == 2 {
w.patchJump(w.popInt(), w.curPos())
}
case ntLoop | beforeChild, ntLazyloop | beforeChild:
// Loops with a finite max or a min above 1 use the count instructions;
// the remaining loops use the mark instructions.
if node.n < math.MaxInt32 || node.m > 1 {
if node.m == 0 {
w.emit1(Nullcount, 0)
} else {
w.emit1(Setcount, 1-node.m)
}
} else if node.m == 0 {
w.emit(Nullmark)
} else {
w.emit(Setmark)
}
// A min of 0 gets a Goto with a placeholder target that the afterChild
// case patches to the loop's branch instruction.
if node.m == 0 {
w.pushInt(w.curPos())
w.emit1(Goto, 0)
}
// Position of the loop body, used as the branch operand after the child.
w.pushInt(w.curPos())
case ntLoop | afterChild, ntLazyloop | afterChild:
startJumpPos := w.curPos()
// lazy is 0 for ntLoop and the distance from ntLoop to ntLazyloop
// otherwise; it is added to the branch opcode below.
lazy := (nodetype - (ntLoop | afterChild))
if node.n < math.MaxInt32 || node.m > 1 {
if node.n == math.MaxInt32 {
w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
} else {
w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
}
} else {
w.emit1(InstOp(Branchmark+lazy), w.popInt())
}
if node.m == 0 {
w.patchJump(w.popInt(), startJumpPos)
}
case ntGroup | beforeChild, ntGroup | afterChild:
// nothing to emit
case ntCapture | beforeChild:
w.emit(Setmark)
case ntCapture | afterChild:
w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
case ntRequire | beforeChild:
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
w.emit(Setjump)
w.emit(Setmark)
case ntRequire | afterChild:
w.emit(Getmark)
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
w.emit(Forejump)
case ntPrevent | beforeChild:
w.emit(Setjump)
w.pushInt(w.curPos())
w.emit1(Lazybranch, 0)
case ntPrevent | afterChild:
w.emit(Backjump)
w.patchJump(w.popInt(), w.curPos())
w.emit(Forejump)
case ntGreedy | beforeChild:
w.emit(Setjump)
case ntGreedy | afterChild:
w.emit(Forejump)
case ntOne, ntNotone:
w.emit1(InstOp(node.t|ntBits), int(node.ch))
case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
// The mandatory node.m repetitions are emitted as a rep instruction, the
// optional remainder (if any) as the node's own loop/lazy instruction.
if node.m > 0 {
if node.t == ntOneloop || node.t == ntOnelazy {
w.emit2(Onerep|bits, int(node.ch), node.m)
} else {
w.emit2(Notonerep|bits, int(node.ch), node.m)
}
}
if node.n > node.m {
if node.n == math.MaxInt32 {
w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
} else {
w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
}
}
case ntSetloop, ntSetlazy:
// Same split as above, with the set's table index as operand.
if node.m > 0 {
w.emit2(Setrep|bits, w.setCode(node.set), node.m)
}
if node.n > node.m {
if node.n == math.MaxInt32 {
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
} else {
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
}
}
case ntMulti:
w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
case ntSet:
w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
case ntRef:
w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
// These node types are emitted as an opcode with the same numeric value.
w.emit(InstOp(node.t))
default:
return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
}
return nil
}
// pushInt pushes i onto the writer's integer stack. The stack is what lets
// the tree walk and the jump fix-ups run without recursion.
func (w *writer) pushInt(i int) {
	stack := append(w.intStack, i)
	w.intStack = stack
}
// emptyStack reports whether the writer's integer stack holds no entries.
func (w *writer) emptyStack() bool {
	depth := len(w.intStack)
	return depth == 0
}
// popInt removes the most recently pushed value from the writer's integer
// stack and returns it. The stack must not be empty.
func (w *writer) popInt() int {
	last := len(w.intStack) - 1
	top := w.intStack[last]
	w.intStack = w.intStack[:last]
	return top
}
// curPos returns the index in the emitted code at which the next value will
// be written.
func (w *writer) curPos() int {
	pos := w.curpos
	return pos
}
// patchJump fixes up the jump instruction stored at offset so that it jumps
// to jumpDest. The destination is the instruction's first operand, i.e. the
// slot right after the opcode.
func (w *writer) patchJump(offset, jumpDest int) {
	operand := offset + 1
	w.emitted[operand] = jumpDest
}
// setCode returns the index of set in the writer's set table. Sets are
// keyed by the hash string produced by mapHashFill, so a set with an
// already-known hash reuses the existing index and a new one is appended.
// While the writer is only counting, nothing is recorded and 0 is returned.
func (w *writer) setCode(set *CharSet) int {
	if w.counting {
		return 0
	}

	var key bytes.Buffer
	set.mapHashFill(&key)
	hash := key.String()

	if idx, known := w.sethash[hash]; known {
		return idx
	}

	idx := len(w.sethash)
	w.sethash[hash] = idx
	w.settable = append(w.settable, set)
	return idx
}
// stringCode returns the index of str in the writer's string table. Strings
// are keyed by their text, so a repeated string reuses its existing index
// and a new one is appended. While the writer is only counting, nothing is
// recorded and 0 is returned.
func (w *writer) stringCode(str []rune) int {
	if w.counting {
		return 0
	}

	key := string(str)
	if idx, known := w.stringhash[key]; known {
		return idx
	}

	idx := len(w.stringhash)
	w.stringhash[key] = idx
	w.stringtable = append(w.stringtable, str)
	return idx
}
// mapCapnum translates a capture number into the slot index used by the
// generated code. When the regex uses a sparse set of capture numbers,
// w.caps maps them to a dense range of indices; doing the lookup here means
// it happens at compile time rather than at match time.
//
// -1 is passed through as -1, and when w.caps is nil the capture number is
// returned unchanged.
func (w *writer) mapCapnum(capnum int) int {
	switch {
	case capnum == -1:
		return -1
	case w.caps != nil:
		return w.caps[capnum]
	default:
		return capnum
	}
}
// emit emits an operation that takes no operands. Like all emit functions it
// runs in one of two modes: while w.counting is set it only adds to the size
// counters (one slot, plus one track entry if the opcode backtracks);
// otherwise it writes the opcode at the current position and advances.
func (w *writer) emit(op InstOp) {
	if !w.counting {
		w.emitted[w.curpos] = int(op)
		w.curpos++
		return
	}

	w.count++
	if opcodeBacktracks(op) {
		w.trackcount++
	}
}
// emit1 emits an operation with one operand. While w.counting is set it only
// adds to the size counters (two slots, plus one track entry if the opcode
// backtracks); otherwise it writes the opcode followed by opd1 and advances
// the current position past both.
func (w *writer) emit1(op InstOp, opd1 int) {
	if !w.counting {
		w.emitted[w.curpos] = int(op)
		w.curpos++
		w.emitted[w.curpos] = opd1
		w.curpos++
		return
	}

	w.count += 2
	if opcodeBacktracks(op) {
		w.trackcount++
	}
}
// emit2 emits an operation with two operands. While w.counting is set it
// only adds to the size counters (three slots, plus one track entry if the
// opcode backtracks); otherwise it writes the opcode followed by opd1 and
// opd2 and advances the current position past all three.
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
	if !w.counting {
		w.emitted[w.curpos] = int(op)
		w.curpos++
		w.emitted[w.curpos] = opd1
		w.curpos++
		w.emitted[w.curpos] = opd2
		w.curpos++
		return
	}

	w.count += 3
	if opcodeBacktracks(op) {
		w.trackcount++
	}
}