456 lines
9.5 KiB
Go
456 lines
9.5 KiB
Go
|
package syntax
|
||
|
|
||
|
import (
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
type token struct {
|
||
|
kind tokenKind
|
||
|
pos Position
|
||
|
}
|
||
|
|
||
|
func (tok token) String() string {
|
||
|
return tok.kind.String()
|
||
|
}
|
||
|
|
||
|
type tokenKind byte
|
||
|
|
||
|
//go:generate stringer -type=tokenKind -trimprefix=tok -linecomment=true
|
||
|
const (
|
||
|
tokNone tokenKind = iota
|
||
|
|
||
|
tokChar
|
||
|
tokGroupFlags
|
||
|
tokPosixClass
|
||
|
tokConcat
|
||
|
tokRepeat
|
||
|
tokEscapeChar
|
||
|
tokEscapeMeta
|
||
|
tokEscapeOctal
|
||
|
tokEscapeUni
|
||
|
tokEscapeUniFull
|
||
|
tokEscapeHex
|
||
|
tokEscapeHexFull
|
||
|
tokComment
|
||
|
|
||
|
tokQ // \Q
|
||
|
tokMinus // -
|
||
|
tokLbracket // [
|
||
|
tokLbracketCaret // [^
|
||
|
tokRbracket // ]
|
||
|
tokDollar // $
|
||
|
tokCaret // ^
|
||
|
tokQuestion // ?
|
||
|
tokDot // .
|
||
|
tokPlus // +
|
||
|
tokStar // *
|
||
|
tokPipe // |
|
||
|
tokLparen // (
|
||
|
tokLparenName // (?P<name>
|
||
|
tokLparenNameAngle // (?<name>
|
||
|
tokLparenNameQuote // (?'name'
|
||
|
tokLparenFlags // (?flags
|
||
|
tokLparenAtomic // (?>
|
||
|
tokLparenPositiveLookahead // (?=
|
||
|
tokLparenPositiveLookbehind // (?<=
|
||
|
tokLparenNegativeLookahead // (?!
|
||
|
tokLparenNegativeLookbehind // (?<!
|
||
|
tokRparen // )
|
||
|
)
|
||
|
|
||
|
// reMetachar is a table of meta chars outside of a char class.
|
||
|
var reMetachar = [256]bool{
|
||
|
'\\': true,
|
||
|
'|': true,
|
||
|
'*': true,
|
||
|
'+': true,
|
||
|
'?': true,
|
||
|
'.': true,
|
||
|
'[': true,
|
||
|
']': true,
|
||
|
'^': true,
|
||
|
'$': true,
|
||
|
'(': true,
|
||
|
')': true,
|
||
|
}
|
||
|
|
||
|
// charClassMetachar is a table of meta chars inside char class.
|
||
|
var charClassMetachar = [256]bool{
|
||
|
'-': true,
|
||
|
']': true,
|
||
|
}
|
||
|
|
||
|
type lexer struct {
|
||
|
tokens []token
|
||
|
pos int
|
||
|
input string
|
||
|
}
|
||
|
|
||
|
func (l *lexer) HasMoreTokens() bool {
|
||
|
return l.pos < len(l.tokens)
|
||
|
}
|
||
|
|
||
|
func (l *lexer) NextToken() token {
|
||
|
if l.pos < len(l.tokens) {
|
||
|
tok := l.tokens[l.pos]
|
||
|
l.pos++
|
||
|
return tok
|
||
|
}
|
||
|
return token{}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) Peek() token {
|
||
|
if l.pos < len(l.tokens) {
|
||
|
return l.tokens[l.pos]
|
||
|
}
|
||
|
return token{}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) scan() {
|
||
|
for l.pos < len(l.input) {
|
||
|
ch := l.input[l.pos]
|
||
|
if ch > unicode.MaxASCII {
|
||
|
_, size := utf8.DecodeRuneInString(l.input[l.pos:])
|
||
|
l.pushTok(tokChar, size)
|
||
|
l.maybeInsertConcat()
|
||
|
continue
|
||
|
}
|
||
|
switch ch {
|
||
|
case '\\':
|
||
|
l.scanEscape(false)
|
||
|
case '.':
|
||
|
l.pushTok(tokDot, 1)
|
||
|
case '+':
|
||
|
l.pushTok(tokPlus, 1)
|
||
|
case '*':
|
||
|
l.pushTok(tokStar, 1)
|
||
|
case '^':
|
||
|
l.pushTok(tokCaret, 1)
|
||
|
case '$':
|
||
|
l.pushTok(tokDollar, 1)
|
||
|
case '?':
|
||
|
l.pushTok(tokQuestion, 1)
|
||
|
case ')':
|
||
|
l.pushTok(tokRparen, 1)
|
||
|
case '|':
|
||
|
l.pushTok(tokPipe, 1)
|
||
|
case '[':
|
||
|
if l.byteAt(l.pos+1) == '^' {
|
||
|
l.pushTok(tokLbracketCaret, 2)
|
||
|
} else {
|
||
|
l.pushTok(tokLbracket, 1)
|
||
|
}
|
||
|
l.scanCharClass()
|
||
|
case '(':
|
||
|
if l.byteAt(l.pos+1) == '?' {
|
||
|
switch {
|
||
|
case l.byteAt(l.pos+2) == '>':
|
||
|
l.pushTok(tokLparenAtomic, len("(?>"))
|
||
|
case l.byteAt(l.pos+2) == '=':
|
||
|
l.pushTok(tokLparenPositiveLookahead, len("(?="))
|
||
|
case l.byteAt(l.pos+2) == '!':
|
||
|
l.pushTok(tokLparenNegativeLookahead, len("(?!"))
|
||
|
case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '=':
|
||
|
l.pushTok(tokLparenPositiveLookbehind, len("(?<="))
|
||
|
case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '!':
|
||
|
l.pushTok(tokLparenNegativeLookbehind, len("(?<!"))
|
||
|
default:
|
||
|
if l.tryScanComment(l.pos + 2) {
|
||
|
} else if l.tryScanGroupName(l.pos + 2) {
|
||
|
} else if l.tryScanGroupFlags(l.pos + 2) {
|
||
|
} else {
|
||
|
throwErrorf(l.pos, l.pos+1, "group token is incomplete")
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
l.pushTok(tokLparen, 1)
|
||
|
}
|
||
|
case '{':
|
||
|
if j := l.repeatWidth(l.pos + 1); j >= 0 {
|
||
|
l.pushTok(tokRepeat, len("{")+j)
|
||
|
} else {
|
||
|
l.pushTok(tokChar, 1)
|
||
|
}
|
||
|
default:
|
||
|
l.pushTok(tokChar, 1)
|
||
|
}
|
||
|
l.maybeInsertConcat()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) scanCharClass() {
|
||
|
l.maybeInsertConcat()
|
||
|
|
||
|
// We need to handle first `]` in a special way. See #3.
|
||
|
if l.byteAt(l.pos) == ']' {
|
||
|
l.pushTok(tokChar, 1)
|
||
|
}
|
||
|
|
||
|
for l.pos < len(l.input) {
|
||
|
ch := l.input[l.pos]
|
||
|
if ch > unicode.MaxASCII {
|
||
|
_, size := utf8.DecodeRuneInString(l.input[l.pos:])
|
||
|
l.pushTok(tokChar, size)
|
||
|
continue
|
||
|
}
|
||
|
switch ch {
|
||
|
case '\\':
|
||
|
l.scanEscape(true)
|
||
|
case '[':
|
||
|
isPosixClass := false
|
||
|
if l.byteAt(l.pos+1) == ':' {
|
||
|
j := l.stringIndex(l.pos+2, ":]")
|
||
|
if j >= 0 {
|
||
|
isPosixClass = true
|
||
|
l.pushTok(tokPosixClass, j+len("[::]"))
|
||
|
}
|
||
|
}
|
||
|
if !isPosixClass {
|
||
|
l.pushTok(tokChar, 1)
|
||
|
}
|
||
|
case '-':
|
||
|
l.pushTok(tokMinus, 1)
|
||
|
case ']':
|
||
|
l.pushTok(tokRbracket, 1)
|
||
|
return // Stop scanning in the char context
|
||
|
default:
|
||
|
l.pushTok(tokChar, 1)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) scanEscape(insideCharClass bool) {
|
||
|
s := l.input
|
||
|
if l.pos+1 >= len(s) {
|
||
|
throwErrorf(l.pos, l.pos+1, `unexpected end of pattern: trailing '\'`)
|
||
|
}
|
||
|
switch {
|
||
|
case s[l.pos+1] == 'p' || s[l.pos+1] == 'P':
|
||
|
if l.pos+2 >= len(s) {
|
||
|
throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected uni-class-short or '{'")
|
||
|
}
|
||
|
if s[l.pos+2] == '{' {
|
||
|
j := strings.IndexByte(s[l.pos+2:], '}')
|
||
|
if j < 0 {
|
||
|
throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
|
||
|
}
|
||
|
l.pushTok(tokEscapeUniFull, len(`\p{`)+j)
|
||
|
} else {
|
||
|
l.pushTok(tokEscapeUni, len(`\pL`))
|
||
|
}
|
||
|
case s[l.pos+1] == 'x':
|
||
|
if l.pos+2 >= len(s) {
|
||
|
throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected hex-digit or '{'")
|
||
|
}
|
||
|
if s[l.pos+2] == '{' {
|
||
|
j := strings.IndexByte(s[l.pos+2:], '}')
|
||
|
if j < 0 {
|
||
|
throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
|
||
|
}
|
||
|
l.pushTok(tokEscapeHexFull, len(`\x{`)+j)
|
||
|
} else {
|
||
|
if isHexDigit(l.byteAt(l.pos + 3)) {
|
||
|
l.pushTok(tokEscapeHex, len(`\xFF`))
|
||
|
} else {
|
||
|
l.pushTok(tokEscapeHex, len(`\xF`))
|
||
|
}
|
||
|
}
|
||
|
case isOctalDigit(s[l.pos+1]):
|
||
|
digits := 1
|
||
|
if isOctalDigit(l.byteAt(l.pos + 2)) {
|
||
|
if isOctalDigit(l.byteAt(l.pos + 3)) {
|
||
|
digits = 3
|
||
|
} else {
|
||
|
digits = 2
|
||
|
}
|
||
|
}
|
||
|
l.pushTok(tokEscapeOctal, len(`\`)+digits)
|
||
|
case s[l.pos+1] == 'Q':
|
||
|
size := len(s) - l.pos // Until the pattern ends
|
||
|
j := l.stringIndex(l.pos+2, `\E`)
|
||
|
if j >= 0 {
|
||
|
size = j + len(`\Q\E`)
|
||
|
}
|
||
|
l.pushTok(tokQ, size)
|
||
|
|
||
|
default:
|
||
|
ch := l.byteAt(l.pos + 1)
|
||
|
if ch > unicode.MaxASCII {
|
||
|
_, size := utf8.DecodeRuneInString(l.input[l.pos+1:])
|
||
|
l.pushTok(tokEscapeChar, len(`\`)+size)
|
||
|
return
|
||
|
}
|
||
|
kind := tokEscapeChar
|
||
|
if insideCharClass {
|
||
|
if charClassMetachar[ch] {
|
||
|
kind = tokEscapeMeta
|
||
|
}
|
||
|
} else {
|
||
|
if reMetachar[ch] {
|
||
|
kind = tokEscapeMeta
|
||
|
}
|
||
|
}
|
||
|
l.pushTok(kind, 2)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) maybeInsertConcat() {
|
||
|
if l.isConcatPos() {
|
||
|
last := len(l.tokens) - 1
|
||
|
tok := l.tokens[last]
|
||
|
l.tokens[last].kind = tokConcat
|
||
|
l.tokens = append(l.tokens, tok)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *lexer) Init(s string) {
|
||
|
l.pos = 0
|
||
|
l.tokens = l.tokens[:0]
|
||
|
l.input = s
|
||
|
|
||
|
l.scan()
|
||
|
|
||
|
l.pos = 0
|
||
|
}
|
||
|
|
||
|
func (l *lexer) tryScanGroupName(pos int) bool {
|
||
|
tok := tokLparenName
|
||
|
endCh := byte('>')
|
||
|
offset := 1
|
||
|
switch l.byteAt(pos) {
|
||
|
case '\'':
|
||
|
endCh = '\''
|
||
|
tok = tokLparenNameQuote
|
||
|
case '<':
|
||
|
tok = tokLparenNameAngle
|
||
|
case 'P':
|
||
|
offset = 2
|
||
|
default:
|
||
|
return false
|
||
|
}
|
||
|
if pos+offset >= len(l.input) {
|
||
|
return false
|
||
|
}
|
||
|
end := strings.IndexByte(l.input[pos+offset:], endCh)
|
||
|
if end < 0 {
|
||
|
return false
|
||
|
}
|
||
|
l.pushTok(tok, len("(?")+offset+end+1)
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (l *lexer) tryScanGroupFlags(pos int) bool {
|
||
|
colonPos := strings.IndexByte(l.input[pos:], ':')
|
||
|
parenPos := strings.IndexByte(l.input[pos:], ')')
|
||
|
if parenPos < 0 {
|
||
|
return false
|
||
|
}
|
||
|
end := parenPos
|
||
|
if colonPos >= 0 && colonPos < parenPos {
|
||
|
end = colonPos + len(":")
|
||
|
}
|
||
|
l.pushTok(tokLparenFlags, len("(?")+end)
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (l *lexer) tryScanComment(pos int) bool {
|
||
|
if l.byteAt(pos) != '#' {
|
||
|
return false
|
||
|
}
|
||
|
parenPos := strings.IndexByte(l.input[pos:], ')')
|
||
|
if parenPos < 0 {
|
||
|
return false
|
||
|
}
|
||
|
l.pushTok(tokComment, len("(?")+parenPos+len(")"))
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (l *lexer) repeatWidth(pos int) int {
|
||
|
j := pos
|
||
|
for isDigit(l.byteAt(j)) {
|
||
|
j++
|
||
|
}
|
||
|
if j == pos {
|
||
|
return -1
|
||
|
}
|
||
|
if l.byteAt(j) == '}' {
|
||
|
return (j + len("}")) - pos // {min}
|
||
|
}
|
||
|
if l.byteAt(j) != ',' {
|
||
|
return -1
|
||
|
}
|
||
|
j += len(",")
|
||
|
for isDigit(l.byteAt(j)) {
|
||
|
j++
|
||
|
}
|
||
|
if l.byteAt(j) == '}' {
|
||
|
return (j + len("}")) - pos // {min,} or {min,max}
|
||
|
}
|
||
|
return -1
|
||
|
}
|
||
|
|
||
|
func (l *lexer) stringIndex(offset int, s string) int {
|
||
|
if offset < len(l.input) {
|
||
|
return strings.Index(l.input[offset:], s)
|
||
|
}
|
||
|
return -1
|
||
|
}
|
||
|
|
||
|
func (l *lexer) byteAt(pos int) byte {
|
||
|
if pos >= 0 && pos < len(l.input) {
|
||
|
return l.input[pos]
|
||
|
}
|
||
|
return 0
|
||
|
}
|
||
|
|
||
|
func (l *lexer) pushTok(kind tokenKind, size int) {
|
||
|
l.tokens = append(l.tokens, token{
|
||
|
kind: kind,
|
||
|
pos: Position{Begin: uint16(l.pos), End: uint16(l.pos + size)},
|
||
|
})
|
||
|
l.pos += size
|
||
|
}
|
||
|
|
||
|
func (l *lexer) isConcatPos() bool {
|
||
|
if len(l.tokens) < 2 {
|
||
|
return false
|
||
|
}
|
||
|
x := l.tokens[len(l.tokens)-2].kind
|
||
|
if concatTable[x]&concatX != 0 {
|
||
|
return false
|
||
|
}
|
||
|
y := l.tokens[len(l.tokens)-1].kind
|
||
|
return concatTable[y]&concatY == 0
|
||
|
}
|
||
|
|
||
|
const (
|
||
|
concatX byte = 1 << iota
|
||
|
concatY
|
||
|
)
|
||
|
|
||
|
var concatTable = [256]byte{
|
||
|
tokPipe: concatX | concatY,
|
||
|
|
||
|
tokLparen: concatX,
|
||
|
tokLparenFlags: concatX,
|
||
|
tokLparenName: concatX,
|
||
|
tokLparenNameAngle: concatX,
|
||
|
tokLparenNameQuote: concatX,
|
||
|
tokLparenAtomic: concatX,
|
||
|
tokLbracket: concatX,
|
||
|
tokLbracketCaret: concatX,
|
||
|
tokLparenPositiveLookahead: concatX,
|
||
|
tokLparenPositiveLookbehind: concatX,
|
||
|
tokLparenNegativeLookahead: concatX,
|
||
|
tokLparenNegativeLookbehind: concatX,
|
||
|
|
||
|
tokRparen: concatY,
|
||
|
tokRbracket: concatY,
|
||
|
tokPlus: concatY,
|
||
|
tokStar: concatY,
|
||
|
tokQuestion: concatY,
|
||
|
tokRepeat: concatY,
|
||
|
}
|