diff options
Diffstat (limited to 'src/text/template/parse/lex.go')
-rw-r--r-- | src/text/template/parse/lex.go | 679 |
1 files changed, 679 insertions, 0 deletions
diff --git a/src/text/template/parse/lex.go b/src/text/template/parse/lex.go new file mode 100644 index 0000000..29403dd --- /dev/null +++ b/src/text/template/parse/lex.go @@ -0,0 +1,679 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package parse + +import ( + "fmt" + "strings" + "unicode" + "unicode/utf8" +) + +// item represents a token or text string returned from the scanner. +type item struct { + typ itemType // The type of this item. + pos Pos // The starting position, in bytes, of this item in the input string. + val string // The value of this item. + line int // The line number at the start of this item. +} + +func (i item) String() string { + switch { + case i.typ == itemEOF: + return "EOF" + case i.typ == itemError: + return i.val + case i.typ > itemKeyword: + return fmt.Sprintf("<%s>", i.val) + case len(i.val) > 10: + return fmt.Sprintf("%.10q...", i.val) + } + return fmt.Sprintf("%q", i.val) +} + +// itemType identifies the type of lex items. +type itemType int + +const ( + itemError itemType = iota // error occurred; value is text of error + itemBool // boolean constant + itemChar // printable ASCII character; grab bag for comma etc. + itemCharConstant // character constant + itemComment // comment text + itemComplex // complex constant (1+2i); imaginary is just a number + itemAssign // equals ('=') introducing an assignment + itemDeclare // colon-equals (':=') introducing a declaration + itemEOF + itemField // alphanumeric identifier starting with '.' + itemIdentifier // alphanumeric identifier not starting with '.' + itemLeftDelim // left action delimiter + itemLeftParen // '(' inside action + itemNumber // simple number, including imaginary + itemPipe // pipe symbol + itemRawString // raw quoted string (includes quotes) + itemRightDelim // right action delimiter + itemRightParen // ')' inside action + itemSpace // run of spaces separating arguments + itemString // quoted string (includes quotes) + itemText // plain text + itemVariable // variable starting with '$', such as '$' or '$1' or '$hello' + // Keywords appear after all the rest. + itemKeyword // used only to delimit the keywords + itemBlock // block keyword + itemBreak // break keyword + itemContinue // continue keyword + itemDot // the cursor, spelled '.' + itemDefine // define keyword + itemElse // else keyword + itemEnd // end keyword + itemIf // if keyword + itemNil // the untyped nil constant, easiest to treat as a keyword + itemRange // range keyword + itemTemplate // template keyword + itemWith // with keyword +) + +var key = map[string]itemType{ + ".": itemDot, + "block": itemBlock, + "break": itemBreak, + "continue": itemContinue, + "define": itemDefine, + "else": itemElse, + "end": itemEnd, + "if": itemIf, + "range": itemRange, + "nil": itemNil, + "template": itemTemplate, + "with": itemWith, +} + +const eof = -1 + +// Trimming spaces. +// If the action begins "{{- " rather than "{{", then all space/tab/newlines +// preceding the action are trimmed; conversely if it ends " -}}" the +// leading spaces are trimmed. This is done entirely in the lexer; the +// parser never sees it happen. We require an ASCII space (' ', \t, \r, \n) +// to be present to avoid ambiguity with things like "{{-3}}". It reads +// better with the space present anyway. For simplicity, only ASCII +// does the job. +const ( + spaceChars = " \t\r\n" // These are the space characters defined by Go itself. + trimMarker = '-' // Attached to left/right delimiter, trims trailing spaces from preceding/following text. + trimMarkerLen = Pos(1 + 1) // marker plus space before or after +) + +// stateFn represents the state of the scanner as a function that returns the next state. +type stateFn func(*lexer) stateFn + +// lexer holds the state of the scanner. +type lexer struct { + name string // the name of the input; used only for error reports + input string // the string being scanned + leftDelim string // start of action + rightDelim string // end of action + emitComment bool // emit itemComment tokens. + pos Pos // current position in the input + start Pos // start position of this item + atEOF bool // we have hit the end of input and returned eof + items chan item // channel of scanned items + parenDepth int // nesting depth of ( ) exprs + line int // 1+number of newlines seen + startLine int // start line of this item + breakOK bool // break keyword allowed + continueOK bool // continue keyword allowed +} + +// next returns the next rune in the input. +func (l *lexer) next() rune { + if int(l.pos) >= len(l.input) { + l.atEOF = true + return eof + } + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + l.pos += Pos(w) + if r == '\n' { + l.line++ + } + return r +} + +// peek returns but does not consume the next rune in the input. +func (l *lexer) peek() rune { + r := l.next() + l.backup() + return r +} + +// backup steps back one rune. +func (l *lexer) backup() { + if !l.atEOF && l.pos > 0 { + r, w := utf8.DecodeLastRuneInString(l.input[:l.pos]) + l.pos -= Pos(w) + // Correct newline count. + if r == '\n' { + l.line-- + } + } +} + +// emit passes an item back to the client. +func (l *lexer) emit(t itemType) { + l.items <- item{t, l.start, l.input[l.start:l.pos], l.startLine} + l.start = l.pos + l.startLine = l.line +} + +// ignore skips over the pending input before this point. +func (l *lexer) ignore() { + l.line += strings.Count(l.input[l.start:l.pos], "\n") + l.start = l.pos + l.startLine = l.line +} + +// accept consumes the next rune if it's from the valid set. +func (l *lexer) accept(valid string) bool { + if strings.ContainsRune(valid, l.next()) { + return true + } + l.backup() + return false +} + +// acceptRun consumes a run of runes from the valid set. +func (l *lexer) acceptRun(valid string) { + for strings.ContainsRune(valid, l.next()) { + } + l.backup() +} + +// errorf returns an error token and terminates the scan by passing +// back a nil pointer that will be the next state, terminating l.nextItem. +func (l *lexer) errorf(format string, args ...any) stateFn { + l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine} + return nil +} + +// nextItem returns the next item from the input. +// Called by the parser, not in the lexing goroutine. +func (l *lexer) nextItem() item { + return <-l.items +} + +// drain drains the output so the lexing goroutine will exit. +// Called by the parser, not in the lexing goroutine. +func (l *lexer) drain() { + for range l.items { + } +} + +// lex creates a new scanner for the input string. +func lex(name, input, left, right string, emitComment, breakOK, continueOK bool) *lexer { + if left == "" { + left = leftDelim + } + if right == "" { + right = rightDelim + } + l := &lexer{ + name: name, + input: input, + leftDelim: left, + rightDelim: right, + emitComment: emitComment, + breakOK: breakOK, + continueOK: continueOK, + items: make(chan item), + line: 1, + startLine: 1, + } + go l.run() + return l +} + +// run runs the state machine for the lexer. +func (l *lexer) run() { + for state := lexText; state != nil; { + state = state(l) + } + close(l.items) +} + +// state functions + +const ( + leftDelim = "{{" + rightDelim = "}}" + leftComment = "/*" + rightComment = "*/" +) + +// lexText scans until an opening action delimiter, "{{". +func lexText(l *lexer) stateFn { + if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 { + ldn := Pos(len(l.leftDelim)) + l.pos += Pos(x) + trimLength := Pos(0) + if hasLeftTrimMarker(l.input[l.pos+ldn:]) { + trimLength = rightTrimLength(l.input[l.start:l.pos]) + } + l.pos -= trimLength + if l.pos > l.start { + l.line += strings.Count(l.input[l.start:l.pos], "\n") + l.emit(itemText) + } + l.pos += trimLength + l.ignore() + return lexLeftDelim + } + l.pos = Pos(len(l.input)) + // Correctly reached EOF. + if l.pos > l.start { + l.line += strings.Count(l.input[l.start:l.pos], "\n") + l.emit(itemText) + } + l.emit(itemEOF) + return nil +} + +// rightTrimLength returns the length of the spaces at the end of the string. +func rightTrimLength(s string) Pos { + return Pos(len(s) - len(strings.TrimRight(s, spaceChars))) +} + +// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker. +func (l *lexer) atRightDelim() (delim, trimSpaces bool) { + if hasRightTrimMarker(l.input[l.pos:]) && strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) { // With trim marker. + return true, true + } + if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker. + return true, false + } + return false, false +} + +// leftTrimLength returns the length of the spaces at the beginning of the string. +func leftTrimLength(s string) Pos { + return Pos(len(s) - len(strings.TrimLeft(s, spaceChars))) +} + +// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker. +func lexLeftDelim(l *lexer) stateFn { + l.pos += Pos(len(l.leftDelim)) + trimSpace := hasLeftTrimMarker(l.input[l.pos:]) + afterMarker := Pos(0) + if trimSpace { + afterMarker = trimMarkerLen + } + if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) { + l.pos += afterMarker + l.ignore() + return lexComment + } + l.emit(itemLeftDelim) + l.pos += afterMarker + l.ignore() + l.parenDepth = 0 + return lexInsideAction +} + +// lexComment scans a comment. The left comment marker is known to be present. +func lexComment(l *lexer) stateFn { + l.pos += Pos(len(leftComment)) + i := strings.Index(l.input[l.pos:], rightComment) + if i < 0 { + return l.errorf("unclosed comment") + } + l.pos += Pos(i + len(rightComment)) + delim, trimSpace := l.atRightDelim() + if !delim { + return l.errorf("comment ends before closing delimiter") + } + if l.emitComment { + l.emit(itemComment) + } + if trimSpace { + l.pos += trimMarkerLen + } + l.pos += Pos(len(l.rightDelim)) + if trimSpace { + l.pos += leftTrimLength(l.input[l.pos:]) + } + l.ignore() + return lexText +} + +// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker. +func lexRightDelim(l *lexer) stateFn { + trimSpace := hasRightTrimMarker(l.input[l.pos:]) + if trimSpace { + l.pos += trimMarkerLen + l.ignore() + } + l.pos += Pos(len(l.rightDelim)) + l.emit(itemRightDelim) + if trimSpace { + l.pos += leftTrimLength(l.input[l.pos:]) + l.ignore() + } + return lexText +} + +// lexInsideAction scans the elements inside action delimiters. +func lexInsideAction(l *lexer) stateFn { + // Either number, quoted string, or identifier. + // Spaces separate arguments; runs of spaces turn into itemSpace. + // Pipe symbols separate and are emitted. + delim, _ := l.atRightDelim() + if delim { + if l.parenDepth == 0 { + return lexRightDelim + } + return l.errorf("unclosed left paren") + } + switch r := l.next(); { + case r == eof: + return l.errorf("unclosed action") + case isSpace(r): + l.backup() // Put space back in case we have " -}}". + return lexSpace + case r == '=': + l.emit(itemAssign) + case r == ':': + if l.next() != '=' { + return l.errorf("expected :=") + } + l.emit(itemDeclare) + case r == '|': + l.emit(itemPipe) + case r == '"': + return lexQuote + case r == '`': + return lexRawQuote + case r == '$': + return lexVariable + case r == '\'': + return lexChar + case r == '.': + // special look-ahead for ".field" so we don't break l.backup(). + if l.pos < Pos(len(l.input)) { + r := l.input[l.pos] + if r < '0' || '9' < r { + return lexField + } + } + fallthrough // '.' can start a number. + case r == '+' || r == '-' || ('0' <= r && r <= '9'): + l.backup() + return lexNumber + case isAlphaNumeric(r): + l.backup() + return lexIdentifier + case r == '(': + l.emit(itemLeftParen) + l.parenDepth++ + case r == ')': + l.emit(itemRightParen) + l.parenDepth-- + if l.parenDepth < 0 { + return l.errorf("unexpected right paren %#U", r) + } + case r <= unicode.MaxASCII && unicode.IsPrint(r): + l.emit(itemChar) + default: + return l.errorf("unrecognized character in action: %#U", r) + } + return lexInsideAction +} + +// lexSpace scans a run of space characters. +// We have not consumed the first space, which is known to be present. +// Take care if there is a trim-marked right delimiter, which starts with a space. +func lexSpace(l *lexer) stateFn { + var r rune + var numSpaces int + for { + r = l.peek() + if !isSpace(r) { + break + } + l.next() + numSpaces++ + } + // Be careful about a trim-marked closing delimiter, which has a minus + // after a space. We know there is a space, so check for the '-' that might follow. + if hasRightTrimMarker(l.input[l.pos-1:]) && strings.HasPrefix(l.input[l.pos-1+trimMarkerLen:], l.rightDelim) { + l.backup() // Before the space. + if numSpaces == 1 { + return lexRightDelim // On the delim, so go right to that. + } + } + l.emit(itemSpace) + return lexInsideAction +} + +// lexIdentifier scans an alphanumeric. +func lexIdentifier(l *lexer) stateFn { +Loop: + for { + switch r := l.next(); { + case isAlphaNumeric(r): + // absorb. + default: + l.backup() + word := l.input[l.start:l.pos] + if !l.atTerminator() { + return l.errorf("bad character %#U", r) + } + switch { + case key[word] > itemKeyword: + item := key[word] + if item == itemBreak && !l.breakOK || item == itemContinue && !l.continueOK { + l.emit(itemIdentifier) + } else { + l.emit(item) + } + case word[0] == '.': + l.emit(itemField) + case word == "true", word == "false": + l.emit(itemBool) + default: + l.emit(itemIdentifier) + } + break Loop + } + } + return lexInsideAction +} + +// lexField scans a field: .Alphanumeric. +// The . has been scanned. +func lexField(l *lexer) stateFn { + return lexFieldOrVariable(l, itemField) +} + +// lexVariable scans a Variable: $Alphanumeric. +// The $ has been scanned. +func lexVariable(l *lexer) stateFn { + if l.atTerminator() { // Nothing interesting follows -> "$". + l.emit(itemVariable) + return lexInsideAction + } + return lexFieldOrVariable(l, itemVariable) +} + +// lexVariable scans a field or variable: [.$]Alphanumeric. +// The . or $ has been scanned. +func lexFieldOrVariable(l *lexer, typ itemType) stateFn { + if l.atTerminator() { // Nothing interesting follows -> "." or "$". + if typ == itemVariable { + l.emit(itemVariable) + } else { + l.emit(itemDot) + } + return lexInsideAction + } + var r rune + for { + r = l.next() + if !isAlphaNumeric(r) { + l.backup() + break + } + } + if !l.atTerminator() { + return l.errorf("bad character %#U", r) + } + l.emit(typ) + return lexInsideAction +} + +// atTerminator reports whether the input is at valid termination character to +// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases +// like "$x+2" not being acceptable without a space, in case we decide one +// day to implement arithmetic. +func (l *lexer) atTerminator() bool { + r := l.peek() + if isSpace(r) { + return true + } + switch r { + case eof, '.', ',', '|', ':', ')', '(': + return true + } + return strings.HasPrefix(l.input[l.pos:], l.rightDelim) +} + +// lexChar scans a character constant. The initial quote is already +// scanned. Syntax checking is done by the parser. +func lexChar(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case '\\': + if r := l.next(); r != eof && r != '\n' { + break + } + fallthrough + case eof, '\n': + return l.errorf("unterminated character constant") + case '\'': + break Loop + } + } + l.emit(itemCharConstant) + return lexInsideAction +} + +// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This +// isn't a perfect number scanner - for instance it accepts "." and "0x0.2" +// and "089" - but when it's wrong the input is invalid and the parser (via +// strconv) will notice. +func lexNumber(l *lexer) stateFn { + if !l.scanNumber() { + return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) + } + if sign := l.peek(); sign == '+' || sign == '-' { + // Complex: 1+2i. No spaces, must end in 'i'. + if !l.scanNumber() || l.input[l.pos-1] != 'i' { + return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) + } + l.emit(itemComplex) + } else { + l.emit(itemNumber) + } + return lexInsideAction +} + +func (l *lexer) scanNumber() bool { + // Optional leading sign. + l.accept("+-") + // Is it hex? + digits := "0123456789_" + if l.accept("0") { + // Note: Leading 0 does not mean octal in floats. + if l.accept("xX") { + digits = "0123456789abcdefABCDEF_" + } else if l.accept("oO") { + digits = "01234567_" + } else if l.accept("bB") { + digits = "01_" + } + } + l.acceptRun(digits) + if l.accept(".") { + l.acceptRun(digits) + } + if len(digits) == 10+1 && l.accept("eE") { + l.accept("+-") + l.acceptRun("0123456789_") + } + if len(digits) == 16+6+1 && l.accept("pP") { + l.accept("+-") + l.acceptRun("0123456789_") + } + // Is it imaginary? + l.accept("i") + // Next thing mustn't be alphanumeric. + if isAlphaNumeric(l.peek()) { + l.next() + return false + } + return true +} + +// lexQuote scans a quoted string. +func lexQuote(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case '\\': + if r := l.next(); r != eof && r != '\n' { + break + } + fallthrough + case eof, '\n': + return l.errorf("unterminated quoted string") + case '"': + break Loop + } + } + l.emit(itemString) + return lexInsideAction +} + +// lexRawQuote scans a raw quoted string. +func lexRawQuote(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case eof: + return l.errorf("unterminated raw quoted string") + case '`': + break Loop + } + } + l.emit(itemRawString) + return lexInsideAction +} + +// isSpace reports whether r is a space character. +func isSpace(r rune) bool { + return r == ' ' || r == '\t' || r == '\r' || r == '\n' +} + +// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. +func isAlphaNumeric(r rune) bool { + return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) +} + +func hasLeftTrimMarker(s string) bool { + return len(s) >= 2 && s[0] == trimMarker && isSpace(rune(s[1])) +} + +func hasRightTrimMarker(s string) bool { + return len(s) >= 2 && isSpace(rune(s[0])) && s[1] == trimMarker +} |