diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:16:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:16:40 +0000 |
commit | 47ab3d4a42e9ab51c465c4322d2ec233f6324e6b (patch) | |
tree | a61a0ffd83f4a3def4b36e5c8e99630c559aa723 /src/text/scanner | |
parent | Initial commit. (diff) | |
download | golang-1.18-47ab3d4a42e9ab51c465c4322d2ec233f6324e6b.tar.xz golang-1.18-47ab3d4a42e9ab51c465c4322d2ec233f6324e6b.zip |
Adding upstream version 1.18.10. (refs: upstream/1.18.10, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/text/scanner')
-rw-r--r-- | src/text/scanner/example_test.go | 140 | ||||
-rw-r--r-- | src/text/scanner/scanner.go | 793 | ||||
-rw-r--r-- | src/text/scanner/scanner_test.go | 915 |
3 files changed, 1848 insertions, 0 deletions
diff --git a/src/text/scanner/example_test.go b/src/text/scanner/example_test.go new file mode 100644 index 0000000..5e8c3fb --- /dev/null +++ b/src/text/scanner/example_test.go @@ -0,0 +1,140 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner_test + +import ( + "fmt" + "strings" + "text/scanner" + "unicode" +) + +func Example() { + const src = ` +// This is scanned code. +if a > 10 { + someParsable = text +}` + + var s scanner.Scanner + s.Init(strings.NewReader(src)) + s.Filename = "example" + for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() { + fmt.Printf("%s: %s\n", s.Position, s.TokenText()) + } + + // Output: + // example:3:1: if + // example:3:4: a + // example:3:6: > + // example:3:8: 10 + // example:3:11: { + // example:4:2: someParsable + // example:4:15: = + // example:4:17: text + // example:5:1: } +} + +func Example_isIdentRune() { + const src = "%var1 var2%" + + var s scanner.Scanner + s.Init(strings.NewReader(src)) + s.Filename = "default" + + for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() { + fmt.Printf("%s: %s\n", s.Position, s.TokenText()) + } + + fmt.Println() + s.Init(strings.NewReader(src)) + s.Filename = "percent" + + // treat leading '%' as part of an identifier + s.IsIdentRune = func(ch rune, i int) bool { + return ch == '%' && i == 0 || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 + } + + for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() { + fmt.Printf("%s: %s\n", s.Position, s.TokenText()) + } + + // Output: + // default:1:1: % + // default:1:2: var1 + // default:1:7: var2 + // default:1:11: % + // + // percent:1:1: %var1 + // percent:1:7: var2 + // percent:1:11: % +} + +func Example_mode() { + const src = ` + // Comment begins at column 5. + +This line should not be included in the output. + +/* +This multiline comment +should be extracted in +its entirety. 
+*/ +` + + var s scanner.Scanner + s.Init(strings.NewReader(src)) + s.Filename = "comments" + s.Mode ^= scanner.SkipComments // don't skip comments + + for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() { + txt := s.TokenText() + if strings.HasPrefix(txt, "//") || strings.HasPrefix(txt, "/*") { + fmt.Printf("%s: %s\n", s.Position, txt) + } + } + + // Output: + // comments:2:5: // Comment begins at column 5. + // comments:6:1: /* + // This multiline comment + // should be extracted in + // its entirety. + // */ +} + +func Example_whitespace() { + // tab-separated values + const src = `aa ab ac ad +ba bb bc bd +ca cb cc cd +da db dc dd` + + var ( + col, row int + s scanner.Scanner + tsv [4][4]string // large enough for example above + ) + s.Init(strings.NewReader(src)) + s.Whitespace ^= 1<<'\t' | 1<<'\n' // don't skip tabs and new lines + + for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() { + switch tok { + case '\n': + row++ + col = 0 + case '\t': + col++ + default: + tsv[row][col] = s.TokenText() + } + } + + fmt.Print(tsv) + + // Output: + // [[aa ab ac ad] [ba bb bc bd] [ca cb cc cd] [da db dc dd]] +} diff --git a/src/text/scanner/scanner.go b/src/text/scanner/scanner.go new file mode 100644 index 0000000..f1fbf98 --- /dev/null +++ b/src/text/scanner/scanner.go @@ -0,0 +1,793 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package scanner provides a scanner and tokenizer for UTF-8-encoded text. +// It takes an io.Reader providing the source, which then can be tokenized +// through repeated calls to the Scan function. For compatibility with +// existing tools, the NUL character is not allowed. If the first character +// in the source is a UTF-8 encoded byte order mark (BOM), it is discarded. +// +// By default, a Scanner skips white space and Go comments and recognizes all +// literals as defined by the Go language specification. 
It may be +// customized to recognize only a subset of those literals and to recognize +// different identifier and white space characters. +package scanner + +import ( + "bytes" + "fmt" + "io" + "os" + "unicode" + "unicode/utf8" +) + +// Position is a value that represents a source position. +// A position is valid if Line > 0. +type Position struct { + Filename string // filename, if any + Offset int // byte offset, starting at 0 + Line int // line number, starting at 1 + Column int // column number, starting at 1 (character count per line) +} + +// IsValid reports whether the position is valid. +func (pos *Position) IsValid() bool { return pos.Line > 0 } + +func (pos Position) String() string { + s := pos.Filename + if s == "" { + s = "<input>" + } + if pos.IsValid() { + s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column) + } + return s +} + +// Predefined mode bits to control recognition of tokens. For instance, +// to configure a Scanner such that it only recognizes (Go) identifiers, +// integers, and skips comments, set the Scanner's Mode field to: +// +// ScanIdents | ScanInts | SkipComments +// +// With the exceptions of comments, which are skipped if SkipComments is +// set, unrecognized tokens are not ignored. Instead, the scanner simply +// returns the respective individual characters (or possibly sub-tokens). +// For instance, if the mode is ScanIdents (not ScanStrings), the string +// "foo" is scanned as the token sequence '"' Ident '"'. +// +// Use GoTokens to configure the Scanner such that it accepts all Go +// literal tokens including Go identifiers. Comments will be skipped. 
+// +const ( + ScanIdents = 1 << -Ident + ScanInts = 1 << -Int + ScanFloats = 1 << -Float // includes Ints and hexadecimal floats + ScanChars = 1 << -Char + ScanStrings = 1 << -String + ScanRawStrings = 1 << -RawString + ScanComments = 1 << -Comment + SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space + GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments +) + +// The result of Scan is one of these tokens or a Unicode character. +const ( + EOF = -(iota + 1) + Ident + Int + Float + Char + String + RawString + Comment + + // internal use only + skipComment +) + +var tokenString = map[rune]string{ + EOF: "EOF", + Ident: "Ident", + Int: "Int", + Float: "Float", + Char: "Char", + String: "String", + RawString: "RawString", + Comment: "Comment", +} + +// TokenString returns a printable string for a token or Unicode character. +func TokenString(tok rune) string { + if s, found := tokenString[tok]; found { + return s + } + return fmt.Sprintf("%q", string(tok)) +} + +// GoWhitespace is the default value for the Scanner's Whitespace field. +// Its value selects Go's white space characters. +const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ' + +const bufLen = 1024 // at least utf8.UTFMax + +// A Scanner implements reading of Unicode characters and tokens from an io.Reader. 
+type Scanner struct { + // Input + src io.Reader + + // Source buffer + srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() + srcPos int // reading position (srcBuf index) + srcEnd int // source end (srcBuf index) + + // Source position + srcBufOffset int // byte offset of srcBuf[0] in source + line int // line count + column int // character count + lastLineLen int // length of last line in characters (for correct column reporting) + lastCharLen int // length of last character in bytes + + // Token text buffer + // Typically, token text is stored completely in srcBuf, but in general + // the token text's head may be buffered in tokBuf while the token text's + // tail is stored in srcBuf. + tokBuf bytes.Buffer // token text head that is not in srcBuf anymore + tokPos int // token text tail position (srcBuf index); valid if >= 0 + tokEnd int // token text tail end (srcBuf index) + + // One character look-ahead + ch rune // character before current srcPos + + // Error is called for each error encountered. If no Error + // function is set, the error is reported to os.Stderr. + Error func(s *Scanner, msg string) + + // ErrorCount is incremented by one for each error encountered. + ErrorCount int + + // The Mode field controls which tokens are recognized. For instance, + // to recognize Ints, set the ScanInts bit in Mode. The field may be + // changed at any time. + Mode uint + + // The Whitespace field controls which characters are recognized + // as white space. To recognize a character ch <= ' ' as white space, + // set the ch'th bit in Whitespace (the Scanner's behavior is undefined + // for values ch > ' '). The field may be changed at any time. + Whitespace uint64 + + // IsIdentRune is a predicate controlling the characters accepted + // as the ith rune in an identifier. The set of valid characters + // must not intersect with the set of white space characters. 
+ // If no IsIdentRune function is set, regular Go identifiers are + // accepted instead. The field may be changed at any time. + IsIdentRune func(ch rune, i int) bool + + // Start position of most recently scanned token; set by Scan. + // Calling Init or Next invalidates the position (Line == 0). + // The Filename field is always left untouched by the Scanner. + // If an error is reported (via Error) and Position is invalid, + // the scanner is not inside a token. Call Pos to obtain an error + // position in that case, or to obtain the position immediately + // after the most recently scanned token. + Position +} + +// Init initializes a Scanner with a new source and returns s. +// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, +// and Whitespace is set to GoWhitespace. +func (s *Scanner) Init(src io.Reader) *Scanner { + s.src = src + + // initialize source buffer + // (the first call to next() will fill it by calling src.Read) + s.srcBuf[0] = utf8.RuneSelf // sentinel + s.srcPos = 0 + s.srcEnd = 0 + + // initialize source position + s.srcBufOffset = 0 + s.line = 1 + s.column = 0 + s.lastLineLen = 0 + s.lastCharLen = 0 + + // initialize token text buffer + // (required for first call to next()). + s.tokPos = -1 + + // initialize one character look-ahead + s.ch = -2 // no char read yet, not EOF + + // initialize public fields + s.Error = nil + s.ErrorCount = 0 + s.Mode = GoTokens + s.Whitespace = GoWhitespace + s.Line = 0 // invalidate token position + + return s +} + +// next reads and returns the next Unicode character. It is designed such +// that only a minimal amount of work needs to be done in the common ASCII +// case (one test to check for both ASCII and end-of-buffer, and one test +// to check for newlines). 
+func (s *Scanner) next() rune { + ch, width := rune(s.srcBuf[s.srcPos]), 1 + + if ch >= utf8.RuneSelf { + // uncommon case: not ASCII or not enough bytes + for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) { + // not enough bytes: read some more, but first + // save away token text if any + if s.tokPos >= 0 { + s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) + s.tokPos = 0 + // s.tokEnd is set by Scan() + } + // move unread bytes to beginning of buffer + copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) + s.srcBufOffset += s.srcPos + // read more bytes + // (an io.Reader must return io.EOF when it reaches + // the end of what it is reading - simply returning + // n == 0 will make this loop retry forever; but the + // error is in the reader implementation in that case) + i := s.srcEnd - s.srcPos + n, err := s.src.Read(s.srcBuf[i:bufLen]) + s.srcPos = 0 + s.srcEnd = i + n + s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel + if err != nil { + if err != io.EOF { + s.error(err.Error()) + } + if s.srcEnd == 0 { + if s.lastCharLen > 0 { + // previous character was not EOF + s.column++ + } + s.lastCharLen = 0 + return EOF + } + // If err == EOF, we won't be getting more + // bytes; break to avoid infinite loop. If + // err is something else, we don't know if + // we can get more bytes; thus also break. 
+ break + } + } + // at least one byte + ch = rune(s.srcBuf[s.srcPos]) + if ch >= utf8.RuneSelf { + // uncommon case: not ASCII + ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) + if ch == utf8.RuneError && width == 1 { + // advance for correct error position + s.srcPos += width + s.lastCharLen = width + s.column++ + s.error("invalid UTF-8 encoding") + return ch + } + } + } + + // advance + s.srcPos += width + s.lastCharLen = width + s.column++ + + // special situations + switch ch { + case 0: + // for compatibility with other tools + s.error("invalid character NUL") + case '\n': + s.line++ + s.lastLineLen = s.column + s.column = 0 + } + + return ch +} + +// Next reads and returns the next Unicode character. +// It returns EOF at the end of the source. It reports +// a read error by calling s.Error, if not nil; otherwise +// it prints an error message to os.Stderr. Next does not +// update the Scanner's Position field; use Pos() to +// get the current position. +func (s *Scanner) Next() rune { + s.tokPos = -1 // don't collect token text + s.Line = 0 // invalidate token position + ch := s.Peek() + if ch != EOF { + s.ch = s.next() + } + return ch +} + +// Peek returns the next Unicode character in the source without advancing +// the scanner. It returns EOF if the scanner's position is at the last +// character of the source. 
+func (s *Scanner) Peek() rune { + if s.ch == -2 { + // this code is only run for the very first character + s.ch = s.next() + if s.ch == '\uFEFF' { + s.ch = s.next() // ignore BOM + } + } + return s.ch +} + +func (s *Scanner) error(msg string) { + s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated + s.ErrorCount++ + if s.Error != nil { + s.Error(s, msg) + return + } + pos := s.Position + if !pos.IsValid() { + pos = s.Pos() + } + fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) +} + +func (s *Scanner) errorf(format string, args ...any) { + s.error(fmt.Sprintf(format, args...)) +} + +func (s *Scanner) isIdentRune(ch rune, i int) bool { + if s.IsIdentRune != nil { + return s.IsIdentRune(ch, i) + } + return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 +} + +func (s *Scanner) scanIdentifier() rune { + // we know the zero'th rune is OK; start scanning at the next one + ch := s.next() + for i := 1; s.isIdentRune(ch, i); i++ { + ch = s.next() + } + return ch +} + +func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter +func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } +func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } + +// digits accepts the sequence { digit | '_' } starting with ch0. +// If base <= 10, digits accepts any decimal digit but records +// the first invalid digit >= base in *invalid if *invalid == 0. +// digits returns the first rune that is not part of the sequence +// anymore, and a bitset describing whether the sequence contained +// digits (bit 0 is set), or separators '_' (bit 1 is set). 
+func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) { + ch = ch0 + if base <= 10 { + max := rune('0' + base) + for isDecimal(ch) || ch == '_' { + ds := 1 + if ch == '_' { + ds = 2 + } else if ch >= max && *invalid == 0 { + *invalid = ch + } + digsep |= ds + ch = s.next() + } + } else { + for isHex(ch) || ch == '_' { + ds := 1 + if ch == '_' { + ds = 2 + } + digsep |= ds + ch = s.next() + } + } + return +} + +func (s *Scanner) scanNumber(ch rune, seenDot bool) (rune, rune) { + base := 10 // number base + prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' + digsep := 0 // bit 0: digit present, bit 1: '_' present + invalid := rune(0) // invalid digit in literal, or 0 + + // integer part + var tok rune + var ds int + if !seenDot { + tok = Int + if ch == '0' { + ch = s.next() + switch lower(ch) { + case 'x': + ch = s.next() + base, prefix = 16, 'x' + case 'o': + ch = s.next() + base, prefix = 8, 'o' + case 'b': + ch = s.next() + base, prefix = 2, 'b' + default: + base, prefix = 8, '0' + digsep = 1 // leading 0 + } + } + ch, ds = s.digits(ch, base, &invalid) + digsep |= ds + if ch == '.' 
&& s.Mode&ScanFloats != 0 { + ch = s.next() + seenDot = true + } + } + + // fractional part + if seenDot { + tok = Float + if prefix == 'o' || prefix == 'b' { + s.error("invalid radix point in " + litname(prefix)) + } + ch, ds = s.digits(ch, base, &invalid) + digsep |= ds + } + + if digsep&1 == 0 { + s.error(litname(prefix) + " has no digits") + } + + // exponent + if e := lower(ch); (e == 'e' || e == 'p') && s.Mode&ScanFloats != 0 { + switch { + case e == 'e' && prefix != 0 && prefix != '0': + s.errorf("%q exponent requires decimal mantissa", ch) + case e == 'p' && prefix != 'x': + s.errorf("%q exponent requires hexadecimal mantissa", ch) + } + ch = s.next() + tok = Float + if ch == '+' || ch == '-' { + ch = s.next() + } + ch, ds = s.digits(ch, 10, nil) + digsep |= ds + if ds&1 == 0 { + s.error("exponent has no digits") + } + } else if prefix == 'x' && tok == Float { + s.error("hexadecimal mantissa requires a 'p' exponent") + } + + if tok == Int && invalid != 0 { + s.errorf("invalid digit %q in %s", invalid, litname(prefix)) + } + + if digsep&2 != 0 { + s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated + if i := invalidSep(s.TokenText()); i >= 0 { + s.error("'_' must separate successive digits") + } + } + + return tok, ch +} + +func litname(prefix rune) string { + switch prefix { + default: + return "decimal literal" + case 'x': + return "hexadecimal literal" + case 'o', '0': + return "octal literal" + case 'b': + return "binary literal" + } +} + +// invalidSep returns the index of the first invalid separator in x, or -1. +func invalidSep(x string) int { + x1 := ' ' // prefix char, we only care if it's 'x' + d := '.' // digit, one of '_', '0' (a digit), or '.' 
(anything else) + i := 0 + + // a prefix counts as a digit + if len(x) >= 2 && x[0] == '0' { + x1 = lower(rune(x[1])) + if x1 == 'x' || x1 == 'o' || x1 == 'b' { + d = '0' + i = 2 + } + } + + // mantissa and exponent + for ; i < len(x); i++ { + p := d // previous digit + d = rune(x[i]) + switch { + case d == '_': + if p != '0' { + return i + } + case isDecimal(d) || x1 == 'x' && isHex(d): + d = '0' + default: + if p == '_' { + return i - 1 + } + d = '.' + } + } + if d == '_' { + return len(x) - 1 + } + + return -1 +} + +func digitVal(ch rune) int { + switch { + case '0' <= ch && ch <= '9': + return int(ch - '0') + case 'a' <= lower(ch) && lower(ch) <= 'f': + return int(lower(ch) - 'a' + 10) + } + return 16 // larger than any legal digit val +} + +func (s *Scanner) scanDigits(ch rune, base, n int) rune { + for n > 0 && digitVal(ch) < base { + ch = s.next() + n-- + } + if n > 0 { + s.error("invalid char escape") + } + return ch +} + +func (s *Scanner) scanEscape(quote rune) rune { + ch := s.next() // read character after '/' + switch ch { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: + // nothing to do + ch = s.next() + case '0', '1', '2', '3', '4', '5', '6', '7': + ch = s.scanDigits(ch, 8, 3) + case 'x': + ch = s.scanDigits(s.next(), 16, 2) + case 'u': + ch = s.scanDigits(s.next(), 16, 4) + case 'U': + ch = s.scanDigits(s.next(), 16, 8) + default: + s.error("invalid char escape") + } + return ch +} + +func (s *Scanner) scanString(quote rune) (n int) { + ch := s.next() // read character after quote + for ch != quote { + if ch == '\n' || ch < 0 { + s.error("literal not terminated") + return + } + if ch == '\\' { + ch = s.scanEscape(quote) + } else { + ch = s.next() + } + n++ + } + return +} + +func (s *Scanner) scanRawString() { + ch := s.next() // read character after '`' + for ch != '`' { + if ch < 0 { + s.error("literal not terminated") + return + } + ch = s.next() + } +} + +func (s *Scanner) scanChar() { + if s.scanString('\'') != 1 { + s.error("invalid 
char literal") + } +} + +func (s *Scanner) scanComment(ch rune) rune { + // ch == '/' || ch == '*' + if ch == '/' { + // line comment + ch = s.next() // read character after "//" + for ch != '\n' && ch >= 0 { + ch = s.next() + } + return ch + } + + // general comment + ch = s.next() // read character after "/*" + for { + if ch < 0 { + s.error("comment not terminated") + break + } + ch0 := ch + ch = s.next() + if ch0 == '*' && ch == '/' { + ch = s.next() + break + } + } + return ch +} + +// Scan reads the next token or Unicode character from source and returns it. +// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. +// It returns EOF at the end of the source. It reports scanner errors (read and +// token errors) by calling s.Error, if not nil; otherwise it prints an error +// message to os.Stderr. +func (s *Scanner) Scan() rune { + ch := s.Peek() + + // reset token text position + s.tokPos = -1 + s.Line = 0 + +redo: + // skip white space + for s.Whitespace&(1<<uint(ch)) != 0 { + ch = s.next() + } + + // start collecting token text + s.tokBuf.Reset() + s.tokPos = s.srcPos - s.lastCharLen + + // set token position + // (this is a slightly optimized version of the code in Pos()) + s.Offset = s.srcBufOffset + s.tokPos + if s.column > 0 { + // common case: last character was not a '\n' + s.Line = s.line + s.Column = s.column + } else { + // last character was a '\n' + // (we cannot be at the beginning of the source + // since we have called next() at least once) + s.Line = s.line - 1 + s.Column = s.lastLineLen + } + + // determine token value + tok := ch + switch { + case s.isIdentRune(ch, 0): + if s.Mode&ScanIdents != 0 { + tok = Ident + ch = s.scanIdentifier() + } else { + ch = s.next() + } + case isDecimal(ch): + if s.Mode&(ScanInts|ScanFloats) != 0 { + tok, ch = s.scanNumber(ch, false) + } else { + ch = s.next() + } + default: + switch ch { + case EOF: + break + case '"': + if s.Mode&ScanStrings != 0 { + s.scanString('"') + tok = String 
+ } + ch = s.next() + case '\'': + if s.Mode&ScanChars != 0 { + s.scanChar() + tok = Char + } + ch = s.next() + case '.': + ch = s.next() + if isDecimal(ch) && s.Mode&ScanFloats != 0 { + tok, ch = s.scanNumber(ch, true) + } + case '/': + ch = s.next() + if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { + if s.Mode&SkipComments != 0 { + s.tokPos = -1 // don't collect token text + ch = s.scanComment(ch) + goto redo + } + ch = s.scanComment(ch) + tok = Comment + } + case '`': + if s.Mode&ScanRawStrings != 0 { + s.scanRawString() + tok = RawString + } + ch = s.next() + default: + ch = s.next() + } + } + + // end of token text + s.tokEnd = s.srcPos - s.lastCharLen + + s.ch = ch + return tok +} + +// Pos returns the position of the character immediately after +// the character or token returned by the last call to Next or Scan. +// Use the Scanner's Position field for the start position of the most +// recently scanned token. +func (s *Scanner) Pos() (pos Position) { + pos.Filename = s.Filename + pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen + switch { + case s.column > 0: + // common case: last character was not a '\n' + pos.Line = s.line + pos.Column = s.column + case s.lastLineLen > 0: + // last character was a '\n' + pos.Line = s.line - 1 + pos.Column = s.lastLineLen + default: + // at the beginning of the source + pos.Line = 1 + pos.Column = 1 + } + return +} + +// TokenText returns the string corresponding to the most recently scanned token. +// Valid after calling Scan and in calls of Scanner.Error. 
+func (s *Scanner) TokenText() string { + if s.tokPos < 0 { + // no token text + return "" + } + + if s.tokEnd < s.tokPos { + // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) + s.tokEnd = s.tokPos + } + // s.tokEnd >= s.tokPos + + if s.tokBuf.Len() == 0 { + // common case: the entire token text is still in srcBuf + return string(s.srcBuf[s.tokPos:s.tokEnd]) + } + + // part of the token text was saved in tokBuf: save the rest in + // tokBuf as well and return its content + s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) + s.tokPos = s.tokEnd // ensure idempotency of TokenText() call + return s.tokBuf.String() +} diff --git a/src/text/scanner/scanner_test.go b/src/text/scanner/scanner_test.go new file mode 100644 index 0000000..fe39d30 --- /dev/null +++ b/src/text/scanner/scanner_test.go @@ -0,0 +1,915 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner + +import ( + "bytes" + "fmt" + "io" + "strings" + "testing" + "unicode/utf8" +) + +// A StringReader delivers its data one string segment at a time via Read. 
+type StringReader struct { + data []string + step int +} + +func (r *StringReader) Read(p []byte) (n int, err error) { + if r.step < len(r.data) { + s := r.data[r.step] + n = copy(p, s) + r.step++ + } else { + err = io.EOF + } + return +} + +func readRuneSegments(t *testing.T, segments []string) { + got := "" + want := strings.Join(segments, "") + s := new(Scanner).Init(&StringReader{data: segments}) + for { + ch := s.Next() + if ch == EOF { + break + } + got += string(ch) + } + if got != want { + t.Errorf("segments=%v got=%s want=%s", segments, got, want) + } +} + +var segmentList = [][]string{ + {}, + {""}, + {"日", "本語"}, + {"\u65e5", "\u672c", "\u8a9e"}, + {"\U000065e5", " ", "\U0000672c", "\U00008a9e"}, + {"\xe6", "\x97\xa5\xe6", "\x9c\xac\xe8\xaa\x9e"}, + {"Hello", ", ", "World", "!"}, + {"Hello", ", ", "", "World", "!"}, +} + +func TestNext(t *testing.T) { + for _, s := range segmentList { + readRuneSegments(t, s) + } +} + +type token struct { + tok rune + text string +} + +var f100 = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" + +var tokenList = []token{ + {Comment, "// line comments"}, + {Comment, "//"}, + {Comment, "////"}, + {Comment, "// comment"}, + {Comment, "// /* comment */"}, + {Comment, "// // comment //"}, + {Comment, "//" + f100}, + + {Comment, "// general comments"}, + {Comment, "/**/"}, + {Comment, "/***/"}, + {Comment, "/* comment */"}, + {Comment, "/* // comment */"}, + {Comment, "/* /* comment */"}, + {Comment, "/*\n comment\n*/"}, + {Comment, "/*" + f100 + "*/"}, + + {Comment, "// identifiers"}, + {Ident, "a"}, + {Ident, "a0"}, + {Ident, "foobar"}, + {Ident, "abc123"}, + {Ident, "LGTM"}, + {Ident, "_"}, + {Ident, "_abc123"}, + {Ident, "abc123_"}, + {Ident, "_abc_123_"}, + {Ident, "_äöü"}, + {Ident, "_本"}, + {Ident, "äöü"}, + {Ident, "本"}, + {Ident, "a۰۱۸"}, + {Ident, "foo६४"}, + {Ident, "bar9876"}, + {Ident, f100}, + + {Comment, "// decimal ints"}, + {Int, "0"}, + {Int, "1"}, + 
{Int, "9"}, + {Int, "42"}, + {Int, "1234567890"}, + + {Comment, "// octal ints"}, + {Int, "00"}, + {Int, "01"}, + {Int, "07"}, + {Int, "042"}, + {Int, "01234567"}, + + {Comment, "// hexadecimal ints"}, + {Int, "0x0"}, + {Int, "0x1"}, + {Int, "0xf"}, + {Int, "0x42"}, + {Int, "0x123456789abcDEF"}, + {Int, "0x" + f100}, + {Int, "0X0"}, + {Int, "0X1"}, + {Int, "0XF"}, + {Int, "0X42"}, + {Int, "0X123456789abcDEF"}, + {Int, "0X" + f100}, + + {Comment, "// floats"}, + {Float, "0."}, + {Float, "1."}, + {Float, "42."}, + {Float, "01234567890."}, + {Float, ".0"}, + {Float, ".1"}, + {Float, ".42"}, + {Float, ".0123456789"}, + {Float, "0.0"}, + {Float, "1.0"}, + {Float, "42.0"}, + {Float, "01234567890.0"}, + {Float, "0e0"}, + {Float, "1e0"}, + {Float, "42e0"}, + {Float, "01234567890e0"}, + {Float, "0E0"}, + {Float, "1E0"}, + {Float, "42E0"}, + {Float, "01234567890E0"}, + {Float, "0e+10"}, + {Float, "1e-10"}, + {Float, "42e+10"}, + {Float, "01234567890e-10"}, + {Float, "0E+10"}, + {Float, "1E-10"}, + {Float, "42E+10"}, + {Float, "01234567890E-10"}, + + {Comment, "// chars"}, + {Char, `' '`}, + {Char, `'a'`}, + {Char, `'本'`}, + {Char, `'\a'`}, + {Char, `'\b'`}, + {Char, `'\f'`}, + {Char, `'\n'`}, + {Char, `'\r'`}, + {Char, `'\t'`}, + {Char, `'\v'`}, + {Char, `'\''`}, + {Char, `'\000'`}, + {Char, `'\777'`}, + {Char, `'\x00'`}, + {Char, `'\xff'`}, + {Char, `'\u0000'`}, + {Char, `'\ufA16'`}, + {Char, `'\U00000000'`}, + {Char, `'\U0000ffAB'`}, + + {Comment, "// strings"}, + {String, `" "`}, + {String, `"a"`}, + {String, `"本"`}, + {String, `"\a"`}, + {String, `"\b"`}, + {String, `"\f"`}, + {String, `"\n"`}, + {String, `"\r"`}, + {String, `"\t"`}, + {String, `"\v"`}, + {String, `"\""`}, + {String, `"\000"`}, + {String, `"\777"`}, + {String, `"\x00"`}, + {String, `"\xff"`}, + {String, `"\u0000"`}, + {String, `"\ufA16"`}, + {String, `"\U00000000"`}, + {String, `"\U0000ffAB"`}, + {String, `"` + f100 + `"`}, + + {Comment, "// raw strings"}, + {RawString, "``"}, + {RawString, "`\\`"}, + 
{RawString, "`" + "\n\n/* foobar */\n\n" + "`"},
	{RawString, "`" + f100 + "`"},

	{Comment, "// individual characters"},
	// NUL character is not allowed
	{'\x01', "\x01"},
	{' ' - 1, string(' ' - 1)},
	{'+', "+"},
	{'/', "/"},
	{'.', "."},
	{'~', "~"},
	{'(', "("},
}

// makeSource formats the text of every tokenList entry with pattern
// (which is expected to contain a single %s verb) and returns the
// concatenated result as a buffer that can be handed to Scanner.Init.
func makeSource(pattern string) *bytes.Buffer {
	var buf bytes.Buffer
	for _, k := range tokenList {
		fmt.Fprintf(&buf, pattern, k.text)
	}
	return &buf
}

// checkTok verifies that the token got equals want, that the scanner's
// current line is line, and that s.TokenText() returns text. A token
// mismatch is fatal; line and text mismatches are reported as errors.
func checkTok(t *testing.T, s *Scanner, line int, got, want rune, text string) {
	t.Helper() // attribute failures to the calling test
	if got != want {
		t.Fatalf("tok = %s, want %s for %q", TokenString(got), TokenString(want), text)
	}
	if s.Line != line {
		t.Errorf("line = %d, want %d for %q", s.Line, line, text)
	}
	stext := s.TokenText()
	if stext != text {
		t.Errorf("text = %q, want %q", stext, text)
		return
	}
	// check idempotency of TokenText() call
	if stext = s.TokenText(); stext != text {
		t.Errorf("text = %q, want %q (idempotency check)", stext, text)
	}
}

// checkTokErr scans the next token, checks it with checkTok, and
// requires that the scan increased s.ErrorCount by exactly one.
func checkTokErr(t *testing.T, s *Scanner, line int, want rune, text string) {
	t.Helper() // attribute failures to the calling test
	prevCount := s.ErrorCount
	checkTok(t, s, line, s.Scan(), want, text)
	if s.ErrorCount != prevCount+1 {
		t.Fatalf("want error for %q", text)
	}
}

// countNewlines returns the number of '\n' characters in s.
func countNewlines(s string) int {
	return strings.Count(s, "\n")
}

// testScan scans a source containing every tokenList entry on its own
// line using the given mode and checks each token, its line, and its
// text. Comment entries are not expected when mode has SkipComments set.
func testScan(t *testing.T, mode uint) {
	t.Helper()
	s := new(Scanner).Init(makeSource(" \t%s\n"))
	s.Mode = mode
	tok := s.Scan()
	line := 1
	for _, k := range tokenList {
		if mode&SkipComments == 0 || k.tok != Comment {
			checkTok(t, s, line, tok, k.tok, k.text)
			tok = s.Scan()
		}
		line += countNewlines(k.text) + 1 // each token is on a new line
	}
	checkTok(t, s, line, tok, EOF, "")
}

// TestScan runs testScan with comments skipped and with comments returned.
func TestScan(t *testing.T) {
	testScan(t, GoTokens)
	testScan(t, GoTokens&^SkipComments)
}

// TestInvalidExponent checks that floats with a missing exponent are
// still returned as Float tokens and that each reports exactly one
// "exponent has no digits" error.
func TestInvalidExponent(t *testing.T) {
	const src = "1.5e 1.5E 1e+ 1e- 1.5z"
	s := new(Scanner).Init(strings.NewReader(src))
	s.Error = func(s *Scanner, msg string) {
		const want = "exponent has no digits"
		if msg != want {
			t.Errorf("%s: got error %q; want %q", s.TokenText(), msg, want)
		}
	}
	checkTokErr(t, s, 1, Float, "1.5e")
	checkTokErr(t, s, 1, Float, "1.5E")
	checkTokErr(t, s, 1, Float, "1e+")
	checkTokErr(t, s, 1, Float, "1e-")
	checkTok(t, s, 1, s.Scan(), Float, "1.5")
	checkTok(t, s, 1, s.Scan(), Ident, "z")
	checkTok(t, s, 1, s.Scan(), EOF, "")
	if s.ErrorCount != 4 {
		t.Errorf("%d errors, want 4", s.ErrorCount)
	}
}

// TestPosition checks the offset, line, and column recorded for each
// token when every tokenList entry is preceded by four tabs and
// followed by a newline.
func TestPosition(t *testing.T) {
	src := makeSource("\t\t\t\t%s\n")
	s := new(Scanner).Init(src)
	s.Mode = GoTokens &^ SkipComments
	s.Scan()
	pos := Position{Offset: 4, Line: 1, Column: 5}
	for _, k := range tokenList {
		if s.Offset != pos.Offset {
			t.Errorf("offset = %d, want %d for %q", s.Offset, pos.Offset, k.text)
		}
		if s.Line != pos.Line {
			t.Errorf("line = %d, want %d for %q", s.Line, pos.Line, k.text)
		}
		if s.Column != pos.Column {
			t.Errorf("column = %d, want %d for %q", s.Column, pos.Column, k.text)
		}
		pos.Offset += 4 + len(k.text) + 1     // 4 tabs + token bytes + newline
		pos.Line += countNewlines(k.text) + 1 // each token is on a new line
		s.Scan()
	}
	// make sure there were no token-internal errors reported by scanner
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}
}

// TestScanZeroMode checks that with Mode and Whitespace both zero the
// scanner returns the source one character at a time.
func TestScanZeroMode(t *testing.T) {
	src := makeSource("%s\n")
	str := src.String()
	s := new(Scanner).Init(src)
	s.Mode = 0       // don't recognize any token classes
	s.Whitespace = 0 // don't skip any whitespace
	tok := s.Scan()
	for i, ch := range str {
		if tok != ch {
			t.Fatalf("%d. tok = %s, want %s", i, TokenString(tok), TokenString(ch))
		}
		tok = s.Scan()
	}
	if tok != EOF {
		t.Fatalf("tok = %s, want EOF", TokenString(tok))
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}
}

// testScanSelectedMode scans the token source with the given mode and
// requires that every negative (token class) result equals class.
func testScanSelectedMode(t *testing.T, mode uint, class rune) {
	t.Helper()
	src := makeSource("%s\n")
	s := new(Scanner).Init(src)
	s.Mode = mode
	for tok := s.Scan(); tok != EOF; tok = s.Scan() {
		if tok < 0 && tok != class {
			t.Fatalf("tok = %s, want %s", TokenString(tok), TokenString(class))
		}
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}
}

// TestScanSelectedMask runs testScanSelectedMode for individual mode bits.
func TestScanSelectedMask(t *testing.T) {
	testScanSelectedMode(t, 0, 0)
	testScanSelectedMode(t, ScanIdents, Ident)
	// Don't test ScanInts and ScanNumbers since some parts of
	// the floats in the source look like (invalid) octal ints
	// and ScanNumbers may return either Int or Float.
	testScanSelectedMode(t, ScanChars, Char)
	testScanSelectedMode(t, ScanStrings, String)
	testScanSelectedMode(t, SkipComments, 0)
	testScanSelectedMode(t, ScanComments, Comment)
}

// TestScanCustomIdent checks tokenization with a custom IsIdentRune
// predicate.
func TestScanCustomIdent(t *testing.T) {
	const src = "faab12345 a12b123 a12 3b"
	s := new(Scanner).Init(strings.NewReader(src))
	// ident = ( 'a' | 'b' ) { digit } .
	// digit = '0' .. '3' .
	// with a maximum length of 4
	s.IsIdentRune = func(ch rune, i int) bool {
		return i == 0 && (ch == 'a' || ch == 'b') || 0 < i && i < 4 && '0' <= ch && ch <= '3'
	}
	checkTok(t, s, 1, s.Scan(), 'f', "f")
	checkTok(t, s, 1, s.Scan(), Ident, "a")
	checkTok(t, s, 1, s.Scan(), Ident, "a")
	checkTok(t, s, 1, s.Scan(), Ident, "b123")
	checkTok(t, s, 1, s.Scan(), Int, "45")
	checkTok(t, s, 1, s.Scan(), Ident, "a12")
	checkTok(t, s, 1, s.Scan(), Ident, "b123")
	checkTok(t, s, 1, s.Scan(), Ident, "a12")
	checkTok(t, s, 1, s.Scan(), Int, "3")
	checkTok(t, s, 1, s.Scan(), Ident, "b")
	checkTok(t, s, 1, s.Scan(), EOF, "")
}

// TestScanNext interleaves Scan and Next calls on a source containing
// byte order marks. The checks after Next expect line 0 and an empty
// token text.
func TestScanNext(t *testing.T) {
	const BOM = '\uFEFF'
	BOMs := string(BOM)
	s := new(Scanner).Init(strings.NewReader(BOMs + "if a == bcd /* com" + BOMs + "ment */ {\n\ta += c\n}" + BOMs + "// line comment ending in eof"))
	checkTok(t, s, 1, s.Scan(), Ident, "if") // the first BOM is ignored
	checkTok(t, s, 1, s.Scan(), Ident, "a")
	checkTok(t, s, 1, s.Scan(), '=', "=")
	checkTok(t, s, 0, s.Next(), '=', "")
	checkTok(t, s, 0, s.Next(), ' ', "")
	checkTok(t, s, 0, s.Next(), 'b', "")
	checkTok(t, s, 1, s.Scan(), Ident, "cd")
	checkTok(t, s, 1, s.Scan(), '{', "{")
	checkTok(t, s, 2, s.Scan(), Ident, "a")
	checkTok(t, s, 2, s.Scan(), '+', "+")
	checkTok(t, s, 0, s.Next(), '=', "")
	checkTok(t, s, 2, s.Scan(), Ident, "c")
	checkTok(t, s, 3, s.Scan(), '}', "}")
	checkTok(t, s, 3, s.Scan(), BOM, BOMs)
	checkTok(t, s, 3, s.Scan(), EOF, "")
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}
}

// TestScanWhitespace marks every character in the range 1..' '-1 as
// whitespace and checks that Scan skips all of them and returns the
// character that follows.
func TestScanWhitespace(t *testing.T) {
	var buf bytes.Buffer
	var ws uint64
	// start at 1, NUL character is not allowed
	for ch := byte(1); ch < ' '; ch++ {
		buf.WriteByte(ch)
		ws |= 1 << ch
	}
	const orig = 'x'
	buf.WriteByte(orig)

	s := new(Scanner).Init(&buf)
	s.Mode = 0
	s.Whitespace = ws
	tok := s.Scan()
	if tok != orig {
		t.Errorf("tok = %s, want %s", TokenString(tok), TokenString(orig))
	}
}

// testError scans the first token of src and checks that the result is
// tok, that the error handler was invoked, and that the first reported
// error has position pos and message msg.
func testError(t *testing.T, src, pos, msg string, tok rune) {
	t.Helper() // attribute failures to the calling test
	s := new(Scanner).Init(strings.NewReader(src))
	errorCalled := false
	s.Error = func(s *Scanner, m string) {
		if errorCalled {
			// only look at first error
			return
		}
		if p := s.Pos().String(); p != pos {
			t.Errorf("pos = %q, want %q for %q", p, pos, src)
		}
		if m != msg {
			t.Errorf("msg = %q, want %q for %q", m, msg, src)
		}
		errorCalled = true
	}
	tk := s.Scan()
	if tk != tok {
		t.Errorf("tok = %s, want %s for %q", TokenString(tk), TokenString(tok), src)
	}
	if !errorCalled {
		t.Errorf("error handler not called for %q", src)
	}
	if s.ErrorCount == 0 {
		t.Errorf("count = %d, want > 0 for %q", s.ErrorCount, src)
	}
}

// TestError checks the position and message of the first error reported
// for a range of malformed inputs.
func TestError(t *testing.T) {
	testError(t, "\x00", "<input>:1:1", "invalid character NUL", 0)
	testError(t, "\x80", "<input>:1:1", "invalid UTF-8 encoding", utf8.RuneError)
	testError(t, "\xff", "<input>:1:1", "invalid UTF-8 encoding", utf8.RuneError)

	testError(t, "a\x00", "<input>:1:2", "invalid character NUL", Ident)
	testError(t, "ab\x80", "<input>:1:3", "invalid UTF-8 encoding", Ident)
	testError(t, "abc\xff", "<input>:1:4", "invalid UTF-8 encoding", Ident)

	testError(t, `"a`+"\x00", "<input>:1:3", "invalid character NUL", String)
	testError(t, `"ab`+"\x80", "<input>:1:4", "invalid UTF-8 encoding", String)
	testError(t, `"abc`+"\xff", "<input>:1:5", "invalid UTF-8 encoding", String)

	testError(t, "`a"+"\x00", "<input>:1:3", "invalid character NUL", RawString)
	testError(t, "`ab"+"\x80", "<input>:1:4", "invalid UTF-8 encoding", RawString)
	testError(t, "`abc"+"\xff", "<input>:1:5", "invalid UTF-8 encoding", RawString)

	testError(t, `'\"'`, "<input>:1:3", "invalid char escape", Char)
	testError(t, `"\'"`, "<input>:1:3", "invalid char escape", String)

	testError(t, `01238`, "<input>:1:6", "invalid digit '8' in octal literal", Int)
	testError(t, `01238123`, "<input>:1:9", "invalid digit '8' in octal literal", Int)
	testError(t, `0x`, "<input>:1:3", "hexadecimal literal has no digits", Int)
	testError(t, `0xg`, "<input>:1:3", "hexadecimal literal has no digits", Int)
	testError(t, `'aa'`, "<input>:1:4", "invalid char literal", Char)
	testError(t, `1.5e`, "<input>:1:5", "exponent has no digits", Float)
	testError(t, `1.5E`, "<input>:1:5", "exponent has no digits", Float)
	testError(t, `1.5e+`, "<input>:1:6", "exponent has no digits", Float)
	testError(t, `1.5e-`, "<input>:1:6", "exponent has no digits", Float)

	testError(t, `'`, "<input>:1:2", "literal not terminated", Char)
	testError(t, `'`+"\n", "<input>:1:2", "literal not terminated", Char)
	testError(t, `"abc`, "<input>:1:5", "literal not terminated", String)
	testError(t, `"abc`+"\n", "<input>:1:5", "literal not terminated", String)
	testError(t, "`abc\n", "<input>:2:1", "literal not terminated", RawString)
	testError(t, `/*/`, "<input>:1:4", "comment not terminated", EOF)
}

// An errReader returns (0, err) where err is not io.EOF.
type errReader struct{}

// Read always fails with io.ErrNoProgress and reads nothing.
func (errReader) Read(b []byte) (int, error) {
	return 0, io.ErrNoProgress // some error that is not io.EOF
}

// TestIOError checks that a read error other than io.EOF is passed to
// the error handler and that Scan then returns EOF.
func TestIOError(t *testing.T) {
	s := new(Scanner).Init(errReader{})
	errorCalled := false
	s.Error = func(s *Scanner, msg string) {
		if errorCalled {
			return
		}
		if want := io.ErrNoProgress.Error(); msg != want {
			t.Errorf("msg = %q, want %q", msg, want)
		}
		errorCalled = true
	}
	tok := s.Scan()
	if tok != EOF {
		t.Errorf("tok = %s, want EOF", TokenString(tok))
	}
	if !errorCalled {
		t.Errorf("error handler not called")
	}
}

// checkPos reports an error if got and want differ in offset, line, or
// column. The Filename field is not compared.
func checkPos(t *testing.T, got, want Position) {
	t.Helper() // attribute failures to the calling test
	if got.Offset != want.Offset || got.Line != want.Line || got.Column != want.Column {
		t.Errorf("got offset, line, column = %d, %d, %d; want %d, %d, %d",
			got.Offset, got.Line, got.Column, want.Offset, want.Line, want.Column)
	}
}

// checkNextPos calls s.Next, checks that it returns char, and then
// checks that s.Pos() reports the given offset, line, and column.
func checkNextPos(t *testing.T, s *Scanner, offset, line, column int, char rune) {
	t.Helper() // attribute failures to the calling test
	if ch := s.Next(); ch != char {
		t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char))
	}
	want := Position{Offset: offset, Line: line, Column: column}
	checkPos(t, s.Pos(), want)
}

// checkScanPos checks that s.Pos() reports the given offset, line, and
// column, then calls s.Scan and checks that it returns char and that
// s.Position matches the same coordinates.
func checkScanPos(t *testing.T, s *Scanner, offset, line, column int, char rune) {
	t.Helper() // attribute failures to the calling test
	want := Position{Offset: offset, Line: line, Column: column}
	checkPos(t, s.Pos(), want)
	if ch := s.Scan(); ch != char {
		t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char))
		// The token text is compared only after a token mismatch, as
		// additional diagnostic output.
		if string(ch) != s.TokenText() {
			t.Errorf("tok = %q, want %q", s.TokenText(), string(ch))
		}
	}
	checkPos(t, s.Position, want)
}

// TestPos checks the positions reported by Pos and Position while
// stepping through several small sources with Next and Scan, including
// repeated calls after EOF.
func TestPos(t *testing.T) {
	// corner case: empty source
	s := new(Scanner).Init(strings.NewReader(""))
	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
	s.Peek() // peek doesn't affect the position
	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})

	// corner case: source with only a newline
	s = new(Scanner).Init(strings.NewReader("\n"))
	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
	checkNextPos(t, s, 1, 2, 1, '\n')
	// after EOF position doesn't change
	for i := 10; i > 0; i-- {
		checkScanPos(t, s, 1, 2, 1, EOF)
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}

	// corner case: source with only a single character
	s = new(Scanner).Init(strings.NewReader("本"))
	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
	checkNextPos(t, s, 3, 1, 2, '本')
	// after EOF position doesn't change
	for i := 10; i > 0; i-- {
		checkScanPos(t, s, 3, 1, 2, EOF)
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}

	// positions after calling Next
	// The source starts with two spaces and has two spaces before the
	// first newline; the offsets checked below depend on both pairs.
	s = new(Scanner).Init(strings.NewReader("  foo६४  \n\n本語\n"))
	checkNextPos(t, s, 1, 1, 2, ' ')
	s.Peek() // peek doesn't affect the position
	checkNextPos(t, s, 2, 1, 3, ' ')
	checkNextPos(t, s, 3, 1, 4, 'f')
	checkNextPos(t, s, 4, 1, 5, 'o')
	checkNextPos(t, s, 5, 1, 6, 'o')
	checkNextPos(t, s, 8, 1, 7, '६')
	checkNextPos(t, s, 11, 1, 8, '४')
	checkNextPos(t, s, 12, 1, 9, ' ')
	checkNextPos(t, s, 13, 1, 10, ' ')
	checkNextPos(t, s, 14, 2, 1, '\n')
	checkNextPos(t, s, 15, 3, 1, '\n')
	checkNextPos(t, s, 18, 3, 2, '本')
	checkNextPos(t, s, 21, 3, 3, '語')
	checkNextPos(t, s, 22, 4, 1, '\n')
	// after EOF position doesn't change
	for i := 10; i > 0; i-- {
		checkScanPos(t, s, 22, 4, 1, EOF)
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}

	// positions after calling Scan
	s = new(Scanner).Init(strings.NewReader("abc\n本語\n\nx"))
	s.Mode = 0
	s.Whitespace = 0
	checkScanPos(t, s, 0, 1, 1, 'a')
	s.Peek() // peek doesn't affect the position
	checkScanPos(t, s, 1, 1, 2, 'b')
	checkScanPos(t, s, 2, 1, 3, 'c')
	checkScanPos(t, s, 3, 1, 4, '\n')
	checkScanPos(t, s, 4, 2, 1, '本')
	checkScanPos(t, s, 7, 2, 2, '語')
	checkScanPos(t, s, 10, 2, 3, '\n')
	checkScanPos(t, s, 11, 3, 1, '\n')
	checkScanPos(t, s, 12, 4, 1, 'x')
	// after EOF position doesn't change
	for i := 10; i > 0; i-- {
		checkScanPos(t, s, 13, 4, 2, EOF)
	}
	if s.ErrorCount != 0 {
		t.Errorf("%d errors", s.ErrorCount)
	}
}

// A countReader counts how many times Read is called; every call
// returns (0, io.EOF).
type countReader int

// Read increments the counter and reports io.EOF without reading.
func (r *countReader) Read([]byte) (int, error) {
	*r++
	return 0, io.EOF
}

// TestNextEOFHandling checks that Next followed by Peek on an empty
// source both return EOF while the reader is consulted only once.
func TestNextEOFHandling(t *testing.T) {
	var r countReader

	// corner case: empty source
	s := new(Scanner).Init(&r)

	tok := s.Next()
	if tok != EOF {
		t.Error("1) EOF not reported")
	}

	tok = s.Peek()
	if tok != EOF {
		t.Error("2) EOF not reported")
	}

	if r != 1 {
		t.Errorf("scanner called Read %d times, not once", r)
	}
}

// TestScanEOFHandling checks that Scan followed by Peek on an empty
// source both return EOF while the reader is consulted only once.
func TestScanEOFHandling(t *testing.T) {
	var r countReader

	// corner case: empty source
	s := new(Scanner).Init(&r)

	tok := s.Scan()
	if tok != EOF {
		t.Error("1) EOF not reported")
	}

	tok = s.Peek()
	if tok != EOF {
		t.Error("2) EOF not reported")
	}

	if r != 1 {
		t.Errorf("scanner called Read %d times, not once", r)
	}
}

// TestIssue29723 checks that TokenText can be called from within the
// error handler and returns the text of the unterminated string.
func TestIssue29723(t *testing.T) {
	s := new(Scanner).Init(strings.NewReader(`x "`))
	s.Error = func(s *Scanner, _ string) {
		got := s.TokenText() // this call shouldn't panic
		const want = `"`
		if got != want {
			t.Errorf("got %q; want %q", got, want)
		}
	}
	// Drain the scanner; the checks happen in the error handler.
	for s.Scan() != EOF {
	}
}

// TestNumbers scans each src and checks the class and first error of
// the first token as well as the text of every token. The expected
// token texts are listed space-separated in tokens.
func TestNumbers(t *testing.T) {
	for _, test := range []struct {
		tok              rune
		src, tokens, err string
	}{
		// binaries
		{Int, "0b0", "0b0", ""},
		{Int, "0b1010", "0b1010", ""},
		{Int, "0B1110", "0B1110", ""},

		{Int, "0b", "0b", "binary literal has no digits"},
		{Int, "0b0190", "0b0190", "invalid digit '9' in binary literal"},
		{Int, "0b01a0", "0b01 a0", ""}, // only accept 0-9

		// binary floats (invalid)
		{Float, "0b.", "0b.", "invalid radix point in binary literal"},
		{Float, "0b.1", "0b.1", "invalid radix point in binary literal"},
		{Float, "0b1.0", "0b1.0", "invalid radix point in binary literal"},
		{Float, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"},
		{Float, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"},

		// octals
		{Int, "0o0", "0o0", ""},
		{Int, "0o1234", "0o1234", ""},
		{Int, "0O1234", "0O1234", ""},

		{Int, "0o", "0o", "octal literal has no digits"},
		{Int, "0o8123", "0o8123", "invalid digit '8' in octal literal"},
		{Int, "0o1293", "0o1293", "invalid digit '9' in octal literal"},
		{Int, "0o12a3", "0o12 a3", ""}, // only accept 0-9

		// octal floats (invalid)
		{Float, "0o.", "0o.", "invalid radix point in octal literal"},
		{Float, "0o.2", "0o.2", "invalid radix point in octal literal"},
		{Float, "0o1.2", "0o1.2", "invalid radix point in octal literal"},
		{Float, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"},
		{Float, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"},

		// 0-octals
		{Int, "0", "0", ""},
		{Int, "0123", "0123", ""},

		{Int, "08123", "08123", "invalid digit '8' in octal literal"},
		{Int, "01293", "01293", "invalid digit '9' in octal literal"},
		{Int, "0F.", "0 F .", ""}, // only accept 0-9
		{Int, "0123F.", "0123 F .", ""},
		{Int, "0123456x", "0123456 x", ""},

		// decimals
		{Int, "1", "1", ""},
		{Int, "1234", "1234", ""},

		{Int, "1f", "1 f", ""}, // only accept 0-9

		// decimal floats
		{Float, "0.", "0.", ""},
		{Float, "123.", "123.", ""},
		{Float, "0123.", "0123.", ""},

		{Float, ".0", ".0", ""},
		{Float, ".123", ".123", ""},
		{Float, ".0123", ".0123", ""},

		{Float, "0.0", "0.0", ""},
		{Float, "123.123", "123.123", ""},
		{Float, "0123.0123", "0123.0123", ""},

		{Float, "0e0", "0e0", ""},
		{Float, "123e+0", "123e+0", ""},
		{Float, "0123E-1", "0123E-1", ""},

		{Float, "0.e+1", "0.e+1", ""},
		{Float, "123.E-10", "123.E-10", ""},
		{Float, "0123.e123", "0123.e123", ""},

		{Float, ".0e-1", ".0e-1", ""},
		{Float, ".123E+10", ".123E+10", ""},
		{Float, ".0123E123", ".0123E123", ""},

		{Float, "0.0e1", "0.0e1", ""},
		{Float, "123.123E-10", "123.123E-10", ""},
		{Float, "0123.0123e+456", "0123.0123e+456", ""},

		{Float, "0e", "0e", "exponent has no digits"},
		{Float, "0E+", "0E+", "exponent has no digits"},
		{Float, "1e+f", "1e+ f", "exponent has no digits"},
		{Float, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"},
		{Float, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"},

		// hexadecimals
		{Int, "0x0", "0x0", ""},
		{Int, "0x1234", "0x1234", ""},
		{Int, "0xcafef00d", "0xcafef00d", ""},
		{Int, "0XCAFEF00D", "0XCAFEF00D", ""},

		{Int, "0x", "0x", "hexadecimal literal has no digits"},
		{Int, "0x1g", "0x1 g", ""},

		// hexadecimal floats
		{Float, "0x0p0", "0x0p0", ""},
		{Float, "0x12efp-123", "0x12efp-123", ""},
		{Float, "0xABCD.p+0", "0xABCD.p+0", ""},
		{Float, "0x.0189P-0", "0x.0189P-0", ""},
		{Float, "0x1.ffffp+1023", "0x1.ffffp+1023", ""},

		{Float, "0x.", "0x.", "hexadecimal literal has no digits"},
		{Float, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"},
		{Float, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"},
		{Float, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"},
		{Float, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"},
		{Float, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"},
		{Float, "0x0p", "0x0p", "exponent has no digits"},
		{Float, "0xeP-", "0xeP-", "exponent has no digits"},
		{Float, "0x1234PAB", "0x1234P AB", "exponent has no digits"},
		{Float, "0x1.2p1a", "0x1.2p1 a", ""},

		// separators
		{Int, "0b_1000_0001", "0b_1000_0001", ""},
		{Int, "0o_600", "0o_600", ""},
		{Int, "0_466", "0_466", ""},
		{Int, "1_000", "1_000", ""},
		{Float, "1_000.000_1", "1_000.000_1", ""},
		{Int, "0x_f00d", "0x_f00d", ""},
		{Float, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""},

		{Int, "0b__1000", "0b__1000", "'_' must separate successive digits"},
		{Int, "0o60___0", "0o60___0", "'_' must separate successive digits"},
		{Int, "0466_", "0466_", "'_' must separate successive digits"},
		{Float, "1_.", "1_.", "'_' must separate successive digits"},
		{Float, "0._1", "0._1", "'_' must separate successive digits"},
		{Float, "2.7_e0", "2.7_e0", "'_' must separate successive digits"},
		{Int, "0x___0", "0x___0", "'_' must separate successive digits"},
		{Float, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"},
	} {
		s := new(Scanner).Init(strings.NewReader(test.src))
		var err string
		s.Error = func(s *Scanner, msg string) {
			// keep only the first error of the current token
			if err == "" {
				err = msg
			}
		}

		for i, want := range strings.Split(test.tokens, " ") {
			err = ""
			tok := s.Scan()
			lit := s.TokenText()
			if i == 0 {
				// token class and error are only specified for the first token
				if tok != test.tok {
					t.Errorf("%q: got token %s; want %s", test.src, TokenString(tok), TokenString(test.tok))
				}
				if err != test.err {
					t.Errorf("%q: got error %q; want %q", test.src, err, test.err)
				}
			}
			if lit != want {
				t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, TokenString(tok), want)
			}
		}

		// make sure we read all
		if tok := s.Scan(); tok != EOF {
			t.Errorf("%q: got %s; want EOF", test.src, TokenString(tok))
		}
	}
}

// TestIssue30320 checks which numeric tokens are extracted from mixed
// input when only ScanInts or only ScanFloats is enabled.
func TestIssue30320(t *testing.T) {
	for _, test := range []struct {
		in, want string
		mode     uint
	}{
		{"foo01.bar31.xx-0-1-1-0", "01 31 0 1 1 0", ScanInts},
		{"foo0/12/0/5.67", "0 12 0 5 67", ScanInts},
		{"xxx1e0yyy", "1 0", ScanInts},
		{"1_2", "1_2", ScanInts},
		{"xxx1.0yyy2e3ee", "1 0 2 3", ScanInts},
		{"xxx1.0yyy2e3ee", "1.0 2e3", ScanFloats},
	} {
		got := extractInts(test.in, test.mode)
		if got != test.want {
			t.Errorf("%q: got %q; want %q", test.in, got, test.want)
		}
	}
}

// extractInts scans t with the given mode and returns the text of all
// Int and Float tokens, separated by single spaces. All other tokens
// are ignored.
func extractInts(t string, mode uint) (res string) {
	var s Scanner
	s.Init(strings.NewReader(t))
	s.Mode = mode
	var b strings.Builder
	for {
		switch tok := s.Scan(); tok {
		case Int, Float:
			if b.Len() > 0 {
				b.WriteByte(' ')
			}
			b.WriteString(s.TokenText())
		case EOF:
			return b.String()
		}
	}
}