diff options
Diffstat (limited to 'src/cmd/compile/internal/syntax/source.go')
-rw-r--r-- | src/cmd/compile/internal/syntax/source.go | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go new file mode 100644 index 0000000..01b5921 --- /dev/null +++ b/src/cmd/compile/internal/syntax/source.go @@ -0,0 +1,218 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file implements source, a buffered rune reader +// specialized for scanning Go code: Reading +// ASCII characters, maintaining current (line, col) +// position information, and recording of the most +// recently read source segment are highly optimized. +// This file is self-contained (go tool compile source.go +// compiles) and thus could be made into its own package. + +package syntax + +import ( + "io" + "unicode/utf8" +) + +// The source buffer is accessed using three indices b (begin), +// r (read), and e (end): +// +// - If b >= 0, it points to the beginning of a segment of most +// recently read characters (typically a Go literal). +// +// - r points to the byte immediately following the most recently +// read character ch, which starts at r-chw. +// +// - e points to the byte immediately following the last byte that +// was read into the buffer. +// +// The buffer content is terminated at buf[e] with the sentinel +// character utf8.RuneSelf. This makes it possible to test for +// the common case of ASCII characters with a single 'if' (see +// nextch method). +// +// +------ content in use -------+ +// v v +// buf [...read...|...segment...|ch|...unread...|s|...free...] +// ^ ^ ^ ^ +// | | | | +// b r-chw r e +// +// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel + +type source struct { + in io.Reader + errh func(line, col uint, msg string) + + buf []byte // source buffer + ioerr error // pending I/O error, or nil + b, r, e int // buffer indices (see comment above) + line, col uint // source position of ch (0-based) + ch rune // most recently read character + chw int // width of ch +} + +const sentinel = utf8.RuneSelf + +func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) { + s.in = in + s.errh = errh + + if s.buf == nil { + s.buf = make([]byte, nextSize(0)) + } + s.buf[0] = sentinel + s.ioerr = nil + s.b, s.r, s.e = -1, 0, 0 + s.line, s.col = 0, 0 + s.ch = ' ' + s.chw = 0 +} + +// starting points for line and column numbers +const linebase = 1 +const colbase = 1 + +// pos returns the (line, col) source position of s.ch. +func (s *source) pos() (line, col uint) { + return linebase + s.line, colbase + s.col +} + +// error reports the error msg at source position s.pos(). +func (s *source) error(msg string) { + line, col := s.pos() + s.errh(line, col, msg) +} + +// start starts a new active source segment (including s.ch). +// As long as stop has not been called, the active segment's +// bytes (excluding s.ch) may be retrieved by calling segment. +func (s *source) start() { s.b = s.r - s.chw } +func (s *source) stop() { s.b = -1 } +func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] } + +// rewind rewinds the scanner's read position and character s.ch +// to the start of the currently active segment, which must not +// contain any newlines (otherwise position information will be +// incorrect). Currently, rewind is only needed for handling the +// source sequence ".."; it must not be called outside an active +// segment. +func (s *source) rewind() { + // ok to verify precondition - rewind is rarely called + if s.b < 0 { + panic("no active segment") + } + s.col -= uint(s.r - s.b) + s.r = s.b + s.nextch() +} + +func (s *source) nextch() { +redo: + s.col += uint(s.chw) + if s.ch == '\n' { + s.line++ + s.col = 0 + } + + // fast common case: at least one ASCII character + if s.ch = rune(s.buf[s.r]); s.ch < sentinel { + s.r++ + s.chw = 1 + if s.ch == 0 { + s.error("invalid NUL character") + goto redo + } + return + } + + // slower general case: add more bytes to buffer if we don't have a full rune + for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil { + s.fill() + } + + // EOF + if s.r == s.e { + if s.ioerr != io.EOF { + // ensure we never start with a '/' (e.g., rooted path) in the error message + s.error("I/O error: " + s.ioerr.Error()) + s.ioerr = nil + } + s.ch = -1 + s.chw = 0 + return + } + + s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e]) + s.r += s.chw + + if s.ch == utf8.RuneError && s.chw == 1 { + s.error("invalid UTF-8 encoding") + goto redo + } + + // BOM's are only allowed as the first character in a file + const BOM = 0xfeff + if s.ch == BOM { + if s.line > 0 || s.col > 0 { + s.error("invalid BOM in the middle of the file") + } + goto redo + } +} + +// fill reads more source bytes into s.buf. +// It returns with at least one more byte in the buffer, or with s.ioerr != nil. +func (s *source) fill() { + // determine content to preserve + b := s.r + if s.b >= 0 { + b = s.b + s.b = 0 // after buffer has grown or content has been moved down + } + content := s.buf[b:s.e] + + // grow buffer or move content down + if len(content)*2 > len(s.buf) { + s.buf = make([]byte, nextSize(len(s.buf))) + copy(s.buf, content) + } else if b > 0 { + copy(s.buf, content) + } + s.r -= b + s.e -= b + + // read more data: try a limited number of times + for i := 0; i < 10; i++ { + var n int + n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel + if n < 0 { + panic("negative read") // incorrect underlying io.Reader implementation + } + if n > 0 || s.ioerr != nil { + s.e += n + s.buf[s.e] = sentinel + return + } + // n == 0 + } + + s.buf[s.e] = sentinel + s.ioerr = io.ErrNoProgress +} + +// nextSize returns the next bigger size for a buffer of a given size. +func nextSize(size int) int { + const min = 4 << 10 // 4K: minimum buffer size + const max = 1 << 20 // 1M: maximum buffer size which is still doubled + if size < min { + return min + } + if size <= max { + return size << 1 + } + return size + max +} |