diff options
Diffstat (limited to 'src/bufio/scan.go')
-rw-r--r-- | src/bufio/scan.go | 419 |
1 file changed, 419 insertions, 0 deletions
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bufio

import (
	"bytes"
	"errors"
	"io"
	"unicode/utf8"
)

// Scanner provides a convenient interface for reading data such as
// a file of newline-delimited lines of text. Successive calls to
// the Scan method will step through the 'tokens' of a file, skipping
// the bytes between the tokens. The specification of a token is
// defined by a split function of type SplitFunc; the default split
// function breaks the input into lines with line termination stripped. Split
// functions are defined in this package for scanning a file into
// lines, bytes, UTF-8-encoded runes, and space-delimited words. The
// client may instead provide a custom split function.
//
// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
// large to fit in the buffer. When a scan stops, the reader may have
// advanced arbitrarily far past the last token. Programs that need more
// control over error handling or large tokens, or must run sequential scans
// on a reader, should use bufio.Reader instead.
//
type Scanner struct {
	r            io.Reader // The reader provided by the client.
	split        SplitFunc // The function to split the tokens.
	maxTokenSize int       // Maximum size of a token; modified by tests.
	token        []byte    // Last token returned by split.
	buf          []byte    // Buffer used as argument to split.
	start        int       // First non-processed byte in buf.
	end          int       // End of data in buf.
	err          error     // Sticky error.
	empties      int       // Count of successive empty tokens.
	scanCalled   bool      // Scan has been called; buffer is in use.
	done         bool      // Scan has finished.
}

// SplitFunc is the signature of the split function used to tokenize the
// input. The arguments are an initial substring of the remaining unprocessed
// data and a flag, atEOF, that reports whether the Reader has no more data
// to give. The return values are the number of bytes to advance the input
// and the next token to return to the user, if any, plus an error, if any.
//
// Scanning stops if the function returns an error, in which case some of
// the input may be discarded.
//
// Otherwise, the Scanner advances the input. If the token is not nil,
// the Scanner returns it to the user. If the token is nil, the
// Scanner reads more data and continues scanning; if there is no more
// data--if atEOF was true--the Scanner returns. If the data does not
// yet hold a complete token, for instance if it has no newline while
// scanning lines, a SplitFunc can return (0, nil, nil) to signal the
// Scanner to read more data into the slice and try again with a
// longer slice starting at the same point in the input.
//
// The function is never called with an empty data slice unless atEOF
// is true. If atEOF is true, however, data may be non-empty and,
// as always, holds unprocessed text.
type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)

// Errors returned by Scanner.
var (
	ErrTooLong         = errors.New("bufio.Scanner: token too long")
	ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
	ErrAdvanceTooFar   = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
	ErrBadReadCount    = errors.New("bufio.Scanner: Read returned impossible count")
)

const (
	// MaxScanTokenSize is the maximum size used to buffer a token
	// unless the user provides an explicit buffer with Scanner.Buffer.
	// The actual maximum token size may be smaller as the buffer
	// may need to include, for instance, a newline.
	MaxScanTokenSize = 64 * 1024

	startBufSize = 4096 // Size of initial allocation for buffer.
)

// NewScanner returns a new Scanner to read from r.
// The split function defaults to ScanLines.
func NewScanner(r io.Reader) *Scanner {
	return &Scanner{
		r:            r,
		split:        ScanLines,
		maxTokenSize: MaxScanTokenSize,
	}
}

// Err returns the first non-EOF error that was encountered by the Scanner.
func (s *Scanner) Err() error {
	if s.err == io.EOF {
		return nil
	}
	return s.err
}

// Bytes returns the most recent token generated by a call to Scan.
// The underlying array may point to data that will be overwritten
// by a subsequent call to Scan. It does no allocation.
func (s *Scanner) Bytes() []byte {
	return s.token
}

// Text returns the most recent token generated by a call to Scan
// as a newly allocated string holding its bytes.
func (s *Scanner) Text() string {
	return string(s.token)
}

// ErrFinalToken is a special sentinel error value. It is intended to be
// returned by a Split function to indicate that the token being delivered
// with the error is the last token and scanning should stop after this one.
// After ErrFinalToken is received by Scan, scanning stops with no error.
// The value is useful to stop processing early or when it is necessary to
// deliver a final empty token. One could achieve the same behavior
// with a custom error value but providing one here is tidier.
// See the emptyFinalToken example for a use of this value.
var ErrFinalToken = errors.New("final token")

// Scan advances the Scanner to the next token, which will then be
// available through the Bytes or Text method. It returns false when the
// scan stops, either by reaching the end of the input or an error.
// After Scan returns false, the Err method will return any error that
// occurred during scanning, except that if it was io.EOF, Err
// will return nil.
// Scan panics if the split function returns too many empty
// tokens without advancing the input. This is a common error mode for
// scanners.
func (s *Scanner) Scan() bool {
	if s.done {
		return false
	}
	s.scanCalled = true
	// Loop until we have a token.
	for {
		// See if we can get a token with what we already have.
		// If we've run out of data but have an error, give the split function
		// a chance to recover any remaining, possibly empty token.
		if s.end > s.start || s.err != nil {
			advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil)
			if err != nil {
				if err == ErrFinalToken {
					s.token = token
					s.done = true
					return true
				}
				s.setErr(err)
				return false
			}
			if !s.advance(advance) {
				return false
			}
			s.token = token
			if token != nil {
				if s.err == nil || advance > 0 {
					s.empties = 0
				} else {
					// Returning tokens not advancing input at EOF.
					// NOTE: maxConsecutiveEmptyReads is declared elsewhere in package bufio.
					s.empties++
					if s.empties > maxConsecutiveEmptyReads {
						panic("bufio.Scan: too many empty tokens without progressing")
					}
				}
				return true
			}
		}
		// We cannot generate a token with what we are holding.
		// If we've already hit EOF or an I/O error, we are done.
		if s.err != nil {
			// Shut it down.
			s.start = 0
			s.end = 0
			return false
		}
		// Must read more data.
		// First, shift data to beginning of buffer if there's lots of empty space
		// or space is needed.
		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
			copy(s.buf, s.buf[s.start:s.end])
			s.end -= s.start
			s.start = 0
		}
		// Is the buffer full? If so, resize.
		if s.end == len(s.buf) {
			// Guarantee no overflow in the multiplication below.
			const maxInt = int(^uint(0) >> 1)
			if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 {
				s.setErr(ErrTooLong)
				return false
			}
			// Double the buffer, starting from startBufSize and capping at maxTokenSize.
			newSize := len(s.buf) * 2
			if newSize == 0 {
				newSize = startBufSize
			}
			if newSize > s.maxTokenSize {
				newSize = s.maxTokenSize
			}
			newBuf := make([]byte, newSize)
			copy(newBuf, s.buf[s.start:s.end])
			s.buf = newBuf
			s.end -= s.start
			s.start = 0
		}
		// Finally we can read some input. Make sure we don't get stuck with
		// a misbehaving Reader. Officially we don't need to do this, but let's
		// be extra careful: Scanner is for safe, simple jobs.
		for loop := 0; ; {
			n, err := s.r.Read(s.buf[s.end:len(s.buf)])
			if n < 0 || len(s.buf)-s.end < n {
				s.setErr(ErrBadReadCount)
				break
			}
			s.end += n
			if err != nil {
				s.setErr(err)
				break
			}
			if n > 0 {
				s.empties = 0
				break
			}
			loop++
			if loop > maxConsecutiveEmptyReads {
				s.setErr(io.ErrNoProgress)
				break
			}
		}
	}
}

// advance consumes n bytes of the buffer. It reports whether the advance was legal.
func (s *Scanner) advance(n int) bool {
	if n < 0 {
		s.setErr(ErrNegativeAdvance)
		return false
	}
	if n > s.end-s.start {
		s.setErr(ErrAdvanceTooFar)
		return false
	}
	s.start += n
	return true
}

// setErr records the first error encountered.
func (s *Scanner) setErr(err error) {
	if s.err == nil || s.err == io.EOF {
		s.err = err
	}
}

// Buffer sets the initial buffer to use when scanning and the maximum
// size of buffer that may be allocated during scanning. The maximum
// token size is the larger of max and cap(buf). If max <= cap(buf),
// Scan will use this buffer only and do no allocation.
//
// By default, Scan uses an internal buffer and sets the
// maximum token size to MaxScanTokenSize.
//
// Buffer panics if it is called after scanning has started.
func (s *Scanner) Buffer(buf []byte, max int) {
	if s.scanCalled {
		panic("Buffer called after Scan")
	}
	// Use the full capacity of the caller's buffer, not just its length.
	s.buf = buf[0:cap(buf)]
	s.maxTokenSize = max
}

// Split sets the split function for the Scanner.
// The default split function is ScanLines.
//
// Split panics if it is called after scanning has started.
func (s *Scanner) Split(split SplitFunc) {
	if s.scanCalled {
		panic("Split called after Scan")
	}
	s.split = split
}

// Split functions

// ScanBytes is a split function for a Scanner that returns each byte as a token.
func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	return 1, data[0:1], nil
}

// errorRune is the UTF-8 encoding of utf8.RuneError (U+FFFD), returned by
// ScanRunes as the token for an invalid encoding.
var errorRune = []byte(string(utf8.RuneError))

// ScanRunes is a split function for a Scanner that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Fast path 1: ASCII.
	if data[0] < utf8.RuneSelf {
		return 1, data[0:1], nil
	}

	// Fast path 2: Correct UTF-8 decode without error.
	_, width := utf8.DecodeRune(data)
	if width > 1 {
		// It's a valid encoding. Width cannot be one for a correctly encoded
		// non-ASCII rune.
		return width, data[0:width], nil
	}

	// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
	// Is the error because there wasn't a full rune to be decoded?
	// FullRune distinguishes correctly between erroneous and incomplete encodings.
	if !atEOF && !utf8.FullRune(data) {
		// Incomplete; get more bytes.
		return 0, nil, nil
	}

	// We have a real UTF-8 encoding error. Return a properly encoded error rune
	// but advance only one byte. This matches the behavior of a range loop over
	// an incorrectly encoded string.
	return 1, errorRune, nil
}

// dropCR drops a terminal \r from the data.
func dropCR(data []byte) []byte {
	if len(data) > 0 && data[len(data)-1] == '\r' {
		return data[0 : len(data)-1]
	}
	return data
}

// ScanLines is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.IndexByte(data, '\n'); i >= 0 {
		// We have a full newline-terminated line.
		return i + 1, dropCR(data[0:i]), nil
	}
	// If we're at EOF, we have a final, non-terminated line. Return it.
	if atEOF {
		return len(data), dropCR(data), nil
	}
	// Request more data.
	return 0, nil, nil
}

// isSpace reports whether the character is a Unicode white space character.
// We avoid dependency on the unicode package, but check validity of the implementation
// in the tests.
func isSpace(r rune) bool {
	if r <= '\u00FF' {
		// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.
		switch r {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true
		case '\u0085', '\u00A0':
			return true
		}
		return false
	}
	// High-valued ones.
	if '\u2000' <= r && r <= '\u200a' {
		return true
	}
	switch r {
	case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
		return true
	}
	return false
}

// ScanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// unicode.IsSpace.
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !isSpace(r) {
			break
		}
	}
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if isSpace(r) {
			return i + width, data[start:i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return start, nil, nil
}