// Copyright 2016 The Go Authors. All rights reserved.
|
// Use of this source code is governed by a BSD-style
|
// license that can be found in the LICENSE file.
|
|
// This file implements source, a buffered rune reader
|
// which is specialized for the needs of the Go scanner:
|
// Contiguous sequences of runes (literals) are extracted
|
// directly as []byte without the need to re-encode the
|
// runes in UTF-8 (as would be necessary with bufio.Reader).
|
//
|
// This file is self-contained (go tool compile source.go
|
// compiles) and thus could be made into its own package.
|
|
package syntax
|
|
import (
|
"io"
|
"unicode/utf8"
|
)
|
|
// starting points for line and column numbers
|
const linebase = 1
|
const colbase = 1
|
|
// buf [...read...|...|...unread...|s|...free...]
|
// ^ ^ ^ ^
|
// | | | |
|
// suf r0 r w
|
|
type source struct {
|
src io.Reader
|
errh func(line, pos uint, msg string)
|
|
// source buffer
|
buf [4 << 10]byte
|
r0, r, w int // previous/current read and write buf positions, excluding sentinel
|
line0, line uint // previous/current line
|
col0, col uint // previous/current column (byte offsets from line start)
|
ioerr error // pending io error
|
|
// literal buffer
|
lit []byte // literal prefix
|
suf int // literal suffix; suf >= 0 means we are scanning a literal
|
}
|
|
// init initializes source to read from src and to report errors via errh.
|
// errh must not be nil.
|
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
|
s.src = src
|
s.errh = errh
|
|
s.buf[0] = utf8.RuneSelf // terminate with sentinel
|
s.r0, s.r, s.w = 0, 0, 0
|
s.line0, s.line = 0, linebase
|
s.col0, s.col = 0, colbase
|
s.ioerr = nil
|
|
s.lit = s.lit[:0]
|
s.suf = -1
|
}
|
|
// ungetr ungets the most recently read rune.
|
func (s *source) ungetr() {
|
s.r, s.line, s.col = s.r0, s.line0, s.col0
|
}
|
|
// ungetr2 is like ungetr but enables a 2nd ungetr.
|
// It must not be called if one of the runes seen
|
// was a newline or had a UTF-8 encoding longer than
|
// 1 byte.
|
func (s *source) ungetr2() {
|
s.ungetr()
|
// line must not have changed
|
s.r0--
|
s.col0--
|
}
|
|
func (s *source) error(msg string) {
|
s.errh(s.line0, s.col0, msg)
|
}
|
|
// getr reads and returns the next rune.
|
//
|
// If a read or source encoding error occurs, getr
|
// calls the error handler installed with init.
|
// The handler must exist.
|
//
|
// The (line, col) position passed to the error handler
|
// is always at the current source reading position.
|
func (s *source) getr() rune {
|
redo:
|
s.r0, s.line0, s.col0 = s.r, s.line, s.col
|
|
// We could avoid at least one test that is always taken in the
|
// for loop below by duplicating the common case code (ASCII)
|
// here since we always have at least the sentinel (utf8.RuneSelf)
|
// in the buffer. Measure and optimize if necessary.
|
|
// make sure we have at least one rune in buffer, or we are at EOF
|
for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
|
s.fill() // s.w-s.r < len(s.buf) => buffer is not full
|
}
|
|
// common case: ASCII and enough bytes
|
// (invariant: s.buf[s.w] == utf8.RuneSelf)
|
if b := s.buf[s.r]; b < utf8.RuneSelf {
|
s.r++
|
// TODO(gri) Optimization: Instead of adjusting s.col for each character,
|
// remember the line offset instead and then compute the offset as needed
|
// (which is less often).
|
s.col++
|
if b == 0 {
|
s.error("invalid NUL character")
|
goto redo
|
}
|
if b == '\n' {
|
s.line++
|
s.col = colbase
|
}
|
return rune(b)
|
}
|
|
// EOF
|
if s.r == s.w {
|
if s.ioerr != io.EOF {
|
// ensure we never start with a '/' (e.g., rooted path) in the error message
|
s.error("I/O error: " + s.ioerr.Error())
|
}
|
return -1
|
}
|
|
// uncommon case: not ASCII
|
r, w := utf8.DecodeRune(s.buf[s.r:s.w])
|
s.r += w
|
s.col += uint(w)
|
|
if r == utf8.RuneError && w == 1 {
|
s.error("invalid UTF-8 encoding")
|
goto redo
|
}
|
|
// BOM's are only allowed as the first character in a file
|
const BOM = 0xfeff
|
if r == BOM {
|
if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
|
s.error("invalid BOM in the middle of the file")
|
}
|
goto redo
|
}
|
|
return r
|
}
|
|
func (s *source) fill() {
|
// Slide unread bytes to beginning but preserve last read char
|
// (for one ungetr call) plus one extra byte (for a 2nd ungetr
|
// call, only for ".." character sequence and float literals
|
// starting with ".").
|
if s.r0 > 1 {
|
// save literal prefix, if any
|
// (We see at most one ungetr call while reading
|
// a literal, so make sure s.r0 remains in buf.)
|
if s.suf >= 0 {
|
s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
|
s.suf = 1 // == s.r0 after slide below
|
}
|
n := s.r0 - 1
|
copy(s.buf[:], s.buf[n:s.w])
|
s.r0 = 1 // eqv: s.r0 -= n
|
s.r -= n
|
s.w -= n
|
}
|
|
// read more data: try a limited number of times
|
for i := 100; i > 0; i-- {
|
n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
|
if n < 0 {
|
panic("negative read") // incorrect underlying io.Reader implementation
|
}
|
s.w += n
|
if n > 0 || err != nil {
|
s.buf[s.w] = utf8.RuneSelf // sentinel
|
if err != nil {
|
s.ioerr = err
|
}
|
return
|
}
|
}
|
|
s.buf[s.w] = utf8.RuneSelf // sentinel
|
s.ioerr = io.ErrNoProgress
|
}
|
|
func (s *source) startLit() {
|
s.suf = s.r0
|
s.lit = s.lit[:0] // reuse lit
|
}
|
|
func (s *source) stopLit() []byte {
|
lit := s.buf[s.suf:s.r]
|
if len(s.lit) > 0 {
|
lit = append(s.lit, lit...)
|
}
|
s.killLit()
|
return lit
|
}
|
|
func (s *source) killLit() {
|
s.suf = -1 // no pending literal
|
}
|