lexer.go

package httphead

import (
	"bytes"
)

// ItemType encodes type of the lexing token.
type ItemType int

const (
	// ItemUndef reports that token is undefined. It is the zero value of
	// ItemType.
	ItemUndef ItemType = iota
	// ItemToken reports that token is RFC2616 token.
	ItemToken
	// ItemSeparator reports that token is RFC2616 separator.
	ItemSeparator
	// ItemString reports that token is RFC2616 quoted string.
	ItemString
	// ItemComment reports that token is RFC2616 comment.
	ItemComment
	// ItemOctet reports that token is octet slice.
	ItemOctet
)

// Scanner represents header tokens scanner.
// See https://tools.ietf.org/html/rfc2616#section-2
type Scanner struct {
	data []byte // Input being scanned.
	pos  int    // Index in data of the next byte to read.

	itemType  ItemType // Type of the current item; ItemUndef when there is none.
	itemBytes []byte   // Bytes of the current item; nil when there is none.

	err bool // Set on malformed input; Next then returns false and Skip/SkipEscaped do nothing.
}

// NewScanner returns a scanner positioned at the first byte of data. The
// slice is stored as is, without copying.
func NewScanner(data []byte) *Scanner {
	s := new(Scanner)
	s.data = data
	return s
}

// Next advances the scanner to the next item. It reports whether an item was
// scanned; false means either the end of data or malformed input.
func (l *Scanner) Next() bool {
	first, more := l.nextChar()
	if !more {
		return false
	}
	if first == '"' {
		// Opening quote of a quoted-string.
		return l.fetchQuotedString()
	}
	if first == '(' {
		// Opening parenthesis of a comment.
		return l.fetchComment()
	}
	if first == '\\' || first == ')' {
		// Neither byte may start an item; the scanner enters the error state.
		l.err = true
		return false
	}
	return l.fetchToken()
}

// FetchUntil scans an ItemOctet item that starts at the current position and
// ends right before the first occurrence of c, or at the end of the
// underlying data when c is not found. It returns false only when no data is
// left to scan.
func (l *Scanner) FetchUntil(c byte) bool {
	l.resetItem()
	if l.pos != len(l.data) {
		return l.fetchOctet(c)
	}
	return false
}

// Peek returns the byte at the current position and leaves the position
// untouched. It returns 0 when the end of data has been reached.
func (l *Scanner) Peek() byte {
	if l.pos != len(l.data) {
		return l.data[l.pos]
	}
	return 0
}

// Peek2 returns the two bytes at the current position and leaves the position
// untouched. A byte that lies past the end of data is reported as 0.
func (l *Scanner) Peek2() (a, b byte) {
	switch len(l.data) {
	case l.pos:
		// Nothing left.
		return 0, 0
	case l.pos + 1:
		// Exactly one byte left.
		return l.data[l.pos], 0
	default:
		return l.data[l.pos], l.data[l.pos+1]
	}
}

// Buffered reports how many bytes there are left to scan, that is the
// distance from the current position to the end of the data.
func (l *Scanner) Buffered() int {
	return len(l.data) - l.pos
}

// Advance moves the current position by n bytes and reports whether the move
// stayed within the data. A negative n moves the position backwards. When the
// move would pass either end of the data, the position is clamped to that end
// and false is returned.
func (l *Scanner) Advance(n int) bool {
	// Compare against the remaining room instead of adding first, so that a
	// huge n cannot overflow the position.
	if n > len(l.data)-l.pos {
		l.pos = len(l.data)
		return false
	}
	// Moving before the start would leave a negative index that makes every
	// later read panic.
	if n < -l.pos {
		l.pos = 0
		return false
	}
	l.pos += n
	return true
}

// Skip moves the position right past the first occurrence of c, or to the end
// of the data when c is not found. The current item is cleared. It does
// nothing when the scanner is in the error state.
func (l *Scanner) Skip(c byte) {
	if l.err {
		return
	}
	l.resetItem()

	offset := bytes.IndexByte(l.data[l.pos:], c)
	if offset < 0 {
		// c is absent: everything that is left gets skipped.
		l.pos = len(l.data)
		return
	}
	// The extra one steps over c itself.
	l.pos += offset + 1
}

// SkipEscaped moves the position right past the first occurrence of c that
// ScanUntil reports as non-escaped, or to the end of the data when there is
// none. The current item is cleared. It does nothing when the scanner is in
// the error state.
func (l *Scanner) SkipEscaped(c byte) {
	if l.err {
		return
	}
	l.resetItem()

	offset := ScanUntil(l.data[l.pos:], c)
	if offset == -1 {
		// No non-escaped c: everything that is left gets skipped.
		l.pos = len(l.data)
		return
	}
	// The extra one steps over c itself.
	l.pos += offset + 1
}

// Type reports current token type. It is ItemUndef when there is no current
// token, e.g. after a failed Next or after Skip.
func (l *Scanner) Type() ItemType {
	return l.itemType
}

// Bytes returns current token bytes. It is nil when there is no current
// token. The returned slice may share memory with the data given to the
// scanner.
func (l *Scanner) Bytes() []byte {
	return l.itemBytes
}

// nextChar clears the current item, skips leading whitespace and returns the
// byte at the resulting position without consuming it. The boolean is false
// when the scanner is in the error state or no data is left.
func (l *Scanner) nextChar() (byte, bool) {
	// The item is cleared even when the scanner is already in the error state.
	l.resetItem()
	if l.err {
		return 0, false
	}

	l.pos += SkipSpace(l.data[l.pos:])
	if l.pos != len(l.data) {
		return l.data[l.pos], true
	}
	return 0, false
}

// resetItem forgets the current item: its type becomes ItemUndef and its
// bytes become nil.
func (l *Scanner) resetItem() {
	l.itemType, l.itemBytes = ItemUndef, nil
}

// fetchOctet makes the bytes from the current position up to, but not
// including, the first occurrence of c the current ItemOctet item. When c is
// absent the item runs to the end of the data. The position is left at the
// end of the item, so c itself is not consumed. It always returns true.
func (l *Scanner) fetchOctet(c byte) bool {
	start, end := l.pos, len(l.data)
	if offset := bytes.IndexByte(l.data[start:], c); offset != -1 {
		end = start + offset
	}

	l.pos = end
	l.itemType = ItemOctet
	l.itemBytes = l.data[start:end]

	return true
}

// fetchToken scans the item that starts at the current position with
// ScanToken and makes it the current item. When ScanToken rejects the input
// the scanner enters the error state and false is returned.
func (l *Scanner) fetchToken() bool {
	size, kind := ScanToken(l.data[l.pos:])
	if size == -1 {
		l.err = true
		return false
	}

	end := l.pos + size
	l.itemType, l.itemBytes = kind, l.data[l.pos:end]
	l.pos = end

	return true
}

// fetchQuotedString scans a quoted-string whose opening quote is at the
// current position and makes its contents, without the surrounding quotes and
// without backslashes, the current ItemString item. When no closing quote is
// found the scanner enters the error state and false is returned.
//
// NOTE(review): RemoveByte drops every backslash, so an escaped backslash in
// the input contributes nothing to the item rather than a single backslash.
func (l *Scanner) fetchQuotedString() (ok bool) {
	l.pos++ // Step over the opening quote.

	rest := l.data[l.pos:]
	end := ScanUntil(rest, '"')
	if end == -1 {
		l.err = true
		return false
	}

	l.itemType = ItemString
	l.itemBytes = RemoveByte(rest[:end], '\\')
	l.pos += end + 1 // The extra one consumes the closing quote.

	return true
}

// fetchComment scans a comment whose opening parenthesis is at the current
// position and makes its contents, without the outermost parentheses and
// without backslashes, the current ItemComment item. When ScanPairGreedy
// finds no matching closing parenthesis the scanner enters the error state
// and false is returned.
//
// NOTE(review): RemoveByte drops every backslash, so an escaped backslash in
// the input contributes nothing to the item rather than a single backslash.
func (l *Scanner) fetchComment() (ok bool) {
	l.pos++ // Step over the opening parenthesis.

	rest := l.data[l.pos:]
	end := ScanPairGreedy(rest, '(', ')')
	if end == -1 {
		l.err = true
		return false
	}

	l.itemType = ItemComment
	l.itemBytes = RemoveByte(rest[:end], '\\')
	l.pos += end + 1 // The extra one consumes the closing parenthesis.

	return true
}

// ScanUntil scans for first non-escaped character c in given data.
// It returns index of matched c and -1 if c is not found.
//
// An occurrence of c is escaped when it is preceded by an odd number of
// consecutive backslashes. An even number means the backslashes escape each
// other, so in `a\\"` the quote is not escaped.
func ScanUntil(data []byte, c byte) (n int) {
	for {
		i := bytes.IndexByte(data[n:], c)
		if i == -1 {
			return -1
		}
		n += i

		// Count the run of backslashes that immediately precedes the match.
		slashes := 0
		for k := n - 1; k >= 0 && data[k] == '\\'; k-- {
			slashes++
		}
		if slashes%2 == 0 {
			return n
		}

		// The match is escaped; resume the search right after it.
		n++
	}
}

// ScanPairGreedy scans for complete pair of opening and closing chars in greedy manner.
// Note that first opening byte must not be present in data.
//
// It returns the index of the closing byte that balances the implied first
// opening byte, taking nested pairs into account, or -1 when there is no such
// byte. Opening and closing bytes directly preceded by a backslash are
// treated as escaped and do not take part in the balance.
func ScanPairGreedy(data []byte, open, close byte) (n int) {
	var m int // Start of the region not yet searched for opening bytes.
	opened := 1
	for {
		i := bytes.IndexByte(data[n:], close)
		if i == -1 {
			return -1
		}
		n += i
		// An escaped closing byte does not close anything.
		if n == 0 || data[n-1] != '\\' {
			opened--
		}

		// Count opening bytes that precede the closing byte just found. Both
		// m and n are absolute indexes into data, so the region is data[m:n].
		for m < n {
			j := bytes.IndexByte(data[m:n], open)
			if j == -1 {
				break
			}
			m += j
			// An escaped opening byte does not open anything.
			if m == 0 || data[m-1] != '\\' {
				opened++
			}
			m++
		}

		if opened == 0 {
			return n
		}

		// Resume both searches right after the closing byte.
		n++
		m = n
	}
}

// RemoveByte returns data without c. If c is not present in data it returns
// the same slice. Otherwise it returns a newly allocated copy of data with
// every occurrence of c left out; data itself is never modified.
func RemoveByte(data []byte, c byte) []byte {
	j := bytes.IndexByte(data, c)
	if j == -1 {
		return data
	}

	// At least one byte is going to be removed, so the result holds at most
	// len(data)-1 bytes.
	result := make([]byte, len(data)-1)
	k := copy(result, data[:j])

	// i always points right after the most recently removed byte. The loop
	// must run while any byte is left, including a single trailing one.
	for i := j + 1; i < len(data); {
		j = bytes.IndexByte(data[i:], c)
		if j == -1 {
			k += copy(result[k:], data[i:])
			break
		}
		k += copy(result[k:], data[i:i+j])
		i += j + 1
	}

	return result[:k]
}

// SkipSpace skips spaces and lws-sequences from p.
// It returns number of bytes skipped.
//
// A CR LF pair followed by a space octet is skipped as one three-byte unit;
// any other space octet is skipped on its own. Scanning stops at the first
// byte that fits neither case.
func SkipSpace(p []byte) (n int) {
	for n < len(p) {
		rest := p[n:]
		if len(rest) >= 3 &&
			rest[0] == '\r' &&
			rest[1] == '\n' &&
			OctetTypes[rest[2]].IsSpace() {
			n += 3
			continue
		}
		if !OctetTypes[rest[0]].IsSpace() {
			break
		}
		n++
	}
	return n
}

// ScanToken scans for the item at the beginning of p and returns its length
// and type. It does not modify p.
//
// Empty input yields (0, ItemUndef). A leading separator octet yields
// (1, ItemSeparator). A leading token octet yields the length of the run of
// token octets and ItemToken. Any other leading octet yields (-1, ItemUndef).
func ScanToken(p []byte) (n int, t ItemType) {
	if len(p) == 0 {
		return 0, ItemUndef
	}

	// The separator check comes first, as it decides single-byte items.
	if OctetTypes[p[0]].IsSeparator() {
		return 1, ItemSeparator
	}
	if !OctetTypes[p[0]].IsToken() {
		return -1, ItemUndef
	}

	// Extend the token over every following token octet.
	n = 1
	for n < len(p) && OctetTypes[p[n]].IsToken() {
		n++
	}
	return n, ItemToken
}