segment_test.go

//  Copyright (c) 2014 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.

package segment

import (
	"bufio"
	"bytes"
	"errors"
	"io"
	"strings"
	"testing"
)

// Tests borrowed from Scanner to test Segmenter

// slowReader wraps another reader and hands back at most max bytes per Read
// call, forcing the consumer to assemble its input from many small reads.
type slowReader struct {
	max int       // upper bound on the number of bytes requested per Read
	buf io.Reader // underlying source of data
}

// Read trims p to at most sr.max bytes and delegates to the wrapped reader,
// returning whatever that reader returns.
func (sr *slowReader) Read(p []byte) (n int, err error) {
	chunk := p
	if sr.max < len(chunk) {
		chunk = chunk[:sr.max]
	}
	return sr.buf.Read(chunk)
}

// genLine resets buf and fills it with a deterministic line derived from
// lineNum. The nominal length n counts the trailing '\n' and, on every fifth
// line (lineNum%5 == 0), a preceding '\r' as well. When addNewline is false
// the terminator bytes are left out and only the payload is written. If n is
// too small to hold any payload, only the terminators (if requested) are
// written.
func genLine(buf *bytes.Buffer, lineNum, n int, addNewline bool) {
	buf.Reset()

	withCR := lineNum%5 == 0

	// Reserve one byte for '\n' and, on CR lines, one more for '\r'.
	payload := n - 1
	if withCR {
		payload--
	}

	for off := 0; off < payload; off++ {
		ch := 'a' + byte(lineNum+off)
		switch ch {
		case '\n', '\r':
			// Keep line terminators out of the payload.
			ch = 'N'
		}
		buf.WriteByte(ch)
	}

	if !addNewline {
		return
	}
	if withCR {
		buf.WriteByte('\r')
	}
	buf.WriteByte('\n')
}

// wrapSplitFuncAsSegmentFuncForTesting adapts a bufio.SplitFunc to the
// SegmentFunc signature so Scanner-style split functions can drive a
// Segmenter in tests. The advance, token and error come straight from
// splitFunc; the segment type is always reported as 0.
func wrapSplitFuncAsSegmentFuncForTesting(splitFunc bufio.SplitFunc) SegmentFunc {
	return func(data []byte, atEOF bool) (int, []byte, int, error) {
		advance, token, err := splitFunc(data, atEOF)
		return advance, token, 0, err
	}
}

// TestSegmentTooLong checks that a Segmenter configured with a small maximum
// token size yields the short lines intact and then stops with ErrTooLong.
func TestSegmentTooLong(t *testing.T) {
	const smallMaxTokenSize = 256 // Much smaller for more efficient testing.

	// Build a buffer of lines with nominal lengths 0 through
	// 2*smallMaxTokenSize-1, so the input eventually contains lines longer
	// than the configured limit.
	tmp := new(bytes.Buffer)
	buf := new(bytes.Buffer)
	for i := 0; i < 2*smallMaxTokenSize; i++ {
		genLine(tmp, i, i, true)
		buf.Write(tmp.Bytes())
	}

	// Feed the data at most 3 bytes per read to exercise incremental reads.
	s := NewSegmenter(&slowReader{3, buf})
	// Split on lines, using bufio.ScanLines as the segment function.
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
	s.MaxTokenSize(smallMaxTokenSize)

	// Regenerate each expected line (without its terminator) and compare it
	// with the token the Segmenter produced.
	j := 0
	for lineNum := 0; s.Segment(); lineNum++ {
		genLine(tmp, lineNum, j, false)
		if j < smallMaxTokenSize {
			j++
		} else {
			j--
		}
		line := tmp.Bytes()
		if !bytes.Equal(s.Bytes(), line) {
			t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line)
		}
	}

	// %v rather than %s: a nil error (the likely failure) formats as "<nil>"
	// with %v but as "%!s(<nil>)" with %s.
	if err := s.Err(); err != ErrTooLong {
		t.Fatalf("expected ErrTooLong; got %v", err)
	}
}

// testError is the sentinel error returned by the failing split function in
// TestSegmentError; the test compares the Segmenter's reported error to it.
var testError = errors.New("testError")

// TestSegmentError checks that an error returned by the segment function
// stops segmentation at that point and is reported unchanged by Err.
func TestSegmentError(t *testing.T) {
	// Create a split function that delivers a little data, then a predictable error.
	numSplits := 0
	const okCount = 7
	errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF {
			panic("didn't get enough data")
		}
		if numSplits >= okCount {
			return 0, nil, testError
		}
		numSplits++
		return 1, data[0:1], nil
	}
	// Read the data one byte per read.
	const text = "abcdefghijklmnopqrstuvwxyz"
	buf := strings.NewReader(text)
	s := NewSegmenter(&slowReader{1, buf})
	// Install the erroring split function as the segment function.
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
	var i int
	for i = 0; s.Segment(); i++ {
		tok := s.Bytes()
		if len(tok) != 1 || text[i] != tok[0] {
			// Print the whole token rather than tok[0]: indexing would panic
			// when the token is empty, which is one of the failures reported
			// here.
			t.Errorf("#%d: expected %q got %q", i, text[i:i+1], tok)
		}
	}
	// Check correct termination location and error.
	if i != okCount {
		t.Errorf("unexpected termination; expected %d tokens got %d", okCount, i)
	}
	err := s.Err()
	if err != testError {
		t.Fatalf("expected %q got %v", testError, err)
	}
}

// endlessZeros is a reader that never makes progress: every Read reports zero
// bytes and no error. TestBadReader uses it to check that segmentation gives
// up instead of looping forever on empty reads.
type endlessZeros struct{}

// Read leaves p untouched and always returns (0, nil).
func (endlessZeros) Read(p []byte) (n int, err error) {
	return 0, nil
}

// TestBadReader feeds the Segmenter a reader that never returns data or an
// error and expects no segment to be produced and Err to report
// io.ErrNoProgress.
func TestBadReader(t *testing.T) {
	seg := NewSegmenter(endlessZeros{})
	// t.Fatal ends the test, so a single check is equivalent to looping.
	if seg.Segment() {
		t.Fatal("read should fail")
	}
	if got := seg.Err(); got != io.ErrNoProgress {
		t.Errorf("unexpected error: %v", got)
	}
}

// TestSegmentAdvanceNegativeError checks that the Segmenter reports
// ErrNegativeAdvance when the segment function returns a negative advance.
func TestSegmentAdvanceNegativeError(t *testing.T) {
	// A split function that always claims to have advanced by -1 bytes.
	errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF {
			panic("didn't get enough data")
		}
		return -1, data[0:1], nil
	}
	// Read the data one byte per read.
	const text = "abcdefghijklmnopqrstuvwxyz"
	buf := strings.NewReader(text)
	s := NewSegmenter(&slowReader{1, buf})
	// Install the misbehaving split function as the segment function.
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
	s.Segment()
	err := s.Err()
	if err != ErrNegativeAdvance {
		// Report the error actually being compared against.
		t.Fatalf("expected %q got %v", ErrNegativeAdvance, err)
	}
}

// TestSegmentAdvanceTooFarError checks that the Segmenter reports
// ErrAdvanceTooFar when the segment function returns an advance larger than
// the data it was given.
func TestSegmentAdvanceTooFarError(t *testing.T) {
	// A split function that always claims to have advanced 10 bytes past the
	// end of the supplied data.
	errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF {
			panic("didn't get enough data")
		}
		return len(data) + 10, data[0:1], nil
	}
	// Read the data one byte per read.
	const text = "abcdefghijklmnopqrstuvwxyz"
	buf := strings.NewReader(text)
	s := NewSegmenter(&slowReader{1, buf})
	// Install the misbehaving split function as the segment function.
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
	s.Segment()
	err := s.Err()
	if err != ErrAdvanceTooFar {
		// Report the error actually being compared against.
		t.Fatalf("expected %q got %v", ErrAdvanceTooFar, err)
	}
}

func TestSegmentLongTokens(t *testing.T) {
	// Read the data.
	text := bytes.Repeat([]byte("abcdefghijklmnop"), 257)
	buf := strings.NewReader(string(text))
	s := NewSegmenter(&slowReader{1, buf})
	// change to line segmenter for testing
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
	for s.Segment() {
		line := s.Bytes()
		if !bytes.Equal(text, line) {
			t.Errorf("expected %s, got %s", text, line)
		}
	}
	err := s.Err()
	if err != nil {
		t.Fatalf("unexpected error; got %s", err)
	}
}

func TestSegmentLongTokensDontDouble(t *testing.T) {
	// Read the data.
	text := bytes.Repeat([]byte("abcdefghijklmnop"), 257)
	buf := strings.NewReader(string(text))
	s := NewSegmenter(&slowReader{1, buf})
	// change to line segmenter for testing
	s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
	s.MaxTokenSize(6144)
	for s.Segment() {
		line := s.Bytes()
		if !bytes.Equal(text, line) {
			t.Errorf("expected %s, got %s", text, line)
		}
	}
	err := s.Err()
	if err != nil {
		t.Fatalf("unexpected error; got %s", err)
	}
}