-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathstem_test.go
108 lines (86 loc) · 3.09 KB
/
stem_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package snowball
import (
"os"
"io"
"fmt"
"bufio"
"testing"
"regexp"
)
var (
	// WordStemLineRx parses one line of a word/stem fixture file.
	//
	// Capture group 1 ("word") is the first whitespace-free token and group 2
	// ("stem") is the second; anything after the stem is ignored. At least one
	// whitespace character is required between the two tokens, so a line that
	// holds a single token does not match instead of being split in the
	// middle of that token.
	WordStemLineRx = regexp.MustCompile(`\A(?P<word>\S+)\s+(?P<stem>\S+).*\z`)
)
// wordStem is a single fixture entry: an input word paired with the stem the
// test expects the stemmer to produce for it.
type wordStem struct {
	word, expectedStem string
}
// testLanguage checks the stemmer created for language and encoding against
// the word/stem pairs listed in filename.
//
// Every line of the file must match WordStemLineRx (a word followed by its
// expected stem). Each word is passed to the stemmer; a stemming error or a
// mismatch is reported with t.Error/t.Errorf and the loop continues, so one
// run reports every failing word. Failing to create the stemmer, to open the
// file, or to read/parse a line aborts the test with t.Fatal/t.Fatalf.
func testLanguage(t *testing.T, language string, encoding string, filename string) {
	// Create stemmer
	stemmer, err := NewWordStemmer(language, encoding)
	if err != nil {
		t.Fatalf("Cannot create stemmer: %s", err)
	}
	defer stemmer.Close()

	// Open test data file
	f, err := os.Open(filename)
	if err != nil {
		t.Fatalf("Cannot open word stems file: %s", err)
	}
	// Release the descriptor on every exit path; deferred calls also run when
	// t.Fatal stops the test.
	defer f.Close()

	r := bufio.NewReader(f)

	// readErr holds whatever ended the loop: io.EOF on a clean end of file,
	// or a read/format error that must fail the test.
	var readErr error
	for {
		var (
			line     []byte
			isPrefix bool
		)
		line, isPrefix, readErr = r.ReadLine()
		if readErr != nil {
			break
		}
		// ReadLine sets isPrefix when the line did not fit into the reader's
		// buffer; parsing such a fragment would silently produce bogus pairs.
		if isPrefix {
			readErr = fmt.Errorf("Line too long in file '%s'", filename)
			break
		}

		// Extract data using regexp pattern:
		// 0 is the whole line, 1 is 'word', 2 is 'stem'
		matches := WordStemLineRx.FindStringSubmatch(string(line))
		if len(matches) != 3 {
			readErr = fmt.Errorf("Incorrect line in file '%s':'%s'", filename, line)
			break
		}
		testItem := wordStem{word: matches[1], expectedStem: matches[2]}

		// Stem the word using stemmer
		stm, err := stemmer.Stem([]byte(testItem.word))
		if err != nil {
			t.Error(err)
			continue
		}

		// Compare stemmer result and expected result from file. The message
		// reports the encoding under test, not the package default.
		if string(stm) != testItem.expectedStem {
			t.Errorf("Language: '%s' (%s) Word: '%s' Expected stem: '%s' Got stem: '%s'",
				language,
				encoding,
				testItem.word,
				testItem.expectedStem,
				stm)
		}
	}

	// io.EOF is the normal way out of the loop; anything else is a failure.
	if readErr != nil && readErr != io.EOF {
		t.Fatal(readErr)
	}
}
// TestUTF8 runs testLanguage with DefaultEncoding (presumably UTF-8, given
// the test name) for each language fixture listed below, in order.
func TestUTF8(t *testing.T) {
	fixtures := []struct {
		language string
		filename string
	}{
		{"russian", "test/rus_test.txt"},
		{"danish", "test/danish_test.txt"},
		{"dutch", "test/dutch_test.txt"},
		{"english", "test/english_test.txt"},
		{"finnish", "test/finnish_test.txt"},
		{"french", "test/french_test.txt"},
		{"german", "test/german_test.txt"},
		{"hungarian", "test/hungarian_test.txt"},
		{"italian", "test/italian_test.txt"},
		{"norwegian", "test/norwegian_test.txt"},
		{"portuguese", "test/portuguese_test.txt"},
		{"romanian", "test/romanian_test.txt"},
		{"spanish", "test/spanish_test.txt"},
		{"swedish", "test/swedish_test.txt"},
	}

	for _, fx := range fixtures {
		testLanguage(t, fx.language, DefaultEncoding, fx.filename)
	}
}
// TestKOI8R runs testLanguage for the Russian stemmer with the "KOI8_R"
// encoding name against the test/rus_koi8r_test.txt fixture.
func TestKOI8R(t *testing.T) {
	const (
		language = "russian"
		encoding = "KOI8_R"
		fixture  = "test/rus_koi8r_test.txt"
	)

	testLanguage(t, language, encoding, fixture)
}