test_text_processing.py
import unittest
from nltk import word_tokenize
from scripts.text_processing import StemTokenizer, WordAnalyzer, \
    lowercase_strip_accents_and_ownership, LemmaTokenizer


# Sample abstracts taken from the USPTO Bulk Download Service: https://bulkdata.uspto.gov
# Data used was downloaded from "Patent Grant Full Text Data"
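

# StemTokenizer is expected to reduce each word to its stem; note that 'frozen'
# is not folded back to 'freez', since stemming only strips suffixes rather than
# resolving irregular forms.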
class TestStematizer(unittest.TestCase):
    def test_stematizer(self):
        words = ['freezing', 'frozen', 'freeze', 'reading']
        stematizer = StemTokenizer()
        expected_words = ['freez', 'frozen', 'freez', 'read']
        actual_words = [stematizer(word)[0] for word in words]
        self.assertListEqual(expected_words, actual_words)
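

# lowercase_strip_accents_and_ownership should lower-case its input, map accented
# characters to their plain ASCII equivalents and drop possessive "'s" endings.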
class TestLowercaseStripAccentsAndOwnership(unittest.TestCase):
    def test_lowercase(self):
        doc = 'Test ABCdefGH IJ. Again'
        expected = 'test abcdefgh ij. again'
        actual = lowercase_strip_accents_and_ownership(doc)
        self.assertEqual(expected, actual)

    def test_accented(self):
        doc = 'Test type âêîôûŵŷ, äëïöüẅÿ àèìòùẁỳ OR áéíóúẃý, hold. Again'
        expected = 'test type aeiouwy, aeiouwy aeiouwy or aeiouwy, hold. again'
        actual = lowercase_strip_accents_and_ownership(doc)
        self.assertEqual(expected, actual)

    def test_ownership(self):
        doc = "Ian's simple test"
        expected = 'ian simple test'
        actual = lowercase_strip_accents_and_ownership(doc)
        self.assertEqual(expected, actual)
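

# WordAnalyzer is configured through its class-level init() and then exercised via
# WordAnalyzer.analyzer(); these tests use NLTK's word_tokenize as the tokenizer
# unless a test states otherwise.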
class TestWordAnalyzer(unittest.TestCase):
    def setUp(self):
        self.word_tokenizer = word_tokenize
        self.preprocess = lowercase_strip_accents_and_ownership

    def test_preprocess(self):
        actual = self.preprocess('Hello there')
        self.assertEqual('hello there', actual)
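
    # The unigram tests below expect stop words and stand-alone punctuation to be
    # discarded while hyphenated tokens such as 'except-hyphens' are kept whole.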
    def test_WordAnalyser_unigrams_no_stopwords(self):
        ngram_range = (1, 1)
        WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)
        doc = 'test words'
        expected_ngrams = ['test', 'words']
        actual_ngrams = WordAnalyzer.analyzer(doc)
        self.assertListEqual(expected_ngrams, actual_ngrams)

    def test_WordAnalyser_unigrams_with_stopwords(self):
        ngram_range = (1, 1)
        WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)
        doc = 'Some test words to ignore safely'
        expected_ngrams = ['test', 'words', 'ignore', 'safely']
        actual_ngrams = WordAnalyzer.analyzer(doc)
        self.assertListEqual(expected_ngrams, actual_ngrams)

    def test_WordAnalyser_unigrams_with_punctuation(self):
        ngram_range = (1, 1)
        WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)
        doc = "Some test words, to ignore except-hyphens but including someone's ownership"
        expected_ngrams = ['test', 'words', 'ignore', 'except-hyphens', 'ownership']
        actual_ngrams = WordAnalyzer.analyzer(doc)
        self.assertListEqual(expected_ngrams, actual_ngrams)
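
    # Bigrams and trigrams are only expected from adjacent non-stop tokens, so no
    # n-gram in the expected list spans the comma, 'but' or the possessive.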
    def test_WordAnalyser_ngrams_dont_cross_punctuation_or_stop_words(self):
        ngram_range = (1, 3)
        WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)
        doc = "Some test words, except-hyphens metal but someone's metal fish bucket"
        expected_ngrams = [
'test',
'words',
'except-hyphens',
'metal',
'metal',
'fish',
'bucket',
'test words',
'except-hyphens metal',
'metal metal',
'metal fish',
'fish bucket',
'except-hyphens metal metal',
'metal metal fish',
'metal fish bucket'
        ]
        actual_ngrams = WordAnalyzer.analyzer(doc)
        self.assertListEqual(expected_ngrams, actual_ngrams)
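
    # Full-pipeline check on a USPTO patent abstract: with LemmaTokenizer each token
    # is lemmatised, and the analyzer emits every 1- to 3-gram that does not cross a
    # stop word or punctuation in the abstract.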
    def test_WordAnalyser_ngrams(self):
        ngram_range = (1, 3)
        WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=self.preprocess, ngram_range=ngram_range)
        doc = "Conductive structures in features of an insulator layer on a substrate are fabricated by a particular " \
"process. In this process, a layer of conductive material is applied over the insulator layer so that " \
"the layer of conductive material covers field regions adjacent the features and fills in the features " \
"themselves. A grain size differential between the conductive material which covers the field regions " \
"and the conductive material which fills in the feature is then established by annealing the layer of " \
"conductive material. Excess conductive material is then removed to uncover the field regions and leave " \
"the conductive structures. The layer of conductive material is applied so as to define a first layer " \
"thickness over the field regions and a second layer thickness in and over the features. These " \
"thicknesses are dimensioned such that d 1 ≦0.5d 2 , with d 1 being the first layer thickness and d 2 " \
"being the second layer thickness. Preferably, the first and second layer thicknesses are dimensioned " \
"such that d 1 ≦0.3d 2 . "
        expected_ngrams = [
'conductive',
'structure',
'feature',
'insulator',
'layer',
'substrate',
'fabricate',
'particular',
'process',
'process',
'layer',
'conductive',
'material',
'apply',
'insulator',
'layer',
'layer',
'conductive',
'material',
'field',
'region',
'feature',
'fill',
'feature',
'themselves',
'grain',
'differential',
'conductive',
'material',
'field',
'region',
'conductive',
'material',
'fill',
'feature',
'establish',
'anneal',
'layer',
'conductive',
'material',
'conductive',
'material',
'remove',
'uncover',
'field',
'region',
'leave',
'conductive',
'structure',
'layer',
'conductive',
'material',
'apply',
'define',
'first',
'layer',
'thickness',
'field',
'region',
'second',
'layer',
'thickness',
'feature',
'thickness',
'dimension',
'0.5d',
'first',
'layer',
'thickness',
'second',
'layer',
'thickness',
'preferably',
'first',
'second',
'layer',
'thickness',
'dimension',
'0.3d',
'conductive structure',
'structure feature',
'feature insulator',
'insulator layer',
'layer substrate',
'substrate fabricate',
'fabricate particular',
'particular process',
'layer conductive',
'conductive material',
'material apply',
'apply insulator',
'insulator layer',
'layer layer',
'layer conductive',
'conductive material',
'material cover',
'cover field',
'field region',
'region adjacent',
'adjacent feature',
'feature fill',
'fill feature',
'feature themselves',
'grain differential',
'differential conductive',
'conductive material',
'material cover',
'cover field',
'field region',
'region conductive',
'conductive material',
'material fill',
'fill feature',
'feature establish',
'establish anneal',
'anneal layer',
'layer conductive',
'conductive material',
'conductive material',
'material remove',
'remove uncover',
'uncover field',
'field region',
'region leave',
'leave conductive',
'conductive structure',
'layer conductive',
'conductive material',
'material apply',
'apply define',
'define first',
'first layer',
'layer thickness',
'thickness field',
'field region',
'region second',
'second layer',
'layer thickness',
'thickness feature',
'thickness dimension',
'first layer',
'layer thickness',
'second layer',
'layer thickness',
'first second',
'second layer',
'layer thickness',
'thickness dimension',
'conductive structure feature',
'structure feature insulator',
'feature insulator layer',
'insulator layer substrate',
'layer substrate fabricate',
'substrate fabricate particular',
'fabricate particular process',
'layer conductive material',
'conductive material apply',
'material apply insulator',
'apply insulator layer',
'insulator layer layer',
'layer layer conductive',
'layer conductive material',
'conductive material cover',
'material cover field',
'cover field region',
'field region adjacent',
'region adjacent feature',
'adjacent feature fill',
'feature fill feature',
'fill feature themselves',
'grain differential conductive',
'differential conductive material',
'conductive material cover',
'material cover field',
'cover field region',
'field region conductive',
'region conductive material',
'conductive material fill',
'material fill feature',
'fill feature establish',
'feature establish anneal',
'establish anneal layer',
'anneal layer conductive',
'layer conductive material',
'conductive material remove',
'material remove uncover',
'remove uncover field',
'uncover field region',
'field region leave',
'region leave conductive',
'leave conductive structure',
'layer conductive material',
'conductive material apply',
'material apply define',
'apply define first',
'define first layer',
'first layer thickness',
'layer thickness field',
'thickness field region',
'field region second',
'region second layer',
'second layer thickness',
'layer thickness feature',
'first layer thickness',
'second layer thickness',
'first second layer',
'second layer thickness',
'layer thickness dimension'
        ]
        actual_ngrams = WordAnalyzer.analyzer(doc)
        self.assertListEqual(expected_ngrams, actual_ngrams)