-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
executable file
·61 lines (50 loc) · 2.21 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
"""A script that evaluates the wellformedness of interrogative expressions based on a bigram language model.
The language model is based on caregivers' utterances extracted from the CHILDES corpus."""
from nltk.lm import KneserNeyInterpolated
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.util import bigrams
# Input file names; relative paths are resolved against the current working directory.
# Training text: main() fits the bigram language model on the lines of this file.
train_corpus = "normalized_data.txt"
# Test sentences: main() compares the entries of these two files position by position.
grammatical = "Bigrams_Richness_Experiment_1_grammatical.txt"
ungrammatical = "Bigrams_Richness_Experiment_1_ungrammatical.txt"
def get_data(path: str) -> list:
    """Read a text file into a list of stripped, non-empty lines.

    Args:
        path: Path of the text file to read (decoded as UTF-8).

    Returns:
        The file's lines in order, each with leading and trailing whitespace
        removed. Lines that are empty or contain only whitespace are omitted.
    """
    with open(path, "r", encoding="utf-8") as source:
        # Strip before testing for emptiness: a raw line read from a file still
        # carries its newline, so a blank line is only falsy once it is stripped.
        stripped_lines = (line.strip() for line in source)
        return [line for line in stripped_lines if line]
def bigrammize(path: str) -> list:
    """Turn every line of a text file into a padded bigram sequence.

    Each line returned by ``get_data`` loses its final character, is
    casefolded, split on whitespace, padded at both ends for an order-2 model
    and converted into a list of bigram tuples.

    Returns a list holding one bigram list per line, in file order.
    """

    def to_bigrams(utterance: str) -> list:
        # NOTE(review): the final character is dropped unconditionally --
        # presumably sentence-final punctuation; confirm against the data files.
        tokens = utterance[:-1].casefold().split()
        return list(bigrams(pad_both_ends(tokens, n=2)))

    return [to_bigrams(utterance) for utterance in get_data(path)]
def main() -> None:
    """Fit a bigram language model and print its accuracy on sentence pairs.

    Trains a Kneser-Ney interpolated bigram model on the lines of
    ``train_corpus``, then compares each entry of ``grammatical`` with the
    entry of ``ungrammatical`` at the same position. A pair counts as
    incorrect when the model assigns strictly higher entropy to the
    grammatical member; ties count as correct. The resulting accuracy is
    printed to standard output.
    """
    corpus_data = get_data(train_corpus)
    split_corpus = [utterance.split() for utterance in corpus_data]
    # Lazy iterators over the padded training n-grams and the vocabulary.
    train, vocab = padded_everygram_pipeline(2, split_corpus)
    lm = KneserNeyInterpolated(order=2)
    lm.fit(train, vocab)

    bigrammized_grammatical = bigrammize(grammatical)
    bigrammized_ungrammatical = bigrammize(ungrammatical)

    # zip pairs the sentences by position and covers every pair, including the
    # last one; if the files differ in length the surplus entries are ignored.
    pairs = list(zip(bigrammized_grammatical, bigrammized_ungrammatical))
    incorrect = sum(
        lm.entropy(well_formed) > lm.entropy(ill_formed)
        for well_formed, ill_formed in pairs
    )
    # Normalise by the number of pairs actually compared rather than assuming
    # a test set of exactly 100 sentences; an empty test set reports 0.
    accuracy = (len(pairs) - incorrect) / len(pairs) if pairs else 0.0
    print(f"The accuracy score: {accuracy:.2%} ")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()