# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
# data from https://github.com/aritter/twitter_nlp/blob/master/data/annotated/ner.txt
# data2 from http://schwa.org/projects/resources/wiki/Wikiner#WikiGold
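#
# Expected format of ner.txt, inferred from how get_data() parses it below
# (the tokens/tags in this sketch are illustrative, not copied from the
# dataset): one whitespace-separated "word tag" pair per line, with blank
# lines marking sentence boundaries, e.g.
#
#   Empire B-facility
#   State I-facility
#   Building I-facility
#   tonight O
#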
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
from sklearn.utils import shuffle
from pos_baseline import LogisticRegression
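
# NOTE (assumption): LogisticRegression is the classifier defined in
# pos_baseline.py. Based on how it is called in main() below, it is assumed
# to expose fit(X, Y, V=vocab_size, K=num_classes, epochs=...), score(X, Y),
# and f1_score(X, Y).
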
def get_data(split_sequences=False):
    """Load the Twitter NER data from ner.txt.

    Returns (Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx), where words
    and tags are integer-encoded. If split_sequences is True, X and Y are
    lists of per-sentence index lists; otherwise they are flat lists.
    """
    word2idx = {}
    tag2idx = {}
    word_idx = 0
    tag_idx = 0
    Xtrain = []
    Ytrain = []
    currentX = []
    currentY = []
    for line in open('ner.txt'):
        line = line.rstrip()
        if line:
            # each non-blank line is a "word tag" pair
            word, tag = line.split()
            word = word.lower()
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            currentX.append(word2idx[word])
            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            currentY.append(tag2idx[tag])
        elif split_sequences:
            # blank line -> end of sentence
            Xtrain.append(currentX)
            Ytrain.append(currentY)
            currentX = []
            currentY = []
    if split_sequences and currentX:
        # keep the final sentence if the file does not end with a blank line
        Xtrain.append(currentX)
        Ytrain.append(currentY)
    if not split_sequences:
        Xtrain = currentX
        Ytrain = currentY
    print("number of samples:", len(Xtrain))

    # shuffle, then hold out 30% of the data for testing
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ntest = int(0.3*len(Xtrain))
    Xtest = Xtrain[:Ntest]
    Ytest = Ytrain[:Ntest]
    Xtrain = Xtrain[Ntest:]
    Ytrain = Ytrain[Ntest:]
    print("number of classes:", len(tag2idx))
    return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx
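
# Example of the default (flat) encoding, with hypothetical index values:
#   Xtrain = [12, 7, 0, ...]   # word indices into word2idx
#   Ytrain = [0, 0, 3, ...]    # tag indices into tag2idx
# With split_sequences=True, X and Y are instead lists of per-sentence lists,
# e.g. Xtrain = [[12, 7], [0, 5, 9], ...].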

# Alternative loader for the WikiNER/WikiGold data (aij-wikiner-en-wp3),
# where each line is one full sentence of space-separated word|POS|tag
# triples.
# def get_data2(split_sequences=False):
#     word2idx = {}
#     tag2idx = {}
#     word_idx = 0
#     tag_idx = 0
#     Xtrain = []
#     Ytrain = []
#     for line in open('../large_files/aij-wikiner-en-wp3'):
#         # each line is a full sentence
#         currentX = []
#         currentY = []
#         line = line.rstrip()
#         if not line:
#             continue
#         triples = line.split()
#         for triple in triples:
#             word, _, tag = triple.split('|')
#             if word not in word2idx:
#                 word2idx[word] = word_idx
#                 word_idx += 1
#             currentX.append(word2idx[word])
#             if tag not in tag2idx:
#                 tag2idx[tag] = tag_idx
#                 tag_idx += 1
#             currentY.append(tag2idx[tag])
#         Xtrain.append(currentX)
#         Ytrain.append(currentY)
#     if not split_sequences:
#         Xtrain = np.concatenate(Xtrain)
#         Ytrain = np.concatenate(Ytrain)
#     print("number of samples:", len(Xtrain))
#     Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
#     Ntest = int(0.3*len(Xtrain))
#     Xtest = Xtrain[:Ntest]
#     Ytest = Ytrain[:Ntest]
#     Xtrain = Xtrain[Ntest:]
#     Ytrain = Ytrain[Ntest:]
#     print("number of classes:", len(tag2idx))
#     return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx


def main():
    Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data()

    V = len(word2idx)
    print("vocabulary size:", V)
    K = len(tag2idx)

    # train and score
    model = LogisticRegression()
    model.fit(Xtrain, Ytrain, V=V, K=K, epochs=500)
    print("training complete")
    print("train score:", model.score(Xtrain, Ytrain))
    print("train f1 score:", model.f1_score(Xtrain, Ytrain))
    print("test score:", model.score(Xtest, Ytest))
    print("test f1 score:", model.f1_score(Xtest, Ytest))


if __name__ == '__main__':
    main()