-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgap.cc
171 lines (154 loc) · 4.73 KB
/
gap.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
//
#include <stdio.h>
#include <stdlib.h>
// Default n-gram order: gap's constructor stores this value in gap::ngramOrder.
#define MAX_NGRAM_ORDER 10
/// Hash functor for n-grams stored as vectors of word ids.
///
/// Every position of the n-gram is multiplied by a pseudo-random 32-bit
/// coefficient and the products are summed. The coefficients are drawn from a
/// std::mt19937 seeded with a fixed value, so all instances created by the
/// same build of the program hash a given n-gram to the same value.
class ngram_hash
{
private:
    int ngramOrder;                        // number of coefficients generated
    int seed;                              // fixed seed for the generator
    std::vector<uint32_t> ngram_hash_rnd;  // one coefficient per position
public:
    /// Returns the hash of `x`. An empty n-gram hashes to 0.
    ///
    /// N-grams longer than the coefficient table reuse the coefficients
    /// cyclically instead of reading past the end of the table.
    std::size_t operator()(const std::vector<int>& x) const
    {
        const std::size_t ncoef = ngram_hash_rnd.size();
        std::size_t h = 0;
        for (std::size_t i = x.size(); i-- > 0; ) {
            // The product is taken in 32-bit unsigned arithmetic (wrapping),
            // then accumulated in an unsigned type so overflow is well defined.
            const uint32_t term = ngram_hash_rnd[i % ncoef] * static_cast<uint32_t>(x[i]);
            h += term;
        }
        return h;
    }
    /// Generates `ngramOrder` coefficients from a generator seeded with `seed`.
    ngram_hash():ngramOrder(10), seed(10){
        std::mt19937 rng;
        rng.seed(seed);
        std::uniform_int_distribution<uint32_t> uint_dist;
        ngram_hash_rnd.reserve(ngramOrder);
        for (int i = 0; i < ngramOrder; i++) {
            ngram_hash_rnd.push_back(uint_dist(rng));
        }
    }
};
/// Equality predicate for n-gram keys: two n-grams compare equal when they
/// have the same length and the same word id at every position.
class ngram_equal_to
{
public:
    bool operator()(const std::vector<int>& a, const std::vector<int>& b) const
    {
        // Vector equality checks the sizes first, then compares element-wise.
        return a == b;
    }
};
/// Reads an n-gram language model, line by line, from an input stream and
/// keeps a word <-> integer id vocabulary plus two tables keyed by n-grams
/// (vectors of word ids).
class gap{
public:
    gap();
    ~gap();

    /// Consumes the input stream until end of input, handing n-gram entry
    /// lines to add_line(), then prints summary statistics to stdout.
    void read_lm();

    /// Parses one tab-separated n-gram entry line and maps its words to ids.
    void add_line(std::string& s);

    /// Returns the id of `s`, adding `s` to the vocabulary if it is unknown.
    int get_id(std::string &s);

    /// Appends `s` to the vocabulary and returns the id assigned to it.
    int add_string(std::string &s);

    /// Returns the word stored for id `i`; an out-of-range id is reported on
    /// stderr.
    std::string get_string(int i);

    /// Splits `source` on any character of `delimiter`, dropping empty tokens.
    inline std::vector<std::string> tokenize(const std::string &source, const char *delimiter);

private:
    // Table type shared by the log-probability and back-off maps.
    typedef std::unordered_map<std::vector<int>, double, ngram_hash, ngram_equal_to> ngram_table;

    std::istream* inputSource;     // stream the model is read from (not owned)
    int ngramOrder;
    int seed;
    ngram_table logPHash;          // n-gram -> log probability
    ngram_table logBHash;          // n-gram -> back-off weight
    std::unordered_map<std::string, int> voc;  // word -> id
    std::vector<std::string> cov;              // id -> word (inverse of voc)
};
/// Creates a loader that reads from standard input, with ngramOrder set to
/// MAX_NGRAM_ORDER and seed set to 0. The tables and vocabulary start empty.
gap::gap()
    : inputSource(&std::cin),
      ngramOrder(MAX_NGRAM_ORDER),
      seed(0)
{
}
/// Nothing to release explicitly: the containers clean up after themselves
/// and inputSource is only a non-owning pointer.
gap::~gap() = default;
/// Registers `s` in the vocabulary and returns the id given to it.
///
/// The id is the number of entries in `voc` before the insertion. No check is
/// made for `s` already being present; get_id() is the lookup-or-insert entry
/// point.
int gap::add_string(std::string &s){
    const int new_id = voc.size();
    cov.push_back(s);   // id -> word
    voc[s] = new_id;    // word -> id
    return new_id;
}
/// Returns the vocabulary id of `s`, inserting `s` through add_string() when
/// it has not been seen before.
int gap::get_id(std::string &s){
    // Keep the iterator so a known word costs a single hash lookup.
    const std::unordered_map<std::string, int>::const_iterator hit = voc.find(s);
    if (hit != voc.end()) return hit->second;
    return add_string(s);
}
/// Returns the word whose vocabulary id is `i`.
///
/// If `i` is negative or not smaller than the vocabulary size, an error is
/// written to stderr and an empty string is returned.
std::string gap::get_string(int i){
    if (i >= 0 && static_cast<std::size_t>(i) < cov.size()) return cov[i];
    std::cerr << "Index missmatch\n";
    // Every path must return a value: flowing off the end of a non-void
    // function is undefined behaviour.
    return std::string();
}
/// Splits `source` into tokens.
///
/// Every character in `delimiter` acts as a separator. Runs of separators are
/// collapsed, so no empty tokens are produced, and leading or trailing
/// separators are ignored. `delimiter` defaults to a single space.
std::vector<std::string> inline gap::tokenize(const std::string &source, const char *delimiter = " "){
    std::vector<std::string> tokens;
    // `begin` always points at the first character of the next token.
    std::string::size_type begin = source.find_first_not_of(delimiter, 0);
    while (begin != std::string::npos) {
        const std::string::size_type end = source.find_first_of(delimiter, begin);
        if (end == std::string::npos) {
            // Last token runs to the end of the string.
            tokens.push_back(source.substr(begin));
            break;
        }
        tokens.push_back(source.substr(begin, end - begin));
        begin = source.find_first_not_of(delimiter, end + 1);
    }
    return tokens;
}
void gap::add_line(std::string& s){
std::vector<std::string> results = tokenize(s, "\t");
bool back_off = false;
double logP = -99, logB = -99;
// std::cout << s << std::endl;
logP = atof(results[0].c_str());
if (results.size() == 3) {
logB = atof(results[results.size() - 1].c_str());
back_off = true;
}
// printf("logB:%lf logP:%lf\n", logB, logP);
results = tokenize(results[1], " ");
std::vector<int> ng;
for(int i = 0 ; i < results.size() ; i++){
ng.push_back(get_id(results[i]));
// std::cout << "w:" << results[i] << " ng:" << ng[i] << std::endl;
}
return;
logPHash[ng] = logP;
if(back_off) logBHash[ng] = logB;
}
/// Reads the language model from `inputSource` until end of input.
///
/// Line handling, in order of precedence:
///   - empty lines are skipped;
///   - the line `\data\` marks the header section;
///   - any other line starting with a backslash is treated as a section
///     marker of the form `\N-grams:` and announced on stdout;
///   - inside the header, lines starting with 'n' (`ngram N=COUNT`) are
///     parsed but their values are not used;
///   - everything else is passed to add_line().
///
/// Afterwards the vocabulary size and, for each table, the ratio
/// bucket_count / size are printed to stdout.
void gap::read_lm(){
    // 0: nothing seen yet, 1: inside the \data\ header,
    // 2: unigram section, 3: any other backslash-introduced section.
    int section = 0;
    std::string line;

    while (std::getline(*inputSource, line)) {
        // std::getline strips the '\n' delimiter, so the second test is
        // normally redundant; it is kept as a safeguard.
        if (line.empty() || line[0] == '\n') {
            continue;
        }

        if (line == "\\data\\") {
            section = 1;
            continue;
        }

        if (line[0] == '\\') {
            // If the marker does not match the pattern, norder stays 0.
            int norder = 0;
            sscanf(line.c_str(), "\\%d-grams:", &norder);
            section = (norder == 1) ? 2 : 3;
            printf("##Reading %d-grams\n", norder);
            continue;
        }

        if (section == 1 && line[0] == 'n') {
            // Header count line; the parsed numbers are currently unused.
            int norder = 0, ncount = 0;
            sscanf(line.c_str(), "ngram %d=%d", &norder, &ncount);
            continue;
        }

        add_line(line);
    }

    std::cout << "##Vocabulary:" << voc.size() << std::endl;
    const double logp_ratio = 1.0*logPHash.bucket_count()/logPHash.size();
    std::cout << "LOGP" << logp_ratio << std::endl;
    const double backoff_ratio = 1.0*logBHash.bucket_count()/logBHash.size();
    std::cout << "BACK_OFF" << backoff_ratio << std::endl;
}
int main(){
std::cout << "##Reading LM" << std::endl;
gap g;
g.read_lm();
return 0;
}