-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtfidf_cosine.cpp
executable file
·376 lines (318 loc) · 13.7 KB
/
tfidf_cosine.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
#include <vector>
#include <map>
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <assert.h>
#include <iostream>
#include <sstream>
#include <fstream>
#include <numeric>
#include <Eigen/Core>
#include <Eigen/Dense>
#include <tclap/CmdLine.h>
using namespace std;
using namespace Eigen;
using namespace TCLAP;
// pair comparer copied from shaofeng mo
bool pairCmpDescend( const pair<int,double>& p1, const pair<int,double>& p2){
if( p1.second - p2.second > 0.00000000001 ){
return true;
}else{
return false;
}
}
bool pairCmpAscend( const pair<int,double>& p1, const pair<int,double>& p2){
if( p1.second - p2.second < 0.00000000001 ){
return true;
}else{
return false;
}
}
struct Document {
vector<int> word_ind;
vector<int> word_count;
};
Document* get_str2doc(string str)
{
// create Document struct from one-line str, return pointer
Document * doc = new Document;
//NOTE: istringstream is ambiguous with tclap; compiling error w/o std scoping
std::istringstream iss(str);
int w_i, w_c;
while (iss >> w_i >> w_c)
{
//NOTE: convert 1-based word and doc index to be internally 0-based
w_i -= 1;
doc->word_ind.push_back(w_i);
doc->word_count.push_back(w_c);
}
/*for (int i=0; i<doc->word_ind.size(); i++)
{
cerr << "w_i:" << doc->word_ind[i] << "\tw_c:" << doc->word_count[i] << endl;
}*/
return doc;
}
void read_vocab(string file, vector<string> & vocabs)
{
clock_t t_b4_read = clock();
ifstream ifs(file.c_str());
for (string line ; getline(ifs, line); )
{
vocabs.push_back(line);
}
cerr << "done reading vocab file, time spent: " << float(clock() - t_b4_read)/CLOCKS_PER_SEC << " secs" << endl << endl;
}
Document** read_docword(string file, int & D, int & W, int & C)
{
clock_t t_b4_read = clock();
Document** documents;
ifstream ifs(file.c_str());
int line_count = 0;
for (string line ; getline(ifs, line); )
{
//cerr << "line*"<<line<<"*\n";
line_count += 1;
if (line_count % 100000 == 0) cerr << "reading " << line_count << " lines" << endl;
if (line_count == 1)
{
D = atoi(line.c_str());
documents = new Document* [D];
for (int counter = 0; counter < D; counter++)
{
documents[counter] = new Document;
}
}
else if (line_count == 2)
{
W = atoi(line.c_str());
}
else if (line_count != 3)
{
//NOTE: istringstream is ambiguous with tclap; compiling error w/o std scoping
std::istringstream iss(line);
int d_j, w_i, w_c;
iss >> d_j >> w_i >> w_c;
C += w_c;
//NOTE: convert 1-based word and doc index to be internally 0-based
d_j -= 1;
w_i -= 1;
//cerr << d_j << " " << w_i << " " << w_c << endl;
documents[d_j]->word_ind.push_back(w_i);
documents[d_j]->word_count.push_back(w_c);
}
}
cerr << "D:" << D << "\tW:" << W << "\tC:" << C << endl;
cerr << "done reading docword file, time spent: " << float(clock() - t_b4_read)/CLOCKS_PER_SEC << " secs" << endl << endl;
return documents;
}
void get_tfidf_test(MatrixXd & mat_tfidf_test,
Document * document,
map<int, double> & word_ind2idf,
const int & W)
{
// return 1 by W tfidf matrix for test doc
mat_tfidf_test = MatrixXd::Constant(1, W, 0);
// compute doc length
double C_j = 0.0;
for (int i=0;i<document->word_count.size();i++)
{
C_j += document->word_count[i];
}
// populate mat_tfidf_test
for (int i=0; i<document->word_ind.size(); i++)
{
int w_i = document->word_ind[i];
mat_tfidf_test(0, w_i) = ( document->word_count[i] / C_j )* word_ind2idf[w_i];
}
//cerr << "tfidf mat for test" << endl << mat_tfidf_test.row(0) << endl;
}
void get_tfidf_train(MatrixXd & mat_tfidf_train,
Document ** documents,
map<int, double> & word_ind2idf,
const int & D,
const int & W)
{
// this function has two purposes:
// 1. return D by W tfidf matrix for train docs
// 2. return word_ind2idf hashmap for test doc lookup
mat_tfidf_train = MatrixXd::Constant(D, W, 0);
//NOTE: having total vocab len W, further assume that
for (int w_i=0; w_i < W; w_i++)
{
word_ind2idf[w_i] = 0;
}
// accumulate doc counts
for (int j=0; j<D; j++)
{
for (int i=0; i<documents[j]->word_ind.size(); i++)
{
int w_i = documents[j]->word_ind[i];
word_ind2idf[w_i] += 1;
}
}
// convert to idf
for (int w_i=0; w_i < W; w_i++)
{
word_ind2idf[w_i] = log10( D * 1.0 / word_ind2idf[w_i] );
}
// populate mat_tfidf_train
for (int j=0; j<D; j++)
{
// compute doc length
double C_j = 0.0;
for (int i=0;i<documents[j]->word_count.size();i++)
{
C_j += documents[j]->word_count[i];
}
for (int i=0; i<documents[j]->word_ind.size(); i++)
{
int w_i = documents[j]->word_ind[i];
mat_tfidf_train(j, w_i) = ( documents[j]->word_count[i] / C_j ) * word_ind2idf[w_i];
}
//cerr << "tfidf mat for train " << j << endl << mat_tfidf_train.row(j) << endl;
}
}
void get_similar_by_cosine_similarity(vector< pair<int, double> > & pair_docind_score,
const MatrixXd & mat_tfidf_train,
const MatrixXd & mat_tfidf_test,
const int & D,
const int & W)
{
// given testdoc, rank training doc by cosine similarity: higher cosine similarity, higher rank
double norm_testvec = mat_tfidf_test.row(0).norm();
//TODO: how to handle empty doc properly?
// empty doc, to avoid dividing by 0
norm_testvec = (norm_testvec == 0.0 ? 1.0 : norm_testvec);
for (int j=0; j<D; j++)
{
double norm_trainvec = mat_tfidf_train.row(j).norm();
// empty doc, to avoid dividing by 0
norm_trainvec = (norm_trainvec == 0.0 ? 1.0 : norm_trainvec);
double dist = mat_tfidf_train.row(j).dot( mat_tfidf_test.row(0) ) / norm_testvec / norm_trainvec;
pair_docind_score.push_back( pair<int, double> (j, dist) );
}
sort( pair_docind_score.begin() , pair_docind_score.end() , pairCmpDescend );
}
int main( int argc, // Number of strings in array argv
char *argv[], // Array of command-line argument strings
char *envp[] ) // Array of environment variable strings
{
//speed up c++ io stream
std::ios::sync_with_stdio(false);
try {
CmdLine cmd("Command description message", ' ', "0.2");
ValueArg<string> docwordInFileArg("d","docwordfile","Path to docword file for training, in uci sparse bag-of-words format, where each line is a (doc id, word id, word count) triple delimited by space. Both doc id and word id are 1-based. Word id refers to the corresponding line in the vocab file", true, "","string");
cmd.add( docwordInFileArg );
//TODO: output vocab for better visualization and sanity check, optional
ValueArg<string> vocabInFileArg("v","vocabfile","Path to vocab file associated with docword file for training, in uci sparse bag-of-words format, 1-based", false, "","string");
cmd.add( vocabInFileArg );
ValueArg<string> docwordTestInFileArg("t","docwordtestfile","Path to docword file for similarity testing, in uci sparse bag-of-words format. Output similarity prediction for each file to STDOUT. Note that test file can also be piped from STDIN in a compact oneline format, yet this arg switch is preferred to STDIN when there are empty lines", false, "","string");
cmd.add( docwordTestInFileArg );
ValueArg<int> similarSizeArg("","similarsize","for similarity test, how many top similar docs to report", false, 50,"int");
cmd.add( similarSizeArg );
cmd.parse( argc, argv );
// Get the value parsed by each arg.
const string f_docword = docwordInFileArg.getValue();
const string f_docword_test = docwordTestInFileArg.getValue();
const string f_vocab = vocabInFileArg.getValue();
const int SIMILAR_SIZE = similarSizeArg.getValue();
// report parameters
cerr << "\nParameters:\n";
cerr << "Input train docword file: " << f_docword << endl;
cerr << "Input train vocab file: " << f_vocab << endl;
cerr << "Input test docword file: " << f_docword_test << endl;
cerr << "SIMILAR_SIZE: " << SIMILAR_SIZE << endl;
cerr << endl;
// read vocab file
vector<string> vocabs;
read_vocab(f_vocab, vocabs);
//for (int i=0; i<vocabs.size(); i++) cerr << vocabs[i] << endl;
//cerr << "total vocab " << vocabs.size() << endl;
// read docword file and obtain D, W, C
int D=0, W=0, C=0;
//NOTE: error if we pass this pointer as function arg
//Document** documents = NULL;
Document** documents = read_docword(f_docword, D, W, C);
//TODO: assert to check size consistency of docword file and vocab file
//assert (vocabs.size()==W);
// report empty doc to stdout, convert internal 0-based doc ind to 1-based
int empty_doc_count = 0;
for (int d=0; d<D; d++)
{
if (documents[d]->word_ind.size() == 0)
{
cerr << "empty doc: " << d+1 << endl;
empty_doc_count++;
}
}
cerr << "# of empty doc: " << empty_doc_count << endl << endl;
// tf-idf
// bookkeeping word_ind2idf for test case
map<int, double> word_ind2idf;
// create tfidf matrix for dot product in cosine similarity
MatrixXd mat_tfidf_train;
get_tfidf_train(mat_tfidf_train, documents, word_ind2idf, D, W);
// mode 1: similarity testing for file from arg switch, output to STDOUT
// convert internal 0-based doc ind to 1-based
if (f_docword_test != "")
{
cerr << "Start similarity prediction for test corpus: " << f_docword_test << endl;
int D_test=0, W_test=0, C_test=0;
Document** documents_test = read_docword(f_docword_test, D_test, W_test, C_test);
for (int tj=0; tj<D_test; tj++)
{
Document * testdoc = documents_test[tj];
clock_t t_b4_test = clock();
MatrixXd mat_tfidf_test;
get_tfidf_test(mat_tfidf_test, testdoc, word_ind2idf, W);
// get paired (doc_ind, score) vector with similarity measure
vector< pair<int, double> > pair_docind_score;
get_similar_by_cosine_similarity(pair_docind_score, mat_tfidf_train, mat_tfidf_test, D, W);
// output to STDOUT in one line
int numTopSimilar = (pair_docind_score.size() < SIMILAR_SIZE ? pair_docind_score.size() : SIMILAR_SIZE);
for (int p=0; p<numTopSimilar; p++)
{
cout << pair_docind_score[p].first+1 << ":" << pair_docind_score[p].second;
if (p < numTopSimilar-1) cout << " ";
}
cout << endl;
cerr << "testdoc: " << tj+1 << " time spent: " << float(clock() - t_b4_test)/CLOCKS_PER_SEC << " secs" << endl;
}
}
// mode 2: similarity testing for file from STDIN, output to STDOUT, note: cannot handle blank lines!
// convert internal 0-based doc ind to 1-based
else
{
cerr << "\nReading input from STDIN" << endl;
long counter = 0;
while (cin)
{
string testdoc_line;
getline(cin, testdoc_line);
if (!cin.eof())
{
counter ++;
clock_t t_b4_test = clock();
Document * testdoc = get_str2doc(testdoc_line);
MatrixXd mat_tfidf_test;
get_tfidf_test(mat_tfidf_test, testdoc, word_ind2idf, W);
// get paired (doc_ind, score) vector with similarity measure
vector< pair<int, double> > pair_docind_score;
get_similar_by_cosine_similarity(pair_docind_score, mat_tfidf_train, mat_tfidf_test, D, W);
// output to STDOUT in one line
int numTopSimilar = (pair_docind_score.size() < SIMILAR_SIZE ? pair_docind_score.size() : SIMILAR_SIZE);
for (int p=0; p<numTopSimilar; p++)
{
cout << pair_docind_score[p].first+1 << ":" << pair_docind_score[p].second;
if (p < numTopSimilar-1) cout << " ";
}
cout << endl;
cerr << "testdoc: " << counter << " time spent: " << float(clock() - t_b4_test)/CLOCKS_PER_SEC << " secs" << endl;
}
}
}
} catch (ArgException &e) // catch any exceptions
{ cerr << "error: " << e.error() << " for arg " << e.argId() << endl; }
}