Skip to content

Commit

Permalink
Swivel: fastprep: use mmap()-ed IO for vocabulary parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev committed Mar 3, 2017
1 parent 89bccc6 commit 06ceb09
Showing 1 changed file with 71 additions and 14 deletions.
85 changes: 71 additions & 14 deletions swivel/fastprep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,36 @@ bool NextWord(std::ifstream &fin, std::string* word) {
return false;
}

// Scans the byte range [ptr, end) for the next whitespace-delimited token.
// On success the token is stored into *word; if only whitespace remains,
// *word is left untouched.
//
// Returns the number of bytes consumed from `ptr` (0 means `ptr == end`
// immediately, i.e. nothing left to scan). When trailing whitespace other
// than a terminating '\n' is present, one byte fewer is reported so the
// caller re-inspects the final whitespace character — presumably used by
// the caller to detect words straddling mmap page boundaries (TODO confirm
// against the consuming loop).
int NextWordPtr(const char *end, const char *ptr, std::string* word) {
  auto start = ptr;

  // Skip leading whitespace. Cast to unsigned char: passing a negative
  // char (possible for non-ASCII bytes) to std::isspace is undefined
  // behavior.
  for (; ptr != end && std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  if (ptr == end) {
    return ptr - start;
  }
  auto word_start = ptr;

  // Read the next word.
  for (; ptr != end && !std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  word->assign(word_start, ptr);

  // Test ptr == end BEFORE dereferencing: the original checked
  // `*ptr == '\n'` first, reading one byte past the buffer (past the
  // mmap()-ed region) whenever the word ends exactly at `end`.
  if (ptr == end || *ptr == '\n') {
    return ptr - start;
  }

  // Skip trailing whitespace.
  for (; ptr != end && std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  if (ptr == end) {
    return ptr - start;
  }

  // Back up one byte so the last whitespace character is seen again on the
  // next call (see the function comment).
  return ptr - start - 1;
}

// Creates a vocabulary from the most frequent terms in the input file.
std::vector<std::string> CreateVocabulary(const std::string input_filename,
const int shard_size,
Expand All @@ -146,30 +176,57 @@ std::vector<std::string> CreateVocabulary(const std::string input_filename,
// consume all memory and should be re-written to periodically trim the data.)
std::unordered_map<std::string, long long> counts;

std::ifstream fin(input_filename, std::ifstream::ate);

if (!fin) {
auto fin = open(input_filename.c_str(), O_RDONLY);
if (fin < 0) {
std::cerr << "couldn't read input file '" << input_filename << "'"
<< std::endl;

return vocab;
}

const auto input_size = fin.tellg();
fin.seekg(0);
const auto input_size = lseek(fin, 0, SEEK_END);
std::remove_const<decltype(input_size)>::type offset = 0;
lseek(fin, 0, SEEK_SET);

long long ntokens = 0;
while (!fin.eof()) {
const decltype(input_size) page_size = sysconf(_SC_PAGESIZE) * 8;

std::string prev_word;
while (offset < input_size) {
auto data = reinterpret_cast<char *>(
mmap(nullptr, page_size, PROT_READ, MAP_PRIVATE, fin, offset));
assert(data != MAP_FAILED);
auto read_size = std::min(page_size, input_size - offset);
int data_offset = 0;
std::string word;
NextWord(fin, &word);
counts[word] += 1;

if (++ntokens % 1000000 == 0) {
const float pct = 100.0 * static_cast<float>(fin.tellg()) / input_size;
fprintf(stdout, "\rComputing vocabulary: %0.1f%% complete...", pct);
std::flush(std::cout);
while (int eaten = NextWordPtr(data + read_size,
data + data_offset,
&word)) {
data_offset += eaten;
if (!prev_word.empty()) {
if (std::isspace(data[0])) {
counts[prev_word] += 1;
ntokens++;
} else {
word = prev_word + word;
}
prev_word.erase();
}
if (data_offset == read_size && !std::isspace(data[page_size - 1]) &&
offset + page_size < input_size) {
prev_word = word;
} else {
counts[word] += 1;
if (++ntokens % 1000000 == 0) {
const float pct = (100.0 * offset) / input_size;
fprintf(stdout, "\rComputing vocabulary: %0.1f%% complete...", pct);
std::flush(std::cout);
}
}
}
munmap(data, page_size);
offset += page_size;
}
close(fin);

std::cout << counts.size() << " distinct tokens" << std::endl;

Expand Down

0 comments on commit 06ceb09

Please sign in to comment.