Skip to content

Commit

Permalink
Swivel: fastprep: use mmap()-ed IO for vocabulary parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev committed Mar 3, 2017
1 parent 89bccc6 commit 06ceb09
Showing 1 changed file with 71 additions and 14 deletions.
85 changes: 71 additions & 14 deletions swivel/fastprep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,36 @@ bool NextWord(std::ifstream &fin, std::string* word) {
return false;
}

// Scans the byte range [ptr, end) for the next whitespace-delimited token.
// On success the token is stored into *word; if only whitespace remains,
// *word is left untouched.
//
// Returns the number of bytes consumed from `ptr` (0 means `ptr == end`
// immediately, i.e. nothing left to scan). When trailing whitespace other
// than a terminating '\n' is present, one byte fewer is reported so the
// caller re-inspects the final whitespace character — presumably used by
// the caller to detect words straddling mmap page boundaries (TODO confirm
// against the consuming loop).
int NextWordPtr(const char *end, const char *ptr, std::string* word) {
  auto start = ptr;

  // Skip leading whitespace. Cast to unsigned char: passing a negative
  // char (possible for non-ASCII bytes) to std::isspace is undefined
  // behavior.
  for (; ptr != end && std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  if (ptr == end) {
    return ptr - start;
  }
  auto word_start = ptr;

  // Read the next word.
  for (; ptr != end && !std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  word->assign(word_start, ptr);

  // Test ptr == end BEFORE dereferencing: the original checked
  // `*ptr == '\n'` first, reading one byte past the buffer (past the
  // mmap()-ed region) whenever the word ends exactly at `end`.
  if (ptr == end || *ptr == '\n') {
    return ptr - start;
  }

  // Skip trailing whitespace.
  for (; ptr != end && std::isspace(static_cast<unsigned char>(*ptr)); ptr++) {}

  if (ptr == end) {
    return ptr - start;
  }

  // Back up one byte so the last whitespace character is seen again on the
  // next call (see the function comment).
  return ptr - start - 1;
}

// Creates a vocabulary from the most frequent terms in the input file.
std::vector<std::string> CreateVocabulary(const std::string input_filename,
const int shard_size,
Expand All @@ -146,30 +176,57 @@ std::vector<std::string> CreateVocabulary(const std::string input_filename,
// consume all memory and should be re-written to periodically trim the data.)
std::unordered_map<std::string, long long> counts;

std::ifstream fin(input_filename, std::ifstream::ate);

if (!fin) {
auto fin = open(input_filename.c_str(), O_RDONLY);
if (fin < 0) {
std::cerr << "couldn't read input file '" << input_filename << "'"
<< std::endl;

return vocab;
}

const auto input_size = fin.tellg();
fin.seekg(0);
const auto input_size = lseek(fin, 0, SEEK_END);
std::remove_const<decltype(input_size)>::type offset = 0;
lseek(fin, 0, SEEK_SET);

long long ntokens = 0;
while (!fin.eof()) {
const decltype(input_size) page_size = sysconf(_SC_PAGESIZE) * 8;

std::string prev_word;
while (offset < input_size) {
auto data = reinterpret_cast<char *>(
mmap(nullptr, page_size, PROT_READ, MAP_PRIVATE, fin, offset));
assert(data != MAP_FAILED);
auto read_size = std::min(page_size, input_size - offset);
int data_offset = 0;
std::string word;
NextWord(fin, &word);
counts[word] += 1;

if (++ntokens % 1000000 == 0) {
const float pct = 100.0 * static_cast<float>(fin.tellg()) / input_size;
fprintf(stdout, "\rComputing vocabulary: %0.1f%% complete...", pct);
std::flush(std::cout);
while (int eaten = NextWordPtr(data + read_size,
data + data_offset,
&word)) {
data_offset += eaten;
if (!prev_word.empty()) {
if (std::isspace(data[0])) {
counts[prev_word] += 1;
ntokens++;
} else {
word = prev_word + word;
}
prev_word.erase();
}
if (data_offset == read_size && !std::isspace(data[page_size - 1]) &&
offset + page_size < input_size) {
prev_word = word;
} else {
counts[word] += 1;
if (++ntokens % 1000000 == 0) {
const float pct = (100.0 * offset) / input_size;
fprintf(stdout, "\rComputing vocabulary: %0.1f%% complete...", pct);
std::flush(std::cout);
}
}
}
munmap(data, page_size);
offset += page_size;
}
close(fin);

std::cout << counts.size() << " distinct tokens" << std::endl;

Expand Down

0 comments on commit 06ceb09

Please sign in to comment.