From b2c52c728439448ef63c0568a9aad245c173126e Mon Sep 17 00:00:00 2001
From: Jim Hester
Date: Fri, 7 May 2021 09:09:26 -0400
Subject: [PATCH] Add a trim_ws argument to read_log

Fixes #738
---
 NEWS.md                        |  2 ++
 R/read_log.R                   |  3 ++-
 R/tokenizer.R                  |  4 ++--
 man/Tokenizers.Rd              |  2 +-
 man/read_log.Rd                |  4 ++++
 src/Tokenizer.cpp              |  3 ++-
 src/TokenizerLog.h             | 16 +++++++++++-----
 tests/testthat/test-read_log.R | 12 ++++++++++++
 8 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 tests/testthat/test-read_log.R

diff --git a/NEWS.md b/NEWS.md
index aedca4e6..d9047a6b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -30,6 +30,8 @@

 ## Additional features and fixes

+* `read_log()` gains a `trim_ws` argument (#738)
+
 * `read_rds()` can now read .Rds files from URLs (#1186)

 * `read_*()` functions gain a `show_col_types` argument, if set to `FALSE` this turns off showing the column types unconditionally.
diff --git a/R/read_log.R b/R/read_log.R
index 9f1090d9..691f5506 100644
--- a/R/read_log.R
+++ b/R/read_log.R
@@ -9,8 +9,9 @@
 #' @examples
 #' read_log(readr_example("example.log"))
 read_log <- function(file, col_names = FALSE, col_types = NULL,
+                     trim_ws = TRUE,
                      skip = 0, n_max = Inf, progress = show_progress()) {
-  tokenizer <- tokenizer_log()
+  tokenizer <- tokenizer_log(trim_ws = trim_ws)
   read_delimited(file, tokenizer,
     col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress
diff --git a/R/tokenizer.R b/R/tokenizer.R
index 527dac93..6731e048 100644
--- a/R/tokenizer.R
+++ b/R/tokenizer.R
@@ -122,8 +122,8 @@ tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) {

 #' @export
 #' @rdname Tokenizers
-tokenizer_log <- function() {
-  structure(list(), class = "tokenizer_log")
+tokenizer_log <- function(trim_ws) {
+  structure(list(trim_ws = trim_ws), class = "tokenizer_log")
 }


diff --git a/man/Tokenizers.Rd b/man/Tokenizers.Rd
index 222b70bd..9e17ee42 100644
--- a/man/Tokenizers.Rd
+++ b/man/Tokenizers.Rd
@@ -43,7 +43,7 @@ tokenizer_tsv(

 tokenizer_line(na = character(), skip_empty_rows = TRUE)

-tokenizer_log()
+tokenizer_log(trim_ws)

 tokenizer_fwf(
   begin,
diff --git a/man/read_log.Rd b/man/read_log.Rd
index fe390800..7bf7c88c 100644
--- a/man/read_log.Rd
+++ b/man/read_log.Rd
@@ -8,6 +8,7 @@ read_log(
   file,
   col_names = FALSE,
   col_types = NULL,
+  trim_ws = TRUE,
   skip = 0,
   n_max = Inf,
   progress = show_progress()
@@ -76,6 +77,9 @@
 use \code{col_types = list()}, set \code{show_col_types = FALSE} or set
 `options(readr.show_col_types = FALSE)`}}

+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
+
 \item{skip}{Number of lines to skip before reading data.}

 \item{n_max}{Maximum number of records to read.}
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
index 4dc6c5da..7361e341 100644
--- a/src/Tokenizer.cpp
+++ b/src/Tokenizer.cpp
@@ -51,7 +51,8 @@ TokenizerPtr Tokenizer::create(cpp11::list spec) {
     bool skipEmptyRows = cpp11::as_cpp<bool>(spec["skip_empty_rows"]);
     return TokenizerPtr(new TokenizerLine(na, skipEmptyRows));
   } else if (subclass == "tokenizer_log") {
-    return TokenizerPtr(new TokenizerLog());
+    bool trimWs = cpp11::as_cpp<bool>(spec["trim_ws"]);
+    return TokenizerPtr(new TokenizerLog(trimWs));
   } else if (subclass == "tokenizer_ws") {
     std::vector<std::string> na =
         cpp11::as_cpp<std::vector<std::string>>(spec["na"]);
diff --git a/src/TokenizerLog.h b/src/TokenizerLog.h
index a6d503b5..e2a70438 100644
--- a/src/TokenizerLog.h
+++ b/src/TokenizerLog.h
@@ -21,9 +21,10 @@ class TokenizerLog : public Tokenizer {
   LogState state_;
   int row_, col_;
   bool moreTokens_;
+  bool trimWS_;

 public:
-  TokenizerLog() {}
+  TokenizerLog(bool trimWS) : trimWS_(trimWS) {}

   void tokenize(SourceIterator begin, SourceIterator end) {
     cur_ = begin;
@@ -63,8 +64,7 @@ class TokenizerLog : public Tokenizer {
         advanceForLF(&cur_, end_);
         return Token(TOKEN_EMPTY, row, col);
       } else if (*cur_ == ' ') {
-        newField();
-        return Token(TOKEN_EMPTY, row, col);
+        break;
       } else if (*cur_ == '"') {
         state_ = LOG_STRING;
       } else if (*cur_ == '[') {
@@ -165,8 +165,14 @@ class TokenizerLog : public Tokenizer {
   }

   Token fieldToken(SourceIterator begin, SourceIterator end, int row, int col) {
-    return Token(begin, end, row, col, false)
-        .flagNA(std::vector<std::string>(1, "-"));
+    Token t(begin, end, row, col, false);
+    if (trimWS_) {
+      t.trim();
+    }
+
+    t.flagNA(std::vector<std::string>(1, "-"));
+
+    return t;
   }
 };

diff --git a/tests/testthat/test-read_log.R b/tests/testthat/test-read_log.R
new file mode 100644
index 00000000..67968290
--- /dev/null
+++ b/tests/testthat/test-read_log.R
@@ -0,0 +1,12 @@
+test_that("read_log trims whitespace", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  writeLines('Nov  4 00:00:55 vrpweb1 httpd: 131.161.8.219 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1 HTTP/1.1" 200 10056 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36"
+Nov 14 00:00:55 vrpweb1 httpd: 131.161.8.216 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-content/plugins/wooslider-AxZp6o/assets/js/jquery.flexslider.min.js?ver=2.4.1-20170608 HTTP/1.1" 200 22414 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36"',
+    tf)
+
+  res <- read_log(tf)
+
+  expect_equal(res[[2]], c(4, 14))
+})
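
Usage sketch for reviewers (not part of the patch; the hostname, IPs, and request fields below are made up, but the expected result mirrors the new test):

    library(readr)

    tf <- tempfile(fileext = ".log")
    # syslog space-pads single-digit days, so the second field of the
    # first line is " 4" before trimming
    writeLines(c(
      'Nov  4 00:00:55 host1 httpd: 10.0.0.1 - - [04/Nov/2017:00:00:55 -0400] "GET /index.html HTTP/1.1" 200 512',
      'Nov 14 00:00:55 host1 httpd: 10.0.0.2 - - [14/Nov/2017:00:00:55 -0400] "GET /index.html HTTP/1.1" 200 512'
    ), tf)

    # trim_ws defaults to TRUE, so the padded day column parses as numeric
    read_log(tf)[[2]]
    #> [1]  4 14

    unlink(tf)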