From b2c52c728439448ef63c0568a9aad245c173126e Mon Sep 17 00:00:00 2001
From: Jim Hester
Date: Fri, 7 May 2021 09:09:26 -0400
Subject: [PATCH] Add a trim_ws argument to read_log

Fixes #738
---
 NEWS.md                        |  2 ++
 R/read_log.R                   |  3 ++-
 R/tokenizer.R                  |  4 ++--
 man/Tokenizers.Rd              |  2 +-
 man/read_log.Rd                |  4 ++++
 src/Tokenizer.cpp              |  3 ++-
 src/TokenizerLog.h             | 16 +++++++++++-----
 tests/testthat/test-read_log.R | 12 ++++++++++++
 8 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 tests/testthat/test-read_log.R

diff --git a/NEWS.md b/NEWS.md
index aedca4e6..d9047a6b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -30,6 +30,8 @@

 ## Additional features and fixes

+* `read_log()` gains a `trim_ws` argument (#738)
+
 * `read_rds()` can now read .Rds files from URLs (#1186)

 * `read_*()` functions gain a `show_col_types` argument, if set to `FALSE` this turns off showing the column types unconditionally.
diff --git a/R/read_log.R b/R/read_log.R
index 9f1090d9..691f5506 100644
--- a/R/read_log.R
+++ b/R/read_log.R
@@ -9,8 +9,9 @@
 #' @examples
 #' read_log(readr_example("example.log"))
 read_log <- function(file, col_names = FALSE, col_types = NULL,
+                     trim_ws = TRUE,
                      skip = 0, n_max = Inf, progress = show_progress()) {
-  tokenizer <- tokenizer_log()
+  tokenizer <- tokenizer_log(trim_ws = trim_ws)
   read_delimited(file, tokenizer,
     col_names = col_names, col_types = col_types,
     skip = skip, n_max = n_max, progress = progress
diff --git a/R/tokenizer.R b/R/tokenizer.R
index 527dac93..6731e048 100644
--- a/R/tokenizer.R
+++ b/R/tokenizer.R
@@ -122,8 +122,8 @@ tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) {

 #' @export
 #' @rdname Tokenizers
-tokenizer_log <- function() {
-  structure(list(), class = "tokenizer_log")
+tokenizer_log <- function(trim_ws) {
+  structure(list(trim_ws = trim_ws), class = "tokenizer_log")
 }


diff --git a/man/Tokenizers.Rd b/man/Tokenizers.Rd
index 222b70bd..9e17ee42 100644
--- a/man/Tokenizers.Rd
+++ b/man/Tokenizers.Rd
@@ -43,7 +43,7 @@ tokenizer_tsv(

 tokenizer_line(na = character(), skip_empty_rows = TRUE)

-tokenizer_log()
+tokenizer_log(trim_ws)

 tokenizer_fwf(
   begin,
diff --git a/man/read_log.Rd b/man/read_log.Rd
index fe390800..7bf7c88c 100644
--- a/man/read_log.Rd
+++ b/man/read_log.Rd
@@ -8,6 +8,7 @@ read_log(
   file,
   col_names = FALSE,
   col_types = NULL,
+  trim_ws = TRUE,
   skip = 0,
   n_max = Inf,
   progress = show_progress()
@@ -76,6 +77,9 @@
 use \code{col_types = list()}, set \code{show_col_types = FALSE} or set
 `options(readr.show_col_types = FALSE)`}}

+\item{trim_ws}{Should leading and trailing whitespace be trimmed from
+each field before parsing it?}
+
 \item{skip}{Number of lines to skip before reading data.}

 \item{n_max}{Maximum number of records to read.}
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
index 4dc6c5da..7361e341 100644
--- a/src/Tokenizer.cpp
+++ b/src/Tokenizer.cpp
@@ -51,7 +51,8 @@ TokenizerPtr Tokenizer::create(cpp11::list spec) {
     bool skipEmptyRows = cpp11::as_cpp<bool>(spec["skip_empty_rows"]);
     return TokenizerPtr(new TokenizerLine(na, skipEmptyRows));
   } else if (subclass == "tokenizer_log") {
-    return TokenizerPtr(new TokenizerLog());
+    bool trimWs = cpp11::as_cpp<bool>(spec["trim_ws"]);
+    return TokenizerPtr(new TokenizerLog(trimWs));
   } else if (subclass == "tokenizer_ws") {
     std::vector<std::string> na =
         cpp11::as_cpp<std::vector<std::string>>(spec["na"]);
diff --git a/src/TokenizerLog.h b/src/TokenizerLog.h
index a6d503b5..e2a70438 100644
--- a/src/TokenizerLog.h
+++ b/src/TokenizerLog.h
@@ -21,9 +21,10 @@ class TokenizerLog : public Tokenizer {
   LogState state_;
   int row_, col_;
   bool moreTokens_;
+  bool trimWS_;

 public:
-  TokenizerLog() {}
+  TokenizerLog(bool trimWS) : trimWS_(trimWS) {}

   void tokenize(SourceIterator begin, SourceIterator end) {
     cur_ = begin;
@@ -63,8 +64,7 @@ class TokenizerLog : public Tokenizer {
         advanceForLF(&cur_, end_);
         return Token(TOKEN_EMPTY, row, col);
       } else if (*cur_ == ' ') {
-        newField();
-        return Token(TOKEN_EMPTY, row, col);
+        break;
       } else if (*cur_ == '"') {
         state_ = LOG_STRING;
       } else if (*cur_ == '[') {
@@ -165,8 +165,14 @@ class TokenizerLog : public Tokenizer {
   }

   Token fieldToken(SourceIterator begin, SourceIterator end, int row, int col) {
-    return Token(begin, end, row, col, false)
-        .flagNA(std::vector<std::string>(1, "-"));
+    Token t(begin, end, row, col, false);
+    if (trimWS_) {
+      t.trim();
+    }
+
+    t.flagNA(std::vector<std::string>(1, "-"));
+
+    return t;
   }
 };

diff --git a/tests/testthat/test-read_log.R b/tests/testthat/test-read_log.R
new file mode 100644
index 00000000..67968290
--- /dev/null
+++ b/tests/testthat/test-read_log.R
@@ -0,0 +1,12 @@
+test_that("read_log trims whitespace", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  writeLines('Nov  4 00:00:55 vrpweb1 httpd: 131.161.8.219 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1 HTTP/1.1" 200 10056 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36"
+Nov 14 00:00:55 vrpweb1 httpd: 131.161.8.216 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-content/plugins/wooslider-AxZp6o/assets/js/jquery.flexslider.min.js?ver=2.4.1-20170608 HTTP/1.1" 200 22414 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36"',
+    tf)
+
+  res <- read_log(tf)
+
+  expect_equal(res[[2]], c(4, 14))
+})
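
Usage sketch for reviewers (not part of the patch; the hostname, IPs, and request fields below are made up, but the expected result mirrors the new test):

    library(readr)

    tf <- tempfile(fileext = ".log")
    # syslog space-pads single-digit days, so the second field of the
    # first line is " 4" before trimming
    writeLines(c(
      'Nov  4 00:00:55 host1 httpd: 10.0.0.1 - - [04/Nov/2017:00:00:55 -0400] "GET /index.html HTTP/1.1" 200 512',
      'Nov 14 00:00:55 host1 httpd: 10.0.0.2 - - [14/Nov/2017:00:00:55 -0400] "GET /index.html HTTP/1.1" 200 512'
    ), tf)

    # trim_ws defaults to TRUE, so the padded day column parses as numeric
    read_log(tf)[[2]]
    #> [1]  4 14

    unlink(tf)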