Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fwf_cols function #616

Merged
merged 24 commits into from
Feb 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export(default_locale)
export(format_csv)
export(format_delim)
export(format_tsv)
export(fwf_cols)
export(fwf_empty)
export(fwf_positions)
export(fwf_widths)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
* `write_*()` functions witting whole number doubles are no longer written with a trailing `.0` (#526).

### Whitespace / fixed width improvements

* `fwf_cols()` allows for specifying the `col_positions` argument of
`read_fwf()` with named arguments of either column positions or widths
(#616, @jrnold).
* `fwf_empty()` gains an `n` argument to control how many lines are read for whitespace to determine column structure (#518, @Yeedle).
* `read_table()` can now handle `pipe()` connections (#552).
* `read_table()` can now handle files with many lines of leading comments (#563).
Expand Down
69 changes: 48 additions & 21 deletions R/read_fwf.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


#' Read a fixed width file into a tibble
#'
#' A fixed width file can be a very compact representation of numeric data.
Expand All @@ -20,20 +22,24 @@
#' fwf_sample <- readr_example("fwf-sample.txt")
#' cat(read_lines(fwf_sample))
#'
#' # You can specify column positions in three ways:
#' # You can specify column positions in several ways:
#' # 1. Guess based on position of empty columns
#' read_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn")))
#' # 2. A vector of field widths
#' read_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn")))
#' # 3. Paired vectors of start and end positions
#' read_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn")))
#' # 4. Named arguments with start and end positions
#' read_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you include the width form here too?

#' # 5. Named arguments with column widths
#' read_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12))
read_fwf <- function(file, col_positions, col_types = NULL,
locale = default_locale(), na = c("", "NA"),
comment = "", skip = 0, n_max = Inf,
guess_max = min(n_max, 1000), progress = show_progress()) {
ds <- datasource(file, skip = skip)
if (inherits(ds, "source_file") && empty_file(file)) {
return(tibble::data_frame())
return(tibble::tibble())
}

tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment)
Expand All @@ -54,7 +60,8 @@ read_fwf <- function(file, col_positions, col_types = NULL,
}

out <- read_tokens(ds, tokenizer, spec$cols, names(spec$cols),
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, progress = progress)
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max,
progress = progress)

out <- name_problems(out, names(spec$cols), source_name(file))
attr(out, "spec") <- spec
Expand All @@ -71,13 +78,8 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L)
out <- whitespaceColumns(ds, comment = comment, n = n)
out$end[length(out$end)] <- NA

if (is.null(col_names)) {
col_names <- paste0("X", seq_along(out$begin))
} else {
stopifnot(length(out$begin) == length(col_names))
}
col_names <- fwf_col_names(col_names, length(out$begin))
out$col_names <- col_names

out
}

Expand All @@ -87,28 +89,53 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L)
#' reading a ragged fwf file.
#' @param col_names Either NULL, or a character vector column names.
fwf_widths <- function(widths, col_names = NULL) {
pos <- cumsum(c(1, abs(widths)))

fwf_positions(pos[-length(pos)], pos[-1] - 1, col_names)
pos <- cumsum(c(1L, abs(widths)))
fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names)
}

#' @rdname read_fwf
#' @export
#' @param start,end Starting and ending (inclusive) positions of each field.
#' Use NA as last end field when reading a ragged fwf file.
fwf_positions <- function(start, end, col_names = NULL) {
fwf_positions <- function(start, end = NULL, col_names = NULL) {

stopifnot(length(start) == length(end))
col_names <- fwf_col_names(col_names, length(start))

if (is.null(col_names)) {
col_names <- paste0("X", seq_along(start))
} else {
stopifnot(length(start) == length(col_names))
}

list(
begin = start - 1,
tibble(
begin = start - 1L,
end = end, # -1 to change to 0 offset, +1 to be exclusive,
col_names = col_names
)
}


#' @rdname read_fwf
#' @export
#' @param ... If the first element is a data frame,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels too flexible to me. But if you really think it's a good idea to keep it, the function signature should be x, ...

#' then it must have all numeric columns and either one or two rows.
#' The column names are the variable names, and the column values are the
#' variable widths if a length one vector, and variable start and end
#' positions.
#' Otherwise, the elements of `...` are used to construct a data frame
#' with or or two rows as above.
fwf_cols <- function(...) {
x <- lapply(list(...), as.integer)
names(x) <- fwf_col_names(names(x), length(x))
x <- tibble::as_tibble(x)
if (nrow(x) == 2) {
fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x))
} else if (nrow(x) == 1) {
fwf_widths(as.integer(x[1, ]), names(x))
} else {
stop("All variables must have either one (width) two (start, end) values.",
call. = FALSE)
}
}

fwf_col_names <- function(nm, n) {
nm <- nm %||% rep("", n)
nm_empty <- (nm == "")
nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty]
nm
}
19 changes: 17 additions & 2 deletions man/read_fwf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions tests/testthat/test-read-fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,35 @@ test_that("error on empty spec (#511, #519)", {
expect_error(read_fwf(txt, pos), "Zero-length.*specifications not supported")
})

# fwf_cols
test_that("fwf_cols produces correct fwf_positions object with elements of length 2", {
expected <- fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "b", "d"))
expect_equivalent(fwf_cols(a = c(1, 2), b = c(9, 12), d = c(4, 6)), expected)
})

test_that("fwf_cols produces correct fwf_positions object with elements of length 1", {
expected <- fwf_widths(c(2L, 4L, 3L), c("a", "b", "c"))
expect_equivalent(fwf_cols(a = 2, b = 4, c = 3), expected)
})


test_that("fwf_cols throws error when arguments are not length 1 or 2", {
pattern <- "Variables must be length 1 or .*"
expect_error(fwf_cols(a = 1:3, b = 4:5), pattern)
expect_error(fwf_cols(a = c(), b = 4:5), pattern)
})

test_that("fwf_cols works with unnamed columns", {
expect_equivalent(
fwf_cols(c(1, 2), c(9, 12), c(4, 6)),
fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("X1", "X2", "X3"))
)
expect_equivalent(
fwf_cols(a = c(1, 2), c(9, 12), c(4, 6)),
fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "X2", "X3"))
)
})

# read_table -------------------------------------------------------------------

test_that("read_table silently reads ragged last column", {
Expand Down