-
Notifications
You must be signed in to change notification settings - Fork 286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add fwf_cols function #616
Merged
Merged
Changes from all commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
59a1422
Add fwf_cols function
jrnold 80586ec
fix example for fwf_cols
jrnold e5cc862
Fix failed Travis build
jrnold 2b74406
Add fwf_cols function
jrnold f954bc7
fix example for fwf_cols
jrnold f9f2f4d
Fix failed Travis build
jrnold 03172a7
Add fwf_cols function
jrnold 7a35bac
Updates to fwf_* column position functions
jrnold 0d48bb2
fix failing tests
jrnold 980371b
misc
jrnold 08495c2
merge
jrnold 8b942d7
respond to hadley's comments
jrnold 24def9d
respond to hadley's comments
jrnold 8a7ddff
respond to hadley's comments
jrnold 6370e0c
respond the hadley's comments
jrnold 0b29615
respond to hadley's comments
jrnold c2cce8d
respond to hadley's comments
jrnold 8610b4b
Fix indenting issues
jrnold 6640c3d
fix tests
jrnold 0651081
respond to hadley's comments
jrnold 9c912a3
convert numeric constants in fwf functions
jrnold c22745c
add bullet point to NEWS.md
jrnold bfe64d7
Merge 'tidyverse/master' into fwf_cols
jrnold e7a5b62
fix merge error
jrnold File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
|
||
|
||
#' Read a fixed width file into a tibble | ||
#' | ||
#' A fixed width file can be a very compact representation of numeric data. | ||
|
@@ -20,20 +22,24 @@ | |
#' fwf_sample <- readr_example("fwf-sample.txt") | ||
#' cat(read_lines(fwf_sample)) | ||
#' | ||
#' # You can specify column positions in three ways: | ||
#' # You can specify column positions in several ways: | ||
#' # 1. Guess based on position of empty columns | ||
#' read_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) | ||
#' # 2. A vector of field widths | ||
#' read_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) | ||
#' # 3. Paired vectors of start and end positions | ||
#' read_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) | ||
#' # 4. Named arguments with start and end positions | ||
#' read_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) | ||
#' # 5. Named arguments with column widths | ||
#' read_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) | ||
read_fwf <- function(file, col_positions, col_types = NULL, | ||
locale = default_locale(), na = c("", "NA"), | ||
comment = "", skip = 0, n_max = Inf, | ||
guess_max = min(n_max, 1000), progress = show_progress()) { | ||
ds <- datasource(file, skip = skip) | ||
if (inherits(ds, "source_file") && empty_file(file)) { | ||
return(tibble::data_frame()) | ||
return(tibble::tibble()) | ||
} | ||
|
||
tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment) | ||
|
@@ -54,7 +60,8 @@ read_fwf <- function(file, col_positions, col_types = NULL, | |
} | ||
|
||
out <- read_tokens(ds, tokenizer, spec$cols, names(spec$cols), | ||
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, progress = progress) | ||
locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, | ||
progress = progress) | ||
|
||
out <- name_problems(out, names(spec$cols), source_name(file)) | ||
attr(out, "spec") <- spec | ||
|
@@ -71,13 +78,8 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L) | |
out <- whitespaceColumns(ds, comment = comment, n = n) | ||
out$end[length(out$end)] <- NA | ||
|
||
if (is.null(col_names)) { | ||
col_names <- paste0("X", seq_along(out$begin)) | ||
} else { | ||
stopifnot(length(out$begin) == length(col_names)) | ||
} | ||
col_names <- fwf_col_names(col_names, length(out$begin)) | ||
out$col_names <- col_names | ||
|
||
out | ||
} | ||
|
||
|
@@ -87,28 +89,53 @@ fwf_empty <- function(file, skip = 0, col_names = NULL, comment = "", n = 100L) | |
#' reading a ragged fwf file. | ||
#' @param col_names Either NULL, or a character vector of column names. | ||
fwf_widths <- function(widths, col_names = NULL) { | ||
pos <- cumsum(c(1, abs(widths))) | ||
|
||
fwf_positions(pos[-length(pos)], pos[-1] - 1, col_names) | ||
pos <- cumsum(c(1L, abs(widths))) | ||
fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names) | ||
} | ||
|
||
#' @rdname read_fwf | ||
#' @export | ||
#' @param start,end Starting and ending (inclusive) positions of each field. | ||
#' Use NA as last end field when reading a ragged fwf file. | ||
fwf_positions <- function(start, end, col_names = NULL) { | ||
fwf_positions <- function(start, end = NULL, col_names = NULL) { | ||
|
||
stopifnot(length(start) == length(end)) | ||
col_names <- fwf_col_names(col_names, length(start)) | ||
|
||
if (is.null(col_names)) { | ||
col_names <- paste0("X", seq_along(start)) | ||
} else { | ||
stopifnot(length(start) == length(col_names)) | ||
} | ||
|
||
list( | ||
begin = start - 1, | ||
tibble( | ||
begin = start - 1L, | ||
end = end, # -1 to change to 0 offset, +1 to be exclusive, | ||
col_names = col_names | ||
) | ||
} | ||
|
||
|
||
#' @rdname read_fwf | ||
#' @export | ||
#' @param ... If the first element is a data frame, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels too flexible to me. But if you really think it's a good idea to keep it, the function signature should be |
||
#' then it must have all numeric columns and either one or two rows. | ||
#' The column names are the variable names, and the column values are the | ||
#' variable widths if there is one row, or the variable start and end | ||
#' positions if there are two rows. | ||
#' Otherwise, the elements of `...` are used to construct a data frame | ||
#' with one or two rows as above. | ||
fwf_cols <- function(...) {
  # Build fixed-width column positions from the arguments in `...`.
  #
  # Each argument is either a single width or a (start, end) pair; argument
  # names become column names, with unnamed arguments filled in by
  # fwf_col_names(). All arguments must use the same form. The result is
  # whatever fwf_widths() (one value each) or fwf_positions() (two values
  # each) returns. Any other shape, including no arguments at all, is an
  # error.
  x <- lapply(list(...), as.integer)
  names(x) <- fwf_col_names(names(x), length(x))

  # Validate lengths on the raw list: converting to a tibble first can
  # recycle a length-one width to match two-element arguments, which would
  # silently turn `b = 5` into start = 5, end = 5.
  n_values <- unique(lengths(x))
  if (length(n_values) != 1L || !n_values %in% c(1L, 2L)) {
    stop(
      "All variables must have either one (width) or two (start, end) values.",
      call. = FALSE
    )
  }

  # Extract the i-th value of every argument as a plain, unnamed integer
  # vector.
  nth <- function(i) {
    vapply(x, function(v) v[[i]], integer(1), USE.NAMES = FALSE)
  }

  if (n_values == 2L) {
    fwf_positions(nth(1L), nth(2L), names(x))
  } else {
    fwf_widths(nth(1L), names(x))
  }
}
|
||
fwf_col_names <- function(nm, n) {
  # Fill in default column names.
  #
  # nm: NULL, or a character vector of length `n`; entries that are "" or NA
  #     count as missing.
  # n:  the number of columns.
  #
  # Returns a character vector of length `n` in which every missing entry
  # has been replaced by "X<position>". Stops if `nm` is supplied with a
  # length other than `n`, so a short `col_names` cannot be recycled later.
  if (is.null(nm)) {
    nm <- rep("", n)
  }
  if (length(nm) != n) {
    stop(
      "`col_names` must have one name per column (expected ", n,
      ", got ", length(nm), ").",
      call. = FALSE
    )
  }

  # NA must be caught explicitly: `NA == ""` is NA, and an NA logical
  # subscript is not allowed in the assignment below.
  nm_empty <- is.na(nm) | nm == ""
  nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty]
  nm
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you include the width form here too?