diff --git a/R/basics.r b/R/basics.r deleted file mode 100644 index 12d2cd49..00000000 --- a/R/basics.r +++ /dev/null @@ -1,144 +0,0 @@ -#' The length of a string (in characters). -#' -#' @param string input character vector -#' @return numeric vector giving number of characters in each element of the -#' character vector. Missing string have missing length. -#' @keywords character -#' @seealso \code{\link{nchar}} which this function wraps -#' @examples -#' str_length(letters) -#' str_length(c("i", "like", "programming", NA)) -str_length <- function(string) { - string <- as.character(string) - nc <- nchar(string) - is.na(nc) <- is.na(string) - nc -} - -#' Join multiple strings into a single string. -#' -#' @param ... one or more character vectors. Zero length arguments -#' are removed -#' @param sep string to insert between input vectors -#' @param collapse optional string used to combine input vectors into single -#' string -#' @return If \code{collapse = NULL} (the default) a character vector with -#' length equal to the longest input string. If \code{collapse} is non- -#' NULL, a character vector of length 1. -#' @keywords character -#' @seealso \code{\link{paste}} which this function wraps -#' @examples -#' str_join("Letter: ", letters) -#' str_join("Letter", letters, sep = ": ") -#' str_join(letters, " is for", "...") -#' str_join(letters[-26], " comes before ", letters[-1]) -#' -#' str_join(letters, collapse = "") -#' str_join(letters, collapse = ", ") -str_join <- function(..., sep = "", collapse = NULL) { - strings <- Filter(function(x) length(x) > 0, list(...)) - - do.call("paste", c(strings, list(sep = sep, collapse = collapse))) -} - -#' Trim whitespace from start and end of string. -#' -#' @param string input character vector -#' @return character vector with leading and trailing whitespace removed -#' @keywords character -#' @examples -#' str_trim(" String with trailing and leading white space\t") -#' str_trim("\n\nString with trailing and leading white space\n\n") -str_trim <- function(string) { - str_replace(string, "^\\s+|\\s+$", "") -} - -#' Extract substrings from a character vector. -#' -#' \code{sub_str} will replicate all arguments to be the same length as the -#' longest argument. If any arguments are of length 0, the output will be -#' a zero length character vector. -#' -#' @param string input character vector -#' @param start integer vector giving position of first charater in substring, -#' defaults to first character. -#' @param end integer vector giving position of last character in substring, -#' defaults to last character. -#' @return character vector of substring. Will be length of longest input -#' argument -#' @keywords character -#' @seealso \code{\link{substring}} which this function wraps -#' @examples -#' hw <- "Hadley Wickham" -#' -#' str_sub(hw, 1, 6) -#' str_sub(hw, end = 6) -#' str_sub(hw, 8, 14) -#' str_sub(hw, 8) -#' str_sub(hw, c(1, 8), c(6, 14)) -#' -#' str_sub(hw, seq_len(str_length(hw))) -#' str_sub(hw, end = seq_len(str_length(hw))) -str_sub <- function(string, start = 0, end = Inf) { - if (length(string) == 0 || length(start) == 0 || length(end) == 0) { - return(vector("character", 0)) - } - - n <- max(length(string), length(start), length(end)) - string <- rep(string, length = n) - start <- rep(start, length = n) - end <- rep(end, length = n) - - # Replace infinite ends with length of string - max_length <- !is.na(end) & end == Inf - end[max_length] <- str_length(string)[max_length] - - substring(string, start, end) -} - -#' Pad a string. -#' -#' @param string input character vector -#' @param width pad strings to this minimum width -#' @param side side on which padding character is added -#' @param pad padding character (default is a space) -#' @return character vector -#' @keywords character -#' @examples -#' rbind( -#' str_pad("hadley", 30, "left"), -#' str_pad("hadley", 30, "right"), -#' str_pad("hadley", 30, "center") -#' ) -#' # Longer strings are returned unchanged -#' str_pad("hadley", 3) -str_pad <- function(string, width, side = "left", pad = " ") { - stopifnot(length(width) == 1) - stopifnot(length(side) == 1) - stopifnot(length(pad) == 1) - - side <- match.arg(side, c("left", "right", "center")) - needed <- pmax(0, width - str_length(string)) - - left <- switch(side, - left = needed, right = 0, center = floor(needed / 2)) - right <- switch(side, - left = 0, right = needed, center = ceiling(needed / 2)) - - str_join(str_dup(pad, left), string, str_dup(pad, right)) -} - -#' Duplicate strings within a character vector. -#' -#' @param string input character vector -#' @param times number of times to duplicate each string -#' @return character vector -#' @keywords internal -str_dup <- function(string, times) { - # rep_matrix <- matrix(rep(string, times = times), nrow = times) - strings <- mlply(cbind(x = string, times), rep.int) - output <- unlist(llply(strings, str_join, collapse = "")) - - names(output) <- names(string) - output -} \ No newline at end of file diff --git a/R/detect.r b/R/detect.r new file mode 100644 index 00000000..0be8db3b --- /dev/null +++ b/R/detect.r @@ -0,0 +1,22 @@ +#' Detect the presence or absence of a pattern in a string. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @return boolean vector +#' @seealso \code{\link{grepl}} which this function wraps +#' @keywords character +#' @examples +#' fruit <- c("apple", "banana", "pear", "pinapple") +#' str_detect(fruit, "a") +#' str_detect(fruit, "^a") +#' str_detect(fruit, "a$") +#' str_detect(fruit, "b") +#' str_detect(fruit, "[aeiou]") +str_detect <- function(string, pattern) { + results <- grepl(pattern, string) + is.na(results) <- is.na(string) + + results +} + diff --git a/R/dup.r b/R/dup.r new file mode 100644 index 00000000..196537d3 --- /dev/null +++ b/R/dup.r @@ -0,0 +1,14 @@ +#' Duplicate strings within a character vector. +#' +#' @param string input character vector +#' @param times number of times to duplicate each string +#' @return character vector +#' @keywords internal +str_dup <- function(string, times) { + # rep_matrix <- matrix(rep(string, times = times), nrow = times) + strings <- mlply(cbind(x = string, times), rep.int) + output <- unlist(llply(strings, str_join, collapse = "")) + + names(output) <- names(string) + output +} \ No newline at end of file diff --git a/R/extract.r b/R/extract.r new file mode 100644 index 00000000..1f96c5c2 --- /dev/null +++ b/R/extract.r @@ -0,0 +1,40 @@ +#' Extract first piece of a string that matches a pattern. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @return character vector. +#' @keywords character +#' @seealso \code{\link{str_extract_all}} to extract all matches +#' @examples +#' shopping_list <- c("apples x4", "flour", "sugar", "milk x2") +#' str_extract(shopping_list, "\\d") +#' str_extract(shopping_list, "[a-z]+") +#' str_extract(shopping_list, "[a-z]{1,4}") +#' str_extract(shopping_list, "\\b[a-z]{1,4}\\b") +str_extract <- function(string, pattern) { + positions <- str_locate(string, pattern) + str_sub(string, positions[, "start"], positions[, "end"]) +} + + +#' Extract all pieces of a string that match a pattern. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @return list of character vectors. +#' @keywords character +#' @seealso \code{\link{str_extract}} to extract the first match +#' @examples +#' shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2") +#' str_extract_all(shopping_list, "[a-z]+") +#' str_extract_all(shopping_list, "\\b[a-z]+\\b") +#' str_extract_all(shopping_list, "\\d") +str_extract_all <- function(string, pattern) { + positions <- str_locate_all(string, pattern) + llply(seq_along(string), function(i) { + position <- positions[[i]] + str_sub(string[i], position[, "start"], position[, "end"]) + }) +} diff --git a/R/join.r b/R/join.r new file mode 100644 index 00000000..fed2ab65 --- /dev/null +++ b/R/join.r @@ -0,0 +1,57 @@ +#' Join multiple strings into a single string. +#' +#' @param ... one or more character vectors. Zero length arguments +#' are removed +#' @param sep string to insert between input vectors +#' @param collapse optional string used to combine input vectors into single +#' string +#' @return If \code{collapse = NULL} (the default) a character vector with +#' length equal to the longest input string. If \code{collapse} is non- +#' NULL, a character vector of length 1. +#' @keywords character +#' @seealso \code{\link{paste}} which this function wraps +#' @examples +#' str_join("Letter: ", letters) +#' str_join("Letter", letters, sep = ": ") +#' str_join(letters, " is for", "...") +#' str_join(letters[-26], " comes before ", letters[-1]) +#' +#' str_join(letters, collapse = "") +#' str_join(letters, collapse = ", ") +str_join <- function(..., sep = "", collapse = NULL) { + strings <- Filter(function(x) length(x) > 0, list(...)) + + do.call("paste", c(strings, list(sep = sep, collapse = collapse))) +} + +#' Pad a string. +#' +#' @param string input character vector +#' @param width pad strings to this minimum width +#' @param side side on which padding character is added +#' @param pad padding character (default is a space) +#' @return character vector +#' @keywords character +#' @examples +#' rbind( +#' str_pad("hadley", 30, "left"), +#' str_pad("hadley", 30, "right"), +#' str_pad("hadley", 30, "center") +#' ) +#' # Longer strings are returned unchanged +#' str_pad("hadley", 3) +str_pad <- function(string, width, side = "left", pad = " ") { + stopifnot(length(width) == 1) + stopifnot(length(side) == 1) + stopifnot(length(pad) == 1) + + side <- match.arg(side, c("left", "right", "center")) + needed <- pmax(0, width - str_length(string)) + + left <- switch(side, + left = needed, right = 0, center = floor(needed / 2)) + right <- switch(side, + left = 0, right = needed, center = ceiling(needed / 2)) + + str_join(str_dup(pad, left), string, str_dup(pad, right)) +} \ No newline at end of file diff --git a/R/length.r b/R/length.r new file mode 100644 index 00000000..24903f2b --- /dev/null +++ b/R/length.r @@ -0,0 +1,16 @@ +#' The length of a string (in characters). +#' +#' @param string input character vector +#' @return numeric vector giving number of characters in each element of the +#' character vector. Missing string have missing length. +#' @keywords character +#' @seealso \code{\link{nchar}} which this function wraps +#' @examples +#' str_length(letters) +#' str_length(c("i", "like", "programming", NA)) +str_length <- function(string) { + string <- as.character(string) + nc <- nchar(string) + is.na(nc) <- is.na(string) + nc +} \ No newline at end of file diff --git a/R/locate.r b/R/locate.r new file mode 100644 index 00000000..0f806b5f --- /dev/null +++ b/R/locate.r @@ -0,0 +1,66 @@ +#' Locate the position of the first occurence of a pattern in a string. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @return numeric matrix. First column gives start postion of match, and +#' second column gives end position. +#' @keywords character +#' @seealso +#' \code{\link{regexpr}} which this function wraps +#' +#' \code{\link{str_extract}} for a convenient way of extracting matches +# +#' \code{\link{str_locate_all}} to locate position of all matches +#' +#' @examples +#' fruit <- c("apple", "banana", "pear", "pinapple") +#' str_locate(fruit, "a") +#' str_locate(fruit, "e") +str_locate <- function(string, pattern) { + match <- regexpr(pattern, string) + + start <- as.vector(match) + end <- start + attr(match, "match.length") - 1 + + missing <- start == -1 + start[missing] <- NA + end[missing] <- NA + + cbind(start = start, end = end) +} + +#' Locate the position of all occurences of a pattern in a string. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @keywords character +#' @return list of numeric matrices. First column gives start postion of +#' match, and second column gives end position. +#' @seealso +#' \code{\link{regexpr}} which this function wraps +#' +#' \code{\link{str_extract}} for a convenient way of extracting matches +#' +#' \code{\link{str_locate}} to locate position of first match +#' +#' @examples +#' fruit <- c("apple", "banana", "pear", "pinapple") +#' str_locate_all(fruit, "a") +#' str_locate_all(fruit, "e") +str_locate_all <- function(string, pattern) { + matches <- gregexpr(pattern, string) + + null <- matrix(0, nrow = 0, ncol = 2) + colnames(null) <- c("start", "end") + + llply(matches, function(match) { + if (length(match) == 1 && match == -1) return(null) + + start <- as.vector(match) + end <- start + attr(match, "match.length") - 1 + cbind(start = start, end = end) + }) +} + diff --git a/R/regexp.r b/R/regexp.r deleted file mode 100644 index 142a6eba..00000000 --- a/R/regexp.r +++ /dev/null @@ -1,172 +0,0 @@ -#' Detect the presence or absence of a pattern in a string. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @return boolean vector -#' @seealso \code{\link{grepl}} which this function wraps -#' @keywords character -#' @examples -#' fruit <- c("apple", "banana", "pear", "pinapple") -#' str_detect(fruit, "a") -#' str_detect(fruit, "^a") -#' str_detect(fruit, "a$") -#' str_detect(fruit, "b") -#' str_detect(fruit, "[aeiou]") -str_detect <- function(string, pattern) { - results <- grepl(pattern, string) - is.na(results) <- is.na(string) - - results -} - -#' Locate the position of the first occurence of a pattern in a string. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @return numeric matrix. First column gives start postion of match, and -#' second column gives end position. -#' @keywords character -#' @seealso -#' \code{\link{regexpr}} which this function wraps -#' -#' \code{\link{str_extract}} for a convenient way of extracting matches -# -#' \code{\link{str_locate_all}} to locate position of all matches -#' -#' @examples -#' fruit <- c("apple", "banana", "pear", "pinapple") -#' str_locate(fruit, "a") -#' str_locate(fruit, "e") -str_locate <- function(string, pattern) { - match <- regexpr(pattern, string) - - start <- as.vector(match) - end <- start + attr(match, "match.length") - 1 - - missing <- start == -1 - start[missing] <- NA - end[missing] <- NA - - cbind(start = start, end = end) -} - -#' Locate the position of all occurences of a pattern in a string. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @keywords character -#' @return list of numeric matrices. First column gives start postion of -#' match, and second column gives end position. -#' @seealso -#' \code{\link{regexpr}} which this function wraps -#' -#' \code{\link{str_extract}} for a convenient way of extracting matches -#' -#' \code{\link{str_locate}} to locate position of first match -#' -#' @examples -#' fruit <- c("apple", "banana", "pear", "pinapple") -#' str_locate_all(fruit, "a") -#' str_locate_all(fruit, "e") -str_locate_all <- function(string, pattern) { - matches <- gregexpr(pattern, string) - - null <- matrix(0, nrow = 0, ncol = 2) - colnames(null) <- c("start", "end") - - llply(matches, function(match) { - if (length(match) == 1 && match == -1) return(null) - - start <- as.vector(match) - end <- start + attr(match, "match.length") - 1 - cbind(start = start, end = end) - }) -} - -#' Extract first piece of a string that matches a pattern. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @return character vector. -#' @keywords character -#' @seealso \code{\link{str_extract_all}} to extract all matches -#' @examples -#' shopping_list <- c("apples x4", "flour", "sugar", "milk x2") -#' str_extract(shopping_list, "\\d") -#' str_extract(shopping_list, "[a-z]+") -#' str_extract(shopping_list, "[a-z]{1,4}") -#' str_extract(shopping_list, "\\b[a-z]{1,4}\\b") -str_extract <- function(string, pattern) { - positions <- str_locate(string, pattern) - str_sub(string, positions[, "start"], positions[, "end"]) -} - - -#' Extract all pieces of a string that match a pattern. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @return list of character vectors. -#' @keywords character -#' @seealso \code{\link{str_extract}} to extract the first match -#' @examples -#' shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2") -#' str_extract_all(shopping_list, "[a-z]+") -#' str_extract_all(shopping_list, "\\b[a-z]+\\b") -#' str_extract_all(shopping_list, "\\d") -str_extract_all <- function(string, pattern) { - positions <- str_locate_all(string, pattern) - llply(seq_along(string), function(i) { - position <- positions[[i]] - str_sub(string[i], position[, "start"], position[, "end"]) - }) -} - -#' Replace replaced occurences of a matched pattern in a string. -#' -#' @param string input character vector -#' @param pattern pattern to look for. See \code{\link{regex}} for -#' description. -#' @param replacement replacement string. References of the form \code{\1}, -#' \code{\2} will be replaced with the contents of the respective matched -#' group (created by \code{()}) within the pattern. -#' @return character vector. -#' @keywords character -#' @seealso \code{\link{gsub}} which this function wraps -str_replace <- function(string, pattern, replacement) { - gsub(pattern, replacement, string) -} - - -#' Split up a string by a pattern. -#' -#' @param string input character vector -#' @param pattern pattern to split up string by. See \code{\link{regex}} for -#' description. If \code{NA}, returns original string. If \code{""} splits -#' into individual characters. -#' @param n maximum number of pieces to return. Default (Inf) uses all -#' possible split positions -#' @return a list of character vectors. -#' @keywords character -#' @seealso \code{\link{strsplit}} which this function wraps -str_split <- function(string, pattern, n = Inf) { - string <- as.character(string) - if (n == Inf) { - strsplit(string, pattern) - } else if (n == 1) { - string - } else { - locations <- str_locate_all(string, pattern) - llply(locations, function(mat) { - cut <- mat[seq_len(min(n - 1, nrow(mat))), , drop = FALSE] - keep <- matrix(c(0, t(cut), Inf), ncol = 2, byrow = TRUE) - - str_sub(string, keep[, 1] + 1, keep[, 2] - 1) - }) - } -} diff --git a/R/replace.r b/R/replace.r new file mode 100644 index 00000000..2fa6e1ba --- /dev/null +++ b/R/replace.r @@ -0,0 +1,28 @@ + +#' Replace replaced occurences of a matched pattern in a string. +#' +#' @param string input character vector +#' @param pattern pattern to look for. See \code{\link{regex}} for +#' description. +#' @param replacement replacement string. References of the form \code{\1}, +#' \code{\2} will be replaced with the contents of the respective matched +#' group (created by \code{()}) within the pattern. +#' @return character vector. +#' @keywords character +#' @seealso \code{\link{gsub}} which this function wraps +str_replace <- function(string, pattern, replacement) { + gsub(pattern, replacement, string) +} + + +#' Trim whitespace from start and end of string. +#' +#' @param string input character vector +#' @return character vector with leading and trailing whitespace removed +#' @keywords character +#' @examples +#' str_trim(" String with trailing and leading white space\t") +#' str_trim("\n\nString with trailing and leading white space\n\n") +str_trim <- function(string) { + str_replace(string, "^\\s+|\\s+$", "") +} diff --git a/R/split.r b/R/split.r new file mode 100644 index 00000000..b7227e38 --- /dev/null +++ b/R/split.r @@ -0,0 +1,28 @@ + +#' Split up a string by a pattern. +#' +#' @param string input character vector +#' @param pattern pattern to split up string by. See \code{\link{regex}} for +#' description. If \code{NA}, returns original string. If \code{""} splits +#' into individual characters. +#' @param n maximum number of pieces to return. Default (Inf) uses all +#' possible split positions +#' @return a list of character vectors. +#' @keywords character +#' @seealso \code{\link{strsplit}} which this function wraps +str_split <- function(string, pattern, n = Inf) { + string <- as.character(string) + if (n == Inf) { + strsplit(string, pattern) + } else if (n == 1) { + string + } else { + locations <- str_locate_all(string, pattern) + llply(locations, function(mat) { + cut <- mat[seq_len(min(n - 1, nrow(mat))), , drop = FALSE] + keep <- matrix(c(0, t(cut), Inf), ncol = 2, byrow = TRUE) + + str_sub(string, keep[, 1] + 1, keep[, 2] - 1) + }) + } +} diff --git a/R/sub.r b/R/sub.r new file mode 100644 index 00000000..09d5cee6 --- /dev/null +++ b/R/sub.r @@ -0,0 +1,42 @@ +#' Extract substrings from a character vector. +#' +#' \code{sub_str} will replicate all arguments to be the same length as the +#' longest argument. If any arguments are of length 0, the output will be +#' a zero length character vector. +#' +#' @param string input character vector +#' @param start integer vector giving position of first charater in substring, +#' defaults to first character. +#' @param end integer vector giving position of last character in substring, +#' defaults to last character. +#' @return character vector of substring. Will be length of longest input +#' argument +#' @keywords character +#' @seealso \code{\link{substring}} which this function wraps +#' @examples +#' hw <- "Hadley Wickham" +#' +#' str_sub(hw, 1, 6) +#' str_sub(hw, end = 6) +#' str_sub(hw, 8, 14) +#' str_sub(hw, 8) +#' str_sub(hw, c(1, 8), c(6, 14)) +#' +#' str_sub(hw, seq_len(str_length(hw))) +#' str_sub(hw, end = seq_len(str_length(hw))) +str_sub <- function(string, start = 0, end = Inf) { + if (length(string) == 0 || length(start) == 0 || length(end) == 0) { + return(vector("character", 0)) + } + + n <- max(length(string), length(start), length(end)) + string <- rep(string, length = n) + start <- rep(start, length = n) + end <- rep(end, length = n) + + # Replace infinite ends with length of string + max_length <- !is.na(end) & end == Inf + end[max_length] <- str_length(string)[max_length] + + substring(string, start, end) +} \ No newline at end of file