Skip to content

Commit

Permalink
Vectorise pattern in str_detect. Add fixed and ignore.case modifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Aug 23, 2010
1 parent bb839d4 commit c103344
Show file tree
Hide file tree
Showing 21 changed files with 166 additions and 41 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ Depends: plyr
License: GPL-2
Collate: 'c.r' 'checks.r' 'detect.r' 'dup.r' 'extract.r' 'length.r'
'locate.r' 'match.r' 'modifiers.r' 'pad-trim.r' 'replace.r'
'split.r' 'sub.r'
'split.r' 'sub.r' 'vectorise.r'
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(invert_match)
export(str_match)
export(str_match_all)
export(fixed)
export(ignore.case)
export(str_pad)
export(str_trim)
export(str_replace)
Expand Down
7 changes: 7 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
stringr 0.4
===========

* fixed() function now tells stringr functions to use fixed matching, rather
than escaping the regular expression. Should improve performance for
large vectors
* new ignore.case() modifier tells stringr functions to ignore case of
pattern
* all functions are now vectorised with respect to string, pattern and
(where appropriate) replacement parameters
* str_replace renamed to str_replace_all and new str_replace function added.
This makes str_replace consistent with all functions.
* new str_sub<- function (analogous to substring<-) for substring replacement
Expand Down
10 changes: 7 additions & 3 deletions R/checks.r
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ check_string <- function(string) {
#'
#' @param pattern input vector
#' @keywords internal
check_pattern <- function(pattern) {
if (!is.character(pattern) || length(pattern) != 1)
stop("Pattern must be character vector of length one", call. = FALSE)
check_pattern <- function(pattern, string, replacement = NULL) {
  # Validates `pattern` for use together with `string` and, optionally,
  # `replacement`. Returns `pattern` unchanged when it passes both checks and
  # raises an error otherwise.
  if (!is.character(pattern)) {
    stop("Pattern must be a character vector", call. = FALSE)
  }

  # The lengths of all supplied inputs must recycle cleanly against each
  # other. recyclable() ignores zero-length inputs, so the default NULL
  # replacement does not take part in the check.
  if (!recyclable(string, pattern, replacement)) {
    # call. = FALSE keeps this message in the same form as the one above.
    stop("Lengths of string and pattern not compatible", call. = FALSE)
  }

  pattern
}
10 changes: 6 additions & 4 deletions R/detect.r
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#' Detect the presence or absence of a pattern in a string.
#'
#' Vectorised over \code{string}. \code{pattern} should be a single pattern,
#' i.e. a character vector of length one.
#' Vectorised over \code{string} and \code{pattern}.
#'
#' @param string input character vector
#' @param pattern pattern to look for, as defined by a POSIX regular
Expand All @@ -18,11 +17,14 @@
#' str_detect(fruit, "a$")
#' str_detect(fruit, "b")
#' str_detect(fruit, "[aeiou]")
#'
#' # Also vectorised over pattern
#' str_detect("aecfg", letters)
str_detect <- function(string, pattern) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

results <- grepl(pattern, string)
results <- re_vectorise("grepl", string, pattern)
is.na(results) <- is.na(string)

results
Expand Down
4 changes: 2 additions & 2 deletions R/extract.r
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#' str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
str_extract <- function(string, pattern) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

positions <- str_locate(string, pattern)
str_sub(string, positions[, "start"], positions[, "end"])
Expand All @@ -45,7 +45,7 @@ str_extract <- function(string, pattern) {
#' str_extract_all(shopping_list, "\\d")
str_extract_all <- function(string, pattern) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

positions <- str_locate_all(string, pattern)
llply(seq_along(string), function(i) {
Expand Down
4 changes: 2 additions & 2 deletions R/locate.r
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' str_locate(fruit, "e")
str_locate <- function(string, pattern) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

if (length(string) == 0) return(character())
match <- regexpr(pattern, string)
Expand Down Expand Up @@ -65,7 +65,7 @@ str_locate <- function(string, pattern) {
str_locate_all <- function(string, pattern) {
if (length(string) == 0) return(character())
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

matches <- gregexpr(pattern, string)

Expand Down
2 changes: 1 addition & 1 deletion R/match.r
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' @export
str_match <- function(string, pattern) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

# Locate complete match
matches <- str_extract(string, pattern)
Expand Down
36 changes: 30 additions & 6 deletions R/modifiers.r
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#' Escape all special regular expression characters.
#' Match fixed characters, not regular expression.
#'
#' This function escapes all characters that have meaning for regular
#' expressions so the string will be matched exactly as is.
#' This function specifies that a pattern is a fixed string, rather
#' than a regular expression. This can yield substantial speed ups, if
#' regular expression matching is not needed.
#'
#' @param string string to match exactly as is
#' @author William Dunlap \email{wdunlap@@tibco.com}
#' @keywords character
#' @export
#' @examples
Expand All @@ -13,7 +13,31 @@
#' str_detect(strings, pattern)
#' str_detect(strings, fixed(pattern))
fixed <- function(string) {
string <- check_string(string)
structure(string, fixed = TRUE)
}

str_replace_all(string, "([][^${}().?*+\\|])", "\\\\\\1")
# Report whether a pattern carries the "fixed" marker.
#
# Returns the value of the "fixed" attribute of `string`, or FALSE when that
# attribute is absent.
is.fixed <- function(string) {
  # exact = TRUE: attr() partially matches attribute names by default, so an
  # unrelated attribute such as "fixed_width" would otherwise be read as the
  # flag.
  fixed <- attr(string, "fixed", exact = TRUE)
  if (is.null(fixed)) FALSE else fixed
}

#' Mark a pattern as case-insensitive.
#'
#' Tags \code{string} with an \code{ignore.case} attribute set to
#' \code{TRUE}; the values of \code{string} themselves are left as they are.
#'
#' @param string pattern for which to ignore case
#' @keywords character
#' @export
#' @examples
#' pattern <- "a.b"
#' strings <- c("ABB", "aaB", "aab")
#' str_detect(strings, pattern)
#' str_detect(strings, ignore.case(pattern))
ignore.case <- function(string) {
  flagged <- structure(string, ignore.case = TRUE)
  flagged
}

# Report whether a pattern carries the "ignore.case" marker.
#
# Returns the value of the "ignore.case" attribute of `string`, or FALSE when
# that attribute is absent.
case.ignored <- function(string) {
  # exact = TRUE: attr() partially matches attribute names by default, so an
  # attribute whose name merely starts with "ignore.case" would otherwise be
  # read as the flag.
  ignore.case <- attr(string, "ignore.case", exact = TRUE)
  if (is.null(ignore.case)) FALSE else ignore.case
}
8 changes: 4 additions & 4 deletions R/replace.r
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Replace first occurence of a matched pattern in a string.
#' Replace first occurrence of a matched pattern in a string.
#'
#' Vectorised over \code{string}. \code{pattern} and \code{replacement}
#' should both be single strings, i.e. character vectors of length one.
Expand All @@ -17,12 +17,12 @@
#' @export
str_replace <- function(string, pattern, replacement) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string, replacement)

sub(pattern, replacement, string)
}

#' Replace all occurences of a matched pattern in a string.
#' Replace all occurrences of a matched pattern in a string.
#'
#' Vectorised over \code{string}. \code{pattern} and \code{replacement}
#' should both be single strings, i.e. character vectors of length one.
Expand All @@ -41,7 +41,7 @@ str_replace <- function(string, pattern, replacement) {
#' @export
str_replace_all <- function(string, pattern, replacement) {
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string, replacement)

gsub(pattern, replacement, string)
}
4 changes: 2 additions & 2 deletions R/split.r
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
str_split_fixed <- function(string, pattern, n) {
if (length(string) == 0) return(matrix(character(), nrow = n, ncol = 1))
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

if (!is.numeric(n) || length(n) != 1) {
stop("n should be a numeric vector of length 1")
Expand Down Expand Up @@ -82,7 +82,7 @@ str_split_fixed <- function(string, pattern, n) {
str_split <- function(string, pattern, n = Inf) {
if (length(string) == 0) return(list())
string <- check_string(string)
pattern <- check_pattern(pattern)
pattern <- check_pattern(pattern, string)

if (!is.numeric(n) || length(n) != 1) {
stop("n should be a numeric vector of length 1")
Expand Down
28 changes: 28 additions & 0 deletions R/vectorise.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# General wrapper around sub, gsub, regexpr, gregexpr and grepl.
#
# Calls `f` with `pattern` and `string` as its first two positional arguments
# and sets `fixed` / `ignore.case` from the "fixed" and "ignore.case"
# attributes of `pattern` (read with is.fixed() and case.ignored()).
#
# f        a function, or the name of one, resolved with match.fun()
# string   input vector, passed as the second positional argument to `f`
# pattern  pattern vector; a single pattern is handed to `f` in one call,
#          several patterns are paired with `string` element-wise by mapply()
# ...      further arguments forwarded to `f`; pass them by name (e.g.
#          `replacement = ...` for sub or gsub), since unnamed extras would
#          land after `string` positionally. With several patterns they are
#          iterated over by mapply() alongside `pattern` and `string`.
#
# Returns whatever `f` returns for a single pattern, otherwise the unnamed
# result of mapply().
re_vectorise <- function(f, string, pattern, ...) {
  f <- match.fun(f)
  fixed <- is.fixed(pattern)
  ignore_case <- case.ignored(pattern)

  # TODO: need tests for fixed and ignore.case
  if (length(pattern) == 1) {
    f(pattern, string, ..., fixed = fixed, ignore.case = ignore_case)
  } else {
    # mapply() names its result after the first vectorised argument, so the
    # names are dropped to match the single-pattern branch.
    unname(mapply(f, pattern, string, ...,
      MoreArgs = list(fixed = fixed, ignore.case = ignore_case)))
  }
}

# Decide whether the supplied vectors can be recycled against each other.
# Zero-length inputs are left out of the comparison; when nothing remains the
# answer is TRUE. Otherwise every remaining length must divide the longest one.
recyclable <- function(...) {
  sizes <- vapply(list(...), length, 1)
  non_empty <- sizes[sizes != 0]

  if (length(non_empty) == 0) {
    return(TRUE)
  }

  longest <- max(non_empty)
  all(longest %% non_empty == 0)
}
5 changes: 2 additions & 3 deletions TODO
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
* add examples to str_replace and str_match

* vectorise with respect to pattern
* implement fixed and ignore.case functions
* check that str_locate, str_extract, str_match, str_replace and str_split all
use new vectorisation and modifier strategy.
23 changes: 23 additions & 0 deletions inst/tests/test-check.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
library(testthat)
library_if_available(stringr)

context("String and pattern checks")

# check_string() must reject a list with an error mentioning "must be an atomic".
test_that("string is atomic", {
expect_that(check_string(list()), throws_error("must be an atomic"))
})

# A non-character pattern (here a number) must be rejected by check_pattern().
test_that("pattern is a string", {
expect_that(check_pattern(1), throws_error("must be a character vector"))
})

# check_pattern(pattern, string) returns the pattern when the lengths are
# compatible and raises a "not compatible" error otherwise.
test_that("error when string and pattern lengths incompatible", {
# A length-one input paired with a longer one is accepted in either position.
expect_that(check_pattern(letters, "a"), equals(letters))
expect_that(check_pattern("a", letters), equals("a"))

# Lengths 3 and 2 are rejected whichever argument is the longer one.
expect_that(check_pattern(c("a", "b", "c"), c("a", "b")),
throws_error("not compatible"))
expect_that(check_pattern(c("a", "b"), c("a", "b", "c")),
throws_error("not compatible"))
})

14 changes: 14 additions & 0 deletions inst/tests/test-detect.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,17 @@ test_that("special cases are correct", {
expect_that(str_detect(NA, ""), equals(NA))
expect_that(str_detect(character(), ""), equals(logical()))
})

# str_detect() accepts a vector of patterns as well as a vector of strings.
# Expected values use TRUE/FALSE rather than T/F, which can be reassigned.
test_that("vectorised patterns work", {
  # One string against several patterns: one result per pattern.
  expect_that(str_detect("ab", c("a", "b", "c")), equals(c(TRUE, TRUE, FALSE)))
  # Equal-length string and pattern vectors are matched pairwise.
  expect_that(str_detect(c("ca", "ab"), c("a", "c")), equals(c(TRUE, FALSE)))
})

# The ignore.case() and fixed() wrappers change how str_detect() treats a pattern.
test_that("modifiers work", {
# Matching is case-sensitive unless the pattern is wrapped in ignore.case().
expect_that(str_detect("ab", "AB"), equals(FALSE))
expect_that(str_detect("ab", ignore.case("AB")), equals(TRUE))

# As a regular expression "ab[c]" matches "abc"; wrapped in fixed() it only
# matches the literal text "ab[c]".
expect_that(str_detect("abc", "ab[c]"), equals(TRUE))
expect_that(str_detect("abc", fixed("ab[c]")), equals(FALSE))
expect_that(str_detect("ab[c]", fixed("ab[c]")), equals(TRUE))
})
2 changes: 1 addition & 1 deletion man/check_pattern.Rd
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
\name{check_pattern}
\alias{check_pattern}
\title{Check that pattern is of the correct type for stringr functions...}
\usage{check_pattern(pattern)}
\usage{check_pattern(pattern, string, replacement)}

\description{
Check that pattern is of the correct type for stringr functions
Expand Down
10 changes: 5 additions & 5 deletions man/fixed.Rd
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
\name{fixed}
\alias{fixed}
\title{Escape all special regular expression characters.}
\title{Match fixed characters, not regular expression.}
\usage{fixed(string)}

\description{
Escape all special regular expression characters.
Match fixed characters, not regular expression.
}

\details{
This function escapes all characters that have meaning for regular
expressions so the string will be matched exactly as is.
This function specifies that a pattern is a fixed string, rather than
a regular expression. This can yield substantial speed ups, if
regular expression matching is not needed.
}
\author{William Dunlap \email{wdunlap@tibco.com}}
\keyword{character}
\arguments{
\item{string}{string to match exactly as is}
Expand Down
21 changes: 21 additions & 0 deletions man/ignore.case.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
\name{ignore.case}
\alias{ignore.case}
\title{Ignore case of match.}
\usage{ignore.case(string)}

\description{
Ignore case of match.
}

\details{
This function specifies that a pattern should ignore the case of
matches.
}
\keyword{character}
\arguments{
\item{string}{pattern for which to ignore case}
}
\examples{pattern <- "a.b"
strings <- c("ABB", "aaB", "aab")
str_detect(strings, pattern)
str_detect(strings, ignore.case(pattern))}
8 changes: 5 additions & 3 deletions man/str_detect.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
}

\details{
Vectorised over \code{string}. \code{pattern} should be a single
pattern, i.e. a character vector of length one.
Vectorised over \code{string} and \code{pattern}.
}
\value{boolean vector}
\seealso{\code{\link{grepl}} which this function wraps}
Expand All @@ -25,4 +24,7 @@ str_detect(fruit, "a")
str_detect(fruit, "^a")
str_detect(fruit, "a$")
str_detect(fruit, "b")
str_detect(fruit, "[aeiou]")}
str_detect(fruit, "[aeiou]")

# Also vectorised over pattern
str_detect("aecfg", letters)}
4 changes: 2 additions & 2 deletions man/str_replace.Rd
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
\name{str_replace}
\alias{str_replace}
\title{Replace first occurence of a matched pattern in a string.}
\title{Replace first occurrence of a matched pattern in a string.}
\usage{str_replace(string, pattern, replacement)}

\description{
Replace first occurence of a matched pattern in a string.
Replace first occurrence of a matched pattern in a string.
}

\details{
Expand Down
4 changes: 2 additions & 2 deletions man/str_replace_all.Rd
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
\name{str_replace_all}
\alias{str_replace_all}
\title{Replace all occurences of a matched pattern in a string.}
\title{Replace all occurrences of a matched pattern in a string.}
\usage{str_replace_all(string, pattern, replacement)}

\description{
Replace all occurences of a matched pattern in a string.
Replace all occurrences of a matched pattern in a string.
}

\details{
Expand Down

0 comments on commit c103344

Please sign in to comment.