diff --git a/DESCRIPTION b/DESCRIPTION index 3bddab3b..0e413527 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,7 @@ Depends: Imports: dplyr (>= 0.5.0), tidyr, - stringi + snakecase License: MIT + file LICENSE LazyData: true RoxygenNote: 6.0.1 diff --git a/R/clean_names.R b/R/clean_names.R index 8b7182dd..cc5f933f 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -1,12 +1,31 @@ #' @title Cleans names of a data.frame. #' #' @description -#' Resulting names are unique and consist only of the \code{_} character, lowercase letters, and numbers. +#' Resulting names are unique and consist only of the \code{_} character, numbers and letters regarding the specified \code{case}. #' #' Accented characters are #' transliterated to ASCII. For example, an "o" with a german umlaut over it becomes "o", and the Spanish character "enye" becomes "n". #' #' @param dat the input data.frame. +#' @param case The desired target case (default is \code{"snake"}), provided as one of the following: +#' \itemize{ +#' \item{snake_case: \code{"snake"}} +#' \item{lowerCamel: \code{"lower_camel"} or \code{"small_camel"}} +#' \item{UpperCamel: \code{"upper_camel"} or \code{"big_camel"}} +#' \item{ALL_CAPS: \code{"screaming_snake"} or \code{"all_caps"}} +#' \item{lowerUPPER: \code{"lower_upper"}} +#' \item{UPPERlower: \code{"upper_lower"}} +#' } +#' +#' There are three "special" cases available: +#' \itemize{ +#' \item{\code{"parsed"}: Every substring a string consists of, surrounded by an underscore. No lower or +#' upper case pattern from the input string are changed.} +#' \item{\code{"mixed"}: Almost the same as \code{case = "parsed"}. Every letter which is not at the start +#' or behind an underscore is turned into lowercase.} +#' \item{\code{"none"}: This case is just available as an artifact from the underlying snakecase-pkg, but shouldn't be called within the context of \code{clean_names()}.} +#' } +#' #' @return Returns the data.frame with clean names. #' @export #' @examples @@ -20,25 +39,27 @@ #' # library(readxl) #' # readxl("messy_excel_file.xlsx") %>% clean_names() -clean_names <- function(dat){ +clean_names <- function(dat, case = "snake"){ # Takes a data.frame, returns the same data frame with cleaned names old_names <- names(dat) new_names <- old_names %>% gsub("'", "", .) %>% # remove quotation marks gsub("\"", "", .) %>% # remove quotation marks - gsub("%", "percent", .) %>% + gsub("%", ".percent_", .) %>% # starting with "." as a workaround, to make + # ".percent" a valid name. The "." will be replaced in the call to to_any_case + # via the preprocess argument anyway. gsub("^[ ]+", "", .) %>% make.names(.) %>% - gsub("[.]+", "_", .) %>% # convert 1+ periods to single _ - gsub("[_]+", "_", .) %>% # fix rare cases of multiple consecutive underscores - tolower(.) %>% - gsub("_$", "", .) %>% # remove string-final underscores - stringi::stri_trans_general("latin-ascii") - + # Handle dots, multiple underscores, case conversion, string transliteration + snakecase::to_any_case(case = case, preprocess = "\\.", + replace_special_characters = c("Latin-ASCII")) + # Handle duplicated names - they mess up dplyr pipelines # This appends the column number to repeated instances of duplicate variable names - dupe_count <- sapply(1:length(new_names), function(i) { sum(new_names[i] == new_names[1:i]) }) + dupe_count <- vapply(1:length(new_names), function(i) { + sum(new_names[i] == new_names[1:i]) }, integer(1)) + new_names[dupe_count > 1] <- paste(new_names[dupe_count > 1], dupe_count[dupe_count > 1], sep = "_") diff --git a/man/clean_names.Rd b/man/clean_names.Rd index 0a559f5d..ceb9706a 100644 --- a/man/clean_names.Rd +++ b/man/clean_names.Rd @@ -4,16 +4,35 @@ \alias{clean_names} \title{Cleans names of a data.frame.} \usage{ -clean_names(dat) +clean_names(dat, case = "snake") } \arguments{ \item{dat}{the input data.frame.} + +\item{case}{The desired target case (default is \code{"snake"}), provided as one of the following: +\itemize{ + \item{snake_case: \code{"snake"}} + \item{lowerCamel: \code{"lower_camel"} or \code{"small_camel"}} + \item{UpperCamel: \code{"upper_camel"} or \code{"big_camel"}} + \item{ALL_CAPS: \code{"screaming_snake"} or \code{"all_caps"}} + \item{lowerUPPER: \code{"lower_upper"}} + \item{UPPERlower: \code{"upper_lower"}} + } + + There are three "special" cases available: +\itemize{ + \item{\code{"parsed"}: Every substring a string consists of, surrounded by an underscore. No lower or + upper case pattern from the input string are changed.} + \item{\code{"mixed"}: Almost the same as \code{case = "parsed"}. Every letter which is not at the start + or behind an underscore is turned into lowercase.} + \item{\code{"none"}: This case is just available as an artifact from the underlying snakecase-pkg, but shouldn't be called within the context of \code{clean_names()}.} + }} } \value{ Returns the data.frame with clean names. } \description{ -Resulting names are unique and consist only of the \code{_} character, lowercase letters, and numbers. +Resulting names are unique and consist only of the \code{_} character, numbers and letters regarding the specified \code{case}. Accented characters are transliterated to ASCII. For example, an "o" with a german umlaut over it becomes "o", and the Spanish character "enye" becomes "n".