[r] add external tests #276 (#279)
* Add external tests

* Update external-tests.yaml

* Documentation improvement

* Test if external tests fail

* Line length fix

* ver change

* Documentation improvement

* code cov fix

* test fix

* Update NEWS.md

Co-authored-by: Hubert Baniecki <[email protected]>
maksymiuks and hbaniecki authored Aug 3, 2020
1 parent 2865793 commit c23e860
Showing 44 changed files with 718 additions and 166 deletions.
104 changes: 104 additions & 0 deletions .github/workflows/external-tests.yaml
@@ -0,0 +1,104 @@
on:
  push:
    branches:
      - master
      - 'dev*'
      - 'fix*'
      - 'issue*'
      - 'python*'
      - 'r*'
      - 'doc*'
      - 'gh-actions'
      - 'githubactions'
    paths:
      - '**.R'
      - '**.r'
      - '*DESCRIPTION'
      - '**.Rmd'
      - '**.rmd'
      - '**.rda'
      - '**.Rda'
      - '**.rds'
      - '**.Rds'
  pull_request:
    branches:
      - master
    paths:
      - '**.R'
      - '**.r'
      - '*DESCRIPTION'
      - '**.Rmd'
      - '**.rmd'
      - '**.rda'
      - '**.Rda'
      - '**.rds'
      - '**.Rds'

name: External-tests

jobs:
  External-tests:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: macOS-latest, r: '4.0'}
          - {os: windows-latest, r: 'devel'}
          - {os: windows-latest, r: '4.0'}
          - {os: windows-latest, r: '3.6'}
          - {os: ubuntu-16.04, r: '4.0', rspm: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
          - {os: ubuntu-16.04, r: '3.6', rspm: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
    env:
      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
      RSPM: ${{ matrix.config.rspm }}

    steps:
      - uses: actions/checkout@v2

      - uses: r-lib/actions/setup-r@master
        with:
          r-version: ${{ matrix.config.r }}

      - uses: r-lib/actions/setup-pandoc@master

      - name: Query dependencies
        run: |
          install.packages('remotes')
          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
        shell: Rscript {0}

      - name: Cache R packages
        if: runner.os != 'Windows'
        uses: actions/cache@v1
        with:
          path: ${{ env.R_LIBS_USER }}
          key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-

      - name: Install system dependencies
        if: runner.os == 'Linux'
        env:
          RHUB_PLATFORM: linux-x86_64-ubuntu-gcc
        run: |
          Rscript -e "remotes::install_github('r-hub/sysreqs')"
          sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))")
          sudo -s eval "$sysreqs"
          sudo apt-get install -y qpdf
      - name: Install dependencies
        run: |
          remotes::install_deps(dependencies = TRUE)
          remotes::install_github("ModelOriented/ingredients")
          remotes::install_github("ModelOriented/iBreakDown")
          remotes::install_cran(c("rcmdcheck", "glmnet", "parsnip", "ranger", "randomForest", "e1071", "caret", "gbm", "rms", "rpart", "kernlab"))
        shell: Rscript {0}

      - name: Test
        run: |
          Sys.setenv(NOT_CRAN = "true")
          remotes::install_local()
          source("tests/external_tests/external_test_yhat_model_info.R")
        shell: Rscript {0}
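The final step sources tests/external_tests/external_test_yhat_model_info.R, which is not part of this diff. As a rough, hedged sketch only (object names and checks are illustrative, not the repository's actual script), such an external test could fit a model from one of the packages installed above and assert that yhat() and model_info() behave as documented:

library("DALEX")
library("ranger")

# fit a model from an external package and wrap it in an explainer
apartments_ranger <- ranger::ranger(m2.price ~ ., data = apartments, num.trees = 50)
explainer <- explain(apartments_ranger,
                     data = apartments[, -1],
                     y = apartments$m2.price,
                     verbose = FALSE)

# yhat() (called internally by explain) should give one numeric prediction per row
stopifnot(is.numeric(explainer$y_hat),
          length(explainer$y_hat) == nrow(apartments))

# model_info() should detect the source package and the task type
info <- model_info(apartments_ranger)
stopifnot(info$package == "ranger", info$type == "regression")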
1 change: 1 addition & 0 deletions .github/workflows/test-coverage.yaml
@@ -2,6 +2,7 @@ on:
  push:
    branches:
      - master
      - 'issue*'
      - 'gh-actions'
      - 'githubactions'
    paths:
38 changes: 33 additions & 5 deletions R/explain.R
@@ -11,7 +11,7 @@
#' @param y numeric vector with outputs / scores. If provided then it shall have the same size as \code{data}
#' @param weights numeric vector with sampling weights. By default it's \code{NULL}. If provided then it shall have the same length as \code{data}
#' @param predict_function function that takes two arguments: model and new data and returns numeric vector with predictions. By default it is \code{yhat}.
#' @param residual_function function that takes four arguments: model, data, response vector y and predict function (optionally). It should return a numeric vector with model residuals for given data. If not provided, response residuals (\eqn{y-\hat{y}}) are calculated. By default it is \code{residual_function_default}.
#' @param residual_function function that takes four arguments: model, data, target vector y and predict function (optionally). It should return a numeric vector with model residuals for given data. If not provided, response residuals (\eqn{y-\hat{y}}) are calculated. By default it is \code{residual_function_default}.
#' @param ... other parameters
#' @param label character - the name of the model. By default it's extracted from the 'class' attribute of the model
#' @param verbose logical. If TRUE (default) then diagnostic messages will be printed
@@ -58,20 +58,48 @@
#' # silent execution
#' aps_lm <- explain(aps_lm_model4, verbose = FALSE)
#'
#' # user provided predict_function
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", predict_function = predict)
#'
#' # set target variable
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", y = apartments$m2.price)
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", y = apartments$m2.price,
#' predict_function = predict)
#'
#' # user provided predict_function
#' aps_ranger <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' custom_predict <- function(X.model, newdata) {
#' predict(X.model, newdata)$predictions
#' }
#' aps_ranger_exp <- explain(aps_ranger, data = apartments, y = apartments$m2.price,
#' predict_function = custom_predict)
#'
#'
#' # user provided residual_function
#' aps_ranger <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' custom_residual <- function(X.model, newdata, y, predict_function) {
#' abs(y - predict_function(X.model, newdata))
#' }
#' aps_ranger_exp <- explain(aps_ranger, data = apartments,
#' y = apartments$m2.price,
#' residual_function = custom_residual)
#'
#' # binary classification
#' titanic_ranger <- ranger::ranger(as.factor(survived)~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
#' # keep in mind that for binary classification the y parameter has to be numeric with 0 and 1 values
#' titanic_ranger_exp <- explain(titanic_ranger, data = titanic_imputed, y = titanic_imputed$survived)
#'
#' # multilabel classification
#' hr_ranger <- ranger::ranger(status~., data = HR, num.trees = 50, probability = TRUE)
#' # keep in mind that for multilabel classification the y parameter has to be a factor,
#' # with the same levels as in the training data
#' hr_ranger_exp <- explain(hr_ranger, data = HR, y = HR$status)
#'
#' # set model_info
#' model_info <- list(package = "stats", ver = "3.6.2", type = "regression")
#' aps_lm_model4 <- lm(m2.price ~., data = apartments)
#' aps_lm_explainer4 <- explain(aps_lm_model4, data = apartments, label = "model_4v",
#' model_info = model_info)
#'
#' \dontrun{
#' \donttest{
#' # set model_info
#' model_info <- list(package = "stats", ver = "3.6.2", type = "regression")
#' aps_lm_model4 <- lm(m2.price ~., data = apartments)
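As a side note on the residual_function contract documented above (model, data, y, predict function): the default, response-residual behaviour can be written in a few lines. This is only an illustrative sketch assuming that signature, not necessarily the exact body of residual_function_default in the package:

residual_function_default <- function(model, data, y, predict_function = yhat) {
  # response residuals: observed target minus model predictions
  y - predict_function(model, data)
}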
2 changes: 1 addition & 1 deletion R/misc_loss_functions.R
@@ -11,7 +11,7 @@
#' @aliases loss_cross_entropy loss_sum_of_squares loss_root_mean_square loss_accuracy loss_one_minus_auc
#' @export
#' @examples
#' \dontrun{
#' \donttest{
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
4 changes: 2 additions & 2 deletions R/misc_yhat.R
@@ -106,7 +106,7 @@ yhat.cv.glmnet <- function(X.model, newdata, ...) {
return(pred[,2])
}
} else {
pred <- predict(X.model, newdata, type = "response")
pred <- as.numeric(predict(X.model, newdata, type = "response", s = X.model$lambda[length(X.model$lambda)]))
}
pred
}
@@ -128,7 +128,7 @@ yhat.glmnet <- function(X.model, newdata, ...) {
return(as.numeric(pred))
}
} else {
pred <- predict(X.model, newdata, type = "response")
pred <- as.numeric(predict(X.model, newdata, type = "response", s = X.model$lambda[length(X.model$lambda)]))
}
pred
}
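The s argument added above matters because, without it, predict() on a glmnet object returns a matrix with one column per lambda on the regularization path rather than the plain numeric vector DALEX expects. A small illustration (assuming the glmnet package is installed; the data set and object names are for demonstration only):

library("glmnet")

x <- as.matrix(mtcars[, -1])
y <- mtcars$mpg
fit <- glmnet(x, y)

# without s: an n x length(fit$lambda) matrix, one column per lambda
dim(predict(fit, x, type = "response"))

# with s fixed to the last lambda on the path (as in yhat.glmnet above):
# a plain numeric vector of length n
p <- as.numeric(predict(fit, x, type = "response", s = fit$lambda[length(fit$lambda)]))
str(p)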
4 changes: 2 additions & 2 deletions R/model_diagnostics.R
@@ -21,7 +21,7 @@
#' diag_lm <- model_diagnostics(explainer_lm)
#' diag_lm
#' plot(diag_lm)
#' \dontrun{
#' \donttest{
#' library("ranger")
#' apartments_ranger_model <- ranger(m2.price ~ ., data = apartments)
#' explainer_ranger <- explain(apartments_ranger_model,
@@ -65,7 +65,7 @@ model_diagnostics <- function(explainer, variables = NULL, ...) {

# are there residuals
if (is.null(explainer$residuals)) {
explainer$residuals <- explainer$residual_function(explainer$model, explainer$data)
explainer$residuals <- explainer$residual_function(explainer$model, explainer$data, explainer$y)
}
if (is.null(dim(explainer$residuals))) {
results$residuals <- explainer$residuals
4 changes: 2 additions & 2 deletions R/model_info.R
@@ -33,8 +33,8 @@
#' model_info(aps_lm_model4)
#'
#' library("ranger")
#' model_regr_rf <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' model_info(model_regr_rf)
#' model_regr_rf <- ranger::ranger(status~., data = HR, num.trees = 50, probability = TRUE)
#' model_info(model_regr_rf, is_multiclass = TRUE)
#'
model_info <- function(model, is_multiclass = FALSE, ...)
UseMethod("model_info")
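For orientation, model_info() returns a plain list; a hedged sketch of inspecting the multiclass example above (returned values are indicative, not verbatim output):

info <- model_info(model_regr_rf, is_multiclass = TRUE)
info$package   # "ranger"
info$type      # "multiclass" -- forced by is_multiclass = TRUE for the probability forest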
47 changes: 32 additions & 15 deletions R/model_parts.R
@@ -4,7 +4,7 @@
#' Find information how to use this function here: \url{https://pbiecek.github.io/ema/featureImportance.html}.
#'
#' @param explainer a model to be explained, preprocessed by the \code{explain} function
#' @param loss_function a function that will be used to assess variable importance. By default it is 1-AUC for classification, cross entropy for multilabel classification and RMSE for regression.
#' @param loss_function a function that will be used to assess variable importance. By default it is 1-AUC for classification, cross entropy for multilabel classification and RMSE for regression. A custom, user-made loss function should accept two obligatory parameters (observed, predicted), where \code{observed} stands for the actual values of the target and \code{predicted} for the predicted values. If the attribute "loss_name" is attached to the function object, it will be plotted as the name of the loss function.
#' @param ... other parameters
#' @param type character, type of transformation that should be applied for dropout loss. \code{variable_importance} and \code{raw} return raw drop losses, \code{ratio} returns \code{drop_loss/drop_loss_full_model} while \code{difference} returns \code{drop_loss - drop_loss_full_model}
#' @param N number of observations that should be sampled for calculation of variable importance. If negative then variable importance will be calculated on whole dataset (no sampling).
Expand All @@ -21,25 +21,42 @@
#' @export
#'
#' @examples
#' \dontrun{
#' \donttest{
#' # regression
#'
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger <- explain(titanic_ranger_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' vi_ranger <- model_parts(explainer_ranger, type = "raw")
#' head(vi_ranger, 8)
#' plot(vi_ranger)
#' apartments_ranger_model <- ranger(m2.price~., data = apartments, num.trees = 50)
#' explainer_ranger <- explain(apartments_ranger_model, data = apartments[,-1],
#' y = apartments$m2.price, label = "Ranger Apartments")
#' model_parts_ranger_aps <- model_parts(explainer_ranger, type = "raw")
#' head(model_parts_ranger_aps, 8)
#' plot(model_parts_ranger_aps)
#'
#' # binary classification
#'
#' titanic_glm_model <- glm(survived~., data = titanic_imputed, family = "binomial")
#' explainer_glm <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' explainer_glm_titanic <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' logit <- function(x) exp(x)/(1+exp(x))
#' vi_glm <- model_parts(explainer_glm, type = "raw",
#' loss_function = function(observed, predicted)
#' sum((observed - logit(predicted))^2))
#' head(vi_glm, 8)
#' plot(vi_glm)
#' custom_loss <- function(observed, predicted){
#' sum((observed - logit(predicted))^2)
#' }
#' attr(custom_loss, "loss_name") <- "Logit residuals"
#' model_parts_glm_titanic <- model_parts(explainer_glm_titanic, type = "raw",
#' loss_function = custom_loss)
#' head(model_parts_glm_titanic, 8)
#' plot(model_parts_glm_titanic)
#'
#' # multilabel classification
#'
#' HR_ranger_model_HR <- ranger(status~., data = HR, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger_HR <- explain(HR_ranger_model_HR, data = HR[,-6],
#' y = HR$status, label = "Ranger HR")
#' model_parts_ranger_HR <- model_parts(explainer_ranger_HR, type = "raw")
#' head(model_parts_ranger_HR, 8)
#' plot(model_parts_ranger_HR)
#'
#'}
#'
model_parts <- function(explainer,
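Since any scalar-valued function with the documented (observed, predicted) signature can act as a loss, here is one more hedged sketch: a mean-absolute-error loss for the regression explainer from the examples above (names are illustrative):

loss_mae <- function(observed, predicted) {
  mean(abs(observed - predicted))
}
# the loss_name attribute is used as the plotted name of the loss function
attr(loss_mae, "loss_name") <- "Mean absolute error"

model_parts_mae <- model_parts(explainer_ranger, loss_function = loss_mae)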
64 changes: 42 additions & 22 deletions R/model_performance.R
@@ -9,36 +9,56 @@
#' @param cutoff a cutoff for classification models, needed for measures like recall, precision, ACC, F1. By default 0.5.
#'
#' @return An object of the class \code{model_performance}.
#'
#' It's a list with following fields:
#'
#' \itemize{
#' \item \code{residuals} - data frame that contains residuals for each observation
#' \item \code{measures} - list with calculated measures dedicated to the task, whether it is regression, binary classification or multiclass classification.
#' \item \code{type} - character that specifies type of the task.
#' }
#'
#' @references Explanatory Model Analysis. Explore, Explain and Examine Predictive Models. \url{https://pbiecek.github.io/ema/}
#' @importFrom stats median weighted.mean
#' @export
#' @examples
#' \dontrun{
#' \donttest{
#' # regression
#'
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 100,
#' probability = TRUE)
#' # It's a good practice to pass data without target variable
#' explainer_ranger <- explain(titanic_ranger_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' # resulting dataframe has predicted values and residuals
#' mp_ex_rn <- model_performance(explainer_ranger)
#' apartments_ranger_model <- ranger(m2.price~., data = apartments, num.trees = 50)
#' explainer_ranger_apartments <- explain(apartments_ranger_model, data = apartments[,-1],
#' y = apartments$m2.price, label = "Ranger Apartments")
#' model_performance_ranger_aps <- model_performance(explainer_ranger_apartments )
#' model_performance_ranger_aps
#' plot(model_performance_ranger_aps)
#' plot(model_performance_ranger_aps, geom = "boxplot")
#' plot(model_performance_ranger_aps, geom = "histogram")
#'
#' # binary classification
#'
#' titanic_glm_model <- glm(survived~., data = titanic_imputed, family = "binomial")
#' explainer_glm <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived,
#' predict_function = function(m,x) predict.glm(m,x,type = "response"),
#' label = "glm")
#' mp_ex_glm <- model_performance(explainer_glm)
#' mp_ex_glm
#' plot(mp_ex_glm)
#' plot(mp_ex_glm, mp_ex_rn)
#' explainer_glm_titanic <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' model_performance_glm_titanic <- model_performance(explainer_glm_titanic)
#' model_performance_glm_titanic
#' plot(model_performance_glm_titanic)
#' plot(model_performance_glm_titanic, geom = "boxplot")
#' plot(model_performance_glm_titanic, geom = "histogram")
#'
#' # multilabel classification
#'
#' HR_ranger_model <- ranger(status~., data = HR, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger_HR <- explain(HR_ranger_model, data = HR[,-6],
#' y = HR$status, label = "Ranger HR")
#' model_performance_ranger_HR <- model_performance(explainer_ranger_HR)
#' model_performance_ranger_HR
#' plot(model_performance_ranger_HR)
#' plot(model_performance_ranger_HR, geom = "boxplot")
#' plot(model_performance_ranger_HR, geom = "histogram")
#'
#' titanic_lm_model <- lm(survived~., data = titanic_imputed)
#' explainer_lm <- explain(titanic_lm_model, data = titanic_imputed[,-8], y = titanic_imputed$survived)
#' mp_ex_lm <- model_performance(explainer_lm)
#' plot(mp_ex_lm)
#' plot(mp_ex_glm, mp_ex_rn, mp_ex_lm)
#' }
#'}
#'
model_performance <- function(explainer, ..., cutoff = 0.5) {
test_explainer(explainer, has_data = TRUE, has_y = TRUE, function_name = "model_performance")
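To tie the return-value documentation to the examples, a short hedged sketch of inspecting the documented fields (reusing the binary classification explainer from above; the exact set of measures depends on the task type):

mp <- model_performance(explainer_glm_titanic)

mp$type             # task type, e.g. "classification"
names(mp$measures)  # task-specific measures, e.g. recall, precision, f1, accuracy, auc
head(mp$residuals)  # data frame with residuals for each observation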