[r] add external tests #276 (#279)
* Add external tests

* Update external-tests.yaml

* Documentation improvement

* Test if external tests fail

* Line length fix

* ver change

* Documentation improvement

* code cov fix

* test fix

* Update NEWS.md

Co-authored-by: Hubert Baniecki <[email protected]>
maksymiuks and hbaniecki authored Aug 3, 2020
1 parent 2865793 commit c23e860
Showing 44 changed files with 718 additions and 166 deletions.
104 changes: 104 additions & 0 deletions .github/workflows/external-tests.yaml
@@ -0,0 +1,104 @@
on:
  push:
    branches:
      - master
      - 'dev*'
      - 'fix*'
      - 'issue*'
      - 'python*'
      - 'r*'
      - 'doc*'
      - 'gh-actions'
      - 'githubactions'
    paths:
      - '**.R'
      - '**.r'
      - '*DESCRIPTION'
      - '**.Rmd'
      - '**.rmd'
      - '**.rda'
      - '**.Rda'
      - '**.rds'
      - '**.Rds'
  pull_request:
    branches:
      - master
    paths:
      - '**.R'
      - '**.r'
      - '*DESCRIPTION'
      - '**.Rmd'
      - '**.rmd'
      - '**.rda'
      - '**.Rda'
      - '**.rds'
      - '**.Rds'

name: External-tests

jobs:
  External-tests:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: macOS-latest, r: '4.0'}
          - {os: windows-latest, r: 'devel'}
          - {os: windows-latest, r: '4.0'}
          - {os: windows-latest, r: '3.6'}
          - {os: ubuntu-16.04, r: '4.0', rspm: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
          - {os: ubuntu-16.04, r: '3.6', rspm: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
    env:
      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
      RSPM: ${{ matrix.config.rspm }}

    steps:
      - uses: actions/checkout@v2

      - uses: r-lib/actions/setup-r@master
        with:
          r-version: ${{ matrix.config.r }}

      - uses: r-lib/actions/setup-pandoc@master

      - name: Query dependencies
        run: |
          install.packages('remotes')
          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
        shell: Rscript {0}

      - name: Cache R packages
        if: runner.os != 'Windows'
        uses: actions/cache@v1
        with:
          path: ${{ env.R_LIBS_USER }}
          key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{ hashFiles('.github/depends.Rds') }}
          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-

      - name: Install system dependencies
        if: runner.os == 'Linux'
        env:
          RHUB_PLATFORM: linux-x86_64-ubuntu-gcc
        run: |
          Rscript -e "remotes::install_github('r-hub/sysreqs')"
          sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))")
          sudo -s eval "$sysreqs"
          sudo apt-get install -y qpdf
      - name: Install dependencies
        run: |
          remotes::install_deps(dependencies = TRUE)
          remotes::install_github("ModelOriented/ingredients")
          remotes::install_github("ModelOriented/iBreakDown")
          remotes::install_cran(c("rcmdcheck", "glmnet", "parsnip", "ranger", "randomForest", "e1071", "caret", "gbm", "rms", "rpart", "kernlab"))
        shell: Rscript {0}

      - name: Test
        run: |
          Sys.setenv(NOT_CRAN = "true")
          remotes::install_local()
          source("tests/external_tests/external_test_yhat_model_info.R")
        shell: Rscript {0}
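The final step sources tests/external_tests/external_test_yhat_model_info.R, which is not part of this diff. As a rough, hedged sketch only (object names and checks are illustrative, not the repository's actual script), such an external test could fit a model from one of the packages installed above and assert that yhat() and model_info() behave as documented:

library("DALEX")
library("ranger")

# fit a model from an external package and wrap it in an explainer
apartments_ranger <- ranger::ranger(m2.price ~ ., data = apartments, num.trees = 50)
explainer <- explain(apartments_ranger,
                     data = apartments[, -1],
                     y = apartments$m2.price,
                     verbose = FALSE)

# yhat() (called internally by explain) should give one numeric prediction per row
stopifnot(is.numeric(explainer$y_hat),
          length(explainer$y_hat) == nrow(apartments))

# model_info() should detect the source package and the task type
info <- model_info(apartments_ranger)
stopifnot(info$package == "ranger", info$type == "regression")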
1 change: 1 addition & 0 deletions .github/workflows/test-coverage.yaml
@@ -2,6 +2,7 @@ on:
  push:
    branches:
      - master
      - 'issue*'
      - 'gh-actions'
      - 'githubactions'
    paths:
38 changes: 33 additions & 5 deletions R/explain.R
@@ -11,7 +11,7 @@
#' @param y numeric vector with outputs / scores. If provided then it shall have the same size as \code{data}
#' @param weights numeric vector with sampling weights. By default it's \code{NULL}. If provided then it shall have the same length as \code{data}
#' @param predict_function function that takes two arguments: model and new data and returns numeric vector with predictions. By default it is \code{yhat}.
#' @param residual_function function that takes four arguments: model, data, response vector y and predict function (optionally). It should return a numeric vector with model residuals for given data. If not provided, response residuals (\eqn{y-\hat{y}}) are calculated. By default it is \code{residual_function_default}.
#' @param residual_function function that takes four arguments: model, data, target vector y and predict function (optionally). It should return a numeric vector with model residuals for given data. If not provided, response residuals (\eqn{y-\hat{y}}) are calculated. By default it is \code{residual_function_default}.
#' @param ... other parameters
#' @param label character - the name of the model. By default it's extracted from the 'class' attribute of the model
#' @param verbose logical. If TRUE (default) then diagnostic messages will be printed
@@ -58,20 +58,48 @@
#' # silent execution
#' aps_lm <- explain(aps_lm_model4, verbose = FALSE)
#'
#' # user provided predict_function
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", predict_function = predict)
#'
#' # set target variable
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", y = apartments$m2.price)
#' aps_lm <- explain(aps_lm_model4, data = apartments, label = "model_4v", y = apartments$m2.price,
#' predict_function = predict)
#'
#' # user provided predict_function
#' aps_ranger <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' custom_predict <- function(X.model, newdata) {
#' predict(X.model, newdata)$predictions
#' }
#' aps_ranger_exp <- explain(aps_ranger, data = apartments, y = apartments$m2.price,
#' predict_function = custom_predict)
#'
#'
#' # user provided residual_function
#' aps_ranger <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' custom_residual <- function(X.model, newdata, y, predict_function) {
#' abs(y - predict_function(X.model, newdata))
#' }
#' aps_ranger_exp <- explain(aps_ranger, data = apartments,
#' y = apartments$m2.price,
#' residual_function = custom_residual)
#'
#' # binary classification
#' titanic_ranger <- ranger::ranger(as.factor(survived)~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
#' # keep in mind that for binary classification the y parameter has to be numeric with 0 and 1 values
#' titanic_ranger_exp <- explain(titanic_ranger, data = titanic_imputed, y = titanic_imputed$survived)
#'
#' # multilabel classification
#' hr_ranger <- ranger::ranger(status~., data = HR, num.trees = 50, probability = TRUE)
#' # keep in mind that for multilabel classification the y parameter has to be a factor,
#' # with the same levels as in the training data
#' hr_ranger_exp <- explain(hr_ranger, data = HR, y = HR$status)
#'
#' # set model_info
#' model_info <- list(package = "stats", ver = "3.6.2", type = "regression")
#' aps_lm_model4 <- lm(m2.price ~., data = apartments)
#' aps_lm_explainer4 <- explain(aps_lm_model4, data = apartments, label = "model_4v",
#' model_info = model_info)
#'
#' \dontrun{
#' \donttest{
#' # set model_info
#' model_info <- list(package = "stats", ver = "3.6.2", type = "regression")
#' aps_lm_model4 <- lm(m2.price ~., data = apartments)
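As a side note on the residual_function contract documented above (model, data, y, predict function): the default, response-residual behaviour can be written in a few lines. This is only an illustrative sketch assuming that signature, not necessarily the exact body of residual_function_default in the package:

residual_function_default <- function(model, data, y, predict_function = yhat) {
  # response residuals: observed target minus model predictions
  y - predict_function(model, data)
}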
2 changes: 1 addition & 1 deletion R/misc_loss_functions.R
@@ -11,7 +11,7 @@
#' @aliases loss_cross_entropy loss_sum_of_squares loss_root_mean_square loss_accuracy loss_one_minus_auc
#' @export
#' @examples
#' \dontrun{
#' \donttest{
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
4 changes: 2 additions & 2 deletions R/misc_yhat.R
@@ -106,7 +106,7 @@ yhat.cv.glmnet <- function(X.model, newdata, ...) {
return(pred[,2])
}
} else {
pred <- predict(X.model, newdata, type = "response")
pred <- as.numeric(predict(X.model, newdata, type = "response", s = X.model$lambda[length(X.model$lambda)]))
}
pred
}
@@ -128,7 +128,7 @@ yhat.glmnet <- function(X.model, newdata, ...) {
return(as.numeric(pred))
}
} else {
pred <- predict(X.model, newdata, type = "response")
pred <- as.numeric(predict(X.model, newdata, type = "response", s = X.model$lambda[length(X.model$lambda)]))
}
pred
}
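The s argument added above matters because, without it, predict() on a glmnet object returns a matrix with one column per lambda on the regularization path rather than the plain numeric vector DALEX expects. A small illustration (assuming the glmnet package is installed; the data set and object names are for demonstration only):

library("glmnet")

x <- as.matrix(mtcars[, -1])
y <- mtcars$mpg
fit <- glmnet(x, y)

# without s: an n x length(fit$lambda) matrix, one column per lambda
dim(predict(fit, x, type = "response"))

# with s fixed to the last lambda on the path (as in yhat.glmnet above):
# a plain numeric vector of length n
p <- as.numeric(predict(fit, x, type = "response", s = fit$lambda[length(fit$lambda)]))
str(p)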
4 changes: 2 additions & 2 deletions R/model_diagnostics.R
@@ -21,7 +21,7 @@
#' diag_lm <- model_diagnostics(explainer_lm)
#' diag_lm
#' plot(diag_lm)
#' \dontrun{
#' \donttest{
#' library("ranger")
#' apartments_ranger_model <- ranger(m2.price ~ ., data = apartments)
#' explainer_ranger <- explain(apartments_ranger_model,
@@ -65,7 +65,7 @@ model_diagnostics <- function(explainer, variables = NULL, ...) {

# are there residuals
if (is.null(explainer$residuals)) {
explainer$residuals <- explainer$residual_function(explainer$model, explainer$data)
explainer$residuals <- explainer$residual_function(explainer$model, explainer$data, explainer$y)
}
if (is.null(dim(explainer$residuals))) {
results$residuals <- explainer$residuals
4 changes: 2 additions & 2 deletions R/model_info.R
@@ -33,8 +33,8 @@
#' model_info(aps_lm_model4)
#'
#' library("ranger")
#' model_regr_rf <- ranger::ranger(m2.price~., data = apartments, num.trees = 50)
#' model_info(model_regr_rf)
#' model_regr_rf <- ranger::ranger(status~., data = HR, num.trees = 50, probability = TRUE)
#' model_info(model_regr_rf, is_multiclass = TRUE)
#'
model_info <- function(model, is_multiclass = FALSE, ...)
UseMethod("model_info")
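For orientation, model_info() returns a plain list; a hedged sketch of inspecting the multiclass example above (returned values are indicative, not verbatim output):

info <- model_info(model_regr_rf, is_multiclass = TRUE)
info$package   # "ranger"
info$type      # "multiclass" -- forced by is_multiclass = TRUE for the probability forest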
47 changes: 32 additions & 15 deletions R/model_parts.R
@@ -4,7 +4,7 @@
#' Find information how to use this function here: \url{https://pbiecek.github.io/ema/featureImportance.html}.
#'
#' @param explainer a model to be explained, preprocessed by the \code{explain} function
#' @param loss_function a function that will be used to assess variable importance. By default it is 1-AUC for classification, cross entropy for multilabel classification and RMSE for regression.
#' @param loss_function a function that will be used to assess variable importance. By default it is 1-AUC for classification, cross entropy for multilabel classification and RMSE for regression. A custom, user-made loss function should accept two obligatory parameters (observed, predicted), where \code{observed} stands for the actual values of the target and \code{predicted} for the predicted values. If the attribute "loss_name" is attached to the function object, it will be plotted as the name of the loss function.
#' @param ... other parameters
#' @param type character, type of transformation that should be applied for dropout loss. \code{variable_importance} and \code{raw} return raw drop losses, \code{ratio} returns \code{drop_loss/drop_loss_full_model} while \code{difference} returns \code{drop_loss - drop_loss_full_model}
#' @param N number of observations that should be sampled for calculation of variable importance. If negative then variable importance will be calculated on whole dataset (no sampling).
Expand All @@ -21,25 +21,42 @@
#' @export
#'
#' @examples
#' \dontrun{
#' \donttest{
#' # regression
#'
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger <- explain(titanic_ranger_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' vi_ranger <- model_parts(explainer_ranger, type = "raw")
#' head(vi_ranger, 8)
#' plot(vi_ranger)
#' apartments_ranger_model <- ranger(m2.price~., data = apartments, num.trees = 50)
#' explainer_ranger <- explain(apartments_ranger_model, data = apartments[,-1],
#' y = apartments$m2.price, label = "Ranger Apartments")
#' model_parts_ranger_aps <- model_parts(explainer_ranger, type = "raw")
#' head(model_parts_ranger_aps, 8)
#' plot(model_parts_ranger_aps)
#'
#' # binary classification
#'
#' titanic_glm_model <- glm(survived~., data = titanic_imputed, family = "binomial")
#' explainer_glm <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' explainer_glm_titanic <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' logit <- function(x) exp(x)/(1+exp(x))
#' vi_glm <- model_parts(explainer_glm, type = "raw",
#' loss_function = function(observed, predicted)
#' sum((observed - logit(predicted))^2))
#' head(vi_glm, 8)
#' plot(vi_glm)
#' custom_loss <- function(observed, predicted){
#' sum((observed - logit(predicted))^2)
#' }
#' attr(custom_loss, "loss_name") <- "Logit residuals"
#' model_parts_glm_titanic <- model_parts(explainer_glm_titanic, type = "raw",
#' loss_function = custom_loss)
#' head(model_parts_glm_titanic, 8)
#' plot(model_parts_glm_titanic)
#'
#' # multilabel classification
#'
#' HR_ranger_model_HR <- ranger(status~., data = HR, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger_HR <- explain(HR_ranger_model_HR, data = HR[,-6],
#' y = HR$status, label = "Ranger HR")
#' model_parts_ranger_HR <- model_parts(explainer_ranger_HR, type = "raw")
#' head(model_parts_ranger_HR, 8)
#' plot(model_parts_ranger_HR)
#'
#'}
#'
model_parts <- function(explainer,
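Since any scalar-valued function with the documented (observed, predicted) signature can act as a loss, here is one more hedged sketch: a mean-absolute-error loss for the regression explainer from the examples above (names are illustrative):

loss_mae <- function(observed, predicted) {
  mean(abs(observed - predicted))
}
# the loss_name attribute is used as the plotted name of the loss function
attr(loss_mae, "loss_name") <- "Mean absolute error"

model_parts_mae <- model_parts(explainer_ranger, loss_function = loss_mae)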
64 changes: 42 additions & 22 deletions R/model_performance.R
@@ -9,36 +9,56 @@
#' @param cutoff a cutoff for classification models, needed for measures like recall, precision, ACC, F1. By default 0.5.
#'
#' @return An object of the class \code{model_performance}.
#'
#' It's a list with following fields:
#'
#' \itemize{
#' \item \code{residuals} - data frame that contains residuals for each observation
#' \item \code{measures} - list with calculated measures dedicated to the task, whether it is regression, binary classification or multiclass classification.
#' \item \code{type} - character that specifies type of the task.
#' }
#'
#' @references Explanatory Model Analysis. Explore, Explain and Examine Predictive Models. \url{https://pbiecek.github.io/ema/}
#' @importFrom stats median weighted.mean
#' @export
#' @examples
#' \dontrun{
#' \donttest{
#' # regression
#'
#' library("ranger")
#' titanic_ranger_model <- ranger(survived~., data = titanic_imputed, num.trees = 100,
#' probability = TRUE)
#' # It's a good practice to pass data without target variable
#' explainer_ranger <- explain(titanic_ranger_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' # resulting dataframe has predicted values and residuals
#' mp_ex_rn <- model_performance(explainer_ranger)
#' apartments_ranger_model <- ranger(m2.price~., data = apartments, num.trees = 50)
#' explainer_ranger_apartments <- explain(apartments_ranger_model, data = apartments[,-1],
#' y = apartments$m2.price, label = "Ranger Apartments")
#' model_performance_ranger_aps <- model_performance(explainer_ranger_apartments )
#' model_performance_ranger_aps
#' plot(model_performance_ranger_aps)
#' plot(model_performance_ranger_aps, geom = "boxplot")
#' plot(model_performance_ranger_aps, geom = "histogram")
#'
#' # binary classification
#'
#' titanic_glm_model <- glm(survived~., data = titanic_imputed, family = "binomial")
#' explainer_glm <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived,
#' predict_function = function(m,x) predict.glm(m,x,type = "response"),
#' label = "glm")
#' mp_ex_glm <- model_performance(explainer_glm)
#' mp_ex_glm
#' plot(mp_ex_glm)
#' plot(mp_ex_glm, mp_ex_rn)
#' explainer_glm_titanic <- explain(titanic_glm_model, data = titanic_imputed[,-8],
#' y = titanic_imputed$survived)
#' model_performance_glm_titanic <- model_performance(explainer_glm_titanic)
#' model_performance_glm_titanic
#' plot(model_performance_glm_titanic)
#' plot(model_performance_glm_titanic, geom = "boxplot")
#' plot(model_performance_glm_titanic, geom = "histogram")
#'
#' # multilabel classification
#'
#' HR_ranger_model <- ranger(status~., data = HR, num.trees = 50,
#' probability = TRUE)
#' explainer_ranger_HR <- explain(HR_ranger_model, data = HR[,-6],
#' y = HR$status, label = "Ranger HR")
#' model_performance_ranger_HR <- model_performance(explainer_ranger_HR)
#' model_performance_ranger_HR
#' plot(model_performance_ranger_HR)
#' plot(model_performance_ranger_HR, geom = "boxplot")
#' plot(model_performance_ranger_HR, geom = "histogram")
#'
#' titanic_lm_model <- lm(survived~., data = titanic_imputed)
#' explainer_lm <- explain(titanic_lm_model, data = titanic_imputed[,-8], y = titanic_imputed$survived)
#' mp_ex_lm <- model_performance(explainer_lm)
#' plot(mp_ex_lm)
#' plot(mp_ex_glm, mp_ex_rn, mp_ex_lm)
#' }
#'}
#'
model_performance <- function(explainer, ..., cutoff = 0.5) {
test_explainer(explainer, has_data = TRUE, has_y = TRUE, function_name = "model_performance")
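To tie the return-value documentation to the examples, a short hedged sketch of inspecting the documented fields (reusing the binary classification explainer from above; the exact set of measures depends on the task type):

mp <- model_performance(explainer_glm_titanic)

mp$type             # task type, e.g. "classification"
names(mp$measures)  # task-specific measures, e.g. recall, precision, f1, accuracy, auc
head(mp$residuals)  # data frame with residuals for each observation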