From 9b87e69d2631cce512747f829d7013766fcd06c2 Mon Sep 17 00:00:00 2001
From: Robert Samoilescu
Date: Wed, 29 Jun 2022 09:56:08 +0100
Subject: [PATCH 1/3] Included warning for TreeSHAP background dataset size.

---
 alibi/explainers/shap_wrappers.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index e61fc8470..cac75ef2b 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1000,6 +1000,7 @@ def reset_predictor(self, predictor: Callable) -> None:
 
 # TODO: Look into pyspark support requirements if requested
 # TODO: catboost.Pool not supported for fit stage (due to summarisation) but can do if there is a user need
+TREE_SHAP_BACKGROUND_SUPPORTED_SIZE = 100
 TREE_SHAP_BACKGROUND_WARNING_THRESHOLD = 1000
 TREE_SHAP_MODEL_OUTPUT = ['raw', 'probability', 'probability_doubled', 'log_loss']
 
@@ -1159,6 +1160,19 @@ def fit(self,  # type: ignore[override]
         else:
             self._check_inputs(background_data)
 
+        # Warns the user that TreeShap supports only up to TREE_SHAP_BACKGROUND_SIZE(100) samples in the
+        # background dataset. Note that there is a logic above related to the summarisation of the background
+        # dataset which uses TREE_SHAP_BACKGROUND_WARNING_THRESHOLD(1000) as (warning) threshold. Although the
+        # TREE_SHAP_BACKGROUND_WARNING_THRESHOLD > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE which is contradictory, we
+        # leave the logic above untouched. This approach has at least two benefits:
+        #  i) minimal refactoring
+        # ii) return the correct result if a newer version of shap which fixes the issue is used before we
+        #     update our wrapper in alibi (i.e. just ignore the warning)
+        if background_data.shape[0] > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
+            logger.warning('The upstream implementation of interventional TreeShap supports only up to '
+                           f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
+                           'A larger background dataset size will result in erroneous Shap values.')
+
         perturbation = 'interventional' if background_data is not None else 'tree_path_dependent'
         self.background_data = background_data
         self._explainer = shap.TreeExplainer(

From db1c0e65c6d345a360fd9568cb86da49c5e31cf4 Mon Sep 17 00:00:00 2001
From: Robert Samoilescu
Date: Wed, 29 Jun 2022 11:40:10 +0100
Subject: [PATCH 2/3] Fixed background size when DenseData object returned by summarisation.

---
 alibi/explainers/shap_wrappers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index cac75ef2b..902fb3ca9 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1160,6 +1160,10 @@ def fit(self,  # type: ignore[override]
         else:
             self._check_inputs(background_data)
 
+        # summarisation can return a DenseData object
+        n_samples = (background_data.data if isinstance(background_data, shap_utils.DenseData)
+                     else background_data).shape[0]
+
         # Warns the user that TreeShap supports only up to TREE_SHAP_BACKGROUND_SIZE(100) samples in the
         # background dataset. Note that there is a logic above related to the summarisation of the background
         # dataset which uses TREE_SHAP_BACKGROUND_WARNING_THRESHOLD(1000) as (warning) threshold. Although the
         # TREE_SHAP_BACKGROUND_WARNING_THRESHOLD > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE which is contradictory, we
         # leave the logic above untouched. This approach has at least two benefits:
         #  i) minimal refactoring
         # ii) return the correct result if a newer version of shap which fixes the issue is used before we
         #     update our wrapper in alibi (i.e. just ignore the warning)
-        if background_data.shape[0] > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
+        if n_samples > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
             logger.warning('The upstream implementation of interventional TreeShap supports only up to '
                            f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
                            'A larger background dataset size will result in erroneous Shap values.')

From 58ee40d215f3076b34bf5dfb16846f70ab7a0e3a Mon Sep 17 00:00:00 2001
From: RobertSamoilescu
Date: Fri, 1 Jul 2022 12:10:23 +0100
Subject: [PATCH 3/3] Updated warning to emphasize sampling with replacement.

---
 alibi/explainers/shap_wrappers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index 902fb3ca9..ae981c637 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1171,11 +1171,12 @@ def fit(self,  # type: ignore[override]
         # leave the logic above untouched. This approach has at least two benefits:
         #  i) minimal refactoring
         # ii) return the correct result if a newer version of shap which fixes the issue is used before we
-        #     update our wrapper in alibi (i.e. just ignore the warning)
+        #     update our dependencies in alibi (i.e. just ignore the warning)
         if n_samples > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
-            logger.warning('The upstream implementation of interventional TreeShap supports only up to '
+            logger.warning(f'The upstream implementation of interventional TreeShap supports only up to '
                            f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
-                           'A larger background dataset size will result in erroneous Shap values.')
+                           f'A larger background dataset will be sampled with replacement to '
+                           f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} instances.')
 
         perturbation = 'interventional' if background_data is not None else 'tree_path_dependent'
         self.background_data = background_data
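
For reference, the snippet below illustrates the size check this series introduces, outside the diff context. It is a minimal standalone sketch rather than code taken from alibi: the helper name background_size is hypothetical, shap.kmeans merely stands in for the summarisation step that can hand back a DenseData object, and the constant value is copied from PATCH 1 above.

# Minimal sketch of the background-size check added by this patch series.
# Assumptions: shap and numpy are installed; background_size() is a hypothetical
# helper, and shap.kmeans stands in for the summarisation step.
import logging

import numpy as np
import shap

logging.basicConfig()
logger = logging.getLogger(__name__)

TREE_SHAP_BACKGROUND_SUPPORTED_SIZE = 100  # value copied from PATCH 1


def background_size(background_data) -> int:
    """Number of background samples, for a plain array or a summarised
    DenseData object (which keeps its samples in the .data attribute)."""
    if isinstance(background_data, np.ndarray):
        return background_data.shape[0]
    return background_data.data.shape[0]


X = np.random.rand(500, 10)

# 500 raw background samples exceed the supported size, so warn as the patch does.
if background_size(X) > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
    logger.warning('The upstream implementation of interventional TreeShap supports only up to '
                   f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
                   'A larger background dataset will be sampled with replacement to '
                   f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} instances.')

# Summarising with shap.kmeans returns a DenseData object, so the size must be
# read from its .data attribute; 50 centroids stay under the supported limit.
summarised = shap.kmeans(X, 50)
print(background_size(summarised))  # 50

With the raw 500-row background the check fires; after summarisation the size is read from the underlying array of the DenseData object, which is exactly the case PATCH 2 fixes.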