From 9b87e69d2631cce512747f829d7013766fcd06c2 Mon Sep 17 00:00:00 2001
From: Robert Samoilescu
Date: Wed, 29 Jun 2022 09:56:08 +0100
Subject: [PATCH 1/3] Included warning for TreeSHAP background dataset size.

---
 alibi/explainers/shap_wrappers.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index e61fc8470..cac75ef2b 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1000,6 +1000,7 @@ def reset_predictor(self, predictor: Callable) -> None:
 
 # TODO: Look into pyspark support requirements if requested
 # TODO: catboost.Pool not supported for fit stage (due to summarisation) but can do if there is a user need
+TREE_SHAP_BACKGROUND_SUPPORTED_SIZE = 100
 TREE_SHAP_BACKGROUND_WARNING_THRESHOLD = 1000
 TREE_SHAP_MODEL_OUTPUT = ['raw', 'probability', 'probability_doubled', 'log_loss']
 
@@ -1159,6 +1160,19 @@ def fit(self,  # type: ignore[override]
         else:
             self._check_inputs(background_data)
 
+        # Warns the user that TreeShap supports only up to TREE_SHAP_BACKGROUND_SIZE(100) samples in the
+        # background dataset. Note that there is a logic above related to the summarisation of the background
+        # dataset which uses TREE_SHAP_BACKGROUND_WARNING_THRESHOLD(1000) as (warning) threshold. Although the
+        # TREE_SHAP_BACKGROUND_WARNING_THRESHOLD > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE which is contradictory, we
+        # leave the logic above untouched. This approach has at least two benefits:
+        #  i) minimal refactoring
+        # ii) return the correct result if a newer version of shap which fixes the issue is used before we
+        #     update our wrapper in alibi (i.e. just ignore the warning)
+        if background_data.shape[0] > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
+            logger.warning('The upstream implementation of interventional TreeShap supports only up to '
+                           f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
+                           'A larger background dataset size will result in erroneous Shap values.')
+
         perturbation = 'interventional' if background_data is not None else 'tree_path_dependent'
         self.background_data = background_data
         self._explainer = shap.TreeExplainer(

From db1c0e65c6d345a360fd9568cb86da49c5e31cf4 Mon Sep 17 00:00:00 2001
From: Robert Samoilescu
Date: Wed, 29 Jun 2022 11:40:10 +0100
Subject: [PATCH 2/3] Fixed background size when DenseData object returned by summarisation.

---
 alibi/explainers/shap_wrappers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index cac75ef2b..902fb3ca9 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1160,6 +1160,10 @@ def fit(self,  # type: ignore[override]
         else:
             self._check_inputs(background_data)
 
+        # summarisation can return a DenseData object
+        n_samples = (background_data.data if isinstance(background_data, shap_utils.DenseData)
+                     else background_data).shape[0]
+
         # Warns the user that TreeShap supports only up to TREE_SHAP_BACKGROUND_SIZE(100) samples in the
         # background dataset. Note that there is a logic above related to the summarisation of the background
         # dataset which uses TREE_SHAP_BACKGROUND_WARNING_THRESHOLD(1000) as (warning) threshold. Although the
         # TREE_SHAP_BACKGROUND_WARNING_THRESHOLD > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE which is contradictory, we
         # leave the logic above untouched. This approach has at least two benefits:
         #  i) minimal refactoring
         # ii) return the correct result if a newer version of shap which fixes the issue is used before we
         #     update our wrapper in alibi (i.e. just ignore the warning)
-        if background_data.shape[0] > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
+        if n_samples > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
             logger.warning('The upstream implementation of interventional TreeShap supports only up to '
                            f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
                            'A larger background dataset size will result in erroneous Shap values.')

From 58ee40d215f3076b34bf5dfb16846f70ab7a0e3a Mon Sep 17 00:00:00 2001
From: RobertSamoilescu
Date: Fri, 1 Jul 2022 12:10:23 +0100
Subject: [PATCH 3/3] Updated warning to emphasize sampling with replacement.

---
 alibi/explainers/shap_wrappers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/alibi/explainers/shap_wrappers.py b/alibi/explainers/shap_wrappers.py
index 902fb3ca9..ae981c637 100644
--- a/alibi/explainers/shap_wrappers.py
+++ b/alibi/explainers/shap_wrappers.py
@@ -1171,11 +1171,12 @@ def fit(self,  # type: ignore[override]
         # leave the logic above untouched. This approach has at least two benefits:
         #  i) minimal refactoring
         # ii) return the correct result if a newer version of shap which fixes the issue is used before we
-        #     update our wrapper in alibi (i.e. just ignore the warning)
+        #     update our dependencies in alibi (i.e. just ignore the warning)
         if n_samples > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
-            logger.warning('The upstream implementation of interventional TreeShap supports only up to '
+            logger.warning(f'The upstream implementation of interventional TreeShap supports only up to '
                            f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
-                           'A larger background dataset size will result in erroneous Shap values.')
+                           f'A larger background dataset will be sampled with replacement to '
+                           f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} instances.')
 
         perturbation = 'interventional' if background_data is not None else 'tree_path_dependent'
         self.background_data = background_data
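
For reference, the snippet below illustrates the size check this series introduces, outside the diff context. It is a minimal standalone sketch rather than code taken from alibi: the helper name background_size is hypothetical, shap.kmeans merely stands in for the summarisation step that can hand back a DenseData object, and the constant value is copied from PATCH 1 above.

# Minimal sketch of the background-size check added by this patch series.
# Assumptions: shap and numpy are installed; background_size() is a hypothetical
# helper, and shap.kmeans stands in for the summarisation step.
import logging

import numpy as np
import shap

logging.basicConfig()
logger = logging.getLogger(__name__)

TREE_SHAP_BACKGROUND_SUPPORTED_SIZE = 100  # value copied from PATCH 1


def background_size(background_data) -> int:
    """Number of background samples, for a plain array or a summarised
    DenseData object (which keeps its samples in the .data attribute)."""
    if isinstance(background_data, np.ndarray):
        return background_data.shape[0]
    return background_data.data.shape[0]


X = np.random.rand(500, 10)

# 500 raw background samples exceed the supported size, so warn as the patch does.
if background_size(X) > TREE_SHAP_BACKGROUND_SUPPORTED_SIZE:
    logger.warning('The upstream implementation of interventional TreeShap supports only up to '
                   f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} samples in the background dataset. '
                   'A larger background dataset will be sampled with replacement to '
                   f'{TREE_SHAP_BACKGROUND_SUPPORTED_SIZE} instances.')

# Summarising with shap.kmeans returns a DenseData object, so the size must be
# read from its .data attribute; 50 centroids stay under the supported limit.
summarised = shap.kmeans(X, 50)
print(background_size(summarised))  # 50

With the raw 500-row background the check fires; after summarisation the size is read from the underlying array of the DenseData object, which is exactly the case PATCH 2 fixes.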