
Update inference python file to include shap function #48

Merged: 3 commits, Jan 8, 2025
47 changes: 46 additions & 1 deletion src/student_success_tool/modeling/inference.py
@@ -1,9 +1,9 @@
import typing as t

import numpy as np
import pandas as pd
from shap import KernelExplainer


def select_top_features_for_display(
    features: pd.DataFrame,
    unique_ids: pd.Series,
@@ -64,3 +64,48 @@ def select_top_features_for_display(
        }
    )
    return pd.DataFrame(top_features_info)


def calculate_shap_values(
    dfs: t.Iterator[pd.DataFrame],
    *,
    student_id_col: str,
    model_features: list[str],
    explainer: KernelExplainer,
    mode: pd.Series,
) -> t.Iterator[pd.DataFrame]:
    """
    SHAP is computationally expensive, so this function is designed to be parallelized
Member:
Dumb question: Where is the parallelization happening? To me, it looks like this function is iterating over the dataframes one after the other in a for loop.

Contributor Author:
So, I just copied over our function from the private repo. I was planning on merging this PR and then working on adding the SHAP values notebook, which is where the parallelization is happening using spark.repartition. Not a dumb question - You are correct in that parallelization is not happening in this function as it is just an iteration.

Do you think it's best to create a notebook or put all the parallelization in this function? And if we want to create a notebook, I can keep it outside of the PDP template notebooks, or add it as the 4th template notebook. Curious to hear your thoughts?

Member:
I guess I don't understand how spark.repartition works? Normally I'd expect an operation to-be-parallelized to have a function that operates on one "chunk" of the iterable, and then some outside framework calls that function in parallel over chunks. How does it work if the iterable is inside the function?

Apologies if I'm being dense, I am properly confused! 😅

Member:
Is this what's going on here? https://www.databricks.com/blog/2020/05/20/new-pandas-udfs-and-python-type-hints-in-the-upcoming-release-of-apache-spark-3-0.html

In all the docs, I only see them supporting Iterator[pd.Series] inputs, rather than Iterator[pd.DataFrame], so even if this what's being used here, I'm still confused!


    by calculating SHAP values over an iterator of DataFrames. Spark's repartition
    performs a full shuffle (it does not preserve row order), so it is critical to
    extract the student_id_col prior to computing SHAP values and then reattach it
    for our final output.

    Args:
        dfs: An iterator over pandas DataFrames.
            Each DataFrame is a batch of data points.
        student_id_col: The name of the column containing student IDs.
        model_features: A list of strings naming the features
            used by our model.
        explainer: A KernelExplainer object used to compute
            SHAP values from our loaded model.
        mode: A Series of per-column modes used to impute missing values.

    Returns:
        Iterator[pd.DataFrame]: An iterator over pandas DataFrames. Each DataFrame
            contains the SHAP values for that partition of data.
    """
    for df in dfs:
        # Preserve the student_id column
        student_ids_batch = df.loc[:, student_id_col]

        # Impute missing values, then compute SHAP values on just the model features
        df_features = df[model_features].fillna(mode)
        shap_values = explainer.shap_values(df_features)

        # Create a DataFrame from the SHAP values
        shap_df = pd.DataFrame(shap_values, columns=model_features)

        # Reattach the student_id column to our SHAP values DataFrame
        shap_df[student_id_col] = student_ids_batch

        yield shap_df
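To illustrate the parallelization question from the review thread: the function's `Iterator[pd.DataFrame] -> Iterator[pd.DataFrame]` contract is the shape that Spark 3.x's `DataFrame.mapInPandas` expects, where Spark calls the generator once per partition and feeds it that partition's rows as an iterator of pandas DataFrames. Below is a minimal pandas-only sketch (no Spark required) of that calling pattern; `StubExplainer`, the toy partitions, and the condensed copy of the function are hypothetical stand-ins, not the package's actual objects.

```python
import numpy as np
import pandas as pd


# Hypothetical stand-in for a fitted shap.KernelExplainer; returns zeros
# so the sketch runs without a trained model.
class StubExplainer:
    def shap_values(self, X):
        return np.zeros((len(X), X.shape[1]))


# Condensed copy of the PR's calculate_shap_values, for a self-contained demo.
def calculate_shap_values(dfs, *, student_id_col, model_features, explainer, mode):
    for df in dfs:
        student_ids_batch = df.loc[:, student_id_col]
        df_features = df[model_features].fillna(mode)
        shap_df = pd.DataFrame(
            explainer.shap_values(df_features), columns=model_features
        )
        shap_df[student_id_col] = student_ids_batch
        yield shap_df


# Spark would invoke the generator once per partition; here we mimic two
# partitions sequentially with toy data.
partitions = [
    [pd.DataFrame({"student_id": [1, 2], "f1": [0.1, None]})],
    [pd.DataFrame({"student_id": [3], "f1": [0.3]})],
]
mode = pd.Series({"f1": 0.1})
combined = pd.concat(
    (
        out
        for part in partitions
        for out in calculate_shap_values(
            iter(part),
            student_id_col="student_id",
            model_features=["f1"],
            explainer=StubExplainer(),
            mode=mode,
        )
    ),
    ignore_index=True,
)
print(sorted(combined["student_id"].tolist()))  # [1, 2, 3]
```

With real Spark, the equivalent call would be along the lines of `spark_df.mapInPandas(lambda dfs: calculate_shap_values(dfs, ...), schema=...)`, which is where the actual parallelism comes from.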
91 changes: 90 additions & 1 deletion tests/modeling/test_inference.py
@@ -1,9 +1,10 @@
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import pytest

from student_success_tool.modeling.inference import (
    calculate_shap_values,
    select_top_features_for_display,
)

@pytest.mark.parametrize(
    [
@@ -115,3 +116,91 @@ def test_select_top_features_for_display(
    )
    assert isinstance(obs, pd.DataFrame) and not obs.empty
    assert pd.testing.assert_frame_equal(obs, exp) is None


@pytest.fixture
def sample_data():
    data = {
        'student_id': [1, 2, 3],
        'feature1': [0.1, 0.2, 0.3],
        'feature2': [0.4, 0.5, 0.6],
    }
    return pd.DataFrame(data)


# Create a dummy KernelExplainer
class SimpleKernelExplainer:
    def shap_values(self, X):
        # Simulate SHAP values: for simplicity, return random values between 0 and 0.1
        return np.random.rand(len(X), len(X.columns)) * 0.1


@pytest.fixture
def explainer():
    return SimpleKernelExplainer()


@pytest.mark.parametrize(
    "input_data, expected_shape",
    [
        ({"student_id": [1, 2, 3], "feature1": [0.1, 0.2, 0.3], "feature2": [0.4, 0.5, 0.6]}, (3, 3)),
        ({"student_id": [1, 2], "feature1": [0.1, 0.2], "feature2": [0.4, 0.5]}, (2, 3)),
    ],
)
def test_calculate_shap_values_basic(input_data, expected_shape, explainer):
    df = pd.DataFrame(input_data)
    student_id_col = 'student_id'
    model_features = ['feature1', 'feature2']
    mode = df.mode().iloc[0]

    iterator = iter([df])
Member:
This is the thing I mentioned before -- doesn't this function work equally well if you pass [df] in directly, rather than converting it into an iterator?

Contributor Author:
This is used in context of spark's repartition in which I believe that would need an iterator. You are right in this dummy case of our unit test, it doesn't serve a purpose.


    result = list(
        calculate_shap_values(
            iterator,
            student_id_col=student_id_col,
            model_features=model_features,
            explainer=explainer,
            mode=mode,
        )
    )

    # Check that the result contains the expected number of rows and columns
    shap_df = result[0]
    assert shap_df.shape == expected_shape

    # Ensure that the 'student_id' column is present
    assert student_id_col in shap_df.columns

    # Ensure that the SHAP value columns are numeric
    assert is_numeric_dtype(shap_df[model_features[0]])
    assert is_numeric_dtype(shap_df[model_features[1]])

    # Ensure student IDs are correctly reattached
    assert shap_df[student_id_col].iloc[0] == 1
    assert shap_df[student_id_col].iloc[1] == 2
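As a side note on the `mode = df.mode().iloc[0]` idiom used in the tests above, here is a standalone pandas sketch (toy data, not the package's code) showing why `.iloc[0]` is needed: `DataFrame.mode()` can return multiple rows when a column has tied modes, and taking the first row yields one Series of per-column modes that `fillna` applies column-by-column.

```python
import pandas as pd

# feature2 has a tie (2.0 and 3.0 each appear once), so mode() returns two rows;
# .iloc[0] picks the first mode per column.
df = pd.DataFrame({'feature1': [1.0, 1.0, None], 'feature2': [None, 2.0, 3.0]})
mode = df.mode().iloc[0]  # feature1 -> 1.0; feature2 -> 2.0
filled = df.fillna(mode)  # fills each column with its own mode
print(int(filled.isna().sum().sum()))  # 0: no missing values remain
```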

@pytest.mark.parametrize(
    "batch1_data, batch2_data, expected_shape1, expected_shape2",
    [
        (
            {"student_id": [1, 2, 3], "feature1": [0.1, 0.2, 0.3], "feature2": [0.4, 0.5, 0.6]},
            {"student_id": [4, 5, 6], "feature1": [0.7, 0.8, 0.9], "feature2": [0.6, 0.7, 0.8]},
            (3, 3),
            (3, 3),
        ),
        (
            {"student_id": [4, 5, 6], "feature1": [0.1, 0.2, 0.3], "feature2": [0.4, 0.5, 0.6]},
            {"student_id": [4, 5, 6], "feature1": [0.5, 0.6, 0.7], "feature2": [0.7, 0.8, 0.9]},
            (3, 3),
            (3, 3),
        ),
    ],
)
def test_calculate_shap_values_multiple_batches(
    batch1_data, batch2_data, expected_shape1, expected_shape2, explainer
):
    batch1 = pd.DataFrame(batch1_data)
    batch2 = pd.DataFrame(batch2_data)

    student_id_col = 'student_id'
    model_features = ['feature1', 'feature2']
    mode = batch1.mode().iloc[0]

    iterator = iter([batch1, batch2])

    result = list(
        calculate_shap_values(
            iterator,
            student_id_col=student_id_col,
            model_features=model_features,
            explainer=explainer,
            mode=mode,
        )
    )

    # Ensure we have two DataFrames
    assert len(result) == 2

    # Check first batch
    shap_df1 = result[0]
    assert shap_df1.shape == expected_shape1

    # Check second batch
    shap_df2 = result[1]
    assert shap_df2.shape == expected_shape2

