Fix CLV plotting bugs and edit Quickstart (#601)

ColtAllen · twiecki · commit c69eb97f5a77 · 2024-09-10T05:32:19.000+01:00
* move fixtures to conftest

* docstrings and moved set_model_fit to conftest

* fixed pandas quickstart warnings

* revert to MockModel and add ParetoNBD support

* quickstart edit for issue 609

* notebook edit
diff --git a/docs/source/notebooks/clv/clv_quickstart.ipynb b/docs/source/notebooks/clv/clv_quickstart.ipynb
diff --git a/pymc_marketing/clv/plotting.py b/pymc_marketing/clv/plotting.py
@@ -1,10 +1,12 @@
-from typing import Optional, Sequence, Tuple
+from typing import Optional, Sequence, Tuple, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from matplotlib.lines import Line2D
 
+from pymc_marketing.clv import BetaGeoModel, ParetoNBDModel
+
 __all__ = [
     "plot_customer_exposure",
     "plot_frequency_recency_matrix",
@@ -156,7 +158,7 @@ def _create_frequency_recency_meshes(
 
 
 def plot_frequency_recency_matrix(
-    model,
+    model: Union[BetaGeoModel, ParetoNBDModel],
     t=1,
     max_frequency: Optional[int] = None,
     max_recency: Optional[int] = None,
@@ -172,8 +174,8 @@ def plot_frequency_recency_matrix(
 
     Parameters
     ----------
-    model: lifetimes model
-        A fitted lifetimes model.
+    model: CLV model
+        A fitted CLV model.
     t: float, optional
         Next units of time to make predictions for
     max_frequency: int, optional
@@ -197,27 +199,49 @@ def plot_frequency_recency_matrix(
     axes: matplotlib.AxesSubplot
     """
     if max_frequency is None:
-        max_frequency = int(model.frequency.max())
+        max_frequency = int(model.data["frequency"].max())
 
     if max_recency is None:
-        max_recency = int(model.recency.max())
+        max_recency = int(model.data["recency"].max())
 
     mesh_frequency, mesh_recency = _create_frequency_recency_meshes(
         max_frequency=max_frequency,
         max_recency=max_recency,
     )
 
-    Z = (
-        model.expected_num_purchases(
-            customer_id=np.arange(mesh_recency.size),  # placeholder
-            t=t,
-            frequency=mesh_frequency.ravel(),
-            recency=mesh_recency.ravel(),
-            T=max_recency,
+    # FIXME: This is a hotfix for ParetoNBDModel, as it has a different API from BetaGeoModel
+    #  We should harmonize them!
+    if isinstance(model, ParetoNBDModel):
+        transaction_data = pd.DataFrame(
+            {
+                "customer_id": np.arange(mesh_recency.size),  # placeholder
+                "frequency": mesh_frequency.ravel(),
+                "recency": mesh_recency.ravel(),
+                "T": max_recency,
+            }
         )
-        .mean(("draw", "chain"))
-        .values.reshape(mesh_recency.shape)
-    )
+
+        Z = (
+            model.expected_purchases(
+                data=transaction_data,
+                future_t=t,
+            )
+            .mean(("draw", "chain"))
+            .values.reshape(mesh_recency.shape)
+        )
+    else:
+        Z = (
+            model.expected_num_purchases(
+                customer_id=np.arange(mesh_recency.size),  # placeholder
+                frequency=mesh_frequency.ravel(),
+                recency=mesh_recency.ravel(),
+                T=max_recency,
+                t=t,
+            )
+            .mean(("draw", "chain"))
+            .values.reshape(mesh_recency.shape)
+        )
+
     if ax is None:
         ax = plt.subplot(111)
 
@@ -245,7 +269,7 @@ def plot_frequency_recency_matrix(
 
 
 def plot_probability_alive_matrix(
-    model,
+    model: Union[BetaGeoModel, ParetoNBDModel],
     max_frequency: Optional[int] = None,
     max_recency: Optional[int] = None,
     title: str = "Probability Customer is Alive,\nby Frequency and Recency of a Customer",
@@ -261,8 +285,8 @@ def plot_probability_alive_matrix(
 
     Parameters
     ----------
-    model: lifetimes model
-        A fitted lifetimes model.
+    model: CLV model
+        A fitted CLV model.
     max_frequency: int, optional
         The maximum frequency to plot. Default is max observed frequency.
     max_recency: int, optional
@@ -285,26 +309,46 @@ def plot_probability_alive_matrix(
     """
 
     if max_frequency is None:
-        max_frequency = int(model.frequency.max())
+        max_frequency = int(model.data["frequency"].max())
 
     if max_recency is None:
-        max_recency = int(model.recency.max())
+        max_recency = int(model.data["recency"].max())
 
     mesh_frequency, mesh_recency = _create_frequency_recency_meshes(
         max_frequency=max_frequency,
         max_recency=max_recency,
     )
+    # FIXME: This is a hotfix for ParetoNBDModel, as it has a different API from BetaGeoModel
+    #  We should harmonize them!
+    if isinstance(model, ParetoNBDModel):
+        transaction_data = pd.DataFrame(
+            {
+                "customer_id": np.arange(mesh_recency.size),  # placeholder
+                "frequency": mesh_frequency.ravel(),
+                "recency": mesh_recency.ravel(),
+                "T": max_recency,
+            }
+        )
 
-    Z = (
-        model.expected_probability_alive(
-            customer_id=np.arange(mesh_recency.size),  # placeholder
-            frequency=mesh_frequency.ravel(),
-            recency=mesh_recency.ravel(),
-            T=max_recency,
+        Z = (
+            model.expected_probability_alive(
+                data=transaction_data,
+                future_t=0,  # TODO: This can be a function parameter in the case of ParetoNBDModel
+            )
+            .mean(("draw", "chain"))
+            .values.reshape(mesh_recency.shape)
+        )
+    else:
+        Z = (
+            model.expected_probability_alive(
+                customer_id=np.arange(mesh_recency.size),  # placeholder
+                frequency=mesh_frequency.ravel(),
+                recency=mesh_recency.ravel(),
+                T=max_recency,  # type: ignore
+            )
+            .mean(("draw", "chain"))
+            .values.reshape(mesh_recency.shape)
         )
-        .mean(("draw", "chain"))
-        .values.reshape(mesh_recency.shape)
-    )
 
     interpolation = kwargs.pop("interpolation", "none")
 
diff --git a/tests/clv/models/test_basic.py b/tests/clv/models/test_basic.py
@@ -7,7 +7,7 @@
 from arviz import InferenceData, from_dict
 
 from pymc_marketing.clv.models.basic import CLVModel
-from tests.clv.utils import set_model_fit
+from tests.conftest import set_model_fit
 
 
 class CLVModelTest(CLVModel):
diff --git a/tests/clv/models/test_gamma_gamma.py b/tests/clv/models/test_gamma_gamma.py
@@ -10,7 +10,7 @@
     GammaGammaModel,
     GammaGammaModelIndividual,
 )
-from tests.clv.utils import set_model_fit
+from tests.conftest import set_model_fit
 
 
 class BaseTestGammaGammaModel:
diff --git a/tests/clv/models/test_pareto_nbd.py b/tests/clv/models/test_pareto_nbd.py
@@ -9,7 +9,7 @@
 
 from pymc_marketing.clv import ParetoNBDModel
 from pymc_marketing.clv.distributions import ParetoNBD
-from tests.clv.utils import set_model_fit
+from tests.conftest import set_model_fit
 
 
 class TestParetoNBDModel:
diff --git a/tests/clv/test_plotting.py b/tests/clv/test_plotting.py
@@ -14,9 +14,57 @@
 )
 
 
-@pytest.fixture(scope="module")
-def test_summary_data() -> pd.DataFrame:
-    return pd.read_csv("tests/clv/datasets/test_summary_data.csv", index_col=0)
+class MockModel:
+    def __init__(self, data: pd.DataFrame):
+        self.data = data
+
+    def _mock_posterior(
+        self, customer_id: Union[np.ndarray, pd.Series]
+    ) -> xr.DataArray:
+        n_customers = len(customer_id)
+        n_chains = 4
+        n_draws = 10
+        chains = np.arange(n_chains)
+        draws = np.arange(n_draws)
+        return xr.DataArray(
+            data=np.ones((n_customers, n_chains, n_draws)),
+            coords={"customer_id": customer_id, "chain": chains, "draw": draws},
+            dims=["customer_id", "chain", "draw"],
+        )
+
+    def expected_probability_alive(
+        self,
+        customer_id: Union[np.ndarray, pd.Series],
+        frequency: Union[np.ndarray, pd.Series],
+        recency: Union[np.ndarray, pd.Series],
+        T: Union[np.ndarray, pd.Series],
+    ):
+        return self._mock_posterior(customer_id)
+
+    def expected_purchases(
+        self,
+        customer_id: Union[np.ndarray, pd.Series],
+        data: pd.DataFrame,
+        *,
+        future_t: Union[np.ndarray, pd.Series, TensorVariable],
+    ):
+        return self._mock_posterior(customer_id)
+
+    # TODO: This is required until CLV API is standardized.
+    def expected_num_purchases(
+        self,
+        customer_id: Union[np.ndarray, pd.Series],
+        t: Union[np.ndarray, pd.Series, TensorVariable],
+        frequency: Union[np.ndarray, pd.Series, TensorVariable],
+        recency: Union[np.ndarray, pd.Series, TensorVariable],
+        T: Union[np.ndarray, pd.Series, TensorVariable],
+    ):
+        return self._mock_posterior(customer_id)
+
+
+@pytest.fixture
+def mock_model(test_summary_data) -> MockModel:
+    return MockModel(test_summary_data)
 
 
 @pytest.mark.parametrize(
@@ -33,7 +81,7 @@ def test_plot_customer_exposure(test_summary_data, kwargs) -> None:
     assert isinstance(ax, plt.Axes)
 
 
-def test_plot_cumstomer_exposure_with_ax(test_summary_data) -> None:
+def test_plot_customer_exposure_with_ax(test_summary_data) -> None:
     ax = plt.subplot()
     plot_customer_exposure(test_summary_data, ax=ax)
 
@@ -59,50 +107,6 @@ def test_plot_customer_exposure_invalid_args(test_summary_data, kwargs) -> None:
         plot_customer_exposure(test_summary_data, **kwargs)
 
 
-class MockModel:
-    def __init__(self, frequency, recency):
-        self.frequency = frequency
-        self.recency = recency
-
-    def _mock_posterior(
-        self, customer_id: Union[np.ndarray, pd.Series]
-    ) -> xr.DataArray:
-        n_customers = len(customer_id)
-        n_chains = 4
-        n_draws = 10
-        chains = np.arange(n_chains)
-        draws = np.arange(n_draws)
-        return xr.DataArray(
-            data=np.ones((n_customers, n_chains, n_draws)),
-            coords={"customer_id": customer_id, "chain": chains, "draw": draws},
-            dims=["customer_id", "chain", "draw"],
-        )
-
-    def expected_probability_alive(
-        self,
-        customer_id: Union[np.ndarray, pd.Series],
-        frequency: Union[np.ndarray, pd.Series],
-        recency: Union[np.ndarray, pd.Series],
-        T: Union[np.ndarray, pd.Series],
-    ):
-        return self._mock_posterior(customer_id)
-
-    def expected_num_purchases(
-        self,
-        customer_id: Union[np.ndarray, pd.Series],
-        t: Union[np.ndarray, pd.Series, TensorVariable],
-        frequency: Union[np.ndarray, pd.Series, TensorVariable],
-        recency: Union[np.ndarray, pd.Series, TensorVariable],
-        T: Union[np.ndarray, pd.Series, TensorVariable],
-    ):
-        return self._mock_posterior(customer_id)
-
-
-@pytest.fixture
-def mock_model(test_summary_data) -> MockModel:
-    return MockModel(test_summary_data["frequency"], test_summary_data["recency"])
-
-
 def test_plot_frequency_recency_matrix(mock_model) -> None:
     ax: plt.Axes = plot_frequency_recency_matrix(mock_model)
 
diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
@@ -16,7 +16,7 @@
     rfm_train_test_split,
     to_xarray,
 )
-from tests.clv.utils import set_model_fit
+from tests.conftest import set_model_fit
 
 
 def test_to_xarray():
@@ -42,15 +42,6 @@ def test_to_xarray():
     np.testing.assert_array_equal(new_y.coords["test_dim"], customer_id)
 
 
-@pytest.fixture(scope="module")
-def test_summary_data() -> pd.DataFrame:
-    rng = np.random.default_rng(14)
-    df = pd.read_csv("tests/clv/datasets/test_summary_data.csv", index_col=0)
-    df["monetary_value"] = rng.lognormal(size=(len(df)))
-    df["customer_id"] = df.index
-    return df
-
-
 @pytest.fixture(scope="module")
 def fitted_bg(test_summary_data) -> BetaGeoModel:
     rng = np.random.default_rng(13)
@@ -100,6 +91,7 @@ def fitted_pnbd(test_summary_data) -> ParetoNBDModel:
     pnbd_model.build_model()
 
     # Mock an idata object for tests requiring a fitted model
+    # TODO: This is quite slow. Check similar fixtures in the model tests to speed this up.
     fake_fit = pm.sample_prior_predictive(
         samples=50, model=pnbd_model.model, random_seed=rng
     ).prior
diff --git a/tests/clv/utils.py b/tests/clv/utils.py
diff --git a/tests/conftest.py b/tests/conftest.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -10,7 +10,7 @@` |
| 10 | 10 | `GammaGammaModel,` |
| 11 | 11 | `GammaGammaModelIndividual,` |
| 12 | 12 | `)` |
| 13 |  | `-from tests.clv.utils import set_model_fit` |
|  | 13 | `+from tests.conftest import set_model_fit` |
| 14 | 14 |  |
| 15 | 15 |  |
| 16 | 16 | `class BaseTestGammaGammaModel:` |