From 3eb45a0f97781746f3a25334733c9b138023d681 Mon Sep 17 00:00:00 2001
From: MichaelSchreier
Date: Thu, 13 Dec 2018 22:09:28 +0100
Subject: [PATCH 1/3] Speedup of summary_data_from_transaction_data

Replaced pd.Period instances with timestamps for the actual calculations /
aggregations, which yields speedups of an order of magnitude or more on
large datasets.
---
 utils.py | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 551 insertions(+)
 create mode 100644 utils.py

diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..6de04bc8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,551 @@
+"""Lifetimes utils and helpers."""
+from __future__ import division

+from datetime import datetime

+import numpy as np
+import pandas as pd
+import dill
+from scipy.optimize import minimize

+pd.options.mode.chained_assignment = None

+__all__ = ['calibration_and_holdout_data',
+           'summary_data_from_transaction_data',
+           '_find_first_transactions',
+           'calculate_alive_path',
+           'expected_cumulative_transactions']


+def calibration_and_holdout_data(transactions, customer_id_col, datetime_col, calibration_period_end,
+                                 observation_period_end=None, freq='D', datetime_format=None,
+                                 monetary_value_col=None):
+    """
+    Create a summary of each customer over a calibration and holdout period.
+
+    This function creates a summary of each customer over a calibration and
+    holdout period (training and testing, respectively).
+    It accepts transaction data, and returns a Dataframe of sufficient statistics.
+
+    Parameters
+    ----------
+    transactions: :obj: DataFrame
+        a Pandas DataFrame that contains the customer_id col and the datetime col.
+    customer_id_col: string
+        the column in transactions DataFrame that denotes the customer_id
+    datetime_col: string
+        the column in transactions that denotes the datetime the purchase was made.
+    calibration_period_end: :obj: datetime
+        a period to limit the calibration to, inclusive.
+    observation_period_end: :obj: datetime, optional
+        a string or datetime to denote the final date of the study.
+        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
+    freq: string, optional
+        Default 'D' for days. Other examples: 'W' for weekly.
+    datetime_format: string, optional
+        a string that represents the timestamp format. Useful if Pandas can't understand
+        the provided format.
+    monetary_value_col: string, optional
+        the column in transactions that denotes the monetary value of the transaction.
+        Optional, only needed for customer lifetime value estimation models.
+
+    Returns
+    -------
+    :obj: DataFrame
+        A dataframe with columns frequency_cal, recency_cal, T_cal, frequency_holdout, duration_holdout
+        If monetary_value_col isn't None, the dataframe will also have the columns monetary_value_cal and
+        monetary_value_holdout.
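+
+    Examples
+    --------
+    A minimal sketch, assuming a raw transaction log ``df`` with columns
+    ``'customer_id'`` and ``'date'`` (illustrative names, not fixed by the
+    library)::
+
+        summary_cal_holdout = calibration_and_holdout_data(
+            df, 'customer_id', 'date',
+            calibration_period_end='2016-12-31',
+            observation_period_end='2017-06-30')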
+
+    """
+    def to_period(d):
+        return d.to_period(freq)
+
+    if observation_period_end is None:
+        observation_period_end = transactions[datetime_col].max()
+
+    transaction_cols = [customer_id_col, datetime_col]
+    if monetary_value_col:
+        transaction_cols.append(monetary_value_col)
+    transactions = transactions[transaction_cols].copy()
+
+    transactions[datetime_col] = pd.to_datetime(transactions[datetime_col], format=datetime_format)
+    observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format)
+    calibration_period_end = pd.to_datetime(calibration_period_end, format=datetime_format)
+
+    # create calibration dataset
+    calibration_transactions = transactions.loc[transactions[datetime_col] <= calibration_period_end]
+    calibration_summary_data = summary_data_from_transaction_data(calibration_transactions,
+                                                                  customer_id_col,
+                                                                  datetime_col,
+                                                                  datetime_format=datetime_format,
+                                                                  observation_period_end=calibration_period_end,
+                                                                  freq=freq,
+                                                                  monetary_value_col=monetary_value_col)
+    calibration_summary_data.columns = [c + '_cal' for c in calibration_summary_data.columns]
+
+    # create holdout dataset
+    holdout_transactions = transactions.loc[(observation_period_end >= transactions[datetime_col]) &
+                                            (transactions[datetime_col] > calibration_period_end)]
+    holdout_transactions[datetime_col] = holdout_transactions[datetime_col].map(to_period)
+    holdout_summary_data = holdout_transactions.groupby([customer_id_col, datetime_col], sort=False).agg(lambda r: 1)\
+        .groupby(level=customer_id_col).agg(['count'])
+    holdout_summary_data.columns = ['frequency_holdout']
+    if monetary_value_col:
+        holdout_summary_data['monetary_value_holdout'] = \
+            holdout_transactions.groupby(customer_id_col)[monetary_value_col].mean()
+
+    combined_data = calibration_summary_data.join(holdout_summary_data, how='left')
+    combined_data.fillna(0, inplace=True)
+
+    delta_time = to_period(observation_period_end) - to_period(calibration_period_end)
+    combined_data['duration_holdout'] = delta_time
+
+    return combined_data
+
+
+def _find_first_transactions(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
+                             observation_period_end=None, freq='D'):
+    """
+    Return dataframe with first transactions.
+
+    This takes a DataFrame of transaction data of the form:
+        customer_id, datetime [, monetary_value]
+    and appends a column named 'first' to the transaction log, which indicates
+    whether each row is the customer's first transaction (all other rows are
+    repeated transactions for that customer_id).
+
+    Parameters
+    ----------
+    transactions: :obj: DataFrame
+        a Pandas DataFrame that contains the customer_id col and the datetime col.
+    customer_id_col: string
+        the column in transactions DataFrame that denotes the customer_id
+    datetime_col: string
+        the column in transactions that denotes the datetime the purchase was made.
+    monetary_value_col: string, optional
+        the column in transactions that denotes the monetary value of the transaction.
+        Optional, only needed for customer lifetime value estimation models.
+    observation_period_end: :obj: datetime
+        a string or datetime to denote the final date of the study.
+        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
+    datetime_format: string, optional
+        a string that represents the timestamp format. Useful if Pandas can't understand
+        the provided format.
+    freq: string, optional
+        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
+        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
+
+    """
+    if observation_period_end is None:
+        observation_period_end = transactions[datetime_col].max()
+
+    select_columns = [customer_id_col, datetime_col]
+
+    if monetary_value_col:
+        select_columns.append(monetary_value_col)
+
+    transactions = transactions[select_columns].sort_values(select_columns).copy()
+
+    # make sure the date column uses datetime objects, and use Pandas' DateTimeIndex.to_period()
+    # to convert the column to a PeriodIndex which is useful for time-wise grouping and truncating
+    transactions[datetime_col] = pd.to_datetime(transactions[datetime_col], format=datetime_format)
+    transactions = transactions.set_index(datetime_col).to_period(freq).to_timestamp()
+
+    transactions = transactions.loc[(transactions.index <= observation_period_end)].reset_index()
+
+    period_groupby = transactions.groupby([datetime_col, customer_id_col], sort=False, as_index=False)
+
+    if monetary_value_col:
+        # when we have a monetary column, make sure to sum together any values in the same period
+        period_transactions = period_groupby.sum()
+    else:
+        # by calling head() on the groupby object, the datetime_col and customer_id_col columns
+        # will be reduced
+        period_transactions = period_groupby.head(1)
+
+    # initialize a new column where we will indicate which are the first transactions
+    period_transactions['first'] = False
+    # find all of the initial transactions and store as an index
+    first_transactions = period_transactions.groupby(customer_id_col, sort=True, as_index=False).head(1).index
+    # mark the initial transactions as True
+    period_transactions.loc[first_transactions, 'first'] = True
+    select_columns.append('first')
+    return period_transactions[select_columns]
+
+
+def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
+                                       observation_period_end=None, freq='D', freq_multiplier=1):
+    """
+    Return summary data from transactions.
+
+    This transforms a DataFrame of transaction data of the form:
+        customer_id, datetime [, monetary_value]
+    to a DataFrame of the form:
+        customer_id, frequency, recency, T [, monetary_value]
+
+    Parameters
+    ----------
+    transactions: :obj: DataFrame
+        a Pandas DataFrame that contains the customer_id col and the datetime col.
+    customer_id_col: string
+        the column in transactions DataFrame that denotes the customer_id
+    datetime_col: string
+        the column in transactions that denotes the datetime the purchase was made.
+    monetary_value_col: string, optional
+        the column in transactions that denotes the monetary value of the transaction.
+        Optional, only needed for customer lifetime value estimation models.
+    observation_period_end: datetime, optional
+        a string or datetime to denote the final date of the study.
+        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
+    datetime_format: string, optional
+        a string that represents the timestamp format. Useful if Pandas can't understand
+        the provided format.
+    freq: string, optional
+        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
+        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
+    freq_multiplier: int, optional
+        Default 1. Can be used to get exact recency and T: e.g. with freq='W'
+        the row for user id_sample=1 will have recency=30 and T=39, while the
+        CDNOW summary data differ. Exact values can be obtained with
+        freq='D' and freq_multiplier=7, which leads to recency=30.43
+        and T=38.86.
+
+    Returns
+    -------
+    :obj: DataFrame:
+        customer_id, frequency, recency, T [, monetary_value]
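+
+    Examples
+    --------
+    A minimal sketch, assuming a raw transaction log ``df`` with columns
+    ``'customer_id'`` and ``'date'`` (illustrative names)::
+
+        summary = summary_data_from_transaction_data(
+            df, 'customer_id', 'date',
+            observation_period_end='2017-06-30', freq='D')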
+
+    """
+    if observation_period_end is None:
+        observation_period_end = transactions[datetime_col].max().to_period(freq).to_timestamp()
+    observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp()
+
+    # label all of the repeated transactions
+    repeated_transactions = _find_first_transactions(
+        transactions,
+        customer_id_col,
+        datetime_col,
+        monetary_value_col,
+        datetime_format,
+        observation_period_end,
+        freq
+    )
+    # count all orders by customer.
+    customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])
+
+    # subtract 1 from count, as we ignore their first order.
+    customers['frequency'] = customers['count'] - 1
+
+    customers['T'] = (observation_period_end - customers['min']) / np.timedelta64(1, freq) / freq_multiplier
+    customers['recency'] = (customers['max'] - customers['min']) / np.timedelta64(1, freq) / freq_multiplier
+
+    summary_columns = ['frequency', 'recency', 'T']
+
+    if monetary_value_col:
+        # create an index of all the first purchases
+        first_purchases = repeated_transactions[repeated_transactions['first']].index
+        # by setting the monetary_value cells of all the first purchases to NaN,
+        # those values will be excluded from the mean value calculation
+        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
+        customers['monetary_value'] = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
+        summary_columns.append('monetary_value')
+
+    return customers[summary_columns].astype(float)
+
+
+def calculate_alive_path(model, transactions, datetime_col, t, freq='D'):
+    """
+    Calculate alive path for plotting alive history of user.
+
+    Parameters
+    ----------
+    model:
+        A fitted lifetimes model
+    transactions: :obj: DataFrame
+        a Pandas DataFrame containing the transactions history of the customer_id
+    datetime_col: string
+        the column in the transactions that denotes the datetime the purchase was made
+    t: array_like
+        the number of time units since the birth for which we want to draw the p_alive
+    freq: string
+        Default 'D' for days. Other examples: 'W' for weekly.
+
+    Returns
+    -------
+    :obj: Series
+        A pandas Series containing the p_alive as a function of T (age of the customer)
+
+    """
+    customer_history = transactions[[datetime_col]].copy()
+    customer_history[datetime_col] = pd.to_datetime(customer_history[datetime_col])
+    customer_history = customer_history.set_index(datetime_col)
+    # Add transactions column
+    customer_history['transactions'] = 1
+
+    # for some reason fillna(0) not working for resample in pandas with python 3.x,
+    # changed to replace
+    purchase_history = (customer_history.resample(freq).sum().replace(np.nan, 0)
+                        ['transactions'].values)
+
+    extra_columns = t + 1 - len(purchase_history)
+    customer_history = pd.DataFrame(np.append(purchase_history, [0] * extra_columns), columns=['transactions'])
+    # add T column
+    customer_history['T'] = np.arange(customer_history.shape[0])
+    # add cumulative transactions column
+    customer_history['transactions'] = customer_history['transactions'].apply(lambda t: int(t > 0))
+    customer_history['frequency'] = customer_history['transactions'].cumsum() - 1  # first purchase is ignored
+    # Add t_x column
+    customer_history['recency'] = customer_history.apply(lambda row: row['T'] if row['transactions'] != 0 else np.nan, axis=1)
+    customer_history['recency'] = customer_history['recency'].fillna(method='ffill').fillna(0)
+    return customer_history.apply(
+        lambda row: model.conditional_probability_alive(row['frequency'], row['recency'], row['T']),
+        axis=1)
+
+
+def _fit(minimizing_function, minimizing_function_args, iterative_fitting,
+         initial_params, params_size, disp, tol=1e-6, fit_method='Nelder-Mead',
+         maxiter=2000, **kwargs):
+    """Fit function for fitters."""
+    ll = []
+    sols = []
+
+    def _func_caller(params, func_args, function):
+        return function(params, *func_args)
+
+    if iterative_fitting <= 0:
+        raise ValueError("iterative_fitting parameter should be greater than 0 as of lifetimes v0.2.1")
+
+    if iterative_fitting > 1 and initial_params is not None:
+        raise ValueError("iterative_fitting and initial_params should not be both set, as no improvement could be made.")
+
+    # set options for minimize; values specified in kwargs will overwrite these defaults
+    minimize_options = {}
+    minimize_options['disp'] = disp
+    minimize_options['maxiter'] = maxiter
+    minimize_options.update(kwargs)
+
+    total_count = 0
+
+    while total_count < iterative_fitting:
+        current_init_params = np.random.normal(1.0, scale=0.05, size=params_size) if initial_params is None else initial_params
+        if minimize_options['disp']:
+            print('Optimize function with {}'.format(fit_method))
+        output = minimize(_func_caller, method=fit_method, tol=tol,
+                          x0=current_init_params,
+                          args=(minimizing_function_args, minimizing_function),
+                          options=minimize_options)
+        sols.append(output.x)
+        ll.append(output.fun)
+
+        total_count += 1
+    argmin_ll, min_ll = min(enumerate(ll), key=lambda x: x[1])
+    minimizing_params = sols[argmin_ll]
+    return minimizing_params, min_ll
+
+
+def _scale_time(age):
+    """Create a scalar such that the maximum age is 1."""
+    return 1. / age.max()
+
+
+def _check_inputs(frequency, recency=None, T=None, monetary_value=None):
+    """
+    Check validity of inputs.
+
+    Raises ValueError when any check fails.
+
+    Parameters
+    ----------
+    frequency: array_like
+        the frequency vector of customers' purchases (denoted x in literature).
+    recency: array_like, optional
+        the recency vector of customers' purchases (denoted t_x in literature).
+    T: array_like, optional
+        the vector of customers' age (time since first purchase)
+    monetary_value: array_like, optional
+        the monetary value vector of customer's purchases (denoted m in literature).
+
+    """
+    if recency is not None:
+        if T is not None and np.any(recency > T):
+            raise ValueError("Some values in recency vector are larger than T vector.")
+        if np.any(recency[frequency == 0] != 0):
+            raise ValueError("There exist non-zero recency values when frequency is zero.")
+        if np.any(recency < 0):
+            raise ValueError("There exist negative recency values (e.g. last order set before first order).")
+        if any(len(x) == 0 for x in [recency, frequency, T]):
+            raise ValueError("There exists a zero length vector in one of frequency, recency or T.")
+    if np.sum((frequency - frequency.astype(int)) ** 2) != 0:
+        raise ValueError("There exist non-integer values in the frequency vector.")
+    if monetary_value is not None and np.any(monetary_value <= 0):
+        raise ValueError("There exist non-positive values in the monetary_value vector.")
+    # TODO: raise warning if np.any(frequency > T) as this means that there are
+    # more order-periods than periods.
+
+
+def _customer_lifetime_value(transaction_prediction_model, frequency, recency, T, monetary_value, time=12, discount_rate=0.01):
+    """
+    Compute the average lifetime value for a group of one or more customers.
+
+    This method computes the average lifetime value for a group of one or more customers.
+
+    Parameters
+    ----------
+    transaction_prediction_model:
+        the model to predict future transactions; the literature uses Pareto/NBD,
+        but other models such as BG/NBD can be used as well
+    frequency: array_like
+        the frequency vector of customers' purchases (denoted x in literature).
+    recency: array_like
+        the recency vector of customers' purchases (denoted t_x in literature).
+    T: array_like
+        the vector of customers' age (time since first purchase)
+    monetary_value: array_like
+        the monetary value vector of customer's purchases (denoted m in literature).
+    time: int, optional
+        the lifetime expected for the user in months. Default: 12
+    discount_rate: float, optional
+        the monthly adjusted discount rate. Default: 0.01
+
+    Returns
+    -------
+    :obj: Series
+        series with customer ids as index and the estimated customer lifetime values as values
+
+    """
+    df = pd.DataFrame(index=frequency.index)
+    df['clv'] = 0  # initialize the clv column to zeros
+
+    for i in range(30, (time * 30) + 1, 30):
+        # since the prediction of number of transactions is cumulative, we have to subtract off the previous periods
+        expected_number_of_transactions = transaction_prediction_model.predict(i, frequency, recency, T) - transaction_prediction_model.predict(i - 30, frequency, recency, T)
+        # sum up the CLV estimates of all of the periods
+        df['clv'] += (monetary_value * expected_number_of_transactions) / (1 + discount_rate) ** (i / 30)
+
+    return df['clv']  # return as a series
+
+
+def expected_cumulative_transactions(model, transactions, datetime_col,
+                                     customer_id_col, t, datetime_format=None,
+                                     freq='D', set_index_date=False,
+                                     freq_multiplier=1):
+    """
+    Get expected and actual repeated cumulative transactions.
+
+    Parameters
+    ----------
+    model:
+        A fitted lifetimes model
+    transactions: :obj: DataFrame
+        a Pandas DataFrame containing the transactions history of the customer_id
+    datetime_col: string
+        the column in transactions that denotes the datetime the purchase was made.
+    customer_id_col: string
+        the column in transactions that denotes the customer_id
+    t: int
+        the number of time units since the beginning of
+        data for which we want to calculate cumulative transactions
+    datetime_format: string, optional
+        a string that represents the timestamp format. Useful if Pandas can't
+        understand the provided format.
+    freq: string, optional
+        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
+        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
+    set_index_date: bool, optional
+        when True, use the date as the Pandas DataFrame index; when False
+        (default), use the number of time units instead
+    freq_multiplier: int, optional
+        Default 1. Can be used to get the exact cumulative transactions
+        predicted by the model, e.g. for a model trained with freq='W', pass
+        freq='D' and freq_multiplier=7 to expected_cumulative_transactions.
+
+    Returns
+    -------
+    :obj: DataFrame
+        A dataframe with columns actual, predicted
+
+    """
+    start_date = pd.to_datetime(transactions[datetime_col],
+                                format=datetime_format).min()
+    start_period = start_date.to_period(freq)
+    observation_period_end = start_period + t
+
+    repeated_and_first_transactions = _find_first_transactions(
+        transactions,
+        customer_id_col,
+        datetime_col,
+        datetime_format=datetime_format,
+        observation_period_end=observation_period_end,
+        freq=freq
+    )
+
+    first_trans_mask = repeated_and_first_transactions['first']
+    repeated_transactions = repeated_and_first_transactions[~first_trans_mask]
+    first_transactions = repeated_and_first_transactions[first_trans_mask]
+
+    date_range = pd.date_range(start_date, periods=t + 1, freq=freq)
+    date_periods = date_range.to_period(freq)
+
+    pred_cum_transactions = []
+    first_trans_size = first_transactions.groupby(datetime_col).size()
+    for i, period in enumerate(date_periods):
+        if i % freq_multiplier == 0 and i > 0:
+            times = period - first_trans_size.index
+            times = times[times > 0].astype(float) / freq_multiplier
+            expected_trans_agg = \
+                model.expected_number_of_purchases_up_to_time(times)
+
+            mask = first_trans_size.index < period
+            expected_trans = sum(expected_trans_agg * first_trans_size[mask])
+            pred_cum_transactions.append(expected_trans)
+
+    act_trans = repeated_transactions.groupby(datetime_col).size()
+    act_tracking_transactions = act_trans.reindex(date_periods, fill_value=0)
+
+    act_cum_transactions = []
+    for j in range(1, t // freq_multiplier + 1):
+        sum_trans = sum(act_tracking_transactions.iloc[:j * freq_multiplier])
+        act_cum_transactions.append(sum_trans)
+
+    if set_index_date:
+        index = date_periods[freq_multiplier - 1: -1:freq_multiplier]
+    else:
+        index = range(0, t // freq_multiplier)
+
+    df_cum_transactions = pd.DataFrame({'actual': act_cum_transactions,
+                                        'predicted': pred_cum_transactions},
+                                       index=index)
+
+    return df_cum_transactions
+
+
+def _save_obj_without_attr(obj, attr_list, path, values_to_save=None):
+    """
+    Save object with attributes from attr_list.
+
+    Parameters
+    ----------
+    obj: obj
+        Object of class with __dict__ attribute.
+    attr_list: list
+        List with attributes to exclude from saving to dill object. If empty
+        list all attributes will be saved.
+    path: str
+        Where to save dill object.
+    values_to_save: list, optional
+        Placeholders for original attributes for saving object.
If None will be + extended to attr_list length like [None] * len(attr_list) + + """ + if values_to_save is None: + values_to_save = [None] * len(attr_list) + + saved_attr_dict = {} + for attr, val_save in zip(attr_list, values_to_save): + if attr in obj.__dict__: + item = obj.__dict__.pop(attr) + saved_attr_dict[attr] = item + setattr(obj, attr, val_save) + + with open(path, 'wb') as out_file: + dill.dump(obj, out_file) + + for attr, item in saved_attr_dict.items(): + setattr(obj, attr, item) From 3e00457e8839185a6e050d7d51fa2e59474364ab Mon Sep 17 00:00:00 2001 From: Michael Schreier Date: Fri, 14 Dec 2018 06:34:57 +0100 Subject: [PATCH 2/3] Moved utils.py to proper location --- lifetimes/utils.py | 10 +- utils.py | 551 --------------------------------------------- 2 files changed, 5 insertions(+), 556 deletions(-) delete mode 100644 utils.py diff --git a/lifetimes/utils.py b/lifetimes/utils.py index 4e49fa73..9f127225 100644 --- a/lifetimes/utils.py +++ b/lifetimes/utils.py @@ -148,7 +148,7 @@ def _find_first_transactions(transactions, customer_id_col, datetime_col, moneta # make sure the date column uses datetime objects, and use Pandas' DateTimeIndex.to_period() # to convert the column to a PeriodIndex which is useful for time-wise grouping and truncating transactions[datetime_col] = pd.to_datetime(transactions[datetime_col], format=datetime_format) - transactions = transactions.set_index(datetime_col).to_period(freq) + transactions = transactions.set_index(datetime_col).to_period(freq).to_timestamp() transactions = transactions.loc[(transactions.index <= observation_period_end)].reset_index() @@ -216,8 +216,8 @@ def summary_data_from_transaction_data(transactions, customer_id_col, datetime_c """ if observation_period_end is None: - observation_period_end = transactions[datetime_col].max() - observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq) + observation_period_end = transactions[datetime_col].max().to_period(freq).to_timestamp() + observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp() # label all of the repeated transactions repeated_transactions = _find_first_transactions( @@ -235,8 +235,8 @@ def summary_data_from_transaction_data(transactions, customer_id_col, datetime_c # subtract 1 from count, as we ignore their first order. 
customers['frequency'] = customers['count'] - 1 - customers['T'] = (observation_period_end - customers['min']) / freq_multiplier - customers['recency'] = (customers['max'] - customers['min']) / freq_multiplier + customers['T'] = (observation_period_end - customers['min']) / np.timedelta64(1, freq) / freq_multiplier + customers['recency'] = (customers['max'] - customers['min']) / np.timedelta64(1, freq) / freq_multiplier summary_columns = ['frequency', 'recency', 'T'] diff --git a/utils.py b/utils.py deleted file mode 100644 index 6de04bc8..00000000 --- a/utils.py +++ /dev/null @@ -1,551 +0,0 @@ -"""Lifetimes utils and helpers.""" -from __future__ import division - -from datetime import datetime - -import numpy as np -import pandas as pd -import dill -from scipy.optimize import minimize - -pd.options.mode.chained_assignment = None - -__all__ = ['calibration_and_holdout_data', - 'summary_data_from_transaction_data', - '_find_first_transactions', - 'calculate_alive_path', - 'expected_cumulative_transactions'] - - -def calibration_and_holdout_data(transactions, customer_id_col, datetime_col, calibration_period_end, - observation_period_end=None, freq='D', datetime_format=None, - monetary_value_col=None): - """ - Create a summary of each customer over a calibration and holdout period. - - This function creates a summary of each customer over a calibration and - holdout period (training and testing, respectively). - It accepts transaction data, and returns a Dataframe of sufficient statistics. - - Parameters - ---------- - transactions: :obj: DataFrame - a Pandas DataFrame that contains the customer_id col and the datetime col. - customer_id_col: string - the column in transactions DataFrame that denotes the customer_id - datetime_col: string - the column in transactions that denotes the datetime the purchase was made. - calibration_period_end: :obj: datetime - a period to limit the calibration to, inclusive. - observation_period_end: :obj: datetime, optional - a string or datetime to denote the final date of the study. - Events after this date are truncated. If not given, defaults to the max 'datetime_col'. - freq: string, optional - Default 'D' for days. Other examples: 'W' for weekly. - datetime_format: string, optional - a string that represents the timestamp format. Useful if Pandas can't understand - the provided format. - monetary_value_col: string, optional - the column in transactions that denotes the monetary value of the transaction. - Optional, only needed for customer lifetime value estimation models. - - Returns - ------- - :obj: DataFrame - A dataframe with columns frequency_cal, recency_cal, T_cal, frequency_holdout, duration_holdout - If monetary_value_col isn't None, the dataframe will also have the columns monetary_value_cal and - monetary_value_holdout. 
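-
-    Examples
-    --------
-    A minimal sketch, assuming a raw transaction log ``df`` with columns
-    ``'customer_id'`` and ``'date'`` (illustrative names, not fixed by the
-    library)::
-
-        summary_cal_holdout = calibration_and_holdout_data(
-            df, 'customer_id', 'date',
-            calibration_period_end='2016-12-31',
-            observation_period_end='2017-06-30')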
-
-    """
-    def to_period(d):
-        return d.to_period(freq)
-
-    if observation_period_end is None:
-        observation_period_end = transactions[datetime_col].max()
-
-    transaction_cols = [customer_id_col, datetime_col]
-    if monetary_value_col:
-        transaction_cols.append(monetary_value_col)
-    transactions = transactions[transaction_cols].copy()
-
-    transactions[datetime_col] = pd.to_datetime(transactions[datetime_col], format=datetime_format)
-    observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format)
-    calibration_period_end = pd.to_datetime(calibration_period_end, format=datetime_format)
-
-    # create calibration dataset
-    calibration_transactions = transactions.loc[transactions[datetime_col] <= calibration_period_end]
-    calibration_summary_data = summary_data_from_transaction_data(calibration_transactions,
-                                                                  customer_id_col,
-                                                                  datetime_col,
-                                                                  datetime_format=datetime_format,
-                                                                  observation_period_end=calibration_period_end,
-                                                                  freq=freq,
-                                                                  monetary_value_col=monetary_value_col)
-    calibration_summary_data.columns = [c + '_cal' for c in calibration_summary_data.columns]
-
-    # create holdout dataset
-    holdout_transactions = transactions.loc[(observation_period_end >= transactions[datetime_col]) &
-                                            (transactions[datetime_col] > calibration_period_end)]
-    holdout_transactions[datetime_col] = holdout_transactions[datetime_col].map(to_period)
-    holdout_summary_data = holdout_transactions.groupby([customer_id_col, datetime_col], sort=False).agg(lambda r: 1)\
-        .groupby(level=customer_id_col).agg(['count'])
-    holdout_summary_data.columns = ['frequency_holdout']
-    if monetary_value_col:
-        holdout_summary_data['monetary_value_holdout'] = \
-            holdout_transactions.groupby(customer_id_col)[monetary_value_col].mean()
-
-    combined_data = calibration_summary_data.join(holdout_summary_data, how='left')
-    combined_data.fillna(0, inplace=True)
-
-    delta_time = to_period(observation_period_end) - to_period(calibration_period_end)
-    combined_data['duration_holdout'] = delta_time
-
-    return combined_data
-
-
-def _find_first_transactions(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
-                             observation_period_end=None, freq='D'):
-    """
-    Return dataframe with first transactions.
-
-    This takes a DataFrame of transaction data of the form:
-        customer_id, datetime [, monetary_value]
-    and appends a column named 'first' to the transaction log, which indicates
-    whether each row is the customer's first transaction (all other rows are
-    repeated transactions for that customer_id).
-
-    Parameters
-    ----------
-    transactions: :obj: DataFrame
-        a Pandas DataFrame that contains the customer_id col and the datetime col.
-    customer_id_col: string
-        the column in transactions DataFrame that denotes the customer_id
-    datetime_col: string
-        the column in transactions that denotes the datetime the purchase was made.
-    monetary_value_col: string, optional
-        the column in transactions that denotes the monetary value of the transaction.
-        Optional, only needed for customer lifetime value estimation models.
-    observation_period_end: :obj: datetime
-        a string or datetime to denote the final date of the study.
-        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
-    datetime_format: string, optional
-        a string that represents the timestamp format. Useful if Pandas can't understand
-        the provided format.
-    freq: string, optional
-        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
-        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
-
-    """
-    if observation_period_end is None:
-        observation_period_end = transactions[datetime_col].max()
-
-    select_columns = [customer_id_col, datetime_col]
-
-    if monetary_value_col:
-        select_columns.append(monetary_value_col)
-
-    transactions = transactions[select_columns].sort_values(select_columns).copy()
-
-    # make sure the date column uses datetime objects, and use Pandas' DateTimeIndex.to_period()
-    # to convert the column to a PeriodIndex which is useful for time-wise grouping and truncating
-    transactions[datetime_col] = pd.to_datetime(transactions[datetime_col], format=datetime_format)
-    transactions = transactions.set_index(datetime_col).to_period(freq).to_timestamp()
-
-    transactions = transactions.loc[(transactions.index <= observation_period_end)].reset_index()
-
-    period_groupby = transactions.groupby([datetime_col, customer_id_col], sort=False, as_index=False)
-
-    if monetary_value_col:
-        # when we have a monetary column, make sure to sum together any values in the same period
-        period_transactions = period_groupby.sum()
-    else:
-        # by calling head() on the groupby object, the datetime_col and customer_id_col columns
-        # will be reduced
-        period_transactions = period_groupby.head(1)
-
-    # initialize a new column where we will indicate which are the first transactions
-    period_transactions['first'] = False
-    # find all of the initial transactions and store as an index
-    first_transactions = period_transactions.groupby(customer_id_col, sort=True, as_index=False).head(1).index
-    # mark the initial transactions as True
-    period_transactions.loc[first_transactions, 'first'] = True
-    select_columns.append('first')
-    return period_transactions[select_columns]
-
-
-def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
-                                       observation_period_end=None, freq='D', freq_multiplier=1):
-    """
-    Return summary data from transactions.
-
-    This transforms a DataFrame of transaction data of the form:
-        customer_id, datetime [, monetary_value]
-    to a DataFrame of the form:
-        customer_id, frequency, recency, T [, monetary_value]
-
-    Parameters
-    ----------
-    transactions: :obj: DataFrame
-        a Pandas DataFrame that contains the customer_id col and the datetime col.
-    customer_id_col: string
-        the column in transactions DataFrame that denotes the customer_id
-    datetime_col: string
-        the column in transactions that denotes the datetime the purchase was made.
-    monetary_value_col: string, optional
-        the column in transactions that denotes the monetary value of the transaction.
-        Optional, only needed for customer lifetime value estimation models.
-    observation_period_end: datetime, optional
-        a string or datetime to denote the final date of the study.
-        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
-    datetime_format: string, optional
-        a string that represents the timestamp format. Useful if Pandas can't understand
-        the provided format.
-    freq: string, optional
-        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
-        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
-    freq_multiplier: int, optional
-        Default 1. Can be used to get exact recency and T: e.g. with freq='W'
-        the row for user id_sample=1 will have recency=30 and T=39, while the
-        CDNOW summary data differ. Exact values can be obtained with
-        freq='D' and freq_multiplier=7, which leads to recency=30.43
-        and T=38.86.
-
-    Returns
-    -------
-    :obj: DataFrame:
-        customer_id, frequency, recency, T [, monetary_value]
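-
-    Examples
-    --------
-    A minimal sketch, assuming a raw transaction log ``df`` with columns
-    ``'customer_id'`` and ``'date'`` (illustrative names)::
-
-        summary = summary_data_from_transaction_data(
-            df, 'customer_id', 'date',
-            observation_period_end='2017-06-30', freq='D')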
-
-    """
-    if observation_period_end is None:
-        observation_period_end = transactions[datetime_col].max().to_period(freq).to_timestamp()
-    observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp()
-
-    # label all of the repeated transactions
-    repeated_transactions = _find_first_transactions(
-        transactions,
-        customer_id_col,
-        datetime_col,
-        monetary_value_col,
-        datetime_format,
-        observation_period_end,
-        freq
-    )
-    # count all orders by customer.
-    customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])
-
-    # subtract 1 from count, as we ignore their first order.
-    customers['frequency'] = customers['count'] - 1
-
-    customers['T'] = (observation_period_end - customers['min']) / np.timedelta64(1, freq) / freq_multiplier
-    customers['recency'] = (customers['max'] - customers['min']) / np.timedelta64(1, freq) / freq_multiplier
-
-    summary_columns = ['frequency', 'recency', 'T']
-
-    if monetary_value_col:
-        # create an index of all the first purchases
-        first_purchases = repeated_transactions[repeated_transactions['first']].index
-        # by setting the monetary_value cells of all the first purchases to NaN,
-        # those values will be excluded from the mean value calculation
-        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
-        customers['monetary_value'] = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
-        summary_columns.append('monetary_value')
-
-    return customers[summary_columns].astype(float)
-
-
-def calculate_alive_path(model, transactions, datetime_col, t, freq='D'):
-    """
-    Calculate alive path for plotting alive history of user.
-
-    Parameters
-    ----------
-    model:
-        A fitted lifetimes model
-    transactions: :obj: DataFrame
-        a Pandas DataFrame containing the transactions history of the customer_id
-    datetime_col: string
-        the column in the transactions that denotes the datetime the purchase was made
-    t: array_like
-        the number of time units since the birth for which we want to draw the p_alive
-    freq: string
-        Default 'D' for days. Other examples: 'W' for weekly.
-
-    Returns
-    -------
-    :obj: Series
-        A pandas Series containing the p_alive as a function of T (age of the customer)
-
-    """
-    customer_history = transactions[[datetime_col]].copy()
-    customer_history[datetime_col] = pd.to_datetime(customer_history[datetime_col])
-    customer_history = customer_history.set_index(datetime_col)
-    # Add transactions column
-    customer_history['transactions'] = 1
-
-    # for some reason fillna(0) not working for resample in pandas with python 3.x,
-    # changed to replace
-    purchase_history = (customer_history.resample(freq).sum().replace(np.nan, 0)
-                        ['transactions'].values)
-
-    extra_columns = t + 1 - len(purchase_history)
-    customer_history = pd.DataFrame(np.append(purchase_history, [0] * extra_columns), columns=['transactions'])
-    # add T column
-    customer_history['T'] = np.arange(customer_history.shape[0])
-    # add cumulative transactions column
-    customer_history['transactions'] = customer_history['transactions'].apply(lambda t: int(t > 0))
-    customer_history['frequency'] = customer_history['transactions'].cumsum() - 1  # first purchase is ignored
-    # Add t_x column
-    customer_history['recency'] = customer_history.apply(lambda row: row['T'] if row['transactions'] != 0 else np.nan, axis=1)
-    customer_history['recency'] = customer_history['recency'].fillna(method='ffill').fillna(0)
-    return customer_history.apply(
-        lambda row: model.conditional_probability_alive(row['frequency'], row['recency'], row['T']),
-        axis=1)
-
-
-def _fit(minimizing_function, minimizing_function_args, iterative_fitting,
-         initial_params, params_size, disp, tol=1e-6, fit_method='Nelder-Mead',
-         maxiter=2000, **kwargs):
-    """Fit function for fitters."""
-    ll = []
-    sols = []
-
-    def _func_caller(params, func_args, function):
-        return function(params, *func_args)
-
-    if iterative_fitting <= 0:
-        raise ValueError("iterative_fitting parameter should be greater than 0 as of lifetimes v0.2.1")
-
-    if iterative_fitting > 1 and initial_params is not None:
-        raise ValueError("iterative_fitting and initial_params should not be both set, as no improvement could be made.")
-
-    # set options for minimize; values specified in kwargs will overwrite these defaults
-    minimize_options = {}
-    minimize_options['disp'] = disp
-    minimize_options['maxiter'] = maxiter
-    minimize_options.update(kwargs)
-
-    total_count = 0
-
-    while total_count < iterative_fitting:
-        current_init_params = np.random.normal(1.0, scale=0.05, size=params_size) if initial_params is None else initial_params
-        if minimize_options['disp']:
-            print('Optimize function with {}'.format(fit_method))
-        output = minimize(_func_caller, method=fit_method, tol=tol,
-                          x0=current_init_params,
-                          args=(minimizing_function_args, minimizing_function),
-                          options=minimize_options)
-        sols.append(output.x)
-        ll.append(output.fun)
-
-        total_count += 1
-    argmin_ll, min_ll = min(enumerate(ll), key=lambda x: x[1])
-    minimizing_params = sols[argmin_ll]
-    return minimizing_params, min_ll
-
-
-def _scale_time(age):
-    """Create a scalar such that the maximum age is 1."""
-    return 1. / age.max()
-
-
-def _check_inputs(frequency, recency=None, T=None, monetary_value=None):
-    """
-    Check validity of inputs.
-
-    Raises ValueError when any check fails.
-
-    Parameters
-    ----------
-    frequency: array_like
-        the frequency vector of customers' purchases (denoted x in literature).
-    recency: array_like, optional
-        the recency vector of customers' purchases (denoted t_x in literature).
-    T: array_like, optional
-        the vector of customers' age (time since first purchase)
-    monetary_value: array_like, optional
-        the monetary value vector of customer's purchases (denoted m in literature).
-
-    """
-    if recency is not None:
-        if T is not None and np.any(recency > T):
-            raise ValueError("Some values in recency vector are larger than T vector.")
-        if np.any(recency[frequency == 0] != 0):
-            raise ValueError("There exist non-zero recency values when frequency is zero.")
-        if np.any(recency < 0):
-            raise ValueError("There exist negative recency values (e.g. last order set before first order).")
-        if any(len(x) == 0 for x in [recency, frequency, T]):
-            raise ValueError("There exists a zero length vector in one of frequency, recency or T.")
-    if np.sum((frequency - frequency.astype(int)) ** 2) != 0:
-        raise ValueError("There exist non-integer values in the frequency vector.")
-    if monetary_value is not None and np.any(monetary_value <= 0):
-        raise ValueError("There exist non-positive values in the monetary_value vector.")
-    # TODO: raise warning if np.any(frequency > T) as this means that there are
-    # more order-periods than periods.
-
-
-def _customer_lifetime_value(transaction_prediction_model, frequency, recency, T, monetary_value, time=12, discount_rate=0.01):
-    """
-    Compute the average lifetime value for a group of one or more customers.
-
-    This method computes the average lifetime value for a group of one or more customers.
-
-    Parameters
-    ----------
-    transaction_prediction_model:
-        the model to predict future transactions; the literature uses Pareto/NBD,
-        but other models such as BG/NBD can be used as well
-    frequency: array_like
-        the frequency vector of customers' purchases (denoted x in literature).
-    recency: array_like
-        the recency vector of customers' purchases (denoted t_x in literature).
-    T: array_like
-        the vector of customers' age (time since first purchase)
-    monetary_value: array_like
-        the monetary value vector of customer's purchases (denoted m in literature).
-    time: int, optional
-        the lifetime expected for the user in months. Default: 12
-    discount_rate: float, optional
-        the monthly adjusted discount rate. Default: 0.01
-
-    Returns
-    -------
-    :obj: Series
-        series with customer ids as index and the estimated customer lifetime values as values
-
-    """
-    df = pd.DataFrame(index=frequency.index)
-    df['clv'] = 0  # initialize the clv column to zeros
-
-    for i in range(30, (time * 30) + 1, 30):
-        # since the prediction of number of transactions is cumulative, we have to subtract off the previous periods
-        expected_number_of_transactions = transaction_prediction_model.predict(i, frequency, recency, T) - transaction_prediction_model.predict(i - 30, frequency, recency, T)
-        # sum up the CLV estimates of all of the periods
-        df['clv'] += (monetary_value * expected_number_of_transactions) / (1 + discount_rate) ** (i / 30)
-
-    return df['clv']  # return as a series
-
-
-def expected_cumulative_transactions(model, transactions, datetime_col,
-                                     customer_id_col, t, datetime_format=None,
-                                     freq='D', set_index_date=False,
-                                     freq_multiplier=1):
-    """
-    Get expected and actual repeated cumulative transactions.
-
-    Parameters
-    ----------
-    model:
-        A fitted lifetimes model
-    transactions: :obj: DataFrame
-        a Pandas DataFrame containing the transactions history of the customer_id
-    datetime_col: string
-        the column in transactions that denotes the datetime the purchase was made.
-    customer_id_col: string
-        the column in transactions that denotes the customer_id
-    t: int
-        the number of time units since the beginning of
-        data for which we want to calculate cumulative transactions
-    datetime_format: string, optional
-        a string that represents the timestamp format. Useful if Pandas can't
-        understand the provided format.
-    freq: string, optional
-        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
-        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
-    set_index_date: bool, optional
-        when True, use the date as the Pandas DataFrame index; when False
-        (default), use the number of time units instead
-    freq_multiplier: int, optional
-        Default 1. Can be used to get the exact cumulative transactions
-        predicted by the model, e.g. for a model trained with freq='W', pass
-        freq='D' and freq_multiplier=7 to expected_cumulative_transactions.
-
-    Returns
-    -------
-    :obj: DataFrame
-        A dataframe with columns actual, predicted
-
-    """
-    start_date = pd.to_datetime(transactions[datetime_col],
-                                format=datetime_format).min()
-    start_period = start_date.to_period(freq)
-    observation_period_end = start_period + t
-
-    repeated_and_first_transactions = _find_first_transactions(
-        transactions,
-        customer_id_col,
-        datetime_col,
-        datetime_format=datetime_format,
-        observation_period_end=observation_period_end,
-        freq=freq
-    )
-
-    first_trans_mask = repeated_and_first_transactions['first']
-    repeated_transactions = repeated_and_first_transactions[~first_trans_mask]
-    first_transactions = repeated_and_first_transactions[first_trans_mask]
-
-    date_range = pd.date_range(start_date, periods=t + 1, freq=freq)
-    date_periods = date_range.to_period(freq)
-
-    pred_cum_transactions = []
-    first_trans_size = first_transactions.groupby(datetime_col).size()
-    for i, period in enumerate(date_periods):
-        if i % freq_multiplier == 0 and i > 0:
-            times = period - first_trans_size.index
-            times = times[times > 0].astype(float) / freq_multiplier
-            expected_trans_agg = \
-                model.expected_number_of_purchases_up_to_time(times)
-
-            mask = first_trans_size.index < period
-            expected_trans = sum(expected_trans_agg * first_trans_size[mask])
-            pred_cum_transactions.append(expected_trans)
-
-    act_trans = repeated_transactions.groupby(datetime_col).size()
-    act_tracking_transactions = act_trans.reindex(date_periods, fill_value=0)
-
-    act_cum_transactions = []
-    for j in range(1, t // freq_multiplier + 1):
-        sum_trans = sum(act_tracking_transactions.iloc[:j * freq_multiplier])
-        act_cum_transactions.append(sum_trans)
-
-    if set_index_date:
-        index = date_periods[freq_multiplier - 1: -1:freq_multiplier]
-    else:
-        index = range(0, t // freq_multiplier)
-
-    df_cum_transactions = pd.DataFrame({'actual': act_cum_transactions,
-                                        'predicted': pred_cum_transactions},
-                                       index=index)
-
-    return df_cum_transactions
-
-
-def _save_obj_without_attr(obj, attr_list, path, values_to_save=None):
-    """
-    Save object with attributes from attr_list.
-
-    Parameters
-    ----------
-    obj: obj
-        Object of class with __dict__ attribute.
-    attr_list: list
-        List with attributes to exclude from saving to dill object. If empty
-        list all attributes will be saved.
-    path: str
-        Where to save dill object.
-    values_to_save: list, optional
-        Placeholders for original attributes for saving object.
If None will be - extended to attr_list length like [None] * len(attr_list) - - """ - if values_to_save is None: - values_to_save = [None] * len(attr_list) - - saved_attr_dict = {} - for attr, val_save in zip(attr_list, values_to_save): - if attr in obj.__dict__: - item = obj.__dict__.pop(attr) - saved_attr_dict[attr] = item - setattr(obj, attr, val_save) - - with open(path, 'wb') as out_file: - dill.dump(obj, out_file) - - for attr, item in saved_attr_dict.items(): - setattr(obj, attr, item) From 5b4e51d5b68b29f9676b6b1ae44c2eae5a3c1220 Mon Sep 17 00:00:00 2001 From: Michael Schreier Date: Fri, 14 Dec 2018 21:34:50 +0100 Subject: [PATCH 3/3] _find_first_transactions now returns datetime_col as pd.Period again to ensure compatibility with other methods --- lifetimes/utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lifetimes/utils.py b/lifetimes/utils.py index 9f127225..91cc83ed 100644 --- a/lifetimes/utils.py +++ b/lifetimes/utils.py @@ -138,6 +138,9 @@ def _find_first_transactions(transactions, customer_id_col, datetime_col, moneta if observation_period_end is None: observation_period_end = transactions[datetime_col].max() + if isinstance(observation_period_end, pd.Period): + observation_period_end = observation_period_end.to_timestamp() + select_columns = [customer_id_col, datetime_col] if monetary_value_col: @@ -169,6 +172,9 @@ def _find_first_transactions(transactions, customer_id_col, datetime_col, moneta # mark the initial transactions as True period_transactions.loc[first_transactions, 'first'] = True select_columns.append('first') + # reset datetime_col to period + period_transactions[datetime_col] = pd.Index(period_transactions[datetime_col]).to_period(freq) + return period_transactions[select_columns] @@ -216,8 +222,9 @@ def summary_data_from_transaction_data(transactions, customer_id_col, datetime_c """ if observation_period_end is None: - observation_period_end = transactions[datetime_col].max().to_period(freq).to_timestamp() - observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp() + observation_period_end = pd.to_datetime(transactions[datetime_col].max(), format=datetime_format).to_period(freq).to_timestamp() + else: + observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp() # label all of the repeated transactions repeated_transactions = _find_first_transactions( @@ -229,6 +236,9 @@ def summary_data_from_transaction_data(transactions, customer_id_col, datetime_c observation_period_end, freq ) + # reset datetime_col to timestamp + repeated_transactions[datetime_col] = pd.Index(repeated_transactions[datetime_col]).to_timestamp() + # count all orders by customer. customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])