-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcltv_prediction.py
190 lines (136 loc) · 8.1 KB
/
cltv_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
##############################################################
# CLTV Prediction with BG-NBD and Gamma-Gamma Model
##############################################################
###############################################################
# TASKS
###############################################################
import pandas as pd
import datetime as dt
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from sklearn.preprocessing import MinMaxScaler
import numpy as np
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', 500)
###############################################################
# Task 1: Preparing data
###############################################################
# 1. Read Flo_data_20K.CSV dataset. Create a copy of the DataFrame.
main_df = pd.read_csv('CLTV/flo_data_20k.csv')
df = main_df.copy()
df.head()
def missing_values_analysis(df):
na_columns = [col for col in df.columns if df[col].isnull().sum() > 0]
n_miss = df[na_columns].isnull().sum().sort_values(ascending=True)
ratio = (df[na_columns].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)
missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
missing_df = pd.DataFrame(missing_df)
return missing_df
def check_dataframe(df, row_num=5):
print("*************** Dataset Shape ***************")
print("No. of Rows:", df.shape[0], "\nNo. of Columns:", df.shape[1])
print("*************** Types of Columns ***************")
print(df.dtypes)
print(f"*************** First {row_num} Rows ***************")
print(df.head(row_num))
print(f"*************** Last {row_num} Rows ***************")
print(df.tail(row_num))
print("*************** Summary Statistics of The Dataset ***************")
print(df.describe().T)
print("*************** Dataset Missing Values Analysis ***************")
print(missing_values_analysis(df))
check_dataframe(df)
# # 2. Define Outlier_thresholds and Replace_with_thresholds functions that are required to suppress contradictory
# values. Note: Frequency values must be integer when calculating CLTV. Therefore, round the upper and lower
# limits with round ().
def outlier_thresholds(dataframe, column):
quartile_1 = dataframe[column].quantile(0.01)
quartile_3 = dataframe[column].quantile(0.99)
iqr_range = quartile_3 - quartile_1
up_limit = quartile_3 + 1.5 * iqr_range
low_limit = quartile_1 - 1.5 * iqr_range
return low_limit, up_limit
def replace_with_thresholds(dataframe, variable):
low_limit, up_limit = outlier_thresholds(dataframe, variable)
dataframe.loc[(dataframe[variable] < low_limit), variable] = round(low_limit, 0)
dataframe.loc[(dataframe[variable] > up_limit), variable] = round(up_limit, 0)
# 3. Determine if variables "order_num_total_ever_online", "order_num_total_ever_offline",
# "customer_value_total_ever_offline" and "customer_value_total_ever_online" have contradictory values
threshold_columns = ["order_num_total_ever_online", "order_num_total_ever_offline",
"customer_value_total_ever_offline", "customer_value_total_ever_online"]
for col in threshold_columns:
replace_with_thresholds(df, col)
check_dataframe(df)
# 4. Omnichannel states that customers shop both online and offline platforms. Create new variables for
# the total number of shopping and spending of each customer.
df["order_num_total"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
df["customer_value_total"] = df["customer_value_total_ever_offline"] + df["customer_value_total_ever_online"]
# 5. Examine the variable types. Turn the type of variables containing "date" to the datetime type.
date_columns = df.columns[df.columns.str.contains("date")]
df[date_columns] = df[date_columns].apply(pd.to_datetime)
df.info()
###############################################################
# Task 2: Creation of CLTV data structure
###############################################################
# 1.Take 2 days after the last shopping on the data set as an analysis date.
df["last_order_date"].max()
last_date = dt.datetime(2021, 6, 1)
# 2.Create a new cltv dataframe with customer_id, recency_cltv_weekly, T_weekly, Frequency and Monetary_CLTV_AVG values.
cltv_df = pd.DataFrame()
cltv_df["customer_id"] = df["master_id"]
cltv_df["recency_cltv_weekly"] = ((df["last_order_date"] - df["first_order_date"]).astype('timedelta64[D]')) / 7
cltv_df["T_weekly"] = ((last_date - df["first_order_date"]).astype('timedelta64[D]')) / 7
cltv_df["frequency"] = df["order_num_total"]
cltv_df["monetary_cltv_avg"] = df["customer_value_total"] / df["order_num_total"]
cltv_df.head()
###############################################################
# Task 3: Establishing BG/NBD and Gamma-Hamma models, calculation of 6 months of CLTV
###############################################################
# 1. Create the BG/NBD model
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(cltv_df['frequency'],
cltv_df['recency_cltv_weekly'],
cltv_df['T_weekly'])
# Estimate the expected purchases from customers in 3 months and add to the CLTV DataFrame as exp_sales_3_month.
cltv_df["exp_sales_3_month"] = bgf.predict(4 * 3,
cltv_df['frequency'],
cltv_df['recency_cltv_weekly'],
cltv_df['T_weekly'])
# Estimate the expected purchases from customers in 6 months and add to the CLTV DataFrame as exp_sales_6_month.
cltv_df["exp_sales_6_month"] = bgf.predict(4 * 6,
cltv_df['frequency'],
cltv_df['recency_cltv_weekly'],
cltv_df['T_weekly'])
# We examine the 10 people who will make the most purchase in the 3rd and 6th month.
cltv_df.groupby('customer_id').agg({'exp_sales_3_month': 'sum',
'exp_sales_6_month': 'sum'}).sort_values(by='exp_sales_6_month',
ascending=False).head(10)
# 2. Fit the gamma-gamma model. Estimates the average value of customers and add to CLTV DataFrame as exp_average_value.
ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(cltv_df['frequency'], cltv_df['monetary_cltv_avg'])
cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
cltv_df['monetary_cltv_avg'])
cltv_df.head()
# 3.Calculate 6-month CLTV and add dataframe with the name 'cltv'.
cltv_df['cltv'] = ggf.customer_lifetime_value(bgf,
cltv_df['frequency'],
cltv_df['recency_cltv_weekly'],
cltv_df['T_weekly'],
cltv_df['monetary_cltv_avg'],
time=6,
freq="W",
discount_rate=0.01)
# Observe 20 people at the highest value of CLTV.
cltv_df.sort_values("cltv", ascending=False).head(20)
###############################################################
# Task 4: Creation of segments according to CLTV
###############################################################
# 1. According to 6 -month CLTV, separate all your customers into 4 groups (segment) and add the group names to
# the data set. Assign with the name CLTV_segment.
cltv_df["cltv_segment"] = pd.qcut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])
cltv_df.head()
# 2.Check out the averages of the segments of the Recency, Frequency and Monetary.
cltv_df.groupby("cltv_segment").agg({"count", "mean", "sum"})