-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathexplore_page.py
176 lines (141 loc) · 7.22 KB
/
explore_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from PIL import Image
# Load the model
def load_model(path='saved_steps.pkl'):
    """Load the pickled bundle of saved objects from disk.

    Args:
        path: Location of the pickle file. Defaults to ``'saved_steps.pkl'``,
            resolved against the current working directory.

    Returns:
        Whatever object was pickled into the file. The rest of this module
        indexes it with the keys ``'model'``, ``'test_data'`` and
        ``'train_data'``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only point this at artefacts produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)
# Load the saved bundle once at import time and expose its parts as
# module-level names (the page function below reads df_train).
data = load_model()
model, df_test, df_train = data['model'], data['test_data'], data['train_data']
# Helper and entry point for the data-exploration page
def _render_barplot(x, y, xlabel, ylabel, title, palette=None):
    """Draw one labelled bar chart on its own figure and show it in Streamlit."""
    # Use explicit Figure/Axes objects rather than pyplot's implicit
    # "current figure" so each chart is independent of global state.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=x, y=y, palette=palette, ax=ax)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=15)
    st.pyplot(fig)
    # The page function runs again on every rerun; close the figure so
    # figures do not pile up in pyplot's registry.
    plt.close(fig)


def show_explore_page():
    """Render the exploration page: intro text, two images and five charts.

    Reads the module-level ``df_train`` DataFrame and uses its columns
    ``finishing_position``, ``win_odds``, ``horse_number``, ``actual_weight``,
    ``draw``, ``race_id``, ``race_distance``, ``declared_horse_weight``,
    ``recent_ave_rank``, ``jockey_ave_rank`` and ``trainer_ave_rank``.
    Also opens two image files under ``./images/``.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.title("Using statistical models to predict horse racing results")
    st.write(
        "\n"
        "#### Data analysis and backtesting was done on a dataset of horse racing data from "
        "Hong Kong Jockey Club. The dataset contains the race results of more than 2000 local "
        "races from 2014 to 2017.\n"
        "#### On average, betting equally on all horses will incur a loss of 17.5%. "
        "Even if we were to only bet on the horse with the lowest odds at every race, chances are that you will "
        "be losing quite abit over the long run.\n"
    )
    # NOTE(review): newer Streamlit releases deprecate `use_column_width` in
    # favour of `use_container_width`; confirm the pinned version before switching.
    image1 = Image.open('./images/bet_lowest_odds.jpg')
    st.image(image1, caption='Betting at each race on the horse with the lowest odds',
             use_column_width=True)
    st.write(
        "\n"
        "#### Our backtesting results show that using a statistical model to predict horse racing "
        "can yield better performance.\n"
    )
    image2 = Image.open('./images/deployment.jpg')
    st.image(image2, caption='Betting at each race using our app prediction',
             use_column_width=True)

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### Winning Odds""")
    # Mean win odds per finishing position. Categories and bar heights are
    # taken from the same grouped Series so every bar lines up with its own
    # position: `unique()` yields first-appearance order, whereas groupby
    # sorts its keys, so mixing the two can mislabel the bars.
    mean_odds = df_train.groupby('finishing_position')['win_odds'].mean()
    _render_barplot(
        x=mean_odds.index,
        y=mean_odds.values,
        xlabel='Finishing Position',
        ylabel='Mean Win Odds',
        title='Mean Win Odds against Finishing Position',
        palette='Greens_d',
    )
    # Describe the data (explicit spaces at the joins keep words from fusing)
    st.write(
        "A horse with lower odds usually finishes higher. However this does not mean "
        "that you should only bet on the horse with the lowest odds. Other factors also contribute "
        "to the horse's finishing position. For example, the horse's weight, jockey's weight "
    )

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### Weight Handicap""")
    # Mean of actual weight for each horse number
    mean_wt_per_horse = df_train.groupby('horse_number')['actual_weight'].mean()
    # Show horse numbers as integers on the axis (one cast is enough)
    mean_wt_per_horse.index = mean_wt_per_horse.index.astype(int)
    _render_barplot(
        x=mean_wt_per_horse.index,
        y=mean_wt_per_horse.values,
        xlabel='Horse Number',
        ylabel='Mean Actual Weight',
        title='Distribution of Mean Weight Handicap for Each Horse Number',
        palette='Greens_d',
    )
    # Describe the data
    # NOTE(review): the stated average (95lbs) lies outside the stated range
    # (100lbs-133lbs); confirm the figures with the data owner.
    st.write(
        "Hong Kong horse racing uses a weight handicap system to make races more competitive. "
        "The weight ranges from "
        "100lbs to 133lbs. The average weight carried is around 95lbs. "
        "The horse number indicates a higher rating, and usually Horse Number 1 carries the most weight."
    )

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### Draw and Gate Number Effect""")
    # Keep only the first 14 groups (draw #15 is not included); `.iloc`
    # makes the slice explicitly positional.
    avg_pos_vs_draw = df_train.groupby('draw')['finishing_position'].mean().iloc[:14]
    _render_barplot(
        x=avg_pos_vs_draw.index,
        y=avg_pos_vs_draw.values,
        xlabel='Draw',
        ylabel='Average Finishing Position',
        title='Distribution of Draw against Average Finishing Position ',
        palette='Greens_d',
    )
    # Describe the data
    st.write(
        "Draw is the position of the horse in the starting gate. The average finishing position "
        "is higher for horses with a lower draw number and vice versa. This is because "
        "horses with draw 1 have the advantage of being the first to start closest to the inside rail."
    )

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### Race Type/ Distance""")
    # Count distinct races per distance: de-duplicate (race_id, race_distance)
    # pairs first so each race is counted once rather than once per runner.
    df_racetype = (
        df_train[['race_id', 'race_distance']]
        .drop_duplicates()
        .groupby('race_distance')
        .count()
        .reset_index()
    )
    _render_barplot(
        x=df_racetype['race_distance'],
        y=df_racetype['race_id'],
        xlabel="Race Distance (m)",
        ylabel="Count",
        title="Distribution of Race Types",
    )
    # Describe the data
    st.write(
        "There are different race distances. Like humans, horses have different "
        "strengths and weaknesses. Some horses are better at longer distances, "
        "while others are better at shorter distances. "
    )

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### The Form Factor""")
    # keep only features we want
    cols = ['finishing_position', 'actual_weight', 'declared_horse_weight',
            'draw', 'recent_ave_rank',
            'jockey_ave_rank', 'trainer_ave_rank', 'race_distance']
    # correlation matrix of the selected features
    corr = df_train[cols].corr()
    # Mask the upper triangle (diagonal included). The builtin `bool` is used
    # because the `np.bool` alias was removed in NumPy 1.24.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # plot the heatmap with the mask
    fig5, ax5 = plt.subplots(figsize=(15, 10))
    sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, vmin=-1,
                center=0, square=False, linewidths=.5,
                cbar_kws={"shrink": .8}, annot=True, ax=ax5)
    ax5.set_title('Correlation Matrix', fontsize=15)
    st.pyplot(fig5)
    plt.close(fig5)
    # Describe the data
    st.write(
        "Recent performances of the jockey and horse seems to have a strong effect "
        "on the winning position. We call this the 'form factor'. The form factor looks to be "
        "the most important factor in predicting the winning position. "
    )

    # ----------------------------------------------------------------------------------
    # insert new line
    st.write("")
    st.write("""#### The Important Question. Can you make money betting on horse racing?""")
    st.write(
        "While we do not guarantee success with every bet (nobody can!), our prediction "
        "model has shown a recall of 40% and precision of 20%. This means that if you bet on every horse "
        "as predicted by our model, you will win 20% of the time."
    )