run_experiments.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# StockNet evaluation experiments on GPU
from __future__ import print_function, division
import numpy as np
np.random.seed(42)
from models import StockNet, WaveNet
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy import stats
# Allow TensorFlow to grow GPU memory on demand instead of pre-allocating the whole card
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
tf.keras.backend.set_session(session)  # register the session with Keras so the growth setting takes effect
timeseries_folder = "/data/asset_raw_splited_series"
stocks = ["AAPL", "AMZN", "CSCO", "F", "GOOGL", "IBM", "MSFT", "NFLX", "ORCL", "VZ"]
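# Coerce a 1-D sequence into an (n, 1) column vector; genuinely 2-D input passes through unchanged.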
def toarray(ts):
timeseries = np.atleast_2d(np.asarray(ts))
if timeseries.shape[0] == 1:
timeseries = timeseries.T
return timeseries
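# Frame a (multivariate) time series as a supervised-learning table: n_in lag columns
# 'var*(t-i)' as inputs and n_out columns 'var*(t)', 'var*(t+i)' as forecast targets.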
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
n_vars = 1 if type(data) is list else data.shape[1]
df = pd.DataFrame(data)
cols, names = list(), list()
# input sequence (t-n, ... t-1)
for i in range(n_in, 0, -1):
cols.append(df.shift(i))
names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
# forecast sequence (t, t+1, ... t+n)
for i in range(0, n_out):
cols.append(df.shift(-i))
if i == 0:
names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
else:
names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
# put it all together
agg = pd.concat(cols, axis=1)
agg.columns = names
# drop rows with NaN values
if dropnan:
agg.dropna(inplace=True)
return agg
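# Mean absolute percentage error between y_true and y_pred, expressed in percent.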
def mape(y_true, y_pred):
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
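# Fit a WaveNet (close price only) or StockNet (close price plus sentiment features) model on the
# training series; return the forecast for the last `forecast_horizon` steps and the flattened in-sample fit.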
def evaluate_prediction(timeseries, forecast_horizon=1, n_epochs=3000, stock_net_active=False, train_model=True,
stock_name=""):
length = len(timeseries) - forecast_horizon
# Input attributes
if stock_net_active:
series = toarray(timeseries[['close', 'pos', 'neg', 'lit', 'unc', 'con']].values)
else:
series = toarray(timeseries[['close']].values)
nb_features = series.shape[1]
    # Build the training tensors: the input window X and the target y shifted by forecast_horizon
X = series[:-forecast_horizon].reshape(1, length, nb_features)
y = toarray(timeseries[['close']].values)[forecast_horizon:].reshape(1, length, 1)
if stock_net_active:
model = StockNet(length, nb_features)
else:
model = WaveNet(length)
best_model_file = '../data/models/{0}_best_model_{1}.h5'.format("stocknet" if stock_net_active else "wavenet",
stock_name)
if train_model:
checkpoint = ModelCheckpoint(best_model_file, verbose=0, monitor='mean_squared_error',
save_best_only=True, mode='auto')
model.fit(X, y, epochs=n_epochs, callbacks=[checkpoint], verbose=False)
model.load_weights(best_model_file)
    # In-sample fit; the last `forecast_horizon` outputs are the out-of-sample forecast
    X_val = series[forecast_horizon:].reshape(1, length, nb_features)
    X_trfit = model.predict([X_val])
    y_hat = X_trfit[0, -forecast_horizon:, 0]
    return y_hat, X_trfit.flatten()
# Starting Evaluation
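# For each stock: fit WaveNet and StockNet, fit the SEE random-forest baseline,
# report RMSE/MAPE, run significance tests against a random baseline, and plot the forecasts.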
for stock in stocks:
# how many timeframes will be predicted
predict_horizon = 10
# how many train epochs
epochs = 1500
    # how many timeframes will be shown in the plot
plot_horizon = 100
    timeserie = pd.read_pickle('{0}/{1}.pkl'.format(timeseries_folder, stock))
train = timeserie.iloc[:-predict_horizon]
test = timeserie.iloc[-predict_horizon:]
    # Prepend the last training point to the test series so the 'real' line connects to the 'past' line
y_series = test.append(train.iloc[-1]).close
y_series.sort_index(inplace=True)
plt.plot(train.close[-plot_horizon:], 'k-', label='past', linewidth=.5)
plt.plot(y_series, ':', label='real', color='0.4', linewidth=.6)
# Running WaveNet ******
y_hat_wn, X_fit = evaluate_prediction(train, predict_horizon, n_epochs=epochs, stock_name=stock)
    rmse_wn_str = " {:.4f}".format(np.sqrt(mean_squared_error(test.close, y_hat_wn)))
    mape_wn_str = " {:.4f}".format(mape(test.close, y_hat_wn))
    print("Wavenet results for {0} RMSE: {1} MAPE: {2}".format(stock, rmse_wn_str, mape_wn_str))
    # Convert y_hat to a Series and prepend the last training point so the plotted lines connect
y_hat_wn_plot = pd.Series(y_hat_wn, index=test.index)
y_hat_wn_plot = y_hat_wn_plot.append(pd.Series([train.iloc[-1].close], index=[train.index[-1]]))
y_hat_wn_plot.sort_index(inplace=True)
# Running StockNet ******
y_hat_sn, _ = evaluate_prediction(train, predict_horizon, n_epochs=epochs, stock_net_active=True, train_model=True,
stock_name=stock)
    sn_rmse_str = " {:.4f}".format(np.sqrt(mean_squared_error(test.close, y_hat_sn)))
    mape_sn_str = " {:.4f}".format(mape(test.close, y_hat_sn))
# t, p = stats.ttest_ind(y_hat_sn, y_hat_wn)
# print("StockNet/WaveNet statistical tests: t-value={0} and p-value={1}".format(t, p))
    # Convert y_hat_sn to a Series and prepend the last training point so the plotted lines connect
y_hat_sn_plot = pd.Series(y_hat_sn, index=test.index)
y_hat_sn_plot = y_hat_sn_plot.append(pd.Series([train.iloc[-1].close], index=[train.index[-1]]))
y_hat_sn_plot.sort_index(inplace=True)
    # Running SEE (Baseline) ******
    # 7 lag variables (according to the SEE paper)
see_timeserie = series_to_supervised(timeserie[['close', 'see']], n_in=7)
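    # 'var1(t)' is the close price at time t (the regression target); the lag columns and 'var2(t)' (the SEE signal) are the features.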
# Splitting Train/test
X_train = see_timeserie.iloc[:-predict_horizon]
X_test = see_timeserie.iloc[-predict_horizon:]
y_train = X_train['var1(t)']
y_test = X_test['var1(t)']
    X_train = X_train.drop(['var1(t)'], axis=1)
    X_test = X_test.drop(['var1(t)'], axis=1)
    # 1000 estimators with at most 50 leaf nodes (according to the SEE paper)
model = RandomForestRegressor(n_estimators=1000, max_leaf_nodes=50, n_jobs=8, random_state=42)
model.fit(X_train, y_train)
y_hat_see = model.predict(X_test)
    see_rmse_str = " {:.4f}".format(np.sqrt(mean_squared_error(test.close, y_hat_see)))
    mape_see_str = " {:.4f}".format(mape(test.close, y_hat_see))
print("SEE-RMSE SEE-MAPE SN-RMSE SN-MAPE (Results for {0})".format(stock))
print("{0} {1} {2} {3} {4}".format("[Win]" if float(see_rmse_str) >= float(sn_rmse_str) else "[Lost]",
see_rmse_str, mape_see_str, sn_rmse_str, mape_sn_str))
    # Statistical tests (null hypothesis: the StockNet forecast and the random baseline have equal means)
random_scale = (np.max(train.iloc[-predict_horizon:].close) - np.min(train.iloc[-predict_horizon:].close)) / 2
v_random = np.random.normal(loc=train.iloc[-1].close, scale=random_scale, size=predict_horizon)
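    # Two-sample t-test: does the StockNet forecast differ in mean from this random baseline?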
t, p = stats.ttest_ind(y_hat_sn, v_random)
print("Statistical tests: t-value={0} and p-value={1} SEE Mean={2} SN Mean={3}".format(t, p, np.mean(y_hat_sn),
np.mean(v_random)))
print("Shapiro SN {0}".format(stats.shapiro(y_hat_sn)))
print("Shapiro Random {0}".format(stats.shapiro(v_random)))
    # Convert y_hat_see to a Series and prepend the last training point so the plotted lines connect
y_hat_see_plot = pd.Series(y_hat_see, index=test.index)
y_hat_see_plot = y_hat_see_plot.append(pd.Series([train.iloc[-1].close], index=[train.index[-1]]))
y_hat_see_plot.sort_index(inplace=True)
# Setting up Plotting
plt.plot(y_hat_see_plot, 'r:', label='SEE Baseline (RMSE:{0})'.format(see_rmse_str), linewidth=.6)
# plt.plot(y_hat_wn_plot, 'b:', label='WaveNet (RMSE:{0})'.format(rmse_wn_str), linewidth=.5)
plt.plot(y_hat_sn_plot, 'g:', label='StockNet (RMSE:{0})'.format(sn_rmse_str), linewidth=.6)
plt.legend(loc='upper left')
plt.title("Last " + str(plot_horizon) + " samples and " + str(predict_horizon) + "-step prediction for " + stock)
    plt.savefig('../data/results/stocknet_{0}.svg'.format(stock))
    plt.close()