process_raw_free_dataset.py · executable file · 282 lines (219 loc) · 10.7 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 13 15:07:34 2021
@author: Diogo
"""
"""
Clear the console and remove all variables present on the namespace. This is
useful to prevent Python from consuming more RAM each time I run the code.
"""
try:
    from IPython import get_ipython
    get_ipython().run_line_magic('clear', '')
    get_ipython().run_line_magic('reset', '-f')
except Exception:
    pass
from pathlib import Path
import pandas as pd
from datetime import datetime
import numpy as np
from tqdm import tqdm
"""
Underlying asset
"""
"""Create dataframe (df) for the data of the underlying from August to
September 2018"""
underlying1 = pd.read_csv("Raw data/Underlying/SPX_August-September_2018.csv")
# Create df for the data of the underlying from July to August 2019
underlying2 = pd.read_csv("Raw data/Underlying/SPX_July-August_2019.csv")
# Convert dates on Date columns to datetime64
underlying1['Date'] = pd.to_datetime(underlying1['Date'])
underlying2['Date'] = pd.to_datetime(underlying2['Date'])
# Sort underlying df by Date column, in ascending order
underlying1 = underlying1.sort_values(by='Date')
underlying2 = underlying2.sort_values(by='Date')
# Create new column with the standard deviation of the returns over the past
# 20 trading days
underlying1['Sigma_20_Days'] = underlying1[" Close"].rolling(20).apply(
    lambda x: (np.diff(x) / x[:-1]).std())
underlying2['Sigma_20_Days'] = underlying2[" Close"].rolling(20).apply(
    lambda x: (np.diff(x) / x[:-1]).std())
# Create df with all underlying data (DataFrame.append was removed in pandas
# 2.0, so pd.concat is used instead)
underlying = pd.concat([underlying1, underlying2])
# Create new column with the annualized standard deviation, assuming 250
# trading days per year
underlying['Sigma_20_Days_Annualized'] = underlying['Sigma_20_Days'] * 250**0.5
# Remove unnecessary columns
underlying = underlying.drop([" Open", " High", " Low", "Sigma_20_Days"],
axis = 1)
# # Create csv file from the underlying df
# underlying.to_csv('Processed data/underlying_free_dataset.csv', index=False)
"""
Treasury rates
"""
# Create df for treasury rates from August to September 2018
treasury1 = pd.read_csv(
"Raw data/Treasury/Treasury_rates_August-September_2018.csv",
index_col = "Date")
# Create df for treasury rates from July to August 2019
treasury2 = pd.read_csv(
"Raw data/Treasury/Treasury_rates_July-August_2019.csv",
index_col = "Date")
# Create df with all treasury rates data
treasury = pd.concat([treasury1, treasury2])
# List with the different maturities, in years, of Treasury bonds
treasury_maturities = [1/12, 2/12, 3/12, 6/12, 1, 2, 3, 5, 7, 10, 20, 30]
# Rename the columns of the treasury df to the corresponding maturities
treasury.columns = treasury_maturities
# Convert the Date index of the treasury df to datetime64
treasury.index = pd.to_datetime(treasury.index)
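# The treasury df is now indexed by trading date, with one column per maturity
# in years. As an illustration only (not used below), a lookup such as
# treasury.loc["2018-09-04", 1/12] would return the 1-month rate quoted on
# that date, assuming that date is present in the data.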
"""
Options
"""
# Create df with options' data from September 2018
options1 = pd.read_csv("Raw data/Options/SPX_20180904_20180928.csv")
# Remove unnecessary columns
options1 = options1.drop(["DNID", 'UnderlyingSymbol', 'UnderlyingPrice',
'OptionSymbol', 'OpenPrice', 'HighPrice', 'LowPrice',
'LastPrice', 'Volume', 'TradeCount', 'OpenInterest',
'T1OpenInterest', 'IVGreeks', 'IVBid', 'IVAsk',
'Delta', 'Gamma', 'Theta', 'Vega', 'Rho', 'Vanna',
'Vomma', 'Charm', 'DvegaDtime', 'Color', 'Speed',
'TradeSize', 'TradeDateTime', 'TradedMarket',
'BidMarket', 'AskMarket', 'BidSize', 'AskSize',
'OptionAlias'], axis = 1)
# Set the path to files for options from August 2019
p = Path("Raw data/Options/SPX_20190801_20190830")
# Create a list of the option files from August 2019
options2_files = list(p.glob('L2_options_201908*.csv'))
# Number of files for options2
n_options2 = len(options2_files)
print("7 loops will follow, with respective progress bars:")
# Creates df from all files
options2 = pd.concat([pd.read_csv(f) for f in tqdm(options2_files,
total = n_options2)])
# TEST WITH SMALL SAMPLE
# options2 = pd.read_csv("Raw data/Options/L2_options_20190801.csv")
# Keep only the rows for options whose underlying asset is SPX or SPXW
options2 = options2.loc[options2['UnderlyingSymbol'].isin(["SPX", "SPXW"])]
# Remove unnecessary columns
options2 = options2.drop(['UnderlyingSymbol', 'UnderlyingPrice', 'Exchange',
'OptionSymbol', 'OptionExt', 'Last', 'Volume',
'OpenInterest', 'IV', 'Delta', 'Gamma', 'Theta',
'Vega', 'AKA'], axis = 1)
# Rename columns DataDate to QuoteDate and Type to OptionType
options2 = options2.rename(columns = {'DataDate': 'QuoteDate',
"Type": "OptionType"})
# Create df with all options data
options = pd.concat([options1, options2])
# Function that returns the number of years between two dates
def years_between(d1, d2):
d1 = datetime.strptime(d1, "%m/%d/%Y")
d2 = datetime.strptime(d2, "%m/%d/%Y")
return abs((d2 - d1).days / 365)
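# For illustration: years_between("09/28/2018", "09/04/2018") returns
# 24/365, roughly 0.066 years.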
# Total number of options
n_options = options.shape[0]
# Calculate the time to maturity (TTM), in years, for each option
ttm = []
for index, row in tqdm(options.iterrows(), total = n_options):
d1 = row.Expiration
d2 = row.QuoteDate
d = years_between(d1, d2)
ttm.append(d)
# Create new column with the TTM
options['Time_to_Maturity'] = ttm
# Calculate the average of the bid and ask prices of each option
option_average_price = []
for index, row in tqdm(options.iterrows(), total = n_options):
bid = row.Bid
ask = row.Ask
average = (ask + bid) / 2
option_average_price.append(average)
# Create new column with the average price
options['Option_Average_Price'] = option_average_price
# Convert data on QuoteDate column of options df to datetime64
options['QuoteDate'] = pd.to_datetime(options['QuoteDate'])
# treasury["Date"] = pd.to_datetime(treasury["Date"])
"""
Get risk-free rates that match(or are close to) each option's maturity and
QuoteDate
"""
# Subtract the TTM of each option with the different maturities
differences = pd.DataFrame(columns = treasury_maturities)
treasury_maturities1 = [3/12, 6/12, 1, 2, 3, 5, 7, 10, 20, 30]
treasury_maturities2 = [1/12]
treasury_maturities3 = [6/12, 1, 2, 3, 5, 7, 10, 20, 30]
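# Each iteration of the loop below appends one row to differences: one column
# per Treasury maturity, holding the absolute difference between that maturity
# and the option's TTM (with the missing 2-month rate handled as a special case)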
for index, row in tqdm(options.iterrows(), total = n_options):
"""The following code is complicated because there aren't data for the 2
month rates between 2018-08-01 and 2018-09-28"""
if pd.to_datetime("2018-08-01") <= row.QuoteDate <= pd.to_datetime("2018-09-28"):
if 1.5/12 <= row.Time_to_Maturity <= 2/12:
            # 40 is an arbitrary value larger than any possible difference, so
            # the missing 2-month column is never selected as the closest
            list_s = [0, 40]
list_s = list_s + [abs(maturity - row.Time_to_Maturity) for
maturity in treasury_maturities1]
differences.loc[len(differences)] = list_s
elif 2/12 < row.Time_to_Maturity <= 2.5/12:
list_s = ([abs(maturity - row.Time_to_Maturity) for maturity in
treasury_maturities2])
list_s = list_s + [40, 0]
list_s = list_s + [abs(maturity - row.Time_to_Maturity) for
maturity in treasury_maturities3]
differences.loc[len(differences)] = list_s
else:
list_s = [abs(maturity - row.Time_to_Maturity) for maturity in
treasury_maturities]
differences.loc[len(differences)] = list_s
else:
list_s = [abs(maturity - row.Time_to_Maturity) for maturity in
treasury_maturities]
differences.loc[len(differences)] = list_s
"""Add the columns for each Treasury maturity, containing the differences
previously calculated, to the options df"""
options.reset_index(inplace = True, drop = True)
options = pd.concat([options, differences], axis = 1)
# Retrieve the maturity that is closest to the TTM of the option
maturity_closest_ttm = options[treasury_maturities].idxmin(axis = 1)
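# idxmin(axis = 1) returns, for each row, the column label with the smallest
# value, i.e. the Treasury maturity (in years) closest to the option's TTM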
# Add maturity_closest_ttm as a new column in options df and rename it.
options = pd.concat([options, maturity_closest_ttm], axis = 1)
options = options.rename(columns = {0:'Maturity_Closest_TTM'})
# Create list with the Treasury rate that matches each option's QuoteDate
# and Maturity_Closest_TTM
rf_rate = []
for index, row in tqdm(options.iterrows(), total = n_options):
    rf_rate.append(float(treasury[row.Maturity_Closest_TTM].loc[
        treasury.index == row.QuoteDate].iloc[0]))
# Add rf_rate as a column in the options df and drop unnecessary columns
options["RF_Rate"] = rf_rate
options = options.drop(treasury_maturities, axis = 1)
options = options.drop("Maturity_Closest_TTM", axis = 1)
# Create list with the standard deviations that match each option's QuoteDate
# sigma_20 = []
sigma_20_annualized = []
for index, row in tqdm(options.iterrows(), total = n_options):
# sigma_20.append(float(underlying["Sigma_20_Days"].loc[underlying["Date"] == row.QuoteDate]))
    sigma_20_annualized.append(float(underlying["Sigma_20_Days_Annualized"].loc[
        underlying["Date"] == row.QuoteDate].iloc[0]))
# Add sigma_20_annualized as a column in the options df
# options["Sigma_20_Days"] = sigma_20
options["Sigma_20_Days_Annualized"] = sigma_20_annualized
# Create list with the closing prices (of the underlying) that match each
# option's QuoteDate
underlying_price = []
for index, row in tqdm(options.iterrows(), total = n_options):
    underlying_price.append(float(underlying[" Close"].loc[
        underlying["Date"] == row.QuoteDate].iloc[0]))
# Add column of closing price of the underlying for each QuoteDate and drop
# more unnecessary columns
options["Underlying_Price"] = underlying_price
options = options.drop(["Expiration"], axis = 1)
# Rename columns
options = options.rename(columns = {'Bid': 'bid_eod', "Ask": "ask_eod",
"Strike": "strike"})
# Change call into c and put into p in the OptionType column
options['OptionType'] = np.where(options['OptionType'] == 'call', 'c', 'p')
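# Note: np.where maps 'call' to 'c' and any other value (i.e. 'put') to 'p'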
# Remove options with Time_to_Maturity = 0
options = options[options["Time_to_Maturity"] != 0]
# Remove options with Option_Average_Price = 0
options = options[options["Option_Average_Price"] != 0]
# Create csv file from the options df
options.to_csv('Processed data/options_free_dataset.csv', index=False)
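# # Optional sanity check (a minimal sketch, assuming the csv above was just
# # written): reload the processed dataset and confirm that no options with a
# # zero TTM or a zero average price remain
# check = pd.read_csv('Processed data/options_free_dataset.csv')
# print(check.shape)
# print((check["Time_to_Maturity"] == 0).sum())
# print((check["Option_Average_Price"] == 0).sum())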