-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 9cebc6b
Showing
266 changed files
with
24,699 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
venv/ | ||
src/data/*.csv | ||
src/*.pdf | ||
.idea/ | ||
.env |
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import os | ||
|
||
from .database import Database | ||
from .explore import explore | ||
|
||
|
||
class Analyzer:
    """Loads the BTC history table from the database and explores it."""

    def __init__(self):
        # DB_HOST is read with a plain index lookup, so a missing variable
        # raises KeyError before any connection is attempted.
        db_host = os.environ["DB_HOST"]
        self.database = Database(db_host)

    def analyze(self):
        """Fetch the ``btc_history`` table and pass it to ``explore``."""
        history = self.database.get_table("btc_history")
        explore(history)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from rethinkdb import RethinkDB | ||
|
||
|
||
class Database(object):
    """
    Small wrapper around a RethinkDB connection bound to a single database.

    The port and database name used to be hard-coded in every method; they
    are now constructor arguments whose defaults keep the previous behaviour
    (port 28015, database "olympus").
    """

    def __init__(self, url, port=28015, db_name="olympus"):
        """
        Open a connection to the RethinkDB server.

        url: host name or address handed to ``RethinkDB.connect``
        port: driver port to connect to (default 28015)
        db_name: database that every table operation of this object targets
        """
        self.r = RethinkDB()
        self.db_name = db_name
        self.conn = self.r.connect(url, port)

    def client(self):
        """Return the underlying ``RethinkDB`` query object."""
        return self.r

    def _table(self, tbl):
        """Build the (not yet executed) query selecting table ``tbl``."""
        return self.r.db(self.db_name).table(tbl)

    def get_table(self, tbl):
        """Run a whole-table query on ``tbl`` and return its result."""
        return self._table(tbl).run(self.conn)

    def get(self, tbl, key):
        """Run a primary-key lookup for ``key`` in ``tbl`` and return its result."""
        return self._table(tbl).get(key).run(self.conn)

    def insert(self, tbl, obj):
        """
        Insert ``obj`` into ``tbl``.

        ``conflict="replace"`` is passed to the insert, so this behaves as an
        upsert rather than failing on an existing key. Nothing is returned.
        """
        self._table(tbl).insert(
            obj,
            conflict="replace"
        ).run(self.conn)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import pandas as pd | ||
|
||
import matplotlib.pyplot as plt | ||
from statsmodels.tsa.stattools import adfuller | ||
|
||
from numpy import log | ||
|
||
from .linear_assumptions import linear_regression_assumptions | ||
|
||
|
||
def get_stationarity(df):
    """
    Show rolling statistics and a log histogram of the ``close`` column, then
    print the results of an Augmented Dickey-Fuller test on its logarithm.

    df: DataFrame that contains a ``close`` column.
    """
    close = df["close"]

    # Rolling Statistics (60-row window)
    window = close.rolling(window=60)
    curves = (
        (close, 'blue', 'Valor'),
        (window.mean(), 'red', 'Media Móvil'),
        (window.std(), 'black', 'Desviación Móvil'),
    )
    for series, colour, legend in curves:
        plt.plot(series, color=colour, label=legend)

    plt.title("Precio Bitcoin Media & Desviación Estándar Móvil de 60 días")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show(block=False)

    # Clear the current axes before drawing the histogram
    plt.cla()

    plt.title("Precio Bitcoin Histograma Logarítmico")
    plt.hist(log(close))
    plt.show()

    # Dickey-Fuller Test
    result = adfuller(log(df["close"].values))
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]
    print("Column: close")
    print(f'ADF Statistic: {adf_statistic}')
    print(f'p-value: {p_value}')
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold}')
|
||
|
||
def explore(data):
    """
    Turn the raw records into a time-indexed DataFrame and run the
    exploratory reports on it.

    data: anything ``pd.DataFrame`` accepts; it must yield an ``id`` column
          holding epoch seconds, a ``close`` column and the feature columns
          listed below.
    """
    pd.options.display.max_columns = 20

    # ``id`` holds epoch seconds; it becomes the sorted datetime index.
    frame = pd.DataFrame(data)
    frame['id'] = pd.to_datetime(frame['id'], unit='s')
    frame = frame.set_index(keys=["id"]).sort_index()

    feature_columns = [
        "hashrate", "difficulty",
        "block_time", "comments",
        "posts", "followers", "points",
        "reddit_active_users", "reddit_comments_per_hour",
    ]

    predictors = frame[feature_columns].to_numpy()
    target = frame["close"].to_numpy()
    linear_regression_assumptions(predictors, target, feature_columns)

    print("Sample data: ")
    print(frame.head().to_string())
    print()

    print("Summary Describe")
    summary = frame.iloc[:, 1:10].describe()
    print(summary)
    print()

    get_stationarity(frame)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
from datetime import timedelta | ||
|
||
import pandas as pd | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
import numpy as np | ||
import seaborn as sns | ||
|
||
|
||
def linear_regression_assumptions(features, label, feature_names=None):
    """
    Fit a linear regression and check whether its usual assumptions hold.

    Prints R^2, the intercept and the coefficients, then runs five
    diagnostics in order: linearity, normality of the error terms,
    multicollinearity, autocorrelation and homoscedasticity. Every diagnostic
    reports to stdout and most of them open a matplotlib figure with
    ``plt.show()``. Nothing is returned.

    features: 2-D array of predictors, one row per observation and one column
              per feature.
    label: target values aligned with the rows of ``features`` (presumably
           1-D -- confirm against callers).
    feature_names: optional list with one name per feature column; defaults
                   to X1, X2, X3, ...
    """
    from sklearn.linear_model import LinearRegression

    # Setting feature names to X1, X2, X3, etc. if they are not defined
    if feature_names is None:
        feature_names = ['X' + str(idx + 1) for idx in range(features.shape[1])]

    print('Fitting linear regression')
    # Multi-threading only once the dataset is large enough for it to pay off
    # (the comparison used to be inverted and parallelised small datasets).
    if features.shape[0] >= 100000:
        model = LinearRegression(n_jobs=-1)
    else:
        model = LinearRegression()

    model.fit(features, label)

    # Returning linear regression R^2 and coefficients before performing diagnostics
    r2 = model.score(features, label)
    print()
    print('R^2:', r2, '\n')
    print('Coefficients')
    print('-------------------------------------')
    print('Intercept:', model.intercept_)

    for name, coefficient in zip(feature_names, model.coef_):
        print('{0}: {1}'.format(name, coefficient))

    print('\nPerforming linear regression assumption testing')

    # Creating predictions and calculating residuals for assumption tests
    predictions = model.predict(features)
    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
    # A residual is the signed difference; taking abs() of each side first
    # gives wrong values whenever actual and predicted differ in sign.
    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']

    def linear_assumption():
        """
        Linearity: Assumes there is a linear relationship between the predictors and
                   the response variable. If not, either a polynomial term or another
                   algorithm should be used.
        """
        print('\n=======================================================================================')
        print('Assumption 1: Linear Relationship between the Target and the Features')

        print('Checking with a scatter plot of actual vs. predicted. Predictions should follow the diagonal line.')

        # Plotting the actual vs predicted values
        # (seaborn renamed the ``size`` keyword to ``height``)
        sns.lmplot(x='Actual', y='Predicted', data=df_results, fit_reg=False, height=7)

        # Plotting the diagonal line across the span of the plotted values only;
        # two end points are enough and also work when the span is below 1.
        plotted = df_results[['Actual', 'Predicted']]
        lower, upper = plotted.min().min(), plotted.max().max()
        plt.plot([lower, upper], [lower, upper],  # X and y points
                 color='darkorange', linestyle='--')
        plt.title('Actual vs. Predicted')
        plt.show()
        print('If non-linearity is apparent, consider adding a polynomial term')

    def normal_errors_assumption(p_value_thresh=0.05):
        """
        Normality: Assumes that the error terms are normally distributed. If they are not,
        nonlinear transformations of variables may solve this.
        This assumption being violated primarily causes issues with the confidence intervals
        """
        from statsmodels.stats.diagnostic import normal_ad
        print('\n=======================================================================================')
        print('Assumption 2: The error terms are normally distributed')
        print()

        print('Using the Anderson-Darling test for normal distribution')

        # Performing the test on the residuals
        p_value = normal_ad(df_results['Residuals'])[1]
        print('p-value from the test - below 0.05 generally means non-normal:', p_value)

        # One verdict drives both messages so they cannot contradict each
        # other when the p-value is exactly at the threshold.
        residuals_normal = p_value >= p_value_thresh

        # Reporting the normality of the residuals
        if residuals_normal:
            print('Residuals are normally distributed')
        else:
            print('Residuals are not normally distributed')

        # Plotting the residuals distribution
        # (histplot replaces the deprecated distplot; density + KDE keeps the same look)
        plt.subplots(figsize=(12, 6))
        plt.title('Distribution of Residuals')
        sns.histplot(df_results['Residuals'], kde=True, stat='density')
        plt.show()

        print()
        if residuals_normal:
            print('Assumption satisfied')
        else:
            print('Assumption not satisfied')
            print()
            print('Confidence intervals will likely be affected')
            print('Try performing nonlinear transformations on variables')

    def multicollinearity_assumption():
        """
        Multicollinearity: Assumes that predictors are not correlated with each other. If there is
                           correlation among the predictors, then either remove predictors with high
                           Variance Inflation Factor (VIF) values or perform dimensionality reduction
                           This assumption being violated causes issues with interpretability of the
                           coefficients and the standard errors of the coefficients.
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor
        print('\n=======================================================================================')
        print('Assumption 3: Little to no multicollinearity among predictors')

        # Plotting the heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(pd.DataFrame(features, columns=feature_names).corr(), annot=True)
        plt.title('Correlation of Variables')
        plt.show()

        print('Variance Inflation Factors (VIF)')
        print('> 10: An indication that multicollinearity may be present')
        print('> 100: Certain multicollinearity among the variables')
        print('-------------------------------------')

        # Gathering the VIF for each variable
        vifs = [variance_inflation_factor(features, i) for i in range(features.shape[1])]
        for name, vif in zip(feature_names, vifs):
            print('{0}: {1}'.format(name, vif))

        # Gathering and printing total cases of possible or definite multicollinearity
        possible_multicollinearity = sum(1 for vif in vifs if vif > 10)
        definite_multicollinearity = sum(1 for vif in vifs if vif > 100)
        print()
        print('{0} cases of possible multicollinearity'.format(possible_multicollinearity))
        print('{0} cases of definite multicollinearity'.format(definite_multicollinearity))
        print()

        if definite_multicollinearity == 0:
            if possible_multicollinearity == 0:
                print('Assumption satisfied')
            else:
                print('Assumption possibly satisfied')
                print()
                print('Coefficient interpretability may be problematic')
                print('Consider removing variables with a high Variance Inflation Factor (VIF)')
        else:
            print('Assumption not satisfied')
            print()
            print('Coefficient interpretability will be problematic')
            print('Consider removing variables with a high Variance Inflation Factor (VIF)')

    def autocorrelation_assumption():
        """
        Autocorrelation: Assumes that there is no autocorrelation in the residuals. If there is
                         autocorrelation, then there is a pattern that is not explained due to
                         the current value being dependent on the previous value.
                         This may be resolved by adding a lag variable of either the dependent
                         variable or some of the predictors.
        """
        from statsmodels.stats.stattools import durbin_watson
        print('\n=======================================================================================')
        print('Assumption 4: No Autocorrelation')
        print('\nPerforming Durbin-Watson Test')
        print('Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data')
        print('0 to 2< is positive autocorrelation')
        print('>2 to 4 is negative autocorrelation')
        print('-------------------------------------')
        dw_statistic = durbin_watson(df_results['Residuals'])
        print('Durbin-Watson:', dw_statistic)
        if dw_statistic < 1.5:
            print('Signs of positive autocorrelation', '\n')
            print('Assumption not satisfied', '\n')
            print('Consider adding lag variables')
        elif dw_statistic > 2.5:
            print('Signs of negative autocorrelation', '\n')
            print('Assumption not satisfied', '\n')
            print('Consider adding lag variables')
        else:
            print('Little to no autocorrelation', '\n')
            print('Assumption satisfied')

    def homoscedasticity_assumption():
        """
        Homoscedasticity: Assumes that the errors exhibit constant variance
        """
        print('\n=======================================================================================')
        print('Assumption 5: Homoscedasticity of Error Terms')
        print('Residuals should have relative constant variance')

        # Plotting the residuals
        _, ax = plt.subplots(figsize=(12, 6))
        ax.scatter(x=df_results.index, y=df_results.Residuals, alpha=0.5)
        # Zero reference line across the whole axes (the hand-built array
        # stopped one point short of the last residual).
        ax.axhline(0, color='darkorange', linestyle='--')
        ax.spines['right'].set_visible(False)  # Removing the right spine
        ax.spines['top'].set_visible(False)  # Removing the top spine
        plt.title('Residuals')
        plt.show()
        print('If heteroscedasticity is apparent, confidence intervals and predictions will be affected')

    linear_assumption()
    normal_errors_assumption()
    multicollinearity_assumption()
    autocorrelation_assumption()
    homoscedasticity_assumption()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
requests

matplotlib
pandas
# scikit-learn is the actual distribution name; the "sklearn" alias on PyPI is deprecated
scikit-learn
statsmodels
numpy
seaborn

rethinkdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from analyze import analyze | ||
|
||
|
||
def run():
    """Create an ``Analyzer`` and run its analysis."""
    analyze.Analyzer().analyze()
|
||
|
||
if __name__ == '__main__':
    # Ctrl-C ends the program with a short notice instead of a traceback.
    try:
        run()
    except KeyboardInterrupt:
        print("\nOlympus aborted. Exiting!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
version: "3.5"

services:
  # Application container built from the ./gatherer directory.
  gatherer:
    build:
      context: gatherer
    restart: unless-stopped
    env_file:
      - .env
    environment:
      # The database is reached through its compose service name.
      - DB_HOST=db

  # RethinkDB server; its data lives in the named volume below.
  db:
    image: rethinkdb:2.4.1-buster-slim
    restart: unless-stopped
    ports:
      - "5000:8080"    # presumably the RethinkDB web UI -- confirm
      - "28015:28015"  # driver port the Python client connects to
    volumes:
      - db_data:/data

volumes:
  db_data:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
FROM python:alpine3.8

WORKDIR /usr/olympus

# Install dependencies before copying the sources, so this layer is
# rebuilt only when requirements.txt changes.
COPY requirements.txt ./
RUN pip install -r requirements.txt

# Copy the rest of the build context into the working directory.
COPY . ./

ENTRYPOINT ["./entrypoint.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/sh

# exec replaces the shell with the Python process, so signals sent to the
# container (e.g. on stop) reach the program instead of the wrapper shell.
exec python start.py
Empty file.
Oops, something went wrong.