Initial commit.

PatricioNapoli · Nov 16, 2020 · 9cebc6b · 9cebc6b
commit 9cebc6b
Show file tree

Hide file tree

Showing 266 changed files with 24,699 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+venv/
+src/data/*.csv
+src/*.pdf
+.idea/
+.env
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
diff --git a/analysis/analyze/__init__.py b/analysis/analyze/__init__.py
diff --git a/analysis/analyze/analyze.py b/analysis/analyze/analyze.py
@@ -0,0 +1,14 @@
+import os
+
+from .database import Database
+from .explore import explore
+
+
+class Analyzer(object):
+    """
+    Entry point of the analysis: owns the database handle and feeds the
+    stored Bitcoin history to the exploration routines.
+    """
+
+    def __init__(self):
+        # DB_HOST must be set in the environment; a missing value raises KeyError.
+        db_host = os.environ["DB_HOST"]
+        self.database = Database(db_host)
+
+    def analyze(self):
+        """Read the whole "btc_history" table and pass it to explore()."""
+        explore(self.database.get_table("btc_history"))
diff --git a/analysis/analyze/database.py b/analysis/analyze/database.py
@@ -0,0 +1,22 @@
+from rethinkdb import RethinkDB
+
+
+class Database(object):
+    """
+    Thin wrapper around a single RethinkDB connection.
+
+    Can be used as a context manager so the connection is released
+    deterministically:
+
+        with Database(host) as database:
+            rows = database.get_table("btc_history")
+    """
+
+    def __init__(self, url, port=28015, db="olympus"):
+        """
+        :param url: host name of the RethinkDB server
+        :param port: client driver port (defaults to the previously hard-coded 28015)
+        :param db: database every query runs against (defaults to "olympus")
+        """
+        self.r = RethinkDB()
+        self.db_name = db
+        self.conn = self.r.connect(url, port)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+        # Never swallow an exception raised inside the with-block.
+        return False
+
+    def client(self):
+        """Return the underlying RethinkDB query builder."""
+        return self.r
+
+    def close(self):
+        """Close the connection; calling it more than once is harmless."""
+        if self.conn is not None:
+            self.conn.close()
+            self.conn = None
+
+    def _table(self, tbl):
+        """Build the (not yet executed) query selecting table `tbl`."""
+        return self.r.db(self.db_name).table(tbl)
+
+    def get_table(self, tbl):
+        """Run a query for every document of `tbl` and return its result."""
+        return self._table(tbl).run(self.conn)
+
+    def get(self, tbl, key):
+        """Return the document of `tbl` stored under primary key `key`."""
+        return self._table(tbl).get(key).run(self.conn)
+
+    def insert(self, tbl, obj):
+        """Insert `obj` into `tbl`, replacing any document with the same key."""
+        self._table(tbl).insert(
+            obj,
+            conflict="replace"
+        ).run(self.conn)
diff --git a/analysis/analyze/explore.py b/analysis/analyze/explore.py
@@ -0,0 +1,68 @@
+import pandas as pd
+
+import matplotlib.pyplot as plt
+from statsmodels.tsa.stattools import adfuller
+
+from numpy import log
+
+from .linear_assumptions import linear_regression_assumptions
+
+
+def get_stationarity(df):
+    """
+    Visualise and test the stationarity of the "close" column.
+
+    Shows the series with its 60-row rolling mean and standard deviation,
+    shows a histogram of log(close), and prints the Augmented Dickey-Fuller
+    test results for log(close).
+
+    :param df: DataFrame with a "close" column (explore() passes one indexed by date)
+    """
+    close = df["close"]
+    log_close = log(close)
+
+    # Rolling Statistics
+    rolling = close.rolling(window=60)
+    rolling_mean = rolling.mean()
+    rolling_std = rolling.std()
+
+    # Each chart gets its own figure. The previous version drew the rolling
+    # statistics, called a non-blocking show() and cleared the axes right away,
+    # so only the histogram was still there when the blocking show() ran.
+    plt.figure()
+    plt.plot(close, color='blue', label='Valor')
+    plt.plot(rolling_mean, color='red', label='Media Móvil')
+    plt.plot(rolling_std, color='black', label='Desviación Móvil')
+    # The labels above are only rendered once a legend is requested.
+    plt.legend()
+
+    plt.title("Precio Bitcoin Media & Desviación Estándar Móvil de 60 días")
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+
+    plt.figure()
+    plt.title("Precio Bitcoin Histograma Logarítmico")
+    plt.hist(log_close)
+
+    # Single blocking call that displays both figures.
+    plt.show()
+
+    # Dickey-Fuller Test
+    result = adfuller(log_close.values)
+    print("Column: close")
+    print('ADF Statistic: {}'.format(result[0]))
+    print('p-value: {}'.format(result[1]))
+    print('Critical Values:')
+    for key, value in result[4].items():
+        print('\t{}: {}'.format(key, value))
+
+
+def explore(data):
+    """
+    Turn the raw rows into a date-indexed DataFrame and run the exploratory
+    steps on it: linear regression diagnostics against "close", a printed
+    sample and summary, and the stationarity analysis.
+
+    :param data: anything pandas.DataFrame accepts, with an "id" column of
+                 epoch seconds plus the feature columns and "close"
+    """
+    pd.options.display.max_columns = 20
+
+    frame = pd.DataFrame(data)
+    # "id" is parsed as seconds since the epoch and becomes the sorted index.
+    frame['id'] = pd.to_datetime(frame['id'], unit='s')
+    frame = frame.set_index(keys=["id"]).sort_index()
+
+    feature_columns = [
+        "hashrate", "difficulty",
+        "block_time", "comments",
+        "posts", "followers", "points",
+        "reddit_active_users", "reddit_comments_per_hour",
+    ]
+
+    feature_matrix = frame[feature_columns].to_numpy()
+    target = frame["close"].to_numpy()
+    linear_regression_assumptions(feature_matrix, target, feature_columns)
+
+    print("Sample data: ")
+    print(frame.head().to_string())
+    print()
+
+    print("Summary Describe")
+    summary = frame.iloc[:, 1:10].describe()
+    print(summary)
+    print()
+
+    get_stationarity(frame)
+
+
diff --git a/analysis/analyze/linear_assumptions.py b/analysis/analyze/linear_assumptions.py
@@ -0,0 +1,213 @@
+from datetime import timedelta
+
+import pandas as pd
+
+import matplotlib.pyplot as plt
+
+import numpy as np
+import seaborn as sns
+
+
+def linear_regression_assumptions(features, label, feature_names=None):
+    """
+    Fit a linear regression and check whether its classic assumptions hold.
+
+    Prints R^2, the intercept and coefficients, then runs five diagnostics
+    (linearity, normality of residuals, multicollinearity, autocorrelation,
+    homoscedasticity). Each diagnostic prints its findings and most of them
+    also show a matplotlib figure.
+
+    :param features: 2-D numpy array of predictors, one column per feature
+    :param label: 1-D array with the target value of every row
+    :param feature_names: optional column names; defaults to X1, X2, ...
+    :return: None
+    """
+    from sklearn.linear_model import LinearRegression
+
+    # Setting feature names to x1, x2, x3, etc. if they are not defined
+    if feature_names is None:
+        feature_names = ['X' + str(idx + 1) for idx in range(features.shape[1])]
+
+    print('Fitting linear regression')
+    # Multi-threading only if the dataset is large enough for it to pay off.
+    # (The comparison used to be inverted and requested all cores for small data.)
+    if features.shape[0] >= 100000:
+        model = LinearRegression(n_jobs=-1)
+    else:
+        model = LinearRegression()
+
+    model.fit(features, label)
+
+    # Returning linear regression R^2 and coefficients before performing diagnostics
+    r2 = model.score(features, label)
+    print()
+    print('R^2:', r2, '\n')
+    print('Coefficients')
+    print('-------------------------------------')
+    print('Intercept:', model.intercept_)
+
+    for name, coefficient in zip(feature_names, model.coef_):
+        print('{0}: {1}'.format(name, coefficient))
+
+    print('\nPerforming linear regression assumption testing')
+
+    # Creating predictions and calculating residuals for assumption tests
+    predictions = model.predict(features)
+    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
+    # A residual is the signed difference. Subtracting absolute values gives a
+    # wrong result whenever actual and predicted have different signs.
+    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']
+
+    def linear_assumption():
+        """
+        Linearity: Assumes there is a linear relationship between the predictors and
+                   the response variable. If not, either a polynomial term or another
+                   algorithm should be used.
+        """
+        print('\n=======================================================================================')
+        print('Assumption 1: Linear Relationship between the Target and the Features')
+
+        print('Checking with a scatter plot of actual vs. predicted. Predictions should follow the diagonal line.')
+
+        # Plotting the actual vs predicted values
+        # (`height` is the current name of the argument formerly called `size`)
+        sns.lmplot(x='Actual', y='Predicted', data=df_results, fit_reg=False, height=7)
+
+        # Plotting the diagonal line across the plotted value range only; the
+        # Residuals column must not influence where the line starts or ends.
+        plotted = df_results[['Actual', 'Predicted']]
+        lower, upper = plotted.min().min(), plotted.max().max()
+        plt.plot([lower, upper], [lower, upper],  # X and y points
+                 color='darkorange', linestyle='--')
+        plt.title('Actual vs. Predicted')
+        plt.show()
+        print('If non-linearity is apparent, consider adding a polynomial term')
+
+    def normal_errors_assumption(p_value_thresh=0.05):
+        """
+        Normality: Assumes that the error terms are normally distributed. If they are not,
+        nonlinear transformations of variables may solve this.
+
+        This assumption being violated primarily causes issues with the confidence intervals
+        """
+        from statsmodels.stats.diagnostic import normal_ad
+        print('\n=======================================================================================')
+        print('Assumption 2: The error terms are normally distributed')
+        print()
+
+        print('Using the Anderson-Darling test for normal distribution')
+
+        # Performing the test on the residuals
+        p_value = normal_ad(df_results['Residuals'])[1]
+        print('p-value from the test - below 0.05 generally means non-normal:', p_value)
+
+        # One verdict drives both messages so they can never contradict each
+        # other (they used to disagree when p_value equalled the threshold).
+        residuals_normal = p_value >= p_value_thresh
+
+        # Reporting the normality of the residuals
+        if residuals_normal:
+            print('Residuals are normally distributed')
+        else:
+            print('Residuals are not normally distributed')
+
+        # Plotting the residuals distribution
+        # NOTE(review): seaborn has deprecated distplot in favour of
+        # histplot/displot; kept as-is to avoid changing the rendered chart.
+        plt.subplots(figsize=(12, 6))
+        plt.title('Distribution of Residuals')
+        sns.distplot(df_results['Residuals'])
+        plt.show()
+
+        print()
+        if residuals_normal:
+            print('Assumption satisfied')
+        else:
+            print('Assumption not satisfied')
+            print()
+            print('Confidence intervals will likely be affected')
+            print('Try performing nonlinear transformations on variables')
+
+    def multicollinearity_assumption():
+        """
+        Multicollinearity: Assumes that predictors are not correlated with each other. If there is
+                           correlation among the predictors, then either remove predictors with high
+                           Variance Inflation Factor (VIF) values or perform dimensionality reduction
+
+                           This assumption being violated causes issues with interpretability of the
+                           coefficients and the standard errors of the coefficients.
+        """
+        from statsmodels.stats.outliers_influence import variance_inflation_factor
+        print('\n=======================================================================================')
+        print('Assumption 3: Little to no multicollinearity among predictors')
+
+        # Plotting the heatmap
+        plt.figure(figsize=(10, 8))
+        sns.heatmap(pd.DataFrame(features, columns=feature_names).corr(), annot=True)
+        plt.title('Correlation of Variables')
+        plt.show()
+
+        print('Variance Inflation Factors (VIF)')
+        print('> 10: An indication that multicollinearity may be present')
+        print('> 100: Certain multicollinearity among the variables')
+        print('-------------------------------------')
+
+        # Gathering the VIF for each variable
+        # NOTE(review): computed on the raw feature matrix, without an added
+        # intercept column - confirm this is the intended design matrix.
+        vifs = [variance_inflation_factor(features, i) for i in range(features.shape[1])]
+        for name, vif in zip(feature_names, vifs):
+            print('{0}: {1}'.format(name, vif))
+
+        # Gathering and printing total cases of possible or definite multicollinearity
+        possible_multicollinearity = sum(1 for vif in vifs if vif > 10)
+        definite_multicollinearity = sum(1 for vif in vifs if vif > 100)
+        print()
+        print('{0} cases of possible multicollinearity'.format(possible_multicollinearity))
+        print('{0} cases of definite multicollinearity'.format(definite_multicollinearity))
+        print()
+
+        if definite_multicollinearity == 0:
+            if possible_multicollinearity == 0:
+                print('Assumption satisfied')
+            else:
+                print('Assumption possibly satisfied')
+                print()
+                print('Coefficient interpretability may be problematic')
+                print('Consider removing variables with a high Variance Inflation Factor (VIF)')
+        else:
+            print('Assumption not satisfied')
+            print()
+            print('Coefficient interpretability will be problematic')
+            print('Consider removing variables with a high Variance Inflation Factor (VIF)')
+
+    def autocorrelation_assumption():
+        """
+        Autocorrelation: Assumes that there is no autocorrelation in the residuals. If there is
+                         autocorrelation, then there is a pattern that is not explained due to
+                         the current value being dependent on the previous value.
+                         This may be resolved by adding a lag variable of either the dependent
+                         variable or some of the predictors.
+        """
+        from statsmodels.stats.stattools import durbin_watson
+        print('\n=======================================================================================')
+        print('Assumption 4: No Autocorrelation')
+        print('\nPerforming Durbin-Watson Test')
+        print('Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data')
+        print('0 to 2< is positive autocorrelation')
+        print('>2 to 4 is negative autocorrelation')
+        print('-------------------------------------')
+        dw_statistic = durbin_watson(df_results['Residuals'])
+        print('Durbin-Watson:', dw_statistic)
+        if dw_statistic < 1.5:
+            print('Signs of positive autocorrelation', '\n')
+            print('Assumption not satisfied', '\n')
+            print('Consider adding lag variables')
+        elif dw_statistic > 2.5:
+            print('Signs of negative autocorrelation', '\n')
+            print('Assumption not satisfied', '\n')
+            print('Consider adding lag variables')
+        else:
+            print('Little to no autocorrelation', '\n')
+            print('Assumption satisfied')
+
+    def homoscedasticity_assumption():
+        """
+        Homoscedasticity: Assumes that the errors exhibit constant variance
+        """
+        print('\n=======================================================================================')
+        print('Assumption 5: Homoscedasticity of Error Terms')
+        print('Residuals should have relative constant variance')
+
+        # Plotting the residuals
+        _, ax = plt.subplots(figsize=(12, 6))
+        ax.scatter(x=df_results.index, y=df_results['Residuals'], alpha=0.5)
+        # Zero reference line across the whole axis, independent of the row count
+        ax.axhline(0, color='darkorange', linestyle='--')
+        ax.spines['right'].set_visible(False)  # Removing the right spine
+        ax.spines['top'].set_visible(False)  # Removing the top spine
+        plt.title('Residuals')
+        plt.show()
+        print('If heteroscedasticity is apparent, confidence intervals and predictions will be affected')
+
+    linear_assumption()
+    normal_errors_assumption()
+    multicollinearity_assumption()
+    autocorrelation_assumption()
+    homoscedasticity_assumption()
diff --git a/analysis/requirements.txt b/analysis/requirements.txt
@@ -0,0 +1,10 @@
+requests
+
+matplotlib
+pandas
+scikit-learn
+statsmodels
+numpy
+seaborn
+
+rethinkdb
diff --git a/analysis/start.py b/analysis/start.py
@@ -0,0 +1,14 @@
+from analyze import analyze
+
+
+def run():
+    """Create the analyzer and execute the complete analysis."""
+    analyze.Analyzer().analyze()
+
+
+if __name__ == '__main__':
+    # Ctrl+C ends the program with a short notice instead of a traceback.
+    try:
+        run()
+    except KeyboardInterrupt:
+        print()
+        print("Olympus aborted. Exiting!")
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,22 @@
+version: "3.5"
+
+services:
+  # Gatherer service, built from the ./gatherer directory.
+  gatherer:
+    build: gatherer
+    # Extra settings are read from the git-ignored .env file.
+    env_file:
+      - .env
+    environment:
+      # Host name of the database: the `db` service defined below.
+      DB_HOST: db
+    restart: unless-stopped
+
+  db:
+    image: rethinkdb:2.4.1-buster-slim
+    volumes:
+      # Named volume (declared at the bottom) mounted at /data.
+      - db_data:/data
+    restart: unless-stopped
+    ports:
+      # NOTE(review): container port 8080 is presumably the RethinkDB web UI,
+      # published on host port 5000 - confirm.
+      - "5000:8080"
+      # 28015 is the port the Python Database client connects to.
+      - "28015:28015"
+
+volumes:
+  db_data:
diff --git a/gatherer/Dockerfile b/gatherer/Dockerfile
@@ -0,0 +1,10 @@
+# Image for the gatherer service.
+FROM python:alpine3.8
+
+WORKDIR /usr/olympus
+
+# Requirements are copied and installed before the rest of the sources so the
+# install layer can be reused when only the code changes.
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+COPY . .
+
+# The container starts through the entrypoint.sh copied in above.
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/gatherer/entrypoint.sh b/gatherer/entrypoint.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+# Container entry point: run start.py with the image's Python interpreter.
+python start.py
diff --git a/gatherer/gather/__init__.py b/gatherer/gather/__init__.py