scripts/mlpr

#! /bin/bash
export VERSION='0.1.5'
echo  -e "
\033[92m
MachineLearningPRoject
\033[0m
MLPR is a Machine Learning Project helper script
which creates a project directory to help you out.

It does the following:
- make data, cache, script directories
- create a scripts/mlpr.py helper script along with a scripts/main.py
- create a README.md for note taking purposes, adding you
  as author and the timestamping the project start date-time
- Create a python3 virtualenv and attempt to install commonly used
  ML libraries in it.
"

echo -e "\033[92mName of Project:\033[0m"
read -r project
echo -e "\033[92mAuthor Name:\033[0m"
read -r author

echo "Making the project directories"
mkdir $project
cd $project

echo "Making README files"
echo "
Meta
====
$project
$author
$(date)
mlpr version = '$VERSION'

- data: for raw data files
- cache: used by @cache to cache expensive functions
- scripts: holds all scripts. To be used as PWD

Notes
-----"  >> README.md


echo "Making directories for working"
mkdir data scripts cache

echo "Making default scripts"

# DEFAULT PROJECT TEMPLATE
echo "# $project
# $author
# $(date)

import numpy as np
np.random.seed(1)

import pandas as pd
from tqdm import tqdm
# see https://github.com/theSage21/reimagined-chainsaw
# mlpr.__version__ = $VERSION
from mlpr import report, cache
 
#import xgboost as xgb

#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
#from sklearn.pipeline import Pipeline, FeatureUnion
#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#from sklearn.linear_model import LinearRegression, Ridge
#from sklearn.preprocessing import (Imputer, MinMaxScaler,
#    RobustScaler, PolynomialFeatures,
#    LabelEncoder, OneHotEncoder,
#    StandardScaler)
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import GridSearchCV
#from sklearn.decomposition import PCA
# -----------------------------------------------------------------------------

@report
def load_db():
    df = pd.read_csv('../data/train.csv')
    return df

@report
@cache
def process_data(df):
    return df

@report
def make_pipe():
    rf = RandomForestClassifier(n_jobs=-1, n_estimators=30)
    return rf

@report
def cross_validate(df, pipe):
    # TODO: scoring
    scores = cross_val_score(pipe, x, y, scoring='accuracy', cv=10)
    return scores

@report
@cache
def test_data_processing(ts):
    "Processing specific to test data"
    return ts

@report
def make_submission(pipe):
    ts = pd.read_csv('../data/test.csv')
    pts = process_data(ts)
    pts = test_data_processing(pts)
    predictions = pipe.predict(pts)
    #  TODO
    pd.DataFrame({}
    )[[]
    ].to_csv('../submission.csv', index=False)

# -----------------------------------------------------------------------------
if __name__ == '__main__':
    df = load_db()
    df = process_data(df)
    pipe = make_pipe()
    scores = cross_validate(df, pipe)
    print(scores.mean(), scores.std(), scores)
    x, y = df.drop('', axis=1), df['']  # TODO
    pipe.fit(x, y)
    make_submission(pipe)
# -----------------------------------------------------------------------------" > scripts/main.py
# TEMPLATE DONE

echo "import os
import pickle
from functools import wraps
from hashlib import md5
__all__ = ['report', 'cache']
__version__ = '$VERSION'
LEVEL = 0

def clear_cache():
    'Used to clear all cache'
    # TODO
    

def report(fn):
    'report the beginning and ending of this functions calls.'
    B, G, R, CLR = '\033[94m', '\033[92m','\033[91m', '\033[0m'
    import time
    def new_fn(*args, **kwargs):
        doc, start = fn.__doc__ or fn.__name__, time.time()
        global LEVEL
        LEVEL += 1
        pre = str(LEVEL) + ('>' * LEVEL)
        print(R, pre, 'starting ', CLR, fn.__name__,
            '{}...'.format(doc[:20]))

        results = fn(*args, **kwargs)

        end = time.time()
        print(G, pre, 'complete ', CLR, fn.__name__,
                'In {}{}{} seconds'.format(B, end - start, CLR))
        print('')
        LEVEL -= 1
        return results
    return new_fn

def cache(fn):
    '''
    save results of this function to file for future caching
    Kwargs are not counted in function signature
    '''
    root = '../cache/'
    if not os.path.exists(root):
        os.system('mkdir '+ root)

    @wraps(fn)
    def new_fn(*args, **kwargs):
        name, string = fn.__name__, (fn.__name__, args)
        key = md5(string.__str__().encode()).hexdigest()
        if os.path.exists(root + key):
            print('Found cached. Name: ', key)
            with open(root + key, 'rb') as fl:
                result = pickle.load(fl)
        else:
            print('Calculating. Not found in cache...')
            result = fn(*args, **kwargs)  # Calculate
            with open(root + key, 'wb') as fl:
                pickle.dump(result, fl)  # Save results
            print('Saved to cache for future runs. Name: ', key)
        return result
    return new_fn
" > scripts/mlpr.py

echo "Creating environment"
virtualenv -p python3 env
source env/bin/activate
# We do this to install all that we can and not fail alltogether
# on a pip install -r req.txt
pip install numpy 
pip install scipy 
pip install sklearn 
pip install pandas 
pip install tqdm 
pip install matplotlib 
pip install pillow 
pip install seaborn 
pip install xgboost
pip freeze > requirements.txt
echo "Complete"