#!/bin/bash
export VERSION='0.1.5'
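# Usage: ./mlpr
# The script is interactive: it prompts for a project name and an author
# name, then builds the project skeleton in ./<project> under the current
# directory.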
echo -e "
\033[92m
MachineLearningPRoject
\033[0m
MLPR is a Machine Learning Project helper script
which creates a project directory to get you started.
It does the following:
- makes data, cache, and scripts directories
- creates a scripts/mlpr.py helper module along with a scripts/main.py
- creates a README.md for note taking, adding you as author and
  timestamping the project start date and time
- creates a python3 virtualenv and attempts to install commonly used
  ML libraries in it
"
echo -e "\033[92mName of Project:\033[0m"
read -r project
echo -e "\033[92mAuthor Name:\033[0m"
read -r author
echo "Making the project directories"
mkdir $project
cd $project
echo "Making README files"
echo "
Meta
====
$project
$author
$(date)
mlpr version = '$VERSION'
- data: for raw data files
- cache: used by @cache to cache expensive functions
- scripts: holds all scripts. To be used as PWD
Notes
-----" >> README.md
echo "Making directories for working"
mkdir data scripts cache
echo "Making default scripts"
# DEFAULT PROJECT TEMPLATE
echo "# $project
# $author
# $(date)
import numpy as np
np.random.seed(1)
import pandas as pd
from tqdm import tqdm
# see https://github.com/theSage21/reimagined-chainsaw
# mlpr.__version__ = $VERSION
from mlpr import report, cache
#import xgboost as xgb
#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
#from sklearn.pipeline import Pipeline, FeatureUnion
#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#from sklearn.linear_model import LinearRegression, Ridge
#from sklearn.preprocessing import (Imputer, MinMaxScaler,
# RobustScaler, PolynomialFeatures,
# LabelEncoder, OneHotEncoder,
# StandardScaler)
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import GridSearchCV
#from sklearn.decomposition import PCA
# -----------------------------------------------------------------------------
@report
def load_db():
df = pd.read_csv('../data/train.csv')
return df
@report
@cache
def process_data(df):
return df
@report
def make_pipe():
rf = RandomForestClassifier(n_jobs=-1, n_estimators=30)
return rf
@report
def cross_validate(df, pipe):
# TODO: scoring
scores = cross_val_score(pipe, x, y, scoring='accuracy', cv=10)
return scores
@report
@cache
def test_data_processing(ts):
"Processing specific to test data"
return ts
@report
def make_submission(pipe):
ts = pd.read_csv('../data/test.csv')
pts = process_data(ts)
pts = test_data_processing(pts)
predictions = pipe.predict(pts)
# TODO
pd.DataFrame({}
)[[]
].to_csv('../submission.csv', index=False)
# -----------------------------------------------------------------------------
if __name__ == '__main__':
df = load_db()
df = process_data(df)
pipe = make_pipe()
scores = cross_validate(df, pipe)
print(scores.mean(), scores.std(), scores)
x, y = df.drop('', axis=1), df[''] # TODO
pipe.fit(x, y)
make_submission(pipe)
# -----------------------------------------------------------------------------" > scripts/main.py
# TEMPLATE DONE
echo "import os
import pickle
from functools import wraps
from hashlib import md5
__all__ = ['report', 'cache']
__version__ = '$VERSION'
LEVEL = 0
def clear_cache():
'Used to clear all cache'
# TODO
def report(fn):
'report the beginning and ending of this functions calls.'
B, G, R, CLR = '\033[94m', '\033[92m','\033[91m', '\033[0m'
import time
def new_fn(*args, **kwargs):
doc, start = fn.__doc__ or fn.__name__, time.time()
global LEVEL
LEVEL += 1
pre = str(LEVEL) + ('>' * LEVEL)
print(R, pre, 'starting ', CLR, fn.__name__,
'{}...'.format(doc[:20]))
results = fn(*args, **kwargs)
end = time.time()
print(G, pre, 'complete ', CLR, fn.__name__,
'In {}{}{} seconds'.format(B, end - start, CLR))
print('')
LEVEL -= 1
return results
return new_fn
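
# Example usage (a sketch; 'train' is a made-up name, not part of the
# generated template):
#
#     @report
#     def train():
#         'Train the model'
#         ...
#
# Each call prints a colored starting line, runs the function, then
# prints a colored completion line with the wall-clock time; nesting
# depth shows up as 1>, 2>>, and so on.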

def cache(fn):
    '''
    Save results of this function to file for future runs.
    Kwargs are not counted in the function signature.
    '''
    if not os.path.exists(ROOT):
        os.makedirs(ROOT)

    @wraps(fn)
    def new_fn(*args, **kwargs):
        # Key on the function name and positional args only
        key = md5(str((fn.__name__, args)).encode()).hexdigest()
        path = os.path.join(ROOT, key)
        if os.path.exists(path):
            print('Found cached. Name: ', key)
            with open(path, 'rb') as fl:
                result = pickle.load(fl)
        else:
            print('Calculating. Not found in cache...')
            result = fn(*args, **kwargs)  # Calculate
            with open(path, 'wb') as fl:
                pickle.dump(result, fl)  # Save results
            print('Saved to cache for future runs. Name: ', key)
        return result
    return new_fn
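
# Example usage (a sketch; 'featurize' is a made-up name): results are
# pickled to ../cache keyed on (function name, positional args), so a
# second run with the same args loads the pickle instead of recomputing.
# Because kwargs are ignored in the key, featurize(df, fast=True) and
# featurize(df, fast=False) would share one cache entry.
#
#     @report
#     @cache
#     def featurize(df):
#         'Expensive feature engineering'
#         return df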
" > scripts/mlpr.py
echo "Creating environment"
virtualenv -p python3 env
source env/bin/activate
# Install packages one at a time so that a single failure does not
# abort everything, the way a plain 'pip install -r requirements.txt' would
pip install numpy
pip install scipy
pip install scikit-learn
pip install pandas
pip install tqdm
pip install matplotlib
pip install pillow
pip install seaborn
pip install xgboost
pip freeze > requirements.txt
echo "Complete"