-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
65 lines (44 loc) · 1.89 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Train a linear regression model for predicting home prices.
This script reads data from a CSV file, preprocesses it by removing rows with zero prices
and capping prices above 1,000,000, splits it into training and testing sets, trains the
model, and saves the test data together with the learned weights and bias in a pickle file.
Usage:
python train.py
Requirements:
- The `os` module (standard library).
- The `pickle` module (standard library, imported as `pk`).
- The `pandas` module (imported as `pd`) must be installed.
- The `lin_utils` module must be available in the same directory.
Note:
Before running this script, make sure a CSV file named 'data.csv' exists in the 'data'
subdirectory (the script reads 'data/data.csv') with the appropriate data columns.
"""
import os
import pickle as pk
import pandas as pd
from lin_utils import train
# Columns fed to the model; shared by the training and test selections so the
# two feature sets cannot drift apart.
FEATURE_COLUMNS = ['sqft_lot', 'sqft_living', 'bathrooms', 'bedrooms', 'condition', 'avgprice']

# Load the raw listings.
df = pd.read_csv("data/data.csv")

# Discard listings whose price is exactly zero.
df = df[df['price'] != 0.0]

# Shuffle the rows and renumber the index 0..n-1 so that the label-based
# slices further down line up with row positions.
# NOTE(review): no random_state is given, so every run produces a different
# split -- confirm whether reproducibility is required.
df = df.sample(frac=1).reset_index(drop=True)

# Cap (rather than remove) expensive homes: any price above 1,000,000 is
# replaced by 999,999; prices at or below 1,000,000 are kept unchanged.
df['price'] = df['price'].where(df['price'] <= 1000000, 999999)

# Mean price per 'statezip', attached to every row as the 'avgprice' feature.
# NOTE(review): the means are taken over all rows, including the ones held out
# for testing below, so this feature carries price information from the test
# rows -- confirm whether that is intended.
avg_prices = df.groupby('statezip')['price'].mean()
df['avgprice'] = df['statezip'].map(avg_prices)

# 80/20 split. `.loc` slicing is label-based and inclusive at both ends, so the
# training set covers labels 0..train_split and the test set starts at the
# next label to avoid sharing a row.
rows = df.shape[0]
train_split = int(rows * 0.80)
test_split = train_split + 1

X_train = df.loc[:train_split, FEATURE_COLUMNS]
y_train = df.loc[:train_split, 'price']
X_test = df.loc[test_split:, FEATURE_COLUMNS]
y_test = df.loc[test_split:, 'price']

# Fit the model with the project-local trainer.
n_features = X_train.shape[1]
weights, bias = train(X_train, y_train, n_features)

# Create the output directory if needed; exist_ok avoids the separate
# existence check and the race between checking and creating.
os.makedirs("pickle", exist_ok=True)

# The four objects are written as consecutive pickles in one file; a reader
# must call load() four times in this same order.
with open('pickle/test.pkl', 'wb') as handle:
    pk.dump(X_test, handle)
    pk.dump(y_test, handle)
    pk.dump(weights, handle)
    pk.dump(bias, handle)