-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUtil.py
97 lines (73 loc) · 2.81 KB
/
Util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from pathlib import Path
import pandas as pd
class Util:
    """Helper for splitting and loading the CSV datasets stored under ``<name>/data``.

    Every path used by this class is built as ``f"{name}/data/..."``. The
    loaders expect the files X_train.csv, X_val.csv, X_test.csv, y_train.csv,
    y_val.csv and y_test.csv to exist in that folder.
    """

    def __init__(self, name):
        """
        Initialize the class

        Args:
            name: Directory prefix under which the ``data`` folder lives.
        """
        self.name = name

    def data_processing(self):
        """
        Process the raw data (placeholder: this implementation does nothing).

        An implementation is expected to save the processed data in the data
        folder as X_train.csv, X_val.csv, X_test.csv, y_train.csv, y_val.csv,
        y_test.csv so that the default methods of this class can be used.
        """
        pass

    def data_division(self, n_workers, *, drop_remainder=True):
        """
        Divide the training data into n_workers contiguous subsets.

        The subsets are saved in the ``[n_workers]_workers`` folder as
        x_train_subset_[i].csv, y_train_subset_[i].csv, where
        i = 1, 2, ..., n_workers. The files are written without an index
        column and without a header row.

        Args:
            n_workers: Number of subsets to create. Must be at least 1 and
                not larger than the number of rows in X_train.csv.
            drop_remainder: When True (the default), every subset has exactly
                ``rows // n_workers`` rows and the trailing
                ``rows % n_workers`` rows are not written to any subset.
                When False, those leftover rows are handed out one each to
                the first subsets so that no training row is lost.

        Raises:
            ValueError: If n_workers is smaller than 1 or larger than the
                number of training rows.
        """
        # Validate before touching the filesystem so that a bad argument does
        # not leave an empty "<n>_workers" directory behind.
        if n_workers < 1:
            raise ValueError(
                f"n_workers must be a positive integer, got {n_workers!r}"
            )

        print(f'Dividing the data for {n_workers} workers...')

        x_train_path = f"{self.name}/data/X_train.csv"
        y_train_path = f"{self.name}/data/y_train.csv"
        x_train = pd.read_csv(x_train_path)
        y_train = pd.read_csv(y_train_path)

        n_rows = x_train.shape[0]
        if n_workers > n_rows:
            # Without this guard the subset size would be 0 and the split
            # would fail with an unrelated-looking range() error.
            raise ValueError(
                f"cannot divide {n_rows} training rows among {n_workers} workers"
            )

        output_path = f"{self.name}/data/{n_workers}_workers"
        Path(output_path).mkdir(parents=True, exist_ok=True)

        subset_size, leftover = divmod(n_rows, n_workers)
        if drop_remainder:
            leftover = 0

        start = 0
        for file_number in range(1, n_workers + 1):
            print(f'Creating subset {file_number}...')
            # The first `leftover` subsets take one extra row each.
            stop = start + subset_size + (1 if file_number <= leftover else 0)
            x_train_subset = x_train.iloc[start:stop]
            y_train_subset = y_train.iloc[start:stop]
            x_train_subset.to_csv(
                f'{output_path}/x_train_subset_{file_number}.csv',
                index=False,
                header=False,
            )
            y_train_subset.to_csv(
                f'{output_path}/y_train_subset_{file_number}.csv',
                index=False,
                header=False,
            )
            start = stop

    def load_training_data(self):
        """
        Load the training data

        Returns:
            The pair (x_train, y_train) read from X_train.csv and y_train.csv.
        """
        x_train = pd.read_csv(f"{self.name}/data/X_train.csv")
        y_train = pd.read_csv(f"{self.name}/data/y_train.csv")
        return x_train, y_train

    def load_validation_data(self):
        """
        Load the validation data

        Returns:
            The pair (x_val, y_val) read from X_val.csv and y_val.csv.
        """
        x_val = pd.read_csv(f"{self.name}/data/X_val.csv")
        y_val = pd.read_csv(f"{self.name}/data/y_val.csv")
        return x_val, y_val

    def load_test_data(self):
        """
        Load the test data

        Returns:
            The pair (x_test, y_test) read from X_test.csv and y_test.csv.
        """
        x_test = pd.read_csv(f"{self.name}/data/X_test.csv")
        y_test = pd.read_csv(f"{self.name}/data/y_test.csv")
        return x_test, y_test

    def load_worker_data(self, n_workers, worker_id):
        """
        Load the data of a worker

        Reads the subset files written by ``data_division`` for the given
        number of workers. Those files have no header row, so they are read
        with ``header=None`` and the resulting columns are labelled by
        position (0, 1, ...).

        Args:
            n_workers: Number of workers the data was divided for.
            worker_id: Subset number, starting at 1.

        Returns:
            The pair (x_train, y_train) for that worker.
        """
        x_train = pd.read_csv(f"{self.name}/data/{n_workers}_workers/x_train_subset_{worker_id}.csv", header=None)
        y_train = pd.read_csv(f"{self.name}/data/{n_workers}_workers/y_train_subset_{worker_id}.csv", header=None)
        return x_train, y_train

    def create_model(self):
        """
        Create the model (placeholder: this implementation does nothing).
        """
        pass