-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmunge_data.py
46 lines (42 loc) · 1.59 KB
/
munge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
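# Munge data: convert the per-box annotations in train.csv into YOLO label files
# and copy the images into train/validation splits.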
import pandas as pd
import os
import ast
from sklearn import model_selection
from tqdm import tqdm
import numpy as np
import shutil
# Root of the raw inputs: 'train.csv' and the 'train/<image_id>.jpg' images are read from here.
data_path = 'data/input/'
# Root of the generated dataset: 'labels/<split>/' and 'images/<split>/' are written under it.
output_path = 'data/preprocessed/'
def _bbox_to_yolo_row(bbox, image_width, image_height):
    """Convert one ``[x, y, w, h]`` box into a normalised YOLO label row.

    The box centre is computed as ``(x + w / 2, y + h / 2)``; horizontal values
    are divided by ``image_width`` and vertical values by ``image_height``.
    The class id written in the first column is always 0.
    """
    x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3]
    return [
        0,
        (x + w / 2) / image_width,
        (y + h / 2) / image_height,
        w / image_width,
        h / image_height,
    ]


def process_data(data, data_type='train', image_width=1024, image_height=1024):
    """Write YOLO label files and copy images for one dataset split.

    For every row of ``data`` a text file
    ``<output_path>/labels/<data_type>/<image_id>.txt`` is written with one
    line per bounding box (``class x_center y_center width height``), and the
    image ``<data_path>/train/<image_id>.jpg`` is copied to
    ``<output_path>/images/<data_type>/<image_id>.jpg``.

    Args:
        data: DataFrame with an ``image_id`` column and a ``bboxes`` column
            holding, per image, a list of ``[x, y, w, h]`` boxes.
        data_type: Name of the split; used as the sub-directory name under
            ``labels/`` and ``images/``.
        image_width: Width used to normalise ``x`` and ``w``. Defaults to the
            previously hard-coded 1024.
        image_height: Height used to normalise ``y`` and ``h``. Defaults to the
            previously hard-coded 1024.
    """
    labels_dir = os.path.join(output_path, f"labels/{data_type}")
    images_dir = os.path.join(output_path, f"images/{data_type}")
    # Create the target folders up front so savetxt/copyfile do not fail on a
    # fresh checkout where the output tree does not exist yet.
    os.makedirs(labels_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    for _, row in tqdm(data.iterrows(), total=len(data)):
        image_name = row['image_id']
        yolo_rows = [
            _bbox_to_yolo_row(bbox, image_width, image_height)
            for bbox in row['bboxes']
        ]
        # Force a (n_boxes, 5) shape: with no boxes np.array() would give a
        # 1-D array that the 5-column fmt list below rejects.
        yolo_data = np.array(yolo_rows, dtype=float).reshape(-1, 5)
        np.savetxt(
            os.path.join(labels_dir, f"{image_name}.txt"),
            yolo_data,
            fmt=["%d", "%f", "%f", "%f", "%f"],
        )
        # Source images are always read from the 'train' input folder,
        # whichever split they are assigned to.
        shutil.copyfile(
            os.path.join(data_path, f"train/{image_name}.jpg"),
            os.path.join(images_dir, f"{image_name}.jpg"),
        )
if __name__ == "__main__":
    # Load the annotation table; each row holds one bounding box stored as a
    # string literal, which is parsed into a Python object here.
    annotations = pd.read_csv(os.path.join(data_path, 'train.csv'))
    annotations['bbox'] = annotations['bbox'].apply(ast.literal_eval)

    # Collapse to one row per image, gathering its boxes into a list.
    boxes_by_image = annotations.groupby('image_id')['bbox'].apply(list)
    per_image = boxes_by_image.reset_index(name='bboxes')

    # 90/10 shuffled split with a fixed seed.
    train_part, valid_part = model_selection.train_test_split(
        per_image,
        test_size=0.1,
        random_state=42,
        shuffle=True,
    )

    # Emit labels and images for each split, train first.
    for split_name, split_frame in (('train', train_part), ('validation', valid_part)):
        process_data(split_frame.reset_index(drop=True), data_type=split_name)