run_supernet_training.py
# -*- coding: utf-8 -*-
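"""Launch supernet training runs in parallel across multiple GPUs.

Each positional argument is an `awnas search` YAML configuration file. One worker
process is spawned per GPU id in GPUs; workers pull configs from a shared queue and
invoke `awnas search` as a subprocess, saving results under --result-dir.
"""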
import os
import subprocess
import multiprocessing
import argparse
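# Default GPU ids; overridden by --gpus if given.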
GPUs = [0, 1, 2, 3]
parser = argparse.ArgumentParser()
parser.add_argument("--result-dir", required=True)
parser.add_argument("--seed", default=20)
parser.add_argument("--save-every", required=True)
parser.add_argument("--load", default=None)
parser.add_argument("--gpus", default=None)
args, cfgs = parser.parse_known_args()
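# Unrecognized arguments (collected in cfgs) are treated as paths to YAML config files.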
if args.gpus is not None:
    GPUs = [int(g) for g in args.gpus.split(",")]
cfgs = [os.path.abspath(cfg) for cfg in cfgs]
common_path = os.path.commonpath(cfgs)
os.makedirs(args.result_dir, exist_ok=True)
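# Result directories mirror the configs' layout relative to their common root,
# e.g. <common>/foo/bar.yaml -> <result-dir>/foo/bar (paths here are illustrative).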
rel_res_dirs = [os.path.relpath(cfg, common_path).replace(".yaml", "") for cfg in cfgs]
res_dirs = [os.path.join(args.result_dir, rel_dir) for rel_dir in rel_res_dirs]
for res_dir in res_dirs:
    os.makedirs(res_dir, exist_ok=True)
num_processes = len(GPUs)
print(
    "Num process: {}. Num exp: {}. Would save to: {}".format(
        num_processes, len(cfgs), res_dirs
    )
)
queue = multiprocessing.Queue(maxsize=num_processes)
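# Workers pull (cfg_file, res_dir) jobs from this queue; a None token tells a worker to exit.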
def _worker(p_id, gpu_id, queue):
    while True:
        token = queue.get()
        if token is None:
            break
        cfg_file, res_dir = token
        # os.makedirs(res_dir, exist_ok=True)
        # log_file = os.path.join(res_dir, "search_tail.log")
        load_str = "" if args.load is None else "--load {}".format(args.load)
        # Run `awnas search` on the assigned GPU; stdout/stderr are discarded.
        cmd = (
            "awnas search {} --gpu {} --seed {} --save-every {} --train-dir {} {}"
            " >/dev/null 2>&1"
        ).format(
            cfg_file, gpu_id, args.seed, args.save_every, res_dir, load_str
        )  # , log_file)
        print("Process #{}: cfg {}; CMD: {}".format(p_id, cfg_file, cmd))
        subprocess.check_call(cmd, shell=True)
    print("Process #{} end".format(p_id))
# Spawn one worker process per GPU.
for p_id in range(num_processes):
    p = multiprocessing.Process(target=_worker, args=(p_id, GPUs[p_id], queue))
    p.start()
# Enqueue one job per config.
for cfg, res_dir in zip(cfgs, res_dirs):
    queue.put((cfg, res_dir))
# close all the workers
for _ in range(num_processes):
    queue.put(None)
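# Example invocation (illustrative only; the config paths, result directory, and GPU ids
# below are assumptions, not part of this script):
#
#   python run_supernet_training.py --result-dir results/supernet --save-every 20 \
#       --gpus 0,1 configs/supernet_a.yaml configs/supernet_b.yaml
#
# Each YAML config is dispatched to `awnas search` on one of the listed GPUs, and its
# results are saved under results/supernet/ in a subdirectory named after the config.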