forked from LARS-research/SNAG
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmol_utils.py
76 lines (70 loc) · 2.34 KB
/
mol_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
'''
Utilities for splitting a dataset loaded from DGL and for batching molecular graphs for a dataloader
'''
import torch
import dgl
from dgllife.utils import ScaffoldSplitter, RandomSplitter
from pdb import set_trace
def split_dataset(args, dataset):
    """Split the dataset into training, validation and test subsets.

    Parameters
    ----------
    args
        Settings. Two attributes are read:
        ``split_ratio`` -- a comma-separated string of three floats giving
        the train, validation and test fractions (e.g. ``'0.8,0.1,0.1'``);
        ``split`` -- the splitting method, ``'scaffold'`` or ``'random'``.
    dataset
        Dataset instance, handed unchanged to the selected splitter.

    Returns
    -------
    train_set
        Training subset
    val_set
        Validation subset
    test_set
        Test subset

    Raises
    ------
    ValueError
        If ``args.split`` is neither ``'scaffold'`` nor ``'random'``, or if
        ``args.split_ratio`` does not parse into exactly three floats.
    """
    train_ratio, val_ratio, test_ratio = map(float, args.split_ratio.split(','))
    if args.split == 'scaffold':
        train_set, val_set, test_set = ScaffoldSplitter.train_val_test_split(
            dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio,
            scaffold_func='smiles')
    elif args.split == 'random':
        train_set, val_set, test_set = RandomSplitter.train_val_test_split(
            dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio)
    else:
        # Raise (not return) the error: returning the exception object would
        # hand callers a ValueError instance where they expect three subsets.
        raise ValueError("Expect the splitting method to be 'scaffold' or 'random', got {}".format(args.split))
    return train_set, val_set, test_set
def collate_molgraphs(data):
    """Collate a list of datapoints into one batch for a dataloader.

    Parameters
    ----------
    data : list of 3-tuples or 4-tuples.
        One tuple per datapoint: a SMILES, a DGLGraph, the all-task
        labels and, optionally, a binary mask marking which labels exist.
        The tuple length is determined from the first datapoint.

    Returns
    -------
    smiles : list
        The SMILES of every datapoint, in input order.
    bg : DGLGraph
        All graphs merged with ``dgl.batch``, with zero initializers set
        for node and edge features.
    labels : Tensor
        The per-datapoint labels stacked along a new first dimension.
    masks : Tensor
        The per-datapoint masks stacked along a new first dimension, or an
        all-ones tensor shaped like ``labels`` when the datapoints carry
        no mask.
    """
    # Datapoints of length 3 have no mask; any other length is unpacked as 4.
    without_masks = len(data[0]) == 3

    # Transpose the list of tuples into one list per field.
    fields = [list(column) for column in zip(*data)]
    if without_masks:
        smiles, graphs, label_list = fields
        mask_list = None
    else:
        smiles, graphs, label_list, mask_list = fields

    bg = dgl.batch(graphs)
    bg.set_n_initializer(dgl.init.zero_initializer)
    bg.set_e_initializer(dgl.init.zero_initializer)

    labels = torch.stack(label_list, dim=0)
    if without_masks:
        # No mask supplied: treat every label as present.
        masks = torch.ones(labels.shape)
    else:
        masks = torch.stack(mask_list, dim=0)
    return smiles, bg, labels, masks