-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
150 lines (129 loc) · 4.46 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from typing import List
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning.trainer.supporters import CombinedLoader
from transformers import T5Tokenizer
class QADataset(Dataset):
    """Map-style dataset that tokenizes question/answer rows on access.

    Every row of ``data`` is read through the columns ``question``,
    ``answer_text`` and ``aliases``.  The question becomes the model input,
    the answer text becomes the labels, and the raw strings are passed
    through unchanged so callers can use them for evaluation.

    Args:
        data: Frame holding one example per row.
        tokenizer: Callable tokenizer; invoked with the text plus the
            keyword arguments shown in ``_encode`` and expected to return a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        source_max_token_len: Padded/truncated length of the question.
        target_max_token_len: Padded/truncated length of the answer.
    """

    # Value written over padding positions in the labels.  -100 is the
    # default ``ignore_index`` of torch's cross-entropy loss.
    _IGNORE_INDEX = -100
    # Pad id assumed when the tokenizer does not expose ``pad_token_id``
    # (this is the value the labels were masked with historically).
    _DEFAULT_PAD_ID = 0

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        source_max_token_len: int,
        target_max_token_len: int
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` ids (pad or truncate)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation="only_first",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def _pad_token_id(self) -> int:
        """Return the tokenizer's pad id, or the historical default of 0."""
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        return self._DEFAULT_PAD_ID if pad_id is None else pad_id

    def __getitem__(self, index: int):
        """Return the tokenized example stored at positional ``index``.

        Returns:
            A dict with the raw ``question`` and ``answer_text``, the row's
            ``aliases`` value wrapped in a one-element list, and the
            flattened ``input_ids``, ``attention_mask`` and ``labels``
            tensors.
        """
        data_row = self.data.iloc[index]

        source_encoding = self._encode(
            data_row["question"], self.source_max_token_len)
        target_encoding = self._encode(
            data_row["answer_text"], self.target_max_token_len)

        # Mask padding in the labels using the tokenizer's own pad id rather
        # than a hard-coded 0, so tokenizers with a different pad id are
        # masked correctly as well.
        labels = target_encoding["input_ids"]
        labels[labels == self._pad_token_id()] = self._IGNORE_INDEX

        return dict(
            question=data_row["question"],
            aliases=[data_row['aliases']],
            answer_text=data_row["answer_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
        )
class QADataModule(pl.LightningDataModule):
    """Lightning data module serving one training set and several eval sets.

    The training frame is wrapped in a single ``QADataset``; every frame in
    ``test_df`` gets its own ``QADataset``.  The same evaluation datasets
    back both ``val_dataloader`` and ``test_dataloader``.

    Args:
        train_df: Frame with the training examples.
        test_df: List of frames, one per evaluation set.
        tokenizer: Tokenizer handed to every ``QADataset``.
        batch_size: Batch size used by all data loaders.
        source_max_token_len: Maximum token length of the question.
        target_max_token_len: Maximum token length of the answer.
        num_workers: Worker processes per ``DataLoader``.  Defaults to 24,
            the value that used to be hard-coded.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: List[pd.DataFrame],
        tokenizer: T5Tokenizer,
        batch_size: int,
        source_max_token_len: int,
        target_max_token_len: int,
        num_workers: int = 24
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_dfs = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        self.num_workers = num_workers

    def _build_dataset(self, df: pd.DataFrame):
        """Wrap ``df`` in a ``QADataset`` using this module's settings."""
        return QADataset(
            df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def setup(self, stage=None):
        """Create the training dataset and the list of evaluation datasets.

        Args:
            stage: Stage name supplied by the Lightning trainer when it
                invokes this hook.  It is accepted so trainer-driven calls
                do not fail, and is otherwise unused: all datasets are
                built regardless of stage.  Calling ``setup()`` with no
                argument keeps working.
        """
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_datasets = [
            self._build_dataset(test_df) for test_df in self.test_dfs
        ]

    def collate_fn(self, batch):
        """Merge a list of ``QADataset`` items into one batch dict.

        String fields are gathered into plain lists; the per-example
        tensors are stacked along a new leading dimension.  Note that the
        per-item key ``answer_text`` is emitted under the batch key
        ``answers``.
        """
        def stack(key):
            # torch.stack joins the per-example tensors directly instead of
            # rebuilding them element by element from Python lists.
            return torch.stack([torch.as_tensor(b[key]) for b in batch])

        return {
            "question": [b['question'] for b in batch],
            "aliases": [b['aliases'] for b in batch],
            "answers": [b['answer_text'] for b in batch],
            "input_ids": stack('input_ids'),
            "attention_mask": stack('attention_mask'),
            "labels": stack('labels'),
        }

    def _make_loader(self, dataset, shuffle: bool = False):
        """Build a ``DataLoader`` over ``dataset`` with the shared settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            drop_last=False,
            collate_fn=self.collate_fn
        )

    def _eval_loaders(self):
        """Combine one unshuffled loader per evaluation dataset.

        Loaders are keyed 'a', 'b', 'c', ... in the order of
        ``self.test_datasets``.  The "max_size_cycle" mode string is passed
        straight to ``CombinedLoader``.
        NOTE(review): presumably this cycles shorter loaders until the
        longest one is exhausted - confirm against the Lightning docs.
        """
        loaders = {
            chr(ord('a') + i): self._make_loader(test_dataset)
            for i, test_dataset in enumerate(self.test_datasets)
        }
        return CombinedLoader(loaders, "max_size_cycle")

    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the combined loader over all evaluation datasets."""
        return self._eval_loaders()

    def test_dataloader(self):
        """Return the combined loader over all evaluation datasets."""
        return self._eval_loaders()