# modeling.py
import torch
from torch import nn
import torch.nn.functional as F
import lightning as pl
import torchmetrics

class EyeGazeQAclip(pl.LightningModule):
    """Multiple-choice QA over gaze videos: CLIP-sized (512-d) question/answer
    features and temporally downsampled video features are fused by a
    transformer encoder, and a linear head scores the fused [CLS]
    representation with a single logit."""

    def __init__(self):
        super().__init__()
        # learnable [CLS]-style token prepended to the fused feature sequence
        self.class_token = nn.Parameter(torch.zeros(1, 1, 768))
        encoder_layer = nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True)
        self.model = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=3)
        self.classifier = nn.Linear(768, 1)
        self.correctchoice = 0
        self.numberofquestion = 0
        self.threshold = 0.1
        self.layer = nn.Linear(1024, 768)     # projects 1024-d image-answer features to model width
        self.cliplayer = nn.Linear(512, 768)  # projects 512-d CLIP features to model width
        self.testid = []
        self.testlabel = []
        self.testpredict = []
        self.questionnum = []
        self.test_step_outputs = []
        self.validation_step_outputs = []
        # temporal downsampling: kernel_size = stride = 32 shrinks the frame axis 32x
        self.conv = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=32, stride=32)
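
    # Hypothetical helper, not in the original file: every video input below
    # (the five choices plus the video answer, in both training and test)
    # runs through the same float-cast -> Conv1d-over-time -> 512-to-768
    # projection pipeline, so it is factored out here for reuse.
    def _encode_video(self, video):
        # (B, T, 512) -> (B, 512, T) -> conv -> (B, 512, T//32) -> (B, T//32, 512) -> (B, T//32, 768)
        return self.cliplayer(self.conv(video.to(torch.float32).transpose(2, 1)).transpose(1, 2))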

    def training_step(self, batch, batch_idx):
        (questionid, questionfeatures, videofeature1, videofeature2, videofeature3,
         videofeature4, videofeature5, textanswer, imageanswer, videoanswer, label) = batch
        videofeature1 = self._encode_video(videofeature1)
        videofeature2 = self._encode_video(videofeature2)
        videofeature3 = self._encode_video(videofeature3)
        videofeature4 = self._encode_video(videofeature4)
        videofeature5 = self._encode_video(videofeature5)
        image = self.layer(imageanswer)
        questionfeatures = self.cliplayer(questionfeatures.to(torch.float32))
        videoanswer = self._encode_video(videoanswer)
        textanswer = self.cliplayer(textanswer.to(torch.float32))
        # one batch is assumed to be [{question}, {n video-choice features}, {answer features}, {correct label}]
        features = torch.cat([questionfeatures, videofeature1, videofeature2, videofeature3,
                              videofeature4, videofeature5, image, textanswer, videoanswer], dim=1)
        class_token = self.class_token.expand(features.shape[0], -1, -1)
        _features = torch.cat([class_token, features], dim=1)
        multimodaloutput = self.model(_features)
        multimodalhead = multimodaloutput[:, 0]  # [CLS] representation, shape (batch, 768)
        logit = self.classifier(multimodalhead)  # linear head: one logit per example
        label = label.unsqueeze(1)
        # Lightning has already moved the batch to the module's device, so no explicit .to('cuda') is needed
        loss = F.binary_cross_entropy_with_logits(logit.float(), label.float())
        self.log('train_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        (questionid, questionfeatures, videofeature1, videofeature2, videofeature3,
         videofeature4, videofeature5, textanswer, imageanswer, videoanswer, label) = batch
        videofeature1 = self._encode_video(videofeature1)
        videofeature2 = self._encode_video(videofeature2)
        videofeature3 = self._encode_video(videofeature3)
        videofeature4 = self._encode_video(videofeature4)
        videofeature5 = self._encode_video(videofeature5)
        image = self.layer(imageanswer)
        questionfeatures = self.cliplayer(questionfeatures.to(torch.float32))
        videoanswer = self._encode_video(videoanswer)
        textanswer = self.cliplayer(textanswer.to(torch.float32))
        # score two variants of the fused sequence: without and with the image-answer features
        features = torch.cat([questionfeatures, videofeature1, videofeature2, videofeature3,
                              videofeature4, videofeature5, textanswer, videoanswer], dim=1)
        class_token = self.class_token.expand(features.shape[0], -1, -1)
        _features = torch.cat([class_token, features], dim=1)
        features_img = torch.cat([features, image], dim=1)
        _features_img = torch.cat([class_token, features_img], dim=1)
        multimodaloutput = self.model(_features)
        multimodaloutput_img = self.model(_features_img)
        multimodalhead = multimodaloutput[:, 0]  # [CLS] representation, shape (batch, 768)
        multimodalhead_img = multimodaloutput_img[:, 0]
        logit = self.classifier(multimodalhead)
        logit_img = self.classifier(multimodalhead_img)
        predicted = torch.sigmoid(logit)
        predicted_img = torch.sigmoid(logit_img)
        for item in predicted_img:
            self.testpredict.append(item.cpu().numpy())
        # random-guess baseline scores, compared against the model in on_test_epoch_end
        random_scores = torch.rand(features.shape[0], 1, device=self.device)
        self.test_step_outputs.append([predicted, predicted_img, random_scores, label])
        return predicted, predicted_img, random_scores, label

    def on_test_epoch_end(self) -> None:
        # self.testid = self.all_gather(self.testid)
        self.testlabel = self.all_gather(self.testlabel)
        self.questionnum = self.all_gather(self.questionnum)
        self.testpredict = self.all_gather(self.testpredict)
        predicted_img = [sublist[1] for sublist in self.test_step_outputs]
        random_scores = [sublist[2] for sublist in self.test_step_outputs]
        label = [sublist[-1] for sublist in self.test_step_outputs]
        predicted_img = torch.cat(predicted_img).to(self.device)
        label = torch.cat(label).to(self.device).unsqueeze(1)
        random_scores = torch.cat(random_scores).to(self.device)
        # average precision of the model's scores vs. the random-guess baseline
        metric = torchmetrics.AveragePrecision(task="binary").to(self.device)
        ap_img = metric(predicted_img, label.long())
        ap_random = metric(random_scores, label.long())
        self.log('ap', ap_img)
        self.log('ap_randomguess', ap_random)
        self.test_step_outputs.clear()  # free the stored outputs for the next run

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=6e-7)
        return optimizer
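

# ---------------------------------------------------------------------------
# Minimal smoke test: a sketch, not part of the original training setup. The
# shapes below are assumptions inferred from the layers above (512-d
# CLIP-style features, 1024-d image-answer features, a frame count divisible
# by the Conv1d stride of 32); the real dimensions and batch layout come from
# the project's dataloader.
if __name__ == "__main__":
    model = EyeGazeQAclip()
    B, T = 2, 64  # assumed batch size and per-clip frame count
    question = torch.randn(B, 1, 512)
    videos = [torch.randn(B, T, 512) for _ in range(5)]  # five video choices
    textanswer = torch.randn(B, 1, 512)
    imageanswer = torch.randn(B, 1, 1024)
    videoanswer = torch.randn(B, T, 512)
    label = torch.randint(0, 2, (B,)).float()
    batch = ("qid", question, *videos, textanswer, imageanswer, videoanswer, label)
    # outside a Trainer, self.log() is skipped with a warning, so this only
    # exercises the forward pass and loss computation
    loss = model.training_step(batch, 0)
    print("train loss:", loss.item())

    # wiring into Lightning would look like (assuming a suitable DataLoader):
    #   trainer = pl.Trainer(max_epochs=10, accelerator="auto")
    #   trainer.fit(model, train_loader)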