-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvideo_classification_test.py
121 lines (96 loc) · 4.25 KB
/
video_classification_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os

import pandas as pd
import torch
import torchvision.io as io
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize
from tqdm import tqdm
from transformers import TimesformerForVideoClassification, AutoImageProcessor
# ---------------------------------------------------------------------------
# Evaluation settings
# ---------------------------------------------------------------------------
BATCH_SIZE = 4  # Videos per batch
NUM_FRAMES = 8  # Frames sampled from each video
RESOLUTION = 224  # Frames are resized to RESOLUTION x RESOLUTION
FPS = 50  # Frame rate the source videos are assumed to have
TARGET_FPS = FPS // 2  # Frame rate after every other frame is dropped
BEST_MODEL_DIR = "best_model"  # Folder holding the fine-tuned checkpoint

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the test manifest and replace the emotion names with integer ids
test_manifest = pd.read_csv("manifests/test_fold_1.csv")
label_encoder = LabelEncoder()
# NOTE(review): the encoder is fitted on the labels present in this CSV only;
# presumably the resulting ids match the mapping used for fine-tuning - confirm.
test_manifest["Emotion"] = label_encoder.fit_transform(test_manifest["Emotion"])

# Restore the image processor and the fine-tuned classifier from disk
feature_extractor = AutoImageProcessor.from_pretrained(BEST_MODEL_DIR)
model = TimesformerForVideoClassification.from_pretrained(BEST_MODEL_DIR).to(device)

# Per-frame preprocessing: resize, then normalise with the processor's statistics
transform = Compose(
    [
        Resize((RESOLUTION, RESOLUTION)),
        Normalize(
            mean=feature_extractor.image_mean,
            std=feature_extractor.image_std,
        ),
    ]
)
# Define Dataset Class
class VideoDataset(Dataset):
    """Serve ``(frames, label)`` pairs for the videos listed in a manifest.

    The manifest must provide a ``file_path`` column and an ``Emotion``
    column; both are read once when the dataset is constructed.
    """

    def __init__(self, manifest, transform=None, num_frames=NUM_FRAMES):
        """Store the manifest columns and the sampling configuration.

        Args:
            manifest: Table indexed by column name with ``file_path`` and
                ``Emotion`` entries exposing ``.values``.
            transform: Optional callable applied to every sampled frame.
            num_frames: Number of frames returned for each video.
        """
        self.num_frames = num_frames
        self.transform = transform
        self.labels = manifest["Emotion"].values
        self.file_paths = manifest["file_path"].values

    def __len__(self):
        """Return the number of videos in the manifest."""
        return len(self.file_paths)

    def _prepare_frame(self, frame):
        """Move the last axis first, scale to floats in [0, 1], run the transform."""
        # assumes each decoded frame is height x width x channel - TODO confirm
        channel_first = frame.permute(2, 0, 1).float() / 255.0
        return self.transform(channel_first)

    def __getitem__(self, idx):
        """Decode video ``idx`` and return its sampled frames and label.

        Returns:
            A ``(frames, label)`` tuple. ``frames`` holds ``num_frames`` frames
            (stacked after the transform when one is set, otherwise the raw
            decoded frames); ``label`` is a ``torch.long`` scalar tensor.
        """
        path = self.file_paths[idx]
        target = self.labels[idx]  # Integer class id taken from the manifest

        # Decode the whole video; the audio track and metadata are not used
        clip, _, _ = io.read_video(path, pts_unit="sec")

        # Videos longer than 8 s (at the assumed FPS) are truncated.
        # NOTE(review): the cut keeps TARGET_FPS * 8 frames *before* the
        # every-other-frame drop below, i.e. 4 s of source video with
        # FPS = 50 rather than 8 s. Left unchanged because it presumably has
        # to match the preprocessing used for fine-tuning - confirm.
        if clip.shape[0] / FPS > 8:
            clip = clip[:TARGET_FPS * 8]

        # Halve the frame rate by keeping every other frame (indices 0, 2, 4, ...)
        clip = clip[::2]

        # Pick num_frames positions spread evenly from the first to the last frame
        picks = torch.linspace(0, clip.shape[0] - 1, self.num_frames).long()
        chosen = clip[picks]

        target_tensor = torch.tensor(target, dtype=torch.long)
        if not self.transform:
            return chosen, target_tensor

        processed = torch.stack([self._prepare_frame(frame) for frame in chosen])
        return processed, target_tensor
# Create test dataset and dataloader
test_dataset = VideoDataset(test_manifest, transform=transform)
# shuffle=False (the DataLoader default) is spelled out because the
# wrong-prediction bookkeeping below maps batch positions back to dataset rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Make sure the output folder exists before the (potentially long) evaluation
# starts, so the results are not lost when it is time to write them.
os.makedirs("results", exist_ok=True)

# Evaluate the model
model.eval()
total_correct = 0
total_samples = 0
predicted_labels = []
wrong_items = []
with torch.no_grad():
    for videos, labels in tqdm(test_loader, desc="Evaluating"):
        videos = videos.to(device)
        labels = labels.to(device)
        outputs = model(pixel_values=videos)
        predictions = torch.argmax(outputs.logits, dim=1)
        total_correct += (predictions == labels).sum().item()

        # Dataset index of the first sample in this batch; the position inside
        # the batch alone would point at the wrong file for every batch after
        # the first one.
        batch_start = total_samples
        total_samples += labels.size(0)

        batch_predictions = predictions.cpu().tolist()
        batch_targets = labels.cpu().tolist()

        # Collect all predicted labels
        predicted_labels.extend(batch_predictions)

        # Collect wrong predictions together with the file they belong to
        for offset, (pred, true_label) in enumerate(zip(batch_predictions, batch_targets)):
            if pred != true_label:
                wrong_items.append({
                    "file_path": test_dataset.file_paths[batch_start + offset],
                    "predicted": pred,
                    "true_label": true_label,
                })

# An empty manifest yields no samples; report NaN instead of dividing by zero
accuracy = total_correct / total_samples if total_samples else float("nan")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Save results
pd.DataFrame(predicted_labels, columns=["Predicted Labels"]).to_csv("results/predicted_labels_1.csv", index=False)
# Explicit columns keep the header row even when there are no wrong predictions
pd.DataFrame(wrong_items, columns=["file_path", "predicted", "true_label"]).to_csv("results/wrong_items_1.csv", index=False)
print("Predicted labels saved to 'predicted_labels'")
print("Wrong predictions saved to 'wrong_items'")