-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathopen_world_cifar.py
246 lines (198 loc) · 8.91 KB
/
open_world_cifar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
from __future__ import print_function
from PIL import Image
import os
import os.path
import numpy as np
import sys
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
import torchvision
import torch.utils.data as data
from torchvision import transforms
import itertools
from torch.utils.data.sampler import Sampler
class OPENWORLDCIFAR100(torchvision.datasets.CIFAR100):
    """CIFAR-100 variant for open-world / novel-class-discovery experiments.

    The first ``labeled_num`` classes (or the classes named in ``class_list``,
    remapped to the front of the label space) are treated as "known"; a random
    ``labeled_ratio`` fraction of their training samples forms the labeled
    split, everything else the unlabeled split.

    Args:
        root: dataset root directory (passed to torchvision's CIFAR100).
        labeled: if True (and ``train``), keep only the labeled split;
            otherwise keep the samples at ``unlabeled_idxs``.
        labeled_num: number of known classes (labels ``0..labeled_num-1``).
        labeled_ratio: probability a known-class sample is labeled.
        rand_number: seed for the labeled/unlabeled split.
        class_list: optional list of class NAMES to treat as known; their
            labels are remapped to ``0..len(class_list)-1``.
        unlabeled_idxs: indices to keep when ``labeled=False`` (typically the
            ``unlabeled_idxs`` attribute of the labeled instance).
    """

    def __init__(self, root, labeled=True, labeled_num=50, labeled_ratio=0.5, rand_number=0, transform=None, target_transform=None, train=True, class_list=None,
                 download=False, unlabeled_idxs=None):
        super(OPENWORLDCIFAR100, self).__init__(root, train, transform, target_transform, download)

        if self.train:
            downloaded_list = self.train_list
        else:
            downloaded_list = self.test_list

        self.data = []
        self.targets = []

        # Re-load the pickled batches ourselves (mirrors torchvision's own
        # loader) so data/targets are in a known state before label remapping.
        for file_name, checksum in downloaded_list:
            file_path = os.path.join(self.root, self.base_folder, file_name)
            with open(file_path, 'rb') as f:
                entry = pickle.load(f, encoding='latin1')
                self.data.append(entry['data'])
                if 'labels' in entry:
                    self.targets.extend(entry['labels'])
                else:
                    self.targets.extend(entry['fine_labels'])

        self.targets = np.array(self.targets)
        if class_list is not None:
            # Remap labels so the named known classes occupy indices
            # [0, len(class_list)) and all remaining (novel) classes follow.
            known_idx = [self.class_to_idx[class_name] for class_name in class_list]
            novel_idx = list(set(range(100)) - set(known_idx))
            label_map = [None for _ in range(100)]
            for new_idx, old_idx in enumerate(known_idx + novel_idx):
                label_map[old_idx] = new_idx
            self.targets = np.array(label_map)[self.targets]
            print("target transformed")

        self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
        self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC

        labeled_classes = range(labeled_num)
        np.random.seed(rand_number)  # NOTE: seeds NumPy's *global* RNG

        if train:
            if labeled:
                self.labeled_idxs, self.unlabeled_idxs = self.get_labeled_index(labeled_classes, labeled_ratio)
                self.shrink_data(self.labeled_idxs)
            else:
                self.shrink_data(unlabeled_idxs)

    def get_labeled_index(self, labeled_classes, labeled_ratio):
        """Split sample indices into (labeled_idxs, unlabeled_idxs).

        A sample of a known class is labeled with probability
        ``labeled_ratio``; all other samples are unlabeled.  The RNG draw
        happens only for known-class samples (short-circuit), so the split is
        reproducible given the seed set in ``__init__``.
        """
        labeled_idxs = []
        unlabeled_idxs = []
        for idx, label in enumerate(self.targets):
            if label in labeled_classes and np.random.rand() < labeled_ratio:
                labeled_idxs.append(idx)
            else:
                unlabeled_idxs.append(idx)
        return labeled_idxs, unlabeled_idxs

    def shrink_data(self, idxs):
        """Keep only the samples at ``idxs`` (in place).

        Coerce targets to an ndarray first so fancy indexing works even when
        ``self.targets`` is already a plain list (e.g. after a previous
        shrink) — consistent with OPENWORLDCIFAR10.shrink_data.
        """
        targets = np.array(self.targets)
        self.targets = targets[idxs].tolist()
        self.data = self.data[idxs, ...]
class OPENWORLDCIFAR10(torchvision.datasets.CIFAR10):
    """CIFAR-10 split into labeled (known-class) and unlabeled parts for
    open-world recognition experiments.

    Labels ``0..labeled_num-1`` are the known classes; a random
    ``labeled_ratio`` fraction of their training samples becomes the labeled
    split, everything else the unlabeled split (seeded by ``rand_number``).
    """

    def __init__(self, root, labeled=True, labeled_num=5, labeled_ratio=0.5, rand_number=0, transform=None, target_transform=None, train=True,
                 download=False, unlabeled_idxs=None):
        super(OPENWORLDCIFAR10, self).__init__(root, train, transform, target_transform, download)

        batch_list = self.train_list if self.train else self.test_list

        self.data = []
        self.targets = []

        # Load every pickled batch, collecting raw images and labels.
        for batch_name, _checksum in batch_list:
            batch_path = os.path.join(self.root, self.base_folder, batch_name)
            with open(batch_path, 'rb') as fh:
                batch = pickle.load(fh, encoding='latin1')
                self.data.append(batch['data'])
                label_key = 'labels' if 'labels' in batch else 'fine_labels'
                self.targets.extend(batch[label_key])

        self.targets = np.array(self.targets)
        self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
        self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC

        labeled_classes = range(labeled_num)
        np.random.seed(rand_number)

        if train:
            if labeled:
                self.labeled_idxs, self.unlabeled_idxs = self.get_labeled_index(labeled_classes, labeled_ratio)
                self.shrink_data(self.labeled_idxs)
            else:
                self.shrink_data(unlabeled_idxs)

    def get_labeled_index(self, labeled_classes, labeled_ratio):
        """Assign each known-class sample to the labeled split with
        probability ``labeled_ratio``; every other sample goes unlabeled.
        Returns ``(labeled_idxs, unlabeled_idxs)``.
        """
        labeled_idxs, unlabeled_idxs = [], []
        for sample_idx, target in enumerate(self.targets):
            # rand() is drawn only for known-class samples (short-circuit),
            # matching the reproducible split order of the seeded RNG.
            is_labeled = target in labeled_classes and np.random.rand() < labeled_ratio
            (labeled_idxs if is_labeled else unlabeled_idxs).append(sample_idx)
        return labeled_idxs, unlabeled_idxs

    def shrink_data(self, idxs):
        """Restrict the dataset to the given sample indices (in place)."""
        all_targets = np.array(self.targets)
        self.targets = all_targets[idxs].tolist()
        self.data = self.data[idxs, ...]
# Dictionary of transforms, keyed by split name.
# CIFAR-100 channel statistics used by every pipeline below.
_CIFAR_MEAN = (0.5071, 0.4867, 0.4408)
_CIFAR_STD = (0.2675, 0.2565, 0.2761)

dict_transform = {
    # Legacy training augmentation: pad-crop + horizontal flip.
    'cifar_train_oldxxx': transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
    ]),
    # Contrastive-style training augmentation: resized crop, flip,
    # color jitter (p=0.8), random grayscale (p=0.2).
    'cifar_train': transforms.Compose([
        transforms.RandomResizedCrop(32, scale=(0.2, 1.)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.ToTensor(),
        transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
    ]),
    # Evaluation: tensor conversion + normalization only.
    'cifar_test': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
    ])
}
import numpy as np
import torch
from bisect import bisect_left
class TinyImages(torch.utils.data.Dataset):
    """80 Million Tiny Images, read directly from the raw binary file.

    Each record is a 32x32x3 uint8 image stored column-major (3072 bytes per
    image).  Optionally skips indices that overlap with CIFAR, using the
    published overlap-index list.  Every item is returned with class 0
    (the dataset is used as unlabeled/outlier data).
    """

    def __init__(self, transform=None, exclude_cifar=True):
        # Kept open for the dataset's lifetime; load_image seeks into it on
        # every access (closing it here would break __getitem__).
        data_file = open('/nobackup-slow/dataset/80million/tiny_images.bin', "rb")

        def load_image(idx):
            data_file.seek(idx * 3072)
            data = data_file.read(3072)
            # np.fromstring is deprecated (removed in NumPy 2.0); frombuffer
            # is the supported replacement.  .copy() makes the array writable,
            # matching fromstring's behavior (frombuffer views are read-only).
            return np.frombuffer(data, dtype='uint8').reshape(32, 32, 3, order="F").copy()

        self.load_image = load_image
        self.offset = 0  # offset index

        self.transform = transform
        self.exclude_cifar = exclude_cifar

        if exclude_cifar:
            self.cifar_idxs = []
            with open('/nobackup-slow/dataset/80million/80mn_cifar_idxs.txt', 'r') as idxs:
                for idx in idxs:
                    # indices in file take the 80mn database to start at 1, hence "- 1"
                    self.cifar_idxs.append(int(idx) - 1)

            # Hash-set membership test: O(1) per lookup.
            self.cifar_idxs = set(self.cifar_idxs)
            self.in_cifar = lambda x: x in self.cifar_idxs

    def __getitem__(self, index):
        # Wrap around the full dataset size (79302017 images).  The previous
        # modulus 79302016 was off by one vs __len__ and the randint bound
        # below: it skipped the last image and aliased it onto index 0.
        index = (index + self.offset) % 79302017

        if self.exclude_cifar:
            # Resample until we land on a non-CIFAR image.
            while self.in_cifar(index):
                index = np.random.randint(79302017)

        img = self.load_image(index)
        if self.transform is not None:
            img = self.transform(img)

        return img, 0  # 0 is the class

    def __len__(self):
        return 79302017
class RandomImages(torch.utils.data.Dataset):
    """300K Random Images stored as a single ``.npy`` array of uint8 images.

    Args:
        root: path to the ``.npy`` file (loaded with ``np.load``).
        transform: optional per-image transform.
        exclude_cifar: accepted for API parity with TinyImages; unused here.

    Every item is returned with class 0 (unlabeled/outlier data).
    """

    def __init__(self, root, transform=None, exclude_cifar=False):
        data_file = np.load(root)

        def load_image(idx):
            # Rows are already HWC uint8; asarray avoids a copy when the
            # dtype already matches.
            return np.asarray(data_file[idx], dtype='uint8')

        self.load_image = load_image
        self.offset = 0  # offset index

        self.transform = transform
        self.exclude_cifar = exclude_cifar  # kept for API parity; unused

    def __getitem__(self, index):
        # Wrap around the dataset size.  The previous modulus 299999 was off
        # by one vs __len__ (300000): it aliased index 299999 onto 0 and
        # never served the last image.
        index = (index + self.offset) % 300000

        img = self.load_image(index)
        if self.transform is not None:
            img = self.transform(img)

        return img, 0  # 0 is the class

    def __len__(self):
        return 300000