-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhdf5datasetwriter.py
57 lines (47 loc) · 2.16 KB
/
hdf5datasetwriter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# import packages
import h5py
import os
class HDF5DatasetWriter:
    """Buffered writer that stores rows and integer labels in an HDF5 file.

    Rows and labels passed to :meth:`add` are accumulated in memory and
    written to the file in slices once at least ``bufSize`` rows are
    buffered. Call :meth:`close` (or use the instance in a ``with``
    statement) so the remaining buffered entries are written and the
    file is closed.
    """

    def __init__(self, dims, outputPath, dataKey="images", bufSize=1000):
        """Create the output file and the two datasets it holds.

        Args:
            dims: Shape passed to the data dataset; ``dims[0]`` is also
                used as the length of the ``"labels"`` dataset.
            outputPath: Path of the HDF5 file to create. Must not exist.
            dataKey: Name of the dataset that receives the rows.
            bufSize: Number of buffered rows that triggers a flush.

        Raises:
            ValueError: If ``outputPath`` already exists.
        """
        # check to see if the output path exists, and if so, raise an
        # exception rather than silently overwriting the file
        if os.path.exists(outputPath):
            raise ValueError("The supplied 'outputPath' already "
                "exists and cannot be overwritten. Manually delete "
                "the file before continuing", outputPath)

        # open the HDF5 database for writing and create two datasets:
        # one to store images/features and another to store the class labels
        self.db = h5py.File(outputPath, "w")
        try:
            self.data = self.db.create_dataset(dataKey, dims, dtype="float")
            self.labels = self.db.create_dataset("labels", (dims[0],),
                dtype="int")
        except Exception:
            # do not leave the file handle open when dataset creation fails
            self.db.close()
            raise

        # store the buffer size, then initialize the buffer itself
        # along with the index into the datasets
        self.bufSize = bufSize
        self.buffer = {"data": [], "labels": []}
        self.idx = 0

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Flush pending entries and close the file when the block exits."""
        self.close()

    def add(self, rows, labels):
        """Buffer ``rows`` and ``labels``; flush once the buffer is full.

        Args:
            rows: Iterable of rows to append to the data buffer.
            labels: Iterable of labels to append to the label buffer.
        """
        # add the rows and labels to the buffer
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        # check to see if the buffer needs to be flushed to disk
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def flush(self):
        """Write the buffered entries to the datasets and reset the buffer."""
        # nothing buffered: skip the zero-length slice writes entirely
        if not self.buffer["data"]:
            return

        # write the buffers to disk then reset the buffer
        i = self.idx + len(self.buffer["data"])
        self.data[self.idx:i] = self.buffer["data"]
        self.labels[self.idx:i] = self.buffer["labels"]
        self.idx = i
        self.buffer = {"data": [], "labels": []}

    def storeClassLabels(self, classLabels):
        """Store the class label names in a ``"label_names"`` dataset.

        Args:
            classLabels: Sequence of label names; its length sets the
                size of the dataset.
        """
        # create a dataset to store the actual class label names
        # then store the class labels
        dt = h5py.special_dtype(vlen=str)
        labelSet = self.db.create_dataset("label_names",
            (len(classLabels),), dtype=dt)
        labelSet[:] = classLabels

    def close(self):
        """Flush any remaining buffered entries and close the file."""
        try:
            # check to see if there are any other entries in the buffer
            # that need to be flushed to disk
            if len(self.buffer["data"]) > 0:
                self.flush()
        finally:
            # close the dataset even if the final flush failed
            self.db.close()