# svhn.py

import os
import sys
import numpy
import scipy.io

import gzip
import tarfile
import h5py
from PIL import Image
import six.moves.cPickle as pickle
from six.moves import urllib
import shutil

_SVHN_BASE_URL = 'http://ufldl.stanford.edu/housenumbers/'
# Every cropped house-number image is resized to this (width, height).
_IMAGE_SIZE = (32, 32)
# Maximum number of digits a label row can hold; rows are 1 + _MAX_DIGITS wide.
_MAX_DIGITS = 6
_BBOX_KEYS = ('height', 'label', 'left', 'top', 'width')


def _data_type_of(dataset):
    """Return the archive name without its two extensions ('train.tar.gz' -> 'train')."""
    return os.path.splitext(os.path.splitext(dataset)[0])[0]


def _ensure_archive(data_dir, dataset):
    """Return the path of ``dataset`` inside ``data_dir``, downloading it if absent."""
    data_path = os.path.join(data_dir, dataset)
    if not os.path.isfile(data_path):
        if data_dir and not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        origin = _SVHN_BASE_URL + dataset
        print('Downloading data from %s' % origin)
        # Download under a temporary name so that an interrupted transfer is
        # never mistaken for a complete archive on the next run.
        partial_path = data_path + '.part'
        urllib.request.urlretrieve(origin, partial_path)
        os.rename(partial_path, data_path)
    return data_path


def _read_digit_struct(mat_path):
    """Map each image file name in ``digitStruct.mat`` to its bounding-box dict.

    Each value is a dict with the keys 'height', 'label', 'left', 'top' and
    'width', every one holding a list with one entry per digit in the image.
    """
    with h5py.File(mat_path, 'r') as f:
        name_refs = f['digitStruct']['name']
        bbox_refs = f['digitStruct']['bbox']

        def get_name(n):
            # The name is stored as a column of character codes.
            return ''.join(chr(c[0]) for c in f[name_refs[n][0]][()])

        def read_attr(attr):
            # Read the dataset once; ``dataset[()]`` replaces the ``.value``
            # accessor that newer h5py releases no longer provide.
            values = attr[()]
            if len(values) > 1:
                # Several digits: each row is a reference to a 1x1 dataset.
                return [f[row.item()][()][0][0] for row in values]
            # Single digit: the value is stored inline.
            return [values[0][0]]

        def get_bbox(n):
            group = f[bbox_refs[n].item()]
            return dict((key, read_attr(group[key])) for key in _BBOX_KEYS)

        total = len(name_refs)
        image_dict = {}
        for i in range(total):
            image_dict[get_name(i)] = get_bbox(i)
            if i % 1000 == 0:
                print('     image dict processing: %i/%i complete' % (i, total))
    return image_dict


def _process_extracted(data_dir, data_type):
    """Turn the extracted ``data_type`` folder into ``<data_type>pkl.gz``.

    Crops every PNG to the box spanned by its digits, converts it to a 32x32
    grayscale array, pickles names/labels/images into ``data_dir`` and finally
    removes the extracted folder.
    """
    print('... processing data (should only occur when downloading for the first time)')
    folder = os.path.join(data_dir, data_type)

    # Process labels
    print('... creating image box bound dict for %s data' % data_type)
    image_dict = _read_digit_struct(os.path.join(folder, 'digitStruct.mat'))
    print('... dict processing complete')

    # Process the data
    print('... processing image data and labels')
    names = [item for item in os.listdir(folder) if item.endswith('.png')]

    labels = []
    images = []
    for i, name in enumerate(names):
        bbox = image_dict[name]
        labels.append(bbox['label'])
        image = Image.open(os.path.join(folder, name)).convert('L')
        left = int(min(bbox['left']))
        upper = int(min(bbox['top']))
        # NOTE(review): the right/lower edges combine the largest offset with
        # the largest extent, which can overshoot the tightest box around all
        # digits (max of left + width per digit). Kept as-is so regenerated
        # caches match previously generated ones - confirm whether intended.
        right = int(max(bbox['left'])) + int(max(bbox['width']))
        lower = int(max(bbox['top'])) + int(max(bbox['height']))
        image = image.crop(box=(left, upper, right, lower))
        image = image.resize(_IMAGE_SIZE)
        images.append(numpy.array(image))
        if i % 1000 == 0:
            print('     image processing: %i/%i complete' % (i, len(names)))
    print('... image processing complete')

    # Save data
    print('... pickling data')
    out = {'names': names, 'labels': labels, 'images': images}
    out_path = os.path.join(data_dir, data_type + 'pkl.gz')
    with gzip.open(out_path, 'wb') as handle:
        pickle.dump(out, handle)

    # clean up (delete test/train folders that were used to create the pickled data)
    shutil.rmtree(folder)


def _load_or_build(data_dir, dataset):
    """Return the pickled dict for ``dataset``, building the cache if missing."""
    data_type = _data_type_of(dataset)
    pickle_path = os.path.join(data_dir, data_type + 'pkl.gz')
    if not os.path.isfile(pickle_path):
        # Only touch the (large) archive when the cache really has to be built.
        data_path = _ensure_archive(data_dir, dataset)
        # Always re-extract: a folder left behind by an interrupted run must
        # neither cause processing to be skipped nor be processed half-empty.
        # NOTE(review): the archive is fetched over plain HTTP and extracted
        # without member filtering; consider tarfile extraction filters.
        with tarfile.open(data_path, 'r:gz') as tar:
            tar.extractall(data_dir)
        _process_extracted(data_dir, data_type)
    # NOTE: unpickling executes arbitrary code; only point data_dir at caches
    # produced by this module.
    with gzip.open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


def _convert_data_format(data):
    """Convert a pickled dict into an ``(X, y)`` tuple of numpy arrays.

    X holds one flattened image per row, scaled by 1/255. y holds one row per
    image: ``[number of digits - 1, digit 1, ..., digit 6]`` with missing
    digits padded with 0.

    Raises:
        ValueError: if a label has more than ``_MAX_DIGITS`` digits.
    """
    images = numpy.array(data['images'])
    flat = images.reshape(images.shape[0], -1) / 255.

    rows = []
    for digits in data['labels']:
        digits = list(digits)
        padding = _MAX_DIGITS - len(digits)
        if padding < 0:
            raise ValueError('label sequence has %d digits; at most %d are supported'
                             % (len(digits), _MAX_DIGITS))
        rows.append([len(digits) - 1] + digits + [0.0] * padding)
    return (flat, numpy.array(rows))


def load_data(data_dir):
    """Load the SVHN train/validation/test sets, preparing them on first use.

    For each of 'train' and 'test', the pickled cache ``<name>pkl.gz`` in
    ``data_dir`` is used when present. Otherwise ``<name>.tar.gz`` is
    downloaded (if not already in ``data_dir``), extracted and processed into
    that cache. The cache check runs every time load_data() is called.

    Args:
        data_dir: directory holding (or receiving) the archives and caches.

    Returns:
        ``[train_set, valid_set, test_set]``. Each set is a pair
        ``[flattened images, sequences]`` (a tuple for the test set).
        The images are a 2D numpy array where each row is one flattened 32x32
        grayscale image scaled by 1/255. The sequences are a 2D numpy array
        describing the number in each image: the first element is the length
        of the number (where 0 = a 1 digit number), the second element is the
        first digit of the number (where 0 means no digit present and
        10 = 0), and so on. The validation set is the last 10% of the
        original training data.
    """
    train_set = _convert_data_format(_load_or_build(data_dir, 'train.tar.gz'))
    test_set = _convert_data_format(_load_or_build(data_dir, 'test.tar.gz'))

    # Extract validation dataset from train dataset (10% of the train_set).
    # Splitting at an explicit index keeps the train set intact when there
    # are fewer than 10 samples (a "-0" slice would empty it instead).
    train_set_len = len(train_set[1])
    split = train_set_len - train_set_len // 10
    valid_set = [part[split:] for part in train_set]
    train_set = [part[:split] for part in train_set]

    return [train_set, valid_set, test_set]

if __name__ == '__main__':
    # The first command-line argument is the directory in which the SVHN
    # archives and pickled caches are stored; fail with a usage message
    # instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s DATA_DIR' % sys.argv[0])
    load_data(sys.argv[1])