initial_state_features.py

from __future__ import print_function

import theano
import numpy as np

import argparse
import logging
import sys

from data_generator import VisualWordDataGenerator
import models

# Set up logger
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Dimensionality of image feature vector
IMG_FEATS = 4096


class ExtractMergeActivations:

    def __init__(self, args):
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def get_merge_activations(self):
        '''
        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''

        self.data_generator = VisualWordDataGenerator(self.args,
                                                      input_dataset=self.args.checkpoint_dataset,
                                                      hsn=self.args.hidden_size)
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model =\
            m.buildMergeActivations(use_image=self.use_image,
                                    use_sourcelang=self.use_sourcelang)

        self.generate_activations('val')

    def generate_activations(self, split):
        '''
        Generate and serialise merge state activations
        into --dataset.
        '''
        logger.info("Generating merge state activations\
                    from this model for %s\n", split)

        if split == 'train':
            """ WARNING: This collects the *entirety of the training data* in
            hidden_states, so should not be used on non-toy training data.
            """
            hidden_states = []
            batch_start = 0
            batch_end = 0
            for train_input, trainY, indicator in\
                self.data_generator.yield_training_batch(self.args.big_batch_size,
                                                         self.use_sourcelang,
                                                         self.use_image):
                feats = self.model.predict(train_input,
                                           batch_size=self.args.batch_size,
                                           verbose=1)
                for f in feats:
                    activations = f[0]  # we want the merge features
                    hidden_states.append(activations)
                    batch_end += 1
                # Note: serialisation happens over training batches too.
                # now serialise the hidden representations in the h5
                self.serialise_to_csv(split, hidden_states,
                                     batch_start, batch_end)

                batch_start = batch_end
                hidden_states = []

        elif split == 'val':
            val_input, valY = self.data_generator.get_data_by_split('val',
                self.use_sourcelang, self.use_image)
            logger.info("Generating merge activations from this model for val\n")

            hidden_states = []
            feats = self.model.predict(val_input,
                                       batch_size=self.args.batch_size,
                                       verbose=1)
            for f in feats:
                activations = f[0]  # we want the merge features
                hidden_states.append(activations)

            # now serialise the hidden representations in the h5
            self.serialise_to_csv(split, hidden_states)

    def serialise_to_csv(self, split, hidden_states,
                         batch_start=None, batch_end=None):
        """ Serialise the hidden representations from generate_activations
        into a CSV for t-SNE visualisation."""
        logger.info("Serialising merge state features from %s to csv",
                    split)
        fhf_str = "%s-initial_hidden_features" % self.args.run_string

        if self.args.source_vectors is not None:
            fhf_str = "%s-multilingual_initial_hidden_features" % self.args.run_string
        f = open(fhf_str, 'a')
        for h in hidden_states:
            np.savetxt(f, h, delimiter=',', newline=',')
            f.write("\n")
        f.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""
                                     Serialise initial RNN hidden state vector
                                     for each instance in a dataset.""")

    parser.add_argument("--run_string", default="", type=str,
                        help="Optional string to help you identify the run")
    parser.add_argument("--debug", action="store_true",
                        help="Print debug messages to stdout?")

    parser.add_argument("--small", action="store_true",
                        help="Run on 100 image--{sentences} pairing.\
                        Useful for debugging")
    parser.add_argument("--num_sents", default=5, type=int,
                        help="Number of descriptions/image to use")
    parser.add_argument("--small_val", action="store_true",
                        help="Validate on 100 descriptions.")

    parser.add_argument("--batch_size", default=100, type=int)
    parser.add_argument("--hidden_size", default=256, type=int)
    parser.add_argument("--dropin", default=0.5, type=float,
                        help="Prob. of dropping embedding units. Default=0.5")
    parser.add_argument("--droph", default=0.2, type=float,
                        help="Prob. of dropping hidden units. Default=0.2")
    parser.add_argument("--gru", action="store_true", help="Use GRU instead\
                      of LSTM recurrent state? (default = False)")

    parser.add_argument("--test", action="store_true",
                        help="Generate for the test images? Default=False")
    parser.add_argument("--generation_timesteps", default=10, type=int,
                        help="Attempt to generate how many words?")
    parser.add_argument("--checkpoint", type=str, required=True,
                        help="Path to the checkpointed parameters")
    parser.add_argument("--dataset", type=str,
                        help="Dataset on which to evaluate")
    parser.add_argument("--big_batch_size", type=int, default=1000)

    parser.add_argument("--optimiser", default="adagrad", type=str,
                        help="Optimiser: rmsprop, momentum, adagrad, etc.")
    parser.add_argument("--l2reg", default=1e-8, type=float,
                        help="L2 cost penalty. Default=1e-8")
    parser.add_argument("--unk", type=int, default=5)
    parser.add_argument("--supertrain_datasets", nargs="+")
    parser.add_argument("--h5_writeable", action="store_true",
                        help="Open the H5 file for write-access? Useful for\
                        serialising hidden states to disk. (default = False)")

    parser.add_argument("--mt_only", action="store_true",
                        help="Do not use image data: MT baseline.")
    # If --source_vectors = None: model uses only visual/image input, no
    # source language/encoder hidden layer representation feature vectors.
    parser.add_argument("--source_vectors", default=None, type=str,
                        help="Path to final hidden representations of\
                        encoder/source language VisualWordLSTM model.\
                        (default: None.) Expects a final_hidden_representation\
                        vector for each image in the dataset")
    parser.add_argument("--checkpoint_dataset", type=str)
    parser.add_argument("--use_source_vectors", action="store_true")
    # These options turn off image or source language inputs.
    # Image data is *always* included in the hdf5 dataset, even if --no_image
    # is set.
    parser.add_argument("--no_image", action="store_true",
                        help="Do not use image data.")

    w = ExtractMergeActivations(parser.parse_args())
    w.get_merge_activations()