# dodo.py

# dodo.py
import datetime
import string
import copy
import os
import itertools
from doit import get_var
from doit.tools import run_once

import run
import plot
from mytools.bootstrap import Bootstrapper
from mytools import git
import numpy as np

DOIT_CONFIG = {'verbosity': 2}

# When this module is imported (rather than executed directly), read an
# optional `datetime` variable through doit's get_var so the timestamp used
# for the output directory can be supplied by the caller.
if __name__ != "__main__":
    DOIT_CONFIG['datetime'] = get_var('datetime', '')

# Number of subtasks generated by run_experiments for each experiment name.
num_subtasks = 4

test_names = ['jump', 'hierarchical', 'deep']

date_time_string = DOIT_CONFIG.get('datetime', '')

if not date_time_string:
    # Default to the current local time at one-second resolution, with every
    # separator written as "_" (e.g. "2014_05_06_12_34_56").  strftime gives
    # the same text as replacing ":", " " and "-" in str(now) and works on
    # both Python 2 and Python 3.
    date_time_string = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

# experiment_directory = '/home/e2crawfo/hrr-scaling/experiments'
#experiment_directory = '/data/e2crawfo/hrr-scaling/experiments'
experiment_directory = '/home/eric/hrr-scaling/experiments'

# All results of this invocation are written below this directory.
directory = os.path.join(
    experiment_directory, 'experiments_' + date_time_string)

# NOTE: this runs at import time, so merely loading the module creates the
# directory when it does not exist yet.
if not os.path.exists(directory):
    os.makedirs(directory)

# Baseline keyword arguments for run.run.  run_experiments deep-copies this
# dictionary for every subtask and then overrides 'name', 'seed' and 'gpus'
# before applying the per-experiment settings taken from `variables`.
args = {
    'num_runs': 1,
    'jump_trials': 10,
    'hier_trials': 3,
    'sent_trials': 0,
    'deep_trials': 3,
    'expr': 0,
    'unitary_roles': True,
    'short_sentence': False,
    'do_neg': True,
    'corpus_seed': -1,
    'extractor_seed': -1,
    'test_seed': -1,
    'seed': 1000,
    'dimension': 512,
    'num_synsets': -1,
    'proportion': 1.0,
    'unitary_relations': False,
    'id_vecs': True,
    'abstract': False,
    'synapse': 0.005,
    'timesteps': 100,
    'threshold': 0.3,
    'probeall': False,
    'identical': True,
    'fast': False,
    'plot': True,
    'gpus': [],
    'ocl': [],
    'name': "temp"
}

# Each entry is (key in `args`, list of values to try).  task_run_experiments
# generates one group of tasks per element of the Cartesian product of the
# value lists.
variables = [
    ('abstract', [False]),
    ('id_vecs', [False]),
    ('unitary_relations', [True, False])
]


def consolidate_bootstraps(input_filenames, summary_filename):
    """
    Combine several bootstrap files into one summary.

    Every file named in `input_filenames` is read into a single shared
    Bootstrapper, after which `summary_filename` is handed to its
    print_summary method.
    """
    merged = Bootstrapper(write_raw_data=True)
    for path in input_filenames:
        merged.read_bootstrap_file(path)
    merged.print_summary(summary_filename)


def run_experiments(name, arg_func):
    """
    Generate the doit tasks for one experiment configuration.

    Yields `num_subtasks` task dictionaries that each call run.run with
    their own copy of `args`, followed by one 'Consolidate <name>' task
    that merges the subtask result files into '<name>_summary'.

    Params:

    name: str
        Label of the configuration; used to build task and file names.
    arg_func: callable
        Called with each subtask's argument dictionary; expected to apply
        the configuration-specific settings to it in place.
    """
    outputs = []

    for index in range(num_subtasks):
        label = "%s_subtask_%d" % (name, index)
        output_path = os.path.join(directory, label)
        outputs.append(output_path)

        # Private copy of the baseline arguments: results go to this
        # subtask's own file, the seed is offset by 1000 per subtask and
        # the subtask index doubles as the gpu index.
        run_kwargs = copy.deepcopy(args)
        run_kwargs['name'] = output_path
        run_kwargs['seed'] = run_kwargs['seed'] + index * 1000
        run_kwargs['gpus'] = [index]

        # Applied last, so configuration-specific values take precedence.
        arg_func(run_kwargs)

        run_task = {
            'name': label,
            'actions': [(run.run, [], run_kwargs)],
            'file_dep': [],
            'targets': [output_path],
            'uptodate': [run_once]
            }
        yield run_task

    summary_path = os.path.join(directory, name + "_summary")

    consolidate_task = {
        'name': 'Consolidate ' + name,
        'actions': [(consolidate_bootstraps,
                     [outputs, summary_path])],
        'file_dep': outputs,
        'targets': [summary_path]
        }
    yield consolidate_task


def make_arg_func(names, values):
    """
    Build a function that stores each of `values` under the matching key
    from `names` in the dictionary it is given.
    """
    def arg_func(args):
        """Updates args in place; the caller is expected to pass a copy"""
        # Pairing stops at the shorter of the two sequences.
        args.update(zip(names, values))
    return arg_func


def make_sym_link(source, name):
    """
    Create a symbolic link `name` pointing at `source`, replacing any file
    or link that already exists at `name`.

    Params:

    source: str
        The path the link should point to.
    name: str
        The path of the link to create.

    Raises OSError if an existing entry at `name` cannot be removed or if
    the link cannot be created.
    """
    import errno

    try:
        os.remove(name)
    except OSError as e:
        # A missing entry is the normal first-run case.  Any other failure
        # (e.g. `name` is a directory) is re-raised so the real cause is
        # reported instead of a misleading "file exists" from os.symlink.
        if e.errno != errno.ENOENT:
            raise

    os.symlink(source, name)


def task_run_experiments():
    """
    The primary task. Runs experiments.

    The tasks generated by this function are determined by `variables`:
    for every combination of the listed values, run_experiments supplies a
    group of subtasks plus a consolidation task.  A final 'Complete. ' task
    depends on all summary files.
    """
    names = [name for name, vals in variables]
    tasks = []

    for values in itertools.product(*[vals for name, vals in variables]):
        arg_func = make_arg_func(names, values)

        # Interleave names and values,
        # e.g. "abstract_False_id_vecs_False_unitary_relations_True".
        name = '_'.join(
            str(s) for s in itertools.chain(*zip(names, values)))

        tasks.extend(run_experiments(name, arg_func))

    # Chain the subtasks that share an index across configurations: each one
    # gets the previous one's result file as a dependency.  Subtask n is
    # given gpus=[n] by run_experiments, so this presumably keeps runs that
    # target the same gpu from overlapping.  The exact "_subtask_<n>" suffix
    # is matched so digits elsewhere in a task name (or indices such as 1
    # and 11) cannot be confused with each other.
    for n in range(num_subtasks):
        suffix = "_subtask_%d" % n
        chain = [t for t in tasks if t['name'].endswith(suffix)]

        for prev_subtask, subtask in zip(chain, chain[1:]):
            subtask['file_dep'].append(prev_subtask['targets'][0])

    for task in tasks:
        print(task)

    for task in tasks:
        yield task

    summaries = []
    for task in tasks:
        if task['name'].startswith('Consolidate'):
            summaries.extend(task['targets'])

    def finish(filename, link_target, link_name, args):
        """
        Point `link_name` at `link_target`, then append the git revision
        hash (when available) and `args` to `filename`.
        """
        make_sym_link(link_target, link_name)

        # The context manager closes the file even if a write fails.
        with open(filename, 'a') as f:
            h = git.get_git_revision_short_hash()

            if h is not None:
                h = h.strip()
                f.write("Git revision hash: %s" % h)

            f.write(str(args))

    complete_file = os.path.join(directory, 'complete')
    latest = os.path.join(experiment_directory, 'latest')

    # The link target is the bare directory name, so the 'latest' link is
    # relative to experiment_directory.
    latest_target = os.path.basename(directory)

    yield {
        'name': 'Complete. ',
        'actions': [(finish, [complete_file, latest_target, latest, args])],
        'file_dep': summaries,
        'targets': [complete_file]
        }


def plot_results(summary_filenames, keys, **kwargs):
    """
    Read the summary bootstrap files and create a plot using
    plot_performance.

    Params:

    summary_filenames: list of str
        The names of the summary bootstrap files, with the data
        from all experiments.
    keys: list of str
        The keys to extract from the summary bootstrap

    **kwargs is passed to plot_performance

    """

    means = []
    low_cis = []
    high_cis = []

    for fn in summary_filenames:
        bs = Bootstrapper(write_raw_data=True)
        bs.read_bootstrap_file(fn)

        # One stats record per key.  Element 1 is used as the mean and
        # element 2 as a (low, high) pair -- presumably a confidence
        # interval; confirm against Bootstrapper.get_stats.
        all_stats = [bs.get_stats(key) for key in keys]

        # Every value is scaled by 100 before plotting.
        means.append([stats[1] * 100 for stats in all_stats])
        low_cis.append([stats[2][0] * 100 for stats in all_stats])
        high_cis.append([stats[2][1] * 100 for stats in all_stats])

    # Same text as the Python 2 statement `print means, low_cis, high_cis`,
    # written so it is also valid on Python 3.
    print("%s %s %s" % (means, low_cis, high_cis))

    plot.plot_performance(means, low_cis, high_cis, **kwargs)


def task_performance_plot():
    """
    Yield the doit task that draws the performance graph from the summary
    files found in a fixed summary directory.

    NOTE(review): the directory is listed while the task is being created,
    not when it runs, so a missing directory raises as soon as this
    generator is consumed -- confirm that is acceptable.
    """

    def filt(x):
        return 'abstract' in x
        #return 'unitary_relations_' in x

    summary_directory = '/data/hrr-scaling/srvg-gpu-experiments/summaries_for_unitary_no_idvec'
    plot_filename = os.path.join(summary_directory, 'unitary_prgraph.pdf')

    #labels = [f for f in os.listdir(summary_directory) if 'summary' in f]
    #labels = [f for f in labels if filt(f)]
    #labels = sorted(labels)

    # Matching summary files, in reverse alphabetical order.
    summary_filenames = sorted(
        os.path.join(summary_directory, f)
        for f in os.listdir(summary_directory) if filt(f))
    summary_filenames = summary_filenames[::-1]
    print(summary_filenames)

    labels = ['ID-vectors',
              'no ID-vectors,\n non-unitary',
              'no ID-vectors,\n unitary']

    #labels = [l.split('_') for l in labels]
    #labels = ['n = %d' % int(l[12]) for l in labels]

    # Put a backslash in front of every underscore (presumably so the plot
    # backend does not treat it as markup).  str.replace and the explicit
    # '\\_' work on both Python 2 and 3 and avoid an invalid escape sequence.
    labels = [label.replace('_', '\\_') for label in labels]
    labels = sorted(labels)
    print("LABELS")
    print(labels)

    # One grey level per condition.
    colors = [np.ones(3) * shade for shade in [0.35, 0.5, 0.65]]

    kwargs = {
        'summary_filenames': summary_filenames,

        'keys': ['jump_score_correct', 'hierarchical_score',
                 'sentence_score_1', 'sentence_score_2'],

        'measure_labels': ["Simple", "Hierarchical",
                           "Sentence\n(Surface)",
                           "Sentence\n(Embedded)"],

        'condition_labels': labels,

        'filename': plot_filename,
        'colors': colors
        }

    yield {
        'name': 'performance_graph',
        'actions': [(plot_results, [], kwargs)],
        'file_dep': [],
        'targets': [plot_filename]
        }


def task_tuning_curve_plot():
    """
    Yield the doit task that calls plot.plot_tuning_curves with
    'tuning_curves.pdf', which is also declared as the task's target.
    """
    output_pdf = 'tuning_curves.pdf'
    tuning_task = {
        'name': 'tuning_curves',
        'actions': [(plot.plot_tuning_curves, [output_pdf])],
        'targets': [output_pdf]
        }
    yield tuning_task


def task_chain_simulation():
    """
    Yield the doit task that calls plot.chain_simulation with
    'chain_simulation.pdf', which is also declared as the task's target.
    """
    output_pdf = 'chain_simulation.pdf'
    chain_task = {
        'name': 'chain_simulation',
        'actions': [(plot.chain_simulation, [output_pdf])],
        'targets': [output_pdf]
    }
    yield chain_task

# if __name__ == '__main__':
#     task_iter = task_neural_experiments()
#     for task in task_iter:
#         actions = task['actions']
#         for action in actions:
#            action[0](*action[1], **action[2])