From ecdf6e6f55c287445485a226e73a5cec8e4ffdc3 Mon Sep 17 00:00:00 2001
From: Nils Murrugarra
Date: Thu, 19 Jun 2014 14:16:05 -0400
Subject: [PATCH] Some initial changes to work with FeatureHasher

---
 skll/data.py        | 110 +++++++++++++++++++++++++++++++++++++++++---
 skll/experiments.py |  22 ++++++---
 2 files changed, 119 insertions(+), 13 deletions(-)

diff --git a/skll/data.py b/skll/data.py
index db4840e0..3a4a34bf 100644
--- a/skll/data.py
+++ b/skll/data.py
@@ -522,6 +522,87 @@ def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                          class_map=class_map)
 
 
+def get_unique_features_iter(iter_features):
+    # Collect the union of all feature names seen across an iterable of
+    # feature dictionaries
+    seen = set()
+    for feat in iter_features:
+        seen.update(feat)
+    return seen
+
+
+def get_unique_features(path, quiet=False, sparse=True, label_col='y',
+                        ids_to_floats=False, class_map=None):
+    '''
+    Scans examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.megam``,
+    ``.ndj``, or ``.tsv`` formats and returns the set of unique feature
+    names they contain.
+
+    If you would like to include example/instance IDs in your files, they must
+    be specified in the following ways:
+
+    * MegaM: As a comment line directly preceding the line with feature values.
+    * CSV/TSV/ARFF: An "id" column.
+    * JSONLINES: An "id" key in each JSON dictionary.
+
+    Also, for ARFF, CSV, and TSV files, there must be a column with the name
+    specified by `label_col` if the data is labelled. For ARFF files, this
+    column must also be the final one (as it is in Weka).
+
+    :param path: The path to the file to load the examples from, or a list of
+                 example dictionaries (like you would pass to
+                 `convert_examples`).
+    :type path: str or dict
+    :param quiet: Do not print "Loading..." status message to stderr.
+    :type quiet: bool
+    :param sparse: Whether or not to store the features in a numpy CSR matrix.
+    :type sparse: bool
+    :param label_col: Name of the column which contains the class labels for
+                      ARFF/CSV/TSV files. If no column with that name exists,
+                      or `None` is specified, the data is considered to be
+                      unlabelled.
+    :type label_col: str
+    :param ids_to_floats: Convert IDs to float to save memory. Will raise an
+                          error if we encounter a non-numeric ID.
+    :type ids_to_floats: bool
+    :param class_map: Mapping from original class labels to new ones. This is
+                      mainly used for collapsing multiple classes into a single
+                      class. Anything not in the mapping will be kept the same.
+    :type class_map: dict from str to str
+
+    :return: a set of unique feature names
+    '''
+    # Setup logger
+    logger = logging.getLogger(__name__)
+
+    logger.debug('Path: %s', path)
+
+    # Build an appropriate generator for examples so we process the input file
+    # without using tons of memory
+    if not isinstance(path, string_types):
+        example_iter_type = _DummyDictIter
+    # Lowercase path for file extension checking, if it's a string
+    else:
+        lc_path = path.lower()
+        if lc_path.endswith(".tsv"):
+            example_iter_type = _TSVDictIter
+        elif lc_path.endswith(".csv"):
+            example_iter_type = _CSVDictIter
+        elif lc_path.endswith(".arff"):
+            example_iter_type = _ARFFDictIter
+        elif lc_path.endswith(".jsonlines") or lc_path.endswith('.ndj'):
+            example_iter_type = _JSONDictIter
+        elif lc_path.endswith(".megam"):
+            example_iter_type = _MegaMDictIter
+        else:
+            raise ValueError(('Example files must be in either .arff, .csv, ' +
+                              '.jsonlines, .megam, .ndj, or .tsv format. ' +
+                              'You specified: {}').format(path))
+
+    logger.debug('Example iterator type: %s', example_iter_type)
+    iter_features = _features_for_iter_type_partial(example_iter_type,
+                                                    path, quiet, sparse,
+                                                    label_col)
+    unique_features = get_unique_features_iter(iter_features)
+    return unique_features
+
 
 def _ids_for_iter_type(example_iter_type, path, ids_to_floats):
     '''
     Little helper function to return an array of IDs for a given example
@@ -554,8 +635,24 @@ def _classes_for_iter_type(example_iter_type, path, label_col, class_map):
         raise e
     return res_array
 
+
+def _features_for_iter_type_partial(example_iter_type, path, quiet, sparse, label_col):
+    '''
+    Little helper generator that yields the raw feature dictionaries for a
+    given example generator, without vectorizing them.
+    '''
+    try:
+        example_iter = example_iter_type(path, quiet=quiet, label_col=label_col)
+        feat_dict_generator = map(itemgetter(2), example_iter)
+        for feat in feat_dict_generator:
+            yield feat
+    except Exception as e:
+        # Setup logger
+        logger = logging.getLogger(__name__)
+        logger.exception('Failed to load features for %s.', path)
+        raise e
 
-def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col):
+def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col, nro_features):
     '''
     Little helper function to return a sparse matrix of features and feature
     vectorizer for a given example generator (and whether or not the examples
@@ -563,8 +660,7 @@ def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col):
     '''
     try:
         example_iter = example_iter_type(path, quiet=quiet, label_col=label_col)
-        # feat_vectorizer = DictVectorizer(sparse=sparse)
-        feat_vectorizer = FeatureHasher(n_features=5)
+        feat_vectorizer = FeatureHasher(n_features=nro_features)
         feat_dict_generator = map(itemgetter(2), example_iter)
     except Exception as e:
         # Setup logger
@@ -572,14 +668,15 @@ def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col):
         logger.exception('Failed to load features for %s.', path)
         raise e
     try:
-        features = feat_vectorizer.fit_transform(feat_dict_generator)
+        # features = feat_vectorizer.fit_transform(feat_dict_generator)
+        features = feat_vectorizer.transform(feat_dict_generator)
     except ValueError:
         raise ValueError('The last feature file did not include any features.')
     return features, feat_vectorizer
 
 
 def load_examples(path, quiet=False, sparse=True, label_col='y',
-                  ids_to_floats=False, class_map=None):
+                  ids_to_floats=False, class_map=None, nro_features=None):
     '''
     Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.megam``,
     ``.ndj``, or ``.tsv`` formats.
@@ -681,8 +778,7 @@ def load_examples(path, quiet=False, sparse=True, label_col='y',
                                              class_map)
             features_future = executor.submit(_features_for_iter_type,
                                               example_iter_type, path, quiet,
-                                              sparse, label_col)
-
+                                              sparse, label_col, nro_features)
         # Wait for processes/threads to complete and store results
         ids = ids_future.result()
         classes = classes_future.result()
diff --git a/skll/experiments.py b/skll/experiments.py
index 1cc4d81b..1d933b2c 100644
--- a/skll/experiments.py
+++ b/skll/experiments.py
@@ -29,7 +29,7 @@
 from six.moves import zip
 from sklearn.metrics import SCORERS
 
-from skll.data import ExamplesTuple, load_examples
+from skll.data import ExamplesTuple, load_examples, get_unique_features
 from skll.learner import Learner, MAX_CONCURRENT_PROCESSES
 from skll.version import __version__
 
@@ -403,7 +403,7 @@ def _parse_config_file(config_path):
 
 def _load_featureset(dirpath, featureset, suffix, label_col='y',
                      ids_to_floats=False, quiet=False, class_map=None,
-                     unlabelled=False):
+                     unlabelled=False, nro_features=None):
     '''
     Load a list of feature files and merge them.
 
@@ -440,9 +440,8 @@ def _load_featureset(dirpath, featureset, suffix, label_col='y',
                        in featureset)
     example_tuples = [load_examples(file_name, label_col=label_col,
                                     ids_to_floats=ids_to_floats, quiet=quiet,
-                                    class_map=class_map)
+                                    class_map=class_map, nro_features=nro_features)
                       for file_name in file_names]
-
     # Check that the IDs are unique within each file.
     for file_name, examples in zip(file_names, example_tuples):
         ex_ids = examples.ids
@@ -591,13 +590,23 @@ def _classify_featureset(args):
         # featureset already exists if so, load it and then use it on test data
         modelfile = os.path.join(model_path, '{}.model'.format(job_name))
 
+        file_names = sorted(os.path.join(train_path, featfile + suffix) for featfile
+                            in featureset)
+        # nro_features = sum([len(get_unique_features(file_name, label_col=label_col,
+        #                                             ids_to_floats=ids_to_floats, quiet=quiet,
+        #                                             class_map=class_map))
+        #                     for file_name in file_names])
+
+        nro_features = 20000000 + 5000  # placeholder hash-space size
+
         # load the training and test examples
         if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                         overwrite):
             train_examples = _load_featureset(train_path, featureset, suffix,
                                               label_col=label_col,
                                               ids_to_floats=ids_to_floats,
-                                              quiet=quiet, class_map=class_map)
+                                              quiet=quiet, class_map=class_map,
+                                              nro_features=nro_features)
 
             # initialize a classifer object
             learner = Learner(learner_name, probability=probability,
@@ -618,7 +627,8 @@
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
-                                             unlabelled=True)
+                                             unlabelled=True,
+                                             nro_features=nro_features)
 
     # create a list of dictionaries of the results information
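
Note on the fit_transform -> transform change in _features_for_iter_type:
unlike DictVectorizer, FeatureHasher is stateless (it maps feature names to
column indices with a hash function rather than a learned vocabulary), so
calling transform() alone is enough. A minimal standalone sketch of the
difference; the feature dictionaries here are made-up examples, not SKLL data:

    from sklearn.feature_extraction import DictVectorizer, FeatureHasher

    feat_dicts = [{'word=the': 1, 'word=cat': 1},
                  {'word=the': 2, 'word=sat': 1}]

    # DictVectorizer learns one column per distinct feature name, so it
    # has to see the data (fit) before it can vectorize anything.
    dv = DictVectorizer(sparse=True)
    X_dv = dv.fit_transform(feat_dicts)
    print(X_dv.shape)   # (2, 3): one column each for word=cat/sat/the

    # FeatureHasher needs no fit; the matrix width is fixed up front by
    # n_features, independent of how many distinct features the data has.
    fh = FeatureHasher(n_features=2 ** 20, input_type='dict')
    X_fh = fh.transform(feat_dicts)
    print(X_fh.shape)   # (2, 1048576)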
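
On sizing n_features: the hard-coded "20000000 + 5000" in _classify_featureset
is a placeholder, and the commented-out block shows the intended computation
(summing unique features across the training files). A sketch of how the two
could be tied together, assuming the get_unique_features() function added to
skll/data.py in this patch; suggest_n_features() itself is hypothetical and
not part of SKLL:

    from skll.data import get_unique_features

    def suggest_n_features(file_names, label_col='y'):
        # Count distinct feature names across all training files
        n_unique = sum(len(get_unique_features(name, label_col=label_col))
                       for name in file_names)
        # Round up to a power of two, as the scikit-learn user guide advises
        # for FeatureHasher, with ~2x headroom to keep hash collisions rare
        n_features = 1
        while n_features < 2 * n_unique:
            n_features *= 2
        return n_features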