
Commit

Some initial changes to work with FeatureHasher
nineil committed Jun 19, 2014
1 parent 3b0928e commit ecdf6e6
Showing 2 changed files with 119 additions and 13 deletions.
110 changes: 103 additions & 7 deletions skll/data.py
@@ -522,6 +522,87 @@ def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                         class_map=class_map)


def get_unique_features_iter(iter_features):
    '''
    Return the set of all feature names seen across an iterable of
    feature dictionaries.
    '''
    seen = set()
    for feat in iter_features:
        seen.update(feat)
    return seen
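A minimal usage sketch, assuming plain dicts that map feature names to values:

    feature_dicts = [{'word=the': 1, 'word=cat': 1},
                     {'word=the': 2, 'word=sat': 1}]
    print(get_unique_features_iter(feature_dicts))
    # {'word=cat', 'word=sat', 'word=the'}  (a set, so order is arbitrary)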


def get_unique_features(path, quiet=False, sparse=True, label_col='y',
                        ids_to_floats=False, class_map=None):
    '''
    Scans examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.megam``,
    ``.ndj``, or ``.tsv`` formats and collects the set of unique feature
    names that occur in them.

    If you would like to include example/instance IDs in your files, they must
    be specified in the following ways:

    * MegaM: As a comment line directly preceding the line with feature values.
    * CSV/TSV/ARFF: An "id" column.
    * JSONLINES: An "id" key in each JSON dictionary.

    Also, for ARFF, CSV, and TSV files, there must be a column with the name
    specified by `label_col` if the data is labelled. For ARFF files, this
    column must also be the final one (as it is in Weka).

    :param path: The path to the file to load the examples from, or a list of
                 example dictionaries (like you would pass to
                 `convert_examples`).
    :type path: str or list of dict
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param sparse: Whether or not to store the features in a scipy CSR matrix.
    :type sparse: bool
    :param label_col: Name of the column which contains the class labels for
                      ARFF/CSV/TSV files. If no column with that name exists,
                      or `None` is specified, the data is considered to be
                      unlabelled.
    :type label_col: str
    :param ids_to_floats: Convert IDs to float to save memory. Will raise an
                          error if we encounter a non-numeric ID. (Accepted
                          for interface parity with `load_examples`; not used
                          when collecting feature names.)
    :type ids_to_floats: bool
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple classes into a
                      single class. Anything not in the mapping will be kept
                      the same. (Also unused here.)
    :type class_map: dict from str to str

    :return: a set of the unique feature names in the data
    '''
    # Set up logger
    logger = logging.getLogger(__name__)

    logger.debug('Path: %s', path)

    # Build an appropriate generator for examples so we process the input file
    # through the feature vectorizer without using tons of memory
    if not isinstance(path, string_types):
        example_iter_type = _DummyDictIter
    # Lowercase path for file extension checking, if it's a string
    else:
        lc_path = path.lower()
        if lc_path.endswith(".tsv"):
            example_iter_type = _TSVDictIter
        elif lc_path.endswith(".csv"):
            example_iter_type = _CSVDictIter
        elif lc_path.endswith(".arff"):
            example_iter_type = _ARFFDictIter
        elif lc_path.endswith(".jsonlines") or lc_path.endswith('.ndj'):
            example_iter_type = _JSONDictIter
        elif lc_path.endswith(".megam"):
            example_iter_type = _MegaMDictIter
        else:
            raise ValueError(('Example files must be in either .arff, .csv, '
                              '.jsonlines, .megam, .ndj, or .tsv format. You '
                              'specified: {}').format(path))

    logger.debug('Example iterator type: %s', example_iter_type)
    iter_features = _features_for_iter_type_partial(example_iter_type, path,
                                                    quiet, sparse, label_col)
    unique_features = get_unique_features_iter(iter_features)
    return unique_features
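A hypothetical way to use this helper to size a hash space (the file name
below is an assumption for illustration, not from the commit):

    # Count the distinct feature names in a training file, then round the
    # hash space up to the next power of two to reduce collisions.
    n_unique = len(get_unique_features('train.jsonlines', quiet=True))
    n_buckets = 1
    while n_buckets < n_unique:
        n_buckets *= 2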

def _ids_for_iter_type(example_iter_type, path, ids_to_floats):
    '''
    Little helper function to return an array of IDs for a given example

@@ -554,32 +635,48 @@ def _classes_for_iter_type(example_iter_type, path, label_col, class_map):
        raise e
    return res_array

def _features_for_iter_type_partial(example_iter_type, path, quiet, sparse,
                                    label_col):
    '''
    Little helper generator that yields the feature dictionary for each
    example produced by a given example iterator. (The ``sparse`` argument
    is accepted for signature parity with ``_features_for_iter_type`` but is
    not used here.)
    '''
    try:
        example_iter = example_iter_type(path, quiet=quiet,
                                         label_col=label_col)
        feat_dict_generator = map(itemgetter(2), example_iter)
        for feat in feat_dict_generator:
            yield feat
    except Exception as e:
        # Set up logger
        logger = logging.getLogger(__name__)
        logger.exception('Failed to load features for %s.', path)
        raise e
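For reference, `itemgetter(2)` picks the feature dictionary out of each
`(id, class, features)` triple that the dict iterators appear to yield; a
small standalone sketch:

    from operator import itemgetter

    rows = [('ex1', 'A', {'f1': 1.0}),
            ('ex2', 'B', {'f2': 2.0})]
    # map(itemgetter(2), rows) lazily extracts the third element of each row
    print(list(map(itemgetter(2), rows)))
    # [{'f1': 1.0}, {'f2': 2.0}]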

def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col):
def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col,
                            nro_features):
    '''
    Little helper function to return a sparse matrix of features and a
    feature vectorizer for a given example generator (and whether or not
    the examples have labels).
    '''
    try:
        example_iter = example_iter_type(path, quiet=quiet,
                                         label_col=label_col)
        # feat_vectorizer = DictVectorizer(sparse=sparse)
        feat_vectorizer = FeatureHasher(n_features=5)
        feat_vectorizer = FeatureHasher(n_features=nro_features)
        feat_dict_generator = map(itemgetter(2), example_iter)
    except Exception as e:
        # Set up logger
        logger = logging.getLogger(__name__)
        logger.exception('Failed to load features for %s.', path)
        raise e
    try:
        # FeatureHasher is stateless, so no fit step is needed
        # features = feat_vectorizer.fit_transform(feat_dict_generator)
        features = feat_vectorizer.transform(feat_dict_generator)
    except ValueError:
        raise ValueError('The last feature file did not include any features.')
    return features, feat_vectorizer
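Why `transform` instead of `fit_transform`: sklearn's FeatureHasher is
stateless, mapping feature names to column indices with a hash function
rather than a learned vocabulary. A minimal sketch:

    from sklearn.feature_extraction import FeatureHasher

    hasher = FeatureHasher(n_features=2 ** 10)
    # No fit step: the column index of each feature is just a hash of its
    # name modulo n_features, so transform() alone is enough.
    X = hasher.transform([{'word=the': 1, 'word=cat': 1}])
    print(X.shape)  # (1, 1024), a scipy.sparse CSR matrix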


def load_examples(path, quiet=False, sparse=True, label_col='y',
                  ids_to_floats=False, class_map=None):
                  ids_to_floats=False, class_map=None, nro_features=None):
    '''
    Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.megam``,
    ``.ndj``, or ``.tsv`` formats.

@@ -681,8 +778,7 @@ def load_examples(path, quiet=False, sparse=True, label_col='y',
                                             class_map)
            features_future = executor.submit(_features_for_iter_type,
                                              example_iter_type, path, quiet,
                                              sparse, label_col)
                                              sparse, label_col, nro_features)
        # Wait for processes/threads to complete and store results
        ids = ids_future.result()
        classes = classes_future.result()
22 changes: 16 additions & 6 deletions skll/experiments.py
@@ -29,7 +29,7 @@
from six.moves import zip
from sklearn.metrics import SCORERS

from skll.data import ExamplesTuple, load_examples
from skll.data import ExamplesTuple, load_examples, get_unique_features
from skll.learner import Learner, MAX_CONCURRENT_PROCESSES
from skll.version import __version__

@@ -403,7 +403,7 @@ def _parse_config_file(config_path):

def _load_featureset(dirpath, featureset, suffix, label_col='y',
                     ids_to_floats=False, quiet=False, class_map=None,
                     unlabelled=False):
                     unlabelled=False, nro_features=None):
    '''
    Load a list of feature files and merge them.

@@ -440,9 +440,8 @@ def _load_featureset(dirpath, featureset, suffix, label_col='y',
                         in featureset)
    example_tuples = [load_examples(file_name, label_col=label_col,
                                    ids_to_floats=ids_to_floats, quiet=quiet,
                                    class_map=class_map)
                                    class_map=class_map, nro_features=nro_features)
                      for file_name in file_names]
    # Check that the IDs are unique within each file.
    for file_name, examples in zip(file_names, example_tuples):
        ex_ids = examples.ids
@@ -591,13 +590,23 @@ def _classify_featureset(args):
    # featureset already exists if so, load it and then use it on test data
    modelfile = os.path.join(model_path, '{}.model'.format(job_name))

    file_names = sorted(os.path.join(train_path, featfile + suffix)
                        for featfile in featureset)
    # nro_features = sum([len(get_unique_features(file_name, label_col=label_col,
    #                                             ids_to_floats=ids_to_floats,
    #                                             quiet=quiet,
    #                                             class_map=class_map))
    #                     for file_name in file_names])

    # Hard-coded hash-space size for the FeatureHasher; the commented-out
    # code above would instead count the unique features in the data.
    nro_features = 20000000 + 5000

    # load the training and test examples
    if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                    overwrite):
        train_examples = _load_featureset(train_path, featureset, suffix,
                                          label_col=label_col,
                                          ids_to_floats=ids_to_floats,
                                          quiet=quiet, class_map=class_map)
                                          quiet=quiet, class_map=class_map,
                                          nro_features=nro_features)
        # initialize a classifier object
        learner = Learner(learner_name,
                          probability=probability,
@@ -618,7 +627,8 @@
                                         label_col=label_col,
                                         ids_to_floats=ids_to_floats,
                                         quiet=quiet, class_map=class_map,
                                         unlabelled=True)
                                         unlabelled=True,
                                         nro_features=nro_features)


    # create a list of dictionaries of the results information
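Taken together, a hedged end-to-end sketch of the flow this commit sets up
(the file path here is an assumption for illustration):

    # Hypothetical: count the unique features in a feature file, then pass
    # that count through load_examples to size the FeatureHasher.
    from skll.data import get_unique_features, load_examples

    train_file = 'train/example_featureset.jsonlines'  # assumed path
    nro_features = len(get_unique_features(train_file, quiet=True))
    examples = load_examples(train_file, quiet=True,
                             nro_features=nro_features)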
