Skip to content

Commit

Permalink
FeatureHasher Added
Browse files (browse the repository at this point in the history)
  • Loading branch information
nineil committed Jun 20, 2014
1 parent 928c9c0 commit 4a3e2a1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 35 deletions.
14 changes: 9 additions & 5 deletions skll/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,23 +664,26 @@ def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col,
if feature_hasher:
feat_vectorizer = FeatureHasher(n_features=nro_features)
else:
feat_vectorizer = FeatureHasher(n_features=nro_features)
feat_vectorizer = DictVectorizer(sparse=sparse)
feat_dict_generator = map(itemgetter(2), example_iter)
except Exception as e:
# Setup logger
logger = logging.getLogger(__name__)
logger.exception('Failed to load features for %s.', path)
raise e
try:
# features = feat_vectorizer.fit_transform(feat_dict_generator)
features = feat_vectorizer.transform(feat_dict_generator)
if feature_hasher:
features = feat_vectorizer.transform(feat_dict_generator)
else:
features = feat_vectorizer.fit_transform(feat_dict_generator)
except ValueError:
raise ValueError('The last feature file did not include any features.')
return features, feat_vectorizer


def load_examples(path, quiet=False, sparse=True, label_col='y',
ids_to_floats=False, class_map=None, nro_features=None):
ids_to_floats=False, class_map=None, feature_hasher=False,
nro_features=None):
'''
Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.megam``,
``.ndj``, or ``.tsv`` formats.
Expand Down Expand Up @@ -782,7 +785,8 @@ def load_examples(path, quiet=False, sparse=True, label_col='y',
class_map)
features_future = executor.submit(_features_for_iter_type,
example_iter_type, path, quiet,
sparse, label_col, nro_features)
sparse, label_col, feature_hasher,
nro_features)
# Wait for processes/threads to complete and store results
ids = ids_future.result()
classes = classes_future.result()
Expand Down
49 changes: 24 additions & 25 deletions skll/experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,14 @@ def _parse_config_file(config_path):
experiment_name = config.get("General", "experiment_name")

# Input
if config.has_option("Input", "feature_hasher"):
feature_hasher = config.get("Input", "feature_hasher")
if feature_hasher and config.has_option("Input", "nro_features_hasher"):
nro_features_hasher = config.get("Input", "nro_features_hasher")
nro_features_hasher = None
feature_hasher = config.getboolean("Input", "feature_hasher")
if feature_hasher:
if config.has_option("Input", "nro_features_hasher"):
nro_features_hasher = config.getint("Input", "nro_features_hasher")
else:
raise ValueError("Configuration file does not contain "+
"option nro_features_hasher")
if config.has_option("Input", "nro_features_hasher") and \
(not config.has_option("Input", "feature_hasher")):
raise ValueError("Configuration file does not contain "+
"option feautre_hasher")
"option nro_features_hasher")
if config.has_option("Input", "learners"):
learners_string = config.get("Input", "learners")
elif config.has_option("Input", "classifiers"):
Expand Down Expand Up @@ -405,7 +402,7 @@ def _parse_config_file(config_path):
train_set_name = os.path.basename(train_path)
test_set_name = os.path.basename(test_path) if test_path else "cv"

return (experiment_name, task, feture_hasher, nro_features_hasher, label_col,
return (experiment_name, task, feature_hasher, nro_features_hasher, label_col,
train_set_name, test_set_name, suffix, featuresets, model_path,
do_grid_search, grid_objective, probability, results_path,
pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
Expand All @@ -416,7 +413,7 @@ def _parse_config_file(config_path):

def _load_featureset(dirpath, featureset, suffix, label_col='y',
ids_to_floats=False, quiet=False, class_map=None,
unlabelled=False, feature_hasher = False, nro_features=None):
unlabelled=False, feature_hasher=False, nro_features=None):
'''
Load a list of feature files and merge them.
Expand Down Expand Up @@ -453,7 +450,8 @@ def _load_featureset(dirpath, featureset, suffix, label_col='y',
in featureset)
example_tuples = [load_examples(file_name, label_col=label_col,
ids_to_floats=ids_to_floats, quiet=quiet,
class_map=class_map,nro_features=nro_features)
class_map=class_map, feature_hasher=feature_hasher,
nro_features=nro_features)
for file_name in file_names]
# Check that the IDs are unique within each file.
for file_name, examples in zip(file_names, example_tuples):
Expand Down Expand Up @@ -492,19 +490,19 @@ def _load_featureset(dirpath, featureset, suffix, label_col='y',
for ids, classes, features, feat_vectorizer in example_tuples:
# Combine feature matrices and vectorizers
if merged_features is not None:
# # Check for duplicate feature names
# if (set(merged_vectorizer.get_feature_names()) &
# set(feat_vectorizer.get_feature_names())):
# raise ValueError('Two feature files have the same feature!')

if not feature_hasher:
# Check for duplicate feature names
if (set(merged_vectorizer.get_feature_names()) &
set(feat_vectorizer.get_feature_names())):
raise ValueError('Two feature files have the same feature!')
num_merged = merged_features.shape[1]
merged_features = sp.hstack([merged_features, features], 'csr')

# # dictvectorizer sorts the vocabularies within each file
# for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
# key=lambda x: x[1]):
# merged_vectorizer.vocabulary_[feat_name] = index + num_merged
# merged_vectorizer.feature_names_.append(feat_name)
if not feature_hasher:
# dictvectorizer sorts the vocabularies within each file
for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
key=lambda x: x[1]):
merged_vectorizer.vocabulary_[feat_name] = index + num_merged
merged_vectorizer.feature_names_.append(feat_name)
else:
merged_features = features
merged_vectorizer = feat_vectorizer
Expand Down Expand Up @@ -575,7 +573,6 @@ def _classify_featureset(args):
if args:
raise ValueError(("Extra arguments passed to _classify_featureset: " +
"{}").format(args.keys()))

timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

with open(log_path, 'w') as log_file:
Expand Down Expand Up @@ -621,6 +618,7 @@ def _classify_featureset(args):
label_col=label_col,
ids_to_floats=ids_to_floats,
quiet=quiet, class_map=class_map,
feature_hasher=feature_hasher,
nro_features=nro_features)
# initialize a classifier object
learner = Learner(learner_name,
Expand All @@ -643,6 +641,7 @@ def _classify_featureset(args):
ids_to_floats=ids_to_floats,
quiet=quiet, class_map=class_map,
unlabelled=True,
feature_hasher=feature_hasher,
nro_features=nro_features)


Expand Down Expand Up @@ -949,7 +948,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
logger = logging.getLogger(__name__)

# Read configuration
(experiment_name, task, feaure_hasher, nro_features_hasher, label_col,
(experiment_name, task, feature_hasher, nro_features_hasher, label_col,
train_set_name, test_set_name, suffix, featuresets, model_path,
do_grid_search, grid_objective, probability, results_path,
pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
Expand Down
7 changes: 2 additions & 5 deletions skll/learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,11 +956,8 @@ def predict(self, examples, prediction_prefix=None, append=False,
# Need to do some transformations so the features are in the right
# columns for the test set. Obviously a bit hacky, but storing things
# in sparse matrices saves memory over our old list of dicts approach.
# if self.feat_vectorizer == examples.feat_vectorizer:
if (self.feat_vectorizer.n_features == examples.feat_vectorizer.n_features)\
and (self.feat_vectorizer.input_type == examples.feat_vectorizer.input_type)\
and (self.feat_vectorizer.dtype == examples.feat_vectorizer.dtype)\
and (self.feat_vectorizer.non_negative == examples.feat_vectorizer.non_negative):
if self.feat_vectorizer == examples.feat_vectorizer:

xtest = examples.features
else:
xtest = self.feat_vectorizer.transform(examples.feat_vectorizer.inverse_transform(examples.features))
Expand Down

0 comments on commit 4a3e2a1

Please sign in to comment.