Skip to content

Commit

Permalink
Some initial changes to work with FeatureHasher
Browse files: browse the repository at this point in the history
  • Loading branch information
nineil committed Jun 17, 2014
1 parent 3f25150 commit 3b0928e
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 12 deletions.
5 changes: 3 additions & 2 deletions skll/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from collections import namedtuple
from six import iteritems, string_types, text_type
from six.moves import map, zip
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# Import QueueHandler and QueueListener for multiprocess-safe logging
if sys.version_info < (3, 0):
Expand Down Expand Up @@ -563,7 +563,8 @@ def _features_for_iter_type(example_iter_type, path, quiet, sparse, label_col):
'''
try:
example_iter = example_iter_type(path, quiet=quiet, label_col=label_col)
feat_vectorizer = DictVectorizer(sparse=sparse)
# feat_vectorizer = DictVectorizer(sparse=sparse)
feat_vectorizer = FeatureHasher(n_features=5)
feat_dict_generator = map(itemgetter(2), example_iter)
except Exception as e:
# Setup logger
Expand Down
18 changes: 9 additions & 9 deletions skll/experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,19 +480,19 @@ def _load_featureset(dirpath, featureset, suffix, label_col='y',
for ids, classes, features, feat_vectorizer in example_tuples:
# Combine feature matrices and vectorizers
if merged_features is not None:
# Check for duplicate feature names
if (set(merged_vectorizer.get_feature_names()) &
set(feat_vectorizer.get_feature_names())):
raise ValueError('Two feature files have the same feature!')
# # Check for duplicate feature names
# if (set(merged_vectorizer.get_feature_names()) &
# set(feat_vectorizer.get_feature_names())):
# raise ValueError('Two feature files have the same feature!')

num_merged = merged_features.shape[1]
merged_features = sp.hstack([merged_features, features], 'csr')

# dictvectorizer sorts the vocabularies within each file
for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
key=lambda x: x[1]):
merged_vectorizer.vocabulary_[feat_name] = index + num_merged
merged_vectorizer.feature_names_.append(feat_name)
# # dictvectorizer sorts the vocabularies within each file
# for feat_name, index in sorted(feat_vectorizer.vocabulary_.items(),
# key=lambda x: x[1]):
# merged_vectorizer.vocabulary_[feat_name] = index + num_merged
# merged_vectorizer.feature_names_.append(feat_name)
else:
merged_features = features
merged_vectorizer = feat_vectorizer
Expand Down
6 changes: 5 additions & 1 deletion skll/learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,11 @@ def predict(self, examples, prediction_prefix=None, append=False,
# Need to do some transformations so the features are in the right
# columns for the test set. Obviously a bit hacky, but storing things
# in sparse matrices saves memory over our old list of dicts approach.
if self.feat_vectorizer == examples.feat_vectorizer:
# if self.feat_vectorizer == examples.feat_vectorizer:
if (self.feat_vectorizer.n_features == examples.feat_vectorizer.n_features)\
and (self.feat_vectorizer.input_type == examples.feat_vectorizer.input_type)\
and (self.feat_vectorizer.dtype == examples.feat_vectorizer.dtype)\
and (self.feat_vectorizer.non_negative == examples.feat_vectorizer.non_negative):
xtest = examples.features
else:
xtest = self.feat_vectorizer.transform(examples.feat_vectorizer.inverse_transform(examples.features))
Expand Down

0 comments on commit 3b0928e

Please sign in to comment.