Skip to content

Commit

Permalink
feat(encoder): fix codes in response to the reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
nan-wang committed Apr 1, 2020
1 parent 4448ff5 commit 62b0d84
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 55 deletions.
2 changes: 1 addition & 1 deletion extra-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ torchvision: framework
onnx: framework, py37
onnxruntime: framework, py37
annoy: index
sklearn: numeric
sklearn: framework
faiss-cpu: numeric
faiss-gpu: numeric, gpu
100 changes: 50 additions & 50 deletions jina/executors/encoders/numeric/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,17 @@

from sklearn.decomposition import PCA

class IncrementalPCAEncoder(BaseNumericEncoder):
"""
:class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.

.. note::
:class:`IncrementalPCAEncoder` must be trained before calling ``encode()``. This encoder can be trained in an
incremental way.
class _BasePCAEncoder(BaseNumericEncoder):
"""Base class for PCA methods.
Warning: This class should not be used directly.
Use derived classes instead.
"""
def __init__(self,
output_dim: int,
num_features: int = None,
whiten: bool = False,
save_path: str = '',
*args,
**kwargs):
"""
Expand All @@ -27,24 +25,41 @@ def __init__(self,
:param num_features: the number of input features. If ``num_features`` is None, then ``num_features`` is
inferred from the data
:param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
:param encoder_abspath: the absolute saving path of the encoder. If a valid path is given, the encoder will be
loaded from the given path.
"""
super().__init__(*args, **kwargs)
self.output_dim = output_dim
self.whiten = whiten
self.num_features = num_features
self.encoder_abspath = save_path
self.is_trained = False


class IncrementalPCAEncoder(_BasePCAEncoder):
"""
:class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
.. note::
:class:`IncrementalPCAEncoder` must be trained before calling ``encode()``. This encoder can be trained in an
incremental way.
"""
def __init__(self,
output_dim: int,
num_features: int = None,
whiten: bool = False,
*args,
**kwargs):
"""
:param output_dim: the output size.
:param num_features: the number of input features. If ``num_features`` is None, then ``num_features`` is
inferred from the data
:param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
"""
super().__init__(output_dim, num_features, whiten, *args, **kwargs)
self.model = None

def post_init(self):
from sklearn.decomposition import IncrementalPCA
if os.path.exists(self.encoder_abspath):
import pickle
with open(self.encoder_abspath, 'rb') as f:
self.model = pickle.load(f)
self.logger.info('load existing model from {}'.format(self.encoder_abspath))
else:
if not self.model:
self.model = IncrementalPCA(
n_components=self.output_dim,
whiten=self.whiten)
Expand All @@ -71,21 +86,8 @@ def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
_, num_features = data.shape
return self.model.transform(data)

def __getstate__(self):
if not self.encoder_abspath:
self.encoder_abspath = os.path.join(self.current_workspace, "pca.bin")
if os.path.exists(self.encoder_abspath):
self.logger.warning(
'the existed model file will be overrided: {}".format(save_path)')
import pickle
with open(self.encoder_abspath, 'wb') as f:
pickle.dump(self.model, f)
self.logger.info(
'the model is saved at: {}'.format(self.encoder_abspath))
return super().__getstate__()


class PCAEncoder(BaseNumericEncoder):
class PCAEncoder(_BasePCAEncoder):
"""
:class:`PCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
Expand All @@ -96,7 +98,7 @@ def __init__(self,
output_dim: int,
num_features: int,
whiten: bool = False,
save_path: str = '',
model_filename: str = 'pca.bin',
*args,
**kwargs):
"""
Expand All @@ -108,50 +110,48 @@ def __init__(self,
:param encoder_abspath: the absolute saving path of the encoder. If a valid path is given, the encoder will be
loaded from the given path.
"""
super().__init__(*args, **kwargs)
self.output_dim = output_dim
self.whiten = whiten
self.num_features = num_features
self.encoder_abspath = save_path
self.is_trained = False
super().__init__(output_dim, num_features, whiten, *args, **kwargs)
self.model_filename = model_filename
self.mean = None
self._num_samples = None
self.num_samples = None

def __getstate__(self):
if not self.encoder_abspath:
self.encoder_abspath = os.path.join(self.current_workspace, "pca.bin")
if os.path.exists(self.encoder_abspath):
if os.path.exists(self.model_abspath):
self.logger.warning(
'the existed model file will be overrided: {}".format(save_path)')
'the existed model file will be overrided: {}'.format(self.model_abspath))
import faiss
faiss.write_VectorTransform(self.model, self.encoder_abspath)
faiss.write_VectorTransform(self.model, self.model_abspath)
self.logger.info(
'the model is saved at: {}'.format(self.encoder_abspath))
'the model is saved at: {}'.format(self.model_abspath))
return super().__getstate__()

@staticmethod
def _calc_std(data, n_samples):
return np.sqrt(data ** 2 / (n_samples - 1))

@property
def model_abspath(self) -> str:
return self.get_file_from_workspace(self.model_filename)

def post_init(self):
self.model = None
import faiss
if os.path.exists(self.encoder_abspath):
self.model = faiss.read_VectorTransform(self.encoder_abspath)
if os.path.exists(self.model_abspath):
self.model = faiss.read_VectorTransform(self.model_abspath)
self.std = self._calc_std(
faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self._num_samples)
self.logger.info('load existing model from {}'.format(self.encoder_abspath))
faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
self.logger.info('load existing model from {}'.format(self.model_abspath))
else:
self.model = faiss.PCAMatrix(self.num_features, self.output_dim)

def train(self, data: 'np.ndarray', *args, **kwargs):
import faiss
self._num_samples, num_features = data.shape
self.num_samples, num_features = data.shape
if not self.num_features:
self.num_features = num_features
self.mean = np.mean(data, axis=0)
self.model.train((data - self.mean).astype('float32'))
self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self._num_samples)
self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
self.is_trained = True

@require_train
Expand Down
13 changes: 9 additions & 4 deletions tests/test_exec_encoder_numeric_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def _test_encoding_results(self, encoder):
encoded_data = encoder.encode(test_data)
self.assertEqual(encoded_data.shape, (test_data.shape[0], self.output_dim))
self.assertIs(type(encoded_data), np.ndarray)
self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
if hasattr(encoder, 'model_abspath'):
self.add_tmpfile(encoder.model_abspath)

def test_encoding_results(self):
for m in self.model_list:
Expand All @@ -40,9 +43,9 @@ def _test_save_and_load(self, encoder):
encoded_data_test = encoder_loaded.encode(test_data)
np.testing.assert_array_equal(
encoded_data_test, encoded_data_control)
self.add_tmpfile(
encoder.config_abspath, encoder.save_abspath, encoder_loaded.config_abspath, encoder_loaded.save_abspath,
encoder.encoder_abspath)
self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
if hasattr(encoder, 'model_abspath'):
self.add_tmpfile(encoder.model_abspath)

def test_save_and_load(self):
for m in self.model_list:
Expand All @@ -57,7 +60,9 @@ def _test_save_and_load_config(self, encoder):
self.assertEqual(
encoder_loaded.output_dim,
encoder.output_dim)
self.add_tmpfile(encoder_loaded.config_abspath, encoder_loaded.save_abspath)
self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
if hasattr(encoder, 'model_abspath'):
self.add_tmpfile(encoder.model_abspath)

def test_save_and_load_config(self):
for m in self.model_list:
Expand Down

0 comments on commit 62b0d84

Please sign in to comment.