From a2e26e48b295e30c345d70f61c4756740f5c4499 Mon Sep 17 00:00:00 2001 From: Nan Wang Date: Wed, 1 Apr 2020 18:02:12 +0800 Subject: [PATCH] feat(encoder): remove the PCAEncoder --- extra-requirements.txt | 2 - jina/executors/encoders/numeric/pca.py | 116 ++----------------------- tests/test_exec_encoder_numeric_pca.py | 24 +++-- 3 files changed, 16 insertions(+), 126 deletions(-) diff --git a/extra-requirements.txt b/extra-requirements.txt index 8ba176b84db21..c5ff972b6a0eb 100644 --- a/extra-requirements.txt +++ b/extra-requirements.txt @@ -31,5 +31,3 @@ onnx: framework, py37 onnxruntime: framework, py37 annoy: index sklearn: framework -faiss-cpu: numeric -faiss-gpu: numeric, gpu diff --git a/jina/executors/encoders/numeric/pca.py b/jina/executors/encoders/numeric/pca.py index 21cc6571bdc4f..841da5a0d0dc4 100644 --- a/jina/executors/encoders/numeric/pca.py +++ b/jina/executors/encoders/numeric/pca.py @@ -4,36 +4,8 @@ from .. import BaseNumericEncoder -from sklearn.decomposition import PCA - -class _BasePCAEncoder(BaseNumericEncoder): - """Base class for PCA methods. - - Warning: This class should not be used directly. - Use derived classes instead. - """ - def __init__(self, - output_dim: int, - num_features: int = None, - whiten: bool = False, - *args, - **kwargs): - """ - - :param output_dim: the output size. - :param num_features: the number of input features. If ``num_features`` is None, then ``num_features`` is - inferred from the data - :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed. - """ - super().__init__(*args, **kwargs) - self.output_dim = output_dim - self.whiten = whiten - self.num_features = num_features - self.is_trained = False - - -class IncrementalPCAEncoder(_BasePCAEncoder): +class IncrementalPCAEncoder(BaseNumericEncoder): """ :class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`. @@ -54,7 +26,11 @@ def __init__(self, inferred from the data :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed. """ - super().__init__(output_dim, num_features, whiten, *args, **kwargs) + super().__init__(*args, **kwargs) + self.output_dim = output_dim + self.whiten = whiten + self.num_features = num_features + self.is_trained = False self.model = None def post_init(self): @@ -85,83 +61,3 @@ def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray': """ _, num_features = data.shape return self.model.transform(data) - - -class PCAEncoder(_BasePCAEncoder): - """ - :class:`PCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`. - - .. note:: - :class:`PCAEncoder` must be trained before calling ``encode()``. This encoder can NOT be trained in the batch mode. - """ - def __init__(self, - output_dim: int, - num_features: int, - whiten: bool = False, - model_filename: str = 'pca.bin', - *args, - **kwargs): - """ - - :param output_dim: the output size. - :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed. - :param num_features: the number of input features. If ``num_features`` is None, then ``num_features`` is - inferred from the data - :param encoder_abspath: the absolute saving path of the encoder. If a valid path is given, the encoder will be - loaded from the given path. - """ - super().__init__(output_dim, num_features, whiten, *args, **kwargs) - self.model_filename = model_filename - self.mean = None - self.num_samples = None - - def __getstate__(self): - if os.path.exists(self.model_abspath): - self.logger.warning( - 'the existed model file will be overrided: {}'.format(self.model_abspath)) - import faiss - faiss.write_VectorTransform(self.model, self.model_abspath) - self.logger.info( - 'the model is saved at: {}'.format(self.model_abspath)) - return super().__getstate__() - - @staticmethod - def _calc_std(data, n_samples): - return np.sqrt(data ** 2 / (n_samples - 1)) - - @property - def model_abspath(self) -> str: - return self.get_file_from_workspace(self.model_filename) - - def post_init(self): - self.model = None - import faiss - if os.path.exists(self.model_abspath): - self.model = faiss.read_VectorTransform(self.model_abspath) - self.std = self._calc_std( - faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples) - self.logger.info('load existing model from {}'.format(self.model_abspath)) - else: - self.model = faiss.PCAMatrix(self.num_features, self.output_dim) - - def train(self, data: 'np.ndarray', *args, **kwargs): - import faiss - self.num_samples, num_features = data.shape - if not self.num_features: - self.num_features = num_features - self.mean = np.mean(data, axis=0) - self.model.train((data - self.mean).astype('float32')) - self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples) - self.is_trained = True - - @require_train - @batching - def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray': - """ - :param data: a `B x T` numpy ``ndarray``, `B` is the size of the batch - :return: a `B x D` numpy ``ndarray`` - """ - output = self.model.apply_py((data - self.mean).astype('float32')) - if self.whiten: - output /= self.std - return output diff --git a/tests/test_exec_encoder_numeric_pca.py b/tests/test_exec_encoder_numeric_pca.py index ef526d86832ae..cd843261d1556 100644 --- a/tests/test_exec_encoder_numeric_pca.py +++ b/tests/test_exec_encoder_numeric_pca.py @@ -4,14 +4,13 @@ import os from tests import JinaTestCase -import jina.executors.encoders.numeric.pca as pca +from jina.executors.encoders.numeric.pca import IncrementalPCAEncoder from jina.executors import BaseExecutor class MyTestCase(JinaTestCase): num_features = 28 output_dim = 2 - model_list = ('IncrementalPCAEncoder', 'PCAEncoder') def _test_encoding_results(self, encoder): train_data = np.random.rand(1000, self.num_features) @@ -26,10 +25,9 @@ def _test_encoding_results(self, encoder): self.add_tmpfile(encoder.model_abspath) def test_encoding_results(self): - for m in self.model_list: - encoder = getattr(pca, m)( - output_dim=self.output_dim, whiten=True, num_features=self.num_features) - self._test_encoding_results(encoder) + encoder = IncrementalPCAEncoder( + output_dim=self.output_dim, whiten=True, num_features=self.num_features) + self._test_encoding_results(encoder) def _test_save_and_load(self, encoder): train_data = np.random.rand(1000, self.num_features) @@ -48,10 +46,9 @@ def _test_save_and_load(self, encoder): self.add_tmpfile(encoder.model_abspath) def test_save_and_load(self): - for m in self.model_list: - encoder = getattr(pca, m)( - output_dim=self.output_dim, whiten=True, num_features=self.num_features) - self._test_save_and_load(encoder) + encoder = IncrementalPCAEncoder( + output_dim=self.output_dim, whiten=True, num_features=self.num_features) + self._test_save_and_load(encoder) def _test_save_and_load_config(self, encoder): encoder.save_config() @@ -65,10 +62,9 @@ def _test_save_and_load_config(self, encoder): self.add_tmpfile(encoder.model_abspath) def test_save_and_load_config(self): - for m in self.model_list: - encoder = getattr(pca, m)( - output_dim=self.output_dim, whiten=True, num_features=self.num_features) - self._test_save_and_load_config(encoder) + encoder = IncrementalPCAEncoder( + output_dim=self.output_dim, whiten=True, num_features=self.num_features) + self._test_save_and_load_config(encoder) if __name__ == '__main__':