From a2e26e48b295e30c345d70f61c4756740f5c4499 Mon Sep 17 00:00:00 2001
From: Nan Wang <nan.wang@jina.ai>
Date: Wed, 1 Apr 2020 18:02:12 +0800
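Subject: [PATCH] feat(encoder): remove the PCAEncoder

Remove the faiss-based PCAEncoder and drop the faiss-cpu / faiss-gpu
entries from extra-requirements.txt. The _BasePCAEncoder base class is
folded into IncrementalPCAEncoder, which now derives directly from
BaseNumericEncoder, and the tests cover IncrementalPCAEncoder only.
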
---
 extra-requirements.txt                 |   2 -
 jina/executors/encoders/numeric/pca.py | 116 ++-----------------------
 tests/test_exec_encoder_numeric_pca.py |  24 +++--
 3 files changed, 16 insertions(+), 126 deletions(-)

diff --git a/extra-requirements.txt b/extra-requirements.txt
index 8ba176b84db21..c5ff972b6a0eb 100644
--- a/extra-requirements.txt
+++ b/extra-requirements.txt
@@ -31,5 +31,3 @@ onnx:                       framework, py37
 onnxruntime:                framework, py37
 annoy:                      index
 sklearn:                    framework
-faiss-cpu:                  numeric
-faiss-gpu:                  numeric, gpu
diff --git a/jina/executors/encoders/numeric/pca.py b/jina/executors/encoders/numeric/pca.py
index 21cc6571bdc4f..841da5a0d0dc4 100644
--- a/jina/executors/encoders/numeric/pca.py
+++ b/jina/executors/encoders/numeric/pca.py
@@ -4,36 +4,8 @@
 
 from .. import BaseNumericEncoder
 
-from sklearn.decomposition import PCA
 
-
-class _BasePCAEncoder(BaseNumericEncoder):
-    """Base class for PCA methods.
-
-    Warning: This class should not be used directly.
-    Use derived classes instead.
-    """
-    def __init__(self,
-                 output_dim: int,
-                 num_features: int = None,
-                 whiten: bool = False,
-                 *args,
-                 **kwargs):
-        """
-
-        :param output_dim: the output size.
-        :param num_features: the number of input features.  If ``num_features`` is None, then ``num_features`` is
-            inferred from the data
-        :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
-        """
-        super().__init__(*args, **kwargs)
-        self.output_dim = output_dim
-        self.whiten = whiten
-        self.num_features = num_features
-        self.is_trained = False
-
-
-class IncrementalPCAEncoder(_BasePCAEncoder):
+class IncrementalPCAEncoder(BaseNumericEncoder):
     """
     :class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
 
@@ -54,7 +26,11 @@ def __init__(self,
             inferred from the data
         :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
         """
-        super().__init__(output_dim, num_features, whiten, *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        self.output_dim = output_dim
+        self.whiten = whiten
+        self.num_features = num_features
+        self.is_trained = False
         self.model = None
 
     def post_init(self):
@@ -85,83 +61,3 @@ def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
         """
         _, num_features = data.shape
         return self.model.transform(data)
-
-
-class PCAEncoder(_BasePCAEncoder):
-    """
-    :class:`PCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
-
-    .. note::
-        :class:`PCAEncoder` must be trained before calling ``encode()``. This encoder can NOT be trained in the batch mode.
-    """
-    def __init__(self,
-                 output_dim: int,
-                 num_features: int,
-                 whiten: bool = False,
-                 model_filename: str = 'pca.bin',
-                 *args,
-                 **kwargs):
-        """
-
-        :param output_dim: the output size.
-        :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
-        :param num_features: the number of input features.  If ``num_features`` is None, then ``num_features`` is
-            inferred from the data
-        :param encoder_abspath: the absolute saving path of the encoder. If a valid path is given, the encoder will be
-            loaded from the given path.
-        """
-        super().__init__(output_dim, num_features, whiten, *args, **kwargs)
-        self.model_filename = model_filename
-        self.mean = None
-        self.num_samples = None
-
-    def __getstate__(self):
-        if os.path.exists(self.model_abspath):
-            self.logger.warning(
-                'the existed model file will be overrided: {}'.format(self.model_abspath))
-        import faiss
-        faiss.write_VectorTransform(self.model, self.model_abspath)
-        self.logger.info(
-            'the model is saved at: {}'.format(self.model_abspath))
-        return super().__getstate__()
-
-    @staticmethod
-    def _calc_std(data, n_samples):
-        return np.sqrt(data ** 2 / (n_samples - 1))
-
-    @property
-    def model_abspath(self) -> str:
-        return self.get_file_from_workspace(self.model_filename)
-
-    def post_init(self):
-        self.model = None
-        import faiss
-        if os.path.exists(self.model_abspath):
-            self.model = faiss.read_VectorTransform(self.model_abspath)
-            self.std = self._calc_std(
-                faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
-            self.logger.info('load existing model from {}'.format(self.model_abspath))
-        else:
-            self.model = faiss.PCAMatrix(self.num_features, self.output_dim)
-
-    def train(self, data: 'np.ndarray', *args, **kwargs):
-        import faiss
-        self.num_samples, num_features = data.shape
-        if not self.num_features:
-            self.num_features = num_features
-        self.mean = np.mean(data, axis=0)
-        self.model.train((data - self.mean).astype('float32'))
-        self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
-        self.is_trained = True
-
-    @require_train
-    @batching
-    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
-        """
-        :param data: a `B x T` numpy ``ndarray``, `B` is the size of the batch
-        :return: a `B x D` numpy ``ndarray``
-        """
-        output = self.model.apply_py((data - self.mean).astype('float32'))
-        if self.whiten:
-            output /= self.std
-        return output
diff --git a/tests/test_exec_encoder_numeric_pca.py b/tests/test_exec_encoder_numeric_pca.py
index ef526d86832ae..cd843261d1556 100644
--- a/tests/test_exec_encoder_numeric_pca.py
+++ b/tests/test_exec_encoder_numeric_pca.py
@@ -4,14 +4,13 @@
 import os
 
 from tests import JinaTestCase
-import jina.executors.encoders.numeric.pca as pca
+from jina.executors.encoders.numeric.pca import IncrementalPCAEncoder
 from jina.executors import BaseExecutor
 
 
 class MyTestCase(JinaTestCase):
     num_features = 28
     output_dim = 2
-    model_list = ('IncrementalPCAEncoder', 'PCAEncoder')
 
     def _test_encoding_results(self, encoder):
         train_data = np.random.rand(1000, self.num_features)
@@ -26,10 +25,9 @@ def _test_encoding_results(self, encoder):
             self.add_tmpfile(encoder.model_abspath)
 
     def test_encoding_results(self):
-        for m in self.model_list:
-            encoder = getattr(pca, m)(
-                output_dim=self.output_dim, whiten=True, num_features=self.num_features)
-            self._test_encoding_results(encoder)
+        encoder = IncrementalPCAEncoder(
+            output_dim=self.output_dim, whiten=True, num_features=self.num_features)
+        self._test_encoding_results(encoder)
 
     def _test_save_and_load(self, encoder):
         train_data = np.random.rand(1000, self.num_features)
@@ -48,10 +46,9 @@ def _test_save_and_load(self, encoder):
             self.add_tmpfile(encoder.model_abspath)
 
     def test_save_and_load(self):
-        for m in self.model_list:
-            encoder = getattr(pca, m)(
-                output_dim=self.output_dim, whiten=True, num_features=self.num_features)
-            self._test_save_and_load(encoder)
+        encoder = IncrementalPCAEncoder(
+            output_dim=self.output_dim, whiten=True, num_features=self.num_features)
+        self._test_save_and_load(encoder)
 
     def _test_save_and_load_config(self, encoder):
         encoder.save_config()
@@ -65,10 +62,9 @@ def _test_save_and_load_config(self, encoder):
             self.add_tmpfile(encoder.model_abspath)
 
     def test_save_and_load_config(self):
-        for m in self.model_list:
-            encoder = getattr(pca, m)(
-                output_dim=self.output_dim, whiten=True, num_features=self.num_features)
-            self._test_save_and_load_config(encoder)
+        encoder = IncrementalPCAEncoder(
+            output_dim=self.output_dim, whiten=True, num_features=self.num_features)
+        self._test_save_and_load_config(encoder)
 
 
 if __name__ == '__main__':