feat(encoder): fix code in response to review comments

jina-ai · Apr 1, 2020 · 62b0d84 · 62b0d84
1 parent 4448ff5
commit 62b0d84
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 55 deletions.
diff --git a/extra-requirements.txt b/extra-requirements.txt
@@ -30,6 +30,6 @@ torchvision:                framework
 onnx:                       framework, py37
 onnxruntime:                framework, py37
 annoy:                      index
-sklearn:                    numeric
+sklearn:                    framework
 faiss-cpu:                  numeric
 faiss-gpu:                  numeric, gpu
diff --git a/jina/executors/encoders/numeric/pca.py b/jina/executors/encoders/numeric/pca.py
@@ -6,19 +6,17 @@
 
 from sklearn.decomposition import PCA
 
-class IncrementalPCAEncoder(BaseNumericEncoder):
-    """
-    :class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
 
-    .. note::
-        :class:`IncrementalPCAEncoder` must be trained before calling ``encode()``. This encoder can be trained in an
-        incremental way.
+class _BasePCAEncoder(BaseNumericEncoder):
+    """Base class for PCA methods.
+
+    Warning: This class should not be used directly.
+    Use derived classes instead.
     """
     def __init__(self,
                  output_dim: int,
                  num_features: int = None,
                  whiten: bool = False,
-                 save_path: str = '',
                  *args,
                  **kwargs):
         """
@@ -27,24 +25,41 @@ def __init__(self,
         :param num_features: the number of input features.  If ``num_features`` is None, then ``num_features`` is
             inferred from the data
         :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
-        :param encoder_abspath: the absolute saving path of the encoder. If a valid path is given, the encoder will be
-            loaded from the given path.
         """
         super().__init__(*args, **kwargs)
         self.output_dim = output_dim
         self.whiten = whiten
         self.num_features = num_features
-        self.encoder_abspath = save_path
         self.is_trained = False
 
+
+class IncrementalPCAEncoder(_BasePCAEncoder):
+    """
+    :class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
+
+    .. note::
+        :class:`IncrementalPCAEncoder` must be trained before calling ``encode()``. This encoder can be trained in an
+        incremental way.
+    """
+    def __init__(self,
+                 output_dim: int,
+                 num_features: int = None,
+                 whiten: bool = False,
+                 *args,
+                 **kwargs):
+        """
+
+        :param output_dim: the output size.
+        :param num_features: the number of input features.  If ``num_features`` is None, then ``num_features`` is
+            inferred from the data
+        :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
+        """
+        super().__init__(output_dim, num_features, whiten, *args, **kwargs)
+        self.model = None
+
     def post_init(self):
         from sklearn.decomposition import IncrementalPCA
-        if os.path.exists(self.encoder_abspath):
-            import pickle
-            with open(self.encoder_abspath, 'rb') as f:
-                self.model = pickle.load(f)
-            self.logger.info('load existing model from {}'.format(self.encoder_abspath))
-        else:
+        if not self.model:
             self.model = IncrementalPCA(
                 n_components=self.output_dim,
                 whiten=self.whiten)
@@ -71,21 +86,8 @@ def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
         _, num_features = data.shape
         return self.model.transform(data)
 
-    def __getstate__(self):
-        if not self.encoder_abspath:
-            self.encoder_abspath = os.path.join(self.current_workspace, "pca.bin")
-        if os.path.exists(self.encoder_abspath):
-            self.logger.warning(
-                'the existed model file will be overrided: {}".format(save_path)')
-        import pickle
-        with open(self.encoder_abspath, 'wb') as f:
-            pickle.dump(self.model, f)
-        self.logger.info(
-            'the model is saved at: {}'.format(self.encoder_abspath))
-        return super().__getstate__()
 
-
-class PCAEncoder(BaseNumericEncoder):
+class PCAEncoder(_BasePCAEncoder):
     """
     :class:`PCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
 
@@ -96,7 +98,7 @@ def __init__(self,
                  output_dim: int,
                  num_features: int,
                  whiten: bool = False,
-                 save_path: str = '',
+                 model_filename: str = 'pca.bin',
                  *args,
                  **kwargs):
         """
@@ -108,50 +110,48 @@ def __init__(self,
         :param model_filename: the file name of the saved model, resolved to a path via ``get_file_from_workspace``.
             If a file already exists at that path, the model will be loaded from it.
         """
-        super().__init__(*args, **kwargs)
-        self.output_dim = output_dim
-        self.whiten = whiten
-        self.num_features = num_features
-        self.encoder_abspath = save_path
-        self.is_trained = False
+        super().__init__(output_dim, num_features, whiten, *args, **kwargs)
+        self.model_filename = model_filename
         self.mean = None
-        self._num_samples = None
+        self.num_samples = None
 
     def __getstate__(self):
-        if not self.encoder_abspath:
-            self.encoder_abspath = os.path.join(self.current_workspace, "pca.bin")
-        if os.path.exists(self.encoder_abspath):
+        if os.path.exists(self.model_abspath):
             self.logger.warning(
-                'the existed model file will be overrided: {}".format(save_path)')
+                'the existed model file will be overrided: {}'.format(self.model_abspath))
         import faiss
-        faiss.write_VectorTransform(self.model, self.encoder_abspath)
+        faiss.write_VectorTransform(self.model, self.model_abspath)
         self.logger.info(
-            'the model is saved at: {}'.format(self.encoder_abspath))
+            'the model is saved at: {}'.format(self.model_abspath))
         return super().__getstate__()
 
     @staticmethod
     def _calc_std(data, n_samples):
         return np.sqrt(data ** 2 / (n_samples - 1))
 
+    @property
+    def model_abspath(self) -> str:
+        return self.get_file_from_workspace(self.model_filename)
+
     def post_init(self):
         self.model = None
         import faiss
-        if os.path.exists(self.encoder_abspath):
-            self.model = faiss.read_VectorTransform(self.encoder_abspath)
+        if os.path.exists(self.model_abspath):
+            self.model = faiss.read_VectorTransform(self.model_abspath)
             self.std = self._calc_std(
-                faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self._num_samples)
-            self.logger.info('load existing model from {}'.format(self.encoder_abspath))
+                faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
+            self.logger.info('load existing model from {}'.format(self.model_abspath))
         else:
             self.model = faiss.PCAMatrix(self.num_features, self.output_dim)
 
     def train(self, data: 'np.ndarray', *args, **kwargs):
         import faiss
-        self._num_samples, num_features = data.shape
+        self.num_samples, num_features = data.shape
         if not self.num_features:
             self.num_features = num_features
         self.mean = np.mean(data, axis=0)
         self.model.train((data - self.mean).astype('float32'))
-        self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self._num_samples)
+        self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
         self.is_trained = True
 
     @require_train

diff --git a/tests/test_exec_encoder_numeric_pca.py b/tests/test_exec_encoder_numeric_pca.py
@@ -21,6 +21,9 @@ def _test_encoding_results(self, encoder):
         encoded_data = encoder.encode(test_data)
         self.assertEqual(encoded_data.shape, (test_data.shape[0], self.output_dim))
         self.assertIs(type(encoded_data), np.ndarray)
+        self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
+        if hasattr(encoder, 'model_abspath'):
+            self.add_tmpfile(encoder.model_abspath)
 
     def test_encoding_results(self):
         for m in self.model_list:
@@ -40,9 +43,9 @@ def _test_save_and_load(self, encoder):
         encoded_data_test = encoder_loaded.encode(test_data)
         np.testing.assert_array_equal(
             encoded_data_test, encoded_data_control)
-        self.add_tmpfile(
-            encoder.config_abspath, encoder.save_abspath, encoder_loaded.config_abspath, encoder_loaded.save_abspath,
-            encoder.encoder_abspath)
+        self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
+        if hasattr(encoder, 'model_abspath'):
+            self.add_tmpfile(encoder.model_abspath)
 
     def test_save_and_load(self):
         for m in self.model_list:
@@ -57,7 +60,9 @@ def _test_save_and_load_config(self, encoder):
         self.assertEqual(
             encoder_loaded.output_dim,
             encoder.output_dim)
-        self.add_tmpfile(encoder_loaded.config_abspath, encoder_loaded.save_abspath)
+        self.add_tmpfile(encoder.config_abspath, encoder.save_abspath)
+        if hasattr(encoder, 'model_abspath'):
+            self.add_tmpfile(encoder.model_abspath)
 
     def test_save_and_load_config(self):
         for m in self.model_list: