Skip to content

Commit

Permalink
feat(encoder): remove the PCAEncoder
Browse files Browse the repository at this point in the history
  • Loading branch information
nan-wang committed Apr 1, 2020
1 parent 62b0d84 commit a2e26e4
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 126 deletions.
2 changes: 0 additions & 2 deletions extra-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,3 @@ onnx: framework, py37
onnxruntime: framework, py37
annoy: index
sklearn: framework
faiss-cpu: numeric
faiss-gpu: numeric, gpu
116 changes: 6 additions & 110 deletions jina/executors/encoders/numeric/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,8 @@

from .. import BaseNumericEncoder

from sklearn.decomposition import PCA


class _BasePCAEncoder(BaseNumericEncoder):
    """Common constructor state shared by the PCA encoders.

    .. warning::
        This class should not be used directly. Use derived classes instead.
    """

    def __init__(self, output_dim: int, num_features: int = None, whiten: bool = False, *args, **kwargs):
        """
        :param output_dim: the output size.
        :param num_features: the number of input features. If ``num_features`` is None, then ``num_features``
            is inferred from the data
        :param whiten: If whiten is false, the data is already considered to be whitened, and no whitening
            is performed.
        """
        super().__init__(*args, **kwargs)
        # Store the configuration exactly as given; no validation happens here.
        self.output_dim, self.whiten = output_dim, whiten
        self.num_features = num_features
        # A freshly constructed encoder has not been trained yet.
        self.is_trained = False


class IncrementalPCAEncoder(_BasePCAEncoder):
class IncrementalPCAEncoder(BaseNumericEncoder):
"""
:class:`IncrementalPCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.
Expand All @@ -54,7 +26,11 @@ def __init__(self,
inferred from the data
:param whiten: If whiten is false, the data is already considered to be whitened, and no whitening is performed.
"""
super().__init__(output_dim, num_features, whiten, *args, **kwargs)
super().__init__(*args, **kwargs)
self.output_dim = output_dim
self.whiten = whiten
self.num_features = num_features
self.is_trained = False
self.model = None

def post_init(self):
Expand Down Expand Up @@ -85,83 +61,3 @@ def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
"""
_, num_features = data.shape
return self.model.transform(data)


class PCAEncoder(_BasePCAEncoder):
    """
    :class:`PCAEncoder` encodes data from an ndarray in size `B x T` into an ndarray in size `B x D`.

    .. note::
        :class:`PCAEncoder` must be trained before calling ``encode()``. This encoder can NOT be trained in the
        batch mode.
    """

    def __init__(self,
                 output_dim: int,
                 num_features: int,
                 whiten: bool = False,
                 model_filename: str = 'pca.bin',
                 *args,
                 **kwargs):
        """
        :param output_dim: the output size.
        :param num_features: the number of input features. ``train()`` fills it in from the data when it is
            falsy.
        :param whiten: if true, ``encode()`` divides its output by ``self.std``; if false, the data is
            considered to be already whitened and no whitening is performed.
        :param model_filename: the file name of the faiss model. It is resolved with
            ``get_file_from_workspace``; the model is written to that path when the encoder state is
            dumped and loaded from it in ``post_init`` if the file exists.
        """
        super().__init__(output_dim, num_features, whiten, *args, **kwargs)
        self.model_filename = model_filename
        # Both are only known after ``train()`` has seen the data.
        self.mean = None
        self.num_samples = None

    def __getstate__(self):
        """Write the faiss model to ``model_abspath`` and return the parent's state."""
        if os.path.exists(self.model_abspath):
            self.logger.warning(
                'the existed model file will be overrided: {}'.format(self.model_abspath))
        import faiss
        faiss.write_VectorTransform(self.model, self.model_abspath)
        self.logger.info(
            'the model is saved at: {}'.format(self.model_abspath))
        return super().__getstate__()

    @staticmethod
    def _calc_std(data, n_samples):
        """Return ``sqrt(data ** 2 / (n_samples - 1))`` element-wise."""
        return np.sqrt(data ** 2 / (n_samples - 1))

    @property
    def model_abspath(self) -> str:
        """The path of the model file as resolved by ``get_file_from_workspace``."""
        return self.get_file_from_workspace(self.model_filename)

    def post_init(self):
        """Load the faiss model from ``model_abspath`` if that file exists, otherwise create a new one."""
        self.model = None
        import faiss
        if os.path.exists(self.model_abspath):
            self.model = faiss.read_VectorTransform(self.model_abspath)
            # ``num_samples`` is None until ``train()`` has run (or a trained state has been restored).
            # Without it the scaling factors cannot be computed: ``None - 1`` would raise a TypeError.
            # ``train()`` sets ``self.std`` itself, so skipping here is safe for an untrained encoder.
            num_samples = getattr(self, 'num_samples', None)
            if num_samples is not None:
                self.std = self._calc_std(
                    faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], num_samples)
            self.logger.info('load existing model from {}'.format(self.model_abspath))
        else:
            self.model = faiss.PCAMatrix(self.num_features, self.output_dim)

    def train(self, data: 'np.ndarray', *args, **kwargs):
        """
        Fit the model on the mean-centered ``data`` and mark the encoder as trained.

        :param data: a two-dimensional numpy ``ndarray``; its shape is unpacked into the number of samples
            and the number of features
        """
        import faiss
        self.num_samples, num_features = data.shape
        # NOTE(review): the model was already built in ``post_init`` from the original ``num_features``;
        # updating the attribute here does not rebuild it -- confirm this is intended.
        if not self.num_features:
            self.num_features = num_features
        # The mean is kept so that ``encode()`` can center new data the same way.
        self.mean = np.mean(data, axis=0)
        self.model.train((data - self.mean).astype('float32'))
        self.std = self._calc_std(faiss.vector_to_array(self.model.eigenvalues)[:self.output_dim], self.num_samples)
        self.is_trained = True

    @require_train
    @batching
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        :param data: a `B x T` numpy ``ndarray``, `B` is the size of the batch
        :return: a `B x D` numpy ``ndarray``
        """
        # Center with the training mean before applying the model.
        output = self.model.apply_py((data - self.mean).astype('float32'))
        if self.whiten:
            output /= self.std
        return output
24 changes: 10 additions & 14 deletions tests/test_exec_encoder_numeric_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
import os

from tests import JinaTestCase
import jina.executors.encoders.numeric.pca as pca
from jina.executors.encoders.numeric.pca import IncrementalPCAEncoder
from jina.executors import BaseExecutor


class MyTestCase(JinaTestCase):
num_features = 28
output_dim = 2
model_list = ('IncrementalPCAEncoder', 'PCAEncoder')

def _test_encoding_results(self, encoder):
train_data = np.random.rand(1000, self.num_features)
Expand All @@ -26,10 +25,9 @@ def _test_encoding_results(self, encoder):
self.add_tmpfile(encoder.model_abspath)

def test_encoding_results(self):
for m in self.model_list:
encoder = getattr(pca, m)(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_encoding_results(encoder)
encoder = IncrementalPCAEncoder(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_encoding_results(encoder)

def _test_save_and_load(self, encoder):
train_data = np.random.rand(1000, self.num_features)
Expand All @@ -48,10 +46,9 @@ def _test_save_and_load(self, encoder):
self.add_tmpfile(encoder.model_abspath)

def test_save_and_load(self):
for m in self.model_list:
encoder = getattr(pca, m)(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_save_and_load(encoder)
encoder = IncrementalPCAEncoder(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_save_and_load(encoder)

def _test_save_and_load_config(self, encoder):
encoder.save_config()
Expand All @@ -65,10 +62,9 @@ def _test_save_and_load_config(self, encoder):
self.add_tmpfile(encoder.model_abspath)

def test_save_and_load_config(self):
for m in self.model_list:
encoder = getattr(pca, m)(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_save_and_load_config(encoder)
encoder = IncrementalPCAEncoder(
output_dim=self.output_dim, whiten=True, num_features=self.num_features)
self._test_save_and_load_config(encoder)


if __name__ == '__main__':
Expand Down

0 comments on commit a2e26e4

Please sign in to comment.