Merge pull request #284 from WenjieDu/dev

Updating docs, fixing CSDI&LOCF&MRNN, and adding the strategy to save all models
WenjieDu · Dec 25, 2023 · bae6dd4 · bae6dd4
2 parents 62f67e1 + 5ba8ac3
commit bae6dd4
Show file tree

Hide file tree

Showing 36 changed files with 446 additions and 266 deletions.
diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml
@@ -70,8 +70,8 @@ jobs:
             - name: Test with pytest
               run: |
                   python tests/global_test_config.py
-                  rm -rf tests/__pycache__ && rm -rf tests/*/__pycache__
-                  python -m pytest -rA tests/*/* -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc
+                  rm -rf testing_results && rm -rf tests/__pycache__ && rm -rf tests/*/__pycache__
+                  python -m pytest -rA tests/*/* -s -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc
 
             - name: Generate the LCOV report
               run: |

diff --git a/.github/workflows/testing_daily.yml b/.github/workflows/testing_daily.yml
@@ -39,14 +39,16 @@ jobs:
               run: |
                   # Run the tests separately here: running them all in a single `pytest` command on macOS causes a
                   # segmentation fault in test_clustering. The bug has not been tracked down yet, so this is a trade-off to avoid the segfault.
-                  python -m pytest -rA tests/classification/* -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/imputation/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/clustering/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/forecasting/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/optim/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/data/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/utils/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
-                  python -m pytest -rA tests/cli/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python tests/global_test_config.py
+                  rm -rf testing_results && rm -rf tests/__pycache__ && rm -rf tests/*/__pycache__
+                  python -m pytest -rA tests/classification/* -s -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/imputation/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/clustering/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/forecasting/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/optim/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/data/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/utils/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
+                  python -m pytest -rA tests/cli/* -s -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc
 
             - name: Generate the LCOV report
               run: |

diff --git a/README.md b/README.md
@@ -150,10 +150,10 @@ We present you a usage example of imputing missing values in time series with Py
 ``` python
 import numpy as np
 from sklearn.preprocessing import StandardScaler
-from pygrinder import mcar, masked_fill
+from pygrinder import mcar
 from pypots.data import load_specific_dataset
 from pypots.imputation import SAITS
-from pypots.utils.metrics import cal_mae
+from pypots.utils.metrics import calc_mae
 
 # Data preprocessing. Tedious, but PyPOTS can help.
 data = load_specific_dataset('physionet_2012')  # PyPOTS will automatically download and extract it.
@@ -162,17 +162,18 @@ num_samples = len(X['RecordID'].unique())
 X = X.drop(['RecordID', 'Time'], axis = 1)
 X = StandardScaler().fit_transform(X.to_numpy())
 X = X.reshape(num_samples, 48, -1)
-X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth
-X = masked_fill(X, 1 - missing_mask, np.nan)
-dataset = {"X": X}
-print(dataset["X"].shape)  # (11988, 48, 37), 11988 samples, 48 time steps, 37 features
+X_ori = X  # keep X_ori for validation
+X = mcar(X, 0.1)  # randomly hold out 10% observed values as ground truth
+dataset = {"X": X}  # X for model input
+print(X.shape)  # (11988, 48, 37), 11988 samples, 48 time steps, 37 features
 
 # Model training. This is PyPOTS showtime.
 saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128, n_heads=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
 # Here I use the whole dataset as the training set because the ground truth is not visible to the model. You can also split it into train/val/test sets.
 saits.fit(dataset)
 imputation = saits.impute(dataset)  # impute the originally-missing values and artificially-missing values
-mae = cal_mae(imputation, X_intact, indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
+indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
+mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
 ```
 </details>
 

diff --git a/docs/examples.rst b/docs/examples.rst
@@ -22,10 +22,10 @@ You can also find a simple and quick-start tutorial notebook on Google Colab wit
 
     import numpy as np
     from sklearn.preprocessing import StandardScaler
-    from pygrinder import mcar, masked_fill
+    from pygrinder import mcar
     from pypots.data import load_specific_dataset
     from pypots.imputation import SAITS
-    from pypots.utils.metrics import cal_mae
+    from pypots.utils.metrics import calc_mae
 
     # Data preprocessing. Tedious, but PyPOTS can help. 🤓
     data = load_specific_dataset('physionet_2012')  # PyPOTS will automatically download and extract it.
@@ -34,10 +34,10 @@ You can also find a simple and quick-start tutorial notebook on Google Colab wit
     X = X.drop(['RecordID', 'Time'], axis = 1)
     X = StandardScaler().fit_transform(X.to_numpy())
     X = X.reshape(num_samples, 48, -1)
-    X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth
-    X = masked_fill(X, 1 - missing_mask, np.nan)
-    dataset = {"X": X}
-    print(dataset["X"].shape)  # (11988, 48, 37), 11988 samples, 48 time steps, 37 features
+    X_ori = X  # keep X_ori for validation
+    X = mcar(X, 0.1)  # randomly hold out 10% observed values as ground truth
+    dataset = {"X": X}  # X for model input
+    print(X.shape)  # (11988, 48, 37), 11988 samples, 48 time steps, 37 features
 
     # initialize the model
     saits = SAITS(
@@ -60,7 +60,8 @@ You can also find a simple and quick-start tutorial notebook on Google Colab wit
     # impute the originally-missing values and artificially-missing values
     imputation = saits.impute(dataset)
     # calculate mean absolute error on the ground truth (artificially-missing values)
-    mae = cal_mae(imputation, X_intact, indicating_mask)
+    indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
+    mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
 
     # the best model has been already saved, but you can still manually save it with function save_model() as below
     saits.save_model(saving_dir="examples/saits",file_name="manually_saved_saits_model")

diff --git a/docs/pypots.modules.rst b/docs/pypots.modules.rst
diff --git a/docs/pypots.nn.rst b/docs/pypots.nn.rst
@@ -0,0 +1,21 @@
+pypots.nn package
+======================
+
+pypots.nn.functional
+--------------------
+
+.. automodule:: pypots.nn.functional
+   :members:
+
+
+pypots.nn.modules.rnn
+---------------------
+
+.. automodule:: pypots.nn.modules.rnn
+   :members:
+
+pypots.nn.modules.transformer
+-----------------------------
+
+.. automodule:: pypots.nn.modules.transformer
+   :members:
diff --git a/docs/pypots.rst b/docs/pypots.rst
@@ -11,7 +11,7 @@ Subpackages
    pypots.classification
    pypots.clustering
    pypots.forecasting
-   pypots.modules
+   pypots.nn
    pypots.optim
    pypots.data
    pypots.utils
diff --git a/pypots/base.py b/pypots/base.py
@@ -36,11 +36,12 @@ class BaseModel(ABC):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
     Attributes
     ----------
@@ -64,7 +65,7 @@ def __init__(
         saving_path: str = None,
         model_saving_strategy: Optional[str] = "best",
     ):
-        saving_strategies = [None, "best", "better"]
+        saving_strategies = [None, "best", "better", "all"]
         assert (
             model_saving_strategy in saving_strategies
         ), f"saving_strategy must be one of {saving_strategies}, but got f{model_saving_strategy}."
@@ -132,6 +133,8 @@ def _setup_device(self, device: Union[None, str, torch.device, list]) -> None:
                     f"device should be str/torch.device/a list containing str or torch.device, but got {type(device)}"
                 )
 
+            logger.info(f"Using the given device: {self.device}")
+
         # check CUDA availability if using CUDA
         if (isinstance(self.device, list) and "cuda" in self.device[0].type) or (
             isinstance(self.device, torch.device) and "cuda" in self.device.type
@@ -237,7 +240,9 @@ def _auto_save_model_if_necessary(
         if self.saving_path is not None and self.model_saving_strategy is not None:
             name = self.__class__.__name__ if saving_name is None else saving_name
             saving_path = os.path.join(self.saving_path, name)
-            if not training_finished and self.model_saving_strategy == "better":
+            if self.model_saving_strategy == "all":
+                self.save(saving_path)
+            elif not training_finished and self.model_saving_strategy == "better":
                 self.save(saving_path)
             elif training_finished and self.model_saving_strategy == "best":
                 self.save(saving_path)
@@ -287,7 +292,7 @@ def save(
                 torch.save(self.model.module, saving_path)
             else:
                 torch.save(self.model, saving_path)
-            logger.info(f"Saved the model to {saving_path}.")
+            logger.info(f"Saved the model to {saving_path}")
         except Exception as e:
             raise RuntimeError(
                 f'Failed to save the model to "{saving_path}" because of the below error! \n{e}'
@@ -323,7 +328,7 @@ def load(self, path: str) -> None:
                 self.model = loaded_model.model
         except Exception as e:
             raise e
-        logger.info(f"Model loaded successfully from {path}.")
+        logger.info(f"Model loaded successfully from {path}")
 
     def save_model(
         self,
@@ -475,11 +480,12 @@ class BaseNNModel(BaseModel):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
 
     Attributes
@@ -543,7 +549,7 @@ def _print_model_size(self) -> None:
         """Print the number of trainable parameters in the initialized NN model."""
         num_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
         logger.info(
-            f"A {self.__class__.__name__} model initialized with the given hyperparameters, "
+            f"{self.__class__.__name__} initialized with the given hyperparameters, "
             f"the number of trainable parameters: {num_params:,}"
         )
 

diff --git a/pypots/classification/base.py b/pypots/classification/base.py
@@ -44,11 +44,12 @@ class BaseClassifier(BaseModel):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
     """
 
@@ -172,11 +173,12 @@ class BaseNNClassifier(BaseNNModel):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
 
     Notes
@@ -278,7 +280,7 @@ def _train_model(
 
         try:
             training_step = 0
-            for epoch in range(self.epochs):
+            for epoch in range(1, self.epochs + 1):
                 self.model.train()
                 epoch_train_loss_collector = []
                 for idx, data in enumerate(training_loader):
@@ -318,13 +320,15 @@ def _train_model(
                         self._save_log_into_tb_file(epoch, "validating", val_loss_dict)
 
                     logger.info(
-                        f"Epoch {epoch} - "
+                        f"Epoch {epoch:03d} - "
                         f"training loss: {mean_train_loss:.4f}, "
                         f"validating loss: {mean_val_loss:.4f}"
                     )
                     mean_loss = mean_val_loss
                 else:
-                    logger.info(f"Epoch {epoch} - training loss: {mean_train_loss:.4f}")
+                    logger.info(
+                        f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}"
+                    )
                     mean_loss = mean_train_loss
 
                 if np.isnan(mean_loss):

diff --git a/pypots/classification/brits/model.py b/pypots/classification/brits/model.py
@@ -83,11 +83,12 @@ class BRITS(BaseNNClassifier):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
     References
     ----------

diff --git a/pypots/classification/grud/model.py b/pypots/classification/grud/model.py
@@ -73,11 +73,12 @@ class GRUD(BaseNNClassifier):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
     References
     ----------

diff --git a/pypots/classification/raindrop/model.py b/pypots/classification/raindrop/model.py
@@ -99,11 +99,12 @@ class Raindrop(BaseNNClassifier):
         training into a tensorboard file). Will not save if not given.
 
     model_saving_strategy :
-        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
         No model will be saved when it is set as None.
         The "best" strategy will only automatically save the best model after the training finished.
         The "better" strategy will automatically save the model during training whenever the model performs
         better than in previous epochs.
+        The "all" strategy will save the model after every training epoch.
 
     References
     ----------

diff --git a/pypots/cli/tuning.py b/pypots/cli/tuning.py
@@ -219,7 +219,7 @@ def run(self):
                 train_set, val_set = self._train_set, self._val_set
             else:
                 logger.info(
-                    f"lazy loading {self._lazy_load}, loading all data from file..."
+                    "Option lazy_load is set as False, hence loading all data from file..."
                 )
                 train_set = load_dict_from_h5(self._train_set)
                 val_set = load_dict_from_h5(self._val_set)