Merge pull request #73 from predict-idlab/processing_logs
🎍 improving loggers as described in #66
jvdd authored Jan 18, 2023
2 parents b602a71 + 7a4b36a commit 8ac5086
Showing 5 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tsflex"
-version = "0.2.3.7" # Do not forget to update the __init__.py __version__ variable
+version = "0.3.0" # Do not forget to update the __init__.py __version__ variable
 description = "Toolkit for flexible processing & feature extraction on time-series data"
 authors = ["Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"]
 readme = "README.md"
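The inline comment above is a manual reminder to keep the pyproject.toml version and tsflex.__version__ in sync (both are bumped to 0.3.0 in this commit). A minimal sketch of a test that would automate that check — the test is not part of this commit, and it assumes Python >= 3.11 (for the stdlib tomllib) and a working directory at the repository root:

import tomllib  # stdlib TOML parser (Python >= 3.11)

import tsflex


def test_version_in_sync():
    # Both version strings must be bumped together; see the comment in pyproject.toml.
    with open("pyproject.toml", "rb") as f:
        pyproject = tomllib.load(f)
    assert pyproject["tool"]["poetry"]["version"] == tsflex.__version__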
5 changes: 2 additions & 3 deletions tests/test_processing_logging.py
@@ -3,7 +3,6 @@
 __author__ = "Jeroen Van Der Donckt, Emiel Deprost, Jonas Van Der Donckt"

 import os
-import pytest
 import warnings
 import pandas as pd
 import numpy as np
@@ -43,15 +42,15 @@ def drop_nans(df: pd.DataFrame) -> pd.DataFrame:
     assert os.path.exists(logging_file_path)
     logging_df = get_processor_logs(logging_file_path)

-    assert all(logging_df.columns.values == ['log_time', 'function', 'series_names', 'duration'])
+    assert all(logging_df.columns.values == ['log_time', 'function', 'series_names', 'output_names', 'duration', 'duration %'])

     assert len(logging_df) == len(series_pipeline.processing_steps)
     assert logging_df.select_dtypes(include=[np.datetime64]).columns.values == ['log_time']
     assert logging_df.select_dtypes(include=[np.timedelta64]).columns.values == ['duration']

     assert all(logging_df["function"].values == [step.name for step in series_pipeline.processing_steps])
     assert all(logging_df["series_names"].values == ["(TMP,), (ACC_x,)", "(TMP,)"])
+    assert all(logging_df["output_names"].values == ["TMP, ACC_x", "TMP"])

 def test_file_warning_processing_logging(dummy_data, logging_file_path):
     def interpolate(series: pd.Series) -> pd.Series:
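For context, the flow these tests exercise, as a hedged usage sketch — the import paths and constructor arguments follow the tsflex documentation but are not verified against this exact version, and the file name processing.log plus the dummy data are made up:

import pandas as pd

from tsflex.processing import SeriesPipeline, SeriesProcessor, get_processor_logs


def interpolate(series: pd.Series) -> pd.Series:
    return series.interpolate()


data = pd.DataFrame(
    {"TMP": [1.0, None, 3.0]},
    index=pd.date_range("2023-01-01", periods=3, freq="s"),
)

pipeline = SeriesPipeline([SeriesProcessor(interpolate, series_names="TMP")])
pipeline.process(data, logging_file_path="processing.log")

logs = get_processor_logs("processing.log")
# After this PR, `logs` has the columns asserted above:
# ['log_time', 'function', 'series_names', 'output_names', 'duration', 'duration %']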
2 changes: 1 addition & 1 deletion tsflex/__init__.py
@@ -9,7 +9,7 @@

 __docformat__ = 'numpy'
 __author__ = "Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"
-__version__ = '0.2.3.7'
+__version__ = '0.3.0'
 __pdoc__ = {
     # do not show the utils module
     'tsflex.utils': False,
27 changes: 15 additions & 12 deletions tsflex/processing/logger.py
@@ -6,7 +6,7 @@
"""

__author__ = "Jeroen Van Der Donckt"
__author__ = "Jeroen Van Der Donckt, Jonas Van Der Donckt"

import logging
import pandas as pd
@@ -28,39 +28,41 @@ def _parse_message(message: str) -> list:
"""Parse the message of the logged info."""
regex = r"\[(.*?)\]"
matches = re.findall(regex, remove_inner_brackets(message))
assert len(matches) == 3
assert len(matches) == 4
func = matches[0]
series_names = matches[1].replace("'", "")
duration_s = float(matches[2].rstrip(" seconds"))
return [func, series_names, duration_s]
output_names = matches[2].replace("'", "")
duration_s = float(matches[3].rstrip(" seconds"))
return [func, series_names, output_names, duration_s]


 def _parse_logging_execution_to_df(logging_file_path: str) -> pd.DataFrame:
-    """Parse the logged messages into a dataframe that contains execution info.
+    """Parse the logged messages into a DataFrame that contains execution info.

     Parameters
     ----------
     logging_file_path: str
         The file path where the logged messages are stored. This is the file path that
-        is passed to the ``SeriesPipeline`` its process method.
+        is passed to the ``SeriesPipeline``'s ``process`` method.

     Returns
     -------
     pd.DataFrame
-        A DataFrame with the processor its method, series names and calculation
-        duration.
+        A DataFrame with each processor's method, series names, output names, and
+        calculation duration.

     Note
     ----
-    This function only works when the ``logging_file_path`` used in a
-    ``SeriesPipeline`` its ``process`` method is passed.
+    This function only works when it is given the ``logging_file_path`` that was
+    passed to the ``SeriesPipeline``'s ``process`` method.

     """
     df = logging_file_to_df(logging_file_path)
-    df[["function", "series_names", "duration"]] = pd.DataFrame(
+    df[["function", "series_names", "output_names", "duration"]] = pd.DataFrame(
         list(df["message"].apply(_parse_message)),
         index=df.index,
     )
+    df["duration %"] = (100 * (df['duration'] / df["duration"].sum())).round(2)
     return df.drop(columns=["name", "log_level", "message"])
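To make the new parsing concrete, a worked example with invented values — the message mirrors the updated logger.info format in series_processor.py (the last file in this diff):

# The regex finds four bracketed groups (previously three) after
# remove_inner_brackets has stripped any nested [...] pairs:
msg = (
    "Finished function [drop_nans] on [('TMP',), ('ACC_x',)] "
    "with output ['TMP', 'ACC_x'] in [0.004 seconds]!"
)
# _parse_message(msg) then returns:
#   ['drop_nans', '(TMP,), (ACC_x,)', 'TMP, ACC_x', 0.004]

The added "duration %" column expresses each step's duration as a share of the pipeline total: two steps taking 0.004 s and 0.001 s get 80.0 and 20.0 respectively.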


@@ -76,7 +78,8 @@ def get_processor_logs(logging_file_path: str) -> pd.DataFrame:
     Returns
     -------
     pd.DataFrame
-        A DataFrame containing each processor its duration and its series names.
+        A DataFrame containing each processor's series names, output names, and
+        duration.

     """
     df = _parse_logging_execution_to_df(logging_file_path)
4 changes: 2 additions & 2 deletions tsflex/processing/series_processor.py
@@ -216,8 +216,8 @@ def get_series_dict(keys: Tuple[str, ...]):

         elapsed = time.time() - t_start
         logger.info(
-            f"Finished function [{self.name}] on {self.series_names} in "
-            f"[{elapsed} seconds]!"
+            f"Finished function [{self.name}] on {self.series_names} with output "
+            f"{list(processed_output.keys())} in [{elapsed} seconds]!"
         )

         return processed_output
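On the producing side: with self.name = "interpolate", self.series_names = [('TMP',)], and a single output series, the new format would emit a log line like the following (elapsed value invented):

Finished function [interpolate] on [('TMP',)] with output ['TMP'] in [0.0012 seconds]!

The extra "with output [...]" group is what raises the _parse_message match count in logger.py from three to four.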
