Merge pull request #73 from predict-idlab/processing_logs
🎍 improving loggers as described in #66
jvdd authored Jan 18, 2023
2 parents b602a71 + 7a4b36a commit 8ac5086
Showing 5 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tsflex"
-version = "0.2.3.7" # Do not forget to update the __init__.py __version__ variable
+version = "0.3.0" # Do not forget to update the __init__.py __version__ variable
 description = "Toolkit for flexible processing & feature extraction on time-series data"
 authors = ["Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"]
 readme = "README.md"
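The inline comment above is a manual reminder to keep the pyproject.toml version and tsflex.__version__ in sync (both are bumped to 0.3.0 in this commit). A minimal sketch of a test that would automate that check — the test is not part of this commit, and it assumes Python >= 3.11 (for the stdlib tomllib) and a working directory at the repository root:

import tomllib  # stdlib TOML parser (Python >= 3.11)

import tsflex


def test_version_in_sync():
    # Both version strings must be bumped together; see the comment in pyproject.toml.
    with open("pyproject.toml", "rb") as f:
        pyproject = tomllib.load(f)
    assert pyproject["tool"]["poetry"]["version"] == tsflex.__version__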
5 changes: 2 additions & 3 deletions tests/test_processing_logging.py
@@ -3,7 +3,6 @@
 __author__ = "Jeroen Van Der Donckt, Emiel Deprost, Jonas Van Der Donckt"

 import os
-import pytest
 import warnings
 import pandas as pd
 import numpy as np
@@ -43,15 +42,15 @@ def drop_nans(df: pd.DataFrame) -> pd.DataFrame:
     assert os.path.exists(logging_file_path)
     logging_df = get_processor_logs(logging_file_path)

-    assert all(logging_df.columns.values == ['log_time', 'function', 'series_names', 'duration'])
+    assert all(logging_df.columns.values == ['log_time', 'function', 'series_names', 'output_names', 'duration', 'duration %'])

     assert len(logging_df) == len(series_pipeline.processing_steps)
     assert logging_df.select_dtypes(include=[np.datetime64]).columns.values == ['log_time']
     assert logging_df.select_dtypes(include=[np.timedelta64]).columns.values == ['duration']

     assert all(logging_df["function"].values == [step.name for step in series_pipeline.processing_steps])
     assert all(logging_df["series_names"].values == ["(TMP,), (ACC_x,)", "(TMP,)"])
+    assert all(logging_df["output_names"].values == ["TMP, ACC_x", "TMP"])

 def test_file_warning_processing_logging(dummy_data, logging_file_path):
     def interpolate(series: pd.Series) -> pd.Series:
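For context, the flow these tests exercise, as a hedged usage sketch — the import paths and constructor arguments follow the tsflex documentation but are not verified against this exact version, and the file name processing.log plus the dummy data are made up:

import pandas as pd

from tsflex.processing import SeriesPipeline, SeriesProcessor, get_processor_logs


def interpolate(series: pd.Series) -> pd.Series:
    return series.interpolate()


data = pd.DataFrame(
    {"TMP": [1.0, None, 3.0]},
    index=pd.date_range("2023-01-01", periods=3, freq="s"),
)

pipeline = SeriesPipeline([SeriesProcessor(interpolate, series_names="TMP")])
pipeline.process(data, logging_file_path="processing.log")

logs = get_processor_logs("processing.log")
# After this PR, `logs` has the columns asserted above:
# ['log_time', 'function', 'series_names', 'output_names', 'duration', 'duration %']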
2 changes: 1 addition & 1 deletion tsflex/__init__.py
@@ -9,7 +9,7 @@

 __docformat__ = 'numpy'
 __author__ = "Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"
-__version__ = '0.2.3.7'
+__version__ = '0.3.0'
 __pdoc__ = {
     # do not show the utils module
     'tsflex.utils': False,
27 changes: 15 additions & 12 deletions tsflex/processing/logger.py
@@ -6,7 +6,7 @@
"""

__author__ = "Jeroen Van Der Donckt"
__author__ = "Jeroen Van Der Donckt, Jonas Van Der Donckt"

import logging
import pandas as pd
@@ -28,39 +28,41 @@ def _parse_message(message: str) -> list:
"""Parse the message of the logged info."""
regex = r"\[(.*?)\]"
matches = re.findall(regex, remove_inner_brackets(message))
assert len(matches) == 3
assert len(matches) == 4
func = matches[0]
series_names = matches[1].replace("'", "")
duration_s = float(matches[2].rstrip(" seconds"))
return [func, series_names, duration_s]
output_names = matches[2].replace("'", "")
duration_s = float(matches[3].rstrip(" seconds"))
return [func, series_names, output_names, duration_s]


 def _parse_logging_execution_to_df(logging_file_path: str) -> pd.DataFrame:
-    """Parse the logged messages into a dataframe that contains execution info.
+    """Parse the logged messages into a DataFrame that contains execution info.

     Parameters
     ----------
     logging_file_path: str
         The file path where the logged messages are stored. This is the file path that
-        is passed to the ``SeriesPipeline`` its process method.
+        is passed to the ``SeriesPipeline``'s ``process`` method.

     Returns
     -------
     pd.DataFrame
-        A DataFrame with the processor its method, series names and calculation
-        duration.
+        A DataFrame with each processor's method, series names, output names, and
+        calculation duration.

     Note
     ----
-    This function only works when the ``logging_file_path`` used in a
-    ``SeriesPipeline`` its ``process`` method is passed.
+    This function only works when it is given the ``logging_file_path`` that was
+    passed to the ``SeriesPipeline``'s ``process`` method.

     """
     df = logging_file_to_df(logging_file_path)
-    df[["function", "series_names", "duration"]] = pd.DataFrame(
+    df[["function", "series_names", "output_names", "duration"]] = pd.DataFrame(
         list(df["message"].apply(_parse_message)),
         index=df.index,
     )
+    df["duration %"] = (100 * (df['duration'] / df["duration"].sum())).round(2)
     return df.drop(columns=["name", "log_level", "message"])
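To make the new parsing concrete, a worked example with invented values — the message mirrors the updated logger.info format in series_processor.py (the last file in this diff):

# The regex finds four bracketed groups (previously three) after
# remove_inner_brackets has stripped any nested [...] pairs:
msg = (
    "Finished function [drop_nans] on [('TMP',), ('ACC_x',)] "
    "with output ['TMP', 'ACC_x'] in [0.004 seconds]!"
)
# _parse_message(msg) then returns:
#   ['drop_nans', '(TMP,), (ACC_x,)', 'TMP, ACC_x', 0.004]

The added "duration %" column expresses each step's duration as a share of the pipeline total: two steps taking 0.004 s and 0.001 s get 80.0 and 20.0 respectively.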


@@ -76,7 +78,8 @@ def get_processor_logs(logging_file_path: str) -> pd.DataFrame:
     Returns
     -------
     pd.DataFrame
-        A DataFrame containing each processor its duration and its series names.
+        A DataFrame containing each processor's series names, output names, and
+        duration.

     """
     df = _parse_logging_execution_to_df(logging_file_path)
4 changes: 2 additions & 2 deletions tsflex/processing/series_processor.py
@@ -216,8 +216,8 @@ def get_series_dict(keys: Tuple[str, ...]):

         elapsed = time.time() - t_start
         logger.info(
-            f"Finished function [{self.name}] on {self.series_names} in "
-            f"[{elapsed} seconds]!"
+            f"Finished function [{self.name}] on {self.series_names} with output "
+            f"{list(processed_output.keys())} in [{elapsed} seconds]!"
         )

         return processed_output
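On the producing side: with self.name = "interpolate", self.series_names = [('TMP',)], and a single output series, the new format would emit a log line like the following (elapsed value invented):

Finished function [interpolate] on [('TMP',)] with output ['TMP'] in [0.0012 seconds]!

The extra "with output [...]" group is what raises the _parse_message match count in logger.py from three to four.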
