From d58563aebef183f2c86e9a4f1b657d8d194548dc Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 17 Jul 2024 12:35:31 -0700 Subject: [PATCH] Add unique column to output of the `log_parsing` pipeline (#1795) * Add a unique column to output of the `log_parsing` pipeline allowing for the output to be easily validated with the `scripts/compare_data_files.py` script. * Remove unused variables from `examples/log_parsing/postprocessing.py` Closes #1789 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1795 --- examples/log_parsing/postprocessing.py | 7 +------ tests/tests_data/examples/log_parsing/expected_out.csv | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/log_parsing/postprocessing.py b/examples/log_parsing/postprocessing.py index 728b2c2594..72b1e34fc6 100644 --- a/examples/log_parsing/postprocessing.py +++ b/examples/log_parsing/postprocessing.py @@ -18,7 +18,6 @@ from collections import defaultdict import mrc -import numpy as np import pandas as pd from mrc.core import operators as ops @@ -101,22 +100,18 @@ def _postprocess(self, x: MultiResponseMessage): parsed_dfs = infer_pdf.apply(lambda row: self.__get_label_dicts(row), axis=1, result_type="expand") ext_parsed = pd.DataFrame(parsed_dfs[0].tolist()) - ext_confidence = pd.DataFrame(parsed_dfs[1].tolist()) parsed_df = pd.DataFrame() - confidence_df = pd.DataFrame() - ext_confidence = ext_confidence.applymap(np.mean) for label in ext_parsed.columns: if label[0] == "B": col_name = label[2:] if "I-" + col_name in ext_parsed.columns: parsed_df[col_name] = ext_parsed[label] + " " + ext_parsed["I-" + col_name].fillna('') - confidence_df[col_name] = (ext_confidence[label] + ext_confidence[label]) / 2 else: parsed_df[col_name] = ext_parsed[label] - confidence_df[col_name] = ext_confidence[label] # decode cleanup parsed_df = self.__decode_cleanup(parsed_df) + parsed_df["doc"] = parsed_dfs.index return MessageMeta(df=cudf.DataFrame.from_pandas(parsed_df)) def __get_label_dicts(self, row): diff --git a/tests/tests_data/examples/log_parsing/expected_out.csv b/tests/tests_data/examples/log_parsing/expected_out.csv index d791a8dfd4..d6ff36d297 100644 --- a/tests/tests_data/examples/log_parsing/expected_out.csv +++ b/tests/tests_data/examples/log_parsing/expected_out.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b93d0803edbff581d0e15e2d526c2e411c8da18d04061b276c44fe11f2cd0ab2 -size 850 +oid sha256:e40047888bc51c00abc2eb5822be61aff86d30e437dd260b768f46e4c1dfe3a9 +size 864