Add unique column to output of the log_parsing pipeline (#1795)

* Add a unique column to the output of the `log_parsing` pipeline, allowing the output to be easily validated with the `scripts/compare_data_files.py` script.
* Remove unused variables from `examples/log_parsing/postprocessing.py`.

Closes #1789

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
- David Gardner (https://github.com/dagardner-nv)

Approvers:
- Michael Demoret (https://github.com/mdemoret-nv)

URL: #1795
nv-morpheus · Jul 17, 2024 · d58563a · d58563a
1 parent a742ade
commit d58563a
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 8 deletions.
diff --git a/examples/log_parsing/postprocessing.py b/examples/log_parsing/postprocessing.py
@@ -18,7 +18,6 @@
 from collections import defaultdict
 
 import mrc
-import numpy as np
 import pandas as pd
 from mrc.core import operators as ops
 
@@ -101,22 +100,18 @@ def _postprocess(self, x: MultiResponseMessage):
         parsed_dfs = infer_pdf.apply(lambda row: self.__get_label_dicts(row), axis=1, result_type="expand")
 
         ext_parsed = pd.DataFrame(parsed_dfs[0].tolist())
-        ext_confidence = pd.DataFrame(parsed_dfs[1].tolist())
         parsed_df = pd.DataFrame()
-        confidence_df = pd.DataFrame()
-        ext_confidence = ext_confidence.applymap(np.mean)
         for label in ext_parsed.columns:
             if label[0] == "B":
                 col_name = label[2:]
                 if "I-" + col_name in ext_parsed.columns:
                     parsed_df[col_name] = ext_parsed[label] + " " + ext_parsed["I-" + col_name].fillna('')
-                    confidence_df[col_name] = (ext_confidence[label] + ext_confidence[label]) / 2
                 else:
                     parsed_df[col_name] = ext_parsed[label]
-                    confidence_df[col_name] = ext_confidence[label]
 
         # decode cleanup
         parsed_df = self.__decode_cleanup(parsed_df)
+        parsed_df["doc"] = parsed_dfs.index
         return MessageMeta(df=cudf.DataFrame.from_pandas(parsed_df))
 
     def __get_label_dicts(self, row):

diff --git a/tests/tests_data/examples/log_parsing/expected_out.csv b/tests/tests_data/examples/log_parsing/expected_out.csv