Add the option to filter data entries according to multiple input values #59

Merged · 1 commit · Jan 27, 2025
6 changes: 6 additions & 0 deletions README.md
@@ -249,6 +249,12 @@ The data is divided into `job-metadata-inputs`: the properties of the workload t
once the workload completes (e.g., items 6-9 above). The inputs and outputs specification is provided in the
`job_spec.yaml` file. See [this example](examples/MLCommons/job_spec.yaml) of a job spec.

In your job spec, you can use the `job-entry-filter` key to filter out entries from the original data according to
specific input values. In [this example](examples/MLCommons/job_spec_with_value_filter.yaml), we filter out all entries
where the Processor is `2xAMD EPYC 9374F`, but we keep Processor as a data input. The entries specified in
`job-entry-filter` are combined with OR semantics: an entry matching any of the specified values is
filtered out.
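
To make the OR semantics concrete, here is a sketch of a filter with two entries; the keys (`name`, `excluded_values`, `keep_input`) are those introduced in this PR, but the `is_valid` entry is hypothetical and only the Processor entry appears in the repository example:

```yaml
job-entry-filter:
  # Entries are combined with OR: a row matching ANY entry below is removed.
  - name: Processor
    excluded_values: ["2xAMD EPYC 9374F"]
    keep_input: True            # Processor stays in the data after filtering
  - name: is_valid              # hypothetical bookkeeping column
    excluded_values: [False]
    keep_input: False           # drop the column once filtering is done
```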

If the format of your data requires special parsing to transform into a dataframe (i.e., beyond a simple csv file), you
can implement your own parser in [this class](arise_predictions/preprocessing/custom_job_parser.py). For example, the sentiment
analysis example ([here](examples/sentiment_analysis/data)) uses `SAJsonJobParser` as its parser, since its original
11 changes: 4 additions & 7 deletions arise_predictions/preprocessing/job_parser.py
@@ -132,10 +132,6 @@ def collect_jobs_history(data_dir, output_path, job_inputs, job_outputs, start_t
columns_with_derived = utils.adjust_columns_with_duration(job_inputs + job_outputs, start_time_field_name,
end_time_field_name)

# add columns to be filtered by (to be removed at the end of processing)
filter_columns = list(job_entry_filter.keys())
columns_with_derived = columns_with_derived + filter_columns

df = pd.DataFrame(columns=columns_with_derived)

if not os.path.exists(data_dir):
@@ -179,9 +175,10 @@ def collect_jobs_history(data_dir, output_path, job_inputs, job_outputs, start_t
return None, None
else:
if job_entry_filter:
for key, value in job_entry_filter.items():
df = df[df[key] != value]
df = df.drop(key, axis=1)
for entry in job_entry_filter:
df = df[~df[entry[constants.JOB_ENTRY_FILTER_NAME_COL]].isin(entry[constants.JOB_ENTRY_FILTER_VALUES_COL])]
if not entry[constants.JOB_ENTRY_FILTER_KEEP_COL]:
df = df.drop(entry[constants.JOB_ENTRY_FILTER_NAME_COL], axis=1)
Comment on lines 177 to +181
If I understand correctly, you can move the drop before the filtering and only do the filtering when keep is True.

Contributor Author
keep indicates whether we keep in the data the column we filter by. If it is False, we still filter according to its values, but after filtering we discard the column, as it is not needed anymore (e.g., an is_valid column indicating invalid configurations).

Understood now 👍

logger.info("Found {:d} executions in history".format(len(df)))

collect_and_persist_data_metadata(df, job_inputs, job_outputs, output_path)
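
For reference, a minimal standalone sketch of what the new filtering loop does on a toy DataFrame; the column names and values below are illustrative, not taken from the PR's test data:

```python
import pandas as pd

# Hypothetical filter entries, using the keys the PR defines in constants.py
# (name, excluded_values, keep_input).
job_entry_filter = [
    {"name": "Processor", "excluded_values": ["2xAMD EPYC 9374F"], "keep_input": True},
    {"name": "is_valid", "excluded_values": [False], "keep_input": False},
]

df = pd.DataFrame({
    "Processor": ["2xAMD EPYC 9374F", "2xIntel Xeon 8480+", "2xIntel Xeon 8480+"],
    "is_valid": [True, True, False],
    "tokens_per_second": [100.0, 250.0, 30.0],
})

for entry in job_entry_filter:
    # Remove rows whose value in the named column matches any excluded value (OR across entries).
    df = df[~df[entry["name"]].isin(entry["excluded_values"])]
    # If keep_input is False, discard the filter column once filtering is done.
    if not entry["keep_input"]:
        df = df.drop(entry["name"], axis=1)

# One row remains (the valid Intel entry), and the is_valid column has been dropped.
print(df)
```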
3 changes: 3 additions & 0 deletions arise_predictions/utils/constants.py
@@ -12,6 +12,9 @@
JOB_PARSER_CLASS_NAME_FIELD = 'job-parser-class-name'
METADATA_PARSER_CLASS_NAME_FIELD = 'metadata-parser-class-name'
JOB_ENTRY_FILTER_FIELD = 'job-entry-filter'
JOB_ENTRY_FILTER_NAME_COL = 'name'
JOB_ENTRY_FILTER_VALUES_COL = 'excluded_values'
JOB_ENTRY_FILTER_KEEP_COL = 'keep_input'
DUMMY_VARS_PREFIX = 'dummy_input_'
JOB_INPUTS_FEATURE_ENGINEERING = 'job-metadata-fe'
JOB_DATA_DIR = "data"
17 changes: 17 additions & 0 deletions examples/MLCommons/job_spec_with_value_filter.yaml
@@ -0,0 +1,17 @@
job-metadata-inputs:
- "# of Nodes"
- Processor
- Accelerator
- "# of Accelerators"
- "Model MLC"
- Scenario
- "Host Processor Core Count"

job-metadata-outputs:
- tokens_per_second

job-entry-filter:
- name: Processor
excluded_values: ["2xAMD EPYC 9374F"]
keep_input: True
