From 5611fea59edaba4d04f7f4615ad5060bb7138262 Mon Sep 17 00:00:00 2001
From: Shawn Martin <smartin@sandia.gov>
Date: Mon, 3 Jul 2023 16:27:18 -0600
Subject: [PATCH] Fixed parameter space csv parser using pandas.

---
 web-server/plugins/slycat-csv-parser.py       | 225 ++++++++++--------
 .../slycat-parameter-image/js/wizard-ui.js    |  50 +++-
 .../slycat-parameter-image/wizard-ui.html     |  13 +-
 3 files changed, 177 insertions(+), 111 deletions(-)

diff --git a/web-server/plugins/slycat-csv-parser.py b/web-server/plugins/slycat-csv-parser.py
index a992b0599..23c0ebd51 100644
--- a/web-server/plugins/slycat-csv-parser.py
+++ b/web-server/plugins/slycat-csv-parser.py
@@ -9,112 +9,123 @@
 import slycat.web.server
 import cherrypy
 
-def parse_file(file, model, database):
+import pandas as pd
+from io import StringIO
+
+
+# common csv parsing code used by both slycat-web-server and slycat-web-client
+# does not call slycat.web.server directly, instead returns any error messages
+# error messages are returned as a list of {'type': 'warning', 'message': 'message'}
+def parse_file_offline(file):
+    """
+    parses out a csv file into numpy array by column (data), the dimension meta data(dimensions),
+    and sets attributes (attributes)
+    :param file: csv file to be parsed
+    :returns: attributes, dimensions, data, error_messages
     """
-  parses out a csv file into numpy array by column (data), the dimension meta data(dimensions),
-  and sets attributes (attributes)
-  :param file: csv file to be parsed
-  :returns: attributes, dimensions, data
-  """
-    def isfloat(value):
-        try:
-            float(value)
-            return True
-        except ValueError:
-            return False
-
-    invalid_csv = False  # CSV is completely missing a column header (it isn't just a blank string)
-    content = file.splitlines()
-    csv_reader = csv.reader(content)
-    headings = next(csv_reader)
-    first_line = next(csv_reader)
-
-    if len(headings) != len(first_line):
-        invalid_csv = True
-
-    rows = [row for row in
-            csv.reader(file.splitlines(), delimiter=",", doublequote=True, escapechar=None, quotechar='"',
-                       quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)]
-    if len(rows) < 2:
-        cherrypy.log.error("slycat-csv-parser.py parse_file", "File must contain at least two rows.")
-        raise Exception("File must contain at least two rows.")
-
-    attributes = []
-    dimensions = [{"name": "row", "type": "int64", "begin": 0, "end": len(rows[1:])}]
+
+    # initial attributes, dimensions, and data
+    # empty, but existing to avoid crashing slycat
+    dimensions = [dict(name="row", type="int64", begin=0, end=0)]
+    attributes = [dict(name="None", type="float64")]
+    data = [numpy.zeros(0)]
+
+    # keep track of errors
+    csv_read_error = []
+
+    # load input file as pandas dataframe
+    try:
+        df = pd.read_csv(StringIO(file))
+
+    # return empty values if couldn't read file
+    except Exception as e:
+        csv_read_error.append({'type': 'error', 'message': 'Could not read .csv file.\n\n' +
+                               'Pandas exception: "' + str(e) + '".'})
+        return attributes, dimensions, data, csv_read_error 
+
+    # check for at least two rows
+    if df.shape[0] < 2:
+        csv_read_error.append({'type': 'error', 'message': 'File must contain at least two rows.'})
+        return attributes, dimensions, data, csv_read_error 
+    
+    # check for at least one column
+    if df.shape[1] < 1:
+        csv_read_error.append({'type': 'error', 'message': 'File must have at least one column.'})
+        return attributes, dimensions, data, csv_read_error
+
+    # parse attributes, dimensions of data frame
+    dimensions = [dict(name="row", type="int64", begin=0, end=len(df.index))]
+    attributes = [dict(name=header, type="float64" 
+        if df[header].dtype != "object" else "string") 
+        for header in df.columns]
+    
+    # parse data
     data = []
-    default_name_index = 0
-    duplicate_name_index = 0
-    duplicate_names = []
-    duplicate_indeces = []
-    blank_header_columns = []
-    column_headers = []
-    error_message = []
-    duplicate_headers = False
-    blank_headers = False  # Header with a blank string, i.e. ",,"
-
-    # go through the csv by column
-    for column in zip(*rows):
-        column_has_floats = False
-
-        # start from 1 to avoid the column name
-        for value in column[1:]:
-            if isfloat(value):
-                column_has_floats = True
-                try:  # note NaN's are floats
-                    output_list = ['NaN' if x == '' else x for x in column[1:]]
-                    data.append(numpy.array(output_list).astype("float64"))
-                    attributes.append({"name": column[0], "type": "float64"})
-                    column_headers.append(column[0])
-
-                # could not convert something to a float defaulting to string
-                except Exception as e:
-                    column_has_floats = False
-                    cherrypy.log.error("found floats but failed to convert, switching to string types Trace: %s" % e)
-                break
-        if not column_has_floats:
-            [str(item) for item in column[1:]]
-
-            data.append(numpy.array(column[1:]))
-            attributes.append({"name": column[0], "type": "string"})
-            column_headers.append(column[0])
-
-    if len(attributes) < 1:
-        cherrypy.log.error("slycat-csv-parser.py parse_file", "File must contain at least one column.")
-        raise Exception("File must contain at least one column.")
-
-# Adding deafult headers and making duplicates unique
-
-    for index, attribute in enumerate(attributes):
-        if attribute["name"] is "":
-            message = "Column " + str(index + 1)
-            blank_header_columns.append(message)
-            blank_headers = True
-        # Don't want to include blank headers as duplicates.
-        if column_headers.count(attribute["name"]) > 1 and attribute["name"] is not '':
-            duplicate_names.append(attribute["name"])
-            duplicate_indeces.append(str(index + 1))
-            duplicate_headers = True
-
-    if invalid_csv is True:
-        error_message.append(
-            "Your CSV is invalid because it's missing at least one column header. Please CLOSE this wizard, fix the issue, then start a new wizard. \n")
-    else:
-        if blank_headers is True:
-            error_message.append("Your CSV file contained blank headers in: \n")
-            for message in blank_header_columns:
-                error_message.append(
-                    "%s \n" % message)
-        if duplicate_headers is True:
-            error_message.append("Your CSV file contained these identical headers: \n ")
-            for name, index in zip(duplicate_names, duplicate_indeces):
-                error_message.append(
-                    "%s \n" % str("'" + name + "' " + "in column " + index))
-
-    if error_message is not "":
-        slycat.web.server.put_model_parameter(database, model, "error-messages", error_message)
+    for header in df.columns.values:
+        if df[header].dtype == "object":
+            data.append(df[header].values.astype('unicode'))
+        else:
+            data.append(df[header].values)
+            
+    # check for empty headers (pandas replaced them with 'Unnamed: <Column #>') and
+    # renumber them so the index starts at 1.  All renames are collected and applied in
+    # one call: renaming one header at a time turns 'Unnamed: 0' into 'Unnamed: 1', which
+    # collides with the next empty header, and both would then be renamed again, leaving
+    # duplicate column labels.  Only exact 'Unnamed: <digits>' headers are matched.
+    unnamed = [h for h in df.columns if h.startswith('Unnamed: ') and h[9:].isdigit()]
+    empty_headers = [int(h[9:]) + 1 for h in unnamed]
+    df = df.rename(columns={h: "Unnamed: " + str(n) for h, n in zip(unnamed, empty_headers)})
+
+    headers = df.columns.values
+
+    # slycat warning for empty headers
+    if len(empty_headers) != 0:
+        csv_read_error.append({'type': 'warning', 'message': 'Found empty headers in columns: ' + \
+                               str(empty_headers) + '.  Using "Unnamed: <Column #>" for empty headers.'})
+    
+    # look for duplicate headers (pandas renames repeats to "<Header>.#"); require the
+    # exact "<Header>.<digits>" form so that e.g. "temp" and "temperature.2", or "v" and
+    # "v1.0", are not reported as duplicates of each other
+    duplicated_headers = set()
+    for i, first in enumerate(headers):
+        for j in range(i + 1, len(headers)):
+            prefix, dot, suffix = headers[j].rpartition('.')
+            if dot and prefix == first and suffix.isdigit():
+                duplicated_headers.update((i + 1, j + 1))
+    duplicated_headers = sorted(duplicated_headers)
+
+    # slycat warning for duplicated headers
+    if len(duplicated_headers) != 0:
+        csv_read_error.append({'type': 'warning', 'message': 'Found duplicated headers in columns: ' + \
+                              str(duplicated_headers) + '.  Using "<Header>.#" for duplicate headers.'})
+
+    # headers may have been changed, need to recompute
+    attributes = [dict(name=header, type="float64" 
+        if df[header].dtype != "object" else "string") 
+        for header in df.columns]
+    
+    # return data and errors
+    return attributes, dimensions, data, csv_read_error
+
+
+def parse_file(file, model, database):
+    """
+    parses out a csv file into numpy array by column (data), the dimension meta data(dimensions),
+    and sets attributes (attributes)
+    :param file: csv file to be parsed
+    :returns: attributes, dimensions, data
+    """
+
+    # parse file
+    attributes, dimensions, data, csv_read_errors = parse_file_offline(file)
+
+    # pass errors on to model
+    if len(csv_read_errors) != 0:
+        slycat.web.server.put_model_parameter(database, model, "error-messages", csv_read_errors)
     else:
-        slycat.web.server.put_model_parameter(database, model, "error-messages", "")
+        slycat.web.server.put_model_parameter(database, model, "error-messages", [])
 
+    # return data
     return attributes, dimensions, data
 
 
@@ -129,18 +140,26 @@ def parse(database, model, input, files, aids, **kwargs):
     :param aids: artifact ID
     :param kwargs:
     """
+
+    # keep track of processing time
     start = time.time()
+
     if len(files) != len(aids):
         cherrypy.log.error("slycat-csv-parser.py parse", "Number of files and artifact IDs must match.")
         raise Exception("Number of files and artifact ids must match.")
 
+    # parse files
     parsed = [parse_file(file, model, database) for file in files]
 
+    # upload data
     array_index = int(kwargs.get("array", "0"))
     for (attributes, dimensions, data), aid in zip(parsed, aids):
         slycat.web.server.put_model_arrayset(database, model, aid, input)
         slycat.web.server.put_model_array(database, model, aid, 0, attributes, dimensions)
-        slycat.web.server.put_model_arrayset_data(database, model, aid, "%s/.../..." % array_index, data)
+        slycat.web.server.put_model_arrayset_data(database, model, aid, 
+                                                    "%s/.../..." % array_index, data)
+    
+    # done processing
     end = time.time()
 
     model = database.get("model", model['_id'])
diff --git a/web-server/plugins/slycat-parameter-image/js/wizard-ui.js b/web-server/plugins/slycat-parameter-image/js/wizard-ui.js
index 708147bc9..ac44a07ca 100644
--- a/web-server/plugins/slycat-parameter-image/js/wizard-ui.js
+++ b/web-server/plugins/slycat-parameter-image/js/wizard-ui.js
@@ -30,6 +30,7 @@ function constructor(params) {
   component.current_aids = ko.observable("");
   component.csv_data = ko.observableArray();
   component.error_messages = ko.observable("");
+  component.warning_messages = ko.observable("");
   component.useProjectData = ko.observable(false);
   // Alex removing default model name per team meeting discussion
   // component.model = mapping.fromJS({_id: null, name: "New Parameter Space Model", description: "", marking: markings.preselected()});
@@ -143,10 +144,37 @@ function constructor(params) {
         aid: "error-messages",
       })
       .then((errors) => {
+
+        // keep track of both warnings and errors
         var error_messages = "";
+        var warning_messages = "";
+
+        // check if there are actual errors or just warnings
         if (errors.length >= 1) {
+          for (var i=0; i<errors.length; i++) {
+            if (errors[i]["type"] == "error") {
+              error_messages = "Errors listed below must be fixed before you can upload a model.\n" +
+                               "Please close this dialog or try again with a new file.\n\n"
+              break;
+            }
+          }
+
+          // display warnings/errors
+          for (var i=0; i<errors.length; i++) {
+            if (errors[i]["type"] == "warning"){
+              warning_messages += "Warning: " + errors[i]["message"] + "\n";
+            } else {
+              error_messages += "Error: " + errors[i]["message"] + "\n";
+            }   
+          }
+          component.error_messages(error_messages);
+          component.warning_messages(warning_messages);
+
           // if there were errors, cleanup project data
-          client
+          if (error_messages.length > 0) {
+
+            // delete model and data
+            client
             .get_project_data_in_model_fetch({
               mid: component.model._id(),
             })
@@ -156,10 +184,16 @@ function constructor(params) {
               }
             });
 
-          for (var i = 0; i < errors.length; i++) {
-            error_messages += errors[i] + "\n";
+            // re-enable button
+            $(".browser-continue").toggleClass("disabled", false);
+
+          } else {
+
+            // only warnings, continue
+            component.tab(4);
+            $(".browser-continue").toggleClass("disabled", false);
+
           }
-          component.error_messages(error_messages);
         } else {
           component.error_messages(error_messages);
         }
@@ -303,6 +337,14 @@ function constructor(params) {
   };
 
   component.upload_table = function () {
+
+    // check that a file has been selected
+    if (component.browser.selection().length == 0) {
+      component.error_messages("You must select a file before continuing.");
+      return
+    }
+
+    // get file data
     $(".local-browser-continue").toggleClass("disabled", true);
     var file = component.browser.selection()[0];
 
diff --git a/web-server/plugins/slycat-parameter-image/wizard-ui.html b/web-server/plugins/slycat-parameter-image/wizard-ui.html
index fdfbec693..b3cc0e279 100644
--- a/web-server/plugins/slycat-parameter-image/wizard-ui.html
+++ b/web-server/plugins/slycat-parameter-image/wizard-ui.html
@@ -65,9 +65,9 @@ <h3 class="modal-title">New Parameter Space Model</h3>
     </div>
 
     <div data-bind="visible:tab() == 1" class="form-horizontal">
-      <label class="alert alert-danger slycat-big-scrolling-alert" role="alert" data-bind="visible:error_messages().length > 0, text: error_messages()" style="
-        white-space:pre-line;
-        "></label>
+      <div class="alert alert-danger slycat-big-scrolling-alert" role="alert" 
+             data-bind="visible:error_messages().length > 0, text: error_messages()" 
+             style="white-space:pre-line;"></div>
       <slycat-local-browser params="
         selection:browser.selection,
         progress:browser.progress,
@@ -125,6 +125,9 @@ <h3 class="modal-title">New Parameter Space Model</h3>
     </div>
 
     <div data-bind="visible:tab() == 4">
+      <div class="alert alert-warning slycat-big-scrolling-alert" role="alert" 
+      data-bind="visible:warning_messages().length > 0, text: warning_messages()" 
+      style="white-space:pre-line;"></div>
       <slycat-table-ingestion params="
         variables: attributes,
         properties: [
@@ -145,7 +148,9 @@ <h3 class="modal-title">New Parameter Space Model</h3>
 <div class="modal-footer">
   <button class="btn btn-light mr-auto" data-bind="visible: [0].indexOf(tab()) == -1, click: back">Back</button>
   <button class="btn btn-primary" data-bind="visible:tab() == 0,click:select_type">Continue</button>
-  <button class="btn btn-primary local-browser-continue browser-continue" data-bind="visible:tab() == 1,click:upload_table"><i class="fa fa-spinner fa-pulse"></i> Continue</button>
+  <button class="btn btn-primary local-browser-continue browser-continue" 
+    data-bind="visible:tab() == 1,click:upload_table">
+    <i class="fa fa-spinner fa-pulse"></i> Continue</button>
   <button class="btn btn-primary" data-bind="visible:tab() == 2 && ps_type() == 'remote',click:connect,enable:remote.enable">Continue</button>
   <button class="btn btn-primary" data-bind="visible:tab() == 2 && ps_type() == 'smb',click:connectSMB,enable:remote.enable">Continue</button>
   <button class="btn btn-primary remote-browser-continue browser-continue" data-bind="visible:tab() == 3 && ps_type() == 'remote',click:load_table"><i class="fa fa-spinner fa-pulse"></i> Continue</button>