From cf0430b8693f663f64ab8e7bf86626852de55b03 Mon Sep 17 00:00:00 2001
From: "Anna Jung (VMware)" <antheaj@vmware.com>
Date: Wed, 1 Dec 2021 13:07:43 -0600
Subject: [PATCH 1/5] Fix referencing variables before assignment in run.py

Signed-off-by: Anna Jung (VMware) <antheaj@vmware.com>
---
 ml-conversational-analytic-tool/run.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py
index 263582a..46c0aba 100644
--- a/ml-conversational-analytic-tool/run.py
+++ b/ml-conversational-analytic-tool/run.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
-
 from sklearn.model_selection import train_test_split
 
 from baseCNN import BaseCNN
@@ -16,20 +15,18 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
     data.setupPreProcess(annotated_filename, dataset_filename)
     data.encodeData()
 
-    # Get data for training
-    if encoding_type == 'role':
-        obs, res = data.getRoleMatrix(outcome, padding)
-    elif encoding_type == 'role-agnostic':
-        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
-
     # Create models
-    if model_type == 'CNN':
-        model = BaseCNN()
-    elif model_type == 'LSTM':
+    if model_type == 'LSTM':
         model = BaseLSTM()
+    else:
+        model = BaseCNN()
+
+    # Get data for training
     if encoding_type == 'role':
+        obs, res = data.getRoleMatrix(outcome, padding)
         model.makeModel2D(obs[0].shape)
-    elif encoding_type == 'role-agnostic':
+    else:
+        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
         model.makeModel(obs[0].shape)
 
     # Train model

From ec7b4f94ba75457b4d1dd8af5cd44c2b5487cfd4 Mon Sep 17 00:00:00 2001
From: "Anna Jung (VMware)" <antheaj@vmware.com>
Date: Wed, 1 Dec 2021 13:45:15 -0600
Subject: [PATCH 2/5] Update README to include correct path to the scripts

Signed-off-by: Anna Jung (VMware) <antheaj@vmware.com>
---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 852ff54..ecf166e 100644
--- a/README.md
+++ b/README.md
@@ -88,12 +88,12 @@ Note: There is a rate limit associated with GitHub API. Please read more about
 before extracting data from a GitHub repo.
 
 ```
-GITACCESS=<YOUR_TOKEN>
+export GITACCESS=<YOUR_TOKEN>
 ```
 
 Run the script by passing in `organization` and `repo`  
 ```python
-python runDataExtraction.py <organization> <repo>
+python ./ml-conversational-analytic-tool/runDataExtraction.py <organization> <repo>
 ```
 
 - `organization` is the name of the repository owner
@@ -105,7 +105,7 @@ python runDataExtraction.py <organization> <repo>
 `featureVector.py` prepares your data for annotation use. Run the script by passing in path to `rawdatafile` and `words`.
 
 ```python
-python featureVector.py <rawdatafile> <words> -unannotated
+python ./ml-conversational-analytic-tool/featureVector.py <rawdatafile> <words> -unannotated
 ```
 
 - `rawdatafile` is location of raw data csv
@@ -128,7 +128,7 @@ There are two models available for training
 To train, run the script with required parameters path to `annotated_filename`, `dataset_filename`, `model`, and `outcome`.
 
 ```python
-python run.py <annotated_filename> <dataset_filename> <model> <outcome>
+python ./ml-conversational-analytic-tool/run.py <annotated_filename> <dataset_filename> <model> <outcome>
 ```
 
 - `annotated_filename` is the location of the annotated dataset file

From 7c401c75ac0374197fed895d3104e1bff8893b0b Mon Sep 17 00:00:00 2001
From: "Anna Jung (VMware)" <antheaj@vmware.com>
Date: Wed, 1 Dec 2021 14:09:57 -0600
Subject: [PATCH 3/5] Fix missing Number column in the annotated dataset

Signed-off-by: Anna Jung (VMware) <antheaj@vmware.com>
---
 ml-conversational-analytic-tool/featureVector.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ml-conversational-analytic-tool/featureVector.py b/ml-conversational-analytic-tool/featureVector.py
index e1b0af7..a09a723 100644
--- a/ml-conversational-analytic-tool/featureVector.py
+++ b/ml-conversational-analytic-tool/featureVector.py
@@ -163,6 +163,7 @@ def pullStringConversation(self, export_filename="", export=True):
         # Store each interaction and pull URL for export
         string_conversations = []
         pull_urls = []
+        pull_numbers = []
 
         for index, row in self.raw_data.iterrows():
             # Make pull message
@@ -181,9 +182,11 @@ def pullStringConversation(self, export_filename="", export=True):
                                                                           comment_row["Body"])
             string_conversations.append(conversation.encode("ascii", "ignore").decode())
             pull_urls.append(row["URL"])
+            pull_numbers.append(row["Number"])
 
         # Export converation field dataset
         export_df = pd.DataFrame()
+        export_df["Number"] = pull_numbers
         export_df["URL"] = pull_urls
         export_df["Thread"] = string_conversations
 

From 2189fe17c35e1973d550dbab963374ad44d78bd7 Mon Sep 17 00:00:00 2001
From: "Anna Jung (VMware)" <antheaj@vmware.com>
Date: Wed, 1 Dec 2021 14:28:37 -0600
Subject: [PATCH 4/5] Add model input validation for run.py

Signed-off-by: Anna Jung (VMware) <antheaj@vmware.com>
---
 ml-conversational-analytic-tool/run.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py
index 46c0aba..dcc082a 100644
--- a/ml-conversational-analytic-tool/run.py
+++ b/ml-conversational-analytic-tool/run.py
@@ -51,6 +51,10 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
     parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull')
 
     args = parser.parse_args()
+
+    if args.model != 'CNN' and args.model != 'LSTM':
+        raise Exception("Model must be either CNN or LSTM")
+
     encodingType = 'role'
     if not args.roleRelevant:
         encodingType = 'role-agnostic'

From 74755c8a9b8db41bad99e9bfa3aec8939c547103 Mon Sep 17 00:00:00 2001
From: "Anna Jung (VMware)" <antheaj@vmware.com>
Date: Fri, 3 Dec 2021 09:26:50 -0600
Subject: [PATCH 5/5] Modify help message for model argument

Signed-off-by: Anna Jung (VMware) <antheaj@vmware.com>
---
 ml-conversational-analytic-tool/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py
index dcc082a..6d716a4 100644
--- a/ml-conversational-analytic-tool/run.py
+++ b/ml-conversational-analytic-tool/run.py
@@ -44,7 +44,7 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
         description="Obtain models to determine constructive and inclusive feedback in Open source communities")
     parser.add_argument('annotated_filename', help='File location of annotated file')
     parser.add_argument('dataset_filename', help='File location of extracted dataset')
-    parser.add_argument('model', help='Model type to use for training')
+    parser.add_argument('model', help='Model type to use for training, supported CNN and LSTM')
     parser.add_argument('outcome', help='Inclusive, Constructive, or Both')
     parser.add_argument('-roleRelevant', action='store_true', default=False,
                         help='Encoding method differentiates b/w conversation roles')