Merge pull request #2 from annajung/main

Fix missing column and variable reference before assignment
vmware-archive · Dec 6, 2021 · cc3762d · cc3762d
2 parents 16fdcd9 + 74755c8
commit cc3762d
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -88,12 +88,12 @@ Note: There is a rate limit associated with GitHub API. Please read more about
 before extracting data from a GitHub repo.
 
 ```
-GITACCESS=<YOUR_TOKEN>
+export GITACCESS=<YOUR_TOKEN>
 ```
 
 Run the script by passing in `organization` and `repo`  
 ```python
-python runDataExtraction.py <organization> <repo>
+python ./ml-conversational-analytic-tool/runDataExtraction.py <organization> <repo>
 ```
 
 - `organization` is the name of the repository owner
@@ -105,7 +105,7 @@ python runDataExtraction.py <organization> <repo>
 `featureVector.py` prepares your data for annotation use. Run the script by passing in the paths to `rawdatafile` and `words`.
 
 ```python
-python featureVector.py <rawdatafile> <words> -unannotated
+python ./ml-conversational-analytic-tool/featureVector.py <rawdatafile> <words> -unannotated
 ```
 
 - `rawdatafile` is the location of the raw data CSV
@@ -128,7 +128,7 @@ There are two models available for training
 To train, run the script with the required parameters: the paths `annotated_filename` and `dataset_filename`, followed by `model` and `outcome`.
 
 ```python
-python run.py <annotated_filename> <dataset_filename> <model> <outcome>
+python ./ml-conversational-analytic-tool/run.py <annotated_filename> <dataset_filename> <model> <outcome>
 ```
 
 - `annotated_filename` is the location of the annotated dataset file

diff --git a/ml-conversational-analytic-tool/featureVector.py b/ml-conversational-analytic-tool/featureVector.py
@@ -163,6 +163,7 @@ def pullStringConversation(self, export_filename="", export=True):
         # Store each interaction and pull URL for export
         string_conversations = []
         pull_urls = []
+        pull_numbers = []
 
         for index, row in self.raw_data.iterrows():
             # Make pull message
@@ -181,9 +182,11 @@ def pullStringConversation(self, export_filename="", export=True):
                                                                           comment_row["Body"])
             string_conversations.append(conversation.encode("ascii", "ignore").decode())
             pull_urls.append(row["URL"])
+            pull_numbers.append(row["Number"])
 
         # Export conversation field dataset
         export_df = pd.DataFrame()
+        export_df["Number"] = pull_numbers
         export_df["URL"] = pull_urls
         export_df["Thread"] = string_conversations
 

diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
-
 from sklearn.model_selection import train_test_split
 
 from baseCNN import BaseCNN
@@ -16,20 +15,18 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
     data.setupPreProcess(annotated_filename, dataset_filename)
     data.encodeData()
 
-    # Get data for training
-    if encoding_type == 'role':
-        obs, res = data.getRoleMatrix(outcome, padding)
-    elif encoding_type == 'role-agnostic':
-        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
-
     # Create models
-    if model_type == 'CNN':
-        model = BaseCNN()
-    elif model_type == 'LSTM':
+    if model_type == 'LSTM':
         model = BaseLSTM()
+    else:
+        model = BaseCNN()
+
+    # Get data for training
     if encoding_type == 'role':
+        obs, res = data.getRoleMatrix(outcome, padding)
         model.makeModel2D(obs[0].shape)
-    elif encoding_type == 'role-agnostic':
+    else:
+        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
         model.makeModel(obs[0].shape)
 
     # Train model
@@ -47,13 +44,17 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
         description="Obtain models to determine constructive and inclusive feedback in Open source communities")
     parser.add_argument('annotated_filename', help='File location of annotated file')
     parser.add_argument('dataset_filename', help='File location of extracted dataset')
-    parser.add_argument('model', help='Model type to use for training')
+    parser.add_argument('model', help='Model type to use for training, supported CNN and LSTM')
     parser.add_argument('outcome', help='Inclusive, Constructive, or Both')
     parser.add_argument('-roleRelevant', action='store_true', default=False,
                         help='Encoding method differentiates b/w conversation roles')
     parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull')
 
     args = parser.parse_args()
+
+    if args.model != 'CNN' and args.model != 'LSTM':
+        raise Exception("Model must be either CNN or LSTM")
+
     encodingType = 'role'
     if not args.roleRelevant:
         encodingType = 'role-agnostic'