From cf0430b8693f663f64ab8e7bf86626852de55b03 Mon Sep 17 00:00:00 2001 From: "Anna Jung (VMware)" Date: Wed, 1 Dec 2021 13:07:43 -0600 Subject: [PATCH 1/5] Fix referencing variables before assignment in run.py Signed-off-by: Anna Jung (VMware) --- ml-conversational-analytic-tool/run.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py index 263582a..46c0aba 100644 --- a/ml-conversational-analytic-tool/run.py +++ b/ml-conversational-analytic-tool/run.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import argparse - from sklearn.model_selection import train_test_split from baseCNN import BaseCNN @@ -16,20 +15,18 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type data.setupPreProcess(annotated_filename, dataset_filename) data.encodeData() - # Get data for training - if encoding_type == 'role': - obs, res = data.getRoleMatrix(outcome, padding) - elif encoding_type == 'role-agnostic': - obs, res = data.getRoleAgnosticMatrix(outcome, padding) - # Create models - if model_type == 'CNN': - model = BaseCNN() - elif model_type == 'LSTM': + if model_type == 'LSTM': model = BaseLSTM() + else: + model = BaseCNN() + + # Get data for training if encoding_type == 'role': + obs, res = data.getRoleMatrix(outcome, padding) model.makeModel2D(obs[0].shape) - elif encoding_type == 'role-agnostic': + else: + obs, res = data.getRoleAgnosticMatrix(outcome, padding) model.makeModel(obs[0].shape) # Train model From ec7b4f94ba75457b4d1dd8af5cd44c2b5487cfd4 Mon Sep 17 00:00:00 2001 From: "Anna Jung (VMware)" Date: Wed, 1 Dec 2021 13:45:15 -0600 Subject: [PATCH 2/5] Update README to include correct path to the scripts Signed-off-by: Anna Jung (VMware) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 852ff54..ecf166e 100644 --- a/README.md +++ b/README.md @@ -88,12 +88,12 @@ 
Note: There is a rate limit associated with GitHub API. Please read more about before extracting data from a GitHub repo. ``` -GITACCESS= +export GITACCESS= ``` Run the script by passing in `organization` and `repo` ```python -python runDataExtraction.py +python ./ml-conversational-analytic-tool/runDataExtraction.py ``` - `organization` is the name of the repository owner @@ -105,7 +105,7 @@ python runDataExtraction.py `featureVector.py` prepares your data for annotation use. Run the script by passing in path to `rawdatafile` and `words`. ```python -python featureVector.py -unannotated +python ./ml-conversational-analytic-tool/featureVector.py -unannotated ``` - `rawdatafile` is location of raw data csv @@ -128,7 +128,7 @@ There are two models available for training To train, run the script with required parameters path to `annotated_filename`, `dataset_filename`, `model`, and `outcome`. ```python -python run.py +python ./ml-conversational-analytic-tool/run.py ``` - `annotated_filename` is the location of the annotated dataset file From 7c401c75ac0374197fed895d3104e1bff8893b0b Mon Sep 17 00:00:00 2001 From: "Anna Jung (VMware)" Date: Wed, 1 Dec 2021 14:09:57 -0600 Subject: [PATCH 3/5] Fix missing Number column in the annotated dataset Signed-off-by: Anna Jung (VMware) --- ml-conversational-analytic-tool/featureVector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ml-conversational-analytic-tool/featureVector.py b/ml-conversational-analytic-tool/featureVector.py index e1b0af7..a09a723 100644 --- a/ml-conversational-analytic-tool/featureVector.py +++ b/ml-conversational-analytic-tool/featureVector.py @@ -163,6 +163,7 @@ def pullStringConversation(self, export_filename="", export=True): # Store each interaction and pull URL for export string_conversations = [] pull_urls = [] + pull_numbers = [] for index, row in self.raw_data.iterrows(): # Make pull message @@ -181,9 +182,11 @@ def pullStringConversation(self, export_filename="", export=True): 
comment_row["Body"]) string_conversations.append(conversation.encode("ascii", "ignore").decode()) pull_urls.append(row["URL"]) + pull_numbers.append(row["Number"]) # Export converation field dataset export_df = pd.DataFrame() + export_df["Number"] = pull_numbers export_df["URL"] = pull_urls export_df["Thread"] = string_conversations From 2189fe17c35e1973d550dbab963374ad44d78bd7 Mon Sep 17 00:00:00 2001 From: "Anna Jung (VMware)" Date: Wed, 1 Dec 2021 14:28:37 -0600 Subject: [PATCH 4/5] Add model input validation for run.py Signed-off-by: Anna Jung (VMware) --- ml-conversational-analytic-tool/run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py index 46c0aba..dcc082a 100644 --- a/ml-conversational-analytic-tool/run.py +++ b/ml-conversational-analytic-tool/run.py @@ -51,6 +51,10 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull') args = parser.parse_args() + + if args.model != 'CNN' and args.model != 'LSTM': + raise Exception("Model must be either CNN or LSTM") + encodingType = 'role' if not args.roleRelevant: encodingType = 'role-agnostic' From 74755c8a9b8db41bad99e9bfa3aec8939c547103 Mon Sep 17 00:00:00 2001 From: "Anna Jung (VMware)" Date: Fri, 3 Dec 2021 09:26:50 -0600 Subject: [PATCH 5/5] Modify help message for model argument Signed-off-by: Anna Jung (VMware) --- ml-conversational-analytic-tool/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-conversational-analytic-tool/run.py b/ml-conversational-analytic-tool/run.py index dcc082a..6d716a4 100644 --- a/ml-conversational-analytic-tool/run.py +++ b/ml-conversational-analytic-tool/run.py @@ -44,7 +44,7 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type description="Obtain models to determine constructive and inclusive feedback in Open 
source communities") parser.add_argument('annotated_filename', help='File location of annotated file') parser.add_argument('dataset_filename', help='File location of extracted dataset') - parser.add_argument('model', help='Model type to use for training') + parser.add_argument('model', help='Model type to use for training, supported CNN and LSTM') parser.add_argument('outcome', help='Inclusive, Constructive, or Both') parser.add_argument('-roleRelevant', action='store_true', default=False, help='Encoding method differentiates b/w conversation roles')