Skip to content
This repository has been archived by the owner on Feb 20, 2023. It is now read-only.

Commit

Permalink
Merge pull request #2 from annajung/main
Browse files Browse the repository at this point in the history
Fix missing column and variable reference before assignment
  • Loading branch information
difince authored Dec 6, 2021
2 parents 16fdcd9 + 74755c8 commit cc3762d
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 16 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,12 @@ Note: There is a rate limit associated with GitHub API. Please read more about
before extracting data from a GitHub repo.

```
GITACCESS=<YOUR_TOKEN>
export GITACCESS=<YOUR_TOKEN>
```

Run the script by passing in `organization` and `repo`
```python
python runDataExtraction.py <organization> <repo>
python ./ml-conversational-analytic-tool/runDataExtraction.py <organization> <repo>
```

- `organization` is the name of the repository owner
Expand All @@ -105,7 +105,7 @@ python runDataExtraction.py <organization> <repo>
`featureVector.py` prepares your data for annotation use. Run the script by passing in the path to `rawdatafile` and `words`.

```python
python featureVector.py <rawdatafile> <words> -unannotated
python ./ml-conversational-analytic-tool/featureVector.py <rawdatafile> <words> -unannotated
```

- `rawdatafile` is the location of the raw data CSV
Expand All @@ -128,7 +128,7 @@ There are two models available for training
To train, run the script with the required parameters: `annotated_filename`, `dataset_filename`, `model`, and `outcome`.

```python
python run.py <annotated_filename> <dataset_filename> <model> <outcome>
python ./ml-conversational-analytic-tool/run.py <annotated_filename> <dataset_filename> <model> <outcome>
```

- `annotated_filename` is the location of the annotated dataset file
Expand Down
3 changes: 3 additions & 0 deletions ml-conversational-analytic-tool/featureVector.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def pullStringConversation(self, export_filename="", export=True):
# Store each interaction and pull URL for export
string_conversations = []
pull_urls = []
pull_numbers = []

for index, row in self.raw_data.iterrows():
# Make pull message
Expand All @@ -181,9 +182,11 @@ def pullStringConversation(self, export_filename="", export=True):
comment_row["Body"])
string_conversations.append(conversation.encode("ascii", "ignore").decode())
pull_urls.append(row["URL"])
pull_numbers.append(row["Number"])

# Export conversation field dataset
export_df = pd.DataFrame()
export_df["Number"] = pull_numbers
export_df["URL"] = pull_urls
export_df["Thread"] = string_conversations

Expand Down
25 changes: 13 additions & 12 deletions ml-conversational-analytic-tool/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0

import argparse

from sklearn.model_selection import train_test_split

from baseCNN import BaseCNN
Expand All @@ -16,20 +15,18 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
data.setupPreProcess(annotated_filename, dataset_filename)
data.encodeData()

# Get data for training
if encoding_type == 'role':
obs, res = data.getRoleMatrix(outcome, padding)
elif encoding_type == 'role-agnostic':
obs, res = data.getRoleAgnosticMatrix(outcome, padding)

# Create models
if model_type == 'CNN':
model = BaseCNN()
elif model_type == 'LSTM':
if model_type == 'LSTM':
model = BaseLSTM()
else:
model = BaseCNN()

# Get data for training
if encoding_type == 'role':
obs, res = data.getRoleMatrix(outcome, padding)
model.makeModel2D(obs[0].shape)
elif encoding_type == 'role-agnostic':
else:
obs, res = data.getRoleAgnosticMatrix(outcome, padding)
model.makeModel(obs[0].shape)

# Train model
Expand All @@ -47,13 +44,17 @@ def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type
description="Obtain models to determine constructive and inclusive feedback in Open source communities")
parser.add_argument('annotated_filename', help='File location of annotated file')
parser.add_argument('dataset_filename', help='File location of extracted dataset')
parser.add_argument('model', help='Model type to use for training')
parser.add_argument('model', help='Model type to use for training, supported CNN and LSTM')
parser.add_argument('outcome', help='Inclusive, Constructive, or Both')
parser.add_argument('-roleRelevant', action='store_true', default=False,
help='Encoding method differentiates b/w conversation roles')
parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull')

args = parser.parse_args()

if args.model != 'CNN' and args.model != 'LSTM':
raise Exception("Model must be either CNN or LSTM")

encodingType = 'role'
if not args.roleRelevant:
encodingType = 'role-agnostic'
Expand Down

0 comments on commit cc3762d

Please sign in to comment.