diff --git a/docs/ml-conversational-analytic-tool/baseCNN.html b/docs/ml-conversational-analytic-tool/baseCNN.html
index 4005539..a1b771b 100644
--- a/docs/ml-conversational-analytic-tool/baseCNN.html
+++ b/docs/ml-conversational-analytic-tool/baseCNN.html
@@ -35,18 +35,21 @@

Module ml-conversational-analytic-tool.baseCNN

@@ -117,22 +133,26 @@

Classes

class BaseCNN
-
+

Constructor creates model and explainer

Expand source code
class BaseCNN:
-    # Constructor creates model and explainer
     def __init__(self):
+        """
+        Constructor creates model and explainer
+        """
         self.dimension2 = True
         self.input_shape = ()
         self.model = keras.models.Sequential()
         self.explainer = SmoothGrad()
         self.model_ready = False
 
-    # Make 1d model for role agnostic data
     def makeModel(self, input_shape):
+        """
+        Make 1d model for role agnostic data
+        """
         self.input_shape = input_shape
         self.model.add(keras.layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
         self.model.add(keras.layers.MaxPooling1D(2))
@@ -143,8 +163,10 @@ 

Classes

        self.dimension2 = False
        self.model_ready = True

-    # Make 2d model for role relevant data
     def makeModel2D(self, input_shape):
+        """
+        Make 2d model for role relevant data
+        """
         self.input_shape = input_shape
         self.model.add(keras.layers.Conv2D(4, (5, 5), activation='relu', input_shape=input_shape))
         self.model.add(keras.layers.MaxPooling2D((4, 4)))
@@ -156,8 +178,10 @@

Classes

        self.dimension2 = True
        self.model_ready = True

-    # Train model
     def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
+        """
+        Train model
+        """
         self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
         if val_set:
             train_hist = self.model.fit(np.array(obs), np.array(res), epochs=epochs, batch_size=batch_size,
@@ -168,8 +192,13 @@

Classes

                                         validation_split=val_split, verbose=1)
         return train_hist

-    # Score model for accuracy, precision and recall
+    def saveModel(self, name, version):
+        self.model.save("{}/{}".format(name, version))
+
     def scoreModel(self, obs, res):
+        """
+        Score model for accuracy, precision and recall
+        """
         evaluation = {}
         evaluation['Loss_Acc'] = self.model.evaluate(np.array(obs), np.array(res))
         evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -177,15 +206,19 @@

Classes

print("Accuracy: {}".format(evaluation['Loss_Acc'][1])) return evaluation - # Get predictions def predict(self, obs, labels=False): + """ + Get predictions + """ predictions = self.model.predict(np.array(obs)) if labels: return [1 if x > 0.5 else 0 for x in predictions] return predictions - # Explain prediction for obs using explainer def explain(self, obs): + """ + Explain prediction for obs using explainer + """ output = self.explainer.explain((obs, None), self.model, 1, 20, 1.) return output
@@ -195,12 +228,15 @@

Methods

def explain(self, obs)
-
+

Explain prediction for obs using explainer

Expand source code
def explain(self, obs):
+    """
+    Explain prediction for obs using explainer
+    """
     output = self.explainer.explain((obs, None), self.model, 1, 20, 1.)
     return output
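The explain method above hands the observation to the SmoothGrad explainer created in the constructor. As a point of reference, the stand-alone sketch below reproduces the SmoothGrad idea (average input gradients over several noisy copies of the input) with plain TensorFlow; the toy model, the (100, 512) input shape, the 20 samples and the 1.0 noise level are assumptions chosen to mirror the call above, not the project's own explainer invocation.

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Toy 1-D convolutional classifier standing in for the model built by makeModel (shapes assumed)
model = keras.models.Sequential([
    keras.layers.Conv1D(32, 3, activation='relu', input_shape=(100, 512)),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(1, activation='sigmoid'),
])

obs = tf.random.normal((1, 100, 512))              # one fake encoded pull
grads = []
for _ in range(20):                                # 20 noisy samples, noise stddev 1.0
    noisy = tf.Variable(obs + tf.random.normal(obs.shape, stddev=1.0))
    with tf.GradientTape() as tape:
        pred = model(noisy)
    grads.append(tape.gradient(pred, noisy).numpy())

saliency = np.mean(grads, axis=0)                  # averaged input-gradient map, shape (1, 100, 512)
print(saliency.shape)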
@@ -209,12 +245,15 @@

Methods

def makeModel(self, input_shape)
-
+

Make 1d model for role agnostic data

Expand source code
def makeModel(self, input_shape):
+    """
+    Make 1d model for role agnostic data
+    """
     self.input_shape = input_shape
     self.model.add(keras.layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
     self.model.add(keras.layers.MaxPooling1D(2))
@@ -230,12 +269,15 @@ 

Methods

def makeModel2D(self, input_shape)
-
+

Make 2d model for role relevant data

Expand source code
def makeModel2D(self, input_shape):
+    """
+    Make 2d model for role relevant data
+    """
     self.input_shape = input_shape
     self.model.add(keras.layers.Conv2D(4, (5, 5), activation='relu', input_shape=input_shape))
     self.model.add(keras.layers.MaxPooling2D((4, 4)))
@@ -252,28 +294,47 @@ 

Methods

def predict(self, obs, labels=False)
-
+

Get predictions

Expand source code
def predict(self, obs, labels=False):
+    """
+    Get predictions
+    """
     predictions = self.model.predict(np.array(obs))
     if labels:
         return [1 if x > 0.5 else 0 for x in predictions]
     return predictions
+
+def saveModel(self, name, version)
+
+
+Expand source code
+
def saveModel(self, name, version):
+    self.model.save("{}/{}".format(name, version))
+
+
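saveModel is a thin wrapper around keras' model.save, so the name/version directory it writes is a standard SavedModel. A hedged usage sketch follows; the import path, the input shape and the "demo-model"/"001" names are placeholders.

from tensorflow import keras
from baseCNN import BaseCNN   # assumed import path

cnn = BaseCNN()
cnn.makeModel((100, 512))                  # illustrative 1-D input shape
cnn.saveModel("demo-model", "001")         # writes demo-model/001/ via self.model.save(...)
restored = keras.models.load_model("demo-model/001")
restored.summary()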
def scoreModel(self, obs, res)
-
+

Score model for accuracy, precision and recall

Expand source code
def scoreModel(self, obs, res):
+    """
+    Score model for accuracy, precision and recall
+    """
     evaluation = {}
     evaluation['Loss_Acc'] = self.model.evaluate(np.array(obs), np.array(res))
     evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -286,12 +347,15 @@ 

Methods

def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32)
-
+

Train model

Expand source code
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
+    """
+    Train model
+    """
     self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
     if val_set:
         train_hist = self.model.fit(np.array(obs), np.array(res), epochs=epochs, batch_size=batch_size,
@@ -328,6 +392,7 @@ 

makeModel
  • makeModel2D
  • predict
  • +saveModel
  • scoreModel
  • trainModel
diff --git a/docs/ml-conversational-analytic-tool/baseLSTM.html b/docs/ml-conversational-analytic-tool/baseLSTM.html
index c11717a..e8fc887 100644
--- a/docs/ml-conversational-analytic-tool/baseLSTM.html
+++ b/docs/ml-conversational-analytic-tool/baseLSTM.html
@@ -35,17 +35,20 @@

    Module ml-conversational-analytic-tool.baseLSTM

    Classes

    class BaseLSTM
    -
    +

    Constructor creates model

    Expand source code
    class BaseLSTM:
    -    # Constructor creates model
         def __init__(self):
    +        """
    +        Constructor creates model
    +        """
             self.dimension2 = True
             self.input_shape = ()
             self.model = None
             self.model_ready = False
     
    -    # Make lstm model for role agnostic data
         def makeModel(self, input_shape):
    +        """
    +        Make lstm model for role agnostic data
    +        """
             self.input_shape = input_shape
             self.model = keras.models.Sequential()
             self.model.add(keras.layers.LSTM(8, input_shape=input_shape))
    @@ -164,22 +185,29 @@ 

    Classes

        self.dimension2 = False
        self.model_ready = True

-    # Made lstm model for role relevant layers data
     def makeModel2D(self, input_shape):
+        """
+        Made lstm model for role relevant layers data
+        """
+        # Inputs
         self.input_shape = input_shape
         inputs = keras.Input(shape=(None, 512, 2), dtype="float32")
+
+        # Pipe output of author and reviewer layer to two lstm
         author = keras.layers.Lambda(lambda x: x[:, :, :, 0])(inputs)
         reviewer = keras.layers.Lambda(lambda x: x[:, :, :, 1])(inputs)
+
+        # Create author lstm
         author_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(author)
         author_dropout = keras.layers.Dropout(0.2)(author_lstm)
         author_output = keras.layers.Dense(1, activation='relu')(author_dropout)
+
+        # Create reviewer lstm
         reviewer_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(reviewer)
         reviewer_dropout = keras.layers.Dropout(0.2)(reviewer_lstm)
         reviewer_output = keras.layers.Dense(1, activation='relu')(reviewer_dropout)
+
+        # Concatenate author and reviewer output
         combine = keras.layers.Concatenate(axis=1)([author_output, reviewer_output])
         output = keras.layers.Dense(1, activation='sigmoid')(combine)
@@ -189,7 +217,6 @@

    Classes

        self.dimension2 = True
        self.model_ready = True

-    # Train model
     def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
         self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
         if val_set:
@@ -203,8 +230,13 @@

    Classes

                                         batch_size=batch_size, verbose=1)
         return train_hist

-    # Score model for accuracy, precision and recall
+    def saveModel(self, name, version):
+        self.model.save("{}/{}".format(name, version))
+
     def scoreModel(self, obs, res):
+        """
+        Score model for accuracy, precision and recall
+        """
         evaluation = {}
         evaluation['Loss_Acc'] = self.model.evaluate(tf.ragged.stack(obs), tf.convert_to_tensor(res))
         evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -212,21 +244,24 @@

    Classes

    print("Accuracy: {}".format(evaluation['Loss_Acc'][1])) return evaluation - # Get predictions def predict(self, obs, labels=False): predictions = self.model.predict(tf.ragged.stack(obs)) if labels: return [1 if x > 0.5 else 0 for x in predictions] return predictions - # Explain prediction for obs by calculating gradients def explain(self, obs): + """ + Explain prediction for obs by calculating gradients + """ grads = self._gradientImportance(obs) imp = grads[:, 0] + grads[:, 1] return imp - # Get gradients def _gradientImportance(self, seq): + """ + Get gradients + """ seq = tf.Variable(seq[np.newaxis, :, :]) with tf.GradientTape() as tape: predictions = self.model(seq) @@ -241,12 +276,15 @@

    Methods

    def explain(self, obs)
    -
    +

    Explain prediction for obs by calculating gradients

    Expand source code
    def explain(self, obs):
    +    """
    +    Explain prediction for obs by calculating gradients
    +    """
         grads = self._gradientImportance(obs)
         imp = grads[:, 0] + grads[:, 1]
         return imp
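The gradient attribution above relies on _gradientImportance, which differentiates the prediction with respect to the input sequence. The sketch below replays the same GradientTape pattern on a toy LSTM with deliberately small, assumed shapes (10 timesteps, 2 channels); the real role-relevant model uses (None, 512, 2) inputs.

import numpy as np
import tensorflow as tf
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.LSTM(8, input_shape=(None, 2)),
    keras.layers.Dense(1, activation='sigmoid'),
])

seq = np.random.rand(10, 2).astype("float32")   # 10 utterances, 2 channels (author / reviewer)
var = tf.Variable(seq[np.newaxis, :, :])        # add batch dimension, as _gradientImportance does
with tf.GradientTape() as tape:
    pred = model(var)
grads = tape.gradient(pred, var).numpy()[0]     # per-timestep input gradients, shape (10, 2)
importance = grads[:, 0] + grads[:, 1]          # sum the two channels, as explain() does
print(importance.shape)                         # (10,)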
    @@ -256,12 +294,15 @@

    Methods

    def makeModel(self, input_shape)
    -
    +

    Make lstm model for role agnostic data

    Expand source code
    def makeModel(self, input_shape):
    +    """
    +    Make lstm model for role agnostic data
    +    """
         self.input_shape = input_shape
         self.model = keras.models.Sequential()
         self.model.add(keras.layers.LSTM(8, input_shape=input_shape))
    @@ -276,26 +317,34 @@ 

    Methods

    def makeModel2D(self, input_shape)
    -
    +

    Made lstm model for role relevant layers data

    Expand source code
    def makeModel2D(self, input_shape):
    +    """
    +    Made lstm model for role relevant layers data
    +    """
    +
         # Inputs
         self.input_shape = input_shape
         inputs = keras.Input(shape=(None, 512, 2), dtype="float32")
    +
         # Pipe output of author and reviewer layer to two lstm
         author = keras.layers.Lambda(lambda x: x[:, :, :, 0])(inputs)
         reviewer = keras.layers.Lambda(lambda x: x[:, :, :, 1])(inputs)
    +
         # Create author lstm
         author_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(author)
         author_dropout = keras.layers.Dropout(0.2)(author_lstm)
         author_output = keras.layers.Dense(1, activation='relu')(author_dropout)
    +
         # Create reviewer lstm
         reviewer_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(reviewer)
         reviewer_dropout = keras.layers.Dropout(0.2)(reviewer_lstm)
         reviewer_output = keras.layers.Dense(1, activation='relu')(reviewer_dropout)
    +
         # Concatenate author and reviewer output
         combine = keras.layers.Concatenate(axis=1)([author_output, reviewer_output])
         output = keras.layers.Dense(1, activation='sigmoid')(combine)
    @@ -322,16 +371,32 @@ 

    Methods

    return predictions
    +
+def saveModel(self, name, version)
+
+
+Expand source code
+
    def saveModel(self, name, version):
    +    self.model.save("{}/{}".format(name, version))
    +
    +
    def scoreModel(self, obs, res)
    -
    +

    Score model for accuracy, precision and recall

    Expand source code
    def scoreModel(self, obs, res):
    +    """
    +    Score model for accuracy, precision and recall
    +    """
         evaluation = {}
         evaluation['Loss_Acc'] = self.model.evaluate(tf.ragged.stack(obs), tf.convert_to_tensor(res))
         evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
    @@ -388,6 +453,7 @@ 

    makeModel
  • makeModel2D
  • predict
  • +saveModel
  • scoreModel
  • trainModel
diff --git a/docs/ml-conversational-analytic-tool/commentAnalysis.html b/docs/ml-conversational-analytic-tool/commentAnalysis.html
index 70caebc..e98c683 100644
--- a/docs/ml-conversational-analytic-tool/commentAnalysis.html
+++ b/docs/ml-conversational-analytic-tool/commentAnalysis.html
@@ -37,17 +37,21 @@

    Module ml-conversational-analytic-tool.commentAnalysis

    Classes

    class CommentAnalyzer (words)
    -
    +

    Constructors form a dictionary to be used for counting. +Parameters: words - list of words to count

    Expand source code
    class CommentAnalyzer:
    -    # Constructors form a dictionary to be used for counting.
    -    # Parameters: words - list of words to count
         def __init__(self, words):
    +        """
    +        Constructors form a dictionary to be used for counting.
    +        Parameters: words - list of words to count
    +        """
             self.word_count = {word.lower(): 0 for word in words}  # Create dictionary with list items as key
             self.vader_sentiment = SentimentIntensityAnalyzer()  # Initialize sentiment analysis model
     
    -    # Method to get desired features from an input comment.
    -    # Parameters: comment - string. Returns: dictionary with features
         def analyzeComment(self, comment):
    +        """
    +        Method to get desired features from an input comment.
    +        Parameters: comment - string.
    +        Returns: dictionary with features
    +        """
             result = {}  # Create return dictionary
             cleaned_comment = self.preProcess(comment)  # Clean comment text
             # result['Word Counts'] = self.countWords(cleaned_comment)  # Determine word counts
    @@ -143,16 +166,22 @@ 

    Classes

         result['Code Blocks'] = self.getCodeBlockCount(cleaned_comment)  # Determine code block count
         return result

-    # Method to clean and return text. Parameters: text - string.
-    # Returns: string after cleaning
     def preProcess(self, text):
+        """
+        Method to clean and return text.
+        Parameters: text - string.
+        Returns: string after cleaning
+        """
         cleaned_text = text.strip()  # Remove trailing and starting spaces
         cleaned_text = cleaned_text.lower()  # Convert to lowercase
         return cleaned_text

-    # Method to determine word count. Parameters: comment - string
-    # Returns: dictionary with word counts
     def countWords(self, comment):
+        """
+        Method to determine word count.
+        Parameters: comment - string
+        Returns: dictionary with word counts
+        """
         words = comment.split(" ")  # Split text into words
         current_word_count = self.word_count.copy()  # Copy default dict for new count
         for word in words:  # Iterate over all words
@@ -160,22 +189,29 @@

    Classes

                 current_word_count[word] = current_word_count[word] + 1
         return current_word_count

-    # Method to determine the code blocks. Parameters: comment - string
-    # Returns: integer count
     def getCodeBlockCount(self, comment):
+        """
+        Method to determine the code blocks.
+        Parameters: comment - string
+        Returns: integer count
+        """
         count = comment.count("```")  # Find occurences of code block
         if count % 2 != 0:  # Should be in pairs
             print("Warning: Mismatched code blocks")
             return int(count / 2 - 1)  # Subtract 1 since unmatched pair
         return int(count / 2)  # Divide by 2 since pairs

-    # Method to determine sentiment. Parameters: comment - string
-    # Returns: dictionary with positive, negative and neutral scores
     def getSentiment(self, comment):
+        """
+        Method to determine sentiment. Parameters: comment - string
+        Returns: dictionary with positive, negative and neutral scores
+        """
         return self.vader_sentiment.polarity_scores(comment)["compound"]

-    # Method to change words to count. Parameters: Set new word count with new keys/
     def changeWords(self, words):
+        """
+        Method to change words to count. Parameters: Set new word count with new keys/
+        """
         self.word_count = {word: 0 for word in words}
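A hedged usage sketch of the class above; the import path is an assumption, and the exact keys of the returned dictionary depend on code not shown in this hunk.

from commentAnalysis import CommentAnalyzer   # assumed import path

analyzer = CommentAnalyzer(["lgtm", "nit"])
features = analyzer.analyzeComment("LGTM! ```print('hi')``` Just one small nit.")
print(features)   # expected to include a 'Code Blocks' count of 1 and a compound sentiment score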

    Methods

    @@ -184,12 +220,19 @@

    Methods

    def analyzeComment(self, comment)
    -
    +

    Method to get desired features from an input comment. +Parameters: comment - string. +Returns: dictionary with features

    Expand source code
    def analyzeComment(self, comment):
    +    """
    +    Method to get desired features from an input comment.
    +    Parameters: comment - string.
    +    Returns: dictionary with features
    +    """
         result = {}  # Create return dictionary
         cleaned_comment = self.preProcess(comment)  # Clean comment text
         # result['Word Counts'] = self.countWords(cleaned_comment)  # Determine word counts
    @@ -203,12 +246,15 @@ 

    Methods

    def changeWords(self, words)
    -
    +

    Method to change words to count. Parameters: Set new word count with new keys/

    Expand source code
    def changeWords(self, words):
    +    """
    +    Method to change words to count. Parameters: Set new word count with new keys/
    +    """
         self.word_count = {word: 0 for word in words}
    @@ -216,12 +262,19 @@

    Methods

    def countWords(self, comment)
    -
    +

    Method to determine word count. +Parameters: comment - string +Returns: dictionary with word counts

    Expand source code
    def countWords(self, comment):
    +    """
    +    Method to determine word count.
    +    Parameters: comment - string
    +    Returns: dictionary with word counts
    +    """
         words = comment.split(" ")  # Split text into words
         current_word_count = self.word_count.copy()  # Copy default dict for new count
         for word in words:  # Iterate over all words
    @@ -234,12 +287,19 @@ 

    Methods

    def getCodeBlockCount(self, comment)
    -
    +

    Method to determine the code blocks. +Parameters: comment - string +Returns: integer count

    Expand source code
    def getCodeBlockCount(self, comment):
    +    """
    +    Method to determine the code blocks.
    +    Parameters: comment - string
    +    Returns: integer count
    +    """
         count = comment.count("```")  # Find occurences of code block
         if count % 2 != 0:  # Should be in pairs
             print("Warning: Mismatched code blocks")
    @@ -251,12 +311,17 @@ 

    Methods

    def getSentiment(self, comment)
    -
    +

    Method to determine sentiment. Parameters: comment - string +Returns: dictionary with positive, negative and neutral scores

    Expand source code
    def getSentiment(self, comment):
    +    """
    +    Method to determine sentiment. Parameters: comment - string
    +    Returns: dictionary with positive, negative and neutral scores
    +    """
         return self.vader_sentiment.polarity_scores(comment)["compound"]
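Note that getSentiment returns only VADER's compound score, a single value in [-1, 1], even though the description above mentions the positive/negative/neutral breakdown. A stand-alone sketch, assuming the SentimentIntensityAnalyzer used by this module comes from the vaderSentiment package:

# Assumes the vaderSentiment package provides SentimentIntensityAnalyzer, as used in this module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()
scores = vader.polarity_scores("This change looks great, thanks for the quick fix!")
print(scores)              # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
print(scores["compound"])  # single value in [-1, 1], which is what getSentiment returns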
    @@ -264,12 +329,19 @@

    Methods

    def preProcess(self, text)
    -
    +

    Method to clean and return text. +Parameters: text - string. +Returns: string after cleaning

    Expand source code
    def preProcess(self, text):
    +    """
    +    Method to clean and return text.
    +    Parameters: text - string.
    +    Returns: string after cleaning
    +    """
         cleaned_text = text.strip()  # Remove trailing and starting spaces
         cleaned_text = cleaned_text.lower()  # Convert to lowercase
         return cleaned_text
diff --git a/docs/ml-conversational-analytic-tool/featureVector.html b/docs/ml-conversational-analytic-tool/featureVector.html
index 0e2b581..6fbfe77 100644
--- a/docs/ml-conversational-analytic-tool/featureVector.html
+++ b/docs/ml-conversational-analytic-tool/featureVector.html
@@ -38,12 +38,12 @@

    Module ml-conversational-analytic-tool.featureVector

    Classes

    class Featurizer (retain_features, analysis_features)
    -
    +

    Constructor sets instance variables +Inputs: Retained Features (list) -> pull data features, Analysis Features (list) - > comment analysis features

    Expand source code
    class Featurizer:
    -    # Constructor sets instance variables
    -    # Inputs: Retained Features (list) -> pull data features, Analysis Features (list) - > comment analysis features
         def __init__(self, retain_features, analysis_features):
    +        """
    +        Constructor sets instance variables
    +        Inputs: Retained Features (list) -> pull data features, Analysis Features (list) - > comment analysis features
    +        """
             self.analysis_features = analysis_features
             self.retain_pull_features = retain_features
             self.raw_filename = ""
    @@ -266,9 +281,11 @@ 

    Classes

        self.featurized_data = None
        self.commentAnalyzer = None

-    # Function to read raw data stored as csv
-    # Inputs: File location (string) -> input raw data file location
     def readRawData(self, filename):
+        """
+        Function to read raw data stored as csv
+        Inputs: File location (string) -> input raw data file location
+        """
         self.raw_filename = filename
         self.raw_data = pd.read_csv(filename)
@@ -284,9 +301,11 @@

    Classes

         self.raw_data['Review_Comments'] = self.raw_data['Review_Comments'].apply(lambda comment: stringToDict(comment))
         print("Done reading raw data.")

-    # Function to obtain Comment Analyzer with parameters
-    # Inputs: File location (string) -> .txt containing keywords to count
     def setupCommentAnalyzer(self, filename):
+        """
+        Function to obtain Comment Analyzer with parameters
+        Inputs: File location (string) -> .txt containing keywords to count
+        """
         word_list = []

         # Open file to obtain list of words in Comment Analzyer
@@ -296,9 +315,11 @@

    Classes

         self.analysis_features = self.analysis_features + word_list
         print("Comment Analyzer Setup")

-    # Function to create/export dataset with desired features
-    # Inputs: Name of file to export (string), export flag (boolean)
     def formFeatures(self, export_filename="", export=True):
+        """
+        Function to create/export dataset with desired features
+        Inputs: Name of file to export (string), export flag (boolean)
+        """
         features = []  # List of rows to convert to dataframe

         # Iterate over each pull
@@ -391,12 +412,16 @@

    Classes

         return export_df

-    # Function to form conversation string given communication on a pull request
-    # Inputs: Name of file to export (string), export flag (boolean)
     def pullStringConversation(self, export_filename="", export=True):
+        """
+        Function to form conversation string given communication on a pull request
+        Inputs: Name of file to export (string), export flag (boolean)
+        """
+
+        # Store each interaction and pull URL for export
         string_conversations = []
         pull_urls = []
+        pull_numbers = []

         for index, row in self.raw_data.iterrows():
             # Make pull message
@@ -415,9 +440,11 @@

    Classes

    comment_row["Body"]) string_conversations.append(conversation.encode("ascii", "ignore").decode()) pull_urls.append(row["URL"]) + pull_numbers.append(row["Number"]) # Export converation field dataset export_df = pd.DataFrame() + export_df["Number"] = pull_numbers export_df["URL"] = pull_urls export_df["Thread"] = string_conversations @@ -436,12 +463,17 @@

    Methods

    def formFeatures(self, export_filename='', export=True)
    -
    +

    Function to create/export dataset with desired features +Inputs: Name of file to export (string), export flag (boolean)

    Expand source code
    def formFeatures(self, export_filename="", export=True):
    +    """
    +    Function to create/export dataset with desired features
    +    Inputs: Name of file to export (string), export flag (boolean)
    +    """
         features = []  # List of rows to convert to dataframe
     
         # Iterate over each pull
    @@ -539,15 +571,22 @@ 

    Methods

    def pullStringConversation(self, export_filename='', export=True)
    -
    +

    Function to form conversation string given communication on a pull request +Inputs: Name of file to export (string), export flag (boolean)

    Expand source code
    def pullStringConversation(self, export_filename="", export=True):
    +    """
    +    Function to form conversation string given communication on a pull request
    +    Inputs: Name of file to export (string), export flag (boolean)
    +    """
    +
         # Store each interaction and pull URL for export
         string_conversations = []
         pull_urls = []
    +    pull_numbers = []
     
         for index, row in self.raw_data.iterrows():
             # Make pull message
    @@ -566,9 +605,11 @@ 

    Methods

    comment_row["Body"]) string_conversations.append(conversation.encode("ascii", "ignore").decode()) pull_urls.append(row["URL"]) + pull_numbers.append(row["Number"]) # Export converation field dataset export_df = pd.DataFrame() + export_df["Number"] = pull_numbers export_df["URL"] = pull_urls export_df["Thread"] = string_conversations @@ -586,12 +627,17 @@

    Methods

    def readRawData(self, filename)
    -
    +

    Function to read raw data stored as csv +Inputs: File location (string) -> input raw data file location

    Expand source code
    def readRawData(self, filename):
    +    """
    +    Function to read raw data stored as csv
    +    Inputs: File location (string) -> input raw data file location
    +    """
         self.raw_filename = filename
         self.raw_data = pd.read_csv(filename)
     
    @@ -612,12 +658,17 @@ 

    Methods

    def setupCommentAnalyzer(self, filename)
    -
    +

    Function to obtain Comment Analyzer with parameters +Inputs: File location (string) -> .txt containing keywords to count

    Expand source code
    def setupCommentAnalyzer(self, filename):
    +    """
    +    Function to obtain Comment Analyzer with parameters
    +    Inputs: File location (string) -> .txt containing keywords to count
    +    """
         word_list = []
     
         # Open file to obtain list of words in Comment Analzyer
    diff --git a/docs/ml-conversational-analytic-tool/githubDataExtraction.html b/docs/ml-conversational-analytic-tool/githubDataExtraction.html
    index fa70a3e..93adbb9 100644
    --- a/docs/ml-conversational-analytic-tool/githubDataExtraction.html
    +++ b/docs/ml-conversational-analytic-tool/githubDataExtraction.html
    @@ -29,20 +29,20 @@ 

    Module ml-conversational-analytic-tool.githubDataExtract
    # Copyright 2021 VMware, Inc.
     # SPDX-License-Identifier: Apache-2.0
     
    -import argparse
     import os
    -# Imports
     import time
     
    +import argparse
     import pandas as pd
     from github import Github
     from github.GithubException import RateLimitExceededException
     
     
    -# Class to extract pull data from a github repo. The data includes reviews, comments, and other metadata from each pull.
     class GithubDataExtractor:
    -    # Constructor requires an access token to start a Github session, and specifies instance variables
         def __init__(self, access_token):
    +        """
    +        Constructor requires an access token to start a Github session, and specifies instance variables
    +        """
             self.g_ses = Github(access_token)  # Github object is used as a channel to the Github API
             self.current_repo = None  # Current Opended Repo
             self.reaction_flag = False
    @@ -50,18 +50,22 @@ 

Module ml-conversational-analytic-tool.githubDataExtract

        self.repo_name = ""
        self.organization = ""

-    # Method to open (access) repository with given organization and repository name (reponame).
-    # Parameters: usernmae - owner of the repository, repo_name - name of repo to be opened
     def openRepo(self, organization, repo_name):
+        """
+        Method to open (access) repository with given organization and repository name (reponame).
+        Parameters: username - owner of the repository, repo_name - name of repo to be opened
+        """
         self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name)  # Open repo
         self.repo_opened = True
         self.repo_name = repo_name
         self.organization = organization
         print("Opened repo {} - {}".format(repo_name, organization))

-    # Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
-    # export - if the dataframe should be exported to csv. Returns: Dataframe with pull data
     def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True):
+        """
+        Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
+        export - if the dataframe should be exported to csv. Returns: Dataframe with pull data
+        """
         self.reaction_flag = reaction_flag
         if self.repo_opened:  # Verify if a repo has been opened
             pull_data = []
@@ -78,9 +82,11 @@

Module ml-conversational-analytic-tool.githubDataExtract

             return pull_df
         print("Please open a Repo")

-    # Extract pulls with given state. Parameters: state - state of the pull (open or closed)
-    # Return: list of dictionaries containing data regardining each pull
     def getPullsByState(self, state):
+        """
+        Extract pulls with given state. Parameters: state - state of the pull (open or closed)
+        Return: list of dictionaries containing data regardining each pull
+        """
         pull_data = []
         try:  # Call the Github api to get all pulls
             pulls = self.current_repo.get_pulls(state=state, sort='create')
@@ -98,9 +104,11 @@

Module ml-conversational-analytic-tool.githubDataExtract

             pull_data.append(self.getPullFeatures(pull))
         return pull_data

-    # Method to form a list of json strings rerpesenting comments (reviews or issue).
-    # Parameters: comments - list of comment objects. Returns: List of json strings
     def listOfComments(self, comments):
+        """
+        Method to form a list of json strings rerpesenting comments (reviews or issue).
+        Parameters: comments - list of comment objects. Returns: List of json strings
+        """
         list_comments = []

         # Iterate over each comment
@@ -142,9 +150,11 @@

Module ml-conversational-analytic-tool.githubDataExtract

                                                 "Updated_At": str(comment.updated_at)}))
         return list_comments

-    # Method to get all data for a particular pull. Parameters: pull - object representing a pull
-    # Returns: dictionary containing all data of a pull
     def getPullFeatures(self, pull):
+        """
+        Method to get all data for a particular pull. Parameters: pull - object representing a pull
+        Returns: dictionary containing all data of a pull
+        """
         pull_dict = {}
         pull_dict["Number"] = pull.number
         pull_dict["Title"] = pull.title
@@ -187,6 +197,7 @@

Module ml-conversational-analytic-tool.githubDataExtract

     ACCESS_TOKEN = os.environ["GITACCESS"]  # Access Github token from environment for security purposes
     extractor = GithubDataExtractor(ACCESS_TOKEN)  # Create object
     extractor.openRepo(args.organization, args.reponame)  # Open repo
+    # Extract all pulls and export them to .csv
     if args.filename:
         extractor.getAllPulls(args.filename, args.reactions)
@@ -208,14 +219,16 @@

    Classes

    class GithubDataExtractor (access_token)
    -
    +

    Constructor requires an access token to start a Github session, and specifies instance variables

    Expand source code
    class GithubDataExtractor:
    -    # Constructor requires an access token to start a Github session, and specifies instance variables
         def __init__(self, access_token):
    +        """
    +        Constructor requires an access token to start a Github session, and specifies instance variables
    +        """
             self.g_ses = Github(access_token)  # Github object is used as a channel to the Github API
             self.current_repo = None  # Current Opended Repo
             self.reaction_flag = False
    @@ -223,18 +236,22 @@ 

    Classes

    self.repo_name = "" self.organization = "" - # Method to open (access) repository with given organization and repository name (reponame). - # Parameters: usernmae - owner of the repository, repo_name - name of repo to be opened def openRepo(self, organization, repo_name): + """ + Method to open (access) repository with given organization and repository name (reponame). + Parameters: username - owner of the repository, repo_name - name of repo to be opened + """ self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name) # Open repo self.repo_opened = True self.repo_name = repo_name self.organization = organization print("Opened repo {} - {}".format(repo_name, organization)) - # Method to form a dataframe containing pull information. Parameters: name - name of exported csv file, - # export - if the dataframe should be exported to csv. Returns: Dataframe with pull data def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True): + """ + Method to form a dataframe containing pull information. Parameters: name - name of exported csv file, + export - if the dataframe should be exported to csv. Returns: Dataframe with pull data + """ self.reaction_flag = reaction_flag if self.repo_opened: # Verify if a repo has been opened pull_data = [] @@ -251,9 +268,11 @@

    Classes

    return pull_df print("Please open a Repo") - # Extract pulls with given state. Parameters: state - state of the pull (open or closed) - # Return: list of dictionaries containing data regardining each pull def getPullsByState(self, state): + """ + Extract pulls with given state. Parameters: state - state of the pull (open or closed) + Return: list of dictionaries containing data regardining each pull + """ pull_data = [] try: # Call the Github api to get all pulls pulls = self.current_repo.get_pulls(state=state, sort='create') @@ -271,9 +290,11 @@

    Classes

    pull_data.append(self.getPullFeatures(pull)) return pull_data - # Method to form a list of json strings rerpesenting comments (reviews or issue). - # Parameters: comments - list of comment objects. Returns: List of json strings def listOfComments(self, comments): + """ + Method to form a list of json strings rerpesenting comments (reviews or issue). + Parameters: comments - list of comment objects. Returns: List of json strings + """ list_comments = [] # Iterate over each comment @@ -315,9 +336,11 @@

    Classes

    "Updated_At": str(comment.updated_at)})) return list_comments - # Method to get all data for a particular pull. Parameters: pull - object representing a pull - # Returns: dictionary containing all data of a pull def getPullFeatures(self, pull): + """ + Method to get all data for a particular pull. Parameters: pull - object representing a pull + Returns: dictionary containing all data of a pull + """ pull_dict = {} pull_dict["Number"] = pull.number pull_dict["Title"] = pull.title @@ -353,12 +376,17 @@

    Methods

    def getAllPulls(self, name='', reaction_flag=False, export_to_csv=True)
    -
    +

    Method to form a dataframe containing pull information. Parameters: name - name of exported csv file, +export - if the dataframe should be exported to csv. Returns: Dataframe with pull data

    Expand source code
    def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True):
    +    """
    +    Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
    +    export - if the dataframe should be exported to csv. Returns: Dataframe with pull data
    +    """
         self.reaction_flag = reaction_flag
         if self.repo_opened:  # Verify if a repo has been opened
             pull_data = []
    @@ -380,12 +408,17 @@ 

    Methods

    def getPullFeatures(self, pull)
    -
    +

    Method to get all data for a particular pull. Parameters: pull - object representing a pull +Returns: dictionary containing all data of a pull

    Expand source code
    def getPullFeatures(self, pull):
    +    """
    +    Method to get all data for a particular pull. Parameters: pull - object representing a pull
    +    Returns: dictionary containing all data of a pull
    +    """
         pull_dict = {}
         pull_dict["Number"] = pull.number
         pull_dict["Title"] = pull.title
    @@ -420,12 +453,17 @@ 

    Methods

    def getPullsByState(self, state)
    -
    +

    Extract pulls with given state. Parameters: state - state of the pull (open or closed) +Return: list of dictionaries containing data regardining each pull

    Expand source code
    def getPullsByState(self, state):
    +    """
    +    Extract pulls with given state. Parameters: state - state of the pull (open or closed)
    +    Return: list of dictionaries containing data regardining each pull
    +    """
         pull_data = []
         try:  # Call the Github api to get all pulls
             pulls = self.current_repo.get_pulls(state=state, sort='create')
    @@ -448,12 +486,17 @@ 

    Methods

    def listOfComments(self, comments)
    -
    +

    Method to form a list of json strings rerpesenting comments (reviews or issue). +Parameters: comments - list of comment objects. Returns: List of json strings

    Expand source code
    def listOfComments(self, comments):
    +    """
    +    Method to form a list of json strings rerpesenting comments (reviews or issue).
    +    Parameters: comments - list of comment objects. Returns: List of json strings
    +    """
         list_comments = []
     
         # Iterate over each comment
    @@ -500,12 +543,18 @@ 

    Methods

    def openRepo(self, organization, repo_name)
    -
    +

    Method to +open (access) repository with given organization and repository name (reponame). +Parameters: username - owner of the repository, repo_name - name of repo to be opened

    Expand source code
    def openRepo(self, organization, repo_name):
    +    """
    +    Method to  open (access) repository with given organization and repository name (reponame).
    +    Parameters: username - owner of the repository, repo_name - name of repo to be opened
    +    """
         self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name)  # Open repo
         self.repo_opened = True
         self.repo_name = repo_name
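A hedged end-to-end sketch of the extractor API documented in this file; the organization and repository names are placeholders, and GITACCESS must hold a valid GitHub token, as in the module's __main__ block:

import os
from githubDataExtraction import GithubDataExtractor   # assumed import path

extractor = GithubDataExtractor(os.environ["GITACCESS"])
extractor.openRepo("vmware", "ml-conversational-analytic-tool")     # placeholder org/repo
pull_df = extractor.getAllPulls(name="pulls", reaction_flag=False, export_to_csv=False)
print(pull_df.shape)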
    diff --git a/docs/ml-conversational-analytic-tool/preProcessedDataset.html b/docs/ml-conversational-analytic-tool/preProcessedDataset.html
    index 4f2d49c..c810642 100644
    --- a/docs/ml-conversational-analytic-tool/preProcessedDataset.html
    +++ b/docs/ml-conversational-analytic-tool/preProcessedDataset.html
    @@ -36,10 +36,11 @@ 

Module ml-conversational-analytic-tool.preProcessedDatas

 import tensorflow as tf

-# Class to prepare dataset for machine learning tasks
 class PreProcessedDataset:
-    # Set flags and instance variables in constructor
     def __init__(self, vocab_size=1000, no_tokens=512, max_pull_length=100):
+        """
+        Set flags and instance variables in constructor
+        """
         self.annotated_data = None
         self.dataset = None
         self.full_dataset = None
@@ -59,8 +60,11 @@

Module ml-conversational-analytic-tool.preProcessedDatas

         self.full_dataset_ready = False
         self.encode_ready = False

-    # Setup the preprocessed dataset
     def setupPreProcess(self, annotated_filename, dataset_filename):
+        """
+        Setup the preprocessed dataset
+        """
+
         # Load datasets
         self.loadAnnotatedData(annotated_filename)
         self.loadDataset(dataset_filename)
@@ -79,8 +83,11 @@

    Module ml-conversational-analytic-tool.preProcessedDatas self.full_dataset['Review_Comments'] = self.full_dataset['Review_Comments'].apply( lambda comment: stringToDict(comment)) - # Encode all utterances def encodeData(self): + """ + Encode all utterances + """ + all_utterances = self._getObsResUsers() # Encode each utterance for utterances in all_utterances: @@ -98,8 +105,11 @@

    Module ml-conversational-analytic-tool.preProcessedDatas self.results['Inclusive'] = self.results['Inclusive'][:idx] + self.results['Inclusive'][idx + 1:] self.curr_max_length = max([len(x) for x in self.all_encoded_utterances]) - # Get matrix observation and results for ML task soutcome = Inclusive, Constructive, or None -> both def getRoleAgnosticMatrix(self, outcome=None, padPull=True): + """ + Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both + """ + obs = [] for i in range(len(self.all_encoded_utterances)): def pad(inp): @@ -116,17 +126,21 @@

    Module ml-conversational-analytic-tool.preProcessedDatas res = self.getRes(outcome) return obs, res - # Get stacked matrix observation and results for ML task def getRoleMatrix(self, outcome=None, padPull=True): + """ + Get stacked matrix observation and results for ML task + """ # Check if results must be padded to same length for each pull if padPull: # Author and reviwer layers layer_writer = [] layer_reviewer = [] + for i in range(len(self.all_encoded_utterances)): writer_comments = [] reviewer_comments = [] writer = self.all_users[i][0] + for j in range(len(self.all_encoded_utterances[i])): # Check if author or reviewer if self.all_users[i][j] == writer: @@ -136,12 +150,14 @@

    Module ml-conversational-analytic-tool.preProcessedDatas reviewer_comments.append(self.all_encoded_utterances[i][j]) writer_comments.append(np.zeros(self.no_tokens)) padding = [0] * self.no_tokens + # Pad both reviewer and writer layers while len(writer_comments) < self.curr_max_length: writer_comments.append(padding.copy()) reviewer_comments.append(padding.copy()) layer_writer.append(np.array(writer_comments)) layer_reviewer.append(np.array(reviewer_comments)) + # Stack reviwer and author matrices obs = np.stack((layer_writer, layer_reviewer), axis=3) res = self.getRes(outcome) @@ -152,6 +168,7 @@

    Module ml-conversational-analytic-tool.preProcessedDatas writer_comments = [] reviewer_comments = [] writer = self.all_users[i][0] + for j in range(len(self.all_encoded_utterances[i])): if self.all_users[i][j] == writer: writer_comments.append(self.all_encoded_utterances[i][j]) @@ -164,24 +181,32 @@

Module ml-conversational-analytic-tool.preProcessedDatas

         res = self.getRes(outcome)
         return obs, res

-    # Get list of results
     def getRes(self, outcome=None):
+        """
+        Get list of results
+        """
         if outcome:
             return self.results[outcome]
         return pd.DataFrame(data=self.results)

-    # Load annotated dataset from file
     def loadAnnotatedData(self, filename):
+        """
+        Load annotated dataset from file
+        """
         self.annotated_data = pd.read_csv(filename)
         self.annotated_data_open = True

-    # Load raw data from file
     def loadDataset(self, filename):
+        """
+        Load raw data from file
+        """
         self.dataset = pd.read_csv(filename)
         self.dataset_open = True

-    # Encode an utterance through the lookup dictionary
     def encode(self, utterances):
+        """
+        Encode an utterance through the lookup dictionary
+        """
         encoded_utterances = []
         for utterance in utterances:
             obs = []
@@ -200,8 +225,10 @@

    Module ml-conversational-analytic-tool.preProcessedDatas encoded_utterances.append(np.array(obs)) return np.array(encoded_utterances) - # Setup the lookup dictionary through frequency encoding def _setupEncode(self): + """ + Setup the lookup dictionary through frequency encoding + """ word_counts = {} for index, row in self.annotated_data.iterrows(): words = row['Thread'].split(" ") @@ -210,6 +237,7 @@

    Module ml-conversational-analytic-tool.preProcessedDatas word_counts[word] = word_counts[word] + 1 else: word_counts[word] = 1 + # Sort the words to generate code # Lower number : higher count # 0 - Padding, 1 - End, 2 - Missing @@ -224,8 +252,10 @@

    Module ml-conversational-analytic-tool.preProcessedDatas break self.encode_ready = True - # Get all utterances by parsing dictionary for each pull def _getObsResUsers(self): + """ + Get all utterances by parsing dictionary for each pull + """ all_utterances = [] for index, row in self.full_dataset.iterrows(): utterances = [] @@ -234,17 +264,21 @@

    Module ml-conversational-analytic-tool.preProcessedDatas users.append(row['User']) temp_df_comments = pd.DataFrame(row['Comments']) temp_df_review_comments = pd.DataFrame(row["Review_Comments"]) + if len(temp_df_comments) > 0 or len(temp_df_review_comments) > 0: all_comments = temp_df_comments.append(temp_df_review_comments) all_comments['Created_At'] = pd.to_datetime(all_comments['Created_At']) all_comments = all_comments.sort_values(by=['Created_At']) + for comment_index, comment_row in all_comments.iterrows(): utterances.append(comment_row['Body']) users.append(comment_row['User']) + all_utterances.append(utterances) self.all_users.append(users) self.results['Constructive'].append(row['Constructive']) self.results['Inclusive'].append(row['Inclusive']) + return all_utterances

    @@ -262,14 +296,16 @@

    Classes

    class PreProcessedDataset (vocab_size=1000, no_tokens=512, max_pull_length=100)
    -
    +

    Set flags and instance variables in constructor

    Expand source code
    class PreProcessedDataset:
    -    # Set flags and instance variables in constructor
         def __init__(self, vocab_size=1000, no_tokens=512, max_pull_length=100):
    +        """
    +        Set flags and instance variables in constructor
    +        """
             self.annotated_data = None
             self.dataset = None
             self.full_dataset = None
    @@ -289,8 +325,11 @@ 

    Classes

    self.full_dataset_ready = False self.encode_ready = False - # Setup the preprocessed dataset def setupPreProcess(self, annotated_filename, dataset_filename): + """ + Setup the preprocessed dataset + """ + # Load datasets self.loadAnnotatedData(annotated_filename) self.loadDataset(dataset_filename) @@ -309,8 +348,11 @@

    Classes

    self.full_dataset['Review_Comments'] = self.full_dataset['Review_Comments'].apply( lambda comment: stringToDict(comment)) - # Encode all utterances def encodeData(self): + """ + Encode all utterances + """ + all_utterances = self._getObsResUsers() # Encode each utterance for utterances in all_utterances: @@ -328,8 +370,11 @@

    Classes

    self.results['Inclusive'] = self.results['Inclusive'][:idx] + self.results['Inclusive'][idx + 1:] self.curr_max_length = max([len(x) for x in self.all_encoded_utterances]) - # Get matrix observation and results for ML task soutcome = Inclusive, Constructive, or None -> both def getRoleAgnosticMatrix(self, outcome=None, padPull=True): + """ + Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both + """ + obs = [] for i in range(len(self.all_encoded_utterances)): def pad(inp): @@ -346,17 +391,21 @@

    Classes

    res = self.getRes(outcome) return obs, res - # Get stacked matrix observation and results for ML task def getRoleMatrix(self, outcome=None, padPull=True): + """ + Get stacked matrix observation and results for ML task + """ # Check if results must be padded to same length for each pull if padPull: # Author and reviwer layers layer_writer = [] layer_reviewer = [] + for i in range(len(self.all_encoded_utterances)): writer_comments = [] reviewer_comments = [] writer = self.all_users[i][0] + for j in range(len(self.all_encoded_utterances[i])): # Check if author or reviewer if self.all_users[i][j] == writer: @@ -366,12 +415,14 @@

    Classes

    reviewer_comments.append(self.all_encoded_utterances[i][j]) writer_comments.append(np.zeros(self.no_tokens)) padding = [0] * self.no_tokens + # Pad both reviewer and writer layers while len(writer_comments) < self.curr_max_length: writer_comments.append(padding.copy()) reviewer_comments.append(padding.copy()) layer_writer.append(np.array(writer_comments)) layer_reviewer.append(np.array(reviewer_comments)) + # Stack reviwer and author matrices obs = np.stack((layer_writer, layer_reviewer), axis=3) res = self.getRes(outcome) @@ -382,6 +433,7 @@

    Classes

    writer_comments = [] reviewer_comments = [] writer = self.all_users[i][0] + for j in range(len(self.all_encoded_utterances[i])): if self.all_users[i][j] == writer: writer_comments.append(self.all_encoded_utterances[i][j]) @@ -394,24 +446,32 @@

    Classes

    res = self.getRes(outcome) return obs, res - # Get list of results def getRes(self, outcome=None): + """ + Get list of results + """ if outcome: return self.results[outcome] return pd.DataFrame(data=self.results) - # Load annotated dataset from file def loadAnnotatedData(self, filename): + """ + Load annotated dataset from file + """ self.annotated_data = pd.read_csv(filename) self.annotated_data_open = True - # Load raw data from file def loadDataset(self, filename): + """ + Load raw data from file + """ self.dataset = pd.read_csv(filename) self.dataset_open = True - # Encode an utterance through the lookup dictionary def encode(self, utterances): + """ + Encode an utterance through the lookup dictionary + """ encoded_utterances = [] for utterance in utterances: obs = [] @@ -430,8 +490,10 @@

    Classes

    encoded_utterances.append(np.array(obs)) return np.array(encoded_utterances) - # Setup the lookup dictionary through frequency encoding def _setupEncode(self): + """ + Setup the lookup dictionary through frequency encoding + """ word_counts = {} for index, row in self.annotated_data.iterrows(): words = row['Thread'].split(" ") @@ -440,6 +502,7 @@

    Classes

    word_counts[word] = word_counts[word] + 1 else: word_counts[word] = 1 + # Sort the words to generate code # Lower number : higher count # 0 - Padding, 1 - End, 2 - Missing @@ -454,8 +517,10 @@

    Classes

    break self.encode_ready = True - # Get all utterances by parsing dictionary for each pull def _getObsResUsers(self): + """ + Get all utterances by parsing dictionary for each pull + """ all_utterances = [] for index, row in self.full_dataset.iterrows(): utterances = [] @@ -464,17 +529,21 @@

    Classes

    users.append(row['User']) temp_df_comments = pd.DataFrame(row['Comments']) temp_df_review_comments = pd.DataFrame(row["Review_Comments"]) + if len(temp_df_comments) > 0 or len(temp_df_review_comments) > 0: all_comments = temp_df_comments.append(temp_df_review_comments) all_comments['Created_At'] = pd.to_datetime(all_comments['Created_At']) all_comments = all_comments.sort_values(by=['Created_At']) + for comment_index, comment_row in all_comments.iterrows(): utterances.append(comment_row['Body']) users.append(comment_row['User']) + all_utterances.append(utterances) self.all_users.append(users) self.results['Constructive'].append(row['Constructive']) self.results['Inclusive'].append(row['Inclusive']) + return all_utterances

    Methods

    @@ -483,12 +552,15 @@

    Methods

    def encode(self, utterances)
    -
    +

    Encode an utterance through the lookup dictionary

    Expand source code
    def encode(self, utterances):
    +    """
    +    Encode an utterance through the lookup dictionary
    +    """
         encoded_utterances = []
         for utterance in utterances:
             obs = []
    @@ -512,12 +584,16 @@ 

    Methods

    def encodeData(self)
    -
    +

    Encode all utterances

    Expand source code
    def encodeData(self):
    +    """
    +    Encode all utterances
    +    """
    +
         all_utterances = self._getObsResUsers()
         # Encode each utterance
         for utterances in all_utterances:
    @@ -540,12 +616,15 @@ 

    Methods

    def getRes(self, outcome=None)
    -
    +

    Get list of results

    Expand source code
    def getRes(self, outcome=None):
    +    """
    +    Get list of results
    +    """
         if outcome:
             return self.results[outcome]
         return pd.DataFrame(data=self.results)
    @@ -555,12 +634,16 @@

    Methods

    def getRoleAgnosticMatrix(self, outcome=None, padPull=True)
    -
    +

    Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both

    Expand source code
    def getRoleAgnosticMatrix(self, outcome=None, padPull=True):
    +    """
    +    Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both
    +    """
    +
         obs = []
         for i in range(len(self.all_encoded_utterances)):
             def pad(inp):
    @@ -582,21 +665,26 @@ 

    Methods

    def getRoleMatrix(self, outcome=None, padPull=True)
    -
    +

    Get stacked matrix observation and results for ML task

    Expand source code
    def getRoleMatrix(self, outcome=None, padPull=True):
    +    """
    +    Get stacked matrix observation and results for ML task
    +    """
         # Check if results must be padded to same length for each pull
         if padPull:
             # Author and reviwer layers
             layer_writer = []
             layer_reviewer = []
    +
             for i in range(len(self.all_encoded_utterances)):
                 writer_comments = []
                 reviewer_comments = []
                 writer = self.all_users[i][0]
    +
                 for j in range(len(self.all_encoded_utterances[i])):
                     # Check if author or reviewer
                     if self.all_users[i][j] == writer:
    @@ -606,12 +694,14 @@ 

    Methods

    reviewer_comments.append(self.all_encoded_utterances[i][j]) writer_comments.append(np.zeros(self.no_tokens)) padding = [0] * self.no_tokens + # Pad both reviewer and writer layers while len(writer_comments) < self.curr_max_length: writer_comments.append(padding.copy()) reviewer_comments.append(padding.copy()) layer_writer.append(np.array(writer_comments)) layer_reviewer.append(np.array(reviewer_comments)) + # Stack reviwer and author matrices obs = np.stack((layer_writer, layer_reviewer), axis=3) res = self.getRes(outcome) @@ -622,6 +712,7 @@

    Methods

    writer_comments = [] reviewer_comments = [] writer = self.all_users[i][0] + for j in range(len(self.all_encoded_utterances[i])): if self.all_users[i][j] == writer: writer_comments.append(self.all_encoded_utterances[i][j]) @@ -639,12 +730,15 @@

    Methods

    def loadAnnotatedData(self, filename)
    -
    +

    Load annotated dataset from file

    Expand source code
    def loadAnnotatedData(self, filename):
    +    """
    +    Load annotated dataset from file
    +    """
         self.annotated_data = pd.read_csv(filename)
         self.annotated_data_open = True
    @@ -653,12 +747,15 @@

    Methods

    def loadDataset(self, filename)
    -
    +

    Load raw data from file

    Expand source code
    def loadDataset(self, filename):
    +    """
    +    Load raw data from file
    +    """
         self.dataset = pd.read_csv(filename)
         self.dataset_open = True
    @@ -667,12 +764,16 @@

    Methods

    def setupPreProcess(self, annotated_filename, dataset_filename)
    -
    +

    Setup the preprocessed dataset

    Expand source code
    def setupPreProcess(self, annotated_filename, dataset_filename):
    +    """
    +    Setup the preprocessed dataset
    +    """
    +
         # Load datasets
         self.loadAnnotatedData(annotated_filename)
         self.loadDataset(dataset_filename)
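Taken together with encodeData and the matrix getters, a hedged sketch of the preprocessing pipeline as run.py drives it; the CSV names are placeholders and the import path is an assumption:

from preProcessedDataset import PreProcessedDataset   # assumed import path

data = PreProcessedDataset()
data.setupPreProcess("annotated.csv", "dataset.csv")   # placeholder file names
data.encodeData()
obs, res = data.getRoleMatrix("Constructive", True)    # role-relevant stacked matrix
print(obs.shape, len(res))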
    diff --git a/docs/ml-conversational-analytic-tool/run.html b/docs/ml-conversational-analytic-tool/run.html
    index 396398b..1c8639a 100644
    --- a/docs/ml-conversational-analytic-tool/run.html
    +++ b/docs/ml-conversational-analytic-tool/run.html
    @@ -30,6 +30,8 @@ 

    Module ml-conversational-analytic-tool.run

 # SPDX-License-Identifier: Apache-2.0

 import argparse
+import tarfile
+import os

 from sklearn.model_selection import train_test_split
@@ -37,27 +39,39 @@

    Module ml-conversational-analytic-tool.run

 from baseLSTM import BaseLSTM
 from preProcessedDataset import PreProcessedDataset

-
-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding):
+model_directory = 'models'
+
+def save_model(model, name, version):
+    if not os.path.exists(model_directory):
+        os.makedirs(model_directory)
+    model_path = "{}/{}".format(model_directory, name)
+    tar_file_name = "{}-{}.tar.gz".format(name, version)
+    model.saveModel(name=model_path, version=version)
+    os.chdir(model_path)
+    tar = tarfile.open(tar_file_name, "w:gz")
+    tar.add(version)
+    tar.close()
+    os.chdir("../../")
+    print("Model saved in {}/{}; {}/{}".format(model_path, version, model_path, tar_file_name))
+
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver):
     # Setup dataset
     data = PreProcessedDataset()
     data.setupPreProcess(annotated_filename, dataset_filename)
     data.encodeData()

-    # Get data for training
-    if encoding_type == 'role':
-        obs, res = data.getRoleMatrix(outcome, padding)
-    elif encoding_type == 'role-agnostic':
-        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
-
     # Create models
-    if model_type == 'CNN':
-        model = BaseCNN()
-    elif model_type == 'LSTM':
+    if model_type == 'LSTM':
         model = BaseLSTM()
+    else:
+        model = BaseCNN()
+
+    # Get data for training
     if encoding_type == 'role':
+        obs, res = data.getRoleMatrix(outcome, padding)
         model.makeModel2D(obs[0].shape)
-    elif encoding_type == 'role-agnostic':
+    else:
+        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
         model.makeModel(obs[0].shape)

     # Train model
@@ -66,6 +80,10 @@

    Module ml-conversational-analytic-tool.run

    # Score model
    scores = model.scoreModel(test_obs, test_res)
+
+    # Save model
+    if save_name is not None and len(save_name) > 0:
+        save_model(model=model, name=save_name+"-"+outcome, version=model_ver)
    return scores
@@ -75,27 +93,36 @@

    Module ml-conversational-analytic-tool.run

    description="Obtain models to determine constructive and inclusive feedback in Open source communities") parser.add_argument('annotated_filename', help='File location of annotated file') parser.add_argument('dataset_filename', help='File location of extracted dataset') - parser.add_argument('model', help='Model type to use for training') + parser.add_argument('model', help='Model type to use for training, supported CNN and LSTM') parser.add_argument('outcome', help='Inclusive, Constructive, or Both') + parser.add_argument('-save', metavar='NAME', help='Save the model using given NAME') + parser.add_argument('-save_version', metavar='VERSION', default='001', + help='Together with -save NAME: save the model using given NAME and VERSION. '\ + 'If omitted, 001 is used. The parameter is ignored if -save is missing.') parser.add_argument('-roleRelevant', action='store_true', default=False, help='Encoding method differentiates b/w conversation roles') parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull') args = parser.parse_args() + + if args.model != 'CNN' and args.model != 'LSTM': + raise Exception("Model must be either CNN or LSTM") + encodingType = 'role' if not args.roleRelevant: encodingType = 'role-agnostic' if args.outcome != 'Both': - run_res = run(args.annotated_filename, args.dataset_filename, args.outcome, encodingType, args.model, args.pad) + run_res = run(args.annotated_filename, args.dataset_filename, args.outcome, encodingType, + args.model, args.pad, args.save, args.save_version) print(run_res) else: run_res_constructive = run(args.annotated_filename, args.dataset_filename, 'Constructive', encodingType, - args.model, args.pad) + args.model, args.pad, args.save, args.save_version) print("Constructive: {}".format(run_res_constructive)) - run_res_inclusive = run(args.annotated_filename, args.dataset_filename, 'Inclusive', encodingType, args.model, - args.pad) + run_res_inclusive = run(args.annotated_filename, args.dataset_filename, 'Inclusive', encodingType, + args.model, args.pad, args.save, args.save_version) print("Inclusvie: {}".format(run_res_inclusive))
    @@ -107,7 +134,7 @@

    Module ml-conversational-analytic-tool.run

    Functions

-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding)
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver)
    @@ -115,26 +142,24 @@

    Functions

    Expand source code
-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding):
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver):
         # Setup dataset
         data = PreProcessedDataset()
         data.setupPreProcess(annotated_filename, dataset_filename)
         data.encodeData()
     
    -    # Get data for training
    -    if encoding_type == 'role':
    -        obs, res = data.getRoleMatrix(outcome, padding)
    -    elif encoding_type == 'role-agnostic':
    -        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
    -
         # Create models
    -    if model_type == 'CNN':
    -        model = BaseCNN()
    -    elif model_type == 'LSTM':
    +    if model_type == 'LSTM':
             model = BaseLSTM()
    +    else:
    +        model = BaseCNN()
    +
    +    # Get data for training
         if encoding_type == 'role':
    +        obs, res = data.getRoleMatrix(outcome, padding)
             model.makeModel2D(obs[0].shape)
    -    elif encoding_type == 'role-agnostic':
    +    else:
    +        obs, res = data.getRoleAgnosticMatrix(outcome, padding)
             model.makeModel(obs[0].shape)
     
         # Train model
    @@ -143,10 +168,37 @@ 

    Functions

    # Score model
    scores = model.scoreModel(test_obs, test_res)
+
+    # Save model
+    if save_name is not None and len(save_name) > 0:
+        save_model(model=model, name=save_name+"-"+outcome, version=model_ver)
    return scores
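As a sketch only, the extended run signature can also be driven directly from Python rather than via the CLI; the file names are placeholders, and save_name may be passed as None to skip saving:

    # Placeholder file names; outcome must match a column in the annotated data
    scores = run("annotated.csv", "dataset.csv", "Constructive",
                 encoding_type="role", model_type="CNN", padding=True,
                 save_name="demo", model_ver="001")
    print(scores)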
    +
+def save_model(model, name, version)
+
+
+
+
+Expand source code
+
    def save_model(model, name, version):
    +    if not os.path.exists(model_directory):
    +        os.makedirs(model_directory)
    +    model_path = "{}/{}".format(model_directory, name)
    +    tar_file_name = "{}-{}.tar.gz".format(name, version)
    +    model.saveModel(name=model_path, version=version)
    +    os.chdir(model_path)
    +    tar = tarfile.open(tar_file_name, "w:gz")
    +    tar.add(version)
    +    tar.close()
    +    os.chdir("../../")
    +    print("Model saved in {}/{}; {}/{}".format(model_path, version, model_path, tar_file_name))
    +
    +
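Assuming the model's saveModel writes a Keras SavedModel under the given name/version path (an assumption, not confirmed by this diff), save_model should leave the model under models/<NAME>-<outcome>/<VERSION> plus a <NAME>-<outcome>-<VERSION>.tar.gz beside it. A hedged reload sketch for that layout, using -save demo, outcome Constructive and the default version 001:

    from tensorflow import keras

    # Path is an assumption based on the save_model layout above
    reloaded = keras.models.load_model("models/demo-Constructive/001")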
    @@ -166,6 +218,7 @@

    Index

  • Functions

diff --git a/docs/ml-conversational-analytic-tool/runDataExtraction.html b/docs/ml-conversational-analytic-tool/runDataExtraction.html
index 1ebe4fa..8d7d03b 100644
--- a/docs/ml-conversational-analytic-tool/runDataExtraction.html
+++ b/docs/ml-conversational-analytic-tool/runDataExtraction.html
@@ -35,8 +35,10 @@

Module ml-conversational-analytic-tool.runDataExtraction

from githubDataExtraction import GithubDataExtractor

-# Method to extract data for all repositories in organization
def getRepos(access_token, organization, reaction):
+    """
+    Method to extract data for all repositories in organization
+    """
    extractor = GithubDataExtractor(access_token)  # Create object
    repos = extractor.g_ses.get_organization(organization).get_repos()
    for repo in repos:
@@ -45,8 +47,10 @@

Module ml-conversational-analytic-tool.runDataExtraction

        extractor.getAllPulls("", reaction)

-# Method to extract data for an individual repository
def getRepo(access_token, organization, reponame, reaction):
+    """
+    Method to extract data for an individual repository
+    """
    extractor = GithubDataExtractor(access_token)  # Create object
    extractor.openRepo(organization, reponame)
    extractor.getAllPulls("", reaction)
@@ -77,12 +81,15 @@

    Functions

    def getRepo(access_token, organization, reponame, reaction)
    -
    +

    Method to extract data for an individual repository

    Expand source code
    def getRepo(access_token, organization, reponame, reaction):
    +    """
    +    Method to extract data for an individual repository
    +    """
         extractor = GithubDataExtractor(access_token)  # Create object 
         extractor.openRepo(organization, reponame)
         extractor.getAllPulls("", reaction)
    @@ -92,12 +99,15 @@

    Functions

    def getRepos(access_token, organization, reaction)
    -
    +

    Method to extract data for all repositories in organization

    Expand source code
    def getRepos(access_token, organization, reaction):
    +    """
    +    Method to extract data for all repositories in organization
    +    """
         extractor = GithubDataExtractor(access_token)  # Create object
         repos = extractor.g_ses.get_organization(organization).get_repos()
         for repo in repos: