diff --git a/docs/ml-conversational-analytic-tool/baseCNN.html b/docs/ml-conversational-analytic-tool/baseCNN.html
index 4005539..a1b771b 100644
--- a/docs/ml-conversational-analytic-tool/baseCNN.html
+++ b/docs/ml-conversational-analytic-tool/baseCNN.html
@@ -35,18 +35,21 @@ Module ml-conversational-analytic-tool.baseCNN
@@ -117,22 +133,26 @@ Constructor creates model and explainer
class BaseCNN:
- # Constructor creates model and explainer
def __init__(self):
+ """
+ Constructor creates model and explainer
+ """
self.dimension2 = True
self.input_shape = ()
self.model = keras.models.Sequential()
self.explainer = SmoothGrad()
self.model_ready = False
- # Make 1d model for role agnostic data
def makeModel(self, input_shape):
+ """
+ Make 1d model for role agnostic data
+ """
self.input_shape = input_shape
self.model.add(keras.layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
self.model.add(keras.layers.MaxPooling1D(2))
@@ -143,8 +163,10 @@ Classes
self.dimension2 = False
self.model_ready = True
- # Make 2d model for role relevant data
def makeModel2D(self, input_shape):
+ """
+ Make 2d model for role relevant data
+ """
self.input_shape = input_shape
self.model.add(keras.layers.Conv2D(4, (5, 5), activation='relu', input_shape=input_shape))
self.model.add(keras.layers.MaxPooling2D((4, 4)))
@@ -156,8 +178,10 @@ Classes
self.dimension2 = True
self.model_ready = True
- # Train model
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
+ """
+ Train model
+ """
self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
if val_set:
train_hist = self.model.fit(np.array(obs), np.array(res), epochs=epochs, batch_size=batch_size,
@@ -168,8 +192,13 @@ Classes
validation_split=val_split, verbose=1)
return train_hist
- # Score model for accuracy, precision and recall
+ def saveModel(self, name, version):
+ self.model.save("{}/{}".format(name, version))
+
def scoreModel(self, obs, res):
+ """
+ Score model for accuracy, precision and recall
+ """
evaluation = {}
evaluation['Loss_Acc'] = self.model.evaluate(np.array(obs), np.array(res))
evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -177,15 +206,19 @@ Classes
print("Accuracy: {}".format(evaluation['Loss_Acc'][1]))
return evaluation
- # Get predictions
def predict(self, obs, labels=False):
+ """
+ Get predictions
+ """
predictions = self.model.predict(np.array(obs))
if labels:
return [1 if x > 0.5 else 0 for x in predictions]
return predictions
- # Explain prediction for obs using explainer
def explain(self, obs):
+ """
+ Explain prediction for obs using explainer
+ """
output = self.explainer.explain((obs, None), self.model, 1, 20, 1.)
return output
Explain prediction for obs using explainer
def explain(self, obs):
+ """
+ Explain prediction for obs using explainer
+ """
output = self.explainer.explain((obs, None), self.model, 1, 20, 1.)
return output
Make 1d model for role agnostic data
def makeModel(self, input_shape):
+ """
+ Make 1d model for role agnostic data
+ """
self.input_shape = input_shape
self.model.add(keras.layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
self.model.add(keras.layers.MaxPooling1D(2))
@@ -230,12 +269,15 @@ Methods
def makeModel2D(self, input_shape)
Make 2d model for role relevant data
def makeModel2D(self, input_shape):
+ """
+ Make 2d model for role relevant data
+ """
self.input_shape = input_shape
self.model.add(keras.layers.Conv2D(4, (5, 5), activation='relu', input_shape=input_shape))
self.model.add(keras.layers.MaxPooling2D((4, 4)))
@@ -252,28 +294,47 @@ Methods
def predict(self, obs, labels=False)
Get predictions
def predict(self, obs, labels=False):
+ """
+ Get predictions
+ """
predictions = self.model.predict(np.array(obs))
if labels:
return [1 if x > 0.5 else 0 for x in predictions]
return predictions
+def saveModel(self, name, version)
+
def saveModel(self, name, version):
+ self.model.save("{}/{}".format(name, version))
+
def scoreModel(self, obs, res)
Score model for accuracy, precision and recall
def scoreModel(self, obs, res):
+ """
+ Score model for accuracy, precision and recall
+ """
evaluation = {}
evaluation['Loss_Acc'] = self.model.evaluate(np.array(obs), np.array(res))
evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -286,12 +347,15 @@ Methods
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32)
Train model
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
+ """
+ Train model
+ """
self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
if val_set:
train_hist = self.model.fit(np.array(obs), np.array(res), epochs=epochs, batch_size=batch_size,
@@ -328,6 +392,7 @@ makeModel
makeModel2D
predict
+saveModel
scoreModel
trainModel
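An illustrative usage sketch of the BaseCNN API documented above, including the newly added saveModel; the data shapes, epoch count, and save name are made up for demonstration and are not part of the generated docs.

import numpy as np
from baseCNN import BaseCNN

# Hypothetical role-agnostic data: 20 pulls, each encoded as a 100 x 512 matrix.
obs = np.random.rand(20, 100, 512).astype("float32")
res = np.random.randint(0, 2, size=20)

model = BaseCNN()
model.makeModel(obs[0].shape)               # 1D CNN for role agnostic data
history = model.trainModel(obs, res, epochs=2)
scores = model.scoreModel(obs, res)         # loss/accuracy plus precision, recall, F-score
labels = model.predict(obs, labels=True)    # predictions thresholded at 0.5
model.saveModel("constructive-cnn", "001")  # writes constructive-cnn/001 via keras model.save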
diff --git a/docs/ml-conversational-analytic-tool/baseLSTM.html b/docs/ml-conversational-analytic-tool/baseLSTM.html
index c11717a..e8fc887 100644
--- a/docs/ml-conversational-analytic-tool/baseLSTM.html
+++ b/docs/ml-conversational-analytic-tool/baseLSTM.html
@@ -35,17 +35,20 @@ Module ml-conversational-analytic-tool.baseLSTM
from tensorflow import keras
-# Class to create LSTM
class BaseLSTM:
- # Constructor creates model
def __init__(self):
+ """
+ Constructor creates model
+ """
self.dimension2 = True
self.input_shape = ()
self.model = None
self.model_ready = False
- # Make lstm model for role agnostic data
def makeModel(self, input_shape):
+ """
+ Make lstm model for role agnostic data
+ """
self.input_shape = input_shape
self.model = keras.models.Sequential()
self.model.add(keras.layers.LSTM(8, input_shape=input_shape))
@@ -55,22 +58,29 @@ Module ml-conversational-analytic-tool.baseLSTM
self.dimension2 = False
self.model_ready = True
- # Made lstm model for role relevant layers data
def makeModel2D(self, input_shape):
+ """
+ Make lstm model for role relevant data
+ """
+
# Inputs
self.input_shape = input_shape
inputs = keras.Input(shape=(None, 512, 2), dtype="float32")
+
# Pipe output of author and reviewer layer to two lstm
author = keras.layers.Lambda(lambda x: x[:, :, :, 0])(inputs)
reviewer = keras.layers.Lambda(lambda x: x[:, :, :, 1])(inputs)
+
# Create author lstm
author_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(author)
author_dropout = keras.layers.Dropout(0.2)(author_lstm)
author_output = keras.layers.Dense(1, activation='relu')(author_dropout)
+
# Create reviewer lstm
reviewer_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(reviewer)
reviewer_dropout = keras.layers.Dropout(0.2)(reviewer_lstm)
reviewer_output = keras.layers.Dense(1, activation='relu')(reviewer_dropout)
+
# Concatenate author and reviewer output
combine = keras.layers.Concatenate(axis=1)([author_output, reviewer_output])
output = keras.layers.Dense(1, activation='sigmoid')(combine)
@@ -80,7 +90,6 @@ Module ml-conversational-analytic-tool.baseLSTM
self.dimension2 = True
self.model_ready = True
- # Train model
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
if val_set:
@@ -94,8 +103,13 @@ Module ml-conversational-analytic-tool.baseLSTM
batch_size=batch_size, verbose=1)
return train_hist
- # Score model for accuracy, precision and recall
+ def saveModel(self, name, version):
+ self.model.save("{}/{}".format(name, version))
+
def scoreModel(self, obs, res):
+ """
+ Score model for accuracy, precision and recall
+ """
evaluation = {}
evaluation['Loss_Acc'] = self.model.evaluate(tf.ragged.stack(obs), tf.convert_to_tensor(res))
evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -103,21 +117,24 @@ Module ml-conversational-analytic-tool.baseLSTM
print("Accuracy: {}".format(evaluation['Loss_Acc'][1]))
return evaluation
- # Get predictions
def predict(self, obs, labels=False):
predictions = self.model.predict(tf.ragged.stack(obs))
if labels:
return [1 if x > 0.5 else 0 for x in predictions]
return predictions
- # Explain prediction for obs by calculating gradients
def explain(self, obs):
+ """
+ Explain prediction for obs by calculating gradients
+ """
grads = self._gradientImportance(obs)
imp = grads[:, 0] + grads[:, 1]
return imp
- # Get gradients
def _gradientImportance(self, seq):
+ """
+ Get gradients
+ """
seq = tf.Variable(seq[np.newaxis, :, :])
with tf.GradientTape() as tape:
predictions = self.model(seq)
@@ -140,21 +157,25 @@ Classes
class BaseLSTM
Constructor creates model
class BaseLSTM:
- # Constructor creates model
def __init__(self):
+ """
+ Constructor creates model
+ """
self.dimension2 = True
self.input_shape = ()
self.model = None
self.model_ready = False
- # Make lstm model for role agnostic data
def makeModel(self, input_shape):
+ """
+ Make lstm model for role agnostic data
+ """
self.input_shape = input_shape
self.model = keras.models.Sequential()
self.model.add(keras.layers.LSTM(8, input_shape=input_shape))
@@ -164,22 +185,29 @@ Classes
self.dimension2 = False
self.model_ready = True
- # Made lstm model for role relevant layers data
def makeModel2D(self, input_shape):
+ """
+ Make lstm model for role relevant data
+ """
+
# Inputs
self.input_shape = input_shape
inputs = keras.Input(shape=(None, 512, 2), dtype="float32")
+
# Pipe output of author and reviewer layer to two lstm
author = keras.layers.Lambda(lambda x: x[:, :, :, 0])(inputs)
reviewer = keras.layers.Lambda(lambda x: x[:, :, :, 1])(inputs)
+
# Create author lstm
author_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(author)
author_dropout = keras.layers.Dropout(0.2)(author_lstm)
author_output = keras.layers.Dense(1, activation='relu')(author_dropout)
+
# Create reviewer lstm
reviewer_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(reviewer)
reviewer_dropout = keras.layers.Dropout(0.2)(reviewer_lstm)
reviewer_output = keras.layers.Dense(1, activation='relu')(reviewer_dropout)
+
# Concatenate author and reviewer output
combine = keras.layers.Concatenate(axis=1)([author_output, reviewer_output])
output = keras.layers.Dense(1, activation='sigmoid')(combine)
@@ -189,7 +217,6 @@ Classes
self.dimension2 = True
self.model_ready = True
- # Train model
def trainModel(self, obs, res, val_split=0.3, val_set=None, epochs=10, batch_size=32):
self.model.compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
if val_set:
@@ -203,8 +230,13 @@ Classes
batch_size=batch_size, verbose=1)
return train_hist
- # Score model for accuracy, precision and recall
+ def saveModel(self, name, version):
+ self.model.save("{}/{}".format(name, version))
+
def scoreModel(self, obs, res):
+ """
+ Score model for accuracy, precision and recall
+ """
evaluation = {}
evaluation['Loss_Acc'] = self.model.evaluate(tf.ragged.stack(obs), tf.convert_to_tensor(res))
evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -212,21 +244,24 @@ Classes
print("Accuracy: {}".format(evaluation['Loss_Acc'][1]))
return evaluation
- # Get predictions
def predict(self, obs, labels=False):
predictions = self.model.predict(tf.ragged.stack(obs))
if labels:
return [1 if x > 0.5 else 0 for x in predictions]
return predictions
- # Explain prediction for obs by calculating gradients
def explain(self, obs):
+ """
+ Explain prediction for obs by calculating gradients
+ """
grads = self._gradientImportance(obs)
imp = grads[:, 0] + grads[:, 1]
return imp
- # Get gradients
def _gradientImportance(self, seq):
+ """
+ Get gradients
+ """
seq = tf.Variable(seq[np.newaxis, :, :])
with tf.GradientTape() as tape:
predictions = self.model(seq)
@@ -241,12 +276,15 @@ Methods
def explain(self, obs)
Explain prediction for obs by calculating gradients
def explain(self, obs):
+ """
+ Explain prediction for obs by calculating gradients
+ """
grads = self._gradientImportance(obs)
imp = grads[:, 0] + grads[:, 1]
return imp
@@ -256,12 +294,15 @@ Make lstm model for role agnostic data
def makeModel(self, input_shape):
+ """
+ Make lstm model for role agnostic data
+ """
self.input_shape = input_shape
self.model = keras.models.Sequential()
self.model.add(keras.layers.LSTM(8, input_shape=input_shape))
@@ -276,26 +317,34 @@ Methods
def makeModel2D(self, input_shape)
Make lstm model for role relevant data
def makeModel2D(self, input_shape):
+ """
+ Make lstm model for role relevant data
+ """
+
# Inputs
self.input_shape = input_shape
inputs = keras.Input(shape=(None, 512, 2), dtype="float32")
+
# Pipe output of author and reviewer layer to two lstm
author = keras.layers.Lambda(lambda x: x[:, :, :, 0])(inputs)
reviewer = keras.layers.Lambda(lambda x: x[:, :, :, 1])(inputs)
+
# Create author lstm
author_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(author)
author_dropout = keras.layers.Dropout(0.2)(author_lstm)
author_output = keras.layers.Dense(1, activation='relu')(author_dropout)
+
# Create reviewer lstm
reviewer_lstm = keras.layers.LSTM(4, activation='relu', return_sequences=False)(reviewer)
reviewer_dropout = keras.layers.Dropout(0.2)(reviewer_lstm)
reviewer_output = keras.layers.Dense(1, activation='relu')(reviewer_dropout)
+
# Concatenate author and reviewer output
combine = keras.layers.Concatenate(axis=1)([author_output, reviewer_output])
output = keras.layers.Dense(1, activation='sigmoid')(combine)
@@ -322,16 +371,32 @@ Methods
return predictions
+def saveModel(self, name, version)
+
def saveModel(self, name, version):
+ self.model.save("{}/{}".format(name, version))
+
def scoreModel(self, obs, res)
Score model for accuracy, precision and recall
def scoreModel(self, obs, res):
+ """
+ Score model for accuracy, precision and recall
+ """
evaluation = {}
evaluation['Loss_Acc'] = self.model.evaluate(tf.ragged.stack(obs), tf.convert_to_tensor(res))
evaluation['Precision_Recall_Fscore_Support'] = precision_recall_fscore_support(res, self.predict(obs, True),
@@ -388,6 +453,7 @@ makeModel
makeModel2D
predict
+saveModel
scoreModel
trainModel
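A hedged sketch of the role-relevant BaseLSTM path: each observation is a (comments x 512 tokens x 2 roles) array with a variable number of comments per pull, which the documented predict and scoreModel stack as a ragged tensor. Shapes and values below are invented for illustration.

import numpy as np
from baseLSTM import BaseLSTM

# Hypothetical pulls with 3, 5 and 4 comments; author and reviewer occupy the last axis.
obs = [np.random.rand(n, 512, 2).astype("float32") for n in (3, 5, 4)]

model = BaseLSTM()
model.makeModel2D(obs[0].shape)           # builds the author/reviewer LSTM branches
print(model.predict(obs))                 # raw sigmoid outputs (ragged-stacked internally)
print(model.predict(obs, labels=True))    # labels thresholded at 0.5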
diff --git a/docs/ml-conversational-analytic-tool/commentAnalysis.html b/docs/ml-conversational-analytic-tool/commentAnalysis.html
index 70caebc..e98c683 100644
--- a/docs/ml-conversational-analytic-tool/commentAnalysis.html
+++ b/docs/ml-conversational-analytic-tool/commentAnalysis.html
@@ -37,17 +37,21 @@ Module ml-conversational-analytic-tool.commentAnalysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
-# Class to analyze text to obtain desired features
class CommentAnalyzer:
- # Constructors form a dictionary to be used for counting.
- # Parameters: words - list of words to count
def __init__(self, words):
+ """
+ Constructor forms a dictionary to be used for counting.
+ Parameters: words - list of words to count
+ """
self.word_count = {word.lower(): 0 for word in words} # Create dictionary with list items as key
self.vader_sentiment = SentimentIntensityAnalyzer() # Initialize sentiment analysis model
- # Method to get desired features from an input comment.
- # Parameters: comment - string. Returns: dictionary with features
def analyzeComment(self, comment):
+ """
+ Method to get desired features from an input comment.
+ Parameters: comment - string.
+ Returns: dictionary with features
+ """
result = {} # Create return dictionary
cleaned_comment = self.preProcess(comment) # Clean comment text
# result['Word Counts'] = self.countWords(cleaned_comment) # Determine word counts
@@ -56,16 +60,22 @@ Module ml-conversational-analytic-tool.commentAnalysis
result['Code Blocks'] = self.getCodeBlockCount(cleaned_comment) # Determine code block count
return result
- # Method to clean and return text. Parameters: text - string.
- # Returns: string after cleaning
def preProcess(self, text):
+ """
+ Method to clean and return text.
+ Parameters: text - string.
+ Returns: string after cleaning
+ """
cleaned_text = text.strip() # Remove trailing and starting spaces
cleaned_text = cleaned_text.lower() # Convert to lowercase
return cleaned_text
- # Method to determine word count. Parameters: comment - string
- # Returns: dictionary with word counts
def countWords(self, comment):
+ """
+ Method to determine word count.
+ Parameters: comment - string
+ Returns: dictionary with word counts
+ """
words = comment.split(" ") # Split text into words
current_word_count = self.word_count.copy() # Copy default dict for new count
for word in words: # Iterate over all words
@@ -73,22 +83,29 @@ Module ml-conversational-analytic-tool.commentAnalysis
current_word_count[word] = current_word_count[word] + 1
return current_word_count
- # Method to determine the code blocks. Parameters: comment - string
- # Returns: integer count
def getCodeBlockCount(self, comment):
+ """
+ Method to determine the code blocks.
+ Parameters: comment - string
+ Returns: integer count
+ """
count = comment.count("```") # Find occurences of code block
if count % 2 != 0: # Should be in pairs
print("Warning: Mismatched code blocks")
return int(count / 2 - 1) # Subtract 1 since unmatched pair
return int(count / 2) # Divide by 2 since pairs
- # Method to determine sentiment. Parameters: comment - string
- # Returns: dictionary with positive, negative and neutral scores
def getSentiment(self, comment):
+ """
+ Method to determine sentiment. Parameters: comment - string
+ Returns: compound sentiment score (float)
+ """
return self.vader_sentiment.polarity_scores(comment)["compound"]
- # Method to change words to count. Parameters: Set new word count with new keys/
def changeWords(self, words):
+ """
+ Method to change the words to count. Parameters: words - list of new words to use as count keys
+ """
self.word_count = {word: 0 for word in words}
@@ -120,21 +137,27 @@ Classes
(words)
-
+Constructor forms a dictionary to be used for counting.
+Parameters: words - list of words to count
Expand source code
class CommentAnalyzer:
- # Constructors form a dictionary to be used for counting.
- # Parameters: words - list of words to count
def __init__(self, words):
+ """
+ Constructor forms a dictionary to be used for counting.
+ Parameters: words - list of words to count
+ """
self.word_count = {word.lower(): 0 for word in words} # Create dictionary with list items as key
self.vader_sentiment = SentimentIntensityAnalyzer() # Initialize sentiment analysis model
- # Method to get desired features from an input comment.
- # Parameters: comment - string. Returns: dictionary with features
def analyzeComment(self, comment):
+ """
+ Method to get desired features from an input comment.
+ Parameters: comment - string.
+ Returns: dictionary with features
+ """
result = {} # Create return dictionary
cleaned_comment = self.preProcess(comment) # Clean comment text
# result['Word Counts'] = self.countWords(cleaned_comment) # Determine word counts
@@ -143,16 +166,22 @@ Classes
result['Code Blocks'] = self.getCodeBlockCount(cleaned_comment) # Determine code block count
return result
- # Method to clean and return text. Parameters: text - string.
- # Returns: string after cleaning
def preProcess(self, text):
+ """
+ Method to clean and return text.
+ Parameters: text - string.
+ Returns: string after cleaning
+ """
cleaned_text = text.strip() # Remove trailing and starting spaces
cleaned_text = cleaned_text.lower() # Convert to lowercase
return cleaned_text
- # Method to determine word count. Parameters: comment - string
- # Returns: dictionary with word counts
def countWords(self, comment):
+ """
+ Method to determine word count.
+ Parameters: comment - string
+ Returns: dictionary with word counts
+ """
words = comment.split(" ") # Split text into words
current_word_count = self.word_count.copy() # Copy default dict for new count
for word in words: # Iterate over all words
@@ -160,22 +189,29 @@ Classes
current_word_count[word] = current_word_count[word] + 1
return current_word_count
- # Method to determine the code blocks. Parameters: comment - string
- # Returns: integer count
def getCodeBlockCount(self, comment):
+ """
+ Method to determine the code blocks.
+ Parameters: comment - string
+ Returns: integer count
+ """
count = comment.count("```") # Find occurences of code block
if count % 2 != 0: # Should be in pairs
print("Warning: Mismatched code blocks")
return int(count / 2 - 1) # Subtract 1 since unmatched pair
return int(count / 2) # Divide by 2 since pairs
- # Method to determine sentiment. Parameters: comment - string
- # Returns: dictionary with positive, negative and neutral scores
def getSentiment(self, comment):
+ """
+ Method to determine sentiment. Parameters: comment - string
+ Returns: compound sentiment score (float)
+ """
return self.vader_sentiment.polarity_scores(comment)["compound"]
- # Method to change words to count. Parameters: Set new word count with new keys/
def changeWords(self, words):
+ """
+ Method to change the words to count. Parameters: words - list of new words to use as count keys
+ """
self.word_count = {word: 0 for word in words}
Methods
@@ -184,12 +220,19 @@ Methods
def analyzeComment(self, comment)
-
+Method to get desired features from an input comment.
+Parameters: comment - string.
+Returns: dictionary with features
Expand source code
def analyzeComment(self, comment):
+ """
+ Method to get desired features from an input comment.
+ Parameters: comment - string.
+ Returns: dictionary with features
+ """
result = {} # Create return dictionary
cleaned_comment = self.preProcess(comment) # Clean comment text
# result['Word Counts'] = self.countWords(cleaned_comment) # Determine word counts
@@ -203,12 +246,15 @@ Methods
def changeWords(self, words)
-
+Method to change the words to count. Parameters: words - list of new words to use as count keys
Expand source code
def changeWords(self, words):
+ """
+ Method to change the words to count. Parameters: words - list of new words to use as count keys
+ """
self.word_count = {word: 0 for word in words}
@@ -216,12 +262,19 @@ Methods
def countWords(self, comment)
-
+Method to determine word count.
+Parameters: comment - string
+Returns: dictionary with word counts
Expand source code
def countWords(self, comment):
+ """
+ Method to determine word count.
+ Parameters: comment - string
+ Returns: dictionary with word counts
+ """
words = comment.split(" ") # Split text into words
current_word_count = self.word_count.copy() # Copy default dict for new count
for word in words: # Iterate over all words
@@ -234,12 +287,19 @@ Methods
def getCodeBlockCount(self, comment)
-
+Method to determine the code blocks.
+Parameters: comment - string
+Returns: integer count
Expand source code
def getCodeBlockCount(self, comment):
+ """
+ Method to determine the code blocks.
+ Parameters: comment - string
+ Returns: integer count
+ """
count = comment.count("```") # Find occurences of code block
if count % 2 != 0: # Should be in pairs
print("Warning: Mismatched code blocks")
@@ -251,12 +311,17 @@ Methods
def getSentiment(self, comment)
-
+Method to determine sentiment. Parameters: comment - string
+Returns: compound sentiment score (float)
Expand source code
def getSentiment(self, comment):
+ """
+ Method to determine sentiment. Parameters: comment - string
+ Returns: compound sentiment score (float)
+ """
return self.vader_sentiment.polarity_scores(comment)["compound"]
@@ -264,12 +329,19 @@ Methods
def preProcess(self, text)
Method to clean and return text.
+Parameters: text - string.
+Returns: string after cleaning
def preProcess(self, text):
+ """
+ Method to clean and return text.
+ Parameters: text - string.
+ Returns: string after cleaning
+ """
cleaned_text = text.strip() # Remove trailing and starting spaces
cleaned_text = cleaned_text.lower() # Convert to lowercase
return cleaned_text
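A short usage sketch of CommentAnalyzer as documented above; the keyword list and sample comment are made up, and NLTK's vader_lexicon must already be downloaded for SentimentIntensityAnalyzer.

from commentAnalysis import CommentAnalyzer

analyzer = CommentAnalyzer(["lgtm", "nit"])
comment = "LGTM overall just a small nit here ```x = 1```"

print(analyzer.analyzeComment(comment))                    # sentiment, code block count, ...
print(analyzer.countWords(analyzer.preProcess(comment)))   # counts 'lgtm' and 'nit' once each
print(analyzer.getCodeBlockCount(comment))                 # one matched pair of ``` markers -> 1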
diff --git a/docs/ml-conversational-analytic-tool/featureVector.html b/docs/ml-conversational-analytic-tool/featureVector.html
index 0e2b581..6fbfe77 100644
--- a/docs/ml-conversational-analytic-tool/featureVector.html
+++ b/docs/ml-conversational-analytic-tool/featureVector.html
@@ -38,12 +38,12 @@ Module ml-conversational-analytic-tool.featureVector
Classes
(retain_features, analysis_features)
-
+Constructor sets instance variables
+Inputs: Retained Features (list) -> pull data features, Analysis Features (list) -> comment analysis features
Expand source code
class Featurizer:
- # Constructor sets instance variables
- # Inputs: Retained Features (list) -> pull data features, Analysis Features (list) - > comment analysis features
def __init__(self, retain_features, analysis_features):
+ """
+ Constructor sets instance variables
+ Inputs: Retained Features (list) -> pull data features, Analysis Features (list) -> comment analysis features
+ """
self.analysis_features = analysis_features
self.retain_pull_features = retain_features
self.raw_filename = ""
@@ -266,9 +281,11 @@ Classes
self.featurized_data = None
self.commentAnalyzer = None
- # Function to read raw data stored as csv
- # Inputs: File location (string) -> input raw data file location
def readRawData(self, filename):
+ """
+ Function to read raw data stored as csv
+ Inputs: File location (string) -> input raw data file location
+ """
self.raw_filename = filename
self.raw_data = pd.read_csv(filename)
@@ -284,9 +301,11 @@ Classes
self.raw_data['Review_Comments'] = self.raw_data['Review_Comments'].apply(lambda comment: stringToDict(comment))
print("Done reading raw data.")
- # Function to obtain Comment Analyzer with parameters
- # Inputs: File location (string) -> .txt containing keywords to count
def setupCommentAnalyzer(self, filename):
+ """
+ Function to obtain Comment Analyzer with parameters
+ Inputs: File location (string) -> .txt containing keywords to count
+ """
word_list = []
# Open file to obtain list of words in Comment Analzyer
@@ -296,9 +315,11 @@ Classes
self.analysis_features = self.analysis_features + word_list
print("Comment Analyzer Setup")
- # Function to create/export dataset with desired features
- # Inputs: Name of file to export (string), export flag (boolean)
def formFeatures(self, export_filename="", export=True):
+ """
+ Function to create/export dataset with desired features
+ Inputs: Name of file to export (string), export flag (boolean)
+ """
features = [] # List of rows to convert to dataframe
# Iterate over each pull
@@ -391,12 +412,16 @@ Classes
return export_df
- # Function to form conversation string given communication on a pull request
- # Inputs: Name of file to export (string), export flag (boolean)
def pullStringConversation(self, export_filename="", export=True):
+ """
+ Function to form conversation string given communication on a pull request
+ Inputs: Name of file to export (string), export flag (boolean)
+ """
+
# Store each interaction and pull URL for export
string_conversations = []
pull_urls = []
+ pull_numbers = []
for index, row in self.raw_data.iterrows():
# Make pull message
@@ -415,9 +440,11 @@ Classes
comment_row["Body"])
string_conversations.append(conversation.encode("ascii", "ignore").decode())
pull_urls.append(row["URL"])
+ pull_numbers.append(row["Number"])
# Export converation field dataset
export_df = pd.DataFrame()
+ export_df["Number"] = pull_numbers
export_df["URL"] = pull_urls
export_df["Thread"] = string_conversations
@@ -436,12 +463,17 @@ Methods
def formFeatures(self, export_filename='', export=True)
-
+Function to create/export dataset with desired features
+Inputs: Name of file to export (string), export flag (boolean)
Expand source code
def formFeatures(self, export_filename="", export=True):
+ """
+ Function to create/export dataset with desired features
+ Inputs: Name of file to export (string), export flag (boolean)
+ """
features = [] # List of rows to convert to dataframe
# Iterate over each pull
@@ -539,15 +571,22 @@ Methods
def pullStringConversation(self, export_filename='', export=True)
-
+Function to form conversation string given communication on a pull request
+Inputs: Name of file to export (string), export flag (boolean)
Expand source code
def pullStringConversation(self, export_filename="", export=True):
+ """
+ Function to form conversation string given communication on a pull request
+ Inputs: Name of file to export (string), export flag (boolean)
+ """
+
# Store each interaction and pull URL for export
string_conversations = []
pull_urls = []
+ pull_numbers = []
for index, row in self.raw_data.iterrows():
# Make pull message
@@ -566,9 +605,11 @@ Methods
comment_row["Body"])
string_conversations.append(conversation.encode("ascii", "ignore").decode())
pull_urls.append(row["URL"])
+ pull_numbers.append(row["Number"])
# Export converation field dataset
export_df = pd.DataFrame()
+ export_df["Number"] = pull_numbers
export_df["URL"] = pull_urls
export_df["Thread"] = string_conversations
@@ -586,12 +627,17 @@ Methods
def readRawData(self, filename)
-
+Function to read raw data stored as csv
+Inputs: File location (string) -> input raw data file location
Expand source code
def readRawData(self, filename):
+ """
+ Function to read raw data stored as csv
+ Inputs: File location (string) -> input raw data file location
+ """
self.raw_filename = filename
self.raw_data = pd.read_csv(filename)
@@ -612,12 +658,17 @@ Methods
def setupCommentAnalyzer(self, filename)
-
+Function to obtain Comment Analyzer with parameters
+Inputs: File location (string) -> .txt containing keywords to count
Expand source code
def setupCommentAnalyzer(self, filename):
+ """
+ Function to obtain Comment Analyzer with parameters
+ Inputs: File location (string) -> .txt containing keywords to count
+ """
word_list = []
# Open file to obtain list of words in Comment Analzyer
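An illustrative sketch of Featurizer usage, including the Number column that pullStringConversation now exports alongside URL and Thread; the file paths and feature lists below are hypothetical.

from featureVector import Featurizer

featurizer = Featurizer(retain_features=["Number", "URL"],
                        analysis_features=["Sentiment", "Code Blocks"])
featurizer.readRawData("rawData/myorg-myrepo.csv")     # CSV produced by githubDataExtraction
featurizer.setupCommentAnalyzer("words.txt")           # one keyword per line
features_df = featurizer.formFeatures("features.csv")  # per-pull features, exported to csv
featurizer.pullStringConversation("threads.csv")       # exports Number, URL and Thread columns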
diff --git a/docs/ml-conversational-analytic-tool/githubDataExtraction.html b/docs/ml-conversational-analytic-tool/githubDataExtraction.html
index fa70a3e..93adbb9 100644
--- a/docs/ml-conversational-analytic-tool/githubDataExtraction.html
+++ b/docs/ml-conversational-analytic-tool/githubDataExtraction.html
@@ -29,20 +29,20 @@ Module ml-conversational-analytic-tool.githubDataExtract
# Copyright 2021 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
-import argparse
import os
-# Imports
import time
+import argparse
import pandas as pd
from github import Github
from github.GithubException import RateLimitExceededException
-# Class to extract pull data from a github repo. The data includes reviews, comments, and other metadata from each pull.
class GithubDataExtractor:
- # Constructor requires an access token to start a Github session, and specifies instance variables
def __init__(self, access_token):
+ """
+ Constructor requires an access token to start a Github session, and specifies instance variables
+ """
self.g_ses = Github(access_token) # Github object is used as a channel to the Github API
self.current_repo = None # Current Opended Repo
self.reaction_flag = False
@@ -50,18 +50,22 @@ Module ml-conversational-analytic-tool.githubDataExtract
self.repo_name = ""
self.organization = ""
- # Method to open (access) repository with given organization and repository name (reponame).
- # Parameters: usernmae - owner of the repository, repo_name - name of repo to be opened
def openRepo(self, organization, repo_name):
+ """
+ Method to open (access) repository with given organization and repository name (reponame).
+ Parameters: username - owner of the repository, repo_name - name of repo to be opened
+ """
self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name) # Open repo
self.repo_opened = True
self.repo_name = repo_name
self.organization = organization
print("Opened repo {} - {}".format(repo_name, organization))
- # Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
- # export - if the dataframe should be exported to csv. Returns: Dataframe with pull data
def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True):
+ """
+ Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
+ export_to_csv - if the dataframe should be exported to csv. Returns: Dataframe with pull data
+ """
self.reaction_flag = reaction_flag
if self.repo_opened: # Verify if a repo has been opened
pull_data = []
@@ -78,9 +82,11 @@ Module ml-conversational-analytic-tool.githubDataExtract
return pull_df
print("Please open a Repo")
- # Extract pulls with given state. Parameters: state - state of the pull (open or closed)
- # Return: list of dictionaries containing data regardining each pull
def getPullsByState(self, state):
+ """
+ Extract pulls with given state. Parameters: state - state of the pull (open or closed)
+ Return: list of dictionaries containing data regarding each pull
+ """
pull_data = []
try: # Call the Github api to get all pulls
pulls = self.current_repo.get_pulls(state=state, sort='create')
@@ -98,9 +104,11 @@ Module ml-conversational-analytic-tool.githubDataExtract
pull_data.append(self.getPullFeatures(pull))
return pull_data
- # Method to form a list of json strings rerpesenting comments (reviews or issue).
- # Parameters: comments - list of comment objects. Returns: List of json strings
def listOfComments(self, comments):
+ """
+ Method to form a list of json strings representing comments (reviews or issues).
+ Parameters: comments - list of comment objects. Returns: List of json strings
+ """
list_comments = []
# Iterate over each comment
@@ -142,9 +150,11 @@ Module ml-conversational-analytic-tool.githubDataExtract
"Updated_At": str(comment.updated_at)}))
return list_comments
- # Method to get all data for a particular pull. Parameters: pull - object representing a pull
- # Returns: dictionary containing all data of a pull
def getPullFeatures(self, pull):
+ """
+ Method to get all data for a particular pull. Parameters: pull - object representing a pull
+ Returns: dictionary containing all data of a pull
+ """
pull_dict = {}
pull_dict["Number"] = pull.number
pull_dict["Title"] = pull.title
@@ -187,6 +197,7 @@ Module ml-conversational-analytic-tool.githubDataExtract
ACCESS_TOKEN = os.environ["GITACCESS"] # Access Github token from environment for security purposes
extractor = GithubDataExtractor(ACCESS_TOKEN) # Create object
extractor.openRepo(args.organization, args.reponame) # Open repo
+
# Extract all pulls and export them to .csv
if args.filename:
extractor.getAllPulls(args.filename, args.reactions)
@@ -208,14 +219,16 @@ Classes
(access_token)
-
+Constructor requires an access token to start a Github session, and specifies instance variables
Expand source code
class GithubDataExtractor:
- # Constructor requires an access token to start a Github session, and specifies instance variables
def __init__(self, access_token):
+ """
+ Constructor requires an access token to start a Github session, and specifies instance variables
+ """
self.g_ses = Github(access_token) # Github object is used as a channel to the Github API
self.current_repo = None # Current Opended Repo
self.reaction_flag = False
@@ -223,18 +236,22 @@ Classes
self.repo_name = ""
self.organization = ""
- # Method to open (access) repository with given organization and repository name (reponame).
- # Parameters: usernmae - owner of the repository, repo_name - name of repo to be opened
def openRepo(self, organization, repo_name):
+ """
+ Method to open (access) repository with given organization and repository name (reponame).
+ Parameters: username - owner of the repository, repo_name - name of repo to be opened
+ """
self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name) # Open repo
self.repo_opened = True
self.repo_name = repo_name
self.organization = organization
print("Opened repo {} - {}".format(repo_name, organization))
- # Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
- # export - if the dataframe should be exported to csv. Returns: Dataframe with pull data
def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True):
+ """
+ Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
+ export_to_csv - if the dataframe should be exported to csv. Returns: Dataframe with pull data
+ """
self.reaction_flag = reaction_flag
if self.repo_opened: # Verify if a repo has been opened
pull_data = []
@@ -251,9 +268,11 @@ Classes
return pull_df
print("Please open a Repo")
- # Extract pulls with given state. Parameters: state - state of the pull (open or closed)
- # Return: list of dictionaries containing data regardining each pull
def getPullsByState(self, state):
+ """
+ Extract pulls with given state. Parameters: state - state of the pull (open or closed)
+ Return: list of dictionaries containing data regardining each pull
+ """
pull_data = []
try: # Call the Github api to get all pulls
pulls = self.current_repo.get_pulls(state=state, sort='create')
@@ -271,9 +290,11 @@ Classes
pull_data.append(self.getPullFeatures(pull))
return pull_data
- # Method to form a list of json strings rerpesenting comments (reviews or issue).
- # Parameters: comments - list of comment objects. Returns: List of json strings
def listOfComments(self, comments):
+ """
+ Method to form a list of json strings representing comments (reviews or issues).
+ Parameters: comments - list of comment objects. Returns: List of json strings
+ """
list_comments = []
# Iterate over each comment
@@ -315,9 +336,11 @@ Classes
"Updated_At": str(comment.updated_at)}))
return list_comments
- # Method to get all data for a particular pull. Parameters: pull - object representing a pull
- # Returns: dictionary containing all data of a pull
def getPullFeatures(self, pull):
+ """
+ Method to get all data for a particular pull. Parameters: pull - object representing a pull
+ Returns: dictionary containing all data of a pull
+ """
pull_dict = {}
pull_dict["Number"] = pull.number
pull_dict["Title"] = pull.title
@@ -353,12 +376,17 @@ Methods
def getAllPulls(self, name='', reaction_flag=False, export_to_csv=True)
-
+Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
+export_to_csv - if the dataframe should be exported to csv. Returns: Dataframe with pull data
Expand source code
def getAllPulls(self, name="", reaction_flag=False, export_to_csv=True):
+ """
+ Method to form a dataframe containing pull information. Parameters: name - name of exported csv file,
+ export_to_csv - if the dataframe should be exported to csv. Returns: Dataframe with pull data
+ """
self.reaction_flag = reaction_flag
if self.repo_opened: # Verify if a repo has been opened
pull_data = []
@@ -380,12 +408,17 @@ Methods
def getPullFeatures(self, pull)
-
+Method to get all data for a particular pull. Parameters: pull - object representing a pull
+Returns: dictionary containing all data of a pull
Expand source code
def getPullFeatures(self, pull):
+ """
+ Method to get all data for a particular pull. Parameters: pull - object representing a pull
+ Returns: dictionary containing all data of a pull
+ """
pull_dict = {}
pull_dict["Number"] = pull.number
pull_dict["Title"] = pull.title
@@ -420,12 +453,17 @@ Methods
def getPullsByState(self, state)
-
+Extract pulls with given state. Parameters: state - state of the pull (open or closed)
+Return: list of dictionaries containing data regarding each pull
Expand source code
def getPullsByState(self, state):
+ """
+ Extract pulls with given state. Parameters: state - state of the pull (open or closed)
+ Return: list of dictionaries containing data regarding each pull
+ """
pull_data = []
try: # Call the Github api to get all pulls
pulls = self.current_repo.get_pulls(state=state, sort='create')
@@ -448,12 +486,17 @@ Methods
def listOfComments(self, comments)
-
+Method to form a list of json strings representing comments (reviews or issues).
+Parameters: comments - list of comment objects. Returns: List of json strings
Expand source code
def listOfComments(self, comments):
+ """
+ Method to form a list of json strings representing comments (reviews or issues).
+ Parameters: comments - list of comment objects. Returns: List of json strings
+ """
list_comments = []
# Iterate over each comment
@@ -500,12 +543,18 @@ Methods
def openRepo(self, organization, repo_name)
-
+Method to open (access) repository with given organization and repository name (reponame).
+Parameters: username - owner of the repository, repo_name - name of repo to be opened
Expand source code
def openRepo(self, organization, repo_name):
+ """
+ Method to open (access) repository with given organization and repository name (reponame).
+ Parameters: username - owner of the repository, repo_name - name of repo to be opened
+ """
self.current_repo = self.g_ses.get_repo(organization + "/" + repo_name) # Open repo
self.repo_opened = True
self.repo_name = repo_name
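A minimal sketch of GithubDataExtractor usage matching the constructor and method signatures above; it needs network access, a valid token in the GITACCESS environment variable, and a real organization/repository (the names below are placeholders).

import os
from githubDataExtraction import GithubDataExtractor

extractor = GithubDataExtractor(os.environ["GITACCESS"])   # token from the environment, as the module's CLI does
extractor.openRepo("my-org", "my-repo")                    # placeholder organization and repo name
pulls_df = extractor.getAllPulls(name="rawData", reaction_flag=False, export_to_csv=True)
print(pulls_df[["Number", "Title"]].head())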
diff --git a/docs/ml-conversational-analytic-tool/preProcessedDataset.html b/docs/ml-conversational-analytic-tool/preProcessedDataset.html
index 4f2d49c..c810642 100644
--- a/docs/ml-conversational-analytic-tool/preProcessedDataset.html
+++ b/docs/ml-conversational-analytic-tool/preProcessedDataset.html
@@ -36,10 +36,11 @@ Module ml-conversational-analytic-tool.preProcessedDatas
import tensorflow as tf
-# Class to prepare dataset for machine learning tasks
class PreProcessedDataset:
- # Set flags and instance variables in constructor
def __init__(self, vocab_size=1000, no_tokens=512, max_pull_length=100):
+ """
+ Set flags and instance variables in constructor
+ """
self.annotated_data = None
self.dataset = None
self.full_dataset = None
@@ -59,8 +60,11 @@ Module ml-conversational-analytic-tool.preProcessedDatas
self.full_dataset_ready = False
self.encode_ready = False
- # Setup the preprocessed dataset
def setupPreProcess(self, annotated_filename, dataset_filename):
+ """
+ Setup the preprocessed dataset
+ """
+
# Load datasets
self.loadAnnotatedData(annotated_filename)
self.loadDataset(dataset_filename)
@@ -79,8 +83,11 @@ Module ml-conversational-analytic-tool.preProcessedDatas
self.full_dataset['Review_Comments'] = self.full_dataset['Review_Comments'].apply(
lambda comment: stringToDict(comment))
- # Encode all utterances
def encodeData(self):
+ """
+ Encode all utterances
+ """
+
all_utterances = self._getObsResUsers()
# Encode each utterance
for utterances in all_utterances:
@@ -98,8 +105,11 @@ Module ml-conversational-analytic-tool.preProcessedDatas
self.results['Inclusive'] = self.results['Inclusive'][:idx] + self.results['Inclusive'][idx + 1:]
self.curr_max_length = max([len(x) for x in self.all_encoded_utterances])
- # Get matrix observation and results for ML task soutcome = Inclusive, Constructive, or None -> both
def getRoleAgnosticMatrix(self, outcome=None, padPull=True):
+ """
+ Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both
+ """
+
obs = []
for i in range(len(self.all_encoded_utterances)):
def pad(inp):
@@ -116,17 +126,21 @@ Module ml-conversational-analytic-tool.preProcessedDatas
res = self.getRes(outcome)
return obs, res
- # Get stacked matrix observation and results for ML task
def getRoleMatrix(self, outcome=None, padPull=True):
+ """
+ Get stacked matrix observation and results for ML task
+ """
# Check if results must be padded to same length for each pull
if padPull:
# Author and reviwer layers
layer_writer = []
layer_reviewer = []
+
for i in range(len(self.all_encoded_utterances)):
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
# Check if author or reviewer
if self.all_users[i][j] == writer:
@@ -136,12 +150,14 @@ Module ml-conversational-analytic-tool.preProcessedDatas
reviewer_comments.append(self.all_encoded_utterances[i][j])
writer_comments.append(np.zeros(self.no_tokens))
padding = [0] * self.no_tokens
+
# Pad both reviewer and writer layers
while len(writer_comments) < self.curr_max_length:
writer_comments.append(padding.copy())
reviewer_comments.append(padding.copy())
layer_writer.append(np.array(writer_comments))
layer_reviewer.append(np.array(reviewer_comments))
+
# Stack reviwer and author matrices
obs = np.stack((layer_writer, layer_reviewer), axis=3)
res = self.getRes(outcome)
@@ -152,6 +168,7 @@ Module ml-conversational-analytic-tool.preProcessedDatas
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
if self.all_users[i][j] == writer:
writer_comments.append(self.all_encoded_utterances[i][j])
@@ -164,24 +181,32 @@ Module ml-conversational-analytic-tool.preProcessedDatas
res = self.getRes(outcome)
return obs, res
- # Get list of results
def getRes(self, outcome=None):
+ """
+ Get list of results
+ """
if outcome:
return self.results[outcome]
return pd.DataFrame(data=self.results)
- # Load annotated dataset from file
def loadAnnotatedData(self, filename):
+ """
+ Load annotated dataset from file
+ """
self.annotated_data = pd.read_csv(filename)
self.annotated_data_open = True
- # Load raw data from file
def loadDataset(self, filename):
+ """
+ Load raw data from file
+ """
self.dataset = pd.read_csv(filename)
self.dataset_open = True
- # Encode an utterance through the lookup dictionary
def encode(self, utterances):
+ """
+ Encode an utterance through the lookup dictionary
+ """
encoded_utterances = []
for utterance in utterances:
obs = []
@@ -200,8 +225,10 @@ Module ml-conversational-analytic-tool.preProcessedDatas
encoded_utterances.append(np.array(obs))
return np.array(encoded_utterances)
- # Setup the lookup dictionary through frequency encoding
def _setupEncode(self):
+ """
+ Setup the lookup dictionary through frequency encoding
+ """
word_counts = {}
for index, row in self.annotated_data.iterrows():
words = row['Thread'].split(" ")
@@ -210,6 +237,7 @@ Module ml-conversational-analytic-tool.preProcessedDatas
word_counts[word] = word_counts[word] + 1
else:
word_counts[word] = 1
+
# Sort the words to generate code
# Lower number : higher count
# 0 - Padding, 1 - End, 2 - Missing
@@ -224,8 +252,10 @@ Module ml-conversational-analytic-tool.preProcessedDatas
break
self.encode_ready = True
- # Get all utterances by parsing dictionary for each pull
def _getObsResUsers(self):
+ """
+ Get all utterances by parsing dictionary for each pull
+ """
all_utterances = []
for index, row in self.full_dataset.iterrows():
utterances = []
@@ -234,17 +264,21 @@ Module ml-conversational-analytic-tool.preProcessedDatas
users.append(row['User'])
temp_df_comments = pd.DataFrame(row['Comments'])
temp_df_review_comments = pd.DataFrame(row["Review_Comments"])
+
if len(temp_df_comments) > 0 or len(temp_df_review_comments) > 0:
all_comments = temp_df_comments.append(temp_df_review_comments)
all_comments['Created_At'] = pd.to_datetime(all_comments['Created_At'])
all_comments = all_comments.sort_values(by=['Created_At'])
+
for comment_index, comment_row in all_comments.iterrows():
utterances.append(comment_row['Body'])
users.append(comment_row['User'])
+
all_utterances.append(utterances)
self.all_users.append(users)
self.results['Constructive'].append(row['Constructive'])
self.results['Inclusive'].append(row['Inclusive'])
+
return all_utterances
@@ -262,14 +296,16 @@ Classes
(vocab_size=1000, no_tokens=512, max_pull_length=100)
-
+Set flags and instance variables in constructor
Expand source code
class PreProcessedDataset:
- # Set flags and instance variables in constructor
def __init__(self, vocab_size=1000, no_tokens=512, max_pull_length=100):
+ """
+ Set flags and instance variables in constructor
+ """
self.annotated_data = None
self.dataset = None
self.full_dataset = None
@@ -289,8 +325,11 @@ Classes
self.full_dataset_ready = False
self.encode_ready = False
- # Setup the preprocessed dataset
def setupPreProcess(self, annotated_filename, dataset_filename):
+ """
+ Setup the preprocessed dataset
+ """
+
# Load datasets
self.loadAnnotatedData(annotated_filename)
self.loadDataset(dataset_filename)
@@ -309,8 +348,11 @@ Classes
self.full_dataset['Review_Comments'] = self.full_dataset['Review_Comments'].apply(
lambda comment: stringToDict(comment))
- # Encode all utterances
def encodeData(self):
+ """
+ Encode all utterances
+ """
+
all_utterances = self._getObsResUsers()
# Encode each utterance
for utterances in all_utterances:
@@ -328,8 +370,11 @@ Classes
self.results['Inclusive'] = self.results['Inclusive'][:idx] + self.results['Inclusive'][idx + 1:]
self.curr_max_length = max([len(x) for x in self.all_encoded_utterances])
- # Get matrix observation and results for ML task soutcome = Inclusive, Constructive, or None -> both
def getRoleAgnosticMatrix(self, outcome=None, padPull=True):
+ """
+ Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both
+ """
+
obs = []
for i in range(len(self.all_encoded_utterances)):
def pad(inp):
@@ -346,17 +391,21 @@ Classes
res = self.getRes(outcome)
return obs, res
- # Get stacked matrix observation and results for ML task
def getRoleMatrix(self, outcome=None, padPull=True):
+ """
+ Get stacked matrix observation and results for ML task
+ """
# Check if results must be padded to same length for each pull
if padPull:
# Author and reviwer layers
layer_writer = []
layer_reviewer = []
+
for i in range(len(self.all_encoded_utterances)):
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
# Check if author or reviewer
if self.all_users[i][j] == writer:
@@ -366,12 +415,14 @@ Classes
reviewer_comments.append(self.all_encoded_utterances[i][j])
writer_comments.append(np.zeros(self.no_tokens))
padding = [0] * self.no_tokens
+
# Pad both reviewer and writer layers
while len(writer_comments) < self.curr_max_length:
writer_comments.append(padding.copy())
reviewer_comments.append(padding.copy())
layer_writer.append(np.array(writer_comments))
layer_reviewer.append(np.array(reviewer_comments))
+
# Stack reviwer and author matrices
obs = np.stack((layer_writer, layer_reviewer), axis=3)
res = self.getRes(outcome)
@@ -382,6 +433,7 @@ Classes
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
if self.all_users[i][j] == writer:
writer_comments.append(self.all_encoded_utterances[i][j])
@@ -394,24 +446,32 @@ Classes
res = self.getRes(outcome)
return obs, res
- # Get list of results
def getRes(self, outcome=None):
+ """
+ Get list of results
+ """
if outcome:
return self.results[outcome]
return pd.DataFrame(data=self.results)
- # Load annotated dataset from file
def loadAnnotatedData(self, filename):
+ """
+ Load annotated dataset from file
+ """
self.annotated_data = pd.read_csv(filename)
self.annotated_data_open = True
- # Load raw data from file
def loadDataset(self, filename):
+ """
+ Load raw data from file
+ """
self.dataset = pd.read_csv(filename)
self.dataset_open = True
- # Encode an utterance through the lookup dictionary
def encode(self, utterances):
+ """
+ Encode an utterance through the lookup dictionary
+ """
encoded_utterances = []
for utterance in utterances:
obs = []
@@ -430,8 +490,10 @@ Classes
encoded_utterances.append(np.array(obs))
return np.array(encoded_utterances)
- # Setup the lookup dictionary through frequency encoding
def _setupEncode(self):
+ """
+ Setup the lookup dictionary through frequency encoding
+ """
word_counts = {}
for index, row in self.annotated_data.iterrows():
words = row['Thread'].split(" ")
@@ -440,6 +502,7 @@ Classes
word_counts[word] = word_counts[word] + 1
else:
word_counts[word] = 1
+
# Sort the words to generate code
# Lower number : higher count
# 0 - Padding, 1 - End, 2 - Missing
@@ -454,8 +517,10 @@ Classes
break
self.encode_ready = True
- # Get all utterances by parsing dictionary for each pull
def _getObsResUsers(self):
+ """
+ Get all utterances by parsing dictionary for each pull
+ """
all_utterances = []
for index, row in self.full_dataset.iterrows():
utterances = []
@@ -464,17 +529,21 @@ Classes
users.append(row['User'])
temp_df_comments = pd.DataFrame(row['Comments'])
temp_df_review_comments = pd.DataFrame(row["Review_Comments"])
+
if len(temp_df_comments) > 0 or len(temp_df_review_comments) > 0:
all_comments = temp_df_comments.append(temp_df_review_comments)
all_comments['Created_At'] = pd.to_datetime(all_comments['Created_At'])
all_comments = all_comments.sort_values(by=['Created_At'])
+
for comment_index, comment_row in all_comments.iterrows():
utterances.append(comment_row['Body'])
users.append(comment_row['User'])
+
all_utterances.append(utterances)
self.all_users.append(users)
self.results['Constructive'].append(row['Constructive'])
self.results['Inclusive'].append(row['Inclusive'])
+
return all_utterances
Methods
@@ -483,12 +552,15 @@ Methods
def encode(self, utterances)
-
+Encode an utterance through the lookup dictionary
Expand source code
def encode(self, utterances):
+ """
+ Encode an utterance through the lookup dictionary
+ """
encoded_utterances = []
for utterance in utterances:
obs = []
@@ -512,12 +584,16 @@ Methods
def encodeData(self)
-
+Encode all utterances
Expand source code
def encodeData(self):
+ """
+ Encode all utterances
+ """
+
all_utterances = self._getObsResUsers()
# Encode each utterance
for utterances in all_utterances:
@@ -540,12 +616,15 @@ Methods
def getRes(self, outcome=None)
-
+Get list of results
Expand source code
def getRes(self, outcome=None):
+ """
+ Get list of results
+ """
if outcome:
return self.results[outcome]
return pd.DataFrame(data=self.results)
@@ -555,12 +634,16 @@ Methods
def getRoleAgnosticMatrix(self, outcome=None, padPull=True)
-
+Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both
Expand source code
def getRoleAgnosticMatrix(self, outcome=None, padPull=True):
+ """
+ Get matrix observation and results for ML task outcome = Inclusive, Constructive, or None -> both
+ """
+
obs = []
for i in range(len(self.all_encoded_utterances)):
def pad(inp):
@@ -582,21 +665,26 @@ Methods
def getRoleMatrix(self, outcome=None, padPull=True)
-
+Get stacked matrix observation and results for ML task
Expand source code
def getRoleMatrix(self, outcome=None, padPull=True):
+ """
+ Get stacked matrix observation and results for ML task
+ """
# Check if results must be padded to same length for each pull
if padPull:
# Author and reviwer layers
layer_writer = []
layer_reviewer = []
+
for i in range(len(self.all_encoded_utterances)):
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
# Check if author or reviewer
if self.all_users[i][j] == writer:
@@ -606,12 +694,14 @@ Methods
reviewer_comments.append(self.all_encoded_utterances[i][j])
writer_comments.append(np.zeros(self.no_tokens))
padding = [0] * self.no_tokens
+
# Pad both reviewer and writer layers
while len(writer_comments) < self.curr_max_length:
writer_comments.append(padding.copy())
reviewer_comments.append(padding.copy())
layer_writer.append(np.array(writer_comments))
layer_reviewer.append(np.array(reviewer_comments))
+
# Stack reviwer and author matrices
obs = np.stack((layer_writer, layer_reviewer), axis=3)
res = self.getRes(outcome)
@@ -622,6 +712,7 @@ Methods
writer_comments = []
reviewer_comments = []
writer = self.all_users[i][0]
+
for j in range(len(self.all_encoded_utterances[i])):
if self.all_users[i][j] == writer:
writer_comments.append(self.all_encoded_utterances[i][j])
@@ -639,12 +730,15 @@ Methods
def loadAnnotatedData(self, filename)
-
+Load annotated dataset from file
Expand source code
def loadAnnotatedData(self, filename):
+ """
+ Load annotated dataset from file
+ """
self.annotated_data = pd.read_csv(filename)
self.annotated_data_open = True
@@ -653,12 +747,15 @@ Methods
def loadDataset(self, filename)
-
+Load raw data from file
Expand source code
def loadDataset(self, filename):
+ """
+ Load raw data from file
+ """
self.dataset = pd.read_csv(filename)
self.dataset_open = True
@@ -667,12 +764,16 @@ Methods
def setupPreProcess(self, annotated_filename, dataset_filename)
-
+Setup the preprocessed dataset
Expand source code
def setupPreProcess(self, annotated_filename, dataset_filename):
+ """
+ Setup the preprocessed dataset
+ """
+
# Load datasets
self.loadAnnotatedData(annotated_filename)
self.loadDataset(dataset_filename)
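A minimal end-to-end preprocessing sketch, as used by run(); the file names are placeholders:
data = PreProcessedDataset()
data.setupPreProcess('annotated.csv', 'dataset.csv')  # load annotated and raw CSVs
data.encodeData()  # build the encoded utterances consumed by the get*Matrix methods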
diff --git a/docs/ml-conversational-analytic-tool/run.html b/docs/ml-conversational-analytic-tool/run.html
index 396398b..1c8639a 100644
--- a/docs/ml-conversational-analytic-tool/run.html
+++ b/docs/ml-conversational-analytic-tool/run.html
@@ -30,6 +30,8 @@ Module ml-conversational-analytic-tool.run
# SPDX-License-Identifier: Apache-2.0
import argparse
+import tarfile
+import os
from sklearn.model_selection import train_test_split
@@ -37,27 +39,39 @@ Module ml-conversational-analytic-tool.run
from baseLSTM import BaseLSTM
from preProcessedDataset import PreProcessedDataset
-
-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding):
+model_directory = 'models'
+
+def save_model(model, name, version):
+ if not os.path.exists(model_directory):
+ os.makedirs(model_directory)
+ model_path = "{}/{}".format(model_directory, name)
+ tar_file_name = "{}-{}.tar.gz".format(name, version)
+ model.saveModel(name=model_path, version=version)
+ os.chdir(model_path)
+ tar = tarfile.open(tar_file_name, "w:gz")
+ tar.add(version)
+ tar.close()
+ os.chdir("../../")
+ print("Model saved in {}/{}; {}/{}".format(model_path, version, model_path, tar_file_name))
+
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver):
# Setup dataset
data = PreProcessedDataset()
data.setupPreProcess(annotated_filename, dataset_filename)
data.encodeData()
- # Get data for training
- if encoding_type == 'role':
- obs, res = data.getRoleMatrix(outcome, padding)
- elif encoding_type == 'role-agnostic':
- obs, res = data.getRoleAgnosticMatrix(outcome, padding)
-
# Create models
- if model_type == 'CNN':
- model = BaseCNN()
- elif model_type == 'LSTM':
+ if model_type == 'LSTM':
model = BaseLSTM()
+ else:
+ model = BaseCNN()
+
+ # Get data for training
if encoding_type == 'role':
+ obs, res = data.getRoleMatrix(outcome, padding)
model.makeModel2D(obs[0].shape)
- elif encoding_type == 'role-agnostic':
+ else:
+ obs, res = data.getRoleAgnosticMatrix(outcome, padding)
model.makeModel(obs[0].shape)
# Train model
@@ -66,6 +80,10 @@ Module ml-conversational-analytic-tool.run
# Score model
scores = model.scoreModel(test_obs, test_res)
+
+ # Save model
+ if save_name is not None and len(save_name) > 0:
+ save_model(model=model, name=save_name+"-"+outcome, version=model_ver)
return scores
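For reference, a direct call equivalent to the CLI path below; file and model names are placeholders:
scores = run('annotated.csv', 'dataset.csv', 'Constructive', 'role', 'CNN', padding=True, save_name='prModel', model_ver='001')
print(scores)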
@@ -75,27 +93,36 @@ Module ml-conversational-analytic-tool.run
description="Obtain models to determine constructive and inclusive feedback in Open source communities")
parser.add_argument('annotated_filename', help='File location of annotated file')
parser.add_argument('dataset_filename', help='File location of extracted dataset')
- parser.add_argument('model', help='Model type to use for training')
+ parser.add_argument('model', help='Model type to use for training; supported: CNN and LSTM')
parser.add_argument('outcome', help='Inclusive, Constructive, or Both')
+ parser.add_argument('-save', metavar='NAME', help='Save the model using given NAME')
+ parser.add_argument('-save_version', metavar='VERSION', default='001',
+ help='Together with -save NAME: save the model using given NAME and VERSION. '\
+ 'If omitted, 001 is used. The parameter is ignored if -save is missing.')
parser.add_argument('-roleRelevant', action='store_true', default=False,
help='Encoding method differentiates b/w conversation roles')
parser.add_argument('-pad', action='store_true', default=False, help='Pad total length of each pull')
args = parser.parse_args()
+
+ if args.model != 'CNN' and args.model != 'LSTM':
+ raise Exception("Model must be either CNN or LSTM")
+
encodingType = 'role'
if not args.roleRelevant:
encodingType = 'role-agnostic'
if args.outcome != 'Both':
- run_res = run(args.annotated_filename, args.dataset_filename, args.outcome, encodingType, args.model, args.pad)
+ run_res = run(args.annotated_filename, args.dataset_filename, args.outcome, encodingType,
+ args.model, args.pad, args.save, args.save_version)
print(run_res)
else:
run_res_constructive = run(args.annotated_filename, args.dataset_filename, 'Constructive', encodingType,
- args.model, args.pad)
+ args.model, args.pad, args.save, args.save_version)
print("Constructive: {}".format(run_res_constructive))
- run_res_inclusive = run(args.annotated_filename, args.dataset_filename, 'Inclusive', encodingType, args.model,
- args.pad)
+ run_res_inclusive = run(args.annotated_filename, args.dataset_filename, 'Inclusive', encodingType,
+ args.model, args.pad, args.save, args.save_version)
print("Inclusvie: {}".format(run_res_inclusive))
@@ -107,7 +134,7 @@ Module ml-conversational-analytic-tool.run
Functions
-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding)
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver)
-
@@ -115,26 +142,24 @@
Functions
Expand source code
-def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding):
+def run(annotated_filename, dataset_filename, outcome, encoding_type, model_type, padding, save_name, model_ver):
# Setup dataset
data = PreProcessedDataset()
data.setupPreProcess(annotated_filename, dataset_filename)
data.encodeData()
- # Get data for training
- if encoding_type == 'role':
- obs, res = data.getRoleMatrix(outcome, padding)
- elif encoding_type == 'role-agnostic':
- obs, res = data.getRoleAgnosticMatrix(outcome, padding)
-
# Create models
- if model_type == 'CNN':
- model = BaseCNN()
- elif model_type == 'LSTM':
+ if model_type == 'LSTM':
model = BaseLSTM()
+ else:
+ model = BaseCNN()
+
+ # Get data for training
if encoding_type == 'role':
+ obs, res = data.getRoleMatrix(outcome, padding)
model.makeModel2D(obs[0].shape)
- elif encoding_type == 'role-agnostic':
+ else:
+ obs, res = data.getRoleAgnosticMatrix(outcome, padding)
model.makeModel(obs[0].shape)
# Train model
@@ -143,10 +168,37 @@ Functions
# Score model
scores = model.scoreModel(test_obs, test_res)
+
+ # Save model
+ if save_name is not None and len(save_name) > 0:
+ save_model(model=model, name=save_name+"-"+outcome, version=model_ver)
return scores
+
+def save_model(model, name, version)
+
+
+
+
+
+Expand source code
+
+def save_model(model, name, version):
+ if not os.path.exists(model_directory):
+ os.makedirs(model_directory)
+ model_path = "{}/{}".format(model_directory, name)
+ tar_file_name = "{}-{}.tar.gz".format(name, version)
+ model.saveModel(name=model_path, version=version)
+ os.chdir(model_path)
+ tar = tarfile.open(tar_file_name, "w:gz")
+ tar.add(version)
+ tar.close()
+ os.chdir("../../")
+ print("Model saved in {}/{}; {}/{}".format(model_path, version, model_path, tar_file_name))
+
+
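Since saveModel delegates to Keras' model.save, the exported directory should be reloadable with keras.models.load_model; the path below assumes -save prModel, outcome Constructive, and the default version:
import keras  # or: from tensorflow import keras, depending on the environment
reloaded = keras.models.load_model("models/prModel-Constructive/001")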
@@ -166,6 +218,7 @@ Index
Functions
diff --git a/docs/ml-conversational-analytic-tool/runDataExtraction.html b/docs/ml-conversational-analytic-tool/runDataExtraction.html
index 1ebe4fa..8d7d03b 100644
--- a/docs/ml-conversational-analytic-tool/runDataExtraction.html
+++ b/docs/ml-conversational-analytic-tool/runDataExtraction.html
@@ -35,8 +35,10 @@ Module ml-conversational-analytic-tool.runDataExtraction
from githubDataExtraction import GithubDataExtractor
-# Method to extract data for all repositories in organization
def getRepos(access_token, organization, reaction):
+ """
+ Method to extract data for all repositories in organization
+ """
extractor = GithubDataExtractor(access_token) # Create object
repos = extractor.g_ses.get_organization(organization).get_repos()
for repo in repos:
@@ -45,8 +47,10 @@ Module ml-conversational-analytic-tool.runDataExtraction
extractor.getAllPulls("", reaction)
-# Method to extract data for an individual repository
def getRepo(access_token, organization, reponame, reaction):
+ """
+ Method to extract data for an individual repository
+ """
extractor = GithubDataExtractor(access_token) # Create object
extractor.openRepo(organization, reponame)
extractor.getAllPulls("", reaction)
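A hypothetical direct call; the token, organization, and repository names are placeholders, and the reaction flag is simply forwarded to getAllPulls:
import os
getRepo(os.environ["GITHUB_TOKEN"], "my-org", "my-repo", reaction=False)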
@@ -77,12 +81,15 @@ Functions
def getRepo(access_token, organization, reponame, reaction)
-
+Method to extract data for an individual repository
Expand source code
def getRepo(access_token, organization, reponame, reaction):
+ """
+ Method to extract data for an individual repository
+ """
extractor = GithubDataExtractor(access_token) # Create object
extractor.openRepo(organization, reponame)
extractor.getAllPulls("", reaction)
@@ -92,12 +99,15 @@ Functions
def getRepos(access_token, organization, reaction)
-
+Method to extract data for all repositories in organization
Expand source code
def getRepos(access_token, organization, reaction):
+ """
+ Method to extract data for all repositories in organization
+ """
extractor = GithubDataExtractor(access_token) # Create object
repos = extractor.g_ses.get_organization(organization).get_repos()
for repo in repos: