Showing 28 changed files with 12,339 additions and 3,019 deletions.
@@ -0,0 +1,4 @@
from .col_normed_tfidf import ColNormedTfidf, ColNormedTfidfTransformer
from .expected_context_model import ExpectedContextModel, ClusterWrapper, ExpectedContextModelTransformer
from .dual_context_wrapper import DualContextWrapper
from .expected_context_model_pipeline import ExpectedContextModelPipeline, DualContextPipeline
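This 4-line hunk appears to be the package `__init__.py` (the scrape omits the file name; the path is inferred from the file paths below), exposing the framework's public classes. As a minimal sketch, assuming the standard ConvoKit install layout, downstream code would import them as:

# Sketch only: imports the public interface exposed by the hunk above,
# assuming the package lives at convokit.expected_context_framework.
from convokit.expected_context_framework import (
    ColNormedTfidfTransformer,
    ExpectedContextModelTransformer,
    DualContextWrapper,
    ExpectedContextModelPipeline,
)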
162 changes: 162 additions & 0 deletions
convokit/expected_context_framework/col_normed_tfidf.py
@@ -0,0 +1,162 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import Normalizer, normalize
from scipy import sparse
import numpy as np
import joblib
import os
import json

from convokit.transformer import Transformer


class ColNormedTfidfTransformer(Transformer):
    """
    Transformer that derives tf-idf reweighted representations of utterances,
    which are normalized by column, i.e., per term. This may be helpful in deriving
    downstream representations that are less sensitive to relative term frequency;
    for instance, it can be used to derive input representations for
    `ExpectedContextModelTransformer`.

    :param input_field: the name of the utterance attribute to use as input to fit.
        Note that unless `token_pattern` is specified as an additional argument,
        this attribute must be a string of whitespace-separated features.
    :param output_field: the name of the attribute to write to in the transform step.
    :param model: optional, an existing `ColNormedTfidfTransformer`
    :param kwargs: other keyword arguments used to initialize the underlying
        scikit-learn `TfidfVectorizer`; see that documentation for details.
    """

    def __init__(self, input_field, output_field='col_normed_tfidf',
                 model=None, **kwargs):
        if model is not None:
            self.tfidf_obj = model.tfidf_obj
        else:
            self.tfidf_obj = ColNormedTfidf(**kwargs)
        self.input_field = input_field
        self.output_field = output_field
        if self.input_field == 'text':
            self.text_func = lambda x: x.text
        else:
            self.text_func = lambda x: x.meta[self.input_field]

    def fit(self, corpus, y=None, selector=lambda x: True):
        """
        Fits the transformer over training data.

        :param corpus: Corpus
        :param selector: which utterances to fit the transformer over: a boolean
            function of the form filter(utterance) that defaults to True (i.e., all utterances).
        :return: self
        """
        docs = [self.text_func(ut) for ut in corpus.iter_utterances(selector=selector)]
        self.tfidf_obj.fit(docs)
        return self

    def transform(self, corpus, selector=lambda x: True):
        """
        Computes column-normalized tf-idf representations for utterances in a corpus,
        stored in the corpus as `<output_field>`. Also annotates each utterance with
        a metadata field, `<output_field>__n_feats`, indicating the number of
        vocabulary terms that the utterance contains.

        :param corpus: Corpus
        :param selector: which utterances to transform
        :return: corpus, with per-utterance representations and vocabulary counts
        """
        ids = []
        docs = []
        for ut in corpus.iter_utterances(selector=selector):
            ids.append(ut.id)
            docs.append(self.text_func(ut))
            ut.add_vector(self.output_field)
        vects = self.tfidf_obj.transform(docs)
        column_names = self.tfidf_obj.get_feature_names()
        corpus.set_vector_matrix(self.output_field, matrix=vects, ids=ids, columns=column_names)
        n_feats = np.array((vects > 0).sum(axis=1)).flatten()
        for id, n in zip(ids, n_feats):
            corpus.get_utterance(id).meta[self.output_field + '__n_feats'] = int(n)
        return corpus

    def transform_utterance(self, utt):
        """
        Computes a tf-idf representation for a single utterance. The representation
        is stored in the utterance as metadata field `<output_field>`; the number of
        vocabulary terms that the utterance contains is stored as `<output_field>__n_feats`.

        :param utt: Utterance
        :return: utterance, with representation and vocabulary count
        """
        docs = [self.text_func(utt)]
        vect_ = np.array(self.tfidf_obj.transform(docs))
        n_feats = np.array((vect_ > 0).sum(axis=1)).flatten()
        utt.meta[self.output_field] = [float(x) for x in vect_[0]]
        utt.meta[self.output_field + '__n_feats'] = int(n_feats[0])
        return utt

    def fit_transform(self, corpus, y=None, selector=lambda x: True):
        self.fit(corpus, y, selector)
        return self.transform(corpus, selector)

    def get_vocabulary(self):
        """
        :return: array of feature names
        """
        return self.tfidf_obj.get_feature_names()

    def load(self, dirname):
        """
        Loads the model from disk.

        :param dirname: directory to load from
        :return: None
        """
        self.tfidf_obj.load(dirname)

    def dump(self, dirname):
        """
        Dumps the model to disk.

        :param dirname: directory to write to
        :return: None
        """
        self.tfidf_obj.dump(dirname)


class ColNormedTfidf(TransformerMixin):
    """
    Model that derives tf-idf reweighted representations of utterances, which are
    normalized by column. Can be used in ConvoKit through the
    `ColNormedTfidfTransformer` transformer; see the documentation of that
    transformer for further details.
    """

    def __init__(self, **kwargs):
        if 'token_pattern' in kwargs:
            self.tfidf_model = TfidfVectorizer(**kwargs)
        else:
            # default to whitespace tokenization, matching the expected input format
            self.tfidf_model = TfidfVectorizer(token_pattern=r'(?u)(\S+)', **kwargs)

    def fit(self, X, y=None):
        tfidf_vects_raw = self.tfidf_model.fit_transform(X)
        self.col_norms = sparse.linalg.norm(tfidf_vects_raw, axis=0)
        return self

    def transform(self, X):
        tfidf_vects_raw = self.tfidf_model.transform(X)
        tfidf_vect = tfidf_vects_raw / self.col_norms
        return tfidf_vect

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def get_feature_names(self):
        return self.tfidf_model.get_feature_names()

    def get_params(self, deep=True):
        return self.tfidf_model.get_params(deep=deep)

    def set_params(self, **params):
        return self.tfidf_model.set_params(**params)

    def load(self, dirname):
        self.tfidf_model = joblib.load(os.path.join(dirname, 'tfidf_model.joblib'))
        self.col_norms = np.load(os.path.join(dirname, 'tfidf_col_norms.npy'))

    def dump(self, dirname):
        # create the target directory if needed, without swallowing other errors
        os.makedirs(dirname, exist_ok=True)
        np.save(os.path.join(dirname, 'tfidf_col_norms.npy'), self.col_norms)
        joblib.dump(self.tfidf_model, os.path.join(dirname, 'tfidf_model.joblib'))
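To illustrate the intended flow, here is a minimal usage sketch of `ColNormedTfidfTransformer`: fit on a corpus, write per-utterance vectors, then persist the model. The corpus choice and the `min_df` keyword (simply forwarded to scikit-learn's `TfidfVectorizer`) are illustrative assumptions, not part of this commit.

# Sketch: fit and apply the column-normalized tf-idf transformer.
# The corpus name is a placeholder; any ConvoKit Corpus should work,
# and min_df is an assumed TfidfVectorizer keyword, not a requirement.
from convokit import Corpus, download
from convokit.expected_context_framework import ColNormedTfidfTransformer

my_corpus = Corpus(download('subreddit-Cornell'))  # hypothetical example corpus
tfidf = ColNormedTfidfTransformer(input_field='text',
                                  output_field='col_normed_tfidf', min_df=10)
tfidf.fit(my_corpus)
my_corpus = tfidf.transform(my_corpus)

# per-utterance vocabulary counts are written as metadata by transform()
utt = next(my_corpus.iter_utterances())
print(utt.meta['col_normed_tfidf__n_feats'])

tfidf.dump('tfidf_model')  # writes tfidf_model.joblib and tfidf_col_norms.npy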
56 changes: 56 additions & 0 deletions
convokit/expected_context_framework/demos/demo_text_pipelines.py
@@ -0,0 +1,56 @@
from convokit.text_processing import TextProcessor, TextParser, TextToArcs
from convokit.phrasing_motifs import CensorNouns, QuestionSentences
from convokit.convokitPipeline import ConvokitPipeline

"""
Some pipelines to compute the feature representations used in each Expected Context Model demo.
"""

def parliament_arc_pipeline():
    return ConvokitPipeline([
        # to avoid most computations, we'll only run the pipeline if the desired attributes don't exist
        ('parser', TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=True,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('question_sentence_filter', QuestionSentences('q_arc_arr', input_field='arc_arr',
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_q_arcs', TextProcessor(output_field='q_arcs', input_field='q_arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None))
    ])

def wiki_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(input_filter=lambda utt, aux:
            (utt.get_info('arcs') is None)
            and (utt.get_info('parsed') is None))),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])

def scotus_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])

def switchboard_text_pipeline():
    # here we don't want to overwrite alpha_text fields that already exist
    return ConvokitPipeline([
        ('text', TextProcessor(proc_fn=lambda x: x, output_field='alpha_text',
            input_filter=lambda utt, aux: utt.get_info('alpha_text') is None))
    ])