Merge branch 'ecf'
tisjune committed Jul 5, 2021
2 parents cad1964 + 597a28f commit a73d74b
Showing 28 changed files with 12,339 additions and 3,019 deletions.
10 changes: 8 additions & 2 deletions README.md
@@ -27,12 +27,18 @@ Example: [exploring the balance of power in the U.S. Supreme Court](https://gith
A set of lexical and parse-based features correlating with politeness and impoliteness.
Example: [understanding the (mis)use of politeness strategies in conversations gone awry on Wikipedia](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb).

- ### [Prompt types](http://www.cs.cornell.edu/~cristian/Asking_too_much.html) <sub><sup>[(API)](https://convokit.cornell.edu/documentation/promptTypes.html)</sup></sub>
+ ### [Expected Conversational Context Framework](https://tisjune.github.io/research/dissertation) <sub><sup>[(API)](https://convokit.cornell.edu/documentation/expected_context_model.html)</sup></sub>

+ A framework for characterizing utterances and terms based on their expected conversational context, consisting of model implementations and wrapper pipelines.
+ Examples: [deriving question types and other characterizations in British parliamentary question periods](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/parliament_demo.ipynb),
+ [exploring the Switchboard dialog acts corpus](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/switchboard_exploration_demo.ipynb), [examining Wikipedia talk page discussions](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/wiki_awry_demo.ipynb), and [computing the orientation of justice utterances in the U.S. Supreme Court](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/scotus_orientation_demo.ipynb).
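
A minimal sketch of how the wrapper pipelines might be invoked (the keyword arguments shown are illustrative assumptions, not the confirmed signature; see the demos linked above for exact usage):

```python
from convokit import Corpus, download
from convokit.expected_context_framework import ExpectedContextModelPipeline

corpus = Corpus(filename=download("parliament-corpus"))

# Hypothetical configuration: the keyword arguments below are assumed for
# illustration; consult the demos and API docs for the real signature.
ecm_pipe = ExpectedContextModelPipeline(
    context_field="reply_to",  # assumed: links each utterance to its context
    output_prefix="ecm",       # assumed: prefix for derived attributes
)
ecm_pipe.fit(corpus)
corpus = ecm_pipe.transform(corpus)
```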

+ <!-- ### [Prompt types](http://www.cs.cornell.edu/~cristian/Asking_too_much.html) <sub><sup>[(API)](https://convokit.cornell.edu/documentation/promptTypes.html)</sup></sub>
An unsupervised method for grouping utterances and utterance features by their rhetorical role.
Examples: [extracting question types in the U.K. parliament](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/prompt-types/prompt-type-wrapper-demo.ipynb), [extended version demonstrating additional functionality](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/prompt-types/prompt-type-demo.ipynb), [understanding the use of conversational prompts in conversations gone awry on Wikipedia](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb).
- Also includes functionality to extract surface motifs to represent utterances, used in the above paper [(API)](https://convokit.cornell.edu/documentation/phrasingMotifs.html).
+ Also includes functionality to extract surface motifs to represent utterances, used in the above paper [(API)](https://convokit.cornell.edu/documentation/phrasingMotifs.html). -->

### [Hypergraph conversation representation](http://www.cs.cornell.edu/~cristian/Patterns_of_participant_interactions.html) <sub><sup>[(API)](https://convokit.cornell.edu/documentation/hyperconvo.html)</sup></sub>
A method for extracting structural features of conversations through a hypergraph representation.
1 change: 1 addition & 0 deletions convokit/__init__.py
@@ -15,5 +15,6 @@
from .fighting_words import *
from .paired_prediction import *
from .bag_of_words import *
+ from .expected_context_framework import *

#__path__ = __import__('pkgutil').extend_path(__path__, __name__)
8 changes: 5 additions & 3 deletions convokit/download_config.json
@@ -8,22 +8,23 @@
"gap-corpus": 1,
"iq2-corpus": 1,
"movie-corpus": 2,
"parliament-corpus": 3,
"parliament-corpus": 4,
"persuasionforgood-corpus": 1,
"reddit-coarse-discourse-corpus": 1,
"reddit-corpus": 0,
"reddit-corpus-small": 2,
"stack-exchange-politeness-corpus": 1,
"subreddit": 0,
"switchboard-corpus": 1,
"switchboard-processed-corpus": 1,
"tennis-corpus": 3,
"wiki-corpus": 3,
"wiki-corpus": 4,
"wiki-politeness-annotated": 0,
"wikiconv": 0,
"wikiconv-corpus": 1,
"wikipedia-politeness-corpus": 1,
"winning-args-corpus": 1,
"supreme-corpus": 1,
"supreme-corpus": 2,
"supreme": 1,
"wiki-articles-for-deletion-corpus": 1
},
@@ -55,6 +56,7 @@
"reddit-corpus-small": "http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/reddit-corpus-small.corpus.zip",
"stack-exchange-politeness-corpus": "http://zissou.infosci.cornell.edu/convokit/datasets/stack-exchange-politeness-corpus/stack-exchange-politeness-corpus.zip",
"switchboard-corpus": "http://zissou.infosci.cornell.edu/convokit/datasets/switchboard-corpus/switchboard-corpus.zip",
"switchboard-processed-corpus": "http://zissou.infosci.cornell.edu/convokit/datasets/switchboard-corpus/switchboard-processed-corpus.zip",
"tennis-corpus": "http://zissou.infosci.cornell.edu/convokit/datasets/tennis-corpus/tennis-corpus.zip",
"tennis-motifs": [
"http://zissou.infosci.cornell.edu/socialkit/datasets/tennis-corpus/tennis-motifs/answer_arcs.json",
4 changes: 4 additions & 0 deletions convokit/expected_context_framework/__init__.py
@@ -0,0 +1,4 @@
from .col_normed_tfidf import ColNormedTfidf, ColNormedTfidfTransformer
from .expected_context_model import ExpectedContextModel, ClusterWrapper, ExpectedContextModelTransformer
from .dual_context_wrapper import DualContextWrapper
from .expected_context_model_pipeline import ExpectedContextModelPipeline, DualContextPipeline
162 changes: 162 additions & 0 deletions convokit/expected_context_framework/col_normed_tfidf.py
@@ -0,0 +1,162 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import Normalizer, normalize
from scipy import sparse
import numpy as np
import joblib
import os
import json

from convokit.transformer import Transformer

class ColNormedTfidfTransformer(Transformer):
    """
    Transformer that derives tf-idf reweighted representations of utterances, which are normalized by column, i.e., per term. This may be helpful in deriving downstream representations that are less sensitive to relative term frequency; for instance, it could be used to derive input representations for `ExpectedContextModelTransformer`.

    :param input_field: the name of the attribute of utterances to use as input to fit. Note that unless `token_pattern` is specified as an additional argument, this attribute must be a string consisting of whitespace-separated features.
    :param output_field: the name of the attribute to write to in the transform step.
    :param model: optional; an existing `ColNormedTfidfTransformer` whose underlying tf-idf model is reused.
    :param kwargs: other keyword arguments used to initialize the underlying scikit-learn `TfidfVectorizer`; see the scikit-learn documentation for details.
    """
    def __init__(self, input_field, output_field='col_normed_tfidf',
                 model=None, **kwargs):
        if model is not None:
            self.tfidf_obj = model.tfidf_obj
        else:
            self.tfidf_obj = ColNormedTfidf(**kwargs)
        self.input_field = input_field
        self.output_field = output_field
        if self.input_field == 'text':
            self.text_func = lambda x: x.text
        else:
            self.text_func = lambda x: x.meta[self.input_field]


    def fit(self, corpus, y=None, selector=lambda x: True):
        """
        Fits the transformer over training data.

        :param corpus: Corpus
        :param selector: which utterances to fit the transformer over. A boolean function of the form `filter(utterance)`; defaults to True (i.e., all utterances).
        :return: the fitted transformer
        """
        docs = [self.text_func(ut) for ut in corpus.iter_utterances(selector=selector)]
        self.tfidf_obj.fit(docs)
        return self

    def transform(self, corpus, selector=lambda x: True):
        """
        Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field, `<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.

        :param corpus: Corpus
        :param selector: which utterances to transform
        :return: corpus, with per-utterance representations and vocabulary counts
        """
        ids = []
        docs = []
        for ut in corpus.iter_utterances(selector=selector):
            ids.append(ut.id)
            docs.append(self.text_func(ut))
            ut.add_vector(self.output_field)
        vects = self.tfidf_obj.transform(docs)
        column_names = self.tfidf_obj.get_feature_names()
        corpus.set_vector_matrix(self.output_field, matrix=vects, ids=ids, columns=column_names)
        n_feats = np.array((vects > 0).sum(axis=1)).flatten()
        for utt_id, n in zip(ids, n_feats):
            corpus.get_utterance(utt_id).meta[self.output_field + '__n_feats'] = int(n)
        return corpus

    def transform_utterance(self, utt):
        """
        Computes a tf-idf representation for a single utterance. The representation is stored in the utterance's metadata as `<output_field>`; the number of vocabulary terms the utterance contains is stored as `<output_field>__n_feats`.

        :param utt: Utterance
        :return: utterance, with representation and vocabulary count
        """
        docs = [self.text_func(utt)]
        vect_ = np.array(self.tfidf_obj.transform(docs))
        n_feats = np.array((vect_ > 0).sum(axis=1)).flatten()
        utt.meta[self.output_field] = [float(x) for x in vect_[0]]
        utt.meta[self.output_field + '__n_feats'] = int(n_feats[0])
        return utt

    def fit_transform(self, corpus, y=None, selector=lambda x: True):
        self.fit(corpus, y, selector)
        return self.transform(corpus, selector)

    def get_vocabulary(self):
        """
        :return: array of feature names
        """
        return self.tfidf_obj.get_feature_names()

    def load(self, dirname):
        """
        Loads model from disk.

        :param dirname: directory to load from
        :return: None
        """
        self.tfidf_obj.load(dirname)

    def dump(self, dirname):
        """
        Dumps model to disk.

        :param dirname: directory to write to
        :return: None
        """
        self.tfidf_obj.dump(dirname)

class ColNormedTfidf(TransformerMixin):
    """
    Model that derives tf-idf reweighted representations of utterances, which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see the documentation of that transformer for further details.
    """

    def __init__(self, **kwargs):
        if 'token_pattern' in kwargs:
            self.tfidf_model = TfidfVectorizer(**kwargs)
        else:
            # by default, treat the input as whitespace-separated features
            self.tfidf_model = TfidfVectorizer(token_pattern=r'(?u)(\S+)', **kwargs)

    def fit(self, X, y=None):
        # compute the L2 norm of each column, i.e., of each term's tf-idf values
        tfidf_vects_raw = self.tfidf_model.fit_transform(X)
        self.col_norms = sparse.linalg.norm(tfidf_vects_raw, axis=0)
        return self

    def transform(self, X):
        # rescale each column by the norm computed during fit
        tfidf_vects_raw = self.tfidf_model.transform(X)
        tfidf_vect = tfidf_vects_raw / self.col_norms
        return tfidf_vect

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def get_feature_names(self):
        return self.tfidf_model.get_feature_names()

    def get_params(self, deep=True):
        return self.tfidf_model.get_params(deep=deep)

    def set_params(self, **params):
        return self.tfidf_model.set_params(**params)

    def load(self, dirname):
        self.tfidf_model = joblib.load(os.path.join(dirname, 'tfidf_model.joblib'))
        self.col_norms = np.load(os.path.join(dirname, 'tfidf_col_norms.npy'))

    def dump(self, dirname):
        os.makedirs(dirname, exist_ok=True)
        np.save(os.path.join(dirname, 'tfidf_col_norms.npy'), self.col_norms)
        joblib.dump(self.tfidf_model, os.path.join(dirname, 'tfidf_model.joblib'))
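
For reference, a minimal usage sketch of the transformer above (the corpus choice is an assumption; any ConvoKit corpus with utterance text works):

```python
from convokit import Corpus, download
from convokit.expected_context_framework import ColNormedTfidfTransformer

corpus = Corpus(filename=download("parliament-corpus"))  # assumed corpus choice

# extra keyword arguments (here min_df) are forwarded to the underlying
# scikit-learn TfidfVectorizer
tfidf = ColNormedTfidfTransformer(input_field='text', min_df=50)
tfidf.fit(corpus)
corpus = tfidf.transform(corpus)

print(len(tfidf.get_vocabulary()))            # vocabulary size
utt = next(corpus.iter_utterances())
print(utt.meta['col_normed_tfidf__n_feats'])  # vocab terms in this utterance
```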
56 changes: 56 additions & 0 deletions convokit/expected_context_framework/demos/demo_text_pipelines.py
@@ -0,0 +1,56 @@
from convokit.text_processing import TextProcessor, TextParser, TextToArcs
from convokit.phrasing_motifs import CensorNouns, QuestionSentences
from convokit.convokitPipeline import ConvokitPipeline

"""
Some pipelines to compute the feature representations used in each Expected Context Model demo.
"""

def parliament_arc_pipeline():
    return ConvokitPipeline([
        # to avoid most computations, we'll only run the pipeline if the desired attributes don't exist
        ('parser', TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=True,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('question_sentence_filter', QuestionSentences('q_arc_arr', input_field='arc_arr',
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_q_arcs', TextProcessor(output_field='q_arcs', input_field='q_arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None))
    ])

def wiki_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(input_filter=lambda utt, aux:
            (utt.get_info('arcs') is None)
            and (utt.get_info('parsed') is None))),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])

def scotus_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])

def switchboard_text_pipeline():
    # here we don't want to overwrite alpha_text fields that already exist
    return ConvokitPipeline([
        ('text', TextProcessor(proc_fn=lambda x: x, output_field='alpha_text',
            input_filter=lambda utt, aux: utt.get_info('alpha_text') is None))
    ])
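
A minimal sketch of how one of these pipelines might be applied, assuming `ConvokitPipeline` exposes the standard `transform(corpus)` interface and that this module is importable locally:

```python
from convokit import Corpus, download
from demo_text_pipelines import parliament_arc_pipeline  # assumed local import

corpus = Corpus(filename=download("parliament-corpus"))

# each step's input_filter skips utterances whose outputs already exist,
# so re-running the pipeline avoids redundant parsing
pipe = parliament_arc_pipeline()
corpus = pipe.transform(corpus)

utt = next(corpus.iter_utterances())
print(utt.meta.get('arcs'))    # newline-joined dependency arcs
print(utt.meta.get('q_arcs'))  # arcs restricted to question sentences
```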
