From 01a7153ac9171629dff92e5fa53aeb57056d1908 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Mon, 27 Jun 2022 13:52:56 -0400 Subject: [PATCH 01/21] initialize storage-abstraction branch as 2.6.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c59cf225..e0ae8c43 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ author_email="cristian@cs.cornell.edu", url="https://github.com/CornellNLP/ConvoKit", description="ConvoKit", - version="2.5.3", + version="2.6.0", packages=["convokit", "convokit.bag_of_words", "convokit.classifier", From eca5d628bb5e5c53dd77b12bc04fa1c874c1df42 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Mon, 27 Jun 2022 14:44:43 -0400 Subject: [PATCH 02/21] define StorageManager base class --- convokit/model/storageManager.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 convokit/model/storageManager.py diff --git a/convokit/model/storageManager.py b/convokit/model/storageManager.py new file mode 100644 index 00000000..148c6466 --- /dev/null +++ b/convokit/model/storageManager.py @@ -0,0 +1,33 @@ +from typing import Optional + +class StorageManager: + """ + Abstraction layer for the concrete representation of data and metadata + within corpus components (e.g., Utterance text and timestamps). All requests + to access or modify corpusComponent fields (with the exception of ID) are + actually routed through one of StorageManager's concrete subclasses. Each + subclass implements a storage backend that contains the actual data. + """ + def __init__( + self, + corpus_id: Optional[str] = None, + version: Optional[str] = "0" + ): + self.corpus_id = corpus_id + self.version = version + + # concrete data storage (i.e., collections) for each component type + # this will be assigned in subclasses + self.data = { + "utterance": {}, + "conversation": {}, + "speaker": {}, + "meta": {} + } + + def get_collection(self, component_type: str): + if component_type not in self.data: + raise ValueError('component_type must be one of "utterance", "conversation", "speaker", or "meta".') + return self.data[component_type] + + \ No newline at end of file From fe2587bae7b03435143383e402bf95e990d42f60 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Mon, 27 Jun 2022 15:25:37 -0400 Subject: [PATCH 03/21] implement MemStorageManager --- convokit/model/storageManager.py | 45 +++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/convokit/model/storageManager.py b/convokit/model/storageManager.py index 148c6466..afb526f3 100644 --- a/convokit/model/storageManager.py +++ b/convokit/model/storageManager.py @@ -19,10 +19,10 @@ def __init__( # concrete data storage (i.e., collections) for each component type # this will be assigned in subclasses self.data = { - "utterance": {}, - "conversation": {}, - "speaker": {}, - "meta": {} + "utterance": None, + "conversation": None, + "speaker": None, + "meta": None } def get_collection(self, component_type: str): @@ -30,4 +30,41 @@ def get_collection(self, component_type: str): raise ValueError('component_type must be one of "utterance", "conversation", "speaker", or "meta".') return self.data[component_type] +class MemStorageManager(StorageManager): + """ + Concrete StorageManager implementation for in-memory data storage. + Collections are implemented as vanilla Python dicts. + """ + def __init__( + self, + corpus_id: Optional[str] = None, + version: Optional[str] = "0" + ): + super().__init__(corpus_id, version) + + # initialize component collections as dicts + for key in self.data: + self.data[key] = {} + + def get_data(self, component_type: str, component_id: str, property_name: str): + """ + Retrieve the property data for the component of type component_type with + id component_id + """ + collection = self.get_collection(component_type) + if component_id not in collection: + raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + return collection[component_id][property_name] + + def update_data(self, component_type: str, component_id: str, property_name: str, new_value): + """ + Set or update the property data for the component of type component_type + with id component_id + """ + collection = self.get_collection(component_type) + # don't create new collections if the ID is not found; this is supposed to be handled in the + # CorpusComponent constructor so if the ID is missing that indicates something is wrong + if component_id not in collection: + raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + collection[component_id][property_name] = new_value \ No newline at end of file From 4a2ad769e782ced59c11618ae65194802c8d55fb Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 5 Jul 2022 14:31:05 -0400 Subject: [PATCH 04/21] implementing temp storage for ownerless components --- convokit/model/convoKitMeta.py | 39 ++++++++++---- convokit/model/corpus.py | 35 +++++++++---- convokit/model/corpusComponent.py | 52 ++++++++++++++++-- convokit/model/storageManager.py | 51 ++++++++++++++---- convokit/model/utterance.py | 87 +++++++++++++++++++++++++------ 5 files changed, 218 insertions(+), 46 deletions(-) diff --git a/convokit/model/convoKitMeta.py b/convokit/model/convoKitMeta.py index e8e4eff2..86f45171 100644 --- a/convokit/model/convoKitMeta.py +++ b/convokit/model/convoKitMeta.py @@ -13,12 +13,30 @@ class ConvoKitMeta(MutableMapping, dict): """ ConvoKitMeta is a dictlike object that stores the metadata attributes of a corpus component """ - def __init__(self, convokit_index, obj_type): + def __init__(self, owner, convokit_index, obj_type): + self.owner = owner # Corpus or CorpusComponent self.index: ConvoKitIndex = convokit_index self.obj_type = obj_type + self._get_storage().initialize_data_for_component("meta", self.storage_key) + + @property + def storage_key(self) -> str: + return f"{self.obj_type}_{self.owner.id}" + def __getitem__(self, item): - return dict.__getitem__(self, item) + return self._get_storage().get_data("meta", self.storage_key, item) + + def _get_storage(self): + # special case for Corpus meta since that's the only time owner is not a CorpusComponent + # since cannot directly import Corpus to check the type (circular import), as a proxy we + # check for the obj_type attribute which is common to all CorpusComponent but not + # present in Corpus + if not hasattr(self.owner, "obj_type"): + return self.owner.storage + # self.owner -> CorpusComponent + # self.owner.owner -> Corpus that owns the CorpusComponent (only Corpus has direct pointer to storage) + return self.owner.owner.storage @staticmethod def _check_type_and_update_index(index, obj_type, key, value): @@ -44,12 +62,12 @@ def __setitem__(self, key, value): if self.index.type_check: ConvoKitMeta._check_type_and_update_index(self.index, self.obj_type, key, value) - dict.__setitem__(self, key, value) + self._get_storage().update_data("meta", self.storage_key, key, value) def __delitem__(self, key): if self.obj_type == 'corpus': - dict.__delitem__(self, key) self.index.del_from_index(self.obj_type, key) + self._get_storage().delete_data("meta", self.storage_key, key) else: if self.index.lock_metadata_deletion[self.obj_type]: warn("For consistency in metadata attributes in Corpus component objects, deleting metadata attributes " @@ -57,19 +75,22 @@ def __delitem__(self, key): "To delete this metadata attribute from all Corpus components of this type, " "use corpus.delete_metadata(obj_type='{}', attribute='{}') instead.".format(self.obj_type, key)) else: - dict.__delitem__(self, key) + self._get_storage().delete_data("meta", self.storage_key, key) def __iter__(self): - return dict.__iter__(self) + return self._get_storage().get_data("meta", self.storage_key).__iter__() def __len__(self): - return dict.__len__(self) + return self._get_storage().get_data("meta", self.storage_key).__len__() def __contains__(self, x): - return dict.__contains__(self, x) + return self._get_storage().get_data("meta", self.storage_key).__contains__(x) + + def __repr__(self) -> str: + return "ConvoKitMeta(" + self.to_dict().__repr__() + ")" def to_dict(self): - return self.__dict__ + return dict(self._get_storage().get_data("meta", self.storage_key)) _basic_types = {type(0), type(1.0), type('str'), type(True)} # cannot include lists or dicts diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 95d4abbf..a203c31b 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -8,6 +8,7 @@ import random from .convoKitMeta import ConvoKitMeta from .convoKitMatrix import ConvoKitMatrix +from .storageManager import StorageManager, MemStorageManager import shutil @@ -39,14 +40,21 @@ class Corpus: :ivar corpus_dirpath: path to the directory the corpus was loaded from """ - def __init__(self, filename: Optional[str] = None, utterances: Optional[List[Utterance]] = None, - preload_vectors: List[str] = None, utterance_start_index: int = None, - utterance_end_index: int = None, merge_lines: bool = False, - exclude_utterance_meta: Optional[List[str]] = None, - exclude_conversation_meta: Optional[List[str]] = None, - exclude_speaker_meta: Optional[List[str]] = None, - exclude_overall_meta: Optional[List[str]] = None, - disable_type_check=True): + def __init__( + self, + filename: Optional[str] = None, + utterances: Optional[List[Utterance]] = None, + preload_vectors: List[str] = None, + utterance_start_index: int = None, + utterance_end_index: int = None, + merge_lines: bool = False, + exclude_utterance_meta: Optional[List[str]] = None, + exclude_conversation_meta: Optional[List[str]] = None, + exclude_speaker_meta: Optional[List[str]] = None, + exclude_overall_meta: Optional[List[str]] = None, + disable_type_check=True, + storage: Optional[StorageManager] = None + ): if filename is None: self.corpus_dirpath = None @@ -55,8 +63,17 @@ def __init__(self, filename: Optional[str] = None, utterances: Optional[List[Utt else: self.corpus_dirpath = os.path.dirname(filename) + corpus_id = os.path.basename(os.path.normpath(filename)) + self.id = corpus_id + + # Setup storage + if storage is not None: + self.storage = storage + else: + self.storage = MemStorageManager(corpus_id) + self.meta_index = ConvoKitIndex(self) - self.meta = ConvoKitMeta(self.meta_index, 'corpus') + self.meta = ConvoKitMeta(self, self.meta_index, 'corpus') # private storage self._vector_matrices = dict() diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py index 5c3963d5..10b2697d 100644 --- a/convokit/model/corpusComponent.py +++ b/convokit/model/corpusComponent.py @@ -5,30 +5,63 @@ class CorpusComponent: - def __init__(self, obj_type: str, owner=None, id=None, vectors: List[str]=None, meta=None): + def __init__( + self, + obj_type: str, + owner=None, + id=None, + initial_data=None, + vectors: List[str]=None, + meta=None + ): self.obj_type = obj_type # utterance, speaker, conversation self._owner = owner + self._id = id + self.vectors = vectors if vectors is not None else [] + + # if the CorpusComponent is initialized with an owner set up an entry + # in the owner's storage; if it is not initialized with an owner + # (i.e. it is a standalone object) set up a dict-based temp storage + if self.owner is None: + self._temp_storage = initial_data if initial_data is not None else {} + else: + self.owner.storage.initialize_data_for_component(self.obj_type, self._id, initial_value=(initial_data if initial_data is not None else {})) + if meta is None: meta = dict() self.meta = self.init_meta(meta) - self.id = id - self.vectors = vectors if vectors is not None else [] def get_owner(self): return self._owner def set_owner(self, owner): + previous_owner = self._owner self._owner = owner if owner is not None: + # when a new owner Corpus is assigned, we must take the following steps: + # (1) transfer this component's data to the new owner's StorageManager + # (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner) + # (3) reinitialize the metadata instance + data_dict = dict(previous_owner.storage.get_data(self.id)) if previous_owner is not None else self._temp_storage + self.owner.storage.initialize_data_for_component(self.obj_type, self.id, initial_value=data_dict) + if previous_owner is not None: + previous_owner.storage.delete_data(self.obj_type, self.id) + else: + del self._temp_storage self.meta = self.init_meta(self.meta) owner = property(get_owner, set_owner) def init_meta(self, meta): if self._owner is None: + # ConvoKitMeta instances are not allowed for ownerless (standalone) + # components since they must be backed by a StorageManager. In this + # case we must forcibly convert the ConvoKitMeta instance to dict + if type(meta) == ConvoKitMeta: + meta = meta.to_dict() return meta else: - ck_meta = ConvoKitMeta(self.owner.meta_index, self.obj_type) + ck_meta = ConvoKitMeta(self, self.owner.meta_index, self.obj_type) for key, value in meta.items(): ck_meta[key] = value return ck_meta @@ -45,6 +78,17 @@ def set_id(self, value): id = property(get_id, set_id) + def get_data(self, property_name): + if self._owner is None: + return self._temp_storage[property_name] + return self.owner.storage.get_data(self.obj_type, self.id, property_name) + + def set_data(self, property_name, value): + if self._owner is None: + self._temp_storage[property_name] = value + else: + self.owner.storage.update_data(self.obj_type, self.id, property_name, value) + # def __eq__(self, other): # if type(self) != type(other): return False # # do not compare 'utterances' and 'conversations' in Speaker.__dict__. recursion loop will occur. diff --git a/convokit/model/storageManager.py b/convokit/model/storageManager.py index afb526f3..627d467d 100644 --- a/convokit/model/storageManager.py +++ b/convokit/model/storageManager.py @@ -10,11 +10,9 @@ class StorageManager: """ def __init__( self, - corpus_id: Optional[str] = None, - version: Optional[str] = "0" + corpus_id: Optional[str] = None ): self.corpus_id = corpus_id - self.version = version # concrete data storage (i.e., collections) for each component type # this will be assigned in subclasses @@ -37,24 +35,45 @@ class MemStorageManager(StorageManager): """ def __init__( self, - corpus_id: Optional[str] = None, - version: Optional[str] = "0" + corpus_id: Optional[str] = None ): - super().__init__(corpus_id, version) + super().__init__(corpus_id) # initialize component collections as dicts for key in self.data: self.data[key] = {} + + def has_data_for_component(self, component_type: str, component_id: str) -> bool: + """ + Check if there is an existing entry for the component of type component_type + with id component_id + """ + collection = self.get_collection(component_type) + return component_id in collection + + def initialize_data_for_component(self, component_type: str, component_id: str, overwrite: bool = False, initial_value = None): + """ + Create a blank entry for a component of type component_type with id + component_id. Will avoid overwriting any existing data unless the + overwrite parameter is set to True. + """ + collection = self.get_collection(component_type) + if overwrite or not self.has_data_for_component(component_type, component_id): + collection[component_id] = initial_value if initial_value is not None else {} - def get_data(self, component_type: str, component_id: str, property_name: str): + def get_data(self, component_type: str, component_id: str, property_name: Optional[str] = None): """ Retrieve the property data for the component of type component_type with - id component_id + id component_id. If property_name is specified return only the data for + that property, otherwise return the dict containing all properties. """ collection = self.get_collection(component_type) if component_id not in collection: raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") - return collection[component_id][property_name] + if property_name is None: + return collection[component_id] + else: + return collection[component_id][property_name] def update_data(self, component_type: str, component_id: str, property_name: str, new_value): """ @@ -67,4 +86,18 @@ def update_data(self, component_type: str, component_id: str, property_name: str if component_id not in collection: raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") collection[component_id][property_name] = new_value + + def delete_data(self, component_type: str, component_id: str, property_name: Optional[str] = None): + """ + Delete a data entry from this StorageManager for the component of type + component_type with id component_id. If property_name is specified + delete only that property, otherwise delete the entire entry. + """ + collection = self.get_collection(component_type) + if component_id not in collection: + raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + if property_name is None: + del collection[component_id] + else: + del collection[component_id][property_name] \ No newline at end of file diff --git a/convokit/model/utterance.py b/convokit/model/utterance.py index 4f7cf801..ee2a09be 100644 --- a/convokit/model/utterance.py +++ b/convokit/model/utterance.py @@ -30,25 +30,78 @@ def __init__(self, owner=None, id: Optional[str] = None, speaker: Optional[Speak root: Optional[str] = None, reply_to: Optional[str] = None, timestamp: Optional[int] = None, text: str = '', meta: Optional[Dict] = None): - super().__init__(obj_type="utterance", owner=owner, id=id, meta=meta) - speaker_ = speaker if speaker is not None else user - self.speaker = speaker_ - if self.speaker is None: - raise ValueError("No Speaker found: Utterance must be initialized with a Speaker.") - self.user = speaker # for backwards compatbility - self.conversation_id = conversation_id if conversation_id is not None else root - if self.conversation_id is not None and not isinstance(self.conversation_id, str): + # check arguments that have alternate naming due to backwards compatibility + if speaker is None: + if user is not None: + speaker = user + else: + raise ValueError("No Speaker found: Utterance must be initialized with a Speaker.") + if conversation_id is None and root is not None: + conversation_id = root + + if conversation_id is not None and not isinstance(conversation_id, str): warn("Utterance conversation_id must be a string: conversation_id of utterance with ID: {} " - "has been casted to a string.".format(self.id)) - self.conversation_id = str(self.conversation_id) - self._root = self.conversation_id - self.reply_to = reply_to - self.timestamp = timestamp # int(timestamp) if timestamp is not None else timestamp + "has been casted to a string.".format(id)) + conversation_id = str(conversation_id) if not isinstance(text, str): warn("Utterance text must be a string: text of utterance with ID: {} " - "has been casted to a string.".format(self.id)) + "has been casted to a string.".format(id)) text = '' if text is None else str(text) - self.text = text + + props = { + 'speaker': speaker.id, + 'conversation_id': conversation_id, + 'reply_to': reply_to, + 'timestamp': timestamp, + 'text': text + } + super().__init__(obj_type="utterance", owner=owner, id=id, initial_data=props, meta=meta) + self.speaker_ = speaker + + ############################################################################ + ## directly-accessible class properties (roughly equivalent to keys in the + ## JSON, plus aliases for compatibility) + ############################################################################ + + def _get_speaker(self): + return self.speaker_ + + def _set_speaker(self, val): + self.speaker_ = val + + speaker = property(_get_speaker, _set_speaker) + + def _get_conversation_id(self): + return self.get_data("conversation_id") + + def _set_conversation_id(self, val): + self.set_data("conversation_id", val) + + conversation_id = property(_get_conversation_id, _set_conversation_id) + + def _get_reply_to(self): + return self.get_data("reply_to") + + def _set_reply_to(self, val): + self.set_data("reply_to", val) + + reply_to = property(_get_reply_to, _set_reply_to) + + def _get_timestamp(self): + return self.get_data("timestamp") + + def _set_timestamp(self, val): + self.set_data("timestamp", val) + + timestamp = property(_get_timestamp, _set_timestamp) + + def _get_text(self): + return self.get_data("text") + + def _set_text(self, val): + self.set_data("text", val) + + text = property(_get_text, _set_text) def _get_root(self): deprecation("utterance.root", "utterance.conversation_id") @@ -61,6 +114,10 @@ def _set_root(self, value: str): root = property(_get_root, _set_root) + ############################################################################ + ## end properties + ############################################################################ + def get_conversation(self): """ Get the Conversation (identified by Utterance.conversation_id) this Utterance belongs to From ec5e0a150cd0ce6c5ac18709316993c7b1f56698 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 5 Jul 2022 15:47:14 -0400 Subject: [PATCH 05/21] dataframe conversion and corpus id fixes --- .pre-commit-config.yaml | 1 - convokit/model/convoKitMeta.py | 7 ++-- convokit/model/corpus.py | 20 ++++++----- convokit/model/corpusComponent.py | 40 +++++++++++++++------- convokit/model/corpusUtil.py | 16 +++------ convokit/model/storageManager.py | 50 ++++++++++++++------------- convokit/model/utterance.py | 57 ++++++++++++++++++++++--------- 7 files changed, 115 insertions(+), 76 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ae1ffbba..711116ee 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,4 +9,3 @@ repos: rev: 22.3.0 hooks: - id: black - language_version: python3.9 diff --git a/convokit/model/convoKitMeta.py b/convokit/model/convoKitMeta.py index 75f6f8d9..d6d182a7 100644 --- a/convokit/model/convoKitMeta.py +++ b/convokit/model/convoKitMeta.py @@ -13,8 +13,9 @@ class ConvoKitMeta(MutableMapping, dict): """ ConvoKitMeta is a dictlike object that stores the metadata attributes of a corpus component """ + def __init__(self, owner, convokit_index, obj_type): - self.owner = owner # Corpus or CorpusComponent + self.owner = owner # Corpus or CorpusComponent self.index: ConvoKitIndex = convokit_index self.obj_type = obj_type @@ -65,7 +66,7 @@ def __setitem__(self, key, value): self._get_storage().update_data("meta", self.storage_key, key, value) def __delitem__(self, key): - if self.obj_type == 'corpus': + if self.obj_type == "corpus": self.index.del_from_index(self.obj_type, key) self._get_storage().delete_data("meta", self.storage_key, key) else: @@ -89,7 +90,7 @@ def __len__(self): def __contains__(self, x): return self._get_storage().get_data("meta", self.storage_key).__contains__(x) - + def __repr__(self) -> str: return "ConvoKitMeta(" + self.to_dict().__repr__() + ")" diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index d04516a7..4c0c43f0 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -41,19 +41,19 @@ class Corpus: """ def __init__( - self, - filename: Optional[str] = None, + self, + filename: Optional[str] = None, utterances: Optional[List[Utterance]] = None, - preload_vectors: List[str] = None, + preload_vectors: List[str] = None, utterance_start_index: int = None, - utterance_end_index: int = None, + utterance_end_index: int = None, merge_lines: bool = False, exclude_utterance_meta: Optional[List[str]] = None, exclude_conversation_meta: Optional[List[str]] = None, exclude_speaker_meta: Optional[List[str]] = None, exclude_overall_meta: Optional[List[str]] = None, disable_type_check=True, - storage: Optional[StorageManager] = None + storage: Optional[StorageManager] = None, ): if filename is None: @@ -63,17 +63,19 @@ def __init__( else: self.corpus_dirpath = os.path.dirname(filename) - corpus_id = os.path.basename(os.path.normpath(filename)) - self.id = corpus_id + self.id = None + if filename is not None: + # automatically derive an ID from the file path + self.id = os.path.basename(os.path.normpath(filename)) # Setup storage if storage is not None: self.storage = storage else: - self.storage = MemStorageManager(corpus_id) + self.storage = MemStorageManager() self.meta_index = ConvoKitIndex(self) - self.meta = ConvoKitMeta(self, self.meta_index, 'corpus') + self.meta = ConvoKitMeta(self, self.meta_index, "corpus") # private storage self._vector_matrices = dict() diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py index aff8ca8f..397663bb 100644 --- a/convokit/model/corpusComponent.py +++ b/convokit/model/corpusComponent.py @@ -4,28 +4,31 @@ class CorpusComponent: - def __init__( - self, - obj_type: str, - owner=None, - id=None, - initial_data=None, - vectors: List[str]=None, - meta=None + self, + obj_type: str, + owner=None, + id=None, + initial_data=None, + vectors: List[str] = None, + meta=None, ): self.obj_type = obj_type # utterance, speaker, conversation self._owner = owner self._id = id self.vectors = vectors if vectors is not None else [] - + # if the CorpusComponent is initialized with an owner set up an entry # in the owner's storage; if it is not initialized with an owner # (i.e. it is a standalone object) set up a dict-based temp storage if self.owner is None: self._temp_storage = initial_data if initial_data is not None else {} else: - self.owner.storage.initialize_data_for_component(self.obj_type, self._id, initial_value=(initial_data if initial_data is not None else {})) + self.owner.storage.initialize_data_for_component( + self.obj_type, + self._id, + initial_value=(initial_data if initial_data is not None else {}), + ) if meta is None: meta = dict() @@ -42,8 +45,14 @@ def set_owner(self, owner): # (1) transfer this component's data to the new owner's StorageManager # (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner) # (3) reinitialize the metadata instance - data_dict = dict(previous_owner.storage.get_data(self.id)) if previous_owner is not None else self._temp_storage - self.owner.storage.initialize_data_for_component(self.obj_type, self.id, initial_value=data_dict) + data_dict = ( + dict(previous_owner.storage.get_data(self.id)) + if previous_owner is not None + else self._temp_storage + ) + self.owner.storage.initialize_data_for_component( + self.obj_type, self.id, initial_value=data_dict + ) if previous_owner is not None: previous_owner.storage.delete_data(self.obj_type, self.id) else: @@ -180,6 +189,13 @@ def delete_vector(self, vector_name: str): """ self.vectors.remove(vector_name) + def to_dict(self): + return { + "id": self.id, + "vectors": self.vectors, + "meta": self.meta if type(self.meta) == dict else self.meta.to_dict(), + } + def __str__(self): return "{}(id: {}, vectors: {}, meta: {})".format( self.obj_type.capitalize(), self.id, self.vectors, self.meta diff --git a/convokit/model/corpusUtil.py b/convokit/model/corpusUtil.py index e33bab46..99e36a19 100644 --- a/convokit/model/corpusUtil.py +++ b/convokit/model/corpusUtil.py @@ -19,7 +19,7 @@ def get_utterances_dataframe(obj, selector=lambda utt: True, exclude_meta: bool """ ds = dict() for utt in obj.iter_utterances(selector): - d = utt.__dict__.copy() + d = utt.to_dict().copy() if not exclude_meta: for k, v in d["meta"].items(): d["meta." + k] = v @@ -27,9 +27,7 @@ def get_utterances_dataframe(obj, selector=lambda utt: True, exclude_meta: bool ds[utt.id] = d df = pd.DataFrame(ds).T - df["id"] = df["_id"] df = df.set_index("id") - df = df.drop(["_id", "_owner", "obj_type", "user", "_root"], axis=1) df["speaker"] = df["speaker"].map(lambda spkr: spkr.id) meta_columns = [k for k in df.columns if k.startswith("meta.")] return df[ @@ -50,7 +48,7 @@ def get_conversations_dataframe(obj, selector=lambda convo: True, exclude_meta: """ ds = dict() for convo in obj.iter_conversations(selector): - d = convo.__dict__.copy() + d = convo.to_dict().copy() if not exclude_meta: for k, v in d["meta"].items(): d["meta." + k] = v @@ -58,9 +56,7 @@ def get_conversations_dataframe(obj, selector=lambda convo: True, exclude_meta: ds[convo.id] = d df = pd.DataFrame(ds).T - df["id"] = df["_id"] - df = df.set_index("id") - return df.drop(["_owner", "obj_type", "_utterance_ids", "_speaker_ids", "tree", "_id"], axis=1) + return df.set_index("id") def get_speakers_dataframe(obj, selector=lambda utt: True, exclude_meta: bool = False): @@ -75,7 +71,7 @@ def get_speakers_dataframe(obj, selector=lambda utt: True, exclude_meta: bool = """ ds = dict() for spkr in obj.iter_speakers(selector): - d = spkr.__dict__.copy() + d = spkr.to_dict().copy() if not exclude_meta: for k, v in d["meta"].items(): d["meta." + k] = v @@ -83,6 +79,4 @@ def get_speakers_dataframe(obj, selector=lambda utt: True, exclude_meta: bool = ds[spkr.id] = d df = pd.DataFrame(ds).T - df["id"] = df["_id"] - df = df.set_index("id") - return df.drop(["_owner", "obj_type", "utterances", "conversations", "_id"], axis=1) + return df.set_index("id") diff --git a/convokit/model/storageManager.py b/convokit/model/storageManager.py index 627d467d..15bfbec1 100644 --- a/convokit/model/storageManager.py +++ b/convokit/model/storageManager.py @@ -1,5 +1,6 @@ from typing import Optional + class StorageManager: """ Abstraction layer for the concrete representation of data and metadata @@ -8,41 +9,33 @@ class StorageManager: actually routed through one of StorageManager's concrete subclasses. Each subclass implements a storage backend that contains the actual data. """ - def __init__( - self, - corpus_id: Optional[str] = None - ): - self.corpus_id = corpus_id + def __init__(self): # concrete data storage (i.e., collections) for each component type # this will be assigned in subclasses - self.data = { - "utterance": None, - "conversation": None, - "speaker": None, - "meta": None - } + self.data = {"utterance": None, "conversation": None, "speaker": None, "meta": None} def get_collection(self, component_type: str): if component_type not in self.data: - raise ValueError('component_type must be one of "utterance", "conversation", "speaker", or "meta".') + raise ValueError( + 'component_type must be one of "utterance", "conversation", "speaker", or "meta".' + ) return self.data[component_type] + class MemStorageManager(StorageManager): """ Concrete StorageManager implementation for in-memory data storage. Collections are implemented as vanilla Python dicts. """ - def __init__( - self, - corpus_id: Optional[str] = None - ): - super().__init__(corpus_id) + + def __init__(self): + super().__init__() # initialize component collections as dicts for key in self.data: self.data[key] = {} - + def has_data_for_component(self, component_type: str, component_id: str) -> bool: """ Check if there is an existing entry for the component of type component_type @@ -51,7 +44,9 @@ def has_data_for_component(self, component_type: str, component_id: str) -> bool collection = self.get_collection(component_type) return component_id in collection - def initialize_data_for_component(self, component_type: str, component_id: str, overwrite: bool = False, initial_value = None): + def initialize_data_for_component( + self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None + ): """ Create a blank entry for a component of type component_type with id component_id. Will avoid overwriting any existing data unless the @@ -69,7 +64,9 @@ def get_data(self, component_type: str, component_id: str, property_name: Option """ collection = self.get_collection(component_type) if component_id not in collection: - raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." + ) if property_name is None: return collection[component_id] else: @@ -84,10 +81,14 @@ def update_data(self, component_type: str, component_id: str, property_name: str # don't create new collections if the ID is not found; this is supposed to be handled in the # CorpusComponent constructor so if the ID is missing that indicates something is wrong if component_id not in collection: - raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." + ) collection[component_id][property_name] = new_value - def delete_data(self, component_type: str, component_id: str, property_name: Optional[str] = None): + def delete_data( + self, component_type: str, component_id: str, property_name: Optional[str] = None + ): """ Delete a data entry from this StorageManager for the component of type component_type with id component_id. If property_name is specified @@ -95,9 +96,10 @@ def delete_data(self, component_type: str, component_id: str, property_name: Opt """ collection = self.get_collection(component_type) if component_id not in collection: - raise KeyError(f"This StorageManager does not have an entry for the {component_type} with id {component_id}.") + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." + ) if property_name is None: del collection[component_id] else: del collection[component_id][property_name] - \ No newline at end of file diff --git a/convokit/model/utterance.py b/convokit/model/utterance.py index 770932eb..a3a4edda 100644 --- a/convokit/model/utterance.py +++ b/convokit/model/utterance.py @@ -25,11 +25,19 @@ class Utterance(CorpusComponent): utterance-level metadata. """ - def __init__(self, owner=None, id: Optional[str] = None, speaker: Optional[Speaker] = None, - user: Optional[Speaker] = None, conversation_id: Optional[str] = None, - root: Optional[str] = None, reply_to: Optional[str] = None, - timestamp: Optional[int] = None, text: str = '', - meta: Optional[Dict] = None): + def __init__( + self, + owner=None, + id: Optional[str] = None, + speaker: Optional[Speaker] = None, + user: Optional[Speaker] = None, + conversation_id: Optional[str] = None, + root: Optional[str] = None, + reply_to: Optional[str] = None, + timestamp: Optional[int] = None, + text: str = "", + meta: Optional[Dict] = None, + ): # check arguments that have alternate naming due to backwards compatibility if speaker is None: if user is not None: @@ -40,26 +48,30 @@ def __init__(self, owner=None, id: Optional[str] = None, speaker: Optional[Speak conversation_id = root if conversation_id is not None and not isinstance(conversation_id, str): - warn("Utterance conversation_id must be a string: conversation_id of utterance with ID: {} " - "has been casted to a string.".format(id)) + warn( + "Utterance conversation_id must be a string: conversation_id of utterance with ID: {} " + "has been casted to a string.".format(id) + ) conversation_id = str(conversation_id) if not isinstance(text, str): - warn("Utterance text must be a string: text of utterance with ID: {} " - "has been casted to a string.".format(id)) - text = '' if text is None else str(text) + warn( + "Utterance text must be a string: text of utterance with ID: {} " + "has been casted to a string.".format(id) + ) + text = "" if text is None else str(text) props = { - 'speaker': speaker.id, - 'conversation_id': conversation_id, - 'reply_to': reply_to, - 'timestamp': timestamp, - 'text': text + "speaker_id": speaker.id, + "conversation_id": conversation_id, + "reply_to": reply_to, + "timestamp": timestamp, + "text": text, } super().__init__(obj_type="utterance", owner=owner, id=id, initial_data=props, meta=meta) self.speaker_ = speaker ############################################################################ - ## directly-accessible class properties (roughly equivalent to keys in the + ## directly-accessible class properties (roughly equivalent to keys in the ## JSON, plus aliases for compatibility) ############################################################################ @@ -68,6 +80,7 @@ def _get_speaker(self): def _set_speaker(self, val): self.speaker_ = val + self.set_data("speaker_id", self.speaker.id) speaker = property(_get_speaker, _set_speaker) @@ -135,6 +148,18 @@ def get_speaker(self): return self.speaker + def to_dict(self): + return { + "id": self.id, + "conversation_id": self.conversation_id, + "reply_to": self.reply_to, + "speaker": self.speaker, + "timestamp": self.timestamp, + "text": self.text, + "vectors": self.vectors, + "meta": self.meta if type(self.meta) == dict else self.meta.to_dict(), + } + def __hash__(self): return super().__hash__() From 17b6de9c8aabd315c76259b2957410d821b4ca5c Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 5 Jul 2022 15:55:33 -0400 Subject: [PATCH 06/21] fix missing parameter when assigning new owner --- convokit/model/corpusComponent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py index 397663bb..e68b8fa0 100644 --- a/convokit/model/corpusComponent.py +++ b/convokit/model/corpusComponent.py @@ -46,7 +46,7 @@ def set_owner(self, owner): # (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner) # (3) reinitialize the metadata instance data_dict = ( - dict(previous_owner.storage.get_data(self.id)) + dict(previous_owner.storage.get_data(self.obj_type, self.id)) if previous_owner is not None else self._temp_storage ) From b81112b8b75d4febe9d058be30560ce1784e3851 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Wed, 6 Jul 2022 11:41:45 -0400 Subject: [PATCH 07/21] correctly handle metadata in owner reassignment --- convokit/model/corpusComponent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py index e68b8fa0..effb789d 100644 --- a/convokit/model/corpusComponent.py +++ b/convokit/model/corpusComponent.py @@ -38,6 +38,8 @@ def get_owner(self): return self._owner def set_owner(self, owner): + # stash the metadata first since reassigning self._owner will break its storage connection + meta_vals = {k: v for k, v in self.meta.items()} previous_owner = self._owner self._owner = owner if owner is not None: @@ -55,9 +57,10 @@ def set_owner(self, owner): ) if previous_owner is not None: previous_owner.storage.delete_data(self.obj_type, self.id) + previous_owner.storage.delete_data("meta", self.meta.storage_key) else: del self._temp_storage - self.meta = self.init_meta(self.meta) + self.meta = self.init_meta(meta_vals) owner = property(get_owner, set_owner) From 590314338aab90b947b8c05432eb6416952a61b7 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 14:25:04 -0400 Subject: [PATCH 08/21] enforce cleanup of old StorageManagers in Corpus operations --- convokit/model/corpus.py | 34 +++++- convokit/model/storageManager.py | 114 +++++++++++++++----- convokit/tests/general/test_merge_corpus.py | 6 +- 3 files changed, 122 insertions(+), 32 deletions(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 4c0c43f0..2864aa23 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -614,6 +614,19 @@ def filter_conversations_by(self, selector: Callable[[Conversation], bool]): } self.update_speakers_data() self.reinitialize_index() + + # clear all storage entries corresponding to filtered-out components + meta_ids = [] + for utt in self.iter_utterances(): + meta_ids.append(utt.meta.storage_key) + for convo in self.iter_conversations(): + meta_ids.append(convo.meta.storage_key) + for speaker in self.iter_speakers(): + meta_ids.append(speaker.meta.storage_key) + self.storage.purge_obsolete_entries( + self.get_utterance_ids, self.get_conversation_ids(), self.get_speaker_ids(), meta_ids + ) + return self def filter_utterances_by(self, selector: Callable[[Utterance], bool]): @@ -621,7 +634,7 @@ def filter_utterances_by(self, selector: Callable[[Utterance], bool]): Returns a new corpus that includes only a subset of Utterances within this Corpus. This filtering provides no guarantees with regard to maintaining conversational integrity and should be used with care. - Vectors are not preserved. + Vectors are not preserved. The original corpus will be invalidated and no longer usable. :param selector: function for selecting which :return: a new Corpus with a subset of the Utterances @@ -630,6 +643,11 @@ def filter_utterances_by(self, selector: Callable[[Utterance], bool]): new_corpus = Corpus(utterances=utts) for convo in new_corpus.iter_conversations(): convo.meta.update(self.get_conversation(convo.id).meta) + + # original Corpus is invalidated and no longer usable; clear all data from + # its now-orphaned StorageManager to avoid having duplicates in memory + self.storage.clear_all_data() + return new_corpus def reindex_conversations( @@ -644,7 +662,7 @@ def reindex_conversations( The subtrees denoted by these utterance ids should be distinct and should not overlap, otherwise there may be unexpected behavior. - Vectors are not preserved. The original Corpus will be mutated. + Vectors are not preserved. The original Corpus will be invalidated and no longer usable. :param new_convo_roots: List of utterance ids to use as conversation ids :param preserve_corpus_meta: set as True to copy original Corpus metadata to new Corpus @@ -697,6 +715,10 @@ def reindex_conversations( warn("Failed to find some of the specified new convo roots:\n") print(missing_convo_roots) + # original Corpus is invalidated and no longer usable; clear all data from + # its now-orphaned StorageManager to avoid having duplicates in memory + self.storage.clear_all_data() + return new_corpus def get_meta(self) -> Dict: @@ -929,7 +951,7 @@ def merge(self, other_corpus, warnings: bool = True): other corpus, the other corpus's metadata (or its conversations / utterances) values will be used. A warning is printed when this happens. - May mutate original and other corpus in the process. + Will invalidate original and other corpus in the process. (Updates internal ConvoKit Index to match post-merge state and uses this Corpus's version number.) @@ -983,6 +1005,12 @@ def merge(self, other_corpus, warnings: bool = True): new_corpus.update_speakers_data() new_corpus.reinitialize_index() + # source corpora are now invalidated and all needed data has been copied + # into the new merged corpus; clear the source corpora's storage to + # prevent having duplicates in memory + self.storage.clear_all_data() + other_corpus.storage.clear_all_data() + return new_corpus def add_utterances(self, utterances=List[Utterance], warnings: bool = False, with_checks=True): diff --git a/convokit/model/storageManager.py b/convokit/model/storageManager.py index 15bfbec1..303ad617 100644 --- a/convokit/model/storageManager.py +++ b/convokit/model/storageManager.py @@ -1,7 +1,8 @@ from typing import Optional +from abc import ABCMeta, abstractmethod -class StorageManager: +class StorageManager(metaclass=ABCMeta): """ Abstraction layer for the concrete representation of data and metadata within corpus components (e.g., Utterance text and timestamps). All requests @@ -15,6 +16,70 @@ def __init__(self): # this will be assigned in subclasses self.data = {"utterance": None, "conversation": None, "speaker": None, "meta": None} + @abstractmethod + def get_collection_ids(self, component_type: str): + """ + Returns a list of all object IDs within the component_type collection + """ + return NotImplemented + + @abstractmethod + def has_data_for_component(self, component_type: str, component_id: str) -> bool: + """ + Check if there is an existing entry for the component of type component_type + with id component_id + """ + return NotImplemented + + @abstractmethod + def initialize_data_for_component( + self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None + ): + """ + Create a blank entry for a component of type component_type with id + component_id. Will avoid overwriting any existing data unless the + overwrite parameter is set to True. + """ + return NotImplemented + + @abstractmethod + def get_data(self, component_type: str, component_id: str, property_name: Optional[str] = None): + """ + Retrieve the property data for the component of type component_type with + id component_id. If property_name is specified return only the data for + that property, otherwise return the dict containing all properties. + """ + return NotImplemented + + @abstractmethod + def update_data(self, component_type: str, component_id: str, property_name: str, new_value): + """ + Set or update the property data for the component of type component_type + with id component_id + """ + return NotImplemented + + @abstractmethod + def delete_data( + self, component_type: str, component_id: str, property_name: Optional[str] = None + ): + """ + Delete a data entry from this StorageManager for the component of type + component_type with id component_id. If property_name is specified + delete only that property, otherwise delete the entire entry. + """ + return NotImplemented + + @abstractmethod + def clear_all_data(self): + """ + Erase all data from this StorageManager (i.e., reset self.data to its + initial empty state; Python will garbage-collect the now-unreferenced + old data entries). This is used for cleanup after destructive Corpus + operations. + """ + return NotImplemented + def get_collection(self, component_type: str): if component_type not in self.data: raise ValueError( @@ -22,6 +87,23 @@ def get_collection(self, component_type: str): ) return self.data[component_type] + def purge_obsolete_entries(self, utterance_ids, conversation_ids, speaker_ids, meta_ids): + """ + Compare the entries in this StorageManager to the existing component ids + provided as parameters, and delete any entries that are not found in the + parameter ids. + """ + ref_ids = { + "utterance": set(utterance_ids), + "conversation": set(conversation_ids), + "speaker": set(speaker_ids), + "meta": set(meta_ids), + } + for obj_type in self.data: + for obj_id in self.get_collection_ids(obj_type): + if obj_id not in ref_ids[obj_type]: + self.delete_data(obj_type, obj_id) + class MemStorageManager(StorageManager): """ @@ -36,32 +118,21 @@ def __init__(self): for key in self.data: self.data[key] = {} + def get_collection_ids(self, component_type: str): + return list(self.get_collection(component_type).keys()) + def has_data_for_component(self, component_type: str, component_id: str) -> bool: - """ - Check if there is an existing entry for the component of type component_type - with id component_id - """ collection = self.get_collection(component_type) return component_id in collection def initialize_data_for_component( self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None ): - """ - Create a blank entry for a component of type component_type with id - component_id. Will avoid overwriting any existing data unless the - overwrite parameter is set to True. - """ collection = self.get_collection(component_type) if overwrite or not self.has_data_for_component(component_type, component_id): collection[component_id] = initial_value if initial_value is not None else {} def get_data(self, component_type: str, component_id: str, property_name: Optional[str] = None): - """ - Retrieve the property data for the component of type component_type with - id component_id. If property_name is specified return only the data for - that property, otherwise return the dict containing all properties. - """ collection = self.get_collection(component_type) if component_id not in collection: raise KeyError( @@ -73,10 +144,6 @@ def get_data(self, component_type: str, component_id: str, property_name: Option return collection[component_id][property_name] def update_data(self, component_type: str, component_id: str, property_name: str, new_value): - """ - Set or update the property data for the component of type component_type - with id component_id - """ collection = self.get_collection(component_type) # don't create new collections if the ID is not found; this is supposed to be handled in the # CorpusComponent constructor so if the ID is missing that indicates something is wrong @@ -89,11 +156,6 @@ def update_data(self, component_type: str, component_id: str, property_name: str def delete_data( self, component_type: str, component_id: str, property_name: Optional[str] = None ): - """ - Delete a data entry from this StorageManager for the component of type - component_type with id component_id. If property_name is specified - delete only that property, otherwise delete the entire entry. - """ collection = self.get_collection(component_type) if component_id not in collection: raise KeyError( @@ -103,3 +165,7 @@ def delete_data( del collection[component_id] else: del collection[component_id][property_name] + + def clear_all_data(self): + for key in self.data: + self.data[key] = {} diff --git a/convokit/tests/general/test_merge_corpus.py b/convokit/tests/general/test_merge_corpus.py index 95725501..11d7aba9 100644 --- a/convokit/tests/general/test_merge_corpus.py +++ b/convokit/tests/general/test_merge_corpus.py @@ -26,8 +26,6 @@ def test_no_overlap(self): merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 6) self.assertEqual(len(list(merged.iter_speakers())), 6) - self.assertEqual(len(list(corpus1.iter_utterances())), 3) - self.assertEqual(len(list(corpus2.iter_utterances())), 3) def test_with_overlap(self): """ @@ -52,8 +50,6 @@ def test_with_overlap(self): merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) - self.assertEqual(len(list(corpus1.iter_utterances())), 3) - self.assertEqual(len(list(corpus2.iter_utterances())), 3) def test_overlap_diff_data(self): """ @@ -84,7 +80,7 @@ def test_overlap_diff_data(self): self.assertEqual(len(list(corpus2.iter_utterances())), 3) self.assertEqual(merged.get_utterance("2").text, "this is a test") - self.assertEqual(merged.get_utterance("2").speaker, Speaker(id="charlie")) + self.assertEqual(merged.get_utterance("2").speaker.id, "charlie") def test_overlap_diff_metadata(self): """ From a409fb112bda9894f3f4ef91d5cc8d7ad8c30217 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 14:56:30 -0400 Subject: [PATCH 09/21] Convert old meta to dict in reindex_conversations --- convokit/model/corpus.py | 2 +- convokit/tests/general/test_convo_traversal.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 2864aa23..c57b93e6 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -705,7 +705,7 @@ def reindex_conversations( for convo in new_corpus.iter_conversations(): convo.meta["original_convo_meta"] = self.get_conversation( original_utt_to_convo_id[convo.id] - ).meta + ).meta.to_dict() convo.meta["original_convo_id"] = original_utt_to_convo_id[convo.id] if verbose: missing_convo_roots = list( diff --git a/convokit/tests/general/test_convo_traversal.py b/convokit/tests/general/test_convo_traversal.py index 29ade7d5..847cc54f 100644 --- a/convokit/tests/general/test_convo_traversal.py +++ b/convokit/tests/general/test_convo_traversal.py @@ -236,6 +236,7 @@ def test_one_utt_convo(self): self.assertEqual([utt.id for utt in convo.traverse("preorder")], ["other"]) def test_reindex_corpus(self): + self.setUp() # reinitialize corpus since reindex is destructive new_convo_conversation_ids = ["1", "2", "3"] new_corpus = self.corpus.reindex_conversations(new_convo_conversation_ids) # checking for correct number of conversations and utterances @@ -251,6 +252,7 @@ def test_reindex_corpus(self): self.assertEqual(self.corpus.meta, new_corpus.meta) def test_reindex_corpus2(self): + self.setUp() # reinitialize corpus since reindex is destructive new_convo_conversation_ids = ["1", "2", "3"] new_corpus = self.corpus.reindex_conversations( new_convo_conversation_ids, preserve_convo_meta=False, preserve_corpus_meta=False From a6925fc48f0a6ca3dbb138d6f02d947f2a1357f6 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 15:14:04 -0400 Subject: [PATCH 10/21] fix metadata reassignment in merge --- convokit/model/convoKitMeta.py | 10 ++++++++++ convokit/model/corpus.py | 4 ++-- convokit/tests/general/test_convo_traversal.py | 5 ++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/convokit/model/convoKitMeta.py b/convokit/model/convoKitMeta.py index d6d182a7..832fc18a 100644 --- a/convokit/model/convoKitMeta.py +++ b/convokit/model/convoKitMeta.py @@ -97,6 +97,16 @@ def __repr__(self) -> str: def to_dict(self): return dict(self._get_storage().get_data("meta", self.storage_key)) + def reinitialize_from_other(self, other): + """ + Reinitialize this ConvoKitMeta instance with the data from other + """ + if type(other) == ConvoKitMeta: + other = {k: v for k, v in other.to_dict().items()} + self._get_storage().initialize_data_for_component( + "meta", self.storage_key, overwrite=True, initial_value=other + ) + _basic_types = {type(0), type(1.0), type("str"), type(True)} # cannot include lists or dicts diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index c57b93e6..66776c9f 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -972,7 +972,7 @@ def merge(self, other_corpus, warnings: bool = True): ) # Merge CORPUS metadata - new_corpus.meta = self.meta + new_corpus.meta.reinitialize_from_other(self.meta) for key, val in other_corpus.meta.items(): if key in new_corpus.meta and new_corpus.meta[key] != val: if warnings: @@ -987,7 +987,7 @@ def merge(self, other_corpus, warnings: bool = True): convos2 = other_corpus.iter_conversations() for convo in convos1: - new_corpus.get_conversation(convo.id).meta = convo.meta + new_corpus.get_conversation(convo.id).meta.reinitialize_from_other(convo.meta) for convo in convos2: for key, val in convo.meta.items(): diff --git a/convokit/tests/general/test_convo_traversal.py b/convokit/tests/general/test_convo_traversal.py index 847cc54f..9fdad66c 100644 --- a/convokit/tests/general/test_convo_traversal.py +++ b/convokit/tests/general/test_convo_traversal.py @@ -237,6 +237,7 @@ def test_one_utt_convo(self): def test_reindex_corpus(self): self.setUp() # reinitialize corpus since reindex is destructive + original_meta = {k: v for k, v in self.corpus.get_conversation("0").meta.to_dict().items()} new_convo_conversation_ids = ["1", "2", "3"] new_corpus = self.corpus.reindex_conversations(new_convo_conversation_ids) # checking for correct number of conversations and utterances @@ -245,9 +246,7 @@ def test_reindex_corpus(self): # checking that corpus and conversation metadata was preserved for convo in new_corpus.iter_conversations(): - self.assertEqual( - convo.meta["original_convo_meta"], self.corpus.get_conversation("0").meta - ) + self.assertEqual(convo.meta["original_convo_meta"], original_meta) self.assertEqual(self.corpus.meta, new_corpus.meta) From c450cd3a875a30e188410813b48466aa6cd60cb5 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 15:22:35 -0400 Subject: [PATCH 11/21] update reindex test to account for source corpus destruction --- convokit/tests/general/test_convo_traversal.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/convokit/tests/general/test_convo_traversal.py b/convokit/tests/general/test_convo_traversal.py index 9fdad66c..63f87b90 100644 --- a/convokit/tests/general/test_convo_traversal.py +++ b/convokit/tests/general/test_convo_traversal.py @@ -237,7 +237,10 @@ def test_one_utt_convo(self): def test_reindex_corpus(self): self.setUp() # reinitialize corpus since reindex is destructive - original_meta = {k: v for k, v in self.corpus.get_conversation("0").meta.to_dict().items()} + original_convo_meta = { + k: v for k, v in self.corpus.get_conversation("0").meta.to_dict().items() + } + original_corpus_meta = {k: v for k, v in self.corpus.meta} new_convo_conversation_ids = ["1", "2", "3"] new_corpus = self.corpus.reindex_conversations(new_convo_conversation_ids) # checking for correct number of conversations and utterances @@ -246,9 +249,9 @@ def test_reindex_corpus(self): # checking that corpus and conversation metadata was preserved for convo in new_corpus.iter_conversations(): - self.assertEqual(convo.meta["original_convo_meta"], original_meta) + self.assertEqual(convo.meta["original_convo_meta"], original_convo_meta) - self.assertEqual(self.corpus.meta, new_corpus.meta) + self.assertEqual(original_corpus_meta, new_corpus.meta) def test_reindex_corpus2(self): self.setUp() # reinitialize corpus since reindex is destructive From a580a50609e77fd89745ad609002e2df6aa9cdf4 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 15:28:05 -0400 Subject: [PATCH 12/21] reindex test fix --- convokit/tests/general/test_convo_traversal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convokit/tests/general/test_convo_traversal.py b/convokit/tests/general/test_convo_traversal.py index 63f87b90..7ec2e941 100644 --- a/convokit/tests/general/test_convo_traversal.py +++ b/convokit/tests/general/test_convo_traversal.py @@ -240,7 +240,7 @@ def test_reindex_corpus(self): original_convo_meta = { k: v for k, v in self.corpus.get_conversation("0").meta.to_dict().items() } - original_corpus_meta = {k: v for k, v in self.corpus.meta} + original_corpus_meta = {k: v for k, v in self.corpus.meta.to_dict().items()} new_convo_conversation_ids = ["1", "2", "3"] new_corpus = self.corpus.reindex_conversations(new_convo_conversation_ids) # checking for correct number of conversations and utterances From 40b1011dc28242c2016e30c8412d5eb7beefed43 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Tue, 12 Jul 2022 16:07:12 -0400 Subject: [PATCH 13/21] create new storage integrity tests --- convokit/tests/general/test_merge_corpus.py | 70 +++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/convokit/tests/general/test_merge_corpus.py b/convokit/tests/general/test_merge_corpus.py index 11d7aba9..3c958318 100644 --- a/convokit/tests/general/test_merge_corpus.py +++ b/convokit/tests/general/test_merge_corpus.py @@ -23,10 +23,23 @@ def test_no_overlap(self): ] ) + all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) + all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) + merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 6) self.assertEqual(len(list(merged.iter_speakers())), 6) + for utt_id in all_utt_ids: + self.assertTrue(utt_id in merged.storage.get_collection_ids("utterance")) + for speaker_id in all_speaker_ids: + self.assertTrue(speaker_id in merged.storage.get_collection_ids("speaker")) + + for collection in corpus1.storage.data.values(): + self.assertEqual(len(collection), 0) + for collection in corpus2.storage.data.values(): + self.assertEqual(len(collection), 0) + def test_with_overlap(self): """ Basic merge: with overlap in utterance id (but utterance has same data & metadata) @@ -47,10 +60,23 @@ def test_with_overlap(self): ] ) + all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) + all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) + merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) + for utt_id in all_utt_ids: + self.assertTrue(utt_id in merged.storage.get_collection_ids("utterance")) + for speaker_id in all_speaker_ids: + self.assertTrue(speaker_id in merged.storage.get_collection_ids("speaker")) + + for collection in corpus1.storage.data.values(): + self.assertEqual(len(collection), 0) + for collection in corpus2.storage.data.values(): + self.assertEqual(len(collection), 0) + def test_overlap_diff_data(self): """ Merge with overlap in utterance id and utterance has diff data but same metadata @@ -73,6 +99,9 @@ def test_overlap_diff_data(self): ] ) + all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) + all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) + merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) @@ -82,6 +111,21 @@ def test_overlap_diff_data(self): self.assertEqual(merged.get_utterance("2").text, "this is a test") self.assertEqual(merged.get_utterance("2").speaker.id, "charlie") + for utt_id in all_utt_ids: + self.assertTrue(utt_id in merged.storage.get_collection_ids("utterance")) + for speaker_id in all_speaker_ids: + if ( + speaker_id == "candace" + ): # this speaker shouldn't be present due to overlap prioritization + self.assertFalse(speaker_id in merged.storage.get_collection_ids("speaker")) + else: + self.assertTrue(speaker_id in merged.storage.get_collection_ids("speaker")) + + for collection in corpus1.storage.data.values(): + self.assertEqual(len(collection), 0) + for collection in corpus2.storage.data.values(): + self.assertEqual(len(collection), 0) + def test_overlap_diff_metadata(self): """ Merge with overlap in utterance id and utterance has same data but diff metadata @@ -114,6 +158,9 @@ def test_overlap_diff_metadata(self): ] ) + all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) + all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) + merged = corpus1.merge(corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) @@ -121,6 +168,18 @@ def test_overlap_diff_metadata(self): self.assertEqual(len(merged.get_utterance("2").meta), 3) self.assertEqual(merged.get_utterance("2").meta["the"], "ringo") + for utt_id in all_utt_ids: + self.assertTrue(utt_id in merged.storage.get_collection_ids("utterance")) + self.assertTrue(f"utterance_{utt_id}" in merged.storage.get_collection_ids("meta")) + for speaker_id in all_speaker_ids: + self.assertTrue(speaker_id in merged.storage.get_collection_ids("speaker")) + self.assertTrue(f"speaker_{speaker_id}" in merged.storage.get_collection_ids("meta")) + + for collection in corpus1.storage.data.values(): + self.assertEqual(len(collection), 0) + for collection in corpus2.storage.data.values(): + self.assertEqual(len(collection), 0) + def test_overlap_convo_metadata(self): """ Merge with overlap in conversation with metadata differences. @@ -181,6 +240,14 @@ def test_overlap_convo_metadata(self): self.assertEqual(len(merged.get_conversation("convo1").meta), 3) self.assertEqual(merged.get_conversation("convo1").meta["hello"], "food") + self.assertTrue("convo1" in merged.storage.get_collection_ids("conversation")) + self.assertTrue("conversation_convo1" in merged.storage.get_collection_ids("meta")) + + self.assertFalse("convo1" in corpus1.storage.get_collection_ids("conversation")) + self.assertFalse("convo1" in corpus2.storage.get_collection_ids("conversation")) + self.assertFalse("conversation_convo1" in corpus1.storage.get_collection_ids("meta")) + self.assertFalse("conversation_convo1" in corpus2.storage.get_collection_ids("meta")) + def test_corpus_metadata(self): """ Merge with overlap in corpus metadata @@ -243,6 +310,9 @@ def test_add_utterance(self): self.assertEqual(len(added.get_utterance("2").meta), 3) self.assertEqual(added.get_utterance("2").meta["hello"], "food") + for utt in utts: + self.assertFalse(hasattr(utt, "_temp_storage")) + if __name__ == "__main__": unittest.main() From 7dd6698246c304837b56ac980ecd0845d7e4b952 Mon Sep 17 00:00:00 2001 From: calebchiam Date: Wed, 13 Jul 2022 00:29:31 +0300 Subject: [PATCH 14/21] black fmt setup --- setup.py | 61 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index af13b52e..23672fd5 100644 --- a/setup.py +++ b/setup.py @@ -7,34 +7,39 @@ url="https://github.com/CornellNLP/ConvoKit", description="ConvoKit", version="2.6.0", - packages=["convokit", - "convokit.bag_of_words", - "convokit.classifier", - "convokit.coordination", - "convokit.fighting_words", - "convokit.forecaster", - "convokit.forecaster.CRAFT", - "convokit.hyperconvo", - "convokit.model", - "convokit.paired_prediction", - "convokit.phrasing_motifs", - "convokit.politeness_collections", - "convokit.politeness_collections.politeness_api", - "convokit.politeness_collections.politeness_api.features", - "convokit.politeness_collections.politeness_local", - "convokit.politeness_collections.politeness_cscw_zh", - "convokit.politenessStrategies", - "convokit.prompt_types", - "convokit.ranker", - "convokit.text_processing", - "convokit.speaker_convo_helpers", - "convokit.speakerConvoDiversity", - "convokit.expected_context_framework", - "convokit.surprise" - ], - package_data={"convokit": ["data/*.txt", - "politeness_collections/politeness_local/lexicons/*.json", - "politeness_collections/politeness_cscw_zh/lexicons/*.json"]}, + packages=[ + "convokit", + "convokit.bag_of_words", + "convokit.classifier", + "convokit.coordination", + "convokit.fighting_words", + "convokit.forecaster", + "convokit.forecaster.CRAFT", + "convokit.hyperconvo", + "convokit.model", + "convokit.paired_prediction", + "convokit.phrasing_motifs", + "convokit.politeness_collections", + "convokit.politeness_collections.politeness_api", + "convokit.politeness_collections.politeness_api.features", + "convokit.politeness_collections.politeness_local", + "convokit.politeness_collections.politeness_cscw_zh", + "convokit.politenessStrategies", + "convokit.prompt_types", + "convokit.ranker", + "convokit.text_processing", + "convokit.speaker_convo_helpers", + "convokit.speakerConvoDiversity", + "convokit.expected_context_framework", + "convokit.surprise", + ], + package_data={ + "convokit": [ + "data/*.txt", + "politeness_collections/politeness_local/lexicons/*.json", + "politeness_collections/politeness_cscw_zh/lexicons/*.json", + ] + }, install_requires=[ "matplotlib>=3.0.0", "pandas>=0.23.4", From 456dc1b6d46d3778ad094f9d3058eb43c0d159e1 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Mon, 18 Jul 2022 11:31:02 -0400 Subject: [PATCH 15/21] Merge, filter, and reindex now static --- convokit/model/corpus.py | 159 ++++++++++-------- convokit/model/corpusComponent.py | 3 + .../tests/general/test_convo_traversal.py | 9 +- convokit/tests/general/test_merge_corpus.py | 22 ++- 4 files changed, 116 insertions(+), 77 deletions(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 66776c9f..b20c424d 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -629,41 +629,45 @@ def filter_conversations_by(self, selector: Callable[[Conversation], bool]): return self - def filter_utterances_by(self, selector: Callable[[Utterance], bool]): + @staticmethod + def filter_corpus_utterances(source_corpus: "Corpus", selector: Callable[[Utterance], bool]): """ - Returns a new corpus that includes only a subset of Utterances within this Corpus. This filtering provides no + Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no guarantees with regard to maintaining conversational integrity and should be used with care. - Vectors are not preserved. The original corpus will be invalidated and no longer usable. + Vectors are not preserved. The source corpus will be invalidated and no longer usable. + :param source_corpus: the Corpus to subset from :param selector: function for selecting which :return: a new Corpus with a subset of the Utterances """ - utts = list(self.iter_utterances(selector)) + utts = list(source_corpus.iter_utterances(selector)) new_corpus = Corpus(utterances=utts) for convo in new_corpus.iter_conversations(): - convo.meta.update(self.get_conversation(convo.id).meta) + convo.meta.update(source_corpus.get_conversation(convo.id).meta) # original Corpus is invalidated and no longer usable; clear all data from # its now-orphaned StorageManager to avoid having duplicates in memory - self.storage.clear_all_data() + source_corpus.storage.clear_all_data() return new_corpus + @staticmethod def reindex_conversations( - self, + source_corpus: "Corpus", new_convo_roots: List[str], preserve_corpus_meta: bool = True, preserve_convo_meta: bool = True, verbose=True, ) -> "Corpus": """ - Generates a new Corpus from current Corpus with specified list of utterance ids to use as conversation ids. + Generates a new Corpus from source Corpus with specified list of utterance ids to use as conversation ids. The subtrees denoted by these utterance ids should be distinct and should not overlap, otherwise there may be unexpected behavior. - Vectors are not preserved. The original Corpus will be invalidated and no longer usable. + Vectors are not preserved. The source Corpus will be invalidated and no longer usable. + :param source_corpus: the Corpus containing the original data to select from :param new_convo_roots: List of utterance ids to use as conversation ids :param preserve_corpus_meta: set as True to copy original Corpus metadata to new Corpus :param preserve_convo_meta: set as True to copy original Conversation metadata to new Conversation metadata @@ -672,7 +676,7 @@ def reindex_conversations( :return: new Corpus with reindexed Conversations """ "" new_convo_roots = set(new_convo_roots) - for convo in self.iter_conversations(): + for convo in source_corpus.iter_conversations(): try: convo.initialize_tree_structure() except ValueError as e: @@ -683,7 +687,9 @@ def reindex_conversations( original_utt_to_convo_id = dict() for utt_id in new_convo_roots: - orig_convo = self.get_conversation(self.get_utterance(utt_id).conversation_id) + orig_convo = source_corpus.get_conversation( + source_corpus.get_utterance(utt_id).conversation_id + ) original_utt_to_convo_id[utt_id] = orig_convo.id try: subtree = orig_convo.get_subtree(utt_id) @@ -699,11 +705,11 @@ def reindex_conversations( new_corpus = Corpus(utterances=new_corpus_utts) if preserve_corpus_meta: - new_corpus.meta.update(self.meta) + new_corpus.meta.update(source_corpus.meta) if preserve_convo_meta: for convo in new_corpus.iter_conversations(): - convo.meta["original_convo_meta"] = self.get_conversation( + convo.meta["original_convo_meta"] = source_corpus.get_conversation( original_utt_to_convo_id[convo.id] ).meta.to_dict() convo.meta["original_convo_id"] = original_utt_to_convo_id[convo.id] @@ -717,7 +723,7 @@ def reindex_conversations( # original Corpus is invalidated and no longer usable; clear all data from # its now-orphaned StorageManager to avoid having duplicates in memory - self.storage.clear_all_data() + source_corpus.storage.clear_all_data() return new_corpus @@ -940,51 +946,53 @@ def reinitialize_index(self): new_index.version = old_index.version self.meta_index = new_index - def merge(self, other_corpus, warnings: bool = True): + @staticmethod + def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True): """ - Merges this corpus with another corpus. + Merges primary and secondary, creating a new Corpus with their combined data. - Utterances with the same id must share the same data, otherwise the other corpus utterance data & metadata + Utterances with the same id must share the same data. In case of conflicts, + primary will take precedence and the conflicting Utterance from secondary will be ignored. A warning is printed when this happens. - If metadata of this corpus (or its conversations / utterances) shares a key with the metadata of the - other corpus, the other corpus's metadata (or its conversations / utterances) values will be used. A warning + If metadata of primary (or its conversations / utterances) shares a key with the metadata of the + secondary, secondary's metadata (or its conversations / utterances) values will be used. A warning is printed when this happens. - Will invalidate original and other corpus in the process. + Will invalidate primary and secondary in the process. - (Updates internal ConvoKit Index to match post-merge state and uses this Corpus's version number.) + The resulting Corpus will inherit primary's id and version number. :param other_corpus: Corpus :param warnings: print warnings when data conflicts are encountered :return: new Corpus constructed from combined lists of utterances """ - utts1 = list(self.iter_utterances()) - utts2 = list(other_corpus.iter_utterances()) - combined_utts = self._merge_utterances(utts1, utts2, warnings=warnings) + utts1 = list(primary.iter_utterances()) + utts2 = list(secondary.iter_utterances()) + combined_utts = primary._merge_utterances(utts1, utts2, warnings=warnings) new_corpus = Corpus(utterances=list(combined_utts)) # Note that we collect Speakers from the utt sets directly instead of the combined utts, otherwise # differences in Speaker meta will not be registered for duplicate Utterances (because utts would be discarded # during merging) - speakers_meta, speakers_meta_conflict = self._collect_speaker_data([utts1, utts2]) + speakers_meta, speakers_meta_conflict = primary._collect_speaker_data([utts1, utts2]) Corpus._update_corpus_speaker_data( new_corpus, speakers_meta, speakers_meta_conflict, warnings=warnings ) # Merge CORPUS metadata - new_corpus.meta.reinitialize_from_other(self.meta) - for key, val in other_corpus.meta.items(): + new_corpus.meta.reinitialize_from_other(primary.meta) + for key, val in secondary.meta.items(): if key in new_corpus.meta and new_corpus.meta[key] != val: if warnings: warn( - "Found conflicting values for Corpus metadata key: {}. " - "Overwriting with other Corpus's metadata.".format(repr(key)) + "Found conflicting values for primary Corpus metadata key: {}. " + "Overwriting with secondary Corpus's metadata.".format(repr(key)) ) new_corpus.meta[key] = val # Merge CONVERSATION metadata - convos1 = self.iter_conversations() - convos2 = other_corpus.iter_conversations() + convos1 = primary.iter_conversations() + convos2 = secondary.iter_conversations() for convo in convos1: new_corpus.get_conversation(convo.id).meta.reinitialize_from_other(convo.meta) @@ -996,7 +1004,7 @@ def merge(self, other_corpus, warnings: bool = True): if warnings: warn( "Found conflicting values for Conversation {} for metadata key: {}. " - "Overwriting with other corpus's Conversation metadata.".format( + "Overwriting with secondary corpus's Conversation metadata.".format( repr(convo.id), repr(key) ) ) @@ -1008,8 +1016,8 @@ def merge(self, other_corpus, warnings: bool = True): # source corpora are now invalidated and all needed data has been copied # into the new merged corpus; clear the source corpora's storage to # prevent having duplicates in memory - self.storage.clear_all_data() - other_corpus.storage.clear_all_data() + primary.storage.clear_all_data() + secondary.storage.clear_all_data() return new_corpus @@ -1029,40 +1037,57 @@ def add_utterances(self, utterances=List[Utterance], warnings: bool = False, wit :return: a Corpus with the utterances from this Corpus and the input utterances combined """ if with_checks: - helper_corpus = Corpus(utterances=utterances) - return self.merge(helper_corpus, warnings=warnings) - else: - new_speakers = {u.speaker.id: u.speaker for u in utterances} - new_utterances = {u.id: u for u in utterances} - for speaker in new_speakers.values(): - speaker.owner = self - for utt in new_utterances.values(): - utt.owner = self - - # update corpus speakers - for new_speaker_id, new_speaker in new_speakers.items(): - if new_speaker_id not in self.speakers: - self.speakers[new_speaker_id] = new_speaker - - # update corpus utterances + (link speaker -> utt) - for new_utt_id, new_utt in new_utterances.items(): - if new_utt_id not in self.utterances: - self.utterances[new_utt_id] = new_utt - self.speakers[new_utt.speaker.id]._add_utterance(new_utt) - - # update corpus conversations + (link convo <-> utt) - new_convos = defaultdict(list) - for utt in new_utterances.values(): - if utt.conversation_id in self.conversations: + # leverage the merge method's _merge_utterances method to run the checks + # (but then run a subsequent filtering operation since we aren't actually doing a merge) + added_utt_ids = {utt.id for utt in utterances} + combined_utts = self._merge_utterances( + list(self.iter_utterances()), utterances, warnings + ) + speakers_meta, speakers_meta_conflict = self._collect_speaker_data( + [list(self.iter_utterances()), utterances] + ) + utterances = [utt for utt in combined_utts if utt.id in added_utt_ids] + + new_speakers = {u.speaker.id: u.speaker for u in utterances} + new_utterances = {u.id: u for u in utterances} + for speaker in new_speakers.values(): + speaker.owner = self + for utt in new_utterances.values(): + utt.owner = self + + # update corpus speakers + for new_speaker_id, new_speaker in new_speakers.items(): + if new_speaker_id not in self.speakers: + self.speakers[new_speaker_id] = new_speaker + + # update corpus utterances + (link speaker -> utt) + for new_utt_id, new_utt in new_utterances.items(): + if new_utt_id not in self.utterances: + self.utterances[new_utt_id] = new_utt + self.speakers[new_utt.speaker.id]._add_utterance(new_utt) + + # update corpus conversations + (link convo <-> utt) + new_convos = defaultdict(list) + for utt in new_utterances.values(): + if utt.conversation_id in self.conversations: + if (not with_checks) or ( + utt.id not in self.conversations[utt.conversation_id].get_utterance_ids() + ): self.conversations[utt.conversation_id]._add_utterance(utt) - else: - new_convos[utt.conversation_id].append(utt.id) - for convo_id, convo_utts in new_convos.items(): - new_convo = Conversation(owner=self, id=convo_id, utterances=convo_utts, meta=None) - self.conversations[convo_id] = new_convo - # (link speaker -> convo) - new_convo_speaker = self.speakers[new_convo.get_utterance(convo_id).speaker.id] - new_convo_speaker._add_conversation(new_convo) + else: + new_convos[utt.conversation_id].append(utt.id) + for convo_id, convo_utts in new_convos.items(): + new_convo = Conversation(owner=self, id=convo_id, utterances=convo_utts, meta=None) + self.conversations[convo_id] = new_convo + # (link speaker -> convo) + new_convo_speaker = self.speakers[new_convo.get_utterance(convo_id).speaker.id] + new_convo_speaker._add_conversation(new_convo) + + # update speaker metadata (only in cases of conflict) + if with_checks: + Corpus._update_corpus_speaker_data( + self, speakers_meta, speakers_meta_conflict, warnings + ) return self diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py index effb789d..929fc24f 100644 --- a/convokit/model/corpusComponent.py +++ b/convokit/model/corpusComponent.py @@ -38,6 +38,9 @@ def get_owner(self): return self._owner def set_owner(self, owner): + if owner is self._owner: + # no action needed + return # stash the metadata first since reassigning self._owner will break its storage connection meta_vals = {k: v for k, v in self.meta.items()} previous_owner = self._owner diff --git a/convokit/tests/general/test_convo_traversal.py b/convokit/tests/general/test_convo_traversal.py index 7ec2e941..b96685cf 100644 --- a/convokit/tests/general/test_convo_traversal.py +++ b/convokit/tests/general/test_convo_traversal.py @@ -242,7 +242,7 @@ def test_reindex_corpus(self): } original_corpus_meta = {k: v for k, v in self.corpus.meta.to_dict().items()} new_convo_conversation_ids = ["1", "2", "3"] - new_corpus = self.corpus.reindex_conversations(new_convo_conversation_ids) + new_corpus = Corpus.reindex_conversations(self.corpus, new_convo_conversation_ids) # checking for correct number of conversations and utterances self.assertEqual(len(list(new_corpus.iter_conversations())), 3) self.assertEqual(len(list(new_corpus.iter_utterances())), 11) @@ -256,8 +256,11 @@ def test_reindex_corpus(self): def test_reindex_corpus2(self): self.setUp() # reinitialize corpus since reindex is destructive new_convo_conversation_ids = ["1", "2", "3"] - new_corpus = self.corpus.reindex_conversations( - new_convo_conversation_ids, preserve_convo_meta=False, preserve_corpus_meta=False + new_corpus = Corpus.reindex_conversations( + self.corpus, + new_convo_conversation_ids, + preserve_convo_meta=False, + preserve_corpus_meta=False, ) # checking for correct number of conversations and utterances self.assertEqual(len(list(new_corpus.iter_conversations())), 3) diff --git a/convokit/tests/general/test_merge_corpus.py b/convokit/tests/general/test_merge_corpus.py index 3c958318..23b25c42 100644 --- a/convokit/tests/general/test_merge_corpus.py +++ b/convokit/tests/general/test_merge_corpus.py @@ -26,7 +26,7 @@ def test_no_overlap(self): all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(list(merged.iter_utterances())), 6) self.assertEqual(len(list(merged.iter_speakers())), 6) @@ -63,7 +63,7 @@ def test_with_overlap(self): all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) @@ -102,7 +102,7 @@ def test_overlap_diff_data(self): all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) self.assertEqual(len(list(corpus1.iter_utterances())), 3) @@ -161,7 +161,7 @@ def test_overlap_diff_metadata(self): all_utt_ids = set(corpus1.get_utterance_ids()) | set(corpus2.get_utterance_ids()) all_speaker_ids = set(corpus1.get_speaker_ids()) | set(corpus2.get_speaker_ids()) - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(list(merged.iter_utterances())), 5) self.assertEqual(len(list(merged.iter_speakers())), 5) @@ -236,7 +236,7 @@ def test_overlap_convo_metadata(self): corpus2.get_conversation("convo1").add_meta("hello", "food") corpus2.get_conversation("convo1").add_meta("what", "a mood") - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(merged.get_conversation("convo1").meta), 3) self.assertEqual(merged.get_conversation("convo1").meta["hello"], "food") @@ -276,7 +276,7 @@ def test_corpus_metadata(self): corpus2.add_meta("toxicity", 0.9) corpus2.add_meta("paggro", 1.0) - merged = corpus1.merge(corpus2) + merged = Corpus.merge(corpus1, corpus2) self.assertEqual(len(merged.meta), 3) self.assertEqual(merged.meta["toxicity"], 0.9) @@ -306,11 +306,19 @@ def test_add_utterance(self): ] added = corpus1.add_utterances(utts) + self.assertIs(added, corpus1) self.assertEqual(len(list(added.iter_utterances())), 4) + self.assertEqual(set(added.get_utterance_ids()), {"0", "1", "2", "5"}) + self.assertEqual(set(added.get_speaker_ids()), {"alice", "bob", "charlie", "foxtrot"}) self.assertEqual(len(added.get_utterance("2").meta), 3) self.assertEqual(added.get_utterance("2").meta["hello"], "food") + self.assertIn("what", added.get_utterance("2").meta) + self.assertEqual(added.get_utterance("1").text, "my name is bob") + self.assertEqual(added.get_utterance("1").speaker.id, "bob") + self.assertEqual(added.get_utterance("5").text, "goodbye") + self.assertEqual(added.get_utterance("5").speaker.id, "foxtrot") - for utt in utts: + for utt in added.iter_utterances(): self.assertFalse(hasattr(utt, "_temp_storage")) From c4398b2821a972a8aa6ab4bea7e4fd4d592abfa7 Mon Sep 17 00:00:00 2001 From: Oscar Date: Tue, 19 Jul 2022 14:25:22 +0800 Subject: [PATCH 16/21] boilerplate template for adding speaker and utterances individually --- convokit/model/corpus.py | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 66776c9f..0508206c 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -180,6 +180,10 @@ def __init__( speaker.owner = self for _, utt in self.utterances.items(): utt.owner = self + + else: #initialize empty Corpus + self.utterances = dict() + self.speakers = dict() if merge_lines: self.utterances = merge_utterance_lines(self.utterances) @@ -190,6 +194,43 @@ def __init__( self.meta_index.enable_type_check() self.update_speakers_data() + def create_speaker( + self, + owner=None, + id: str = None, + name: str = None, + utts=None, + convos=None, + meta: Optional[Dict] = None, + ): + speaker = Speaker(owner=owner,id=id,name=name,utts=utts,) + self.speakers[speaker.id] = speaker + speaker.owner = self + # self.conversations = initialize_conversations(self, self.utterances, convos_data) + self.meta_index.enable_type_check() + self.update_speakers_data() + + def create_utterance( + self, + owner=None, + id: Optional[str] = None, + speaker: Optional[Speaker] = None, + user: Optional[Speaker] = None, + conversation_id: Optional[str] = None, + root: Optional[str] = None, + reply_to: Optional[str] = None, + timestamp: Optional[int] = None, + text: str = "", + meta: Optional[Dict] = None, + ): + utt = Utterance(owner=owner,id=id,speaker=speaker,user=user,conversation_id=conversation_id,root=root,reply_to=reply_to,timestamp=timestamp,text=text,meta=meta) + self.utterances[utt.id] = utt + utt.owner = self + # self.conversations = initialize_conversations(self, self.utterances, convos_data) + self.meta_index.enable_type_check() + self.update_speakers_data() + + @property def vectors(self): return self.meta_index.vectors From 1d262e32008b1d7da877ce11606990593fc39488 Mon Sep 17 00:00:00 2001 From: Oscar Date: Tue, 19 Jul 2022 14:30:08 +0800 Subject: [PATCH 17/21] black format --- convokit/model/corpus.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index f690c1d4..f992fc58 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -180,8 +180,8 @@ def __init__( speaker.owner = self for _, utt in self.utterances.items(): utt.owner = self - - else: #initialize empty Corpus + + else: # initialize empty Corpus self.utterances = dict() self.speakers = dict() @@ -203,13 +203,18 @@ def create_speaker( convos=None, meta: Optional[Dict] = None, ): - speaker = Speaker(owner=owner,id=id,name=name,utts=utts,) + speaker = Speaker( + owner=owner, + id=id, + name=name, + utts=utts, + ) self.speakers[speaker.id] = speaker speaker.owner = self # self.conversations = initialize_conversations(self, self.utterances, convos_data) self.meta_index.enable_type_check() self.update_speakers_data() - + def create_utterance( self, owner=None, @@ -223,14 +228,24 @@ def create_utterance( text: str = "", meta: Optional[Dict] = None, ): - utt = Utterance(owner=owner,id=id,speaker=speaker,user=user,conversation_id=conversation_id,root=root,reply_to=reply_to,timestamp=timestamp,text=text,meta=meta) + utt = Utterance( + owner=owner, + id=id, + speaker=speaker, + user=user, + conversation_id=conversation_id, + root=root, + reply_to=reply_to, + timestamp=timestamp, + text=text, + meta=meta, + ) self.utterances[utt.id] = utt utt.owner = self # self.conversations = initialize_conversations(self, self.utterances, convos_data) self.meta_index.enable_type_check() self.update_speakers_data() - @property def vectors(self): return self.meta_index.vectors From 64b49d25b828236dec059056a8430a3a3dd4600a Mon Sep 17 00:00:00 2001 From: Oscar Date: Tue, 19 Jul 2022 15:05:32 +0800 Subject: [PATCH 18/21] remove extra work --- convokit/model/corpus.py | 56 ---------------------------------------- 1 file changed, 56 deletions(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index f992fc58..b20c424d 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -181,10 +181,6 @@ def __init__( for _, utt in self.utterances.items(): utt.owner = self - else: # initialize empty Corpus - self.utterances = dict() - self.speakers = dict() - if merge_lines: self.utterances = merge_utterance_lines(self.utterances) @@ -194,58 +190,6 @@ def __init__( self.meta_index.enable_type_check() self.update_speakers_data() - def create_speaker( - self, - owner=None, - id: str = None, - name: str = None, - utts=None, - convos=None, - meta: Optional[Dict] = None, - ): - speaker = Speaker( - owner=owner, - id=id, - name=name, - utts=utts, - ) - self.speakers[speaker.id] = speaker - speaker.owner = self - # self.conversations = initialize_conversations(self, self.utterances, convos_data) - self.meta_index.enable_type_check() - self.update_speakers_data() - - def create_utterance( - self, - owner=None, - id: Optional[str] = None, - speaker: Optional[Speaker] = None, - user: Optional[Speaker] = None, - conversation_id: Optional[str] = None, - root: Optional[str] = None, - reply_to: Optional[str] = None, - timestamp: Optional[int] = None, - text: str = "", - meta: Optional[Dict] = None, - ): - utt = Utterance( - owner=owner, - id=id, - speaker=speaker, - user=user, - conversation_id=conversation_id, - root=root, - reply_to=reply_to, - timestamp=timestamp, - text=text, - meta=meta, - ) - self.utterances[utt.id] = utt - utt.owner = self - # self.conversations = initialize_conversations(self, self.utterances, convos_data) - self.meta_index.enable_type_check() - self.update_speakers_data() - @property def vectors(self): return self.meta_index.vectors From 03001c73ae7a902fccc522c9f6618494e58c9d7d Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Wed, 20 Jul 2022 14:36:23 -0400 Subject: [PATCH 19/21] Minor style changes --- convokit/model/convoKitMeta.py | 10 ++++++++-- convokit/model/corpus.py | 17 +++++++++-------- setup.py | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/convokit/model/convoKitMeta.py b/convokit/model/convoKitMeta.py index 832fc18a..299047ad 100644 --- a/convokit/model/convoKitMeta.py +++ b/convokit/model/convoKitMeta.py @@ -2,9 +2,11 @@ from collections.abc import MutableMapping except: from collections import MutableMapping +from numpy import isin from convokit.util import warn from .convoKitIndex import ConvoKitIndex import json +from typing import Union # See reference: https://stackoverflow.com/questions/7760916/correct-usage-of-a-getter-setter-for-dictionary-values @@ -97,12 +99,16 @@ def __repr__(self) -> str: def to_dict(self): return dict(self._get_storage().get_data("meta", self.storage_key)) - def reinitialize_from_other(self, other): + def reinitialize_from(self, other: Union["ConvoKitMeta", dict]): """ Reinitialize this ConvoKitMeta instance with the data from other """ - if type(other) == ConvoKitMeta: + if isinstance(other, ConvoKitMeta): other = {k: v for k, v in other.to_dict().items()} + elif not isinstance(other, dict): + raise TypeError( + "ConvoKitMeta can only be reinitialized from a dict instance or another ConvoKitMeta" + ) self._get_storage().initialize_data_for_component( "meta", self.storage_key, overwrite=True, initial_value=other ) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index b20c424d..8e3e7a57 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -630,12 +630,12 @@ def filter_conversations_by(self, selector: Callable[[Conversation], bool]): return self @staticmethod - def filter_corpus_utterances(source_corpus: "Corpus", selector: Callable[[Utterance], bool]): + def filter_utterances(source_corpus: "Corpus", selector: Callable[[Utterance], bool]): """ Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no guarantees with regard to maintaining conversational integrity and should be used with care. - Vectors are not preserved. The source corpus will be invalidated and no longer usable. + Vectors are not preserved. The source corpus will be invalidated and will no longer be usable. :param source_corpus: the Corpus to subset from :param selector: function for selecting which @@ -949,21 +949,22 @@ def reinitialize_index(self): @staticmethod def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True): """ - Merges primary and secondary, creating a new Corpus with their combined data. + Merges two corpora (one primary and one secondary), creating a new Corpus with their combined data. Utterances with the same id must share the same data. In case of conflicts, - primary will take precedence and the conflicting Utterance from secondary + the primary Corpus will take precedence and the conflicting Utterance from secondary will be ignored. A warning is printed when this happens. - If metadata of primary (or its conversations / utterances) shares a key with the metadata of the - secondary, secondary's metadata (or its conversations / utterances) values will be used. A warning + If metadata of the primary Corpus (or its conversations / utterances) shares a key with the metadata of the + secondary Corpus, the secondary's metadata (or its conversations / utterances) values will be used. A warning is printed when this happens. Will invalidate primary and secondary in the process. - The resulting Corpus will inherit primary's id and version number. + The resulting Corpus will inherit the primary Corpus's id and version number. - :param other_corpus: Corpus + :param primary: the primary Corpus + :param secondary: the secondary Corpus :param warnings: print warnings when data conflicts are encountered :return: new Corpus constructed from combined lists of utterances """ diff --git a/setup.py b/setup.py index b130c730..e398fe42 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ author_email="cristian@cs.cornell.edu", url="https://github.com/CornellNLP/ConvoKit", description="ConvoKit", - version="2.6.0", + version="2.5.3", packages=[ "convokit", "convokit.bag_of_words", From 735e275ad7e707d1df60ff874331a299af1caf40 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Wed, 20 Jul 2022 14:42:10 -0400 Subject: [PATCH 20/21] fix outdated references to reinitialize_from_other --- convokit/model/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 8e3e7a57..8f151ff7 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -981,7 +981,7 @@ def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True): ) # Merge CORPUS metadata - new_corpus.meta.reinitialize_from_other(primary.meta) + new_corpus.meta.reinitialize_from(primary.meta) for key, val in secondary.meta.items(): if key in new_corpus.meta and new_corpus.meta[key] != val: if warnings: @@ -996,7 +996,7 @@ def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True): convos2 = secondary.iter_conversations() for convo in convos1: - new_corpus.get_conversation(convo.id).meta.reinitialize_from_other(convo.meta) + new_corpus.get_conversation(convo.id).meta.reinitialize_from(convo.meta) for convo in convos2: for key, val in convo.meta.items(): From 26fb5c0eb7275bace87e6092f04eec39d934f746 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Wed, 20 Jul 2022 15:00:50 -0400 Subject: [PATCH 21/21] __str__ implementation for Speaker, matching style of Utterance and Conversation --- convokit/model/speaker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convokit/model/speaker.py b/convokit/model/speaker.py index 09e27c20..a6392d49 100644 --- a/convokit/model/speaker.py +++ b/convokit/model/speaker.py @@ -176,3 +176,8 @@ def __eq__(self, other): return self.id == other.id except AttributeError: return self.__dict__["_name"] == other.__dict__["_name"] + + def __str__(self): + return "Speaker(id: {}, vectors: {}, meta: {})".format( + repr(self.id), self.vectors, self.meta + )