Skip to content

Commit

Permalink
Deprecate Tags for Document Stores (#286)
Browse files Browse the repository at this point in the history
  • Loading branch information
tanaysoni authored Aug 4, 2020
1 parent 6a10325 commit 5937f9c
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 245 deletions.
20 changes: 7 additions & 13 deletions haystack/database/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ def __init__(self, text: str,
query_score: Optional[float] = None,
question: Optional[str] = None,
meta: Dict[str, Any] = None,
tags: Optional[Dict[str, Any]] = None,
embedding: Optional[List[float]] = None):
"""
Object used to represent documents / passages in a standardized way within Haystack.
Expand All @@ -24,7 +23,6 @@ def __init__(self, text: str,
:param query_score: Retriever's query score for a retrieved document
:param question: Question text for FAQs.
:param meta: Meta fields for a document like name, url, or author.
:param tags: Tags that allow filtering of the data
:param embedding: Vector encoding of the text
"""

Expand All @@ -38,7 +36,6 @@ def __init__(self, text: str,
self.query_score = query_score
self.question = question
self.meta = meta
self.tags = tags # deprecate?
self.embedding = embedding

def to_dict(self):
Expand All @@ -47,7 +44,7 @@ def to_dict(self):
@classmethod
def from_dict(cls, dict):
_doc = dict.copy()
init_args = ["text", "id", "query_score", "question", "meta", "tags", "embedding"]
init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
if "meta" not in _doc.keys():
_doc["meta"] = {}
# copy additional fields into "meta"
Expand Down Expand Up @@ -110,14 +107,15 @@ class BaseDocumentStore(ABC):
Base class for implementing Document Stores.
"""
index: Optional[str]
label_index: Optional[str]

@abstractmethod
def write_documents(self, documents: List[dict], index: Optional[str] = None):
"""
Indexes documents for later queries.
:param documents: List of dictionaries.
Default format: {"text": "<the-actual-text>"}
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta": {"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
Expand All @@ -129,29 +127,25 @@ def write_documents(self, documents: List[dict], index: Optional[str] = None):
pass

@abstractmethod
def get_all_documents(self, index: Optional[str] = None) -> List[Document]:
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
pass

@abstractmethod
def get_all_labels(self, index: str = "label", filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
    """
    Return the labels stored in a label index. Must be implemented by every concrete document store.

    :param index: name of the index to read the labels from.
    :param filters: optional dict mapping a field name to the list of accepted values for that field.
                    The exact matching semantics are defined by the concrete document store.
    :return: list of Label objects
    """
    pass

@abstractmethod
def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
pass

@abstractmethod
def get_document_ids_by_tags(self, tag, index) -> List[str]:
pass

@abstractmethod
def get_document_count(self, index: Optional[str] = None) -> int:
pass

@abstractmethod
def query_by_embedding(self,
                       query_emb: List[float],
                       filters: Optional[Dict[str, List[str]]] = None,
                       top_k: int = 10,
                       index: Optional[str] = None) -> List[Document]:
    """
    Retrieve documents for a query embedding. Must be implemented by every concrete document store.

    :param query_emb: embedding (vector) of the query.
    :param filters: optional dict mapping a field name to the list of accepted values for that field.
                    The exact matching semantics are defined by the concrete document store.
    :param top_k: presumably the maximum number of documents to return -- confirm against the implementations.
    :param index: name of the index to search in; None leaves the choice to the concrete document store.
    :return: list of Document objects
    """
    pass
Expand Down
34 changes: 13 additions & 21 deletions haystack/database/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from haystack.database.base import BaseDocumentStore, Document, Label
from haystack.indexing.utils import eval_data_from_file
from haystack.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -89,7 +90,7 @@ def __init__(
self.index: str = index

self._create_label_index(label_index)
self.label_index = label_index
self.label_index: str = label_index
self.update_existing_documents = update_existing_documents

def _create_document_index(self, index_name):
Expand Down Expand Up @@ -136,17 +137,6 @@ def get_document_by_id(self, id: str, index=None) -> Optional[Document]:
document = self._convert_es_hit_to_document(result[0]) if result else None
return document

def get_document_ids_by_tags(self, tags: dict, index: Optional[str]) -> List[str]:
index = index or self.index
term_queries = [{"terms": {key: value}} for key, value in tags.items()]
query = {"query": {"bool": {"must": term_queries}}}
logger.debug(f"Tag filter query: {query}")
result = self.client.search(index=index, body=query, size=10000)["hits"]["hits"]
doc_ids = []
for hit in result:
doc_ids.append(hit["_id"])
return doc_ids

def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
"""
Indexes documents for later queries in Elasticsearch.
Expand Down Expand Up @@ -198,7 +188,8 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
documents_to_index.append(_doc)
bulk(self.client, documents_to_index, request_timeout=300, refresh="wait_for")

def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = "label"):
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
index = index or self.label_index
if index and not self.client.indices.exists(index=index):
self._create_label_index(index)

Expand Down Expand Up @@ -230,7 +221,7 @@ def get_document_count(self, index: Optional[str] = None) -> int:
def get_label_count(self, index: Optional[str] = None) -> int:
return self.get_document_count(index=index)

def get_all_documents(self, index: Optional[str] = None, filters: Optional[dict] = None) -> List[Document]:
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
if index is None:
index = self.index

Expand All @@ -239,12 +230,13 @@ def get_all_documents(self, index: Optional[str] = None, filters: Optional[dict]

return documents

def get_all_labels(self, index: str = "label", filters: Optional[dict] = None) -> List[Label]:
def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
index = index or self.label_index
result = self.get_all_documents_in_index(index=index, filters=filters)
labels = [Label.from_dict(hit["_source"]) for hit in result]
return labels

def get_all_documents_in_index(self, index: str, filters: Optional[dict] = None) -> List[dict]:
def get_all_documents_in_index(self, index: str, filters: Optional[Dict[str, List[str]]] = None) -> List[dict]:
body = {
"query": {
"bool": {
Expand Down Expand Up @@ -346,7 +338,7 @@ def query(

def query_by_embedding(self,
query_emb: np.array,
filters: Optional[dict] = None,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:
if index is None:
Expand Down Expand Up @@ -392,7 +384,7 @@ def query_by_embedding(self,

def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
# We put all additional data of the doc into meta_data and return it in the API
meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field, "tags")}
meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
meta_data["name"] = meta_data.pop(self.name_field, None)

document = Document(
Expand All @@ -401,7 +393,6 @@ def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> D
meta=meta_data,
query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
question=hit["_source"].get(self.faq_question_field),
tags=hit["_source"].get("tags"),
embedding=hit["_source"].get(self.embedding_field)
)
return document
Expand All @@ -420,12 +411,13 @@ def describe_documents(self, index=None):
}
return stats

def update_embeddings(self, retriever, index=None):
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
"""
Updates the embeddings in the document store using the encoding model specified in the retriever.
This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
:param retriever: Retriever
:param index: Index name to update
:return: None
"""
if index is None:
Expand All @@ -439,7 +431,7 @@ def update_embeddings(self, retriever, index=None):

#TODO Index embeddings every X batches to avoid OOM for huge document collections
logger.info(f"Updating embeddings for {len(passages)} docs ...")
embeddings = retriever.embed_passages(passages)
embeddings = retriever.embed_passages(passages) # type: ignore

assert len(docs) == len(embeddings)

Expand Down
89 changes: 32 additions & 57 deletions haystack/database/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""

def __init__(self, embedding_field: Optional[str] = None):
self.doc_tags: Dict[str, Any] = {}
self.indexes: Dict[str, Dict] = defaultdict(dict)
self.index: str = "document"
self.label_index: str = "label"
Expand All @@ -22,10 +21,11 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
Indexes documents for later queries.
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
Optionally, you can also supply "tags": ["one-tag", "another-one"]
or additional meta data via "meta": {"name": "<some-document-name>, "author": "someone", "url":"some-url" ...}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta": {"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
:param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
separate index than the documents for search.
:return: None
Expand All @@ -37,10 +37,6 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
for document in documents_objects:
self.indexes[index][document.id] = document

#TODO fix tags after id refactoring
tags = document.tags
self._map_tags_to_ids(document.id, tags)

def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
index = index or self.label_index
label_objects = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]
Expand All @@ -49,21 +45,6 @@ def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[s
label_id = str(uuid4())
self.indexes[index][label_id] = label

def _map_tags_to_ids(self, hash: str, tags: List[str]):
if isinstance(tags, list):
for tag in tags:
if isinstance(tag, dict):
tag_keys = tag.keys()
for tag_key in tag_keys:
tag_values = tag.get(tag_key, [])
if tag_values:
for tag_value in tag_values:
comp_key = str((tag_key, tag_value))
if comp_key in self.doc_tags:
self.doc_tags[comp_key].append(hash)
else:
self.doc_tags[comp_key] = [hash]

def get_document_by_id(self, id: str, index: Optional[str] = None) -> Document:
index = index or self.index
return self.indexes[index][id]
Expand All @@ -79,7 +60,7 @@ def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[

def query_by_embedding(self,
query_emb: List[float],
filters: Optional[dict] = None,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:

Expand Down Expand Up @@ -116,44 +97,36 @@ def update_embeddings(self, retriever):
#TODO
raise NotImplementedError("update_embeddings() is not yet implemented for this DocumentStore")

def get_document_ids_by_tags(self, tags: Union[List[Dict[str, Union[str, List[str]]]], Dict[str, Union[str, List[str]]]], index: Optional[str] = None) -> List[str]:
"""
The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3]" ...}
"""
index = index or self.index
if not isinstance(tags, list):
tags = [tags]
result = self._find_ids_by_tags(tags, index=index)
return result

def _find_ids_by_tags(self, tags: List[Dict[str, Union[str, List[str]]]], index: str):
result = []
for tag in tags:
tag_keys = tag.keys()
for tag_key in tag_keys:
tag_values = tag.get(tag_key, None)
if tag_values:
for tag_value in tag_values:
comp_key = str((tag_key, tag_value))
doc_ids = self.doc_tags.get(comp_key, [])
for doc_id in doc_ids:
result.append(self.indexes[index].get(doc_id))
return result

def get_document_count(self, index=None) -> int:
def get_document_count(self, index: Optional[str] = None) -> int:
index = index or self.index
return len(self.indexes[index].items())

def get_label_count(self, index=None) -> int:
def get_label_count(self, index: Optional[str] = None) -> int:
index = index or self.label_index
return len(self.indexes[index].items())

def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List["Document"]:
    """
    Return the documents stored in an index, optionally narrowed down by meta filters.

    :param index: name of the index to read from. Falls back to ``self.index`` if not given.
    :param filters: optional dict mapping a meta field name to the list of accepted values,
                    e.g. {"author": ["somebody"]}. A document is returned only if, for every key
                    of the dict, its meta contains that key with one of the listed values.
    :return: list of matching documents (all documents of the index if no filters are given)
    """
    index = index or self.index
    documents = list(self.indexes[index].values())
    if not filters:
        return documents

    filtered_documents = []
    for doc in documents:
        # Documents without any meta data simply cannot match a filter.
        meta = doc.meta or {}
        # Check for key presence instead of the truthiness of the value, so that falsy
        # meta values such as "" or 0 can still be matched by a filter.
        if all(key in meta and meta[key] in values for key, values in filters.items()):
            filtered_documents.append(doc)
    return filtered_documents

def get_all_labels(self, index=None, filters=None) -> List[Label]:
def get_all_labels(self, index: str = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
index = index or self.label_index

if filters:
Expand All @@ -172,7 +145,7 @@ def get_all_labels(self, index=None, filters=None) -> List[Label]:

return result

def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
def add_eval_data(self, filename: str, doc_index: Optional[str] = None, label_index: Optional[str] = None):
"""
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
Expand All @@ -185,10 +158,12 @@ def add_eval_data(self, filename: str, doc_index: str = "document", label_index:
"""

docs, labels = eval_data_from_file(filename)
doc_index = doc_index or self.index
label_index = label_index or self.label_index
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)

def delete_all_documents(self, index=None):
def delete_all_documents(self, index: Optional[str] = None):
"""
Delete all documents in an index.
Expand All @@ -197,4 +172,4 @@ def delete_all_documents(self, index=None):
"""

index = index or self.index
self.indexes[index] = {}
self.indexes[index] = {}
Loading

0 comments on commit 5937f9c

Please sign in to comment.