Deprecate Tags for Document Stores (#286)

deepset-ai · Aug 4, 2020 · 5937f9c · 5937f9c
1 parent 6a10325
commit 5937f9c
Show file tree

Hide file tree

Showing 8 changed files with 117 additions and 245 deletions.
diff --git a/haystack/database/base.py b/haystack/database/base.py
@@ -9,7 +9,6 @@ def __init__(self, text: str,
                  query_score: Optional[float] = None,
                  question: Optional[str] = None,
                  meta: Dict[str, Any] = None,
-                 tags: Optional[Dict[str, Any]] = None,
                  embedding: Optional[List[float]] = None):
         """
         Object used to represent documents / passages in a standardized way within Haystack.
@@ -24,7 +23,6 @@ def __init__(self, text: str,
         :param query_score: Retriever's query score for a retrieved document
         :param question: Question text for FAQs.
         :param meta: Meta fields for a document like name, url, or author.
-        :param tags: Tags that allow filtering of the data
         :param embedding: Vector encoding of the text
         """
 
@@ -38,7 +36,6 @@ def __init__(self, text: str,
         self.query_score = query_score
         self.question = question
         self.meta = meta
-        self.tags = tags # deprecate?
         self.embedding = embedding
 
     def to_dict(self):
@@ -47,7 +44,7 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, dict):
         _doc = dict.copy()
-        init_args = ["text", "id", "query_score", "question", "meta", "tags", "embedding"]
+        init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
         if "meta" not in _doc.keys():
             _doc["meta"] = {}
         # copy additional fields into "meta"
@@ -110,14 +107,15 @@ class BaseDocumentStore(ABC):
     Base class for implementing Document Stores.
     """
     index: Optional[str]
+    label_index: Optional[str]
 
     @abstractmethod
     def write_documents(self, documents: List[dict], index: Optional[str] = None):
         """
         Indexes documents for later queries.
 
-        :param documents: List of dictionaries.
-                          Default format: {"text": "<the-actual-text>"}
+        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
+                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                           Optionally: Include meta data via {"text": "<the-actual-text>",
                           "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
                           It can be used for filtering and is accessible in the responses of the Finder.
@@ -129,29 +127,25 @@ def write_documents(self, documents: List[dict], index: Optional[str] = None):
         pass
 
     @abstractmethod
-    def get_all_documents(self, index: Optional[str] = None) -> List[Document]:
+    def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
         pass
 
     @abstractmethod
-    def get_all_labels(self, index: str = "label", filters: Optional[dict] = None) -> List[Label]:
+    def get_all_labels(self, index: str = "label", filters: Optional[Optional[Dict[str, List[str]]]] = None) -> List[Label]:
         pass
 
     @abstractmethod
     def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
         pass
 
-    @abstractmethod
-    def get_document_ids_by_tags(self, tag, index) -> List[str]:
-        pass
-
     @abstractmethod
     def get_document_count(self, index: Optional[str] = None) -> int:
         pass
 
     @abstractmethod
     def query_by_embedding(self,
                            query_emb: List[float],
-                           filters: Optional[dict] = None,
+                           filters: Optional[Optional[Dict[str, List[str]]]] = None,
                            top_k: int = 10,
                            index: Optional[str] = None) -> List[Document]:
         pass

diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py
@@ -9,6 +9,7 @@
 
 from haystack.database.base import BaseDocumentStore, Document, Label
 from haystack.indexing.utils import eval_data_from_file
+from haystack.retriever.base import BaseRetriever
 
 logger = logging.getLogger(__name__)
 
@@ -89,7 +90,7 @@ def __init__(
         self.index: str = index
 
         self._create_label_index(label_index)
-        self.label_index = label_index
+        self.label_index: str = label_index
         self.update_existing_documents = update_existing_documents
 
     def _create_document_index(self, index_name):
@@ -136,17 +137,6 @@ def get_document_by_id(self, id: str, index=None) -> Optional[Document]:
         document = self._convert_es_hit_to_document(result[0]) if result else None
         return document
 
-    def get_document_ids_by_tags(self, tags: dict, index: Optional[str]) -> List[str]:
-        index = index or self.index
-        term_queries = [{"terms": {key: value}} for key, value in tags.items()]
-        query = {"query": {"bool": {"must": term_queries}}}
-        logger.debug(f"Tag filter query: {query}")
-        result = self.client.search(index=index, body=query, size=10000)["hits"]["hits"]
-        doc_ids = []
-        for hit in result:
-            doc_ids.append(hit["_id"])
-        return doc_ids
-
     def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
         """
         Indexes documents for later queries in Elasticsearch.
@@ -198,7 +188,8 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
             documents_to_index.append(_doc)
         bulk(self.client, documents_to_index, request_timeout=300, refresh="wait_for")
 
-    def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = "label"):
+    def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
+        index = index or self.label_index
         if index and not self.client.indices.exists(index=index):
             self._create_label_index(index)
 
@@ -230,7 +221,7 @@ def get_document_count(self, index: Optional[str] = None) -> int:
     def get_label_count(self, index: Optional[str] = None) -> int:
         return self.get_document_count(index=index)
 
-    def get_all_documents(self, index: Optional[str] = None, filters: Optional[dict] = None) -> List[Document]:
+    def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
         if index is None:
             index = self.index
 
@@ -239,12 +230,13 @@ def get_all_documents(self, index: Optional[str] = None, filters: Optional[dict]
 
         return documents
 
-    def get_all_labels(self, index: str = "label", filters: Optional[dict] = None) -> List[Label]:
+    def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
+        index = index or self.label_index
         result = self.get_all_documents_in_index(index=index, filters=filters)
         labels = [Label.from_dict(hit["_source"]) for hit in result]
         return labels
 
-    def get_all_documents_in_index(self, index: str, filters: Optional[dict] = None) -> List[dict]:
+    def get_all_documents_in_index(self, index: str, filters: Optional[Dict[str, List[str]]] = None) -> List[dict]:
         body = {
             "query": {
                 "bool": {
@@ -346,7 +338,7 @@ def query(
 
     def query_by_embedding(self,
                            query_emb: np.array,
-                           filters: Optional[dict] = None,
+                           filters: Optional[Dict[str, List[str]]] = None,
                            top_k: int = 10,
                            index: Optional[str] = None) -> List[Document]:
         if index is None:
@@ -392,7 +384,7 @@ def query_by_embedding(self,
 
     def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
         # We put all additional data of the doc into meta_data and return it in the API
-        meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field, "tags")}
+        meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
         meta_data["name"] = meta_data.pop(self.name_field, None)
 
         document = Document(
@@ -401,7 +393,6 @@ def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> D
             meta=meta_data,
             query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
             question=hit["_source"].get(self.faq_question_field),
-            tags=hit["_source"].get("tags"),
             embedding=hit["_source"].get(self.embedding_field)
         )
         return document
@@ -420,12 +411,13 @@ def describe_documents(self, index=None):
                  }
         return stats
 
-    def update_embeddings(self, retriever, index=None):
+    def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
         """
         Updates the embeddings in the document store using the encoding model specified in the retriever.
         This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
 
         :param retriever: Retriever
+        :param index: Index name to update
         :return: None
         """
         if index is None:
@@ -439,7 +431,7 @@ def update_embeddings(self, retriever, index=None):
 
         #TODO Index embeddings every X batches to avoid OOM for huge document collections
         logger.info(f"Updating embeddings for {len(passages)} docs ...")
-        embeddings = retriever.embed_passages(passages)
+        embeddings = retriever.embed_passages(passages)  # type: ignore
 
         assert len(docs) == len(embeddings)
 

diff --git a/haystack/database/memory.py b/haystack/database/memory.py
@@ -12,7 +12,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
     """
 
     def __init__(self, embedding_field: Optional[str] = None):
-        self.doc_tags: Dict[str, Any] = {}
         self.indexes: Dict[str, Dict] = defaultdict(dict)
         self.index: str = "document"
         self.label_index: str = "label"
@@ -22,10 +21,11 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
         Indexes documents for later queries.
 
 
-        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
+       :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                           For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
-                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
-                          or additional meta data via "meta": {"name": "<some-document-name>, "author": "someone", "url":"some-url" ...}
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
+                          It can be used for filtering and is accessible in the responses of the Finder.
         :param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
                       separate index than the documents for search.
         :return: None
@@ -37,10 +37,6 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O
         for document in documents_objects:
             self.indexes[index][document.id] = document
 
-            #TODO fix tags after id refactoring
-            tags = document.tags
-            self._map_tags_to_ids(document.id, tags)
-
     def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
         index = index or self.label_index
         label_objects = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]
@@ -49,21 +45,6 @@ def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[s
             label_id = str(uuid4())
             self.indexes[index][label_id] = label
 
-    def _map_tags_to_ids(self, hash: str, tags: List[str]):
-        if isinstance(tags, list):
-            for tag in tags:
-                if isinstance(tag, dict):
-                    tag_keys = tag.keys()
-                    for tag_key in tag_keys:
-                        tag_values = tag.get(tag_key, [])
-                        if tag_values:
-                            for tag_value in tag_values:
-                                comp_key = str((tag_key, tag_value))
-                                if comp_key in self.doc_tags:
-                                    self.doc_tags[comp_key].append(hash)
-                                else:
-                                    self.doc_tags[comp_key] = [hash]
-
     def get_document_by_id(self, id: str, index: Optional[str] = None) -> Document:
         index = index or self.index
         return self.indexes[index][id]
@@ -79,7 +60,7 @@ def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[
 
     def query_by_embedding(self,
                            query_emb: List[float],
-                           filters: Optional[dict] = None,
+                           filters: Optional[Dict[str, List[str]]] = None,
                            top_k: int = 10,
                            index: Optional[str] = None) -> List[Document]:
 
@@ -116,44 +97,36 @@ def update_embeddings(self, retriever):
         #TODO
         raise NotImplementedError("update_embeddings() is not yet implemented for this DocumentStore")
 
-    def get_document_ids_by_tags(self, tags: Union[List[Dict[str, Union[str, List[str]]]], Dict[str, Union[str, List[str]]]], index: Optional[str] = None) -> List[str]:
-        """
-        The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
-        The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3]" ...}
-        """
-        index = index or self.index
-        if not isinstance(tags, list):
-            tags = [tags]
-        result = self._find_ids_by_tags(tags, index=index)
-        return result
-
-    def _find_ids_by_tags(self, tags: List[Dict[str, Union[str, List[str]]]], index: str):
-        result = []
-        for tag in tags:
-            tag_keys = tag.keys()
-            for tag_key in tag_keys:
-                tag_values = tag.get(tag_key, None)
-                if tag_values:
-                    for tag_value in tag_values:
-                        comp_key = str((tag_key, tag_value))
-                        doc_ids = self.doc_tags.get(comp_key, [])
-                        for doc_id in doc_ids:
-                            result.append(self.indexes[index].get(doc_id))
-        return result
-
-    def get_document_count(self, index=None) -> int:
+    def get_document_count(self, index: Optional[str] = None) -> int:
         index = index or self.index
         return len(self.indexes[index].items())
 
-    def get_label_count(self, index=None) -> int:
+    def get_label_count(self, index: Optional[str] = None) -> int:
         index = index or self.label_index
         return len(self.indexes[index].items())
 
-    def get_all_documents(self, index=None) -> List[Document]:
+    def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
         index = index or self.index
-        return list(self.indexes[index].values())
+        documents = list(self.indexes[index].values())
+        filtered_documents = []
+
+        if filters:
+            for doc in documents:
+                is_hit = True
+                for key, values in filters.items():
+                    if doc.meta.get(key):
+                        if doc.meta[key] not in values:
+                            is_hit = False
+                    else:
+                        is_hit = False
+                if is_hit:
+                    filtered_documents.append(doc)
+        else:
+            filtered_documents = documents
+
+        return filtered_documents
 
-    def get_all_labels(self, index=None, filters=None) -> List[Label]:
+    def get_all_labels(self, index: str = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
         index = index or self.label_index
 
         if filters:
@@ -172,7 +145,7 @@ def get_all_labels(self, index=None, filters=None) -> List[Label]:
 
         return result
 
-    def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
+    def add_eval_data(self, filename: str, doc_index: Optional[str] = None, label_index: Optional[str] = None):
         """
         Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
 
@@ -185,10 +158,12 @@ def add_eval_data(self, filename: str, doc_index: str = "document", label_index:
         """
 
         docs, labels = eval_data_from_file(filename)
+        doc_index = doc_index or self.index
+        label_index = label_index or self.label_index
         self.write_documents(docs, index=doc_index)
         self.write_labels(labels, index=label_index)
 
-    def delete_all_documents(self, index=None):
+    def delete_all_documents(self, index: Optional[str] = None):
         """
         Delete all documents in an index.
 
@@ -197,4 +172,4 @@ def delete_all_documents(self, index=None):
         """
 
         index = index or self.index
-        self.indexes[index] = {}
+        self.indexes[index] = {}