Add Elasticsearch Document Store (#13)
tanaysoni authored Jan 24, 2020
1 parent a0293cc commit f83a164
Showing 15 changed files with 164 additions and 63 deletions.
23 changes: 14 additions & 9 deletions README.rst
@@ -65,21 +65,26 @@ Usage
.. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/code_snippet_usage.png


Configuration
-------------
The configuration can be supplied in a :code:`qa_config.py` placed in the PYTHONPATH. Alternatively, the :code:`DATABASE_URL` can also be set as an environment variable.


Deployment
==========

SQL Backend
-----------
The database ORM layer is implemented using SQLAlchemy library. By default, it uses the file-based SQLite database. For large scale deployments, the configuration can be changed to use other compatible databases like PostgreSQL or MySQL.
Haystack has an extensible document store layer.
There are currently implementations of Elasticsearch and SQL (see :code:`haystack.database.elasticsearch.ElasticsearchDocumentStore` and :code:`haystack.database.sql.SQLDocumentStore`).

Elasticsearch Backend
----------------------
(Coming soon)
---------------------
Elasticsearch is recommended for large-scale deployments. Documents can optionally be chunked into smaller units (e.g., paragraphs) before indexing to make the results returned by the Retriever more granular and accurate.
Retrievers can access an Elasticsearch index to find the relevant paragraphs (or documents) for a query. The default :code:`ElasticsearchRetriever` uses Elasticsearch's native BM25 scoring, but can easily be extended with custom implementations.

You can get started by running a single Elasticsearch node using Docker::

docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1
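
Once the container is up, the new document store and retriever can be combined roughly as follows (a minimal sketch; host, index name, and the example question are placeholders)::

    from haystack.database.elasticsearch import ElasticsearchDocumentStore
    from haystack.retriever.elasticsearch import ElasticsearchRetriever

    # connect to the single-node container started above
    document_store = ElasticsearchDocumentStore(host="localhost", index="document")
    retriever = ElasticsearchRetriever(document_store=document_store)

    # returns BM25-ranked paragraphs plus their metadata
    paragraphs, meta_data = retriever.retrieve("Who founded the company?", top_k=5)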

SQL Backend
-----------
The SQL backend layer is mainly meant to simplify the first development steps. By default, a local file-based SQLite database is initialized.
However, if you prefer a PostgreSQL or MySQL backend for production, you can easily configure this since our implementation is based on SQLAlchemy.
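
For example, a PostgreSQL backend can be configured by passing a SQLAlchemy connection URL (a minimal sketch; credentials and database name are placeholders)::

    from haystack.database.sql import SQLDocumentStore

    # any SQLAlchemy-compatible connection URL should work here
    document_store = SQLDocumentStore(url="postgresql://user:password@localhost:5432/haystack")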

REST API
--------
Binary file modified docs/img/code_snippet_usage.png
7 changes: 2 additions & 5 deletions haystack/__init__.py
@@ -1,8 +1,7 @@
from haystack.retriever.tfidf import TfidfRetriever
from haystack.reader.farm import FARMReader
import logging

import pandas as pd

pd.options.display.max_colwidth = 80

logger = logging.getLogger(__name__)
@@ -21,8 +20,6 @@ class Finder:

def __init__(self, reader, retriever):
self.retriever = retriever
self.retriever.fit()

self.reader = reader

def get_answers(self, question, top_k_reader=1, top_k_retriever=10, filters=None):
@@ -39,7 +36,7 @@ def get_answers(self, question, top_k_reader=1, top_k_retriever=10, filters=None

# 1) Optional: reduce the search space via document tags
if filters:
candidate_doc_ids = self.retriever.datastore.get_document_ids_by_tags(filters)
candidate_doc_ids = self.retriever.document_store.get_document_ids_by_tags(filters)
else:
candidate_doc_ids = None

2 changes: 1 addition & 1 deletion haystack/database/base.py
@@ -3,7 +3,7 @@

class BaseDocumentStore:
"""
Base class for implementing DataStores.
Base class for implementing Document Stores.
"""

@abstractmethod
85 changes: 85 additions & 0 deletions haystack/database/elasticsearch.py
@@ -0,0 +1,85 @@
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Document as ESDoc, Text, connections
from haystack.database.base import BaseDocumentStore


class Document(ESDoc):
    name = Text()
    text = Text()
    tags = Text()

    class Index:
        name = "document"


class ElasticsearchDocumentStore(BaseDocumentStore):
    def __init__(self, host="localhost", username="", password="", index="document"):
        self.client = Elasticsearch(hosts=[{"host": host}], http_auth=(username, password))
        self.connections = connections.create_connection(hosts=[{"host": host}], http_auth=(username, password))
        Document.init()  # create mapping if not exists.
        self.index = index

    def get_document_by_id(self, id):
        query = {"query": {"term": {"_id": id}}}
        result = self.client.search(index=self.index, body=query)["hits"]["hits"]
        if result:
            hit = result[0]
            document = {"id": hit["_id"], "name": hit["_source"]["name"], "text": hit["_source"]["text"]}
        else:
            document = None
        return document

    def get_document_ids_by_tags(self, tags):
        query = {
            "query": {
                "bool": {
                    "should": [
                        {
                            "terms": {
                                "tags": tags
                            }
                        }
                    ]
                }
            }
        }
        result = self.client.search(index=self.index, body=query)["hits"]["hits"]
        documents = []
        for hit in result:
            documents.append({"id": hit["_id"], "name": hit["_source"]["name"], "text": hit["_source"]["text"]})
        return documents

    def write_documents(self, documents):
        for doc in documents:
            d = Document(
                name=doc["name"],
                text=doc["text"],
                document_id=doc.get("document_id", None),
                tags=doc.get("tags", None),
            )
            d.save()

    def get_document_count(self):
        s = Search(using=self.client, index=self.index)
        return s.count()

    def get_all_documents(self):
        search = Search(using=self.client, index=self.index)
        documents = []
        for hit in search:
            documents.append(
                {
                    "id": hit.meta["id"],
                    "name": hit["name"],
                    "text": hit["text"],
                }
            )
        return documents

    def query(self, query, top_k=10):
        search = Search(using=self.client, index=self.index).query("match", text=query)[:top_k].execute()
        paragraphs = []
        meta_data = []
        for hit in search:
            paragraphs.append(hit["text"])
            meta_data.append({"paragraph_id": hit.meta["id"], "document_id": hit["document_id"]})
        return paragraphs, meta_data
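
For reference, a minimal usage sketch of the new store (the document contents and index settings below are only illustrative)::

    from haystack.database.elasticsearch import ElasticsearchDocumentStore

    document_store = ElasticsearchDocumentStore(host="localhost", index="document")

    # each dict needs "name" and "text"; "document_id" and "tags" are optional
    document_store.write_documents([
        {"name": "doc_1.txt", "text": "Haystack is a framework for question answering.", "document_id": 1},
    ])

    print(document_store.get_document_count())
    paragraphs, meta_data = document_store.query("question answering", top_k=5)
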
5 changes: 2 additions & 3 deletions haystack/database/sql.py
@@ -83,9 +83,8 @@ def get_document_ids_by_tags(self, tags):
GROUP BY dt.document_id
"""
tag_filters = []
for tag, value in tags.items():
if value:
tag_filters.append(f"SUM(CASE WHEN t.value='{value}' THEN 1 ELSE 0 END) > 0")
for tag in tags:
tag_filters.append(f"SUM(CASE WHEN t.value='{tag}' THEN 1 ELSE 0 END) > 0")

final_query = f"{query} HAVING {' AND '.join(tag_filters)});"
query_results = self.session.execute(final_query)
36 changes: 25 additions & 11 deletions haystack/indexing/io.py
@@ -8,7 +8,7 @@
logger = logging.getLogger(__name__)


def write_documents_to_db(datastore, document_dir, clean_func=None, only_empty_db=False):
def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
"""
Write all text files (.txt) in the sub-directories of the given path to the connected database.
@@ -22,28 +22,42 @@ def write_documents_to_db(datastore, document_dir, clean_func=None, only_empty_d

# check if db has already docs
if only_empty_db:
n_docs = datastore.get_document_count()
n_docs = document_store.get_document_count()
if n_docs > 0:
logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
"(Disable `only_empty_db`, if you want to add docs anyway.)")
return None

# read and add docs
documents_to_write = []
docs_to_index = []
doc_id = 1
for path in file_paths:
with open(path) as doc:
text = doc.read()
if clean_func:
text = clean_func(text)

documents_to_write.append(
{
"name": path.name,
"text": text,
}
)
datastore.write_documents(documents_to_write)
logger.info(f"Wrote {len(documents_to_write)} docs to DB")
if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
docs_to_index.append(
{
"name": path.name,
"text": para,
"document_id": doc_id
}
)
doc_id += 1
else:
docs_to_index.append(
{
"name": path.name,
"text": text,
}
)
document_store.write_documents(docs_to_index)
logger.info(f"Wrote {len(docs_to_index)} docs to DB")


def fetch_archive_from_http(url, output_dir, proxies=None):
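
For reference, a minimal call of the updated indexing helper (a sketch; the directory path is a placeholder and the Elasticsearch store settings are assumptions)::

    from haystack.database.elasticsearch import ElasticsearchDocumentStore
    from haystack.indexing.io import write_documents_to_db

    document_store = ElasticsearchDocumentStore(host="localhost", index="document")

    # split_paragraphs=True stores one entry per paragraph instead of one per file
    write_documents_to_db(document_store=document_store, document_dir="data/docs",
                          only_empty_db=True, split_paragraphs=True)
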
7 changes: 7 additions & 0 deletions haystack/retriever/base.py
@@ -0,0 +1,7 @@
from abc import ABC, abstractmethod


class BaseRetriever(ABC):
    @abstractmethod
    def retrieve(self, query, candidate_doc_ids=None, top_k=1):
        pass
9 changes: 9 additions & 0 deletions haystack/retriever/elasticsearch.py
@@ -0,0 +1,9 @@
from haystack.retriever.base import BaseRetriever


class ElasticsearchRetriever(BaseRetriever):
    def __init__(self, document_store):
        self.document_store = document_store

    def retrieve(self, query, candidate_doc_ids=None, top_k=10):
        return self.document_store.query(query, top_k)
26 changes: 6 additions & 20 deletions haystack/retriever/tfidf.py
@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
from collections import OrderedDict, namedtuple
import logging
from collections import OrderedDict, namedtuple

import pandas as pd
from haystack.retriever.base import BaseRetriever
from sklearn.feature_extraction.text import TfidfVectorizer


@@ -11,20 +12,6 @@
Paragraph = namedtuple("Paragraph", ["paragraph_id", "document_id", "text"])


class BaseRetriever(ABC):
@abstractmethod
def _get_all_paragraphs(self):
pass

@abstractmethod
def retrieve(self, query, candidate_doc_ids=None, top_k=1):
pass

@abstractmethod
def fit(self):
pass


class TfidfRetriever(BaseRetriever):
"""
Read all documents from a SQL backend.
@@ -35,15 +22,15 @@ class TfidfRetriever(BaseRetriever):
It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
"""

def __init__(self, datastore):
def __init__(self, document_store):
self.vectorizer = TfidfVectorizer(
lowercase=True,
stop_words=None,
token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1),
)

self.datastore = datastore
self.document_store = document_store
self.paragraphs = self._get_all_paragraphs()
self.df = None
self.fit()
@@ -52,12 +39,11 @@ def _get_all_paragraphs(self):
"""
Split the list of documents into paragraphs
"""
documents = self.datastore.get_all_documents()
documents = self.document_store.get_all_documents()

paragraphs = []
p_id = 0
for doc in documents:
_pgs = [d for d in doc["text"].splitlines() if d.strip()]
for p in doc["text"].split("\n\n"):
if not p.strip(): # skip empty paragraphs
continue
1 change: 0 additions & 1 deletion qa_config.py

This file was deleted.

8 changes: 4 additions & 4 deletions test/test_db.py
@@ -3,9 +3,9 @@


def test_db_write_read():
sql_datastore = SQLDocumentStore()
write_documents_to_db(datastore=sql_datastore, document_dir="samples/docs")
documents = sql_datastore.get_all_documents()
sql_document_store = SQLDocumentStore()
write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
documents = sql_document_store.get_all_documents()
assert len(documents) == 2
doc = sql_datastore.get_document_by_id("1")
doc = sql_document_store.get_document_by_id("1")
assert doc.keys() == {"id", "name", "text", "tags"}
6 changes: 3 additions & 3 deletions tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
@@ -86,12 +86,12 @@
"# The documents can be stored in different types of \"DocumentStores\".\n",
"# For dev we suggest a light-weight SQL DB\n",
"# For production we suggest elasticsearch\n",
"datastore = SQLDocumentStore(url=\"sqlite:///qa.db\")\n",
"document_store = SQLDocumentStore(url=\"sqlite:///qa.db\")\n",
"\n",
"# Now, let's write the docs to our DB.\n",
"# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
"# It must take a str as input, and return a str.\n",
"write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)"
"write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)"
]
},
{
@@ -122,7 +122,7 @@
"source": [
"# A retriever identifies the k most promising chunks of text that might contain the answer for our question\n",
"# Retrievers use some simple but fast algorithm, here: TF-IDF\n",
"retriever = TfidfRetriever(datastore=datastore)"
"retriever = TfidfRetriever(document_store=document_store)"
]
},
{
6 changes: 3 additions & 3 deletions tutorials/Tutorial1_Basic_QA_Pipeline.py
@@ -20,18 +20,18 @@
# The documents can be stored in different types of "DocumentStores".
# For dev we suggest a light-weight SQL DB
# For production we suggest elasticsearch
datastore = SQLDocumentStore(url="sqlite:///qa.db")
document_store = SQLDocumentStore(url="sqlite:///qa.db")

# Now, let's write the docs to our DB.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)

## Initialize Reader, Retriever & Finder

# A retriever identifies the k most promising chunks of text that might contain the answer for our question
# Retrievers use some simple but fast algorithm, here: TF-IDF
retriever = TfidfRetriever(datastore=datastore)
retriever = TfidfRetriever(document_store=document_store)

# A reader scans the text chunks in detail and extracts the k best answers
# Readers use more powerful but slower deep learning models
6 changes: 3 additions & 3 deletions tutorials/Tutorial2_Finetune_a_model_on_your_data.py
@@ -27,14 +27,14 @@


# Init Document store & write docs to it
datastore = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
document_store = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)

## Initialize Reader, Retriever & Finder

# A retriever identifies the k most promising chunks of text that might contain the answer for our question
# Retrievers use some simple but fast algorithm, here: TF-IDF
retriever = TfidfRetriever(datastore=datastore)
retriever = TfidfRetriever(document_store=document_store)

# The Finder sticks together reader and retriever in a pipeline to answer our actual questions
finder = Finder(reader, retriever)
