deepset-ai · mayankjobanputra · Feb 16, 2023 · Feb 8, 2023 · Feb 9, 2023 · Feb 9, 2023
@@ -47,3 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # - the schema will be already there when the container runs, saving the generation overhead when a container starts
 # - derived images don't need to write the schema and can run with lower user privileges
 RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"
+
+# The Haystack PreProcessor uses the NLTK punkt model to split text into a list of sentences.
+# We cache this model at build time for a seamless user experience.
+RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
@@ -3,6 +3,13 @@
 from haystack.nodes._json_schema import load_schema
 
 
+def cache_nltk_model(model: str = "punkt"):  # download an NLTK resource ahead of time; `model` is the NLTK resource name
+    logging.info("Caching %s model...", model)
+    import nltk  # imported inside the function, so nltk is only loaded when this is called
+
+    nltk.download(model)  # fetch the requested resource via the NLTK downloader (defaults to "punkt")
+
+
 def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
     """
     Small function that caches models and other data.
@@ -19,12 +26,6 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
     if models is None:
         models = ["deepset/roberta-base-squad2"]
 
-    # download punkt tokenizer
-    logging.info("Caching punkt data")
-    import nltk
-
-    nltk.download("punkt")
-
     # Cache models
     import transformers