diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index 6b0fafb97e..eff9de5083 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -47,3 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # - the schema will be already there when the container runs, saving the generation overhead when a container starts
 # - derived images don't need to write the schema and can run with lower user privileges
 RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"
+
+# Haystack Preprocessor uses NLTK punkt model to divide text into a list of sentences.
+# We cache these models for a seamless user experience.
+RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
diff --git a/haystack/utils/docker.py b/haystack/utils/docker.py
index a270568454..83843f6993 100644
--- a/haystack/utils/docker.py
+++ b/haystack/utils/docker.py
@@ -3,6 +3,13 @@
 from haystack.nodes._json_schema import load_schema
 
 
+def cache_nltk_model(model: str = "punkt"):
+    logging.info("Caching %s model...", model)
+    import nltk
+
+    nltk.download(model)
+
+
 def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
     """
     Small function that caches models and other data.
@@ -19,12 +26,6 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
     if models is None:
         models = ["deepset/roberta-base-squad2"]
 
-    # download punkt tokenizer
-    logging.info("Caching punkt data")
-    import nltk
-
-    nltk.download("punkt")
-
     # Cache models
     import transformers