Commit

build: cache nltk models into the docker image (#4118)
* separated nltk cache

* separated nltk caching

* fixed pylint lazy log error

* using model name as default value
mayankjobanputra authored Feb 16, 2023
1 parent ec72dd7 commit d27f372
Showing 2 changed files with 11 additions and 6 deletions.
4 changes: 4 additions & 0 deletions docker/Dockerfile.base
@@ -47,3 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH"
# - the schema will be already there when the container runs, saving the generation overhead when a container starts
# - derived images don't need to write the schema and can run with lower user privileges
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"

# Haystack's PreProcessor uses the NLTK punkt model to divide text into a list of sentences.
# We cache the model for a seamless user experience.
RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
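With punkt baked into the image at build time, sentence splitting at runtime needs no network access or first-use download. A minimal sketch of what the cached data enables, illustrative only and not part of this commit:

# Illustrative sketch: the punkt model cached above lets NLTK split
# sentences offline inside the container.
from nltk.tokenize import sent_tokenize

text = "Haystack splits documents into sentences. The punkt model is already in the image."
print(sent_tokenize(text))
# ['Haystack splits documents into sentences.', 'The punkt model is already in the image.']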
13 changes: 7 additions & 6 deletions haystack/utils/docker.py
@@ -3,6 +3,13 @@
from haystack.nodes._json_schema import load_schema


def cache_nltk_model(model: str = "punkt"):
    logging.info("Caching %s model...", model)
    import nltk

    nltk.download(model)


def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
    """
    Small function that caches models and other data.
@@ -19,12 +26,6 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
    if models is None:
        models = ["deepset/roberta-base-squad2"]

    # download punkt tokenizer
    logging.info("Caching punkt data")
    import nltk

    nltk.download("punkt")

    # Cache models
    import transformers

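Taken together, the change moves the punkt download out of cache_models into a dedicated helper whose model name defaults to punkt. A hedged usage sketch of the new function; the stopwords id below is only an example of another NLTK resource, not something this commit caches:

from haystack.utils.docker import cache_nltk_model

# Same call the Dockerfile makes at build time: caches the default punkt tokenizer.
cache_nltk_model()

# Any other NLTK resource id can be passed instead, e.g. the stopwords corpus.
cache_nltk_model(model="stopwords")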
