From 26a41d34f3a09dcf46d9fe3cf10281b1737b48bd Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 10 Feb 2025 16:59:44 +0100 Subject: [PATCH] use new models download CLI Signed-off-by: Michele Dolfi --- transforms/language/pdf2parquet/Dockerfile.python | 5 ++--- transforms/language/pdf2parquet/Dockerfile.ray | 11 ++++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/transforms/language/pdf2parquet/Dockerfile.python b/transforms/language/pdf2parquet/Dockerfile.python index 4ecaaa89c..a10833bc7 100644 --- a/transforms/language/pdf2parquet/Dockerfile.python +++ b/transforms/language/pdf2parquet/Dockerfile.python @@ -32,11 +32,10 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt # Set environment ENV PYTHONPATH /home/dpk +ENV PATH="/home/dpk/.local/bin:${PATH}" # Download models -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")' - +RUN docling-tools models download layout tableformer picture_classifier easyocr # Parallelism ENV OMP_NUM_THREADS=2 diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray index 4dc62538e..6cbd20ea4 100644 --- a/transforms/language/pdf2parquet/Dockerfile.ray +++ b/transforms/language/pdf2parquet/Dockerfile.ray @@ -32,15 +32,12 @@ COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt - - -# Download models -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -# RUN python -c 'from docling.document_converter import DocumentConverter; from pathlib import Path; DocumentConverter.download_models_hf(local_dir=Path("./artifacts/"));' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")' - # Set environment ENV PYTHONPATH /home/ray +ENV PATH="/home/ray/.local/bin:${PATH}" + +# Download models +RUN docling-tools models download layout tableformer picture_classifier easyocr # Parallelism ENV OMP_NUM_THREADS=2