Skip to content

Commit

Permalink
remove cgi, download in chunks
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Jul 26, 2024
1 parent 33d5d7d commit 02cf8e5
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 13 deletions.
29 changes: 17 additions & 12 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import cgi
import functools
import logging
import tempfile
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from urllib.request import urlopen

import requests
from docling_core.types import Document
from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
Expand Down Expand Up @@ -102,17 +101,23 @@ def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
with tempfile.TemporaryDirectory() as temp_dir:
try:
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
with urlopen(str(source)) as resp:
cont_disp = resp.info().get("Content-Disposition")
content = resp.read()
if cont_disp:
_, params = cgi.parse_header(cont_disp)
filename = params.get("filename", self._default_download_filename)
else:
filename = http_url.path or self._default_download_filename
local_path = Path(temp_dir) / filename
res = requests.get(http_url, stream=True)
res.raise_for_status()
fname = None
# try to get filename from response header
if cont_disp := res.headers.get("Content-Disposition"):
for par in cont_disp.strip().split(";"):
# currently only handling the plain "filename" directive (not the extended "filename*" form)
if (split := par.split("=")) and split[0].strip() == "filename":
fname = "=".join(split[1:]).strip().strip("'\"") or None
break
# otherwise, use name from URL:
if fname is None:
fname = Path(http_url.path).name or self._default_download_filename
local_path = Path(temp_dir) / fname
with open(local_path, "wb") as f:
f.write(content)
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
f.write(chunk)
except ValidationError:
try:
local_path = TypeAdapter(Path).validate_python(source)
Expand Down
3 changes: 2 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true }

[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 02cf8e5

Please sign in to comment.