diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py
index 7ab2f19c..34e123ad 100644
--- a/camelot/backends/image_conversion.py
+++ b/camelot/backends/image_conversion.py
@@ -1,18 +1,24 @@
+from .pdfium_backend import PdfiumBackend
 from .ghostscript_backend import GhostscriptBackend
 from .poppler_backend import PopplerBackend
 
 
-BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
+BACKENDS = {
+    "pdfium": PdfiumBackend,
+    "poppler": PopplerBackend,
+    "ghostscript": GhostscriptBackend,
+}
 
 
 class ImageConversionBackend:
-    def __init__(self, backend="poppler", use_fallback=True):
+    def __init__(self, backend="pdfium", use_fallback=True):
         if backend not in BACKENDS.keys():
             raise ValueError(f"Image conversion backend '{backend}' not supported")
 
         self.backend = backend
         self.use_fallback = use_fallback
-        self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
+        self.fallbacks = list(BACKENDS.keys())
+        self.fallbacks.remove(self.backend)
 
     def convert(self, pdf_path, png_path):
         try:
diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py
new file mode 100644
index 00000000..a658ff9e
--- /dev/null
+++ b/camelot/backends/pdfium_backend.py
@@ -0,0 +1,45 @@
+"""Render single-page PDFs to PNG images with pypdfium2."""
+
+# pypdfium2 is imported defensively: any import-time failure is stored so
+# that convert() can report why the backend is unusable.
+try:
+    import pypdfium2 as pdfium
+except Exception as e:
+    pdfium = None
+    pdfium_exc = e
+else:
+    pdfium_exc = None
+
+
+class PdfiumBackend:
+    """Image conversion backend built on pypdfium2."""
+
+    def convert(self, pdf_path, png_path, resolution=300):
+        """Render the only page of ``pdf_path`` and save it to ``png_path``.
+
+        ``resolution`` is divided by 72 to obtain the pdfium render scale.
+
+        Raises
+        ------
+        OSError
+            If pypdfium2 could not be imported.
+        ValueError
+            If the PDF does not contain exactly one page.
+        """
+        if pdfium is None:
+            raise OSError(
+                f"pypdfium2 is not available: {pdfium_exc!r}"
+            ) from pdfium_exc
+        doc = pdfium.PdfDocument(pdf_path)
+        try:
+            # Explicit check: an ``assert`` would be stripped under ``python -O``.
+            if len(doc) != 1:
+                raise ValueError(
+                    f"Expected a single-page PDF, got {len(doc)} pages: {pdf_path}"
+                )
+            doc.init_forms()
+            image = doc[0].render(scale=resolution / 72).to_pil()
+            image.save(png_path)
+        finally:
+            # Release the native document handle even when rendering fails.
+            doc.close()
diff --git a/noxfile.py b/noxfile.py
index 0f02b6e1..654eb7d4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -155,7 +155,7 @@ def mypy(session: Session) -> None:
         session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py")
 
 
-base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17"]
+base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pypdfium2>=4,<5"]
 
 plot_requires = [
     "matplotlib>=2.2.3",
diff --git a/pyproject.toml b/pyproject.toml
index 156cd8dd..734e1921 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,7 +57,8 @@ myst-parser = {version = ">=0.16.1"}
 camelot = "camelot.__main__:main"
 
 [tool.poetry.group.base.dependencies]
-ghostscript = "^0.7"
+pypdfium2 = "^4"
+ghostscript = "^0.7" # remove in favor of pypdfium2?
 opencv-python = "^4.7.0.68"
 
 
diff --git a/tests/test_common.py b/tests/test_common.py
index ca9910d0..6db6215f 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -50,6 +50,15 @@ def test_password(testdir):
     assert_frame_equal(df, tables[0].df)
 
 
+@skip_on_windows
+def test_repr_pdfium(testdir):
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 @skip_pdftopng
 def test_repr_poppler(testdir):
     filename = os.path.join(testdir, "foo.pdf")
@@ -68,6 +77,15 @@ def test_repr_ghostscript(testdir):
     assert repr(tables[0].cells[0][0]) == ""
 
 
+@skip_on_windows
+def test_url_pdfium():
+    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    tables = camelot.read_pdf(url, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 @skip_pdftopng
 def test_url_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
@@ -86,6 +104,25 @@ def test_url_ghostscript(testdir):
     assert repr(tables[0].cells[0][0]) == ""
 
 
+@skip_on_windows
+def test_pages_pdfium():
+    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    tables = camelot.read_pdf(url, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+    tables = camelot.read_pdf(url, pages="1-end", backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+    tables = camelot.read_pdf(url, pages="all", backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 @skip_pdftopng
 def test_pages_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
diff --git a/tests/test_image_conversion_backend.py b/tests/test_image_conversion_backend.py
index d1d85b0b..98b2c45f 100644
--- a/tests/test_image_conversion_backend.py
+++ b/tests/test_image_conversion_backend.py
@@ -2,6 +2,8 @@
 
 from camelot.backends import ImageConversionBackend
 
+# TODO consider adding pdfium backend
+
 
 @pytest.fixture
 def patch_backends(monkeypatch):
@@ -31,7 +33,7 @@ def convert(self, pdf_path, png_path):
 
 
 def test_poppler_backend_error_when_no_use_fallback(patch_backends):
-    backend = ImageConversionBackend(use_fallback=False)
+    backend = ImageConversionBackend(backend="poppler", use_fallback=False)
 
     message = "Image conversion failed with image conversion backend 'poppler'"
     with pytest.raises(ValueError, match=message):
@@ -39,7 +41,8 @@ def test_poppler_backend_error_when_no_use_fallback(patch_backends):
 
 
 def test_ghostscript_backend_when_use_fallback(patch_backends):
-    backend = ImageConversionBackend()
+    # Keep poppler (the former default) as primary so the fallback path runs.
+    backend = ImageConversionBackend(backend="poppler")
     backend.convert("foo", "bar")
 
 
@@ -49,7 +52,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
     monkeypatch.setattr(
         "camelot.backends.image_conversion.BACKENDS", backends, raising=True
     )
-    backend = ImageConversionBackend()
+    backend = ImageConversionBackend(backend="poppler")
 
     message = "Image conversion failed with image conversion backend 'ghostscript'"
     with pytest.raises(ValueError, match=message):