From 0557c5997f6a92055f7b0db56e9a04e93f1f3464 Mon Sep 17 00:00:00 2001 From: yelinz Date: Tue, 7 May 2024 15:32:49 +0200 Subject: [PATCH] feat(file): add apache tika for file content extraction --- docker-compose.yml | 13 +++++++++++++ poetry.lock | 36 ++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1a76f4c6..2ae7aa25 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,6 +9,7 @@ services: # - POSTGRES_PASSWORD= volumes: - dbdata:/var/lib/postgresql/data + alexandria: image: ghcr.io/projectcaluma/alexandria:dev ports: @@ -16,9 +17,12 @@ services: depends_on: - db - minio + - tika environment: - DATABASE_HOST=db - DATABASE_PORT=5432 + - TIKA_CLIENT_ONLY=true + - TIKA_SERVER_ENDPOINT=http://tika:9998 # following options are a must to configure on production system: # https://docs.djangoproject.com/en/2.1/ref/settings/#std:setting-SECRET_KEY # - SECRET_KEY= @@ -27,6 +31,7 @@ services: # https://docs.djangoproject.com/en/2.1/ref/settings/#password # same as postgres password above # - DATABASE_PASSWORD= + dms: image: ghcr.io/adfinis/document-merge-service:6.4.4 depends_on: @@ -42,6 +47,7 @@ services: - OIDC_BEARER_TOKEN_REVALIDATION_TIME=300 - SECRET_KEY=aaa - DOCXTEMPLATE_JINJA_EXTENSIONS= + minio: image: minio/minio:RELEASE.2023-11-06T22-26-08Z volumes: @@ -54,6 +60,7 @@ services: - MINIO_ROOT_USER=very - MINIO_ROOT_PASSWORD=secret command: server data --console-address ":9090" + mc: image: minio/mc:RELEASE.2023-11-06T04-19-23Z restart: on-failure @@ -68,6 +75,12 @@ services: depends_on: - minio + tika: + image: apache/tika:2.9.2.0-full + ports: + - "9998:9998" + + volumes: dbdata: minio_data: diff --git a/poetry.lock b/poetry.lock index b39e6d41..06cfd7c9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2372,7 +2372,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2554,6 +2553,21 @@ files = [ {file = "semver-2.13.0.tar.gz", hash = "sha256:fa0fe2722ee1c3f57eac478820c3a5ae2f624af8264cbdf9000c980ff7f75e3f"}, ] +[[package]] +name = "setuptools" +version = "70.0.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "sh" version = "1.14.3" @@ -2674,6 +2688,24 @@ files = [ [package.dependencies] pytest = ">=7.0.0,<9.0.0" +[[package]] +name = "tika" +version = "2.6.0" +description = "Apache Tika Python library" +optional = false +python-versions = "*" +files = [ + {file = "tika-2.6.0.tar.gz", hash = "sha256:56670eb812944eb25ed73f1b3b075aa41e7a135b74b240822f28b819e5b373da"}, +] + +[package.dependencies] +requests = "*" +setuptools = "*" + +[package.extras] +all = ["memory-profiler (>=0.57.0)", "memory-profiler (>=0.57.0)", "pytest-benchmark (>=3.2.2)", "pytest-benchmark (>=3.2.2)"] +tests = ["memory-profiler (>=0.57.0)", "pytest-benchmark (>=3.2.2)"] + [[package]] name = "tomli" version = "2.0.1" @@ -3060,4 +3092,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = ">=3.10.0,<4.0" -content-hash = "1dcb23004309623987adc18ec0fc80f8a61ca931284421a4f8698d56445927c8" +content-hash = "236c302dae85531ff2c62ac794b537beeb94d423e7774fceff5204073edc328a" diff --git a/pyproject.toml b/pyproject.toml index 3892f35b..6944e004 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ boto3 = "^1.29.7" tqdm = "^4.66.1" manabi = "^1.4.0" python-magic = "^0.4.27" +tika = "^2.6.0" [tool.poetry.group.dev.dependencies] black = "24.4.2"