Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chore: Use pytest fixtures effectively #24

Merged
merged 2 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22))
- Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23))
- Use pytest fixtures effectively ([#24](https://github.com/stumpylog/tika-client/pull/24))

## [0.6.0] - 2024-07-18

Expand Down
75 changes: 68 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,85 @@
import logging
import os
from pathlib import Path
from typing import Final
from typing import Generator

import pytest

from tika_client.client import TikaClient

TIKA_URL: Final[str] = os.getenv("TIKA_URL", "http://localhost:9998")

SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples"
@pytest.fixture(scope="session")
def tika_host() -> str:
return os.getenv("TIKA_URL", "http://localhost:9998")


@pytest.fixture(scope="session")
def samples_dir() -> Path:
return Path(__file__).parent.resolve() / "samples"


@pytest.fixture(scope="session")
def sample_libre_office_writer_file(samples_dir: Path) -> Path:
return samples_dir / "sample-libre-office.odt"


@pytest.fixture(scope="session")
def sample_google_docs_to_libre_office_writer_file(samples_dir: Path) -> Path:
return samples_dir / "sample.odt"


@pytest.fixture(scope="session")
def sample_google_docs_to_docx_file(samples_dir: Path) -> Path:
return samples_dir / "sample.docx"


@pytest.fixture(scope="session")
def sample_docx_file(samples_dir: Path) -> Path:
return samples_dir / "microsoft-sample.docx"


@pytest.fixture(scope="session")
def sample_doc_file(samples_dir: Path) -> Path:
return samples_dir / "sample.doc"


@pytest.fixture(scope="session")
def sample_html_file(samples_dir: Path) -> Path:
return samples_dir / "sample.html"


@pytest.fixture(scope="session")
def sample_office_doc_with_images_file(samples_dir: Path) -> Path:
return samples_dir / "test-document-images.odt"


@pytest.fixture(scope="session")
def sample_jpeg_file(samples_dir: Path) -> Path:
return samples_dir / "sample.jpg"


@pytest.fixture(scope="session")
def sample_png_file(samples_dir: Path) -> Path:
return samples_dir / "sample.png"


@pytest.fixture(scope="session")
def sample_ods_file(samples_dir: Path) -> Path:
return samples_dir / "sample-spreadsheet.ods"


@pytest.fixture(scope="session")
def sample_xlsx_file(samples_dir: Path) -> Path:
return samples_dir / "sample-spreadsheet.xlsx"


@pytest.fixture
def tika_client() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client:
def tika_client(tika_host: str) -> Generator[TikaClient, None, None]:
with TikaClient(tika_url=tika_host, log_level=logging.INFO) as client:
yield client


@pytest.fixture
def tika_client_compressed() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client:
def tika_client_compressed(tika_host: str) -> Generator[TikaClient, None, None]:
with TikaClient(tika_url=tika_host, log_level=logging.INFO, compress=True) as client:
yield client
80 changes: 61 additions & 19 deletions tests/test_datetime_formats.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,36 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from pathlib import Path

import magic
import pytest
from pytest_httpx import HTTPXMock

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey


class TestDateTimeFormat:
def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_utc(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -35,17 +42,24 @@ def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock:
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_zulu(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the Z format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -57,34 +71,48 @@ def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_positive(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(year=2023, month=6, day=17, hour=16, minute=30, second=44, tzinfo=timezone(timedelta(hours=8))),
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_negative(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(
Expand All @@ -99,32 +127,46 @@ def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_python_isoformat(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the ISO 8601 format (as done by Python)
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

expected = datetime.now(tz=timezone.utc)

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(expected, rel=timedelta(seconds=1))

def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_no_match(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time string which doesn't match the correct formats
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created is None
11 changes: 7 additions & 4 deletions tests/test_file_formats.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from datetime import datetime
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestLibreOfficeFormats:
def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
def test_parse_libre_office_writer_document(self, tika_client: TikaClient, sample_libre_office_writer_file: Path):
"""
Test handling of an ODT document produced by LibreOffice
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"
resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.tika.as_html.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.type == "application/vnd.oasis.opendocument.text"
assert resp.content is not None
assert (
"<body><p>This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023</p>\n</body>"
in resp.content
Expand Down
13 changes: 6 additions & 7 deletions tests/test_image_files.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestParseImageMetadata:
def test_image_jpeg(self, tika_client: TikaClient):
def test_image_jpeg(self, tika_client: TikaClient, sample_jpeg_file: Path):
"""
Test the handling of a JPEG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.jpg"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_jpeg_file, magic.from_file(str(sample_jpeg_file), mime=True))

assert resp.type == "image/jpeg"

def test_image_png(self, tika_client: TikaClient):
def test_image_png(self, tika_client: TikaClient, sample_png_file: Path):
"""
Test the handling of a PNG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.png"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_png_file, magic.from_file(str(sample_png_file), mime=True))

assert resp.type == "image/png"
Loading