diff --git a/mex/common/primary_source/helpers.py b/mex/common/primary_source/helpers.py new file mode 100644 index 00000000..8cf517e4 --- /dev/null +++ b/mex/common/primary_source/helpers.py @@ -0,0 +1,43 @@ +from functools import cache + +from mex.common.models import ( + ExtractedPrimarySource, +) +from mex.common.primary_source.extract import extract_seed_primary_sources +from mex.common.primary_source.transform import ( + transform_seed_primary_sources_to_extracted_primary_sources, +) + + +@cache +def get_all_extracted_primary_sources() -> list[ExtractedPrimarySource]: + """Extract and transform all primary sources. + + Extract the primary sources from the raw-data JSON file and transform them into + a list of ExtractedPrimarySources. + + Returns: + List of all ExtractedPrimarySources + """ + seed_primary_sources = extract_seed_primary_sources() + return list( + transform_seed_primary_sources_to_extracted_primary_sources( + seed_primary_sources + ) + ) + + +@cache +def get_extracted_primary_source_by_name(name: str) -> ExtractedPrimarySource | None: + """Pick the extracted primary source with the given name and return it. + + Args: + name: Name (`identifierInPrimarySource`) of the primary source + + Returns: + Extracted primary source if it was found, else None + """ + primary_sources_by_name = { + p.identifierInPrimarySource: p for p in get_all_extracted_primary_sources() + } + return primary_sources_by_name.get(name) diff --git a/mex/common/primary_source/transform.py b/mex/common/primary_source/transform.py index c23ce50a..7100a681 100644 --- a/mex/common/primary_source/transform.py +++ b/mex/common/primary_source/transform.py @@ -28,6 +28,7 @@ def transform_seed_primary_sources_to_extracted_primary_sources( ) +# TODO: Remove this in MX-1698 def get_primary_sources_by_name( extracted_primary_sources: Iterable[ExtractedPrimarySource], *names: str ) -> tuple[ExtractedPrimarySource, ...]: diff --git a/mex/common/wikidata/convenience.py b/mex/common/wikidata/convenience.py deleted file mode 100644 index 282674f5..00000000 --- a/mex/common/wikidata/convenience.py +++ /dev/null @@ -1,71 +0,0 @@ -from collections.abc import Callable, Iterable - -from mex.common.models import AnyExtractedModel, ExtractedPrimarySource -from mex.common.types import MergedOrganizationIdentifier, MergedPrimarySourceIdentifier -from mex.common.wikidata.extract import search_organization_by_label -from mex.common.wikidata.transform import ( - transform_wikidata_organization_to_extracted_organization, -) - - -class _QueryCache(dict[str, MergedOrganizationIdentifier]): - primary_source_and_load_function: tuple[ - MergedPrimarySourceIdentifier | None, int | None - ] = (None, None) - - -_ORGANIZATION_BY_QUERY_CACHE: _QueryCache = _QueryCache() - - -def get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string: str, - wikidata_primary_source: ExtractedPrimarySource, - load_function: Callable[[Iterable[AnyExtractedModel]], None], -) -> MergedOrganizationIdentifier | None: - """Get stableTargetId of an organization matching the query string. - - Search wikidata for organization, transform it into an ExtractedOrganization and - load it using the provided load_function. - - Args: - query_string: query string to search in wikidata - wikidata_primary_source: wikidata primary source - load_function: function to pass ExtractedOrganization to - - Returns: - ExtractedOrganization stableTargetId if one matching organization is found in - Wikidata lookup. - None if multiple matches / no organization is found - """ - primary_source_and_load_function = ( - wikidata_primary_source.stableTargetId, - id(load_function), - ) - if ( - _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function - != primary_source_and_load_function - ): - _ORGANIZATION_BY_QUERY_CACHE.clear() - _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function = ( - primary_source_and_load_function - ) - elif organization_id := _ORGANIZATION_BY_QUERY_CACHE.get(query_string): - return organization_id - - found_organization = search_organization_by_label(query_string) - - if found_organization is None: - return None - - extracted_organization = transform_wikidata_organization_to_extracted_organization( - found_organization, wikidata_primary_source - ) - - if extracted_organization is None: - return None - - load_function([extracted_organization]) - - _ORGANIZATION_BY_QUERY_CACHE[query_string] = extracted_organization.stableTargetId - - return extracted_organization.stableTargetId diff --git a/mex/common/wikidata/helpers.py b/mex/common/wikidata/helpers.py new file mode 100644 index 00000000..dc06d489 --- /dev/null +++ b/mex/common/wikidata/helpers.py @@ -0,0 +1,40 @@ +from functools import cache + +from mex.common.models import ExtractedOrganization, ExtractedPrimarySource +from mex.common.wikidata.extract import search_organization_by_label +from mex.common.wikidata.transform import ( + transform_wikidata_organization_to_extracted_organization, +) + + +@cache +def get_extracted_organization_from_wikidata( + query_string: str, + wikidata_primary_source: ExtractedPrimarySource, +) -> ExtractedOrganization | None: + """Get extracted organization matching the query string. + + Search wikidata for organization and transform it into an ExtractedOrganization. + + Args: + query_string: query string to search in wikidata + wikidata_primary_source: wikidata primary source + + Returns: + ExtractedOrganization if one matching organization is found in + Wikidata lookup. + None if multiple matches / no organization is found. + """ + found_organization = search_organization_by_label(query_string) + + if found_organization is None: + return None + + extracted_organization = transform_wikidata_organization_to_extracted_organization( + found_organization, wikidata_primary_source + ) + + if extracted_organization is None: + return None + + return extracted_organization diff --git a/tests/wikidata/test_convenience.py b/tests/wikidata/test_convenience.py deleted file mode 100644 index 7d7e7660..00000000 --- a/tests/wikidata/test_convenience.py +++ /dev/null @@ -1,130 +0,0 @@ -from unittest.mock import Mock - -import pytest -from pytest import MonkeyPatch - -from mex.common.models import ExtractedOrganization, ExtractedPrimarySource -from mex.common.wikidata import convenience -from mex.common.wikidata.convenience import ( - _ORGANIZATION_BY_QUERY_CACHE, - get_merged_organization_id_by_query_with_extract_transform_and_load, -) -from mex.common.wikidata.extract import search_organization_by_label -from mex.common.wikidata.models.organization import WikidataOrganization -from mex.common.wikidata.transform import ( - transform_wikidata_organization_to_extracted_organization, -) - - -@pytest.mark.usefixtures( - "mocked_wikidata", -) -def test_get_merged_organization_id_by_query_with_extract_transform_and_load_mocked( - wikidata_organization: WikidataOrganization, - extracted_primary_sources: dict[str, ExtractedPrimarySource], - monkeypatch: MonkeyPatch, -) -> None: - query_string = "Robert Koch-Institut" - wikidata_primary_source = extracted_primary_sources["wikidata"] - extracted_wikidata_organization = ( - transform_wikidata_organization_to_extracted_organization( - wikidata_organization, wikidata_primary_source - ) - ) - assert isinstance(extracted_wikidata_organization, ExtractedOrganization) - - # mock all the things - mocked_search_organization_by_label = Mock(side_effect=search_organization_by_label) - monkeypatch.setattr( - convenience, "search_organization_by_label", mocked_search_organization_by_label - ) - mocked_transform_wikidata_organization_to_extracted_organization = Mock( - side_effect=transform_wikidata_organization_to_extracted_organization - ) - monkeypatch.setattr( - convenience, - "transform_wikidata_organization_to_extracted_organization", - mocked_transform_wikidata_organization_to_extracted_organization, - ) - load_function = Mock() - - # organization found and transformed - _ORGANIZATION_BY_QUERY_CACHE.clear() - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string, wikidata_primary_source, load_function - ) - assert returned == extracted_wikidata_organization.stableTargetId - mocked_search_organization_by_label.assert_called_once_with(query_string) - mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( - wikidata_organization, wikidata_primary_source - ) - load_function.assert_called_once_with([extracted_wikidata_organization]) - - # make sure caching works - mocked_search_organization_by_label.reset_mock() - mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() - load_function.reset_mock() - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string, wikidata_primary_source, load_function - ) - assert returned == extracted_wikidata_organization.stableTargetId - mocked_search_organization_by_label.assert_not_called() - mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called() - load_function.assert_not_called() - - # make sure cache is reset for different load function - mocked_search_organization_by_label.reset_mock() - mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() - load_function = Mock() - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string, wikidata_primary_source, load_function - ) - assert returned == extracted_wikidata_organization.stableTargetId - mocked_search_organization_by_label.assert_called_once_with(query_string) - mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( - wikidata_organization, wikidata_primary_source - ) - load_function.assert_called_once_with([extracted_wikidata_organization]) - - # transformation returns no organization - mocked_search_organization_by_label.reset_mock() - mocked_transform_wikidata_organization_to_extracted_organization.side_effect = None - mocked_transform_wikidata_organization_to_extracted_organization.return_value = None - mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() - load_function.reset_mock() - _ORGANIZATION_BY_QUERY_CACHE.clear() - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string, wikidata_primary_source, load_function - ) - assert returned is None - mocked_search_organization_by_label.assert_called_once_with(query_string) - mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( - wikidata_organization, wikidata_primary_source - ) - load_function.assert_not_called() - - # search returns no organization - mocked_search_organization_by_label.side_effect = None - mocked_search_organization_by_label.return_value = None - mocked_search_organization_by_label.reset_mock() - mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() - load_function.reset_mock() - _ORGANIZATION_BY_QUERY_CACHE.clear() - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - query_string, wikidata_primary_source, load_function - ) - assert returned is None - mocked_search_organization_by_label.assert_called_once_with(query_string) - mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called() - load_function.assert_not_called() - - -@pytest.mark.integration -def test_get_merged_organization_id_by_query_with_extract_transform_and_load( - extracted_primary_sources: dict[str, ExtractedPrimarySource], -) -> None: - wikidata_primary_source = extracted_primary_sources["wikidata"] - returned = get_merged_organization_id_by_query_with_extract_transform_and_load( - "Robert Koch-Institut", wikidata_primary_source, lambda _: None - ) - assert returned == "ga6xh6pgMwgq7DC7r6Wjqg"