From c9149ced9f5c3658d568d53d71236cf77ada3695 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eric=20He=C3=9Fe?= <hessee@rki.de>
Date: Thu, 12 Sep 2024 13:33:15 +0200
Subject: [PATCH] rework wikidata and add primary source helpers

---
 mex/common/primary_source/helpers.py   |  43 ++++++++
 mex/common/primary_source/transform.py |   1 +
 mex/common/wikidata/convenience.py     |  71 --------------
 mex/common/wikidata/helpers.py         |  40 ++++++++
 tests/wikidata/test_convenience.py     | 130 -------------------------
 5 files changed, 84 insertions(+), 201 deletions(-)
 create mode 100644 mex/common/primary_source/helpers.py
 delete mode 100644 mex/common/wikidata/convenience.py
 create mode 100644 mex/common/wikidata/helpers.py
 delete mode 100644 tests/wikidata/test_convenience.py

diff --git a/mex/common/primary_source/helpers.py b/mex/common/primary_source/helpers.py
new file mode 100644
index 00000000..8cf517e4
--- /dev/null
+++ b/mex/common/primary_source/helpers.py
@@ -0,0 +1,43 @@
+from functools import cache
+
+from mex.common.models import (
+    ExtractedPrimarySource,
+)
+from mex.common.primary_source.extract import extract_seed_primary_sources
+from mex.common.primary_source.transform import (
+    transform_seed_primary_sources_to_extracted_primary_sources,
+)
+
+
+@cache
+def get_all_extracted_primary_sources() -> list[ExtractedPrimarySource]:
+    """Return all seed primary sources as ExtractedPrimarySources.
+
+    The result is memoized by `functools.cache`, so every call returns the same
+    list object; callers should treat it as read-only.
+
+    Returns:
+        List of all ExtractedPrimarySources
+    """
+    extracted_primary_sources = (
+        transform_seed_primary_sources_to_extracted_primary_sources(
+            extract_seed_primary_sources()
+        )
+    )
+    return list(extracted_primary_sources)
+
+
+@cache
+def get_extracted_primary_source_by_name(name: str) -> ExtractedPrimarySource | None:
+    """Look up a single extracted primary source by its name.
+
+    Args:
+        name: Name (`identifierInPrimarySource`) of the primary source
+
+    Returns:
+        The matching extracted primary source, or None if there is no match
+    """
+    candidates = reversed(get_all_extracted_primary_sources())
+    # search back to front, so the last source with that name wins
+    matches = (p for p in candidates if p.identifierInPrimarySource == name)
+    return next(matches, None)
diff --git a/mex/common/primary_source/transform.py b/mex/common/primary_source/transform.py
index c23ce50a..7100a681 100644
--- a/mex/common/primary_source/transform.py
+++ b/mex/common/primary_source/transform.py
@@ -28,6 +28,7 @@ def transform_seed_primary_sources_to_extracted_primary_sources(
         )
 
 
+# TODO: Remove this in MX-1698
 def get_primary_sources_by_name(
     extracted_primary_sources: Iterable[ExtractedPrimarySource], *names: str
 ) -> tuple[ExtractedPrimarySource, ...]:
diff --git a/mex/common/wikidata/convenience.py b/mex/common/wikidata/convenience.py
deleted file mode 100644
index 282674f5..00000000
--- a/mex/common/wikidata/convenience.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from collections.abc import Callable, Iterable
-
-from mex.common.models import AnyExtractedModel, ExtractedPrimarySource
-from mex.common.types import MergedOrganizationIdentifier, MergedPrimarySourceIdentifier
-from mex.common.wikidata.extract import search_organization_by_label
-from mex.common.wikidata.transform import (
-    transform_wikidata_organization_to_extracted_organization,
-)
-
-
-class _QueryCache(dict[str, MergedOrganizationIdentifier]):
-    primary_source_and_load_function: tuple[
-        MergedPrimarySourceIdentifier | None, int | None
-    ] = (None, None)
-
-
-_ORGANIZATION_BY_QUERY_CACHE: _QueryCache = _QueryCache()
-
-
-def get_merged_organization_id_by_query_with_extract_transform_and_load(
-    query_string: str,
-    wikidata_primary_source: ExtractedPrimarySource,
-    load_function: Callable[[Iterable[AnyExtractedModel]], None],
-) -> MergedOrganizationIdentifier | None:
-    """Get stableTargetId of an organization matching the query string.
-
-    Search wikidata for organization, transform it into an ExtractedOrganization and
-      load it using the provided load_function.
-
-    Args:
-         query_string: query string to search in wikidata
-         wikidata_primary_source: wikidata primary source
-         load_function: function to pass ExtractedOrganization to
-
-    Returns:
-         ExtractedOrganization stableTargetId if one matching organization is found in
-           Wikidata lookup.
-         None if multiple matches / no organization is found
-    """
-    primary_source_and_load_function = (
-        wikidata_primary_source.stableTargetId,
-        id(load_function),
-    )
-    if (
-        _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function
-        != primary_source_and_load_function
-    ):
-        _ORGANIZATION_BY_QUERY_CACHE.clear()
-        _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function = (
-            primary_source_and_load_function
-        )
-    elif organization_id := _ORGANIZATION_BY_QUERY_CACHE.get(query_string):
-        return organization_id
-
-    found_organization = search_organization_by_label(query_string)
-
-    if found_organization is None:
-        return None
-
-    extracted_organization = transform_wikidata_organization_to_extracted_organization(
-        found_organization, wikidata_primary_source
-    )
-
-    if extracted_organization is None:
-        return None
-
-    load_function([extracted_organization])
-
-    _ORGANIZATION_BY_QUERY_CACHE[query_string] = extracted_organization.stableTargetId
-
-    return extracted_organization.stableTargetId
diff --git a/mex/common/wikidata/helpers.py b/mex/common/wikidata/helpers.py
new file mode 100644
index 00000000..dc06d489
--- /dev/null
+++ b/mex/common/wikidata/helpers.py
@@ -0,0 +1,40 @@
+from functools import cache
+
+from mex.common.models import ExtractedOrganization, ExtractedPrimarySource
+from mex.common.wikidata.extract import search_organization_by_label
+from mex.common.wikidata.transform import (
+    transform_wikidata_organization_to_extracted_organization,
+)
+
+
+@cache
+def get_extracted_organization_from_wikidata(
+    query_string: str,
+    wikidata_primary_source: ExtractedPrimarySource,
+) -> ExtractedOrganization | None:
+    """Look up an organization in wikidata and return it in extracted form.
+
+    The query string is passed to `search_organization_by_label`; a hit is then
+    transformed into an ExtractedOrganization for the given primary source.
+
+    Results, including None, are memoized per argument combination by
+    `functools.cache`, which requires both arguments to be hashable.
+    NOTE(review): confirm that ExtractedPrimarySource instances are hashable.
+
+    Args:
+        query_string: query string to search in wikidata
+        wikidata_primary_source: wikidata primary source
+
+    Returns:
+        ExtractedOrganization if the search yields an organization and the
+           transformation succeeds.
+        None if the search or the transformation yields nothing.
+    """
+    wikidata_organization = search_organization_by_label(query_string)
+
+    if wikidata_organization is not None:
+        # the transformation may itself return None, which is passed through
+        return transform_wikidata_organization_to_extracted_organization(
+            wikidata_organization, wikidata_primary_source
+        )
+    return None
diff --git a/tests/wikidata/test_convenience.py b/tests/wikidata/test_convenience.py
deleted file mode 100644
index 7d7e7660..00000000
--- a/tests/wikidata/test_convenience.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from unittest.mock import Mock
-
-import pytest
-from pytest import MonkeyPatch
-
-from mex.common.models import ExtractedOrganization, ExtractedPrimarySource
-from mex.common.wikidata import convenience
-from mex.common.wikidata.convenience import (
-    _ORGANIZATION_BY_QUERY_CACHE,
-    get_merged_organization_id_by_query_with_extract_transform_and_load,
-)
-from mex.common.wikidata.extract import search_organization_by_label
-from mex.common.wikidata.models.organization import WikidataOrganization
-from mex.common.wikidata.transform import (
-    transform_wikidata_organization_to_extracted_organization,
-)
-
-
-@pytest.mark.usefixtures(
-    "mocked_wikidata",
-)
-def test_get_merged_organization_id_by_query_with_extract_transform_and_load_mocked(
-    wikidata_organization: WikidataOrganization,
-    extracted_primary_sources: dict[str, ExtractedPrimarySource],
-    monkeypatch: MonkeyPatch,
-) -> None:
-    query_string = "Robert Koch-Institut"
-    wikidata_primary_source = extracted_primary_sources["wikidata"]
-    extracted_wikidata_organization = (
-        transform_wikidata_organization_to_extracted_organization(
-            wikidata_organization, wikidata_primary_source
-        )
-    )
-    assert isinstance(extracted_wikidata_organization, ExtractedOrganization)
-
-    # mock all the things
-    mocked_search_organization_by_label = Mock(side_effect=search_organization_by_label)
-    monkeypatch.setattr(
-        convenience, "search_organization_by_label", mocked_search_organization_by_label
-    )
-    mocked_transform_wikidata_organization_to_extracted_organization = Mock(
-        side_effect=transform_wikidata_organization_to_extracted_organization
-    )
-    monkeypatch.setattr(
-        convenience,
-        "transform_wikidata_organization_to_extracted_organization",
-        mocked_transform_wikidata_organization_to_extracted_organization,
-    )
-    load_function = Mock()
-
-    # organization found and transformed
-    _ORGANIZATION_BY_QUERY_CACHE.clear()
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        query_string, wikidata_primary_source, load_function
-    )
-    assert returned == extracted_wikidata_organization.stableTargetId
-    mocked_search_organization_by_label.assert_called_once_with(query_string)
-    mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with(
-        wikidata_organization, wikidata_primary_source
-    )
-    load_function.assert_called_once_with([extracted_wikidata_organization])
-
-    # make sure caching works
-    mocked_search_organization_by_label.reset_mock()
-    mocked_transform_wikidata_organization_to_extracted_organization.reset_mock()
-    load_function.reset_mock()
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        query_string, wikidata_primary_source, load_function
-    )
-    assert returned == extracted_wikidata_organization.stableTargetId
-    mocked_search_organization_by_label.assert_not_called()
-    mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called()
-    load_function.assert_not_called()
-
-    # make sure cache is reset for different load function
-    mocked_search_organization_by_label.reset_mock()
-    mocked_transform_wikidata_organization_to_extracted_organization.reset_mock()
-    load_function = Mock()
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        query_string, wikidata_primary_source, load_function
-    )
-    assert returned == extracted_wikidata_organization.stableTargetId
-    mocked_search_organization_by_label.assert_called_once_with(query_string)
-    mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with(
-        wikidata_organization, wikidata_primary_source
-    )
-    load_function.assert_called_once_with([extracted_wikidata_organization])
-
-    # transformation returns no organization
-    mocked_search_organization_by_label.reset_mock()
-    mocked_transform_wikidata_organization_to_extracted_organization.side_effect = None
-    mocked_transform_wikidata_organization_to_extracted_organization.return_value = None
-    mocked_transform_wikidata_organization_to_extracted_organization.reset_mock()
-    load_function.reset_mock()
-    _ORGANIZATION_BY_QUERY_CACHE.clear()
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        query_string, wikidata_primary_source, load_function
-    )
-    assert returned is None
-    mocked_search_organization_by_label.assert_called_once_with(query_string)
-    mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with(
-        wikidata_organization, wikidata_primary_source
-    )
-    load_function.assert_not_called()
-
-    # search returns no organization
-    mocked_search_organization_by_label.side_effect = None
-    mocked_search_organization_by_label.return_value = None
-    mocked_search_organization_by_label.reset_mock()
-    mocked_transform_wikidata_organization_to_extracted_organization.reset_mock()
-    load_function.reset_mock()
-    _ORGANIZATION_BY_QUERY_CACHE.clear()
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        query_string, wikidata_primary_source, load_function
-    )
-    assert returned is None
-    mocked_search_organization_by_label.assert_called_once_with(query_string)
-    mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called()
-    load_function.assert_not_called()
-
-
-@pytest.mark.integration
-def test_get_merged_organization_id_by_query_with_extract_transform_and_load(
-    extracted_primary_sources: dict[str, ExtractedPrimarySource],
-) -> None:
-    wikidata_primary_source = extracted_primary_sources["wikidata"]
-    returned = get_merged_organization_id_by_query_with_extract_transform_and_load(
-        "Robert Koch-Institut", wikidata_primary_source, lambda _: None
-    )
-    assert returned == "ga6xh6pgMwgq7DC7r6Wjqg"