From 336204b685781bf9db42a722d535529111f3742e Mon Sep 17 00:00:00 2001 From: rababerladuseladim Date: Wed, 7 Aug 2024 13:15:56 +0200 Subject: [PATCH] feature/mx1613 wikidata convenience function (#250) # PR Context # Added - wikidata fixtures to pytest plugin: wikidata_organization_raw, wikidata_organization, mocked_wikidata - convenience function `get_merged_organization_id_by_query_with_extract_transform_and_load` for getting the stableTargetId of an organization, while transforming and loading the organization using the provided load function --------- Co-authored-by: Janina Esins --- CHANGELOG.md | 6 + mex/common/testing/plugin.py | 83 ++++++- .../test_data/wikidata_organization_raw.json | 228 ++++++++++++++++++ mex/common/wikidata/convenience.py | 71 ++++++ tests/wikidata/test_convenience.py | 130 ++++++++++ 5 files changed, 516 insertions(+), 2 deletions(-) create mode 100644 mex/common/testing/test_data/wikidata_organization_raw.json create mode 100644 mex/common/wikidata/convenience.py create mode 100644 tests/wikidata/test_convenience.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 136645c5..ba32a791 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- wikidata fixtures to pytest plugin: wikidata_organization_raw, wikidata_organization, + mocked_wikidata +- convenience function `get_merged_organization_id_by_query_with_extract_transform_and_load` + for getting the stableTargetId of an organization, while transforming and loading the + organization using the provided load function + ### Changes ### Deprecated diff --git a/mex/common/testing/plugin.py b/mex/common/testing/plugin.py index f76e7755..1715fd66 100644 --- a/mex/common/testing/plugin.py +++ b/mex/common/testing/plugin.py @@ -4,15 +4,18 @@ to the `conftest.py` in your root test folder. """ +import json import os from collections.abc import Generator from enum import Enum from pathlib import Path -from typing import Any -from unittest.mock import MagicMock +from typing import Any, cast +from unittest.mock import MagicMock, Mock +import requests from langdetect import DetectorFactory from pydantic import AnyUrl +from requests import Response from mex.common.connector import CONNECTOR_STORE from mex.common.models import ExtractedPrimarySource @@ -21,6 +24,11 @@ transform_seed_primary_sources_to_extracted_primary_sources, ) from mex.common.settings import SETTINGS_STORE, BaseSettings +from mex.common.wikidata.connector import ( + WikidataAPIConnector, + WikidataQueryServiceConnector, +) +from mex.common.wikidata.models.organization import WikidataOrganization class NoOpPytest: @@ -120,3 +128,74 @@ def extracted_primary_sources() -> dict[str, ExtractedPrimarySource]: ) ) return {p.identifierInPrimarySource: p for p in extracted_primary_sources} + + +@pytest.fixture +def wikidata_organization_raw() -> dict[str, Any]: + """Return a raw wikidata organization.""" + with open( + Path(__file__).parent / "test_data" / "wikidata_organization_raw.json" + ) as fh: + return cast(dict[str, Any], json.load(fh)) + + +@pytest.fixture +def wikidata_organization( + wikidata_organization_raw: dict[str, Any], +) -> WikidataOrganization: + """Return a wikidata organization instance.""" + return WikidataOrganization.model_validate(wikidata_organization_raw) + + +@pytest.fixture +def mocked_wikidata( + monkeypatch: pytest.MonkeyPatch, wikidata_organization_raw: dict[str, Any] +) -> None: + """Mock wikidata connector.""" + response_query = Mock(spec=Response, status_code=200) + + session = MagicMock(spec=requests.Session) + session.get = MagicMock(side_effect=[response_query]) + + def mocked_init(self: WikidataQueryServiceConnector) -> None: + self.session = session + + monkeypatch.setattr(WikidataQueryServiceConnector, "__init__", mocked_init) + monkeypatch.setattr(WikidataAPIConnector, "__init__", mocked_init) + + # mock search_wikidata_with_query + + def get_data_by_query( + self: WikidataQueryServiceConnector, query: str + ) -> list[dict[str, dict[str, str]]]: + return [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q26678", + }, + "itemLabel": {"xml:lang": "en", "type": "literal", "value": "BMW"}, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "German automotive manufacturer, and conglomerate", + }, + }, + ] + + monkeypatch.setattr( + WikidataQueryServiceConnector, "get_data_by_query", get_data_by_query + ) + + # mock get_wikidata_org_with_org_id + + def get_wikidata_item_details_by_id( + self: WikidataAPIConnector, item_id: str + ) -> dict[str, str]: + return wikidata_organization_raw + + monkeypatch.setattr( + WikidataAPIConnector, + "get_wikidata_item_details_by_id", + get_wikidata_item_details_by_id, + ) diff --git a/mex/common/testing/test_data/wikidata_organization_raw.json b/mex/common/testing/test_data/wikidata_organization_raw.json new file mode 100644 index 00000000..d8afc9ca --- /dev/null +++ b/mex/common/testing/test_data/wikidata_organization_raw.json @@ -0,0 +1,228 @@ +{ + "aliases": { + "de": [ + { + "language": "de", + "value": "alias_de_1" + }, + { + "language": "de", + "value": "alias_de_2" + }, + { + "language": "de", + "value": "alias_de_3" + } + ], + "en": [ + { + "language": "en", + "value": "alias_en_1" + }, + { + "language": "en", + "value": "alias_en_2" + }, + { + "language": "en", + "value": "alias_en_3" + }, + { + "language": "en", + "value": "alias_en_4" + } + ] + }, + "claims": { + "P1813": [ + { + "id": "Q679041$AAE01E9A-03EA-424E-A51A-222A4858C4DD", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "en", + "text": "RKI" + } + }, + "hash": "6cd9c230521797cef15c529e5bb006a0c51e801e", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$20A515C6-206D-4001-A408-4DA10F41533A", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "de", + "text": "RKI" + } + }, + "hash": "03dcb3e47ca24e8ab90a1b11eb7602ceca2d07ad", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$ac3e29c1-4ace-df94-91f7-d74b410c3582", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "fr", + "text": "IRK" + } + }, + "hash": "966f7d0aee390d96edaafd00d04a07ec88844a1e", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P213": [ + { + "id": "Q679041$0ABA944D-81E3-4ED0-A792-52EC80175170", + "mainsnak": { + "datatype": "external-id", + "datavalue": { + "type": "string", + "value": "0000 0001 0940 3744" + }, + "hash": "17d825de2b5559de23b14b54519731a55a733ba4", + "property": "P213", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P6782": [ + { + "id": "Q679041$42EED77F-B584-48C1-B1D7-DD1C27815BA6", + "mainsnak": { + "datatype": "external-id", + "datavalue": { + "type": "string", + "value": "01k5qnb77" + }, + "hash": "dd1172552e08b0ce0ac4f5af1c3b086fe95f4bdb", + "property": "P6782", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P856": [ + { + "id": "Q679041$ccd210f4-4f33-9140-5060-a83edd44a7f2", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/" + }, + "hash": "d07d9f8d73b9fa174b86cbbc7c5d3154f84e7a29", + "property": "P856", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$3FE8023E-41AE-4DB3-B0B7-51419DA6CAE7", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/DE/Home/homepage_node.html" + }, + "hash": "4da1521afde56c04ad95ba5d0b5977dc4cda248f", + "property": "P856", + "snaktype": "value" + }, + "qualifiers": { + "P407": [ + { + "datatype": "wikibase-item", + "datavalue": { + "type": "wikibase-entityid", + "value": { + "entity-type": "item", + "id": "Q188", + "numeric-id": 188 + } + }, + "hash": "46bfd327b830f66f7061ea92d1be430c135fa91f", + "property": "P407", + "snaktype": "value" + } + ] + }, + "qualifiers-order": [ + "P407" + ], + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$324BC651-7212-4CE7-89A1-9E9135AAAA09", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/EN/Home/homepage_node.html" + }, + "hash": "9e7237708fdfec88603db5ead3645b9d5d825808", + "property": "P856", + "snaktype": "value" + }, + "qualifiers": { + "P407": [ + { + "datatype": "wikibase-item", + "datavalue": { + "type": "wikibase-entityid", + "value": { + "entity-type": "item", + "id": "Q1860", + "numeric-id": 1860 + } + }, + "hash": "daf1c4fcb58181b02dff9cc89deb084004ddae4b", + "property": "P407", + "snaktype": "value" + } + ] + }, + "qualifiers-order": [ + "P407" + ], + "rank": "normal", + "type": "statement" + } + ] + }, + "id": "Q679041", + "labels": { + "de": { + "language": "de", + "value": "Robert Koch-Institut" + }, + "en": { + "language": "en", + "value": "Robert Koch Institute" + } + } +} diff --git a/mex/common/wikidata/convenience.py b/mex/common/wikidata/convenience.py new file mode 100644 index 00000000..e8fe6c3d --- /dev/null +++ b/mex/common/wikidata/convenience.py @@ -0,0 +1,71 @@ +from collections.abc import Callable, Iterable + +from mex.common.models import ExtractedData, ExtractedPrimarySource +from mex.common.types import MergedOrganizationIdentifier, MergedPrimarySourceIdentifier +from mex.common.wikidata.extract import search_organization_by_label +from mex.common.wikidata.transform import ( + transform_wikidata_organization_to_extracted_organization, +) + + +class _QueryCache(dict[str, MergedOrganizationIdentifier]): + primary_source_and_load_function: tuple[ + MergedPrimarySourceIdentifier | None, int | None + ] = (None, None) + + +_ORGANIZATION_BY_QUERY_CACHE: _QueryCache = _QueryCache() + + +def get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string: str, + wikidata_primary_source: ExtractedPrimarySource, + load_function: Callable[[Iterable[ExtractedData]], None], +) -> MergedOrganizationIdentifier | None: + """Get stableTargetId of an organization matching the query string. + + Search wikidata for organization, transform it into an ExtractedOrganization and + load it using the provided load_function. + + Args: + query_string: query string to search in wikidata + wikidata_primary_source: wikidata primary source + load_function: function to pass ExtractedOrganization to + + Returns: + ExtractedOrganization stableTargetId if one matching organization is found in + Wikidata lookup. + None if multiple matches / no organization is found + """ + primary_source_and_load_function = ( + wikidata_primary_source.stableTargetId, + id(load_function), + ) + if ( + _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function + != primary_source_and_load_function + ): + _ORGANIZATION_BY_QUERY_CACHE.clear() + _ORGANIZATION_BY_QUERY_CACHE.primary_source_and_load_function = ( + primary_source_and_load_function + ) + elif organization_id := _ORGANIZATION_BY_QUERY_CACHE.get(query_string): + return organization_id + + found_organization = search_organization_by_label(query_string) + + if found_organization is None: + return None + + extracted_organization = transform_wikidata_organization_to_extracted_organization( + found_organization, wikidata_primary_source + ) + + if extracted_organization is None: + return None + + load_function([extracted_organization]) + + _ORGANIZATION_BY_QUERY_CACHE[query_string] = extracted_organization.stableTargetId + + return extracted_organization.stableTargetId diff --git a/tests/wikidata/test_convenience.py b/tests/wikidata/test_convenience.py new file mode 100644 index 00000000..7d7e7660 --- /dev/null +++ b/tests/wikidata/test_convenience.py @@ -0,0 +1,130 @@ +from unittest.mock import Mock + +import pytest +from pytest import MonkeyPatch + +from mex.common.models import ExtractedOrganization, ExtractedPrimarySource +from mex.common.wikidata import convenience +from mex.common.wikidata.convenience import ( + _ORGANIZATION_BY_QUERY_CACHE, + get_merged_organization_id_by_query_with_extract_transform_and_load, +) +from mex.common.wikidata.extract import search_organization_by_label +from mex.common.wikidata.models.organization import WikidataOrganization +from mex.common.wikidata.transform import ( + transform_wikidata_organization_to_extracted_organization, +) + + +@pytest.mark.usefixtures( + "mocked_wikidata", +) +def test_get_merged_organization_id_by_query_with_extract_transform_and_load_mocked( + wikidata_organization: WikidataOrganization, + extracted_primary_sources: dict[str, ExtractedPrimarySource], + monkeypatch: MonkeyPatch, +) -> None: + query_string = "Robert Koch-Institut" + wikidata_primary_source = extracted_primary_sources["wikidata"] + extracted_wikidata_organization = ( + transform_wikidata_organization_to_extracted_organization( + wikidata_organization, wikidata_primary_source + ) + ) + assert isinstance(extracted_wikidata_organization, ExtractedOrganization) + + # mock all the things + mocked_search_organization_by_label = Mock(side_effect=search_organization_by_label) + monkeypatch.setattr( + convenience, "search_organization_by_label", mocked_search_organization_by_label + ) + mocked_transform_wikidata_organization_to_extracted_organization = Mock( + side_effect=transform_wikidata_organization_to_extracted_organization + ) + monkeypatch.setattr( + convenience, + "transform_wikidata_organization_to_extracted_organization", + mocked_transform_wikidata_organization_to_extracted_organization, + ) + load_function = Mock() + + # organization found and transformed + _ORGANIZATION_BY_QUERY_CACHE.clear() + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string, wikidata_primary_source, load_function + ) + assert returned == extracted_wikidata_organization.stableTargetId + mocked_search_organization_by_label.assert_called_once_with(query_string) + mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( + wikidata_organization, wikidata_primary_source + ) + load_function.assert_called_once_with([extracted_wikidata_organization]) + + # make sure caching works + mocked_search_organization_by_label.reset_mock() + mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() + load_function.reset_mock() + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string, wikidata_primary_source, load_function + ) + assert returned == extracted_wikidata_organization.stableTargetId + mocked_search_organization_by_label.assert_not_called() + mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called() + load_function.assert_not_called() + + # make sure cache is reset for different load function + mocked_search_organization_by_label.reset_mock() + mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() + load_function = Mock() + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string, wikidata_primary_source, load_function + ) + assert returned == extracted_wikidata_organization.stableTargetId + mocked_search_organization_by_label.assert_called_once_with(query_string) + mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( + wikidata_organization, wikidata_primary_source + ) + load_function.assert_called_once_with([extracted_wikidata_organization]) + + # transformation returns no organization + mocked_search_organization_by_label.reset_mock() + mocked_transform_wikidata_organization_to_extracted_organization.side_effect = None + mocked_transform_wikidata_organization_to_extracted_organization.return_value = None + mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() + load_function.reset_mock() + _ORGANIZATION_BY_QUERY_CACHE.clear() + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string, wikidata_primary_source, load_function + ) + assert returned is None + mocked_search_organization_by_label.assert_called_once_with(query_string) + mocked_transform_wikidata_organization_to_extracted_organization.assert_called_once_with( + wikidata_organization, wikidata_primary_source + ) + load_function.assert_not_called() + + # search returns no organization + mocked_search_organization_by_label.side_effect = None + mocked_search_organization_by_label.return_value = None + mocked_search_organization_by_label.reset_mock() + mocked_transform_wikidata_organization_to_extracted_organization.reset_mock() + load_function.reset_mock() + _ORGANIZATION_BY_QUERY_CACHE.clear() + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + query_string, wikidata_primary_source, load_function + ) + assert returned is None + mocked_search_organization_by_label.assert_called_once_with(query_string) + mocked_transform_wikidata_organization_to_extracted_organization.assert_not_called() + load_function.assert_not_called() + + +@pytest.mark.integration +def test_get_merged_organization_id_by_query_with_extract_transform_and_load( + extracted_primary_sources: dict[str, ExtractedPrimarySource], +) -> None: + wikidata_primary_source = extracted_primary_sources["wikidata"] + returned = get_merged_organization_id_by_query_with_extract_transform_and_load( + "Robert Koch-Institut", wikidata_primary_source, lambda _: None + ) + assert returned == "ga6xh6pgMwgq7DC7r6Wjqg"