From 3668519a13aa80839456e16db16a5b9d52686c8e Mon Sep 17 00:00:00 2001 From: Kamran Ali <33874616+mr-kamran-ali@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:50:11 +0200 Subject: [PATCH] feature/mx-1658 fix wikidata 403 bug (#240) ### Added - HTTP connector backoff for 10 retries on 403 from server - `rki/mex` user agent is sent with query requests via wikidata connector ### Changes - update wikidata search organization request query, with optional language parameter wikidata query search can be enhanced by specifying the language. EN is the default language. --- CHANGELOG.md | 10 ++++++++- mex/common/connector/http.py | 8 ++++++- mex/common/settings.py | 6 +++++ mex/common/wikidata/connector.py | 15 ++++++++++--- mex/common/wikidata/extract.py | 38 ++++++++++++++++++++------------ tests/wikidata/test_extract.py | 18 ++++----------- 6 files changed, 62 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07a3ac66..5736af50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- HTTP connector backoff for 10 retries on 403 from server +- `rki/mex` user agent is sent with query requests via wikidata connector + ### Changes +- update wikidata search organization request query, with optional language parameter + wikidata query search can be enhanced by specifying the language. + EN is the default language. + ### Deprecated ### Removed @@ -36,8 +43,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.31.0] - 2024-07-17 ### Removed + - BREAKING: ability to store different settings instances at the same time. Dependent - repositories now must bundle all settings in a single class. + repositories now must bundle all settings in a single class. ## [0.30.0] - 2024-07-16 diff --git a/mex/common/connector/http.py b/mex/common/connector/http.py index db624fb4..8e037954 100644 --- a/mex/common/connector/http.py +++ b/mex/common/connector/http.py @@ -74,7 +74,8 @@ def request( else: url = self.url kwargs.setdefault("timeout", self.TIMEOUT) - kwargs.setdefault("headers", {}) + if not kwargs.get("headers"): + kwargs.setdefault("headers", {}) kwargs["headers"].setdefault("Accept", "application/json") if payload: @@ -106,6 +107,11 @@ def request( lambda response: cast(Response, response).status_code == 429, max_tries=10, ) + @backoff.on_predicate( + backoff.fibo, + lambda response: cast(Response, response).status_code == 403, + max_tries=10, + ) @backoff.on_exception(backoff.fibo, RequestException, max_tries=6) def _send_request( self, method: str, url: str, params: dict[str, str] | None, **kwargs: Any diff --git a/mex/common/settings.py b/mex/common/settings.py index 39e28e7e..fbdd7db4 100644 --- a/mex/common/settings.py +++ b/mex/common/settings.py @@ -180,6 +180,12 @@ def get(cls) -> Self: "organization ID", validation_alias="MEX_WIKI_QUERY_SERVICE_URL", ) + mex_web_user_agent: str = Field( + "rki/mex", + description="a user agent is sent in the header of some requests to external " + "services ", + validation_alias="MEX_WEB_USER_AGENT", + ) def text(self) -> str: """Dump the current settings into a readable table.""" diff --git a/mex/common/wikidata/connector.py b/mex/common/wikidata/connector.py index d53ad723..a055b744 100644 --- a/mex/common/wikidata/connector.py +++ b/mex/common/wikidata/connector.py @@ -28,9 +28,14 @@ def get_data_by_query(self, query: str) -> list[dict[str, dict[str, str]]]: Returns: list: list of all items found """ + settings = BaseSettings.get() params = {"format": "json", "query": query} + headers = { + "User-Agent": f"{settings.mex_web_user_agent}", + "Api-User-Agent": f"{settings.mex_web_user_agent}", + } - results = self.request("GET", params=params) + results = self.request("GET", params=params, headers=headers) return results["results"]["bindings"] # type: ignore @@ -59,6 +64,7 @@ def get_wikidata_item_details_by_id(self, item_id: str) -> dict[str, str]: Returns: dict[str, Any]: details of the found item. """ + settings = BaseSettings.get() params = { "action": "wbgetentities", "format": "json", @@ -77,6 +83,9 @@ def get_wikidata_item_details_by_id(self, item_id: str) -> dict[str, str]: ), "formatversion": "2", } - - results = self.request("GET", params=params) + headers = { + "User-Agent": f"{settings.mex_web_user_agent}", + "Api-User-Agent": f"{settings.mex_web_user_agent}", + } + results = self.request("GET", params=params, headers=headers) return results["entities"][item_id] # type: ignore diff --git a/mex/common/wikidata/extract.py b/mex/common/wikidata/extract.py index 8ecdd8ee..1aa1e5ff 100644 --- a/mex/common/wikidata/extract.py +++ b/mex/common/wikidata/extract.py @@ -11,31 +11,41 @@ def search_organization_by_label( item_label: str, + lang: TextLanguage = TextLanguage.EN, ) -> WikidataOrganization | None: - """Search for an item in wikidata. Only organizations are fetched. + """Search for an item in wikidata. Only organizations are searched. Args: item_label: Item title or label to be searched + lang: lang in which item should be searched. Default: TextLanguage.EN Returns: - WikidataOrganization if only one organization is found - None if no or multiple organizations are found + WikidataOrganization if organization is found + None if no organization is found """ connector = WikidataQueryServiceConnector.get() item_label = item_label.replace('"', "") query_string = ( "SELECT distinct ?item ?itemLabel ?itemDescription " - "WHERE{" - "?item (wdt:P31/wdt:P8225*/wdt:P279*) wd:Q43229." - f'?item ?label "{item_label}"@en.' - "?article schema:about ?item ." - 'SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }' - "}" + "WHERE { " + "SERVICE wikibase:mwapi { " + 'bd:serviceParam wikibase:api "EntitySearch" . ' + 'bd:serviceParam wikibase:endpoint "www.wikidata.org" . ' + f'bd:serviceParam mwapi:search "{item_label}" . ' + f'bd:serviceParam mwapi:language "{lang}" . ' + "?item wikibase:apiOutputItem mwapi:item . " + "?num wikibase:apiOrdinal true . " + "} " + "?item (wdt:P31/wdt:P8225*/wdt:P279*) wd:Q43229. " + 'SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,de". } ' # noqa: E501 + "} " + "ORDER BY ASC(?num) " + "LIMIT 1 " ) results = connector.get_data_by_query(query_string) - if len(results) != 1: + if not results: return None try: @@ -61,7 +71,7 @@ def get_count_of_found_organizations_by_label( """ connector = WikidataQueryServiceConnector.get() item_label = item_label.replace('"', "") - query_string_new = ( + query_string = ( "SELECT (COUNT(distinct ?item) AS ?count) " "WHERE { " "SERVICE wikibase:mwapi { " @@ -78,7 +88,7 @@ def get_count_of_found_organizations_by_label( "ORDER BY ASC(?num) " ) - result = connector.get_data_by_query(query_string_new) + result = connector.get_data_by_query(query_string) return int(result[0]["count"]["value"]) @@ -101,7 +111,7 @@ def search_organizations_by_label( """ connector = WikidataQueryServiceConnector.get() item_label = item_label.replace('"', "") - query_string_new = ( + query_string = ( "SELECT distinct ?item ?itemLabel ?itemDescription " "WHERE { " "SERVICE wikibase:mwapi { " @@ -120,7 +130,7 @@ def search_organizations_by_label( f"LIMIT {limit} " ) - results = connector.get_data_by_query(query_string_new) + results = connector.get_data_by_query(query_string) for item in results: try: wd_item_id = item["item"]["value"].split("/")[-1] diff --git a/tests/wikidata/test_extract.py b/tests/wikidata/test_extract.py index 81b74bca..20c98fcb 100644 --- a/tests/wikidata/test_extract.py +++ b/tests/wikidata/test_extract.py @@ -63,7 +63,9 @@ def test_get_count_of_found_organizations_by_label() -> None: @pytest.mark.integration def test_search_organization_by_label_for_none() -> None: """Test if None is returned when multiple organizations are found.""" - search_result = search_organization_by_label(item_label="BMW") + search_result = search_organization_by_label( + item_label="Blah-test128%3h2 .1 12 bus" + ) assert search_result is None @@ -229,22 +231,10 @@ def mocked_item_details_response() -> Any: "mocked_session_wikidata_query_service", "mocked_session_wikidata_api" ) def test_search_organization_by_label_for_none_mocked(monkeypatch: MonkeyPatch) -> None: - expected_query_response = [ - { - "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q26678"}, - }, - { - "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q821937"}, - }, - ] - - def mocked_query_response() -> list[dict[str, dict[str, str]]]: - return expected_query_response - monkeypatch.setattr( WikidataQueryServiceConnector, "get_data_by_query", - lambda self, _: mocked_query_response(), + lambda self, _: [], ) def mocked_item_details_response() -> Any: