Skip to content

Commit

Permalink
feature/mx-1658 fix wikidata 403 bug (#240)
Browse files Browse the repository at this point in the history
### Added

- HTTP connector backoff with up to 10 tries when the server responds with 403
- `rki/mex` user agent is sent with query requests via wikidata
connector

### Changes

- update wikidata search organization request query with an optional
language parameter.
  The wikidata query search can be refined by specifying the language;
  EN is the default language.
  • Loading branch information
mr-kamran-ali authored Jul 29, 2024
1 parent e3c7ba3 commit 3668519
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 33 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- HTTP connector backoff with up to 10 tries when the server responds with 403
- `rki/mex` user agent is sent with query requests via wikidata connector

### Changes

- update wikidata search organization request query with an optional language parameter.
The wikidata query search can be refined by specifying the language;
EN is the default language.

### Deprecated

### Removed
Expand All @@ -36,8 +43,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.31.0] - 2024-07-17

### Removed

- BREAKING: ability to store different settings instances at the same time. Dependent
repositories now must bundle all settings in a single class.
repositories now must bundle all settings in a single class.

## [0.30.0] - 2024-07-16

Expand Down
8 changes: 7 additions & 1 deletion mex/common/connector/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def request(
else:
url = self.url
kwargs.setdefault("timeout", self.TIMEOUT)
kwargs.setdefault("headers", {})
if not kwargs.get("headers"):
kwargs.setdefault("headers", {})
kwargs["headers"].setdefault("Accept", "application/json")

if payload:
Expand Down Expand Up @@ -106,6 +107,11 @@ def request(
lambda response: cast(Response, response).status_code == 429,
max_tries=10,
)
@backoff.on_predicate(
backoff.fibo,
lambda response: cast(Response, response).status_code == 403,
max_tries=10,
)
@backoff.on_exception(backoff.fibo, RequestException, max_tries=6)
def _send_request(
self, method: str, url: str, params: dict[str, str] | None, **kwargs: Any
Expand Down
6 changes: 6 additions & 0 deletions mex/common/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ def get(cls) -> Self:
"organization ID",
validation_alias="MEX_WIKI_QUERY_SERVICE_URL",
)
mex_web_user_agent: str = Field(
"rki/mex",
description="a user agent is sent in the header of some requests to external "
"services ",
validation_alias="MEX_WEB_USER_AGENT",
)

def text(self) -> str:
"""Dump the current settings into a readable table."""
Expand Down
15 changes: 12 additions & 3 deletions mex/common/wikidata/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@ def get_data_by_query(self, query: str) -> list[dict[str, dict[str, str]]]:
Returns:
list: list of all items found
"""
settings = BaseSettings.get()
params = {"format": "json", "query": query}
headers = {
"User-Agent": f"{settings.mex_web_user_agent}",
"Api-User-Agent": f"{settings.mex_web_user_agent}",
}

results = self.request("GET", params=params)
results = self.request("GET", params=params, headers=headers)

return results["results"]["bindings"] # type: ignore

Expand Down Expand Up @@ -59,6 +64,7 @@ def get_wikidata_item_details_by_id(self, item_id: str) -> dict[str, str]:
Returns:
dict[str, Any]: details of the found item.
"""
settings = BaseSettings.get()
params = {
"action": "wbgetentities",
"format": "json",
Expand All @@ -77,6 +83,9 @@ def get_wikidata_item_details_by_id(self, item_id: str) -> dict[str, str]:
),
"formatversion": "2",
}

results = self.request("GET", params=params)
headers = {
"User-Agent": f"{settings.mex_web_user_agent}",
"Api-User-Agent": f"{settings.mex_web_user_agent}",
}
results = self.request("GET", params=params, headers=headers)
return results["entities"][item_id] # type: ignore
38 changes: 24 additions & 14 deletions mex/common/wikidata/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,41 @@

def search_organization_by_label(
item_label: str,
lang: TextLanguage = TextLanguage.EN,
) -> WikidataOrganization | None:
"""Search for an item in wikidata. Only organizations are fetched.
"""Search for an item in wikidata. Only organizations are searched.
Args:
item_label: Item title or label to be searched
lang: language in which the item should be searched. Default: TextLanguage.EN
Returns:
WikidataOrganization if only one organization is found
None if no or multiple organizations are found
WikidataOrganization if organization is found
None if no organization is found
"""
connector = WikidataQueryServiceConnector.get()
item_label = item_label.replace('"', "")
query_string = (
"SELECT distinct ?item ?itemLabel ?itemDescription "
"WHERE{"
"?item (wdt:P31/wdt:P8225*/wdt:P279*) wd:Q43229."
f'?item ?label "{item_label}"@en.'
"?article schema:about ?item ."
'SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }'
"}"
"WHERE { "
"SERVICE wikibase:mwapi { "
'bd:serviceParam wikibase:api "EntitySearch" . '
'bd:serviceParam wikibase:endpoint "www.wikidata.org" . '
f'bd:serviceParam mwapi:search "{item_label}" . '
f'bd:serviceParam mwapi:language "{lang}" . '
"?item wikibase:apiOutputItem mwapi:item . "
"?num wikibase:apiOrdinal true . "
"} "
"?item (wdt:P31/wdt:P8225*/wdt:P279*) wd:Q43229. "
'SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,de". } ' # noqa: E501
"} "
"ORDER BY ASC(?num) "
"LIMIT 1 "
)

results = connector.get_data_by_query(query_string)

if len(results) != 1:
if not results:
return None

try:
Expand All @@ -61,7 +71,7 @@ def get_count_of_found_organizations_by_label(
"""
connector = WikidataQueryServiceConnector.get()
item_label = item_label.replace('"', "")
query_string_new = (
query_string = (
"SELECT (COUNT(distinct ?item) AS ?count) "
"WHERE { "
"SERVICE wikibase:mwapi { "
Expand All @@ -78,7 +88,7 @@ def get_count_of_found_organizations_by_label(
"ORDER BY ASC(?num) "
)

result = connector.get_data_by_query(query_string_new)
result = connector.get_data_by_query(query_string)
return int(result[0]["count"]["value"])


Expand All @@ -101,7 +111,7 @@ def search_organizations_by_label(
"""
connector = WikidataQueryServiceConnector.get()
item_label = item_label.replace('"', "")
query_string_new = (
query_string = (
"SELECT distinct ?item ?itemLabel ?itemDescription "
"WHERE { "
"SERVICE wikibase:mwapi { "
Expand All @@ -120,7 +130,7 @@ def search_organizations_by_label(
f"LIMIT {limit} "
)

results = connector.get_data_by_query(query_string_new)
results = connector.get_data_by_query(query_string)
for item in results:
try:
wd_item_id = item["item"]["value"].split("/")[-1]
Expand Down
18 changes: 4 additions & 14 deletions tests/wikidata/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ def test_get_count_of_found_organizations_by_label() -> None:
@pytest.mark.integration
def test_search_organization_by_label_for_none() -> None:
"""Test if None is returned when no organization is found."""
search_result = search_organization_by_label(item_label="BMW")
search_result = search_organization_by_label(
item_label="Blah-test128%3h2 .1 12 bus"
)
assert search_result is None


Expand Down Expand Up @@ -229,22 +231,10 @@ def mocked_item_details_response() -> Any:
"mocked_session_wikidata_query_service", "mocked_session_wikidata_api"
)
def test_search_organization_by_label_for_none_mocked(monkeypatch: MonkeyPatch) -> None:
expected_query_response = [
{
"item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q26678"},
},
{
"item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q821937"},
},
]

def mocked_query_response() -> list[dict[str, dict[str, str]]]:
return expected_query_response

monkeypatch.setattr(
WikidataQueryServiceConnector,
"get_data_by_query",
lambda self, _: mocked_query_response(),
lambda self, _: [],
)

def mocked_item_details_response() -> Any:
Expand Down

0 comments on commit 3668519

Please sign in to comment.