Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/mx-1664-use-wikidata-helper #279

Merged
merged 31 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a2035a5
implement loading and ID-returning part of the convenience/helper fun…
esinsj Oct 10, 2024
9c33a22
implement tests for helpers
esinsj Oct 11, 2024
02bf0ba
repair tests by adding integration mark
esinsj Oct 16, 2024
417ec04
reorganize folder structure
esinsj Oct 21, 2024
47c6f6f
corryt typo 'sySnopse' in biospecimen
esinsj Nov 8, 2024
1221a41
Merge remote-tracking branch 'origin/main' into feature/mx-1663_conve…
esinsj Nov 12, 2024
c6802b1
add init-file to test
esinsj Nov 12, 2024
a785926
add caching
esinsj Nov 12, 2024
94c8a55
Merge branch 'feature/mx-1663_convenience_function_2' into feature/mx…
esinsj Nov 12, 2024
655dbf3
use wikidata helper in biospecimen
esinsj Nov 14, 2024
780a71d
use wikidata helper in blueant
esinsj Nov 14, 2024
ca0a434
use wikidata helper in datscha-web
esinsj Nov 14, 2024
dd75640
use wikidata helper in ff-projects
esinsj Nov 14, 2024
2c704cc
Merge remote-tracking branch 'origin/main' into feature/mx-1664-use-w…
esinsj Nov 18, 2024
e551545
use wikidata helper in international-projects
esinsj Nov 18, 2024
65ad513
slighty improve extraction and transformation of ExtractedOrganisatio…
esinsj Nov 19, 2024
02561c4
use wikidata helper in grippeweb
esinsj Nov 20, 2024
096925f
WIP: improve wikidata and primary source helpers (tests need rework)
esinsj Nov 21, 2024
9d5cdad
use wikidata helper in ODK
esinsj Nov 22, 2024
cad75cd
Merge remote-tracking branch 'origin/main' into feature/mx-1664-use-w…
esinsj Nov 22, 2024
60c24e7
use wikidata helper in synopse
esinsj Nov 25, 2024
f9a185a
Merge remote-tracking branch 'origin/main' into feature/mx-1664-use-w…
esinsj Nov 25, 2024
437ef70
use wikidata helper in voxco
esinsj Nov 25, 2024
f119d7a
improve testing for primary source helper
esinsj Nov 26, 2024
09e2aa9
clean up old wikidata extractor files.
esinsj Nov 26, 2024
397f610
Merge remote-tracking branch 'origin/main' into feature/mx-1664-use-w…
esinsj Dec 4, 2024
5f704d6
update to newest mex-common version
esinsj Dec 4, 2024
e6a76f3
improve wikidatat helper test with cache clearing
esinsj Dec 4, 2024
7a53ebd
Merge remote-tracking branch 'origin/main' into feature/mx-1664-use-w…
esinsj Dec 10, 2024
b9d6c72
update changelog
esinsj Dec 10, 2024
752a52b
remove double wikidata test fixtures
esinsj Dec 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

### Changes
- extractors now use wikidata helper function

### Deprecated

Expand Down
18 changes: 12 additions & 6 deletions mex/extractors/biospecimen/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
from mex.common.ldap.connector import LDAPConnector
from mex.common.ldap.models.person import LDAPPerson
from mex.common.logging import watch
from mex.common.wikidata.extract import search_organization_by_label
from mex.common.wikidata.models.organization import WikidataOrganization
from mex.common.types import MergedOrganizationIdentifier
from mex.extractors.biospecimen.models.source import BiospecimenResource
from mex.extractors.settings import Settings
from mex.extractors.wikidata.helpers import (
get_wikidata_extracted_organization_id_by_name,
)


@watch
Expand Down Expand Up @@ -42,20 +44,24 @@ def extract_biospecimen_contacts_by_email(

def extract_biospecimen_organizations(
    biospecimen_resources: list[BiospecimenResource],
) -> dict[str, MergedOrganizationIdentifier]:
    """Search and extract organization identifiers from wikidata.

    Args:
        biospecimen_resources: Iterable of biospecimen resources

    Returns:
        dict with MergedOrganizationIdentifier by externe partner
    """
    org_id_by_partner: dict[str, MergedOrganizationIdentifier] = {}
    for resource in biospecimen_resources:
        partner = resource.externe_partner
        if not partner:
            continue
        # keep only partners that could be resolved to a wikidata organization
        if org_id := get_wikidata_extracted_organization_id_by_name(partner):
            org_id_by_partner[partner] = org_id
    return org_id_by_partner


Expand Down
12 changes: 2 additions & 10 deletions mex/extractors/biospecimen/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@
from mex.extractors.pipeline import asset, run_job_in_process
from mex.extractors.settings import Settings
from mex.extractors.sinks import load
from mex.extractors.wikidata.extract import (
get_merged_organization_id_by_query_with_transform_and_load,
)


@asset(group_name="biospecimen", deps=["extracted_primary_source_mex"])
Expand Down Expand Up @@ -72,20 +69,15 @@ def extracted_biospecimen_resources(
unit_stable_target_ids_by_synonym: dict[str, MergedOrganizationalUnitIdentifier],
extracted_organization_rki: ExtractedOrganization,
extracted_synopse_activities: list[ExtractedActivity],
extracted_primary_source_wikidata: ExtractedPrimarySource,
) -> list[ExtractedResource]:
"""Transform biospecimen resources to extracted resources and load them to the sinks.""" # noqa: E501
settings = Settings.get()
resource_mapping = transform_mapping_data_to_model(
extract_mapping_data(settings.biospecimen.mapping_path / "resource.yaml"),
ExtractedResource,
)
biospecimen_organizations = extract_biospecimen_organizations(biospecimen_resources)
extracted_organizations = (
get_merged_organization_id_by_query_with_transform_and_load(
biospecimen_organizations, extracted_primary_source_wikidata
)
)
extracted_organizations = extract_biospecimen_organizations(biospecimen_resources)

mex_sources = list(
transform_biospecimen_resource_to_mex_resource(
biospecimen_resources,
Expand Down
4 changes: 2 additions & 2 deletions mex/extractors/biospecimen/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def transform_biospecimen_resource_to_mex_resource(
person_stable_target_id_by_email = {
str(p.email[0]): Identifier(p.stableTargetId) for p in mex_persons
}
sysnopse_stable_target_id_by_studien_id = {
synopse_stable_target_id_by_studien_id = {
activity.identifierInPrimarySource: activity.stableTargetId
cutoffthetop marked this conversation as resolved.
Show resolved Hide resolved
for activity in extracted_synopse_activities
}
Expand Down Expand Up @@ -96,7 +96,7 @@ def transform_biospecimen_resource_to_mex_resource(
contact.append(k)
elif k := unit_stable_target_ids_by_synonym.get(kontakt):
contact.append(k)
was_generated_by = sysnopse_stable_target_id_by_studien_id.get(
was_generated_by = synopse_stable_target_id_by_studien_id.get(
resource.studienbezug[0], None
)
if resource.weiterfuehrende_dokumentation_url_oder_dateipfad:
Expand Down
14 changes: 8 additions & 6 deletions mex/extractors/blueant/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
from mex.common.ldap.connector import LDAPConnector
from mex.common.ldap.models.person import LDAPPerson
from mex.common.logging import watch
from mex.common.wikidata.extract import search_organization_by_label
from mex.common.wikidata.models.organization import WikidataOrganization
from mex.common.types import MergedOrganizationIdentifier
from mex.extractors.blueant.connector import BlueAntConnector
from mex.extractors.blueant.models.source import BlueAntSource
from mex.extractors.settings import Settings
from mex.extractors.wikidata.helpers import (
get_wikidata_extracted_organization_id_by_name,
)


@watch
Expand Down Expand Up @@ -96,19 +98,19 @@ def remove_prefixes_from_name(name: str) -> str:

def extract_blueant_organizations(
    blueant_sources: list[BlueAntSource],
) -> dict[str, MergedOrganizationIdentifier]:
    """Search and extract organization identifiers from wikidata.

    Args:
        blueant_sources: Iterable of blueant sources

    Returns:
        Dict with organization label and MergedOrganizationIdentifier
    """
    org_id_by_name: dict[str, MergedOrganizationIdentifier] = {}
    for source in blueant_sources:
        for name in source.client_names:
            # RKI itself is not an external client and is not looked up
            if name in ("Robert Koch-Institut", "RKI"):
                continue
            if org_id := get_wikidata_extracted_organization_id_by_name(name):
                org_id_by_name[name] = org_id
    return org_id_by_name
10 changes: 1 addition & 9 deletions mex/extractors/blueant/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@
from mex.extractors.pipeline import asset, run_job_in_process
from mex.extractors.settings import Settings
from mex.extractors.sinks import load
from mex.extractors.wikidata.extract import (
get_merged_organization_id_by_query_with_transform_and_load,
)


@asset(group_name="blueant", deps=["extracted_primary_source_mex"])
Expand Down Expand Up @@ -81,15 +78,10 @@ def blueant_project_leaders_by_employee_id(

@asset(group_name="blueant")
def blueant_organization_ids_by_query_string(
    blueant_sources: list[BlueAntSource],
) -> dict[str, MergedOrganizationIdentifier]:
    """Extract organizations for blueant from wikidata and group them by query."""
    # the wikidata helper resolves, transforms and loads internally
    organization_ids_by_query = extract_blueant_organizations(blueant_sources)
    return organization_ids_by_query


@asset(group_name="blueant")
Expand Down
14 changes: 9 additions & 5 deletions mex/extractors/datscha_web/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from mex.common.ldap.models.person import LDAPPersonWithQuery
from mex.common.ldap.transform import analyse_person_string
from mex.common.logging import watch
from mex.common.wikidata.extract import search_organization_by_label
from mex.common.wikidata.models.organization import WikidataOrganization
from mex.common.types import MergedOrganizationIdentifier
from mex.extractors.datscha_web.connector import DatschaWebConnector
from mex.extractors.datscha_web.models.item import DatschaWebItem
from mex.extractors.wikidata.helpers import (
get_wikidata_extracted_organization_id_by_name,
)


@watch
Expand Down Expand Up @@ -52,7 +54,7 @@ def extract_datscha_web_source_contacts(

def extract_datscha_web_organizations(
    datscha_web_items: Iterable[DatschaWebItem],
) -> dict[str, MergedOrganizationIdentifier]:
    """Search and extract organization identifiers from wikidata.

    Args:
        datscha_web_items: Iterable of datscha-web items

    Returns:
        Dict with keys DatschaWebItem.Auftragsverarbeiter,
        DatschaWebItem.Empfaenger_der_Daten_im_Drittstaat, and
        DatschaWebItem.Empfaenger_der_verarbeiteten_uebermittelten_oder_offengelegten_Daten,
        and values: MergedOrganizationIdentifier
    """
    # the source data encodes missing partners as the literal string "None"
    return {
        partner: org_id
        for item in datscha_web_items
        for partner in item.get_partners()
        if partner
        and partner != "None"
        and (org_id := get_wikidata_extracted_organization_id_by_name(partner))
    }
12 changes: 1 addition & 11 deletions mex/extractors/datscha_web/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@
from mex.extractors.pipeline import asset, run_job_in_process
from mex.extractors.settings import Settings
from mex.extractors.sinks import load
from mex.extractors.wikidata.extract import (
get_merged_organization_id_by_query_with_transform_and_load,
)


@asset(group_name="datscha_web", deps=["extracted_primary_source_mex"])
Expand Down Expand Up @@ -82,16 +79,9 @@ def datscha_web_person_ids_by_query_string(
@asset(group_name="datscha_web")
def datscha_web_organization_ids_by_query_string(
    extracted_datscha_web_items: list[DatschaWebItem],
) -> dict[str, MergedOrganizationIdentifier]:
    """Extract organizations for Datscha Web from wikidata and group them by query."""
    # the wikidata helper resolves, transforms and loads internally
    organization_ids = extract_datscha_web_organizations(extracted_datscha_web_items)
    return organization_ids


@asset(group_name="datscha_web")
Expand Down
18 changes: 12 additions & 6 deletions mex/extractors/ff_projects/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
from mex.common.ldap.transform import analyse_person_string
from mex.common.logging import watch
from mex.common.types import (
MergedOrganizationIdentifier,
TemporalEntity,
TemporalEntityPrecision,
)
from mex.common.wikidata.extract import search_organization_by_label
from mex.common.wikidata.models.organization import WikidataOrganization
from mex.extractors.ff_projects.models.source import FFProjectsSource
from mex.extractors.settings import Settings
from mex.extractors.wikidata.helpers import (
get_wikidata_extracted_organization_id_by_name,
)


@watch
Expand Down Expand Up @@ -198,22 +200,26 @@ def extract_ff_project_authors(

def extract_ff_projects_organizations(
    ff_projects_sources: Iterable[FFProjectsSource],
) -> dict[str, MergedOrganizationIdentifier]:
    """Search and extract organization identifiers from wikidata.

    Args:
        ff_projects_sources: Iterable of ff-project sources

    Returns:
        Dict with organization label and MergedOrganizationIdentifier
    """
    org_id_by_label: dict[str, MergedOrganizationIdentifier] = {}
    for source in ff_projects_sources:
        funder = source.zuwendungs_oder_auftraggeber
        # "Sonderforschung" is a placeholder value, not a real organization
        if not funder or funder == "Sonderforschung":
            continue
        # a single cell may list several organizations separated by "/"
        for label in funder.split("/"):
            if org_id := get_wikidata_extracted_organization_id_by_name(label):
                org_id_by_label[label] = org_id
    return org_id_by_label
12 changes: 1 addition & 11 deletions mex/extractors/ff_projects/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@
from mex.extractors.pipeline import asset, run_job_in_process
from mex.extractors.settings import Settings
from mex.extractors.sinks import load
from mex.extractors.wikidata.extract import (
get_merged_organization_id_by_query_with_transform_and_load,
)


@asset(group_name="ff_projects", deps=["extracted_primary_source_mex"])
Expand Down Expand Up @@ -86,17 +83,10 @@ def ff_projects_person_ids_by_query_string(

@asset(group_name="ff_projects")
def ff_projects_organization_ids_by_query_string(
    ff_projects_sources: list[FFProjectsSource],
) -> dict[str, MergedOrganizationIdentifier]:
    """Extract organizations for FF Projects from wikidata and group them by query."""
    # the wikidata helper resolves, transforms and loads internally
    organization_ids = extract_ff_projects_organizations(ff_projects_sources)
    return organization_ids


@asset(group_name="ff_projects")
Expand Down
14 changes: 8 additions & 6 deletions mex/extractors/grippeweb/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
from mex.common.ldap.connector import LDAPConnector
from mex.common.ldap.models.actor import LDAPActor
from mex.common.ldap.models.person import LDAPPerson
from mex.common.wikidata.extract import search_organization_by_label
from mex.common.wikidata.models.organization import WikidataOrganization
from mex.common.types import MergedOrganizationIdentifier
from mex.extractors.grippeweb.connector import QUERY_BY_TABLE_NAME, GrippewebConnector
from mex.extractors.mapping.types import AnyMappingModel
from mex.extractors.wikidata.helpers import (
get_wikidata_extracted_organization_id_by_name,
)


def extract_columns_by_table_and_column_name() -> dict[str, dict[str, list[Any]]]:
Expand Down Expand Up @@ -71,23 +73,23 @@ def extract_ldap_persons(

def extract_grippeweb_organizations(
    grippeweb_resource_mappings: list[AnyMappingModel],
) -> dict[str, MergedOrganizationIdentifier]:
    """Search and extract grippeweb organization identifiers from wikidata.

    Args:
        grippeweb_resource_mappings: grippeweb resource mapping models

    Returns:
        Dict with keys: mapping default values
        and values: MergedOrganizationIdentifier
    """
    organization_by_name: dict[str, MergedOrganizationIdentifier] = {}
    for resource in grippeweb_resource_mappings:
        names: list[str] = []
        # externalPartner is optional and guarded with a walrus check
        if external_partner_rules := resource.externalPartner:
            names.append(external_partner_rules[0].mappingRules[0].forValues[0])
        # NOTE(review): publisher is accessed unguarded — assumes every mapping
        # defines one publisher rule; confirm against the mapping schema
        names.append(resource.publisher[0].mappingRules[0].forValues[0])
        for name in names:
            if org_id := get_wikidata_extracted_organization_id_by_name(name):
                organization_by_name[name] = org_id
    return organization_by_name
10 changes: 1 addition & 9 deletions mex/extractors/grippeweb/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@
from mex.extractors.settings import Settings
from mex.extractors.sinks import load
from mex.extractors.sumo.transform import get_contact_merged_ids_by_emails
from mex.extractors.wikidata.extract import (
get_merged_organization_id_by_query_with_transform_and_load,
)


@asset(group_name="grippeweb", deps=["extracted_primary_source_mex"])
Expand Down Expand Up @@ -148,17 +145,12 @@ def extracted_mex_persons_grippeweb(
@asset(group_name="grippeweb")
def grippeweb_organization_ids_by_query_string(
    grippeweb_resource_mappings: list[dict[str, Any]],
) -> dict[str, MergedOrganizationIdentifier]:
    """Extract organizations for grippeweb from wikidata and group them by query."""
    # convert raw mapping dicts into models before handing them to the extractor
    resource_models = transform_mapping_data_to_models(
        grippeweb_resource_mappings, ExtractedResource
    )
    return extract_grippeweb_organizations(resource_models)


@asset(group_name="grippeweb")
def extracted_access_platform_grippeweb(
Expand Down
Loading
Loading