feature/mx-1470 Ensure correct types (#47)
- ensure correct types for the helper extract functions in ldap and organigram
- fix a tiny issue where pandas would not parse str subclasses with resolution
- upgrade to pydantic 2.5
- remove the mypy-extensions dependency
- upgrade all other dependencies
- add a yaml-checker pre-commit hook (for the .github folder)
cutoffthetop authored Nov 22, 2023
1 parent 97a1ef1 commit 5ef55be
Showing 6 changed files with 315 additions and 359 deletions.
20 changes: 11 additions & 9 deletions .pre-commit-config.yaml
@@ -3,12 +3,12 @@ default_language_version:
   python: python3.11
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.4
+    rev: v0.1.6
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/psf/black
-    rev: 23.10.1
+    rev: 23.11.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -17,24 +17,26 @@ repos:
       - id: pretty-format-json
         name: json
         args: [--autofix]
+      - id: check-yaml
+        name: yaml
   - repo: https://github.com/python-poetry/poetry
-    rev: 1.6.1
+    rev: 1.7.0
     hooks:
       - id: poetry-check
         name: poetry
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.7.0
     hooks:
       - id: mypy
         name: mypy
         files: ^mex/
         additional_dependencies:
           - "backoff>=2.2.1,<3"
           - "click>=8.1.7,<9"
-          - "pandas-stubs>=2.0.3.230814"
-          - "pydantic>=2.1.1,<3"
-          - "pydantic-settings>=2.0.2,<3"
+          - "pandas-stubs>=2.1.1,<3"
+          - "pydantic-settings>=2.1.0,<3"
+          - "pydantic>=2.5.1,<3"
           - "pytest>=7.4.3,<8"
           - "types-pytz>=2023.3.1.1,<2024"
-          - "types-requests>=2.31.0.8,<3"
-          - "types-setuptools>=68.2.0.0,<69"
+          - "types-requests>=2.31.0.10,<3"
+          - "types-setuptools>=68.2.0.1,<69"
59 changes: 29 additions & 30 deletions mex/common/ldap/extract.py
@@ -1,29 +1,29 @@
 from collections import defaultdict
-from typing import Hashable, Iterable, cast
+from typing import Iterable
 
 from mex.common.identity import get_provider
 from mex.common.ldap.models.person import LDAPPerson, LDAPPersonWithQuery
 from mex.common.models import ExtractedPrimarySource
-from mex.common.types import Identifier
+from mex.common.types import PersonID
 
 
 def _get_merged_ids_by_attribute(
     attribute: str,
     persons: Iterable[LDAPPerson],
     primary_source: ExtractedPrimarySource,
-) -> dict[Hashable, list[Identifier]]:
-    """Return a mapping from a dynamic Person attribute to the merged IDs.
+) -> dict[str, list[PersonID]]:
+    """Return a mapping from a dynamic Person attribute to corresponding PersonIDs.
 
-    Merged IDs are looked up in the identity table and will be omitted
-    for any person that has not yet been transformed and indexed there.
+    PersonIDs are looked up in the identity provider and will be omitted
+    for any person that has not yet been assigned an `Identity` there.
 
     Args:
         attribute: The key to use for the resulting mapping
-        persons: Iterable of LDP persons
+        persons: Iterable of LDAP persons
         primary_source: Primary source for LDAP
 
     Returns:
-        Mapping from `LDAPPerson[attribute]` to corresponding `Identity.stableTargetId`
+        Mapping from a stringified `LDAPPerson[attribute]` to corresponding PersonIDs
     """
     if attribute not in LDAPPerson.model_fields:
         raise RuntimeError(f"Not a valid LDAPPerson field: {attribute}")
@@ -35,63 +35,62 @@ def _get_merged_ids_by_attribute(
             identifier_in_primary_source=str(person.objectGUID),
         ):
             merged_ids_by_attribute[str(getattr(person, attribute))].append(
-                Identifier(identities[0].stableTargetId)
+                PersonID(identities[0].stableTargetId)
             )
-    return cast(dict[Hashable, list[Identifier]], merged_ids_by_attribute)
+    return merged_ids_by_attribute
 
 
 def get_merged_ids_by_employee_ids(
     persons: Iterable[LDAPPerson], primary_source: ExtractedPrimarySource
-) -> dict[Hashable, list[Identifier]]:
-    """Return a mapping from person's employeeID to the merged IDs.
+) -> dict[str, list[PersonID]]:
+    """Return a mapping from a person's employeeID to their PersonIDs.
 
-    Merged IDs are looked up in the identity table and will be omitted
-    for any person that has not yet been transformed and indexed there.
+    PersonIDs are looked up in the identity provider and will be omitted
+    for any person that has not yet been assigned an `Identity` there.
 
     Args:
-        persons: Iterable of LDP persons
+        persons: Iterable of LDAP persons
         primary_source: Primary source for LDAP
 
     Returns:
-        Mapping from `LDAPPerson.employeeID` to corresponding `Identity.stableTargetId`
+        Mapping from `LDAPPerson.employeeID` to corresponding PersonIDs
     """
     return _get_merged_ids_by_attribute("employeeID", persons, primary_source)
 
 
 def get_merged_ids_by_email(
     persons: Iterable[LDAPPerson], primary_source: ExtractedPrimarySource
-) -> dict[Hashable, list[Identifier]]:
-    """Return a mapping from person's e-mail to the merged IDs.
+) -> dict[str, list[PersonID]]:
+    """Return a mapping from a person's e-mail to their PersonIDs.
 
-    Merged IDs are looked up in the identity table and will be omitted
-    for any person that has not yet been transformed and indexed there.
+    PersonIDs are looked up in the identity provider and will be omitted
+    for any person that has not yet been assigned an `Identity` there.
 
     Args:
         persons: Iterable of LDP persons
         primary_source: Primary source for LDAP
 
     Returns:
-        Mapping from `LDAPPerson.mail` to corresponding `Identity.stableTargetId`
+        Mapping from `LDAPPerson.mail` to corresponding PersonIDs
     """
     return _get_merged_ids_by_attribute("mail", persons, primary_source)
 
 
 def get_merged_ids_by_query_string(
     persons_with_query: Iterable[LDAPPersonWithQuery],
     primary_source: ExtractedPrimarySource,
-) -> dict[Hashable, list[Identifier]]:
-    """Return a mapping from an author query string to the resolved merged IDs.
+) -> dict[str, list[PersonID]]:
+    """Return a mapping from a person query string to their PersonIDs.
 
-    Merged IDs are looked up in the identity table and will be omitted
-    for any person that has not yet been transformed and indexed there.
+    PersonIDs are looked up in the identity provider and will be omitted
+    for any person that has not yet been assigned an `Identity` there.
 
     Args:
         persons_with_query: Iterable of LDP persons with query
         primary_source: Primary source for LDAP
 
     Returns:
-        Mapping from `LDAPPersonWithQuery.query` to corresponding
-        `Identity.stableTargetId`
+        Mapping from `LDAPPersonWithQuery.query` to corresponding PersonIDs
     """
     merged_ids_by_attribute = defaultdict(list)
     provider = get_provider()
@@ -100,7 +99,7 @@ def get_merged_ids_by_query_string(
             had_primary_source=primary_source.stableTargetId,
             identifier_in_primary_source=str(person_with_query.person.objectGUID),
         ):
-            merged_ids_by_attribute[person_with_query.query].append(
-                Identifier(identities[0].stableTargetId)
+            merged_ids_by_attribute[str(person_with_query.query)].append(
+                PersonID(identities[0].stableTargetId)
             )
-    return cast(dict[Hashable, list[Identifier]], merged_ids_by_attribute)
+    return merged_ids_by_attribute
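
For orientation, a minimal usage sketch of the retyped helpers. The variables `persons` and `primary_source` are illustrative stand-ins for the usual LDAP extraction and primary-source lookup steps and are not part of this diff:

from mex.common.ldap.extract import get_merged_ids_by_employee_ids
from mex.common.types import PersonID

# With the new annotations, callers get a concrete mapping type instead of
# dict[Hashable, list[Identifier]], so no cast() is needed on their side.
merged_ids: dict[str, list[PersonID]] = get_merged_ids_by_employee_ids(
    persons,  # Iterable[LDAPPerson] from a prior extraction step (assumed)
    primary_source,  # ExtractedPrimarySource for LDAP (assumed)
)
for employee_id, person_ids in merged_ids.items():
    ...  # every value is already a PersonID
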
4 changes: 2 additions & 2 deletions mex/common/organigram/extract.py
@@ -59,7 +59,7 @@ def get_unit_merged_ids_by_synonyms(
         Mapping from unit synonyms to stableTargetIds
     """
     return {
-        synonym: extracted_unit.stableTargetId
+        synonym: OrganizationalUnitID(extracted_unit.stableTargetId)
         for extracted_unit in extracted_units
         for synonym in _get_synonyms(extracted_unit)
     }
@@ -79,7 +79,7 @@ def get_unit_merged_ids_by_emails(
         Mapping from lowercased `email` to stableTargetIds
     """
     return {
-        email.lower(): extracted_unit.stableTargetId
+        email.lower(): OrganizationalUnitID(extracted_unit.stableTargetId)
         for extracted_unit in extracted_units
         for email in extracted_unit.email
     }
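
A similar sketch for the organigram helpers. The `OrganizationalUnitID` import path, the `extracted_units` variable, and the annotated return type are inferred from this hunk rather than shown in it:

from mex.common.organigram.extract import get_unit_merged_ids_by_synonyms
from mex.common.types import OrganizationalUnitID  # assumed import path

# Each synonym now maps to an explicitly wrapped unit identifier:
unit_ids_by_synonym: dict[str, OrganizationalUnitID] = get_unit_merged_ids_by_synonyms(
    extracted_units  # iterable of extracted organizational units (assumed)
)
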
2 changes: 1 addition & 1 deletion mex/common/types/timestamp.py
@@ -203,7 +203,7 @@ def _parse_timestamp(value: "Timestamp") -> tuple[datetime, TimestampPrecision]:
     def _parse_string(value: str) -> tuple[datetime, TimestampPrecision]:
         """Parse a string containing a timestamp using pandas' tslibs."""
         parsed, precision = parsing.parse_datetime_string_with_reso(  # type: ignore[attr-defined]
-            value, freq=None, dayfirst=False, yearfirst=True
+            str(value), freq=None, dayfirst=False, yearfirst=True
         )
         if parsed.tzinfo is None:
             parsed = parsed.replace(tzinfo=CET)
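
The `str(value)` coercion addresses the "tiny issue" from the commit message: pandas' low-level parser can reject instances of str subclasses. A minimal sketch of the failure mode, using a hypothetical subclass; the pandas call mirrors the private API used in the hunk above:

from pandas._libs.tslibs import parsing  # private pandas API, as used above

class TimestampString(str):
    """Hypothetical str subclass standing in for the value handed to _parse_string."""

raw = TimestampString("2014-08-24")
# Coercing to an exact str before parsing avoids the subclass problem:
parsed, precision = parsing.parse_datetime_string_with_reso(
    str(raw), freq=None, dayfirst=False, yearfirst=True
)
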

0 comments on commit 5ef55be
