feature/mx-1381 rework database model (#25)

# Changes

- re-implemented queries as templated cql files
- updated graph connector for new queries
- improved isolation of neo4j dependency
- improved documentation and code-readability

# Removed

- trashed hydration module
robert-koch-institut · Apr 8, 2024 · e8ec12a · e8ec12a
1 parent ade4cda
commit e8ec12a
Show file tree

Hide file tree

Showing 51 changed files with 2,256 additions and 1,896 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,12 +3,12 @@ default_language_version:
   python: python3.11
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.3.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/psf/black
-    rev: 24.2.0
+    rev: 24.3.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,10 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changes
 
+- re-implemented queries as templated cql files
+- updated graph connector for new queries
+- improved isolation of neo4j dependency
+- improved documentation and code-readability
+
 ### Deprecated
 
 ### Removed
 
+- trashed hydration module
+
 ### Fixed
 
 ### Security

diff --git a/mex/backend/extracted/main.py b/mex/backend/extracted/main.py
@@ -2,11 +2,9 @@
 
 from fastapi import APIRouter, Query
 
-from mex.backend.extracted.models import ExtractedItemSearchResponse, ExtractedType
-from mex.backend.extracted.transform import (
-    transform_graph_results_to_extracted_item_search_response,
-)
+from mex.backend.extracted.models import ExtractedItemSearchResponse
 from mex.backend.graph.connector import GraphConnector
+from mex.backend.types import ExtractedType
 from mex.common.types import Identifier
 
 router = APIRouter()
@@ -15,20 +13,20 @@
 @router.get("/extracted-item", tags=["editor"])
 def search_extracted_items(
     q: Annotated[str, Query(max_length=100)] = "",
-    stableTargetId: Identifier | None = None,  # noqa: N803
-    entityType: Annotated[  # noqa: N803
+    stableTargetId: Identifier | None = None,
+    entityType: Annotated[
         Sequence[ExtractedType], Query(max_length=len(ExtractedType))
     ] = [],
     skip: Annotated[int, Query(ge=0, le=10e10)] = 0,
     limit: Annotated[int, Query(ge=1, le=100)] = 10,
 ) -> ExtractedItemSearchResponse:
     """Search for extracted items by query text or by type and id."""
     graph = GraphConnector.get()
-    query_results = graph.query_nodes(
+    result = graph.fetch_extracted_data(
         q,
         stableTargetId,
         [str(t.value) for t in entityType or ExtractedType],
         skip,
         limit,
     )
-    return transform_graph_results_to_extracted_item_search_response(query_results)
+    return ExtractedItemSearchResponse.model_validate(result.one())
diff --git a/mex/backend/extracted/models.py b/mex/backend/extracted/models.py
@@ -1,30 +1,12 @@
-from enum import Enum
-from typing import TYPE_CHECKING, Union
+from typing import Annotated
 
 from pydantic import Field
 
-from mex.backend.types import DynamicStrEnum
-from mex.common.models import (
-    EXTRACTED_MODEL_CLASSES_BY_NAME,
-    BaseExtractedData,
-    BaseModel,
-)
-
-
-class ExtractedType(Enum, metaclass=DynamicStrEnum):
-    """Enumeration of possible types for extracted items."""
-
-    __names__ = list(EXTRACTED_MODEL_CLASSES_BY_NAME)
-
-
-if TYPE_CHECKING:  # pragma: no cover
-    AnyExtractedModel = BaseExtractedData
-else:
-    AnyExtractedModel = Union[*EXTRACTED_MODEL_CLASSES_BY_NAME.values()]
+from mex.common.models import AnyExtractedModel, BaseModel
 
 
 class ExtractedItemSearchResponse(BaseModel):
     """Response body for the extracted item search endpoint."""
 
     total: int
-    items: list[AnyExtractedModel] = Field(discriminator="entityType")
+    items: Annotated[list[AnyExtractedModel], Field(discriminator="entityType")]
diff --git a/mex/backend/extracted/transform.py b/mex/backend/extracted/transform.py
diff --git a/mex/backend/fields.py b/mex/backend/fields.py
@@ -1,60 +1,164 @@
-from types import UnionType
-from typing import Annotated, Any, Generator, Union, get_args, get_origin
+from types import NoneType, UnionType
+from typing import (
+    Annotated,
+    Any,
+    Callable,
+    Generator,
+    Mapping,
+    Union,
+    get_args,
+    get_origin,
+)
 
+from pydantic import BaseModel
 from pydantic.fields import FieldInfo
 
+from mex.backend.types import LiteralStringType
 from mex.common.models import EXTRACTED_MODEL_CLASSES_BY_NAME
-from mex.common.types import Identifier, Text
+from mex.common.types import MERGED_IDENTIFIER_CLASSES, Link, Text
 
 
 def _get_inner_types(annotation: Any) -> Generator[type, None, None]:
-    """Yield all inner types from Unions, lists and annotations."""
+    """Yield all inner types from unions, lists and type annotations (except NoneType).
+
+    Args:
+        annotation: A valid python type annotation
+
+    Returns:
+        A generator for all (non-NoneType) types found in the annotation
+    """
     if get_origin(annotation) == Annotated:
         yield from _get_inner_types(get_args(annotation)[0])
     elif get_origin(annotation) in (Union, UnionType, list):
         for arg in get_args(annotation):
             yield from _get_inner_types(arg)
-    elif annotation is None:
-        yield type(None)
-    else:
+    elif annotation not in (None, NoneType):
         yield annotation
 
 
-def is_reference_field(field: FieldInfo) -> bool:
-    """Return whether the given field contains a stable target id."""
-    return any(
-        isinstance(t, type) and issubclass(t, Identifier)
-        for t in _get_inner_types(field.annotation)
-    )
+def _contains_only_types(field: FieldInfo, *types: type) -> bool:
+    """Return whether a `field` is annotated as one of the given `types`.
 
+    Unions, lists and type annotations are checked for their inner types and only the
+    non-`NoneType` types are considered for the type-check.
 
-def is_text_field(field: FieldInfo) -> bool:
-    """Return whether the given field is holding text objects."""
-    return any(
-        isinstance(t, type) and issubclass(t, Text)
-        for t in _get_inner_types(field.annotation)
-    )
+    Args:
+        field: A pydantic `FieldInfo` object
+        types: Types to look for in the field's annotation
 
+    Returns:
+        Whether all non-`NoneType` inner types of the field are among the given types
+    """
+    if inner_types := list(_get_inner_types(field.annotation)):
+        return all(inner_type in types for inner_type in inner_types)
+    return False
 
-REFERENCE_FIELDS_BY_CLASS_NAME = {
-    name: {
-        field_name
-        for field_name, field_info in cls.model_fields.items()
-        if field_name
-        not in (
-            "identifier",
-            "stableTargetId",
+
+def _group_fields_by_class_name(
+    model_classes_by_name: Mapping[str, type[BaseModel]],
+    predicate: Callable[[FieldInfo], bool],
+) -> dict[str, list[str]]:
+    """Group the field names by model class and filter them by the given predicate.
+
+    Args:
+        model_classes_by_name: Map from class names to model classes
+        predicate: Function to filter the fields of the classes by
+
+    Returns:
+        Dictionary mapping class names to a list of field names filtered by `predicate`
+    """
+    return {
+        name: sorted(
+            {
+                field_name
+                for field_name, field_info in cls.model_fields.items()
+                if predicate(field_info)
+            }
         )
-        and is_reference_field(field_info)
+        for name, cls in model_classes_by_name.items()
     }
+
+
+# fields that are immutable and can only be set once
+FROZEN_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME, lambda field_info: field_info.frozen is True
+)
+
+# static fields that are set once on class-level to a literal type
+LITERAL_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME,
+    lambda field_info: isinstance(field_info.annotation, LiteralStringType),
+)
+
+# fields typed as merged identifiers containing references to merged items
+REFERENCE_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME,
+    lambda field_info: _contains_only_types(field_info, *MERGED_IDENTIFIER_CLASSES),
+)
+
+# nested fields that contain `Text` objects
+TEXT_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME,
+    lambda field_info: _contains_only_types(field_info, Text),
+)
+
+# nested fields that contain `Link` objects
+LINK_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME,
+    lambda field_info: _contains_only_types(field_info, Link),
+)
+
+# fields annotated as `str` type
+STRING_FIELDS_BY_CLASS_NAME = _group_fields_by_class_name(
+    EXTRACTED_MODEL_CLASSES_BY_NAME,
+    lambda field_info: _contains_only_types(field_info, str),
+)
+
+# fields that should be indexed as searchable fields
+SEARCHABLE_FIELDS = sorted(
+    {
+        field_name
+        for field_names in STRING_FIELDS_BY_CLASS_NAME.values()
+        for field_name in field_names
+    }
+)
+
+# classes that have fields that should be searchable
+SEARCHABLE_CLASSES = sorted(
+    {name for name, field_names in STRING_FIELDS_BY_CLASS_NAME.items() if field_names}
+)
+
+# fields with changeable values that are not nested objects or merged item references
+MUTABLE_FIELDS_BY_CLASS_NAME = {
+    name: sorted(
+        {
+            field_name
+            for field_name in cls.model_fields
+            if field_name
+            not in (
+                *FROZEN_FIELDS_BY_CLASS_NAME[name],
+                *REFERENCE_FIELDS_BY_CLASS_NAME[name],
+                *TEXT_FIELDS_BY_CLASS_NAME[name],
+                *LINK_FIELDS_BY_CLASS_NAME[name],
+            )
+        }
+    )
     for name, cls in EXTRACTED_MODEL_CLASSES_BY_NAME.items()
 }
 
-TEXT_FIELDS_BY_CLASS_NAME = {
-    name: {
-        f"{field_name}_value"
-        for field_name, field_info in cls.model_fields.items()
-        if is_text_field(field_info)
-    }
+# fields with values that should be set once but are neither literal nor references
+FINAL_FIELDS_BY_CLASS_NAME = {
+    name: sorted(
+        {
+            field_name
+            for field_name in cls.model_fields
+            if field_name in FROZEN_FIELDS_BY_CLASS_NAME[name]
+            and field_name
+            not in (
+                *LITERAL_FIELDS_BY_CLASS_NAME[name],
+                *REFERENCE_FIELDS_BY_CLASS_NAME[name],
+            )
+        }
+    )
     for name, cls in EXTRACTED_MODEL_CLASSES_BY_NAME.items()
 }