Skip to content

Commit

Permalink
Clean up and rename MExModel
Browse files Browse the repository at this point in the history
  • Loading branch information
cutoffthetop committed Feb 21, 2024
1 parent c785f4c commit 452c5c6
Show file tree
Hide file tree
Showing 28 changed files with 627 additions and 516 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ linter:
pre-commit run --all-files; \

pytest:
# run the pytest test suite with unit and integration tests in random order
# run the pytest test suite with unit tests only (integration tests are deselected)
@ echo running unit tests; \
poetry run pytest --random-order-bucket=global -m "not integration"; \
poetry run pytest -m "not integration"; \

wheel:
# build the python package
Expand Down
5 changes: 2 additions & 3 deletions mex.bat
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ echo linting all files
pre-commit run --all-files
if %errorlevel% neq 0 exit /b %errorlevel%

@REM run the pytest test suite with unit and integration tests in random order
@REM distributed across all available CPU cores
@REM run pytest unit and integration tests distributed across available cores
echo running all tests
poetry run pytest --random-order-bucket=global --numprocesses=auto --dist=worksteal
poetry run pytest --numprocesses=auto --dist=worksteal
exit /b %errorlevel%


Expand Down
3 changes: 1 addition & 2 deletions mex/common/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
MergedAccessPlatform,
)
from mex.common.models.activity import BaseActivity, ExtractedActivity, MergedActivity
from mex.common.models.base import BaseModel, MExModel
from mex.common.models.base import BaseModel
from mex.common.models.contact_point import (
BaseContactPoint,
ExtractedContactPoint,
Expand Down Expand Up @@ -97,7 +97,6 @@
"MEX_PRIMARY_SOURCE_IDENTIFIER_IN_PRIMARY_SOURCE",
"MEX_PRIMARY_SOURCE_IDENTIFIER",
"MEX_PRIMARY_SOURCE_STABLE_TARGET_ID",
"MExModel",
)

AnyBaseModel = Union[
Expand Down
12 changes: 6 additions & 6 deletions mex/common/models/access_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,17 @@ class BaseAccessPlatform(BaseModel):
class ExtractedAccessPlatform(BaseAccessPlatform, ExtractedData):
"""An automatically extracted metadata set describing an access platform."""

entityType: Literal["ExtractedAccessPlatform"] = Field(
"ExtractedAccessPlatform", alias="$type", frozen=True
)
entityType: Annotated[
Literal["ExtractedAccessPlatform"], Field(alias="$type", frozen=True)
] = "ExtractedAccessPlatform"
identifier: Annotated[ExtractedAccessPlatformIdentifier, Field(frozen=True)]
stableTargetId: MergedAccessPlatformIdentifier


class MergedAccessPlatform(BaseAccessPlatform, MergedItem):
"""The result of merging all extracted data and rules for an access platform."""

entityType: Literal["MergedAccessPlatform"] = Field(
"MergedAccessPlatform", alias="$type", frozen=True
)
entityType: Annotated[
Literal["MergedAccessPlatform"], Field(alias="$type", frozen=True)
] = "MergedAccessPlatform"
identifier: Annotated[MergedAccessPlatformIdentifier, Field(frozen=True)]
12 changes: 6 additions & 6 deletions mex/common/models/activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,17 @@ class BaseActivity(BaseModel):
class ExtractedActivity(BaseActivity, ExtractedData):
"""An automatically extracted metadata set describing an activity."""

entityType: Literal["ExtractedActivity"] = Field(
"ExtractedActivity", alias="$type", frozen=True
)
entityType: Annotated[
Literal["ExtractedActivity"], Field(alias="$type", frozen=True)
] = "ExtractedActivity"
identifier: Annotated[ExtractedActivityIdentifier, Field(frozen=True)]
stableTargetId: MergedActivityIdentifier


class MergedActivity(BaseActivity, MergedItem):
"""The result of merging all extracted data and rules for an activity."""

entityType: Literal["MergedActivity"] = Field(
"MergedActivity", alias="$type", frozen=True
)
entityType: Annotated[
Literal["MergedActivity"], Field(alias="$type", frozen=True)
] = "MergedActivity"
identifier: Annotated[MergedActivityIdentifier, Field(frozen=True)]
55 changes: 4 additions & 51 deletions mex/common/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,29 @@
from collections.abc import MutableMapping
from functools import cache
from typing import (
TYPE_CHECKING,
Any,
TypeVar,
Union,
get_args,
get_origin,
)

from pydantic import (
BaseModel as PydanticBaseModel,
)
from pydantic import BaseModel as PydanticBaseModel
from pydantic import (
ConfigDict,
TypeAdapter,
ValidationError,
model_validator,
)
from pydantic.fields import FieldInfo
from pydantic.json_schema import DEFAULT_REF_TEMPLATE, JsonSchemaMode, JsonSchemaValue
from pydantic.json_schema import (
GenerateJsonSchema as PydanticJsonSchemaGenerator,
)
from pydantic.json_schema import DEFAULT_REF_TEMPLATE, JsonSchemaMode
from pydantic.json_schema import GenerateJsonSchema as PydanticJsonSchemaGenerator

from mex.common.types import Identifier
from mex.common.models.schema import JsonSchemaGenerator

RawModelDataT = TypeVar("RawModelDataT")


class JsonSchemaGenerator(PydanticJsonSchemaGenerator):
"""Customization of the pydantic class for generating JSON schemas."""

def handle_ref_overrides(self, json_schema: JsonSchemaValue) -> JsonSchemaValue:
"""Disable pydantic behavior to wrap top-level `$ref` keys in an `allOf`.
For example, pydantic would convert
{"$ref": "#/$defs/APIType", "examples": ["api-type-1"]}
into
{"allOf": {"$ref": "#/$defs/APIType"}, "examples": ["api-type-1"]}
which is in fact recommended by JSON schema, but we need to disable this
to stay compatible with mex-editor and mex-model.
"""
return json_schema


class BaseModel(PydanticBaseModel):
"""Common base class for all MEx model classes."""

Expand Down Expand Up @@ -207,29 +186,3 @@ def checksum(self) -> str:
def __str__(self) -> str:
"""Format this model as a string for logging."""
return f"{self.__class__.__name__}: {self.checksum()}"


class MExModel(BaseModel):
"""Abstract base model for extracted data, merged item and rule set classes.
This class defines an `identifier` field and gives a type hint for the frozen class
variable `entityType`.
"""

model_config = ConfigDict(extra="forbid")

if TYPE_CHECKING:
# We add the entityType as a final class variable to all `MExModel` subclasses.
# This helps with assigning the correct class when reading raw JSON entities.
# Simple duck-typing would not work, because some entity types have overlapping
# attributes, like `Person.email` and `ContactPoint.email`.
entityType: str

# A globally unique identifier for this item. Regardless of the entity-type or
# whether this item was extracted, merged, etc., identifiers will be assigned
# just once and should be declared as `frozen` on subclasses.
identifier: Identifier

def __str__(self) -> str:
"""Format this instance as a string for logging."""
return f"{self.entityType}: {self.identifier}"
12 changes: 6 additions & 6 deletions mex/common/models/contact_point.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ class BaseContactPoint(BaseModel):
class ExtractedContactPoint(BaseContactPoint, ExtractedData):
"""An automatically extracted metadata set describing a contact point."""

entityType: Literal["ExtractedContactPoint"] = Field(
"ExtractedContactPoint", alias="$type", frozen=True
)
entityType: Annotated[
Literal["ExtractedContactPoint"], Field(alias="$type", frozen=True)
] = "ExtractedContactPoint"
identifier: Annotated[ExtractedContactPointIdentifier, Field(frozen=True)]
stableTargetId: MergedContactPointIdentifier


class MergedContactPoint(BaseContactPoint, MergedItem):
"""The result of merging all extracted data and rules for a contact point."""

entityType: Literal["MergedContactPoint"] = Field(
"MergedContactPoint", alias="$type", frozen=True
)
entityType: Annotated[
Literal["MergedContactPoint"], Field(alias="$type", frozen=True)
] = "MergedContactPoint"
identifier: Annotated[MergedContactPointIdentifier, Field(frozen=True)]
12 changes: 6 additions & 6 deletions mex/common/models/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,17 @@ class BaseDistribution(BaseModel):
class ExtractedDistribution(BaseDistribution, ExtractedData):
"""An automatically extracted metadata set describing a distribution."""

entityType: Literal["ExtractedDistribution"] = Field(
"ExtractedDistribution", alias="$type", frozen=True
)
entityType: Annotated[
Literal["ExtractedDistribution"], Field(alias="$type", frozen=True)
] = "ExtractedDistribution"
identifier: Annotated[ExtractedDistributionIdentifier, Field(frozen=True)]
stableTargetId: MergedDistributionIdentifier


class MergedDistribution(BaseDistribution, MergedItem):
"""The result of merging all extracted data and rules for a distribution."""

entityType: Literal["MergedDistribution"] = Field(
"MergedDistribution", alias="$type", frozen=True
)
entityType: Annotated[
Literal["MergedDistribution"], Field(alias="$type", frozen=True)
] = "MergedDistribution"
identifier: Annotated[MergedDistributionIdentifier, Field(frozen=True)]
35 changes: 35 additions & 0 deletions mex/common/models/entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import TYPE_CHECKING

from pydantic import ConfigDict

from mex.common.models.base import BaseModel
from mex.common.types import Identifier


class BaseEntity(BaseModel):
    """Abstract base model for extracted data, merged item and rule set classes.

    This class gives type hints for an `identifier` field and the frozen class variable
    `entityType`. Subclasses should implement both fields and set the correct identifier
    type as well as the correct literal value for the entity type.
    """

    # Reject input containing fields that are not declared on the model, instead of
    # silently ignoring them (pydantic's `extra="forbid"` setting).
    model_config = ConfigDict(extra="forbid")

    if TYPE_CHECKING:  # pragma: no cover
        # The annotations in this block are only seen by static type checkers. At
        # runtime, `BaseEntity` itself declares neither field; concrete subclasses
        # are expected to define both of them.

        # The `entityType` class variable is added to all `BaseEntity` subclasses to
        # help with assigning the correct class when reading raw JSON entities.
        # E.g.: https://docs.pydantic.dev/latest/concepts/fields/#discriminator
        # Simple duck-typing would not work, because some entity-types have
        # overlapping attributes, like `Person.email` and `ContactPoint.email`.
        entityType: str

        # A globally unique identifier is added to all `BaseEntity` subclasses and
        # should be typed to the correct identifier type. Regardless of the
        # entity-type or whether this item was extracted, merged, etc., identifiers
        # will be assigned just once and should be declared as `frozen` on subclasses.
        identifier: Identifier

    def __str__(self) -> str:
        """Format this instance as a string for logging.

        Returns:
            The entity type and the identifier, separated by a colon and a space.
        """
        # Both attributes are only type-hinted on this class, so this relies on the
        # concrete subclass providing `entityType` and `identifier`.
        return f"{self.entityType}: {self.identifier}"
44 changes: 22 additions & 22 deletions mex/common/models/extracted_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import Field, model_validator, validate_call

from mex.common.models.base import MExModel
from mex.common.models.entity import BaseEntity
from mex.common.types import (
ExtractedPrimarySourceIdentifier,
MergedPrimarySourceIdentifier,
Expand All @@ -13,7 +13,7 @@
MEX_PRIMARY_SOURCE_STABLE_TARGET_ID = MergedPrimarySourceIdentifier("00000000000000")


class ExtractedData(MExModel):
class ExtractedData(BaseEntity):
"""Base model for all extracted data classes.
This class adds two important attributes for metadata provenance: `hadPrimarySource`
Expand All @@ -26,30 +26,30 @@ class ExtractedData(MExModel):
See below, for a full description.
"""

if TYPE_CHECKING:
if TYPE_CHECKING: # pragma: no cover
# Sometimes multiple primary sources describe the same activity, resource, etc.
# and a complete metadata item can only be created by merging these fragments.
# The `stableTargetID` is part of all models in `mex.common.models` to allow
# MEx to identify which extracted items describe the same thing and should be
# merged to create a complete metadata item.
# The name might be a bit misleading (also due to historical reasons), but the
# "stability" is only guaranteed for one "real world" or "digital world" thing
# having the same ID in MEx over time. But not as a guarantee, that the same
# metadata sources contribute to the complete metadata item.
# Because we anticipate that items have to be merged, the `stableTargetID` is
# The `stableTargetId` is part of all extracted models to allow MEx to identify
# which items describe the same thing and should be merged to create a complete
# metadata item. The name `stableTargetId` might be a bit misleading, because
# the "stability" only guarantees that one "real world" or "digital world"
# thing keeps the same ID in MEx over time. It does not guarantee that the
# same metadata sources contribute to the complete metadata item. The naming has
# historical reasons, but we plan to change it in the near future.
# Because we anticipate that items have to be merged, the `stableTargetId` is
# also used as the foreign key for all fields containing references.
stableTargetId: Any

hadPrimarySource: Annotated[
MergedPrimarySourceIdentifier,
Field(
description=(
"The stableTargetID of the primary source, that this item was "
"The stableTargetId of the primary source, that this item was "
"extracted from. This field is mandatory for all extracted items to "
"aid with data provenance. Extracted primary sources also have this "
"field and are all extracted from a static primary source for MEx. "
"The primary source for MEx has itself as a primary source, which "
"is meant to be the only loop in the graph formed by MEx metadata."
"The extracted primary source for MEx has its own merged item as a "
"primary source."
),
frozen=True,
),
Expand All @@ -62,8 +62,8 @@ class ExtractedData(MExModel):
"It is only unique amongst items coming from the same system, because "
"identifier formats are likely to overlap between systems. "
"The value for `identifierInPrimarySource` is therefore only unique in "
"composition with `hadPrimarySource`. MEx uses this composite key "
"to assign a stable and globally unique `identifier` to each item."
"composition with `hadPrimarySource`. MEx uses this composite key to "
"assign a stable and globally unique `identifier` per extracted item."
),
examples=["123456", "item-501", "D7/x4/zz.final3"],
min_length=1,
Expand All @@ -76,10 +76,10 @@ class ExtractedData(MExModel):
@classmethod
@validate_call
def set_identifiers(cls, values: dict[str, Any]) -> dict[str, Any]: # noqa: C901
"""Ensure identifier and provenance attributes are set for this instance.
"""Ensure identifiers and provenance attributes are set for this instance.
All extracted data classes have four important identifiers that are defined
by `MExModel` and `BaseExtractedData`:
by `BaseEntity`, `ExtractedData` and the concrete classes themselves:
- identifierInPrimarySource
- hadPrimarySource
Expand All @@ -93,21 +93,21 @@ def set_identifiers(cls, values: dict[str, Any]) -> dict[str, Any]: # noqa: C90
because otherwise we cannot reliably determine the origin of this item.
These two identifiers are the only two that need to be set during extraction.
Next we query the configured `IdentityProvider` to determine whether this item
Next, we query the configured `IdentityProvider` to determine whether this item
already has an `identifier` and `stableTargetId`. If not, we let the identity
provider generate new identifiers.
If an `identifier` has been passed to the constructor, we check that it matches
with what we got from the identity provider, because we don't allow any system
to change the association from `identifierInPrimarySource` and
`hadPrimarySource` to the `identifier`.
A use case for passing a matching `identifier` to the constructor would be
A use-case for passing a matching `identifier` to the constructor would be
parsing an already extracted item from an NDJSON file or an API endpoint.
If a `stableTargetId` has been passed to the constructor, we use that as the
new value, because changes to the stable target ID are generally allowed.
A use case for changing the `stableTargetId` will be the matching of
multiple extracted items (see `MExModel.stableTargetId` for details).
A use-case for changing the `stableTargetId` will be the matching of
multiple extracted items (see `ExtractedData.stableTargetId` for details).
Args:
values: Raw values to validate
Expand Down
4 changes: 2 additions & 2 deletions mex/common/models/merged_item.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from mex.common.models.base import MExModel
from mex.common.models.entity import BaseEntity


class MergedItem(MExModel):
class MergedItem(BaseEntity):
"""Base model for all merged item classes."""
Loading

0 comments on commit 452c5c6

Please sign in to comment.