-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
QA engine: add adoption metrics to the QA report #21917
Changes from 6 commits
15a056b
943a90c
ad0136a
95dd606
6d4feb3
76e3cbf
dd33060
a687435
6a6396d
3b8c2c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,10 @@ | |
|
||
import pandas as pd | ||
|
||
def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame) -> pd.DataFrame: | ||
def get_enriched_catalog( | ||
oss_catalog: pd.DataFrame, | ||
cloud_catalog: pd.DataFrame, | ||
adoption_metrics_per_connector_version: pd.DataFrame) -> pd.DataFrame: | ||
"""Merge OSS and Cloud catalog in a single dataframe on their definition id. | ||
Transformations: | ||
- Rename columns to snake case. | ||
|
@@ -15,9 +18,11 @@ def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame) | |
Enrichments: | ||
- is_on_cloud: determined by the merge operation results. | ||
- connector_technical_name: built from the docker repository field. airbyte/source-pokeapi -> source-pokeapi. | ||
- Adoption metrics: add the columns from the adoption_metrics_per_connector_version dataframe. | ||
Args: | ||
oss_catalog (pd.DataFrame): The open source catalog dataframe. | ||
cloud_catalog (pd.DataFrame): The cloud catalog dataframe. | ||
adoption_metrics_per_connector_version (pd.DataFrame): The crowd-sourced adoption metrics. | ||
|
||
Returns: | ||
pd.DataFrame: The enriched catalog. | ||
|
@@ -33,10 +38,13 @@ def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame) | |
enriched_catalog.columns = enriched_catalog.columns.str.replace( | ||
"(?<=[a-z])(?=[A-Z])", "_", regex=True | ||
).str.lower() # column names to snake case | ||
enriched_catalog = enriched_catalog[[c for c in enriched_catalog.columns if "_del" not in c]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Optional: I took a minute to understand why we were removing these - maybe using the suffix |
||
enriched_catalog["is_on_cloud"] = enriched_catalog["_merge"] == "both" | ||
enriched_catalog = enriched_catalog.drop(columns="_merge") | ||
enriched_catalog["connector_name"] = enriched_catalog["name"] | ||
enriched_catalog["connector_technical_name"] = enriched_catalog["docker_repository"].str.replace("airbyte/", "") | ||
enriched_catalog["connector_version"] = enriched_catalog["docker_image_tag"] | ||
enriched_catalog["release_stage"] = enriched_catalog["release_stage"].fillna("unknown") | ||
enriched_catalog = enriched_catalog.merge(adoption_metrics_per_connector_version, how="left", on=["connector_definition_id", "connector_version"]) | ||
enriched_catalog[adoption_metrics_per_connector_version.columns] = enriched_catalog[adoption_metrics_per_connector_version.columns].fillna(0) | ||
return enriched_catalog |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,6 @@ | |
import requests | ||
import pandas as pd | ||
|
||
from .constants import CLOUD_CATALOG_URL, OSS_CATALOG_URL | ||
|
||
def fetch_remote_catalog(catalog_url: str) -> pd.DataFrame: | ||
"""Fetch a combined remote catalog and return a single DataFrame | ||
|
@@ -50,6 +49,3 @@ def fetch_adoption_metrics_per_connector_version() -> pd.DataFrame: | |
"total_syncs_count", | ||
"sync_success_rate", | ||
]] | ||
|
||
CLOUD_CATALOG = fetch_remote_catalog(CLOUD_CATALOG_URL) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I removed these constants from the module because they cause unnecessary network call on module import. I preferred to use dependency injections and test fixtures to expose these (direct calls to the |
||
OSS_CATALOG = fetch_remote_catalog(OSS_CATALOG_URL) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
from enum import Enum | ||
from typing import List | ||
|
||
from pydantic import BaseModel | ||
from pydantic import BaseModel, Field | ||
|
||
class ConnectorTypeEnum(str, Enum): | ||
source = "source" | ||
|
@@ -18,20 +18,26 @@ class ReleaseStageEnum(str, Enum): | |
beta = "beta" | ||
generally_available = "generally_available" | ||
|
||
PUBLIC_FIELD = Field(..., is_public=True) | ||
PRIVATE_FIELD = Field(..., is_public=False) | ||
Comment on lines
+22
to
+23
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! |
||
|
||
class ConnectorQAReport(BaseModel): | ||
connector_type: ConnectorTypeEnum | ||
connector_name: str | ||
connector_technical_name: str | ||
connector_definition_id: str | ||
connector_version: str | ||
release_stage: ReleaseStageEnum | ||
is_on_cloud: bool | ||
is_appropriate_for_cloud_use: bool | ||
latest_build_is_successful: bool | ||
documentation_is_available: bool | ||
number_of_connections: int | ||
number_of_users: int | ||
sync_success_rate: float | ||
connector_type: ConnectorTypeEnum = PUBLIC_FIELD | ||
connector_name: str = PUBLIC_FIELD | ||
connector_technical_name: str = PUBLIC_FIELD | ||
connector_definition_id: str = PUBLIC_FIELD | ||
connector_version: str = PUBLIC_FIELD | ||
release_stage: ReleaseStageEnum = PUBLIC_FIELD | ||
is_on_cloud: bool = PUBLIC_FIELD | ||
is_appropriate_for_cloud_use: bool = PUBLIC_FIELD | ||
latest_build_is_successful: bool = PUBLIC_FIELD | ||
documentation_is_available: bool = PUBLIC_FIELD | ||
number_of_connections: int = PRIVATE_FIELD | ||
number_of_users: int = PRIVATE_FIELD | ||
sync_success_rate: float = PRIVATE_FIELD | ||
total_syncs_count: int = PRIVATE_FIELD | ||
failed_syncs_count: int = PRIVATE_FIELD | ||
succeeded_syncs_count: int = PRIVATE_FIELD | ||
|
||
class QAReport(BaseModel): | ||
connectors_qa_report: List[ConnectorQAReport] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# | ||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
|
||
import pandas as pd | ||
|
||
from .models import ConnectorQAReport | ||
|
||
def persist_qa_report(qa_report: pd.DataFrame, path: str, public_fields_only: bool =True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not called as I disabled GCS persistence in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍🏻 I assume we'll usually persist with all of the info, but can use the public-only if we plan to use it in the public repo anywhere |
||
final_fields = [ | ||
field.name for field in ConnectorQAReport.__fields__.values() | ||
if field.field_info.extra["is_public"] or not public_fields_only | ||
] | ||
qa_report[final_fields].to_json(path, orient="records") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,6 @@ | |
import requests | ||
|
||
from .constants import INAPPROPRIATE_FOR_CLOUD_USE_CONNECTORS | ||
from .inputs import OSS_CATALOG | ||
from .models import ConnectorQAReport, QAReport | ||
|
||
class QAReportGenerationError(Exception): | ||
|
@@ -20,7 +19,7 @@ def url_is_reachable(url: str) -> bool: | |
def is_appropriate_for_cloud_use(definition_id: str) -> bool: | ||
return definition_id not in INAPPROPRIATE_FOR_CLOUD_USE_CONNECTORS | ||
|
||
def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame: | ||
def get_qa_report(enriched_catalog: pd.DataFrame, oss_catalog_length: int) -> pd.DataFrame: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Always a fan of bring only the data you need 👍 |
||
"""Perform validation steps on top of the enriched catalog. | ||
Adds the following columns: | ||
- documentation_is_available: | ||
|
@@ -37,6 +36,7 @@ def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame: | |
Get the sync success rate of the connections with this connector version from our datawarehouse. | ||
Args: | ||
enriched_catalog (pd.DataFrame): The enriched catalog. | ||
oss_catalog_length (int): The length of the OSS catalog, for sanity check. | ||
|
||
Returns: | ||
pd.DataFrame: The final QA report. | ||
|
@@ -47,14 +47,11 @@ def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame: | |
|
||
# TODO YET TO IMPLEMENT VALIDATIONS | ||
qa_report["latest_build_is_successful"] = False # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21720 | ||
qa_report["number_of_connections"] = 0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721 | ||
qa_report["number_of_users"] = 0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721 | ||
qa_report["sync_success_rate"] = .0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721 | ||
|
||
# Only select dataframe columns defined in the ConnectorQAReport model. | ||
qa_report= qa_report[[field.name for field in ConnectorQAReport.__fields__.values()]] | ||
# Validate the report structure with pydantic QAReport model. | ||
QAReport(connectors_qa_report=qa_report.to_dict(orient="records")) | ||
if len(qa_report) != len(OSS_CATALOG): | ||
raise QAReportGenerationError("The QA report does not contain all the connectors defined in the OSS catalog.") | ||
if len(qa_report) != oss_catalog_length: | ||
raise QAReportGenerationError("The QA report does not contain all the connectors defined in the OSS catalog.") | ||
return qa_report |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# | ||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
|
||
import pandas as pd | ||
import pytest | ||
|
||
from ci_connector_ops.qa_engine.constants import OSS_CATALOG_URL, CLOUD_CATALOG_URL | ||
from ci_connector_ops.qa_engine.inputs import fetch_remote_catalog | ||
|
||
@pytest.fixture(scope="module") | ||
def oss_catalog(): | ||
return fetch_remote_catalog(OSS_CATALOG_URL) | ||
|
||
@pytest.fixture(scope="module") | ||
def cloud_catalog(): | ||
return fetch_remote_catalog(CLOUD_CATALOG_URL) | ||
|
||
@pytest.fixture(scope="module") | ||
def adoption_metrics_per_connector_version(): | ||
return pd.DataFrame([{ | ||
"connector_definition_id": "dfd88b22-b603-4c3d-aad7-3701784586b1", | ||
"connector_version": "2.0.0", | ||
"number_of_connections": 0, | ||
"number_of_users": 0, | ||
"succeeded_syncs_count": 0, | ||
"failed_syncs_count": 0, | ||
"total_syncs_count": 0, | ||
"sync_success_rate": 0.0, | ||
}]) | ||
|
||
@pytest.fixture | ||
def dummy_qa_report() -> pd.DataFrame: | ||
return pd.DataFrame([ | ||
{ | ||
"connector_type": "source", | ||
"connector_name": "test", | ||
"connector_technical_name": "source-test", | ||
"connector_definition_id": "foobar", | ||
"connector_version": "0.0.0", | ||
"release_stage": "alpha", | ||
"is_on_cloud": False, | ||
"is_appropriate_for_cloud_use": True, | ||
"latest_build_is_successful": True, | ||
"documentation_is_available": False, | ||
"number_of_connections": 0, | ||
"number_of_users": 0, | ||
"sync_success_rate": .99, | ||
"total_syncs_count": 0, | ||
"failed_syncs_count": 0, | ||
"succeeded_syncs_count": 0 | ||
} | ||
]) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# | ||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
|
||
import pandas as pd | ||
import pytest | ||
|
||
from ci_connector_ops.qa_engine import outputs | ||
|
||
@pytest.mark.parametrize("public_fields_only", [True, False]) | ||
def test_persist_qa_report_public_fields_only(tmp_path, dummy_qa_report, public_fields_only): | ||
output_path = tmp_path / "qa_report.json" | ||
outputs.persist_qa_report(dummy_qa_report, output_path, public_fields_only=public_fields_only) | ||
qa_report_from_disk = pd.read_json(output_path) | ||
private_fields = { | ||
field.name for field in outputs.ConnectorQAReport.__fields__.values() | ||
if not field.field_info.extra["is_public"] | ||
} | ||
available_fields = set(qa_report_from_disk.columns) | ||
if public_fields_only: | ||
assert not private_fields.issubset(available_fields) | ||
else: | ||
assert private_fields.issubset(available_fields) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: might be nice to move this into a function instead of needing comment to explain what its doing: