Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QA engine: add adoption metrics to the QA report #21917

Merged
merged 10 commits into from
Jan 27, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

import pandas as pd

def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame) -> pd.DataFrame:
def get_enriched_catalog(
oss_catalog: pd.DataFrame,
cloud_catalog: pd.DataFrame,
adoption_metrics_per_connector_version: pd.DataFrame) -> pd.DataFrame:
"""Merge OSS and Cloud catalog in a single dataframe on their definition id.
Transformations:
- Rename columns to snake case.
Expand All @@ -15,9 +18,11 @@ def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame)
Enrichments:
- is_on_cloud: determined by the merge operation results.
- connector_technical_name: built from the docker repository field. airbyte/source-pokeapi -> source-pokeapi.
- Adoption metrics: add the columns from the adoption_metrics_per_connector_version dataframe.
Args:
oss_catalog (pd.DataFrame): The open source catalog dataframe.
cloud_catalog (pd.DataFrame): The cloud catalog dataframe.
adoption_metrics_per_connector_version (pd.DataFrame): The crowdsourced adoption metrics.

Returns:
pd.DataFrame: The enriched catalog.
Expand All @@ -33,10 +38,13 @@ def get_enriched_catalog(oss_catalog: pd.DataFrame, cloud_catalog: pd.DataFrame)
enriched_catalog.columns = enriched_catalog.columns.str.replace(
"(?<=[a-z])(?=[A-Z])", "_", regex=True
).str.lower() # column names to snake case
Comment on lines 39 to 41
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: it might be nice to move this into a function instead of needing a comment to explain what it's doing:

enriched_catalog.columns = enriched_catalog.columns.to_snake_case()

enriched_catalog = enriched_catalog[[c for c in enriched_catalog.columns if "_del" not in c]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional: I took a minute to understand why we were removing these - maybe using the suffix _cloud would make it clear that when merging the two, we prefer the OSS column and drop the cloud one

enriched_catalog["is_on_cloud"] = enriched_catalog["_merge"] == "both"
enriched_catalog = enriched_catalog.drop(columns="_merge")
enriched_catalog["connector_name"] = enriched_catalog["name"]
enriched_catalog["connector_technical_name"] = enriched_catalog["docker_repository"].str.replace("airbyte/", "")
enriched_catalog["connector_version"] = enriched_catalog["docker_image_tag"]
enriched_catalog["release_stage"] = enriched_catalog["release_stage"].fillna("unknown")
enriched_catalog = enriched_catalog.merge(adoption_metrics_per_connector_version, how="left", on=["connector_definition_id", "connector_version"])
enriched_catalog[adoption_metrics_per_connector_version.columns] = enriched_catalog[adoption_metrics_per_connector_version.columns].fillna(0)
return enriched_catalog
4 changes: 0 additions & 4 deletions tools/ci_connector_ops/ci_connector_ops/qa_engine/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import requests
import pandas as pd

from .constants import CLOUD_CATALOG_URL, OSS_CATALOG_URL

def fetch_remote_catalog(catalog_url: str) -> pd.DataFrame:
"""Fetch a combined remote catalog and return a single DataFrame
Expand Down Expand Up @@ -50,6 +49,3 @@ def fetch_adoption_metrics_per_connector_version() -> pd.DataFrame:
"total_syncs_count",
"sync_success_rate",
]]

CLOUD_CATALOG = fetch_remote_catalog(CLOUD_CATALOG_URL)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed these constants from the module because they caused unnecessary network calls on module import. I preferred to use dependency injection and test fixtures to expose them (via direct calls to the fetch_remote_catalog function).

OSS_CATALOG = fetch_remote_catalog(OSS_CATALOG_URL)
18 changes: 11 additions & 7 deletions tools/ci_connector_ops/ci_connector_ops/qa_engine/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
#


from .constants import GCS_QA_REPORT_PATH
from .enrichments import get_enriched_catalog
from .inputs import CLOUD_CATALOG, OSS_CATALOG
from .validations import get_qa_report
from .constants import CLOUD_CATALOG_URL, GCS_QA_REPORT_PATH, OSS_CATALOG_URL
from . import enrichments, inputs, validations, outputs


def main():
enriched_catalog = get_enriched_catalog(OSS_CATALOG, CLOUD_CATALOG)
qa_report = get_qa_report(enriched_catalog)
qa_report.to_json(GCS_QA_REPORT_PATH, orient="records")
oss_catalog = inputs.fetch_remote_catalog(OSS_CATALOG_URL)
cloud_catalog = inputs.fetch_remote_catalog(CLOUD_CATALOG_URL)
adoption_metrics_per_connector_version = inputs.fetch_adoption_metrics_per_connector_version()
enriched_catalog = enrichments.get_enriched_catalog(
oss_catalog,
cloud_catalog,
adoption_metrics_per_connector_version
)
validations.get_qa_report(enriched_catalog, len(oss_catalog))
34 changes: 20 additions & 14 deletions tools/ci_connector_ops/ci_connector_ops/qa_engine/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from enum import Enum
from typing import List

from pydantic import BaseModel
from pydantic import BaseModel, Field

class ConnectorTypeEnum(str, Enum):
source = "source"
Expand All @@ -18,20 +18,26 @@ class ReleaseStageEnum(str, Enum):
beta = "beta"
generally_available = "generally_available"

PUBLIC_FIELD = Field(..., is_public=True)
PRIVATE_FIELD = Field(..., is_public=False)
Comment on lines +22 to +23
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!


class ConnectorQAReport(BaseModel):
connector_type: ConnectorTypeEnum
connector_name: str
connector_technical_name: str
connector_definition_id: str
connector_version: str
release_stage: ReleaseStageEnum
is_on_cloud: bool
is_appropriate_for_cloud_use: bool
latest_build_is_successful: bool
documentation_is_available: bool
number_of_connections: int
number_of_users: int
sync_success_rate: float
connector_type: ConnectorTypeEnum = PUBLIC_FIELD
connector_name: str = PUBLIC_FIELD
connector_technical_name: str = PUBLIC_FIELD
connector_definition_id: str = PUBLIC_FIELD
connector_version: str = PUBLIC_FIELD
release_stage: ReleaseStageEnum = PUBLIC_FIELD
is_on_cloud: bool = PUBLIC_FIELD
is_appropriate_for_cloud_use: bool = PUBLIC_FIELD
latest_build_is_successful: bool = PUBLIC_FIELD
documentation_is_available: bool = PUBLIC_FIELD
number_of_connections: int = PRIVATE_FIELD
number_of_users: int = PRIVATE_FIELD
sync_success_rate: float = PRIVATE_FIELD
total_syncs_count: int = PRIVATE_FIELD
failed_syncs_count: int = PRIVATE_FIELD
succeeded_syncs_count: int = PRIVATE_FIELD

class QAReport(BaseModel):
connectors_qa_report: List[ConnectorQAReport]
15 changes: 15 additions & 0 deletions tools/ci_connector_ops/ci_connector_ops/qa_engine/outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


import pandas as pd

from .models import ConnectorQAReport

def persist_qa_report(qa_report: pd.DataFrame, path: str, public_fields_only: bool =True):
Copy link
Contributor Author

@alafanechere alafanechere Jan 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not called as I disabled GCS persistence in main for safety.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍🏻 I assume we'll usually persist with all of the info, but can use the public-only if we plan to use it in the public repo anywhere

final_fields = [
field.name for field in ConnectorQAReport.__fields__.values()
if field.field_info.extra["is_public"] or not public_fields_only
]
qa_report[final_fields].to_json(path, orient="records")
11 changes: 4 additions & 7 deletions tools/ci_connector_ops/ci_connector_ops/qa_engine/validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import requests

from .constants import INAPPROPRIATE_FOR_CLOUD_USE_CONNECTORS
from .inputs import OSS_CATALOG
from .models import ConnectorQAReport, QAReport

class QAReportGenerationError(Exception):
Expand All @@ -20,7 +19,7 @@ def url_is_reachable(url: str) -> bool:
def is_appropriate_for_cloud_use(definition_id: str) -> bool:
return definition_id not in INAPPROPRIATE_FOR_CLOUD_USE_CONNECTORS

def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame:
def get_qa_report(enriched_catalog: pd.DataFrame, oss_catalog_length: int) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Always a fan of bringing only the data you need 👍

"""Perform validation steps on top of the enriched catalog.
Adds the following columns:
- documentation_is_available:
Expand All @@ -37,6 +36,7 @@ def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame:
Get the sync success rate of the connections with this connector version from our datawarehouse.
Args:
enriched_catalog (pd.DataFrame): The enriched catalog.
oss_catalog_length (int): The length of the OSS catalog, for sanity check.

Returns:
pd.DataFrame: The final QA report.
Expand All @@ -47,14 +47,11 @@ def get_qa_report(enriched_catalog: pd.DataFrame) -> pd.DataFrame:

# TODO YET TO IMPLEMENT VALIDATIONS
qa_report["latest_build_is_successful"] = False # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21720
qa_report["number_of_connections"] = 0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721
qa_report["number_of_users"] = 0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721
qa_report["sync_success_rate"] = .0 # TODO, tracked in https://github.com/airbytehq/airbyte/issues/21721

# Only select dataframe columns defined in the ConnectorQAReport model.
qa_report= qa_report[[field.name for field in ConnectorQAReport.__fields__.values()]]
# Validate the report structure with pydantic QAReport model.
QAReport(connectors_qa_report=qa_report.to_dict(orient="records"))
if len(qa_report) != len(OSS_CATALOG):
raise QAReportGenerationError("The QA report does not contain all the connectors defined in the OSS catalog.")
if len(qa_report) != oss_catalog_length:
raise QAReportGenerationError("The QA report does not contain all the connectors defined in the OSS catalog.")
return qa_report
2 changes: 1 addition & 1 deletion tools/ci_connector_ops/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


setup(
version="0.1.6",
version="0.1.7",
name="ci_connector_ops",
description="Packaged maintained by the connector operations team to perform CI for connectors",
author="Airbyte",
Expand Down
54 changes: 54 additions & 0 deletions tools/ci_connector_ops/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


import pandas as pd
import pytest

from ci_connector_ops.qa_engine.constants import OSS_CATALOG_URL, CLOUD_CATALOG_URL
from ci_connector_ops.qa_engine.inputs import fetch_remote_catalog

@pytest.fixture(scope="module")
def oss_catalog():
    """Fetch the OSS catalog from OSS_CATALOG_URL, shared across a test module."""
    # Module scope keeps the fetch to one call per test module.
    catalog = fetch_remote_catalog(OSS_CATALOG_URL)
    return catalog

@pytest.fixture(scope="module")
def cloud_catalog():
    """Fetch the Cloud catalog from CLOUD_CATALOG_URL, shared across a test module."""
    # Module scope keeps the fetch to one call per test module.
    catalog = fetch_remote_catalog(CLOUD_CATALOG_URL)
    return catalog

@pytest.fixture(scope="module")
def adoption_metrics_per_connector_version():
    """Provide a single-row adoption metrics dataframe whose metrics are all zero.

    The row is keyed by a connector definition id and a connector version,
    followed by the count columns and the sync success rate.
    """
    record = {
        "connector_definition_id": "dfd88b22-b603-4c3d-aad7-3701784586b1",
        "connector_version": "2.0.0",
    }
    # Insertion order matters: it determines the dataframe column order.
    count_columns = (
        "number_of_connections",
        "number_of_users",
        "succeeded_syncs_count",
        "failed_syncs_count",
        "total_syncs_count",
    )
    for count_column in count_columns:
        record[count_column] = 0
    record["sync_success_rate"] = 0.0
    return pd.DataFrame([record])

@pytest.fixture
def dummy_qa_report() -> pd.DataFrame:
    """Build a one-connector QA report dataframe for tests.

    Returns:
        pd.DataFrame: A single-row dataframe with one column per report field.
    """
    single_connector_row = dict(
        connector_type="source",
        connector_name="test",
        connector_technical_name="source-test",
        connector_definition_id="foobar",
        connector_version="0.0.0",
        release_stage="alpha",
        is_on_cloud=False,
        is_appropriate_for_cloud_use=True,
        latest_build_is_successful=True,
        documentation_is_available=False,
        number_of_connections=0,
        number_of_users=0,
        sync_success_rate=0.99,
        total_syncs_count=0,
        failed_syncs_count=0,
        succeeded_syncs_count=0,
    )
    return pd.DataFrame([single_connector_row])
23 changes: 15 additions & 8 deletions tools/ci_connector_ops/tests/test_qa_engine/test_enrichments.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,37 @@
import pandas as pd
import pytest

from ci_connector_ops.qa_engine import inputs, enrichments
from ci_connector_ops.qa_engine import enrichments


@pytest.fixture
def enriched_catalog() -> pd.DataFrame:
return enrichments.get_enriched_catalog(inputs.OSS_CATALOG, inputs.CLOUD_CATALOG)
def enriched_catalog(oss_catalog, cloud_catalog, adoption_metrics_per_connector_version) -> pd.DataFrame:
return enrichments.get_enriched_catalog(oss_catalog, cloud_catalog, adoption_metrics_per_connector_version)

@pytest.fixture
def enriched_catalog_columns(enriched_catalog: pd.DataFrame) -> set:
return set(enriched_catalog.columns)

def test_merge_performed_correctly(enriched_catalog):
assert len(enriched_catalog) == len(inputs.OSS_CATALOG)
def test_merge_performed_correctly(enriched_catalog, oss_catalog):
assert len(enriched_catalog) == len(oss_catalog)

def test_new_columns_are_added(enriched_catalog_columns):
expected_new_columns = {
"is_on_cloud",
"connector_name",
"connector_technical_name",
"connector_version"
"connector_version",
"number_of_connections",
"number_of_users",
"succeeded_syncs_count",
"failed_syncs_count",
"total_syncs_count",
"sync_success_rate",
}
assert expected_new_columns.issubset(enriched_catalog_columns)

def test_no_column_are_removed_and_lowercased(enriched_catalog_columns):
for column in inputs.OSS_CATALOG:
def test_no_column_are_removed_and_lowercased(enriched_catalog_columns, oss_catalog):
for column in oss_catalog:
assert re.sub(r"(?<!^)(?=[A-Z])", "_", column).lower() in enriched_catalog_columns

def test_release_stage_not_null(enriched_catalog):
Expand Down
4 changes: 2 additions & 2 deletions tools/ci_connector_ops/tests/test_qa_engine/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import pandas as pd
import pytest

from ci_connector_ops.qa_engine import inputs
from ci_connector_ops.qa_engine import inputs, constants

@pytest.mark.parametrize("catalog_url", [inputs.OSS_CATALOG_URL, inputs.CLOUD_CATALOG_URL])
@pytest.mark.parametrize("catalog_url", [constants.OSS_CATALOG_URL, constants.CLOUD_CATALOG_URL])
def test_fetch_remote_catalog(catalog_url):
catalog = inputs.fetch_remote_catalog(catalog_url)
assert isinstance(catalog, pd.DataFrame)
Expand Down
58 changes: 31 additions & 27 deletions tools/ci_connector_ops/tests/test_qa_engine/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,38 @@
#


import pandas as pd
import pytest

from ci_connector_ops.qa_engine import main

@pytest.fixture
def dummy_report() -> pd.DataFrame:
return pd.DataFrame([
{
"connector_type": "source",
"connector_name": "test",
"docker_image_tag": "0.0.0",
"release_stage": "alpha",
"is_on_cloud": False,
"latest_build_is_successful": False,
"documentation_is_available": False,
"number_of_connections": 0,
"number_of_users": 0,
"sync_success_rate": .99
}
])

def test_main(tmp_path, mocker, dummy_report):
output_path = tmp_path / "output.json"
mocker.patch.object(main, "GCS_QA_REPORT_PATH", output_path)
mocker.patch.object(main, "get_enriched_catalog")
mocker.patch.object(main, "get_qa_report", mocker.Mock(return_value=dummy_report))
def test_main(mocker, dummy_qa_report):
mock_oss_catalog = mocker.Mock(__len__=mocker.Mock(return_value=42))
mock_cloud_catalog = mocker.Mock()

mocker.patch.object(main, "enrichments")
mocker.patch.object(main, "outputs")
mocker.patch.object(
main.inputs,
"fetch_remote_catalog",
mocker.Mock(side_effect=[mock_oss_catalog, mock_cloud_catalog]))
mocker.patch.object(main.inputs, "fetch_adoption_metrics_per_connector_version")
mocker.patch.object(main.validations, "get_qa_report", mocker.Mock(return_value=dummy_qa_report))

main.main()
main.get_enriched_catalog.assert_called_with(main.OSS_CATALOG, main.CLOUD_CATALOG)
main.get_qa_report.assert_called_with(main.get_enriched_catalog.return_value)
assert pd.read_json(output_path).to_dict() == dummy_report.to_dict()

assert main.inputs.fetch_remote_catalog.call_count == 2
main.inputs.fetch_remote_catalog.assert_has_calls(
[
mocker.call(main.OSS_CATALOG_URL),
mocker.call(main.CLOUD_CATALOG_URL)
]
)
main.enrichments.get_enriched_catalog.assert_called_with(
mock_oss_catalog,
mock_cloud_catalog,
main.inputs.fetch_adoption_metrics_per_connector_version.return_value
)
main.validations.get_qa_report.assert_called_with(
main.enrichments.get_enriched_catalog.return_value,
len(mock_oss_catalog)
)

24 changes: 24 additions & 0 deletions tools/ci_connector_ops/tests/test_qa_engine/test_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#


import pandas as pd
import pytest

from ci_connector_ops.qa_engine import outputs

@pytest.mark.parametrize("public_fields_only", [True, False])
def test_persist_qa_report_public_fields_only(tmp_path, dummy_qa_report, public_fields_only):
    """Check that persist_qa_report only writes private fields when asked to.

    Args:
        tmp_path: Pytest temporary directory the report is written to.
        dummy_qa_report: A small QA report dataframe to persist.
        public_fields_only (bool): Whether private fields must be left out of the output.
    """
    output_path = tmp_path / "qa_report.json"
    outputs.persist_qa_report(dummy_qa_report, output_path, public_fields_only=public_fields_only)
    qa_report_from_disk = pd.read_json(output_path)
    private_fields = {
        field.name for field in outputs.ConnectorQAReport.__fields__.values()
        if not field.field_info.extra["is_public"]
    }
    # Guard against a vacuous pass if the model ever stops declaring private fields.
    assert private_fields, "ConnectorQAReport is expected to declare at least one private field."
    available_fields = set(qa_report_from_disk.columns)
    if public_fields_only:
        # No private field may leak: `not issubset` would pass as soon as one is missing.
        assert private_fields.isdisjoint(available_fields)
    else:
        assert private_fields.issubset(available_fields)
Loading