From aaf84bd081b057a926f8e3491b6d5c3442b8fefe Mon Sep 17 00:00:00 2001 From: NiharDoshi99 <51595473+NiharDoshi99@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:57:36 +0530 Subject: [PATCH] #13974 handle for hyphen in schema and median function (#14834) --- .../interface/profiler_interface_factory.py | 7 ++ .../sqlalchemy/mariadb/profiler_interface.py | 85 +++++++++++++++++++ .../source/mariadb/functions/median.py | 20 +++++ .../mariadb/metrics/window/first_quartile.py | 10 +++ .../source/mariadb/metrics/window/median.py | 10 +++ .../mariadb/metrics/window/third_quartile.py | 10 +++ .../unit/profiler/test_profiler_interface.py | 7 ++ 7 files changed, 149 insertions(+) create mode 100644 ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py create mode 100644 ingestion/src/metadata/profiler/source/mariadb/functions/median.py create mode 100644 ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py create mode 100644 ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py create mode 100644 ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py index 10c30f810e96..661bf4f3d8e5 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py @@ -24,6 +24,9 @@ from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) +from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( + MariaDBConnection, +) from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( SingleStoreConnection, ) @@ -47,6 +50,9 @@ from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) +from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( + MariaDBProfilerInterface, +) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -100,6 +106,7 @@ def create(self, interface_type: str, *args, **kwargs): BigQueryConnection.__name__: BigQueryProfilerInterface, SingleStoreConnection.__name__: SingleStoreProfilerInterface, DatalakeConnection.__name__: PandasProfilerInterface, + MariaDBConnection.__name__: MariaDBProfilerInterface, SnowflakeConnection.__name__: SnowflakeProfilerInterface, TrinoConnection.__name__: TrinoProfilerInterface, UnityCatalogConnection.__name__: UnityCatalogProfilerInterface, diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py new file mode 100644 index 000000000000..153a04a852c2 --- /dev/null +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py @@ -0,0 +1,85 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Interfaces with database for all database engine +supporting sqlalchemy abstraction layer +""" + +from typing import List + +from sqlalchemy.exc import ProgrammingError + +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, + handle_query_exception, +) +from metadata.profiler.metrics.registry import Metrics +from metadata.profiler.processor.runner import QueryRunner +from metadata.profiler.source.mariadb.metrics.window.first_quartile import ( + MariaDBFirstQuartile, +) +from metadata.profiler.source.mariadb.metrics.window.median import MariaDBMedian +from metadata.profiler.source.mariadb.metrics.window.third_quartile import ( + MariaDBThirdQuartile, +) +from metadata.utils.logger import profiler_interface_registry_logger + +logger = profiler_interface_registry_logger() + + +class MariaDBProfilerInterface(SQAProfilerInterface): + """ + Interface to interact with registry supporting + sqlalchemy. + """ + + def _compute_window_metrics( + self, + metrics: List[Metrics], + runner: QueryRunner, + *args, + **kwargs, + ): + """Given a list of metrics, compute the given results + and returns the values + + Args: + column: the column to compute the metrics against + metrics: list of metrics to compute + Returns: + dictionnary of results + """ + session = kwargs.get("session") + column = kwargs.get("column") + + if not metrics: + return None + + try: + # we patch the metrics at runtime to use the MariaDB specific functions + # as we can't compile the query based on the dialect as it return `mysql` + metrics = [MariaDBFirstQuartile, MariaDBMedian, MariaDBThirdQuartile] # type: ignore + row = runner.select_first_from_sample( + *[metric(column).fn() for metric in metrics], + ) + if row: + return dict(row) + except ProgrammingError: + logger.info( + f"Skipping window metrics for {runner.table.__tablename__}.{column.name} due to overflow" + ) + return None + + except Exception as exc: + msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" + handle_query_exception(msg, exc, session) + return None diff --git a/ingestion/src/metadata/profiler/source/mariadb/functions/median.py b/ingestion/src/metadata/profiler/source/mariadb/functions/median.py new file mode 100644 index 000000000000..bb90356c64da --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/functions/median.py @@ -0,0 +1,20 @@ +"""Median function for MariaDB""" + +from sqlalchemy.ext.compiler import compiles +from sqlalchemy.sql.functions import FunctionElement + +from metadata.profiler.metrics.core import CACHE + + +class MariaDBMedianFn(FunctionElement): + inherit_cache = CACHE + + +@compiles(MariaDBMedianFn) +def _(elements, compiler, **kwargs): # pylint: disable=unused-argument + col = compiler.process(elements.clauses.clauses[0]) + percentile = elements.clauses.clauses[2].value + # According to the documentation available at https://mariadb.com/kb/en/median/#description, + # the PERCENTILE_CONT function can be utilized to calculate the median. Therefore, it is + # being used in this context. + return f"PERCENTILE_CONT({percentile:.2f}) WITHIN GROUP (ORDER BY {col}) OVER()" diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py new file mode 100644 index 000000000000..a56cf84cca63 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.first_quartile import FirstQuartile +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBFirstQuartile(FirstQuartile): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py new file mode 100644 index 000000000000..a5903175f115 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.median import Median +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBMedian(Median): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py new file mode 100644 index 000000000000..9211f5dfd752 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.third_quartile import ThirdQuartile +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBThirdQuartile(ThirdQuartile): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/tests/unit/profiler/test_profiler_interface.py b/ingestion/tests/unit/profiler/test_profiler_interface.py index 2dc878a8e37c..bff960b031c8 100644 --- a/ingestion/tests/unit/profiler/test_profiler_interface.py +++ b/ingestion/tests/unit/profiler/test_profiler_interface.py @@ -39,6 +39,9 @@ from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) +from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( + MariaDBConnection, +) from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( SingleStoreConnection, ) @@ -75,6 +78,9 @@ from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) +from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( + MariaDBProfilerInterface, +) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -369,6 +375,7 @@ def test_register_many(self): TrinoConnection.__name__: TrinoProfilerInterface, UnityCatalogConnection.__name__: UnityCatalogProfilerInterface, DatabricksConnection.__name__: DatabricksProfilerInterface, + MariaDBConnection.__name__: MariaDBProfilerInterface, } # Register profiles