From 16900155dab39247af7e0fbf8a044b877929f3a2 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Mon, 17 Jan 2022 13:12:25 +0200 Subject: [PATCH 1/9] initial draft on impala templates Signed-off-by: mrMoZ1 --- .../vdk-plugins/vdk-impala/requirements.txt | 4 + .../src/vdk/plugin/impala/impala_helper.py | 167 +++++ .../src/vdk/plugin/impala/impala_plugin.py | 42 ++ .../src/vdk/plugin/impala/templates/README.md | 3 + .../vdk/plugin/impala/templates/__init__.py | 2 + .../src/vdk/plugin/impala/templates/errors.py | 20 + .../plugin/impala/templates/load/__init__.py | 2 + .../templates/load/dimension/__init__.py | 2 + .../load/dimension/scd1/00-definition.py | 35 ++ .../scd1/00-test-if-view-matches-target.sql | 3 + .../dimension/scd1/01-insert-into-target.sql | 9 + .../load/dimension/scd1/02-refresh.sql | 2 + .../load/dimension/scd1/03-compute-stats.sql | 1 + .../templates/load/dimension/scd1/README.md | 43 ++ .../templates/load/dimension/scd1/__init__.py | 2 + .../load/dimension/scd2/00-definition.py | 40 ++ .../scd2/00-test-if-view-matches-target.sql | 3 + .../dimension/scd2/01-insert-into-target.sql | 40 ++ .../load/dimension/scd2/02-refresh.sql | 2 + .../load/dimension/scd2/03-compute-stats.sql | 1 + .../templates/load/dimension/scd2/README.md | 55 ++ .../templates/load/dimension/scd2/__init__.py | 2 + .../impala/templates/load/fact/__init__.py | 2 + .../load/fact/snapshot/00-definition.py | 36 ++ .../00-test-if-view-matches-target.sql | 3 + .../fact/snapshot/01-insert-into-target.sql | 13 + .../load/fact/snapshot/02-refresh.sql | 1 + .../load/fact/snapshot/03-compute-stats.sql | 1 + .../templates/load/fact/snapshot/README.md | 48 ++ .../templates/load/fact/snapshot/__init__.py | 2 + .../templates/load/fact/snapshot/params.json | 7 + .../templates/load/versioned/00-definition.py | 74 +++ .../00-test-if-view-matches-target.sql | 19 + .../load/versioned/01-insert-into-target.sql | 117 ++++ .../templates/load/versioned/02-refresh.sql | 2 + 
.../load/versioned/03-compute-stats.sql | 1 + .../impala/templates/load/versioned/README.md | 56 ++ .../templates/load/versioned/__init__.py | 2 + .../vdk/plugin/impala/templates/template.py | 80 +++ .../impala/templates/template_executor.py | 80 +++ .../vdk-impala/tests/jobs/__init__.py | 2 + .../01_prepare_input_data.py | 81 +++ .../02_run_load_dimension_scd1_template.py | 15 + .../__init__.py | 2 + .../01_prepare_input_data.py | 67 ++ .../02_run_load_dimension_scd1_template.py | 15 + .../__init__.py | 2 + .../01_prepare_input_data.py | 193 ++++++ .../02_run_load_dimension_scd2_template.py | 15 + .../__init__.py | 2 + .../01_prepare_input_data.py | 128 ++++ .../02_run_load_fact_snapshot_template.py | 15 + .../__init__.py | 2 + .../01_prepare_input_data.py | 106 ++++ .../02_run_load_fact_snapshot_template.py | 15 + .../__init__.py | 2 + .../01_prepare_input_data.py | 120 ++++ .../02_run_load_versioned_template.py | 15 + .../load_versioned_template_job/__init__.py | 2 + .../01_prepare_input_data.py | 124 ++++ .../02_run_load_versioned_template.py | 15 + .../__init__.py | 2 + .../tests/template_regression_test.py | 594 ++++++++++++++++++ 63 files changed, 2558 insertions(+) create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py create mode 100644 
projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/README.md create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql create mode 100644 
projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/params.json create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py create mode 100644 
projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py create mode 100644 
projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/template_regression_test.py diff --git a/projects/vdk-plugins/vdk-impala/requirements.txt b/projects/vdk-plugins/vdk-impala/requirements.txt index 76421477c0..eba0c42ea0 100644 --- a/projects/vdk-plugins/vdk-impala/requirements.txt +++ b/projects/vdk-plugins/vdk-impala/requirements.txt @@ -2,6 +2,10 @@ vdk-core impyla tabulate +# template requirements +pydantic +pyarrow + # testing requirements click vdk-test-utils diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py new file mode 100644 index 0000000000..c63a32f08c --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py @@ -0,0 +1,167 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +import logging +from collections import OrderedDict + +import pyarrow +from vdk.internal.builtin_plugins.connection.managed_connection_base import ( + ManagedConnectionBase, +) +from vdk.internal.core import errors +from vdk.plugin.impala.impala_connection import ImpalaConnection + + +class ImpalaHelper: + def __init__(self, db_connection: ImpalaConnection) -> None: + self._log = logging.getLogger(__name__) + self._db_connection = db_connection + + def get_table_description(self, table_name): + self._log.debug(f"Retrieving details for table {table_name}.") + try: + return self._db_connection.execute_query(f"DESCRIBE formatted {table_name}") + except Exception as e: + if errors.exception_matches( + e, "impala.error.HiveServer2Error", ".*AuthorizationException.*" + ): + errors.log_and_throw( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=self._log, + what_happened=f"Data loading into table {table_name} has failed.", + why_it_happened=( + f"You are trying to load data into a table which you do not have access to or it does not " + f"exist: {table_name}." + ), + consequences="Data load will be aborted.", + countermeasures="Make sure that the destination table exists and you have access to it.", + ) + else: + raise e + + def __get_table_schema(self, table_description, section_start="#", second_end="#"): + """ + Gets column names and data types from Impala table. 
+ It would be a lot more easier to execute pure describe statement, but then we will execute 2 describe statements + to get the table schema and check if the table is stored as parquet + Will return the full table schema including partition columns order the same way as in Impala + """ + self._log.debug("Retrieving destination table schema.") + column_name_to_column_type_map = OrderedDict() + is_in_columns_section = False + searched_section_ended = False + searched_sectioned_started = False + for ( + column_name, + column_type, + _, + ) in table_description: # 3rd value is column comment + if column_name is None or column_name.strip() == "": + continue + if column_name.startswith(section_start): # new section begins + searched_sectioned_started = True + if searched_sectioned_started and not is_in_columns_section: + if column_name.strip() == "# col_name": # column info follows + is_in_columns_section = True + else: + is_in_columns_section = False + continue + if searched_sectioned_started: + if column_name.startswith(second_end): + searched_section_ended = True + if is_in_columns_section and not searched_section_ended: + column_name_to_column_type_map[ + column_name.strip() + ] = column_type.strip() + return column_name_to_column_type_map + + def get_table_columns(self, table_description): + """ + :param table_description: result of #get_table_description + :return: dict with column name and type + """ + return self.__get_table_schema(table_description, section_start="#") + + def get_table_partitions(self, table_description): + """ + :param table_description: result of #get_table_description + :return: dict with partition name and type + """ + return self.__get_table_schema( + table_description, section_start="# Partition Information" + ) + + def ensure_table_format_is_parquet(self, table_name, table_description): + for key, value, _ in table_description: # 3rd value is column comment + # Input format of parquet table is 
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat" + if key is not None and key.strip() == "InputFormat:": + if "parquet" in value: # table is stored as parquet + return + else: # table is not stored as parquet + errors.log_and_throw( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=self._log, + what_happened="Data loading has failed.", # FIXME: this is too specific + why_it_happened=( + f"You are trying to load data into a table {table_name} with an unsupported format. " + f"Currently only Parquet table format is supported." + ), + consequences="Data load will be aborted.", # FIXME: this is too specific + countermeasures=( + "Make sure that the destination table is stored as parquet: " + "https://www.cloudera.com/documentation/enterprise/5-11-x/topics/impala_parquet.html" + "#parquet_ddl" + ), + ) + # TODO once there is more robust loading implemented the below error can be removed. We can try to load even if + # we cannot determine the table storage type + errors.log_and_throw( + to_be_fixed_by=errors.ResolvableBy.PLATFORM_ERROR, + log=self._log, + what_happened="Cannot determine the target table file format, which is needed to load data into it.", + why_it_happened="There's a bug in VDK code.", + consequences="Application will exit.", + countermeasures="Report this bug to Super Collider team.", + ) + + def generate_parquet_schema_from_table_schema(self, table_columns): + """ + Builds the parquet schema based on the column types and order in the target table, in order to ensure the new file + will be compatible with the table + """ + self._log.debug("Generating parquet file schema from table schema.") + impala_type_to_pyarrow_type_map = { + "string": pyarrow.string(), + "boolean": pyarrow.bool_(), + "double": pyarrow.float64(), + "float": pyarrow.float32(), + "int": pyarrow.int32(), + "bigint": pyarrow.int64(), + "timestamp": pyarrow.timestamp("ns"), + } + # including the decimal types in the map + for precision_value in range(1, 39): + for 
scale_value in range(0, precision_value + 1): + impala_type_to_pyarrow_type_map[ + f"decimal({precision_value},{scale_value})" + ] = pyarrow.decimal128(precision_value, scale_value) + + parquet_schema = [] + for column_name, column_type in table_columns.items(): + parquet_schema.append( + (column_name, impala_type_to_pyarrow_type_map[column_type]) + ) + return pyarrow.schema(parquet_schema) + + def get_parquet_schema(self, table): + table_description = self.get_table_description(table) + self.ensure_table_format_is_parquet(table, table_description) + table_columns = self.get_table_columns(table_description) + return self.generate_parquet_schema_from_table_schema(table_columns) + + @staticmethod + def get_insert_sql_partition_clause(partitions): + # https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/impala_insert.html + + # NOTE: https://github.com/kayak/pypika looks interesting if we start having more complex query buildings + sql = "PARTITION (" + ",".join("`" + p + "`" for p in partitions.keys()) + ")" + return sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py index b692ab568e..4d45d7b14c 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py @@ -1,6 +1,8 @@ # Copyright 2021 VMware, Inc. 
# SPDX-License-Identifier: Apache-2.0 import logging +import os +import pathlib from typing import List import click @@ -71,6 +73,22 @@ def initialize_job(self, context: JobContext) -> None: lambda: _connection_by_configuration(self._impala_cfg), ) + context.templates.add_template( + "load/dimension/scd1", pathlib.Path(get_job_path("load/dimension/scd1")) + ) + + context.templates.add_template( + "load/dimension/scd2", pathlib.Path(get_job_path("load/dimension/scd2")) + ) + + context.templates.add_template( + "load/fact/snapshot", pathlib.Path(get_job_path("load/fact/snapshot")) + ) + + context.templates.add_template( + "load/versioned", pathlib.Path(get_job_path("load/versioned")) + ) + @staticmethod @hookimpl(hookwrapper=True, tryfirst=True) def run_step(context: JobContext, step: Step) -> None: @@ -116,3 +134,27 @@ def db_connection_decorate_operation(self, decoration_cursor: DecorationCursor): @hookimpl def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List): plugin_registry.load_plugin_with_hooks_impl(ImpalaPlugin(), "impala-plugin") + + +def db_connection_recover_operation(recovery_cursor: RecoveryCursor) -> None: + impala_error_handler = ImpalaErrorHandler() + + if impala_error_handler.handle_error( + recovery_cursor.get_exception(), recovery_cursor + ): + logging.getLogger(__name__).info( + "Error handled successfully! Query execution has succeeded." 
+ ) + else: + raise recovery_cursor.get_exception() + + +def get_jobs_parent_directory() -> pathlib.Path: + current_dir = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) + jobs_dir = current_dir.joinpath("templates") + return jobs_dir + + +def get_job_path(job_name: str) -> str: + """Get the path of the test data job returned as string so it can be passed easier as cmd line args""" + return str(get_jobs_parent_directory().joinpath(job_name)) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md new file mode 100644 index 0000000000..8754d37def --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md @@ -0,0 +1,3 @@ +### Types of data loading templates +- Slowly Changing Dimension Type 1 - [see details and usage](https://gitlab.eng.vmware.com/product-analytics/data-pipelines/vdk/tree/master/src/vacloud/vdk/templates/load/dimension/scd1/README.md) +- Snapshot Accumulating Fact Table - [see details and usage](https://gitlab.eng.vmware.com/product-analytics/data-pipelines/vdk/tree/master/src/vacloud/vdk/templates/load/fact/snapshot/README.md) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py new file mode 100644 index 0000000000..ab27010464 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py @@ -0,0 +1,20 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from pydantic import error_wrappers +from pydantic import ValidationError + + +class TemplateParametersError(Exception): + __slots__ = "cause", "template_name" + + def __init__(self, cause: ValidationError, template_name: str) -> None: + self.cause = cause + self.template_name = template_name + + def __str__(self) -> str: + validation_errors = self.cause.errors() + no_errors = len(validation_errors) + return ( + f'{no_errors} validation error{"" if no_errors == 1 else "s"} for {self.template_name} template\n' + f"{error_wrappers.display_errors(validation_errors)}" + ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py new file mode 100644 index 0000000000..ce0bad25bd --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py @@ -0,0 +1,35 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_executor import TemplateExecutor + + +class SlowlyChangingDimensionTypeOverwriteParams(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + + +class SlowlyChangingDimensionTypeOverwrite(TemplateExecutor): + TemplateParams = SlowlyChangingDimensionTypeOverwriteParams + + def __init__(self) -> None: + super().__init__( + template_name="scd1", + sql_files=[ + "00-test-if-view-matches-target.sql", + "01-insert-into-target.sql", + "02-refresh.sql", + "03-compute-stats.sql", + ], + sql_files_platform_is_responsible=[ + "02-refresh.sql", + "03-compute-stats.sql", + ], + ) + + +def run(job_input: IJobInput): + SlowlyChangingDimensionTypeOverwrite().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql new file mode 100644 index 0000000000..a0cb6c08ba --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql @@ -0,0 +1,3 @@ +SELECT * FROM {source_schema}.{source_view} LIMIT 0 +UNION ALL +SELECT * FROM {target_schema}.{target_table} LIMIT 0; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql new file mode 100644 index 0000000000..bf328c471c --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql @@ -0,0 +1,9 @@ +/* TO DO DROP AND RECREATE TARGET TABLE ON FULL RELOAD OR DATA TYPE CHANGE */ + +-- DROP TABLE 
{target_schema}.{target_table}; +-- CREATE TABLE {target_schema}.{target_table} STORED AS PARQUET AS SELECT * FROM {target_schema}.{target_table}; + +-- /* +SHUFFLE */ below is a query hint to Impala. Do not remove! +-- See https://www.cloudera.com/documentation/enterprise/5-9-x/topics/impala_hints.html for details. +INSERT OVERWRITE TABLE {target_schema}.{target_table} {_vdk_template_insert_partition_clause} /* +SHUFFLE */ +SELECT * FROM {source_schema}.{source_view}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql new file mode 100644 index 0000000000..ee3e889787 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql @@ -0,0 +1,2 @@ +-- make sure metadata about the new blocks in the {target_table} is propagated to the other impalad daemons +REFRESH {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql new file mode 100644 index 0000000000..00d3e0ef15 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql @@ -0,0 +1 @@ +COMPUTE STATS {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/README.md new file mode 100644 index 0000000000..39d5319308 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/README.md @@ -0,0 +1,43 @@ +### Purpose: + +This template can be used to load raw data from the Data Lake to target 'Slowly Changing Dimension Type 1' table in 
the Data Warehouse. +In summary, it overwrites the target table with the source data. + +### Details: + + +### Template Name (template_name): + +- "load/dimension/scd1" + +### Template Parameters (template_args): + +- target_schema - SC Data Warehouse schema, where target data is loaded +- target_table - SC Data Warehouse table of DW type 'Slowly Changing Dimension Type 1', where target data is loaded +- source_schema - SC Data Lake schema, where source raw data is loaded from +- source_view - SC Data Lake view, where source raw data is loaded from + +### Prerequisites: + +In order to use this template you need to ensure the following: +- {source_schema}.{source_view} exists +- {target_schema}.{target_table} exists +- {source_schema}.{source_view} has the exact same schema as {target_schema}.{target_table} + +### Sample Usage: + +Say there is SDDC-related 'Slowly Changing Dimension Type 1' target table called 'dim_sddc' in 'history' schema. +Updating it with the latest raw data from the Data Lake (from source view called 'vw_dim_sddc' in 'default' schema) is done in the following manner: + +```python +def run(job_input): + # . . . + template_args = { + 'source_schema': 'default', + 'source_view': 'vw_dim_sddc', + 'target_schema': 'history', + 'target_table': 'dim_sddc', + } + job_input.execute_template("load/dimension/scd1", template_args) + # . . . +``` diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py new file mode 100644 index 0000000000..aaadf5ac24 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py @@ -0,0 +1,40 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_executor import TemplateExecutor + + +class SlowlyChangingDimensionType2Params(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + start_time_column: str + end_time_column: str + end_time_default_value: str + surrogate_key_column: str + id_column: str + + +class SlowlyChangingDimensionType2(TemplateExecutor): + TemplateParams = SlowlyChangingDimensionType2Params + + def __init__(self) -> None: + super().__init__( + template_name="load/dimension/scd2", + sql_files=[ + "00-test-if-view-matches-target.sql", + "01-insert-into-target.sql", + "02-refresh.sql", + "03-compute-stats.sql", + ], + sql_files_platform_is_responsible=[ + "02-refresh.sql", + "03-compute-stats.sql", + ], + ) + + +def run(job_input: IJobInput): + SlowlyChangingDimensionType2().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql new file mode 100644 index 0000000000..7a565278b0 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql @@ -0,0 +1,3 @@ +SELECT uuid(), * FROM {source_schema}.{source_view} LIMIT 0 +UNION 
ALL +SELECT * FROM {target_schema}.{target_table} LIMIT 0; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql new file mode 100644 index 0000000000..024abc5c28 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql @@ -0,0 +1,40 @@ +/* TO DO DROP AND RECREATE TARGET TABLE ON FULL RELOAD OR DATA TYPE CHANGE */ + +-- DROP TABLE {target_schema}.{target_table}; +-- CREATE TABLE {target_schema}.{target_table} STORED AS PARQUET AS SELECT * FROM {target_schema}.stg_{target_table}; + +-- /* +SHUFFLE */ below is a query hint to Impala. +-- Do not remove! https://www.cloudera.com/documentation/enterprise/5-9-x/topics/impala_hints.html +INSERT OVERWRITE TABLE {target_schema}.{target_table} {_vdk_template_insert_partition_clause} /* +SHUFFLE */ +WITH + -- filter from the target all elements that define non-current state and are not updated/present in the source + tgt_filtered AS ( + SELECT * + FROM {target_schema}.{target_table} + WHERE {end_time_column} != '{end_time_default_value}' + AND CONCAT(CAST({id_column} AS STRING), CAST({start_time_column} AS STRING)) NOT IN ( + SELECT CONCAT(CAST({id_column} AS STRING), CAST({start_time_column} AS STRING)) + FROM {source_schema}.{source_view} + ) + ), + -- filter from the source all elements which are present in tgt_filtered + src_filtered AS ( + SELECT * + FROM {source_schema}.{source_view} + WHERE CONCAT(CAST({id_column} AS STRING), CAST({start_time_column} AS STRING)) NOT IN ( + SELECT CONCAT(CAST({id_column} AS STRING), CAST({start_time_column} AS STRING)) + FROM tgt_filtered + ) + ) +( + SELECT * + FROM tgt_filtered +) +UNION ALL +( + SELECT COALESCE(tgt.{surrogate_key_column}, uuid()), src.* + FROM src_filtered AS src + LEFT JOIN {target_schema}.{target_table} AS tgt + ON 
src.{id_column} = tgt.{id_column} + AND src.{start_time_column} = tgt.{start_time_column} +) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql new file mode 100644 index 0000000000..ee3e889787 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql @@ -0,0 +1,2 @@ +-- make sure metadata about the new blocks in the {target_table} is propagated to the other impalad daemons +REFRESH {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql new file mode 100644 index 0000000000..00d3e0ef15 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql @@ -0,0 +1 @@ +COMPUTE STATS {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md new file mode 100644 index 0000000000..12500f4f47 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md @@ -0,0 +1,55 @@ +### Purpose: + +Template used to load raw data from Super Collider Data Lake to target 'Slowly Changing Dimension Type 2' table in Super Collider Data Warehouse.
+ +### Details: + +Explanation of SCD type 2 can be seen here: + +### Template Name (template_name): + +- "load/dimension/scd2" + +### Template Parameters (template_args): + +- target_schema - SC Data Warehouse schema, where target data is loaded +- target_table - SC Data Warehouse table of DW type 'Slowly Changing Dimension Type 2', where target data is loaded +- source_schema - SC Data Lake schema, where source raw data is loaded from +- source_view - SC Data Lake view, where source raw data is loaded from +- start_time_column - Column that holds the start time of the period for which a record is valid +- end_time_column - Column that holds the end time of the period for which a record is valid +- end_time_default_value - Default value for end time column used to indicate whether this is the current state of the record, e.g. '2999-01-01T00:00:00Z' +- surrogate_key_column - Column that holds unique id permanently bound to the time period defined by that row of the slowly changing dimension table. Useful for efficient joins with other fact tables. +- id_column - Column that holds the natural key of the target table. + +### Prerequisites: + +In order to use this template you need to ensure the following: +- {source_schema}.{source_view} exists +- {target_schema}.{target_table} exists +- The schema of {target_schema}.{target_table} must begin with a string column (used to hold the surrogate key) followed by all columns of {source_schema}.{source_view}. +- {source_schema}.{source_view} must contain all columns specified in the Parameters section. +- In {source_schema}.{source_view}, for records which represent current state their end_time value must be the same as the value provided as end_time_default_value + +### Sample Usage: + +Say there is SDDC-related 'Slowly Changing Dimension Type 2' target table called 'dim_sddc_h' in 'history' schema. 
+Updating end date of existing current records representing current state and adding new state records (from source view called 'vw_dim_sddc_h' in 'default' schema) is done in the following manner: + +```python +def run(job_input): + # . . . + template_args = { + 'source_schema': 'default', + 'source_view': 'vw_dim_sddc_h', + 'target_schema': 'history', + 'target_table': 'dim_sddc_h', + 'start_time_column': '', + 'end_time_default_value': '2999-01-01T00:00:00Z', + 'end_time_column': '', + 'surrogate_key_column': '', + 'id_column': '' + } + job_input.execute_template('load/dimension/scd2', template_args) + # . . . +``` diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py new file mode 100644 index 0000000000..f9552c07b6 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py @@ -0,0 +1,36 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_executor import TemplateExecutor + + +class FactDailySnapshotParams(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + last_arrival_ts: str + + +class FactDailySnapshot(TemplateExecutor): + TemplateParams = FactDailySnapshotParams + + def __init__(self) -> None: + super().__init__( + template_name="load/fact/snapshot", + sql_files=[ + "00-test-if-view-matches-target.sql", + "01-insert-into-target.sql", + "02-refresh.sql", + "03-compute-stats.sql", + ], + sql_files_platform_is_responsible=[ + "02-refresh.sql", + "03-compute-stats.sql", + ], + ) + + +def run(job_input: IJobInput): + FactDailySnapshot().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql new file mode 100644 index 0000000000..a0cb6c08ba --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql @@ -0,0 +1,3 @@ +SELECT * FROM {source_schema}.{source_view} LIMIT 0 +UNION ALL +SELECT * FROM {target_schema}.{target_table} LIMIT 0; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql new file mode 100644 index 0000000000..9e4a727bc4 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql @@ -0,0 +1,13 @@ +-- /* +SHUFFLE */ below is a query hint to Impala. +-- Do not remove! 
https://www.cloudera.com/documentation/enterprise/5-9-x/topics/impala_hints.html +INSERT OVERWRITE TABLE {target_schema}.{target_table} {_vdk_template_insert_partition_clause} /* +SHUFFLE */ +( + SELECT * + FROM {target_schema}.{target_table} + WHERE {last_arrival_ts} < (SELECT MIN({last_arrival_ts}) FROM {source_schema}.{source_view}) +) +UNION ALL +( + SELECT * + FROM {source_schema}.{source_view} +) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql new file mode 100644 index 0000000000..62002e767f --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql @@ -0,0 +1 @@ +REFRESH {target_schema}.{target_table} diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql new file mode 100644 index 0000000000..ab96a798fc --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql @@ -0,0 +1 @@ +COMPUTE STATS {target_schema}.{target_table} diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md new file mode 100644 index 0000000000..bb57f400cb --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md @@ -0,0 +1,48 @@ +### Purpose: + +This template can be used to load raw data from SuperCollider Data Lake to target 'Snapshot Periodic Fact Table' in Data Warehouse. 
+In summary, it appends a snapshot of records observed between time t1 and t2 from the source table to the target table, +truncating all present target table records observed after t1. + +### Details: + + + +### Template Name (template_name): + +- "load/fact/snapshot" + +### Template Parameters (template_args): + +- target_schema - SC Data Warehouse schema, where target data is loaded +- target_table - SC Data Warehouse table of DW type 'Snapshot Periodic Fact Table', where target data is loaded +- source_schema - SC Data Lake schema, where source raw data is loaded from +- source_view - SC Data Lake view, where source raw data is loaded from +- last_arrival_ts - Timestamp column, on which increments to target_table are done + +### Prerequisites: + +In order to use this template you need to ensure the following: +- {source_schema}.{source_view} exists +- {target_schema}.{target_table} exists +- {source_schema}.{source_view} has the exact same schema as {target_schema}.{target_table} +- {last_arrival_ts} is timestamp column suitable for 'Snapshot Periodic Fact Table' increments + +### Sample Usage: + +Say there is SDDC-related 'Snapshot Periodic Fact Table' called 'fact_sddc_daily' in 'history' schema. +Updating it with the latest raw data from the Super Collider Data Lake (from source view called 'vw_fact_sddc_daily' in 'default' schema) is done in the following manner: + +```python +def run(job_input): + # . . . + template_args = { + 'source_schema': 'default', + 'source_view': 'vw_fact_sddc_daily', + 'target_schema': 'history', + 'target_table': 'fact_sddc_daily', + 'last_arrival_ts': 'updated_at', + } + job_input.execute_template('load/fact/snapshot', template_args) + # . . . 
+``` diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/params.json b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/params.json new file mode 100644 index 0000000000..59c534dde7 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/params.json @@ -0,0 +1,7 @@ +[ + "target_schema", + "target_table", + "source_schema", + "source_view", + "last_arrival_ts" +] diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py new file mode 100644 index 0000000000..585a581d46 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py @@ -0,0 +1,74 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from typing import List + +from pydantic import BaseModel +from pydantic import validator +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_executor import TemplateExecutor + + +class LoadVersionedParams(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + id_column: str + value_columns: List[str] + tracked_columns: List[str] + updated_at_column: str = "updated_at" + sk_column: str = "sk" + active_from_column: str = "active_from" + active_to_column: str = "active_to" + active_to_max_value: str = "9999-12-31" + + @validator("tracked_columns") + def tracked_columns_subset_of_value_columns(cls, tracked_columns, values, **kwargs): + value_columns = values.get("value_columns")  # missing from `values` when value_columns itself failed validation + if isinstance(value_columns, list) and not tracked_columns:  # only enforce when value_columns is usable + raise ValueError("The list must contain at least one column") + if isinstance(value_columns, list) and not set( + tracked_columns + ) <= set(value_columns): + raise ValueError( + "All elements in the list must be also present in `value_columns`" + ) + return tracked_columns + + +class LoadVersioned(TemplateExecutor): + TemplateParams = LoadVersionedParams + + def __init__(self) -> None: + super().__init__( + template_name="load/versioned", + sql_files=[ + "00-test-if-view-matches-target.sql", + "01-insert-into-target.sql", + "02-refresh.sql", + "03-compute-stats.sql", + ], + sql_files_platform_is_responsible=[ + "02-refresh.sql", + "03-compute-stats.sql", + ], + ) + + def _validate_args(self, args: dict) -> dict: + args = super()._validate_args(args) + return dict( + **args, + value_columns_str=", ".join( + [f"`{column}`" for column in args["value_columns"]] + ), + hash_expr_str=",\n".join( + [ + f" COALESCE(CAST(`{column}` AS STRING), '#')" + for column in args["tracked_columns"] + ] + ).lstrip(), + ) + + +def run(job_input: IJobInput): + LoadVersioned().start(job_input, job_input.get_arguments()) diff --git 
a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql new file mode 100644 index 0000000000..8ad6fc66d7 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql @@ -0,0 +1,19 @@ +( + SELECT + NULL as `{sk_column}`, + NULL as `{active_from_column}`, + NULL as `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{source_schema}`.`{source_view}` + LIMIT 0 +) +UNION ALL +( + SELECT + * + FROM + `{target_schema}`.`{target_table}` + LIMIT 0 +); diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql new file mode 100644 index 0000000000..80324a7403 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql @@ -0,0 +1,117 @@ +/* TO DO DROP AND RECREATE TARGET TABLE ON FULL RELOAD OR DATA TYPE CHANGE */ + +-- DROP TABLE {target_schema}.{target_table}; +-- CREATE TABLE {target_schema}.{target_table} STORED AS PARQUET AS SELECT * FROM {target_schema}.stg_{target_table}; + +-- /* +SHUFFLE */ below is a query hint to Impala. +-- Do not remove! https://www.cloudera.com/documentation/enterprise/5-9-x/topics/impala_hints.html +INSERT OVERWRITE TABLE {target_schema}.{target_table} {_vdk_template_insert_partition_clause} /* +SHUFFLE */ +WITH + -- Compute the union of the source and target tables and augment each side with the following columns: + -- `_values_hash`: + -- A hash of the tracked values. + -- `_lineage`: + -- Tag the record source (either "source" or "target"). 
+ `{target_table}_union` AS ( + ( + SELECT + 'source' AS `_lineage`, + -- FIXME: change from '#' to NULL results in collision + FNV_HASH( + CONCAT_WS('|', + {hash_expr_str} + ) + ) AS `_values_hash`, + UUID() as `{sk_column}`, + `{updated_at_column}` AS `{active_from_column}`, + CAST("9999-12-31" AS TIMESTAMP) AS `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{source_schema}`.`{source_view}` + ) + UNION ALL + ( + SELECT + 'target' AS `_lineage`, + -- FIXME: change from '#' to NULL results in collision + FNV_HASH( + CONCAT_WS('|', + {hash_expr_str} + ) + ) AS `_values_hash`, + `{sk_column}`, + `{active_from_column}`, + `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{target_schema}`.`{target_table}` + ) + ), + -- Extend `{target_table}_union` with the following expressions: + -- `_is_overridden`: + -- True if and only if the record is overridden by a following record in a partition of records that share the + -- same primary key and start time, where "target" records are listed before "source". + -- `{sk_column}`, `{active_to_column}`: + -- The first value of that column from the partition of records sharing the same primary key and start time, + -- where "target" records are listed before "source". + -- The result set is ready for elimination of overridden records. 
+ `{target_table}_union_extended_v1` AS ( + SELECT + `_lineage`, + `_values_hash`, + LEAD(TRUE, 1, FALSE) OVER( + PARTITION BY `{id_column}`, `{active_from_column}` + ORDER BY `_lineage` DESC + ) AS `_is_overridden`, + FIRST_VALUE(`{sk_column}`) OVER( + PARTITION BY `{id_column}`, `{active_from_column}` + ORDER BY `_lineage` DESC + ) AS `{sk_column}`, + `{active_from_column}`, + FIRST_VALUE(`{active_to_column}`) OVER( + PARTITION BY `{id_column}`, `{active_from_column}` + ORDER BY `_lineage` DESC + ) AS `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{target_table}_union` + ), + -- Exclude overridden records from `{target_table}_union_extended_v1` and + -- extend the result with the following expressions: + -- `_merge_with_previous`: + -- A boolean flag indicating whether the record will be merged with the previous record within a partition of + -- records that share the same primary key and values hash and are ordered by their "active from" timestamp. + -- The result set is ready for merging of adjacent records with the same values hash. + `{target_table}_union_extended_v2` AS ( + SELECT + LAG(`_values_hash`, 1, 0) OVER( + PARTITION BY `{id_column}` + ORDER BY `{active_from_column}` + ) = `_values_hash` AS `_merge_with_previous`, + `{sk_column}`, + `{active_from_column}`, + `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{target_table}_union_extended_v1` + WHERE + `_is_overridden` = FALSE + ) + -- Exclude records that are merged with the preceding record and fix the "active to" timestamp. 
+ SELECT + `{sk_column}`, + `{active_from_column}`, + LEAD(`{active_from_column}`, 1, '9999-12-31') OVER( + PARTITION BY `{id_column}` + ORDER BY `{active_from_column}` + ) AS `{active_to_column}`, + `{id_column}`, + {value_columns_str} + FROM + `{target_table}_union_extended_v2` + WHERE + `_merge_with_previous` = FALSE; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql new file mode 100644 index 0000000000..ee3e889787 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql @@ -0,0 +1,2 @@ +-- make sure metadata about the new blocks in the {target_table} is propagated to the other impalad daemons +REFRESH {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql new file mode 100644 index 0000000000..00d3e0ef15 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql @@ -0,0 +1 @@ +COMPUTE STATS {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md new file mode 100644 index 0000000000..aa088bfecb --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md @@ -0,0 +1,56 @@ +### Purpose: + +Template used to load raw data from Super Collider Data Lake to target 'Slowly Changing Dimension Type 2' table in Super Collider Data Warehouse. +In summary, it accumulates updates from the data source as versioned records in the target table.
+ +### Details: + +Explanation of SCD type 2 can be seen here: + +### Template Name (template_name): + +- "load/versioned" + +### Template Parameters (template_args): + +- target_schema - Target schema where the versioned data is stored. Typically, a Data Warehouse (DW) schema. +- target_table - Target table where the versioned data is loaded. Typically, a Slowly Changing Dimension (SCD) of Type 2. +- source_schema - SC Data Lake schema containing the source view. +- source_view - SC Data Lake view where source data is loaded from. +- id_column - Column that holds the natural key of the target table. +- value_columns - A list of columns from the source that are considered values. Present both in the source and the target tables. +- tracked_columns - A sublist of the value columns that are tracked for changes. Present both in the source and the target tables. +- updated_at_column - A column containing the update time of a record. Present in the source table. Optional (default value is "updated_at"). +- sk_column - A surrogate key column that is automatically generated in the target table. Optional (default value is "sk"). +- active_from_column - A column denoting the start time of a record in the target table. Optional (default value is "active_from"). +- active_to_column - A column denoting the end time of a record in the target table. Equals `active_to_max_value` if the record is not closed. Optional (default value is "active_to"). +- active_to_max_value - A value denoting an open record in the target table. Optional (default value is "9999-12-31"). + +### Prerequisites: + +In order to use this template you need to ensure the following: + +- `{source_schema}`.`{source_view}` exists and consists of the `id_column`, the `value_columns`, and the `updated_at_column`. +- `{target_schema}`.`{target_table}` exists and consists of the following columns (in this order): `{sk_column}`, `{active_from_column}`, `{active_to_column}`, `{id_column}`, and `{value_columns}`.
+ +### Sample Usage: + +Say there is SDDC-related 'Slowly Changing Dimension Type 2' target table called 'dim_sddc_h' in 'history' schema. + +Updating end date of existing current records representing current state and adding new state records (from source view called 'vw_dim_sddc_h' in 'default' schema) is done in the following manner: + +```python +def run(job_input): + # . . . + template_args = { + 'source_schema': 'default', + 'source_view': 'vw_dim_sddc_h', + 'target_schema': 'history', + 'target_table': 'dim_sddc_h', + 'id_column': 'sddc_id', + 'value_columns': ['hosts', 'state', 'is_nsxt', 'cloud_vendor', 'version'], + 'tracked_columns': ['hosts', 'state', 'is_nsxt', 'cloud_vendor', 'version'], + } + job_input.execute_template('load/versioned', template_args) + # . . . +``` diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py new file mode 100644 index 0000000000..28d3a5ea86 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py @@ -0,0 +1,80 @@ +# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0 +import logging +import os +import time + +from vdk.api.job_input import IJobInput +from vdk.api.job_input import ITemplate +from vdk.internal.builtin_plugins.run.execution_results import ExecutionResult + +# from vacloud.vdk.connection.impl.builder import ManagedConnectionBuilder + +log = logging.getLogger(__name__) + + +class Template(ITemplate): + def __init__( + self, + git_hash: str, + opid: str, + managed_connection_builder, + job_input_only_used_to_pass_to_python_scripts: IJobInput, + ): + + self.git_hash = git_hash + self.opid = opid + self.managed_connection_builder = managed_connection_builder + self.job_input_only_used_to_pass_to_python_scripts = ( + job_input_only_used_to_pass_to_python_scripts + ) + + @staticmethod + def get_folder_where_i_am() -> str: + my_path = os.path.realpath(__file__) + abspath = os.path.abspath(my_path) + folder = os.path.join(abspath, os.pardir) + return os.path.abspath(folder) + + @staticmethod + def get_templates_folder() -> str: + return os.path.join(Template.get_folder_where_i_am(), "templates") + + def execute_template( + self, template_name: str, template_args: dict + ) -> ExecutionResult: + log.debug(f"Execute template {template_name} {template_args}") + start_of_execution = time.time() + exception_message = None + import importlib + + try: + package_name = ( + "vacloud.vdk.templates." 
+ + template_name.replace("/", ".") + + ".definition" + ) + module = importlib.import_module(package_name) + load = getattr(module, "load") + load(self.job_input_only_used_to_pass_to_python_scripts, template_args) + except Exception as e: + exception_message = str(e) + raise + finally: + data = { + "@type": "pa__dp_template_usage", + "template_name": template_name, + "template_args": ",".join( + [str(v) for kv in template_args.items() for v in kv] + ), + "template_execution_time_seconds": round( + time.time() - start_of_execution + ), + "template_execution_status": "error" + if exception_message + else "success", + "exception_message": exception_message if exception_message else None, + } + log.info(data) + template_args_data = {"arg_" + k: v for k, v in template_args.items()} + data.update(template_args_data) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py new file mode 100644 index 0000000000..93034325e5 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py @@ -0,0 +1,80 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from logging import getLogger +from typing import cast +from typing import List +from typing import Type + +from pydantic import BaseModel +from pydantic import ValidationError +from vdk.api.job_input import IJobInput +from vdk.internal.builtin_plugins.run.job_input import JobInput +from vdk.internal.core import errors +from vdk.plugin.impala.impala_helper import ImpalaHelper +from vdk.plugin.impala.templates.errors import TemplateParametersError + +log = getLogger(__name__) + + +class TemplateExecutor: + TemplateParams: Type[BaseModel] + + def __init__( + self, + template_name: str, + sql_files: List[str], + sql_files_platform_is_responsible: List[str], + ) -> None: + self.template_name = ( + template_name # FIXME: could be inferred from the template path + ) + self.sql_files = sql_files + self.sql_files_platform_is_responsible = sql_files_platform_is_responsible # used to decide blamee for failure, defaults to user + + def start(self, job_input: IJobInput, args: dict) -> None: + # args = self._validate_args(args) + args["_vdk_template_insert_partition_clause"] = "" + + impala_helper = ImpalaHelper(cast(JobInput, job_input).get_managed_connection()) + table_name = "`{target_schema}`.`{target_table}`".format(**args) + table_description = impala_helper.get_table_description(table_name) + partitions = impala_helper.get_table_partitions(table_description) + if partitions: + args[ + "_vdk_template_insert_partition_clause" + ] = impala_helper.get_insert_sql_partition_clause(partitions) + + impala_helper.ensure_table_format_is_parquet(table_name, table_description) + + source_view_full_name = "`{source_schema}`.`{source_view}`".format(**args) + raw_source_view_has_results = job_input.execute_query( + """ + WITH limited_view AS (SELECT * FROM {} LIMIT 1) + SELECT COUNT(1) > 0 FROM limited_view + """.format( + source_view_full_name + ) + ) + source_view_has_results = raw_source_view_has_results[0][0] + if not source_view_has_results: 
+ log.info(f"Source view returns no results. Will NOT execute template!") + raise Exception( + "Source view returns no results. Will NOT execute template!" + ) + + def _validate_args(self, args: dict) -> dict: + try: + return self.TemplateParams(**args).dict() + except ValidationError as error: + wrapped_error = TemplateParametersError( + error, template_name=self.template_name + ) + errors.log_and_rethrow( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=log, + what_happened="Template execution in Data Job finished with error", + why_it_happened=errors.MSG_WHY_FROM_EXCEPTION(wrapped_error), + consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, + countermeasures=errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION, + exception=wrapped_error, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py new file mode 100644 index 0000000000..849163ff1c --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py @@ -0,0 +1,81 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +""" +Load example input data for an scd1 template test. +""" +from vdk.api.job_input import IJobInput + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. 
-- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) and overwrite the target table that represents the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+
+    # job_input.execute_query(u"""
+    # CREATE DATABASE IF NOT EXISTS `{target_schema}`
+    # """)
+
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `org_id` INT,
+            `org_name` STRING,
+            `org_type` STRING,
+            `company_name` STRING,
+            `sddc_limit` INT,
+            `org_host_limit` INT
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` VALUES (
+            (2, "johnlocke@vmware.com" , "CUSTOMER_POC" , "VMware" , 1, 6 ),
+            (3, "lilly.johnsonn@goofys.com", "CUSTOMER" , "Goofy's" , 2, 16),
+            (4, "jilliandoe@uncanny.ca" , "PARTNER_SISO" , "Uncanny Company" , 2, 16),
+            (5, "jane.doe@vmware.com" , "CUSTOMER" , "VMware" , 2, 32),
+            (6, "john.doe@pharmamed.com" , "CUSTOMER" , "PharmaMed" , 1, 32),
+            (7, "andrej.maya@acme.com" , "PARTNER_SISO" , "ACME" , 1, 32),
+            (8, "guang@vmware.com" , "INTERNAL_CORE" , "VMware" , 4, 32)
+        )
+        """
+    )
+
+    # Step 2: create (if missing) and overwrite the source relation - a plain table named `{source_view}` - that represents the next state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `org_id` INT,
+            `org_name` STRING,
+            `org_type` STRING,
+            `company_name` STRING,
+            `sddc_limit` INT,
+            `org_host_limit` INT
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES (
+            (1, "mullen@actual.com" , "CUSTOMER_MSP_TENANT", "actual Master Org", 2, 32),
+            (2, "johnlocke@vmware.com" , "CUSTOMER_POC" , "VMware" , 1, 6 ),
+            (3, "lilly.johnsonn@goofys.com", "CUSTOMER" , "Goofy's" , 2, 32),
+            (4, "jilliandoe@uncanny.ca" , "PARTNER_SISO" , "Uncanny Company" , 2, 32),
+            (5, "jane.doe@vmware.com" , "CUSTOMER" , "VMware" , 2, 32),
+            (6, "john.doe@pharmamed.com" , "CUSTOMER" , "PharmaMed" , 2, 32),
+            (7, "andrej.maya@acme.com" , "PARTNER_SISO" , "ACME" , 2, 32),
+            (8, "guang@vmware.com" , "INTERNAL_CORE" , "VMware" , 2, 32)
+        )
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py
new file mode 100644
index 0000000000..dd439171f1
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/dimension/scd1 template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/dimension/scd1",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py
new file mode 100644
index 0000000000..28a010253e
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py
@@ -0,0 +1,67 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Load example input data for an scd1 template test with a partitioned target table.
+"""
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) the partitioned target table and empty it so the test starts from a clean state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `org_id` INT,
+            `org_name` STRING,
+            `sddc_limit` INT,
+            `org_host_limit` INT
+        )
+        PARTITIONED BY (`org_type` STRING, `company_name` STRING)
+        STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        TRUNCATE `{target_schema}`.`{target_table}`
+        """
+    )
+    # Step 2: create (if missing) and overwrite the source relation; its last two columns match the target's partition columns
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `org_id` INT,
+            `org_name` STRING,
+            `sddc_limit` INT,
+            `org_host_limit` INT,
+            `org_type` STRING,
+            `company_name` STRING
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES (
+            (1, "mullen@actual.com" , 2, 32, "CUSTOMER_MSP_TENANT", "actual Master Org"),
+            (2, "johnlocke@vmware.com" , 1, 6 , "CUSTOMER_POC" , "VMware" ),
+            (3, "lilly.johnsonn@goofys.com", 2, 32, "CUSTOMER" , "Goofy" ),
+            (4, "jilliandoe@uncanny.ca" , 2, 32, "PARTNER_SISO" , "Uncanny Company" ),
+            (5, "jane.doe@vmware.com" , 2, 32, "CUSTOMER" , "VMware" ),
+            (6, "john.doe@pharmamed.com" , 2, 32, "CUSTOMER" , "PharmaMed" ),
+            (7, "andrej.maya@acme.com" , 2, 32, "PARTNER_SISO" , "ACME" ),
+            (8, "guang@vmware.com" , 2, 32, "INTERNAL_CORE" , "VMware" )
+        )
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py
new file mode 100644
index 0000000000..dd439171f1
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/dimension/scd1 template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/dimension/scd1",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py
new file mode 100644
index 0000000000..deab47d4d4
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py
@@ -0,0 +1,193 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Load example input data for an scd2 template test.
+
+The data is constructed working backwards from the current scd2 template definition as follows.
+
+We have a source relation `S` (usually a view) and a target relation `T`. The elements in these two relations are
+uniquely identified by the (`{id_column}`, `{start_time_column}`) composite key. An element is said to be "current in (a
+relation) R" if and only if its `{end_time_column}` equals the user-defined `{end_time_default_value}` (which usually is
+the largest possible timestamp).
+
+
+We partition the items in `T` according to the following predicates:
+
+- `C`: elements that are current in `T`,
+- `P`: elements that are present in `S`,
+- `M`: elements that are modified in `S`.
+
+It does not make sense to distinguish between modified and unmodified elements if these elements are not
+present in `S`. In other words, `M` does not further partition equivalence classes that contain `¬P`.
+
+```
+ C ∧ ¬P ∧ M = C ∧ ¬P ∧ ¬M = C ∧ ¬P
+¬C ∧ ¬P ∧ M = ¬C ∧ ¬P ∧ ¬M = ¬C ∧ ¬P
+```
+
+In total, this means that `T` is partitioned into the following six equivalence classes.
+
+```
+ C ∧ P ∧ M
+ C ∧ P ∧ ¬M
+ C ∧ ¬P
+¬C ∧ P ∧ M
+¬C ∧ P ∧ ¬M
+¬C ∧ ¬P
+```
+
+The user contract that the load.dimension.scd2 template defines is as follows:
+
+1. Non-current state that is not present in the source view (¬C ∧ ¬P) is retained in the target table.
+2. Non-current state that is present in the source view (¬C ∧ P) is overridden in the target table.
+3. Current state that is not present in the source view (C ∧ ¬P) is dropped from the target table. In other words,
+   all current state should be present in the source view in order to avoid data loss.
+
+The sample data loaded here defines entries for each non-empty class in the target relation. We load the expected
+result of the application in a third relation `R`. The data in `R` and in the updated `T` relations should be the same
+up to differences in the surrogate keys of the new items present in `S`.
+"""
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) and overwrite the target table `T` that represents the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `{surrogate_key_column}` STRING,
+            `{id_column}` SMALLINT,
+            `{start_time_column}` TIMESTAMP,
+            `{end_time_column}` TIMESTAMP,
+            gender CHAR(1),
+            name STRING
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{target_schema}`.`{target_table}`
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` VALUES (
+            ("p10", 1, "1400-01-01", "9999-12-31", CAST("m" AS CHAR(1)), "Alfred Hitchcock" ), -- C ∧ P ∧ ¬M
+
+            ("p20", 2, "1400-01-01", "2019-10-24", CAST("m" AS CHAR(1)), "ANDREI TARKOVSKY" ), -- ¬C ∧ ¬P
+            ("p21", 2, "2019-10-24", "9999-12-31", CAST("m" AS CHAR(1)), "Andrii Tarkowski" ), -- C ∧ P ∧ M
+
+            ("p30", 3, "1400-01-01", "9999-12-31", CAST("m" AS CHAR(1)), "Ingmar Bergman" ), -- C ∧ ¬P
+
+            ("p40", 4, "1400-01-01", "2009-01-01", CAST("m" AS CHAR(1)), "Laurence WACHOWSKI"), -- ¬C ∧ P ∧ M
+            ("p41", 4, "2009-01-01", "9999-12-31", CAST("m" AS CHAR(1)), "Lana Washowski" ), -- C ∧ P ∧ M
+
+            ("p50", 5, "1400-01-01", "2016-03-01", CAST("m" AS CHAR(1)), "Andrew Wachowski" ), -- ¬C ∧ P ∧ ¬M
+            ("p51", 5, "2016-03-01", "9999-12-31", CAST("f" AS CHAR(1)), "Andrew Wachowski" ) -- C ∧ P ∧ ¬M
+        )
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{target_schema}`.`{target_table}`
+        """
+    )
+
+    # Step 2: create (if missing) and overwrite the source relation `S` (a plain table here) that holds the delta to be applied
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `{id_column}` SMALLINT,
+            `{start_time_column}` TIMESTAMP,
+            `{end_time_column}` TIMESTAMP,
+            gender CHAR(1),
+            name STRING
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{source_schema}`.`{source_view}`
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES (
+            (1, "1400-01-01", "9999-12-31", CAST("m" AS CHAR(1)), "Alfred Hitchcock" ), -- p10: unmodified
+
+            (2, "2019-10-24", "9999-12-31", CAST("m" AS CHAR(1)), "Andrei Tarkovsky" ), -- p21: fix typos in name
+
+            (4, "1400-01-01", "2009-01-01", CAST("m" AS CHAR(1)), "Laurence Wachowski"), -- p40: fix case in name
+            (4, "2009-01-01", "2018-12-31", CAST("f" AS CHAR(1)), "Lana Washowski" ), -- p41: fix gender (closing)
+            (4, "2018-12-31", "9999-12-31", CAST("f" AS CHAR(1)), "Lana Wachowski" ), -- p42: change name (new)
+
+            (5, "1400-01-01", "2016-03-01", CAST("m" AS CHAR(1)), "Andrew Wachowski" ), -- p50: unmodified
+            (5, "2016-03-01", "2018-12-31", CAST("f" AS CHAR(1)), "Andrew Wachowski" ), -- p51: unmodified (closing)
+            (5, "2018-12-31", "9999-12-31", CAST("f" AS CHAR(1)), "Lilly Wachowski" ) -- p52: change name (new)
+        )
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{source_schema}`.`{source_view}`
+        """
+    )
+
+    # Step 3: create (if missing) and overwrite the table `R` holding the state expected after applying the delta to the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` (
+            `{surrogate_key_column}` STRING,
+            `{id_column}` SMALLINT,
+            `{start_time_column}` TIMESTAMP,
+            `{end_time_column}` TIMESTAMP,
+            gender CHAR(1),
+            name STRING
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{expect_schema}`.`{expect_table}`
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES (
+            ("p10", 1, "1400-01-01", "9999-12-31", CAST("m" AS CHAR(1)), "Alfred Hitchcock" ), -- C ∧ P ∧ ¬M
+
+            ("p20", 2, "1400-01-01", "2019-10-24", CAST("m" AS CHAR(1)), "ANDREI TARKOVSKY" ), -- ¬C ∧ ¬P
+            ("p21", 2, "2019-10-24", "9999-12-31", CAST("m" AS CHAR(1)), "Andrei Tarkovsky" ), -- C ∧ P ∧ M
+
+            ("p40", 4, "1400-01-01", "2009-01-01", CAST("m" AS CHAR(1)), "Laurence Wachowski"), -- ¬C ∧ P ∧ M
+            ("p41", 4, "2009-01-01", "2018-12-31", CAST("f" AS CHAR(1)), "Lana Washowski" ), -- C ∧ P ∧ M
+            ("p42", 4, "2018-12-31", "9999-12-31", CAST("f" AS CHAR(1)), "Lana Wachowski" ), -- C ∧ P ∧ M (new)
+
+            ("p50", 5, "1400-01-01", "2016-03-01", CAST("m" AS CHAR(1)), "Andrew Wachowski" ), -- ¬C ∧ P ∧ ¬M
+            ("p51", 5, "2016-03-01", "2018-12-31", CAST("f" AS CHAR(1)), "Andrew Wachowski" ), -- C ∧ P ∧ ¬M
+            ("p52", 5, "2018-12-31", "9999-12-31", CAST("f" AS CHAR(1)), "Lilly Wachowski" ) -- C ∧ P ∧ ¬M (new)
+        )
+        """
+    )
+    job_input.execute_query(
+        """
+        REFRESH `{expect_schema}`.`{expect_table}`
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py
new file mode 100644
index 0000000000..a91e341f27
--- /dev/null
+++ 
b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/dimension/scd2 template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/dimension/scd2",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py
new file mode 100644
index 0000000000..d562001bee
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py
@@ -0,0 +1,128 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) and overwrite the target table that represents the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` VALUES (
+            -- 2019-11-18
+            ("sddc01-r01", 1, "2019-11-18", 5 , 1, "2019-11-18 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-18", 12, 3, "2019-11-18 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            -- 2019-11-19
+            ("sddc01-r01", 1, "2019-11-19", 5 , 1, "2019-11-19 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-19", 4 , 1, "2019-11-19 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-19", 13, 3, "2019-11-19 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-19", 3 , 1, "2019-11-19 09:00:00"),
+            ("sddc05-r02", 5, "2019-11-19", 20, 4, "2019-11-19 09:00:00")
+        )
+        """
+    )
+
+    # Step 2: create (if missing) and overwrite the source relation (a plain table here) that represents the next snapshot
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES (
+            -- 2019-11-18
+            ("sddc05-r01", 5, "2019-11-18", 18, 4, "2019-11-18 09:30:00"), -- late arrival
+            -- 2019-11-19 (duplicated)
+            ("sddc01-r01", 1, "2019-11-19", 5 , 1, "2019-11-19 09:00:00"), -- duplicated
+            ("sddc02-r01", 2, "2019-11-19", 4 , 1, "2019-11-19 09:00:00"), -- duplicated
+            ("sddc03-r01", 3, "2019-11-19", 13, 3, "2019-11-19 09:00:00"), -- duplicated
+            ("sddc04-r01", 4, "2019-11-19", 3 , 1, "2019-11-19 09:00:00"), -- duplicated
+            ("sddc05-r02", 5, "2019-11-19", 20, 5, "2019-11-19 09:00:00"), -- changed
+            -- 2019-11-20
+            ("sddc01-r01", 1, "2019-11-20", 10, 2, "2019-11-20 09:00:00"), -- new
+            ("sddc02-r02", 2, "2019-11-20", 7 , 1, "2019-11-20 09:00:00"), -- new
+            ("sddc03-r01", 3, "2019-11-20", 13, 3, "2019-11-20 09:00:00"), -- new
+            ("sddc04-r01", 4, "2019-11-20", 3 , 1, "2019-11-20 09:00:00"), -- new
+            ("sddc05-r04", 5, "2019-11-20", 3 , 1, "2019-11-20 09:00:00"), -- new
+            ("sddc06-r01", 1, "2019-11-20", 3 , 1, "2019-11-20 09:00:00") -- new
+        )
+        """
+    )
+
+    # Step 3: create (if missing) and overwrite the table holding the state expected after applying the next snapshot to the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES (
+            -- 2019-11-18
+            ("sddc01-r01", 1, "2019-11-18", 5 , 1, "2019-11-18 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-18", 12, 3, "2019-11-18 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            ("sddc05-r01", 5, "2019-11-18", 18, 4, "2019-11-18 09:30:00"),
+            -- 2019-11-19 (duplicated)
+            ("sddc01-r01", 1, "2019-11-19", 5 , 1, "2019-11-19 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-19", 4 , 1, "2019-11-19 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-19", 13, 3, "2019-11-19 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-19", 3 , 1, "2019-11-19 09:00:00"),
+            ("sddc05-r02", 5, "2019-11-19", 20, 5, "2019-11-19 09:00:00"),
+            -- 2019-11-20
+            ("sddc01-r01", 1, "2019-11-20", 10, 2, "2019-11-20 09:00:00"),
+            ("sddc02-r02", 2, "2019-11-20", 7 , 1, "2019-11-20 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-20", 13, 3, "2019-11-20 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-20", 3 , 1, "2019-11-20 09:00:00"),
+            ("sddc05-r04", 5, "2019-11-20", 3 , 1, "2019-11-20 09:00:00"),
+            ("sddc06-r01", 1, "2019-11-20", 3 , 1, "2019-11-20 09:00:00")
+        )
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py
new file mode 100644
index 0000000000..6d070d1dd0
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/fact/snapshot template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/fact/snapshot",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py
new file mode 100644
index 0000000000..9dc3d4f5bc
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py
@@ -0,0 +1,106 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) and overwrite the target table that represents the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` VALUES (
+            -- 2019-11-18
+            ("sddc01-r01", 1, "2019-11-18", 5 , 1, "2019-11-18 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-18", 12, 3, "2019-11-18 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            -- 2019-11-19
+            ("sddc01-r01", 1, "2019-11-19", 5 , 1, "2019-11-19 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-19", 4 , 1, "2019-11-19 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-19", 13, 3, "2019-11-19 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-19", 3 , 1, "2019-11-19 09:00:00"),
+            ("sddc05-r02", 5, "2019-11-19", 20, 4, "2019-11-19 09:00:00")
+        )
+        """
+    )
+
+    # Step 2: create (if missing) the source relation (a plain table here) that represents the next snapshot
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    # This job tests the case where the next snapshot is empty, so the source table is emptied instead of populated
+    job_input.execute_query(
+        """
+        TRUNCATE `{source_schema}`.`{source_view}`
+        """
+    )
+
+    # Step 3: create (if missing) and overwrite the expected-state table; its rows equal the Step 1 target rows
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` (
+            `dim_sddc_sk` STRING,
+            `dim_org_id` INT,
+            `dim_date_id` TIMESTAMP,
+            `host_count` BIGINT,
+            `cluster_count` BIGINT,
+            `{last_arrival_ts}` TIMESTAMP
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES (
+            -- 2019-11-18
+            -- 2019-11-18
+            ("sddc01-r01", 1, "2019-11-18", 5 , 1, "2019-11-18 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-18", 12, 3, "2019-11-18 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-18", 4 , 1, "2019-11-18 09:00:00"),
+            -- 2019-11-19
+            ("sddc01-r01", 1, "2019-11-19", 5 , 1, "2019-11-19 09:00:00"),
+            ("sddc02-r01", 2, "2019-11-19", 4 , 1, "2019-11-19 09:00:00"),
+            ("sddc03-r01", 3, "2019-11-19", 13, 3, "2019-11-19 09:00:00"),
+            ("sddc04-r01", 4, "2019-11-19", 3 , 1, "2019-11-19 09:00:00"),
+            ("sddc05-r02", 5, "2019-11-19", 20, 4, "2019-11-19 09:00:00")
+        )
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py 
b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py
new file mode 100644
index 0000000000..6d070d1dd0
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/fact/snapshot template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/fact/snapshot",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py
new file mode 100644
index 0000000000..f396876046
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py
@@ -0,0 +1,120 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Load example input data for a versioned template test.
+"""
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:
+    # Step 1: create (if missing) and overwrite the target table that represents the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` (
+            `{sk_column}` STRING,
+            `{active_from_column}` TIMESTAMP,
+            `{active_to_column}` TIMESTAMP,
+            `{id_column}` INT,
+            `updated_by_user_id` INT,
+            `state` STRING,
+            `is_nsxt` BOOLEAN,
+            `cloud_vendor` STRING,
+            `version` SMALLINT
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` VALUES (
+            ("sddc01-v01", "2019-01-01", "9999-12-31", 1, 7, "RUNNING" , false, 'Azure', 498),
+            ("sddc02-v01", "2019-02-01", "9999-12-31", 2, 9, "STOPPED" , false, 'AWS' , 500),
+            ("sddc03-v01", "2019-03-01", "9999-12-31", 3, 3, "PROVISIONING", false, 'Azure', 497),
+            ("sddc04-v01", "2019-04-01", "9999-12-31", 4, 5, "PROVISIONING", true , 'Azure', 498),
+            ("sddc05-v01", "2019-05-01", "2019-05-02", 5, 9, "STARTING" , true , 'AWS' , 500),
+            ("sddc05-v02", "2019-05-02", "2019-05-03", 5, 2, "STARTING" , true , 'AWS' , 500),
+            ("sddc05-v03", "2019-05-03", "9999-12-31", 5, 3, "STARTING" , true , 'AWS' , 500)
+        )
+        """
+    )
+
+    # Step 2: create (if missing) and overwrite the source relation (a plain table here) that holds the delta to be applied
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` (
+            `{updated_at_column}` TIMESTAMP,
+            `{id_column}` INT,
+            `updated_by_user_id` INT,
+            `state` STRING,
+            `is_nsxt` BOOLEAN,
+            `cloud_vendor` STRING,
+            `version` SMALLINT
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES (
+            ("2019-02-02", 2, 1, "STARTING" , false, 'AWS' , 500), -- Update (1) - new time, new values
+            ("2019-03-01", 3, 4, "RUNNING" , false, 'Azure', 497), -- Update (2) - same time, new values
+            ("2019-04-02", 4, 5, "PROVISIONING", true , 'Azure', 498), -- Update (3) - new time, same values
+            ("2019-05-01", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (4) - same time, same values
+            ("2019-05-02", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (5) - same time, prev values
+            ("2019-05-04", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (1) - new time, new values
+            ("2019-06-01", 6, 9, "STARTING" , true , 'AWS' , 499) -- Insert
+        )
+        """
+    )
+
+    # Step 3: create (if missing) and overwrite the table holding the state expected after applying the delta to the current state
+
+    # job_input.execute_query(u'''
+    # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}`
+    # ''')
+    job_input.execute_query(
+        """
+        CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` (
+            `{sk_column}` STRING,
+            `{active_from_column}` TIMESTAMP,
+            `{active_to_column}` TIMESTAMP,
+            `{id_column}` INT,
+            `updated_by_user_id` INT,
+            `state` STRING,
+            `is_nsxt` BOOLEAN,
+            `cloud_vendor` STRING,
+            `version` SMALLINT
+        ) STORED AS PARQUET
+        """
+    )
+    job_input.execute_query(
+        """
+        INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES (
+            ("sddc01-v01", "2019-01-01", "9999-12-31", 1, 7, "RUNNING" , false, 'Azure', 498),
+
+            ("sddc02-v01", "2019-02-01", "2019-02-02", 2, 9, "STOPPED" , false, 'AWS' , 500),
+            ("sddc02-v02", "2019-02-02", "9999-12-31", 2, 1, "STARTING" , false, 'AWS' , 500),
+
+            ("sddc03-v01", "2019-03-01", "9999-12-31", 3, 4, "RUNNING" , false, 'Azure', 497),
+
+            ("sddc04-v01", "2019-04-01", "9999-12-31", 4, 5, "PROVISIONING", true , 'Azure', 498),
+
+            ("sddc05-v01", "2019-05-01", "2019-05-03", 5, 9, "STARTING" , true , 'AWS' , 500),
+            ("sddc05-v03", "2019-05-03", "2019-05-04", 5, 3, "STARTING" , true , 'AWS' , 500),
+            ("sddc05-v04", "2019-05-04", "9999-12-31", 5, 9, "STARTING" , true , 'AWS' , 500),
+
+            ("sddc06-v01", "2019-06-01", "9999-12-31", 6, 9, "STARTING" , true , 'AWS' , 499)
+        )
+        """
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py
new file mode 100644
index 0000000000..e5e13a815e
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py
@@ -0,0 +1,15 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential"
+)
+
+
+def run(job_input: IJobInput) -> None:  # executes the load/versioned template, passing the job's own arguments through
+    job_input.execute_template(
+        template_name="load/versioned",
+        template_args=job_input.get_arguments(),
+    )
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py
new file mode 100644
index 0000000000..50c007580a
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py
new file mode 100644
index 0000000000..b6bc6bdcc3
--- /dev/null
+++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py
@@ -0,0 +1,124 @@
+# Copyright 2021 VMware, Inc.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Load example input data for a versioned template test with a partitioned target table.
+"""
+from vdk.api.job_input import IJobInput
+
+__author__ = "VMware, Inc."
+__copyright__ = (
+    "Copyright 2019 VMware, Inc. All rights reserved. 
-- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + # Step 1: create a table that represents the current state + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` ( + `{sk_column}` STRING, + `{active_from_column}` TIMESTAMP, + `{active_to_column}` TIMESTAMP, + `{id_column}` INT, + `updated_by_user_id` INT, + `state` STRING, + `is_nsxt` BOOLEAN, + `cloud_vendor` STRING + ) PARTITIONED BY (`version` SMALLINT) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + TRUNCATE `{target_schema}`.`{target_table}` + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` PARTITION (version) VALUES ( + ("sddc01-v01", "2019-01-01", "9999-12-31", 1, 7, "RUNNING" , false, 'Azure', 498), + ("sddc02-v01", "2019-02-01", "9999-12-31", 2, 9, "STOPPED" , false, 'AWS' , 500), + ("sddc03-v01", "2019-03-01", "9999-12-31", 3, 3, "PROVISIONING", false, 'Azure', 497), + ("sddc04-v01", "2019-04-01", "9999-12-31", 4, 5, "PROVISIONING", true , 'Azure', 498), + ("sddc05-v01", "2019-05-01", "2019-05-02", 5, 9, "STARTING" , true , 'AWS' , 500), + ("sddc05-v02", "2019-05-02", "2019-05-03", 5, 2, "STARTING" , true , 'AWS' , 500), + ("sddc05-v03", "2019-05-03", "9999-12-31", 5, 3, "STARTING" , true , 'AWS' , 500) + ) + """ + ) + + # Step 2: create a table that represents the delta to be applied + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` ( + `{updated_at_column}` TIMESTAMP, + `{id_column}` INT, + `updated_by_user_id` INT, + `state` STRING, + `is_nsxt` BOOLEAN, + `cloud_vendor` STRING, + `version` SMALLINT + ) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` 
VALUES ( + ("2019-02-02", 2, 1, "STARTING" , false, 'AWS' , 500), -- Update (1) - new time, new values + ("2019-03-01", 3, 4, "RUNNING" , false, 'Azure', 497), -- Update (2) - same time, new values + ("2019-04-02", 4, 5, "PROVISIONING", true , 'Azure', 498), -- Update (3) - new time, same values + ("2019-05-01", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (4) - same time, same values + ("2019-05-02", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (5) - same time, prev values + ("2019-05-04", 5, 9, "STARTING" , true , 'AWS' , 500), -- Update (1) - new time, new values + ("2019-06-01", 6, 9, "STARTING" , true , 'AWS' , 499) -- Insert + ) + """ + ) + + # Step 3: Create a table containing the state expected after updating the current state with the given delta + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` ( + `{sk_column}` STRING, + `{active_from_column}` TIMESTAMP, + `{active_to_column}` TIMESTAMP, + `{id_column}` INT, + `updated_by_user_id` INT, + `state` STRING, + `is_nsxt` BOOLEAN, + `cloud_vendor` STRING, + `version` SMALLINT + ) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES ( + ("sddc01-v01", "2019-01-01", "9999-12-31", 1, 7, "RUNNING" , false, 'Azure', 498), + + ("sddc02-v01", "2019-02-01", "2019-02-02", 2, 9, "STOPPED" , false, 'AWS' , 500), + ("sddc02-v02", "2019-02-02", "9999-12-31", 2, 1, "STARTING" , false, 'AWS' , 500), + + ("sddc03-v01", "2019-03-01", "9999-12-31", 3, 4, "RUNNING" , false, 'Azure', 497), + + ("sddc04-v01", "2019-04-01", "9999-12-31", 4, 5, "PROVISIONING", true , 'Azure', 498), + + ("sddc05-v01", "2019-05-01", "2019-05-03", 5, 9, "STARTING" , true , 'AWS' , 500), + ("sddc05-v03", "2019-05-03", "2019-05-04", 5, 3, "STARTING" , true , 'AWS' , 500), + ("sddc05-v04", "2019-05-04", "9999-12-31", 5, 9, 
"STARTING" , true , 'AWS' , 500), + + ("sddc06-v01", "2019-06-01", "9999-12-31", 6, 9, "STARTING" , true , 'AWS' , 499) + ) + """ + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py new file mode 100644 index 0000000000..e5e13a815e --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py @@ -0,0 +1,15 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + job_input.execute_template( + template_name="load/versioned", + template_args=job_input.get_arguments(), + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py new file mode 100644 index 0000000000..3a8a30d9c2 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py @@ -0,0 +1,594 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +import json +import os +import pathlib +import re +import time +import unittest +from unittest.mock import ANY +from unittest.mock import DEFAULT +from unittest.mock import MagicMock +from unittest.mock import patch + +import pytest +from vdk.internal.core import errors +from vdk.plugin.impala import impala_plugin +from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner +from vdk.plugin.test_utils.util_funcs import get_test_job_path + + +@pytest.mark.skip( + reason="We need to test this with a recent impala instance. Current test instance is too old" +) +class TemplateRegressionTests(unittest.TestCase): + def setUp(self) -> None: + self.__runner = CliEntryBasedTestRunner(impala_plugin) + time.sleep(10) # wait for impala instance to come online + + def test_load_dimension_scd1(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_dim_org" + target_table = "dw_dim_org" + + self._run_job( + "load_dimension_scd1_template_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{source_view}") + assert actual_rs.output == expected_rs.output + + def test_load_dimension_scd1_partitioned(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_dim_org_partition_test" + target_table = "dw_dim_org_partitioned" + + self._run_job( + "load_dimension_scd1_template_partition_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{source_view}") + assert actual_rs.output == expected_rs.output + + def test_load_dimension_scd1_parameter_validation(self) -> 
None: + self._run_template_with_bad_arguments( + template_name="load/dimension/scd1", template_args={}, num_exp_errors=4 + ) + self._run_template_with_bad_arguments( + template_name="load/dimension/scd1", + template_args={"source_view": "foo", "extra_parameter": "bar"}, + num_exp_errors=3, + ) + + def test_load_dimension_scd1_bad_target_schema(self) -> None: + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_dim_org", + "target_schema": "vdkprototypes", + "target_table": "dw_dim_org_as_textfile", + } + + self._run_template_with_bad_target_schema( + template_name="load/dimension/scd1", + template_args=template_args, + ) + + def test_load_dimension_scd2(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_scmdb_people" + target_table = "dw_scmdb_people" + expect_table = "ex_scmdb_people" + + self._run_job( + "load_dimension_scd2_template_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + "staging_schema": test_schema, + "expect_schema": test_schema, + "expect_table": expect_table, + "start_time_column": "start_time", + "end_time_column": "end_time", + "end_time_default_value": "9999-12-31", + "surrogate_key_column": "sk", + "id_column": "id", + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ + actual = {x[38:] for x in actual_rs.output.split("\n")} + expected = {x[5:] for x in expected_rs.output.split("\n")} + + self.assertSetEqual( + expected, actual, f"Elements in {expect_table} and {target_table} differ." 
+ ) + + def test_load_dimension_scd2_parameter_validation(self) -> None: + self._run_template_with_bad_arguments( + template_name="load/dimension/scd2", template_args={}, num_exp_errors=9 + ) + self._run_template_with_bad_arguments( + template_name="load/dimension/scd2", + template_args={"source_view": "foo", "extra_parameter": "bar"}, + num_exp_errors=8, + ) + + def test_load_dimension_scd2_bad_target_schema(self) -> None: + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_scmdb_people", + "target_schema": "vdkprototypes", + "target_table": "dw_fact_sddc_daily_as_textfile", + "staging_schema": "vdkprototypes", + "start_time_column": "start_time", + "end_time_column": "end_time", + "end_time_default_value": "9999-12-31", + "surrogate_key_column": "sk", + "id_column": "id", + } + + self._run_template_with_bad_target_schema( + template_name="load/dimension/scd2", + template_args=template_args, + ) + + def test_load_versioned(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_sddc_h_updates" + target_table = "dim_sddc_h" + expect_table = "ex_dim_sddc_h" + + self._run_job( + "load_versioned_template_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + "expect_schema": test_schema, + "expect_table": expect_table, + "id_column": "sddc_id", + "sk_column": "sddc_sk", + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + "version", + ], + "tracked_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "version", + ], + "active_from_column": "active_from", + "active_to_column": "active_to", + "active_to_max_value": "9999-12-31", + "updated_at_column": "updated_at", + }, + ) + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + + # delete first (surrogate key) column from the two results, as those are 
uniquely generated and might differ + actual = {x[38:] for x in actual_rs.output.split("\n")} + expected = {x[5:] for x in expected_rs.output.split("\n")} + + self.assertSetEqual( + actual, expected, f"Elements in {expect_table} and {target_table} differ." + ) + + def test_load_versioned_partitioned(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_sddc_h_updates_partition_test" + target_table = "dim_sddc_h_partitioned" + expect_table = "ex_dim_sddc_h_partition_test" + + self._run_job( + "load_versioned_template_partition_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + "expect_schema": test_schema, + "expect_table": expect_table, + "id_column": "sddc_id", + "sk_column": "sddc_sk", + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + "version", + ], + "tracked_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "version", + ], + "active_from_column": "active_from", + "active_to_column": "active_to", + "active_to_max_value": "9999-12-31", + "updated_at_column": "updated_at", + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ + actual = {x[38:] for x in actual_rs.output.split("\n")} + expected = {x[5:] for x in expected_rs.output.split("\n")} + + self.assertSetEqual( + actual, expected, f"Elements in {expect_table} and {target_table} differ." 
+ ) + + def test_load_versioned_parameter_validation(self) -> None: + self._run_template_with_bad_arguments( + template_name="load/versioned", template_args={}, num_exp_errors=7 + ) + + good_template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_sddc_h_updates", + "target_schema": "vdkprototypes", + "target_table": "dim_sddc_h_as_textfile", + "id_column": "sddc_id", + "sk_column": "sddc_sk", + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + "version", + ], + "tracked_columns": ["updated_by_user_id", "state", "is_nsxt", "version"], + "active_from_column": "active_from", + "active_to_column": "active_to", + "active_to_max_value": "9999-12-31", + "updated_at_column": "updated_at", + "extra_parameter": "bar", + } + + self._run_template_with_bad_arguments( + template_name="load/versioned", + template_args={ + **good_template_args, + **{ + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + ], + "tracked_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "version", + ], + }, + }, + num_exp_errors=1, + ) + + self._run_template_with_bad_arguments( + template_name="load/versioned", + template_args={ + **good_template_args, + **{ + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + ], + "tracked_columns": [], + }, + }, + num_exp_errors=1, + ) + + def test_load_versioned_bad_target_schema(self) -> None: + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_sddc_h_updates", + "target_schema": "vdkprototypes", + "target_table": "dim_sddc_h_as_textfile", + "id_column": "sddc_id", + "sk_column": "sddc_sk", + "value_columns": [ + "updated_by_user_id", + "state", + "is_nsxt", + "cloud_vendor", + "version", + ], + "tracked_columns": ["updated_by_user_id", "state", "is_nsxt", "version"], + "active_from_column": "active_from", + "active_to_column": "active_to", + "active_to_max_value": "9999-12-31", + "updated_at_column": 
"updated_at", + } + + self._run_template_with_bad_target_schema( + template_name="load/versioned", + template_args=template_args, + ) + + def test_load_fact_snapshot(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_fact_sddc_daily" + target_table = "dw_fact_sddc_daily" + expect_table = "ex_fact_sddc_daily" + + self._run_job( + "load_fact_snapshot_template_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + "expect_schema": test_schema, + "expect_table": expect_table, + "last_arrival_ts": "updated_at", + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + + assert actual_rs.output == expected_rs.output + + def test_load_fact_snapshot_empty_source(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_fact_sddc_daily_empty_source" + target_table = "dw_fact_sddc_daily_empty_source" + expect_table = "ex_fact_sddc_daily_empty_source" + + self._run_job( + "load_fact_snapshot_template_job_empty_source", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + "target_table": target_table, + "expect_schema": test_schema, + "expect_table": expect_table, + "last_arrival_ts": "updated_at", + }, + ) + + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + + assert actual_rs.output == expected_rs.output + + def test_load_fact_snapshot_partition(self) -> None: + test_schema = "vdkprototypes" + source_view = "vw_fact_sddc_daily_partition" + target_table = "dw_fact_sddc_daily_partition" + expect_table = "ex_fact_sddc_daily_partition" + + self._run_job( + "load_fact_snapshot_template_partition_job", + { + "source_schema": test_schema, + "source_view": source_view, + "target_schema": test_schema, + 
"target_table": target_table, + "expect_schema": test_schema, + "expect_table": expect_table, + "last_arrival_ts": "updated_at", + }, + ) + + actual_rs = set( + self.job_input.execute_query(f"SELECT * FROM {test_schema}.{target_table}") + ) + expected_rs = set( + self.job_input.execute_query(f"SELECT * FROM {test_schema}.{expect_table}") + ) + + self.assertSetEqual( + expected_rs, + actual_rs, + f"Elements in {expect_table} and {target_table} differ.", + ) + + def test_load_fact_snapshot_parameter_validation(self) -> None: + self._run_template_with_bad_arguments( + template_name="load/fact/snapshot", template_args={}, num_exp_errors=5 + ) + self._run_template_with_bad_arguments( + template_name="load/fact/snapshot", + template_args={"source_view": "foo", "target_table": None}, + num_exp_errors=4, + ) + + def test_load_fact_snapshot_bad_target_schema(self) -> None: + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_fact_sddc_daily", + "target_schema": "vdkprototypes", + "target_table": "dw_fact_sddc_daily_as_textfile", + "expect_schema": "vdkprototypes", + "last_arrival_ts": "updated_at", + } + + self._run_template_with_bad_target_schema( + template_name="load/fact/snapshot", + template_args=template_args, + ) + + def test_template_user_error(self): + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_dim_org", + "target_schema": "vdkprototypes", + "target_table": "dw_dim_org", + } + + def run_job(): + self.job_input.execute_template( + template_name="load/dimension/scd1", template_args=template_args + ) + + with patch( + "vacloud.vdk.templates.template_executor.JobInput.execute_query", + side_effect=[DEFAULT, Exception], + ): # TemplateExecutor makes 5 calls to execute_query (given 4 + # sql files in a template), errors in the first two files + # would be due to user errors, while errors in the second two files due to + # platform error, so we patch in an exception in the execution of the first file + 
self.assertRaises(errors.DeriverCodeError, run_job) + + def test_template_platform_error(self): + template_args = { + "source_schema": "vdkprototypes", + "source_view": "vw_dim_org", + "target_schema": "vdkprototypes", + "target_table": "dw_dim_org", + } + + def run_job(): + self.job_input.execute_template( + template_name="load/dimension/scd1", template_args=template_args + ) + + with patch( + "vacloud.vdk.templates.template_executor.JobInput.execute_query", + side_effect=[DEFAULT, DEFAULT, DEFAULT, Exception], + ): # TemplateExecutor makes 5 calls to execute_query (given 4 + # sql files in a template), errors in the first two files + # would be due to user errors, while errors in the second two files due to + # platform error, so we patch in an exception in the execution of the first file + self.assertRaises(Exception) + + def _run_job(self, job_name: str, args: dict): + return self.__runner.invoke( + [ + "run", + get_test_job_path( + pathlib.Path(os.path.dirname(os.path.abspath(__file__))), + job_name, + ), + "--arguments", + json.dumps(args), + ] + ) + + def _run_query(self, query_string): + return self.__runner.invoke( + [ + "impala-query", + "--query", + query_string, + ] + ) + + def _run_template_with_bad_arguments( + self, template_name: str, template_args: dict, num_exp_errors: int + ) -> None: + def just_rethrow(*_, **kwargs): + raise kwargs["exception"] + + errors.log_and_rethrow = MagicMock(side_effect=just_rethrow) + + expected_error_regex = re.escape( + f'{num_exp_errors} validation {"errors" if num_exp_errors > 1 else "error"} ' + f"for {template_name} template" + ) + with self.assertRaisesRegex(Exception, expected_error_regex): + result = self._run_job(template_name, template_args) + + errors.log_and_rethrow.assert_called_once_with( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=ANY, + what_happened="Template execution in Data Job finished with error", + why_it_happened=ANY, + consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, + 
countermeasures=errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION, + exception=ANY, + ) + + def _run_template_with_bad_target_schema( + self, template_name: str, template_args: dict + ) -> None: + self._run_query( + """ + DROP TABLE IF EXISTS {target_schema}.{target_table} + """.format( + **template_args + ) + ) + self._run_query( + """ + CREATE TABLE {target_schema}.{target_table} ( + attr_a INT, + attr_b STRING, + updated_at TIMESTAMP + ) STORED AS TEXTFILE + """.format( + **template_args + ) + ) + self._run_query( + """ + REFRESH {target_schema}.{target_table} + """.format( + **template_args + ) + ) + + table_name = "`{target_schema}`.`{target_table}`".format(**template_args) + + expected_why_it_happened_msg = ( + f'The target table {table_name} must be created with a "STORED AS PARQUET" ' + f"clause. Please change the table definition accordingly and re-create the table." + ) + + def just_throw(*_, **kwargs): + raise Exception() + + errors.log_and_throw = MagicMock(side_effect=just_throw) + + with self.assertRaises(Exception) as context: + self._run_job(template_name, template_args) + + errors.log_and_throw.assert_called_once_with( + to_be_fixed_by=errors.ResolvableBy.USER_ERROR, + log=ANY, + what_happened="Data loading has failed.", + why_it_happened=( + f"You are trying to load data into a table {table_name} with an unsupported format. " + f"Currently only Parquet table format is supported." 
+ ), + consequences="Data load will be aborted.", + countermeasures=( + "Make sure that the destination table is stored as parquet: " + "https://www.cloudera.com/documentation/enterprise/5-11-x/topics/impala_parquet.html" + "#parquet_ddl" + ), + ) From da261066deeff7f2568d86c165b1c944dd71c4f3 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Tue, 18 Jan 2022 15:39:23 +0200 Subject: [PATCH 2/9] template regression tests pass Signed-off-by: mrMoZ1 --- .../vdk-plugins/vdk-impala/requirements.txt | 6 +- projects/vdk-plugins/vdk-impala/setup.py | 2 +- .../src/vdk/plugin/impala/impala_helper.py | 8 +- .../templates/load/validators/__init__.py | 2 + .../dimension_scd1_definition.py} | 6 +- .../dimension_scd2_definition.py} | 4 +- .../validators/fact_snapshot_definition.py | 36 ++++ .../versioned_definition.py} | 4 +- .../vdk/plugin/impala/templates/template.py | 80 --------- .../impala/templates/template_executor.py | 5 +- .../02_run_load_dimension_scd1_template.py | 4 +- .../01_run_load_dimension_scd1_template.py | 17 ++ .../__init__.py | 2 + .../02_run_load_dimension_scd1_template.py | 4 +- .../02_run_load_dimension_scd2_template.py | 4 +- .../01_run_load_dimension_scd2_template.py | 17 ++ .../__init__.py | 2 + .../02_run_load_fact_snapshot_template.py | 4 +- .../02_run_load_fact_snapshot_template.py | 4 +- .../__init__.py | 2 + .../run_fact_snapshot_template.py | 17 ++ .../01_prepare_input_data.py | 132 ++++++++++++++ .../02_run_load_fact_snapshot_template.py | 17 ++ .../__init__.py | 2 + .../02_run_load_versioned_template.py | 4 +- .../01_run_versioned_template.py | 17 ++ .../load_versioned_template_only/__init__.py | 2 + .../02_run_load_versioned_template.py | 4 +- .../tests/template_regression_test.py | 167 ++++++++---------- 29 files changed, 380 insertions(+), 195 deletions(-) create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py rename 
projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{dimension/scd1/00-definition.py => validators/dimension_scd1_definition.py} (86%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{dimension/scd2/00-definition.py => validators/dimension_scd2_definition.py} (89%) create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{versioned/00-definition.py => validators/versioned_definition.py} (95%) delete mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py create mode 100644 projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py create mode 100644 
projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/requirements.txt b/projects/vdk-plugins/vdk-impala/requirements.txt index eba0c42ea0..906d1c4217 100644 --- a/projects/vdk-plugins/vdk-impala/requirements.txt +++ b/projects/vdk-plugins/vdk-impala/requirements.txt @@ -2,12 +2,10 @@ vdk-core impyla tabulate -# template requirements -pydantic -pyarrow - # testing requirements click vdk-test-utils pytest-docker docker-compose +pydantic +pyarrow diff --git a/projects/vdk-plugins/vdk-impala/setup.py b/projects/vdk-plugins/vdk-impala/setup.py index 333514cee0..426722e7dc 100644 --- a/projects/vdk-plugins/vdk-impala/setup.py +++ b/projects/vdk-plugins/vdk-impala/setup.py @@ -14,7 +14,7 @@ description="Versatile Data Kit SDK plugin provides support for Impala database.", long_description=pathlib.Path("README.md").read_text(), long_description_content_type="text/markdown", - install_requires=["vdk-core", "impyla", "tabulate"], + install_requires=["vdk-core", "impyla", "tabulate", "pydantic", "pyarrow"], package_dir={"": "src"}, packages=setuptools.find_namespace_packages(where="src"), include_package_data=True, diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py index c63a32f08c..899d192e78 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py @@ -4,10 +4,8 @@ from collections import OrderedDict import pyarrow -from vdk.internal.builtin_plugins.connection.managed_connection_base import ( - ManagedConnectionBase, -) from vdk.internal.core import errors +from vdk.plugin.impala import impala_error_classifier from vdk.plugin.impala.impala_connection import ImpalaConnection @@ -21,9 +19,7 @@ def get_table_description(self, table_name): try: return self._db_connection.execute_query(f"DESCRIBE 
formatted {table_name}") except Exception as e: - if errors.exception_matches( - e, "impala.error.HiveServer2Error", ".*AuthorizationException.*" - ): + if impala_error_classifier._is_authorization_error(e): errors.log_and_throw( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=self._log, diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py similarity index 86% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py index ce0bad25bd..d3e6dde2ee 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py @@ -31,5 +31,7 @@ def __init__(self) -> None: ) -def run(job_input: IJobInput): - SlowlyChangingDimensionTypeOverwrite().start(job_input, job_input.get_arguments()) +def validate_arguments(job_input: IJobInput): + return SlowlyChangingDimensionTypeOverwrite().start( + job_input, job_input.get_arguments() + ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py similarity index 89% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py index aaadf5ac24..b0314b779a 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py @@ -36,5 +36,5 @@ def __init__(self) -> None: ) -def run(job_input: IJobInput): - SlowlyChangingDimensionType2().start(job_input, job_input.get_arguments()) +def validate_arguments(job_input: IJobInput): + return SlowlyChangingDimensionType2().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py new file mode 100644 index 0000000000..43b76c80c1 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py @@ -0,0 +1,36 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_executor import TemplateExecutor + + +class FactDailySnapshotParams(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + last_arrival_ts: str + + +class FactDailySnapshot(TemplateExecutor): + TemplateParams = FactDailySnapshotParams + + def __init__(self) -> None: + super().__init__( + template_name="load/fact/snapshot", + sql_files=[ + "00-test-if-view-matches-target.sql", + "01-insert-into-target.sql", + "02-refresh.sql", + "03-compute-stats.sql", + ], + sql_files_platform_is_responsible=[ + "02-refresh.sql", + "03-compute-stats.sql", + ], + ) + + +def validate_arguments(job_input: IJobInput): + return FactDailySnapshot().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py similarity index 95% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py index 585a581d46..b87bf150f6 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py @@ -70,5 +70,5 @@ def _validate_args(self, args: dict) -> dict: ) -def run(job_input: IJobInput): - LoadVersioned().start(job_input, job_input.get_arguments()) +def validate_arguments(job_input: IJobInput): + return LoadVersioned().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py deleted file mode 100644 index 28d3a5ea86..0000000000 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2021 VMware, Inc. -# SPDX-License-Identifier: Apache-2.0 -import logging -import os -import time - -from vdk.api.job_input import IJobInput -from vdk.api.job_input import ITemplate -from vdk.internal.builtin_plugins.run.execution_results import ExecutionResult - -# from vacloud.vdk.connection.impl.builder import ManagedConnectionBuilder - -log = logging.getLogger(__name__) - - -class Template(ITemplate): - def __init__( - self, - git_hash: str, - opid: str, - managed_connection_builder, - job_input_only_used_to_pass_to_python_scripts: IJobInput, - ): - - self.git_hash = git_hash - self.opid = opid - self.managed_connection_builder = managed_connection_builder - self.job_input_only_used_to_pass_to_python_scripts = ( - job_input_only_used_to_pass_to_python_scripts - ) - - @staticmethod - def get_folder_where_i_am() -> str: - my_path = os.path.realpath(__file__) - abspath = os.path.abspath(my_path) - folder = os.path.join(abspath, os.pardir) - return os.path.abspath(folder) - - @staticmethod - def get_templates_folder() -> str: - return os.path.join(Template.get_folder_where_i_am(), "templates") - - def execute_template( - self, template_name: str, template_args: dict - ) -> ExecutionResult: - log.debug(f"Execute template {template_name} {template_args}") - start_of_execution = time.time() - exception_message = None - import importlib - - try: - package_name = ( - "vacloud.vdk.templates." 
- + template_name.replace("/", ".") - + ".definition" - ) - module = importlib.import_module(package_name) - load = getattr(module, "load") - load(self.job_input_only_used_to_pass_to_python_scripts, template_args) - except Exception as e: - exception_message = str(e) - raise - finally: - data = { - "@type": "pa__dp_template_usage", - "template_name": template_name, - "template_args": ",".join( - [str(v) for kv in template_args.items() for v in kv] - ), - "template_execution_time_seconds": round( - time.time() - start_of_execution - ), - "template_execution_status": "error" - if exception_message - else "success", - "exception_message": exception_message if exception_message else None, - } - log.info(data) - template_args_data = {"arg_" + k: v for k, v in template_args.items()} - data.update(template_args_data) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py index 93034325e5..4968ba22dc 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py @@ -31,8 +31,8 @@ def __init__( self.sql_files = sql_files self.sql_files_platform_is_responsible = sql_files_platform_is_responsible # used to decide blamee for failure, defaults to user - def start(self, job_input: IJobInput, args: dict) -> None: - # args = self._validate_args(args) + def start(self, job_input: IJobInput, args: dict) -> dict: + args = self._validate_args(args) args["_vdk_template_insert_partition_clause"] = "" impala_helper = ImpalaHelper(cast(JobInput, job_input).get_managed_connection()) @@ -61,6 +61,7 @@ def start(self, job_input: IJobInput, args: dict) -> None: raise Exception( "Source view returns no results. Will NOT execute template!" 
) + return args def _validate_args(self, args: dict) -> dict: try: diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py index dd439171f1..3380b319eb 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = dimension_scd1_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py new file mode 100644 index 0000000000..3380b319eb --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py @@ -0,0 +1,17 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. 
-- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + args = dimension_scd1_definition.validate_arguments(job_input) + job_input.execute_template( + template_name="load/dimension/scd1", + template_args=args, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py index dd439171f1..3380b319eb 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = dimension_scd1_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py index a91e341f27..b2427a48bb 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import dimension_scd2_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = dimension_scd2_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd2", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py new file mode 100644 index 0000000000..b2427a48bb --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py @@ -0,0 +1,17 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import dimension_scd2_definition + +__author__ = "VMware, Inc." 
+__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + args = dimension_scd2_definition.validate_arguments(job_input) + job_input.execute_template( + template_name="load/dimension/scd2", + template_args=args, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py index 6d070d1dd0..a116b72996 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = fact_snapshot_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py index 6d070d1dd0..a116b72996 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = fact_snapshot_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py new file mode 100644 index 0000000000..a116b72996 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py @@ -0,0 +1,17 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + args = fact_snapshot_definition.validate_arguments(job_input) + job_input.execute_template( + template_name="load/fact/snapshot", + template_args=args, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py new file mode 100644 index 0000000000..6dfc8c32ee --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py @@ -0,0 +1,132 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput + + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. 
-- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + # Step 1: create a table that represents the current state + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{target_schema}`.`{target_table}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{target_schema}`.`{target_table}` ( + `dim_sddc_sk` STRING, + `dim_org_id` INT, + `dim_date_id` TIMESTAMP, + `host_count` BIGINT, + `{last_arrival_ts}` TIMESTAMP + ) PARTITIONED BY (`cluster_count` BIGINT) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + TRUNCATE `{target_schema}`.`{target_table}` + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{target_schema}`.`{target_table}` PARTITION (cluster_count) VALUES ( + -- 2019-11-18 + ("sddc01-r01", 1, "2019-11-18", 5 , "2019-11-18 09:00:00", 1), + ("sddc02-r01", 2, "2019-11-18", 4 , "2019-11-18 09:00:00", 1), + ("sddc03-r01", 3, "2019-11-18", 12, "2019-11-18 09:00:00", 3), + ("sddc04-r01", 4, "2019-11-18", 4 , "2019-11-18 09:00:00", 1), + -- 2019-11-19 + ("sddc01-r01", 1, "2019-11-19", 5 , "2019-11-19 09:00:00", 1), + ("sddc02-r01", 2, "2019-11-19", 4 , "2019-11-19 09:00:00", 1), + ("sddc03-r01", 3, "2019-11-19", 13, "2019-11-19 09:00:00", 3), + ("sddc04-r01", 4, "2019-11-19", 3 , "2019-11-19 09:00:00", 1), + ("sddc05-r02", 5, "2019-11-19", 20, "2019-11-19 09:00:00", 4) + ) + """ + ) + + # Step 2: create a table that represents the next snapshot + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{source_schema}`.`{source_view}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{source_schema}`.`{source_view}` ( + `dim_sddc_sk` STRING, + `dim_org_id` INT, + `dim_date_id` TIMESTAMP, + `host_count` BIGINT, + `{last_arrival_ts}` TIMESTAMP, + `cluster_count` BIGINT + ) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES ( + -- 2019-11-18 + ("sddc05-r01", 5, "2019-11-18", 18, "2019-11-18
09:30:00", 4), -- late arrival + -- 2019-11-19 (duplicated) + ("sddc01-r01", 1, "2019-11-19", 5 , "2019-11-19 09:00:00", 1), -- duplicated + ("sddc02-r01", 2, "2019-11-19", 4 , "2019-11-19 09:00:00", 1), -- duplicated + ("sddc03-r01", 3, "2019-11-19", 13, "2019-11-19 09:00:00", 3), -- duplicated + ("sddc04-r01", 4, "2019-11-19", 3 , "2019-11-19 09:00:00", 1), -- duplicated + ("sddc05-r02", 5, "2019-11-19", 20, "2019-11-19 09:00:00", 5), -- changed + -- 2019-11-20 + ("sddc01-r01", 1, "2019-11-20", 10, "2019-11-20 09:00:00", 2), -- new + ("sddc02-r02", 2, "2019-11-20", 7 , "2019-11-20 09:00:00", 1), -- new + ("sddc03-r01", 3, "2019-11-20", 13, "2019-11-20 09:00:00", 3), -- new + ("sddc04-r01", 4, "2019-11-20", 3 , "2019-11-20 09:00:00", 1), -- new + ("sddc05-r04", 5, "2019-11-20", 3 , "2019-11-20 09:00:00", 1), -- new + ("sddc06-r01", 1, "2019-11-20", 3 , "2019-11-20 09:00:00", 1) -- new + ) + """ + ) + + # Step 3: Create a table containing the state expected after updating the current state with the next snapshot + + # job_input.execute_query(u''' + # DROP TABLE IF EXISTS `{expect_schema}`.`{expect_table}` + # ''') + job_input.execute_query( + """ + CREATE TABLE IF NOT EXISTS `{expect_schema}`.`{expect_table}` ( + `dim_sddc_sk` STRING, + `dim_org_id` INT, + `dim_date_id` TIMESTAMP, + `host_count` BIGINT, + `{last_arrival_ts}` TIMESTAMP, + `cluster_count` BIGINT + ) STORED AS PARQUET + """ + ) + job_input.execute_query( + """ + INSERT OVERWRITE TABLE `{expect_schema}`.`{expect_table}` VALUES ( + -- 2019-11-18 + ("sddc01-r01", 1, "2019-11-18", 5 , "2019-11-18 09:00:00", 1), + ("sddc02-r01", 2, "2019-11-18", 4 , "2019-11-18 09:00:00", 1), + ("sddc03-r01", 3, "2019-11-18", 12, "2019-11-18 09:00:00", 3), + ("sddc04-r01", 4, "2019-11-18", 4 , "2019-11-18 09:00:00", 1), + ("sddc05-r01", 5, "2019-11-18", 18, "2019-11-18 09:30:00", 4), + -- 2019-11-19 (duplicated) + ("sddc01-r01", 1, "2019-11-19", 5 , "2019-11-19 09:00:00", 1), + ("sddc02-r01", 2, "2019-11-19", 4 , 
"2019-11-19 09:00:00", 1), + ("sddc03-r01", 3, "2019-11-19", 13, "2019-11-19 09:00:00", 3), + ("sddc04-r01", 4, "2019-11-19", 3 , "2019-11-19 09:00:00", 1), + ("sddc05-r02", 5, "2019-11-19", 20, "2019-11-19 09:00:00", 5), + -- 2019-11-20 + ("sddc01-r01", 1, "2019-11-20", 10, "2019-11-20 09:00:00", 2), + ("sddc02-r02", 2, "2019-11-20", 7 , "2019-11-20 09:00:00", 1), + ("sddc03-r01", 3, "2019-11-20", 13, "2019-11-20 09:00:00", 3), + ("sddc04-r01", 4, "2019-11-20", 3 , "2019-11-20 09:00:00", 1), + ("sddc05-r04", 5, "2019-11-20", 3 , "2019-11-20 09:00:00", 1), + ("sddc06-r01", 1, "2019-11-20", 3 , "2019-11-20 09:00:00", 1) + ) + """ + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py new file mode 100644 index 0000000000..a116b72996 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py @@ -0,0 +1,17 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. 
-- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + args = fact_snapshot_definition.validate_arguments(job_input) + job_input.execute_template( + template_name="load/fact/snapshot", + template_args=args, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py index e5e13a815e..8e0696c6a6 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import versioned_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = versioned_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/versioned", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py new file mode 100644 index 0000000000..8e0696c6a6 --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py @@ -0,0 +1,17 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import versioned_definition + +__author__ = "VMware, Inc." +__copyright__ = ( + "Copyright 2019 VMware, Inc. All rights reserved. -- VMware Confidential" +) + + +def run(job_input: IJobInput) -> None: + args = versioned_definition.validate_arguments(job_input) + job_input.execute_template( + template_name="load/versioned", + template_args=args, + ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py index e5e13a815e..8e0696c6a6 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py @@ -1,6 +1,7 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.load.validators import versioned_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -9,7 +10,8 @@ def run(job_input: IJobInput) -> None: + args = versioned_definition.validate_arguments(job_input) job_input.execute_template( template_name="load/versioned", - template_args=job_input.get_arguments(), + template_args=args, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py index 3a8a30d9c2..aeadf08de5 100644 --- a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py +++ b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py @@ -7,7 +7,6 @@ import time import unittest from unittest.mock import ANY -from unittest.mock import DEFAULT from unittest.mock import MagicMock from unittest.mock import patch @@ -17,10 +16,22 @@ from vdk.plugin.test_utils.util_funcs import CliEntryBasedTestRunner from vdk.plugin.test_utils.util_funcs import get_test_job_path +VDK_DB_DEFAULT_TYPE = "VDK_DB_DEFAULT_TYPE" +VDK_IMPALA_HOST = "VDK_IMPALA_HOST" +VDK_IMPALA_PORT = "VDK_IMPALA_PORT" + @pytest.mark.skip( reason="We need to test this with a recent impala instance. 
Current test instance is too old" ) +@patch.dict( + os.environ, + { + VDK_DB_DEFAULT_TYPE: "IMPALA", + VDK_IMPALA_HOST: "localhost", + VDK_IMPALA_PORT: "21050", + }, +) class TemplateRegressionTests(unittest.TestCase): def setUp(self) -> None: self.__runner = CliEntryBasedTestRunner(impala_plugin) @@ -66,10 +77,12 @@ def test_load_dimension_scd1_partitioned(self) -> None: def test_load_dimension_scd1_parameter_validation(self) -> None: self._run_template_with_bad_arguments( - template_name="load/dimension/scd1", template_args={}, num_exp_errors=4 + template_name="load_dimension_scd1_template_only", + template_args={}, + num_exp_errors=4, ) self._run_template_with_bad_arguments( - template_name="load/dimension/scd1", + template_name="load_dimension_scd1_template_only", template_args={"source_view": "foo", "extra_parameter": "bar"}, num_exp_errors=3, ) @@ -83,7 +96,7 @@ def test_load_dimension_scd1_bad_target_schema(self) -> None: } self._run_template_with_bad_target_schema( - template_name="load/dimension/scd1", + template_name="load_dimension_scd1_template_only", template_args=template_args, ) @@ -123,10 +136,12 @@ def test_load_dimension_scd2(self) -> None: def test_load_dimension_scd2_parameter_validation(self) -> None: self._run_template_with_bad_arguments( - template_name="load/dimension/scd2", template_args={}, num_exp_errors=9 + template_name="load_dimension_scd2_template_only", + template_args={}, + num_exp_errors=9, ) self._run_template_with_bad_arguments( - template_name="load/dimension/scd2", + template_name="load_dimension_scd2_template_only", template_args={"source_view": "foo", "extra_parameter": "bar"}, num_exp_errors=8, ) @@ -146,7 +161,7 @@ def test_load_dimension_scd2_bad_target_schema(self) -> None: } self._run_template_with_bad_target_schema( - template_name="load/dimension/scd2", + template_name="load_dimension_scd2_template_only", template_args=template_args, ) @@ -191,7 +206,7 @@ def test_load_versioned(self) -> None: # delete first (surrogate 
key) column from the two results, as those are uniquely generated and might differ actual = {x[38:] for x in actual_rs.output.split("\n")} - expected = {x[5:] for x in expected_rs.output.split("\n")} + expected = {x[12:] for x in expected_rs.output.split("\n")} self.assertSetEqual( actual, expected, f"Elements in {expect_table} and {target_table} differ." @@ -238,7 +253,7 @@ def test_load_versioned_partitioned(self) -> None: expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ actual = {x[38:] for x in actual_rs.output.split("\n")} - expected = {x[5:] for x in expected_rs.output.split("\n")} + expected = {x[12:] for x in expected_rs.output.split("\n")} self.assertSetEqual( actual, expected, f"Elements in {expect_table} and {target_table} differ." @@ -246,7 +261,9 @@ def test_load_versioned_partitioned(self) -> None: def test_load_versioned_parameter_validation(self) -> None: self._run_template_with_bad_arguments( - template_name="load/versioned", template_args={}, num_exp_errors=7 + template_name="load_versioned_template_only", + template_args={}, + num_exp_errors=7, ) good_template_args = { @@ -272,7 +289,7 @@ def test_load_versioned_parameter_validation(self) -> None: } self._run_template_with_bad_arguments( - template_name="load/versioned", + template_name="load_versioned_template_only", template_args={ **good_template_args, **{ @@ -294,7 +311,7 @@ def test_load_versioned_parameter_validation(self) -> None: ) self._run_template_with_bad_arguments( - template_name="load/versioned", + template_name="load_versioned_template_only", template_args={ **good_template_args, **{ @@ -333,7 +350,7 @@ def test_load_versioned_bad_target_schema(self) -> None: } self._run_template_with_bad_target_schema( - template_name="load/versioned", + template_name="load_versioned_template_only", template_args=template_args, ) @@ -359,7 +376,12 @@ def 
test_load_fact_snapshot(self) -> None: actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") - assert actual_rs.output == expected_rs.output + actual = {x for x in actual_rs.output.split("\n")} + expected = {x for x in expected_rs.output.split("\n")} + + self.assertSetEqual( + actual, expected, f"Elements in {expect_table} and {target_table} differ." + ) def test_load_fact_snapshot_empty_source(self) -> None: test_schema = "vdkprototypes" @@ -383,7 +405,12 @@ def test_load_fact_snapshot_empty_source(self) -> None: actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") - assert actual_rs.output == expected_rs.output + actual = {x for x in actual_rs.output.split("\n")} + expected = {x for x in expected_rs.output.split("\n")} + + self.assertSetEqual( + actual, expected, f"Elements in {expect_table} and {target_table} differ." + ) def test_load_fact_snapshot_partition(self) -> None: test_schema = "vdkprototypes" @@ -404,25 +431,24 @@ def test_load_fact_snapshot_partition(self) -> None: }, ) - actual_rs = set( - self.job_input.execute_query(f"SELECT * FROM {test_schema}.{target_table}") - ) - expected_rs = set( - self.job_input.execute_query(f"SELECT * FROM {test_schema}.{expect_table}") - ) + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") + expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + + actual = {x for x in actual_rs.output.split("\n")} + expected = {x for x in expected_rs.output.split("\n")} self.assertSetEqual( - expected_rs, - actual_rs, - f"Elements in {expect_table} and {target_table} differ.", + actual, expected, f"Elements in {expect_table} and {target_table} differ." 
) def test_load_fact_snapshot_parameter_validation(self) -> None: self._run_template_with_bad_arguments( - template_name="load/fact/snapshot", template_args={}, num_exp_errors=5 + template_name="load_fact_snapshot_template_only", + template_args={}, + num_exp_errors=5, ) self._run_template_with_bad_arguments( - template_name="load/fact/snapshot", + template_name="load_fact_snapshot_template_only", template_args={"source_view": "foo", "target_table": None}, num_exp_errors=4, ) @@ -438,54 +464,10 @@ def test_load_fact_snapshot_bad_target_schema(self) -> None: } self._run_template_with_bad_target_schema( - template_name="load/fact/snapshot", + template_name="load_fact_snapshot_template_only", template_args=template_args, ) - def test_template_user_error(self): - template_args = { - "source_schema": "vdkprototypes", - "source_view": "vw_dim_org", - "target_schema": "vdkprototypes", - "target_table": "dw_dim_org", - } - - def run_job(): - self.job_input.execute_template( - template_name="load/dimension/scd1", template_args=template_args - ) - - with patch( - "vacloud.vdk.templates.template_executor.JobInput.execute_query", - side_effect=[DEFAULT, Exception], - ): # TemplateExecutor makes 5 calls to execute_query (given 4 - # sql files in a template), errors in the first two files - # would be due to user errors, while errors in the second two files due to - # platform error, so we patch in an exception in the execution of the first file - self.assertRaises(errors.DeriverCodeError, run_job) - - def test_template_platform_error(self): - template_args = { - "source_schema": "vdkprototypes", - "source_view": "vw_dim_org", - "target_schema": "vdkprototypes", - "target_table": "dw_dim_org", - } - - def run_job(): - self.job_input.execute_template( - template_name="load/dimension/scd1", template_args=template_args - ) - - with patch( - "vacloud.vdk.templates.template_executor.JobInput.execute_query", - side_effect=[DEFAULT, DEFAULT, DEFAULT, Exception], - ): # TemplateExecutor 
makes 5 calls to execute_query (given 4 - # sql files in a template), errors in the first two files - # would be due to user errors, while errors in the second two files due to - # platform error, so we patch in an exception in the execution of the first file - self.assertRaises(Exception) - def _run_job(self, job_name: str, args: dict): return self.__runner.invoke( [ @@ -511,26 +493,34 @@ def _run_query(self, query_string): def _run_template_with_bad_arguments( self, template_name: str, template_args: dict, num_exp_errors: int ) -> None: - def just_rethrow(*_, **kwargs): - raise kwargs["exception"] - - errors.log_and_rethrow = MagicMock(side_effect=just_rethrow) expected_error_regex = re.escape( f'{num_exp_errors} validation {"errors" if num_exp_errors > 1 else "error"} ' f"for {template_name} template" ) - with self.assertRaisesRegex(Exception, expected_error_regex): - result = self._run_job(template_name, template_args) - errors.log_and_rethrow.assert_called_once_with( - to_be_fixed_by=errors.ResolvableBy.USER_ERROR, - log=ANY, - what_happened="Template execution in Data Job finished with error", - why_it_happened=ANY, - consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, - countermeasures=errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION, - exception=ANY, + def just_rethrow(*_, **kwargs): + raise Exception(expected_error_regex) + + errors.log_and_rethrow = MagicMock(side_effect=just_rethrow) + + result = self._run_job(template_name, template_args) + assert expected_error_regex in result.output + assert ( + errors.log_and_rethrow.call_args[1]["what_happened"] + == "Failed executing job." 
+ ) + assert ( + errors.log_and_rethrow.call_args[1]["why_it_happened"] + == f"An exception occurred, exception message was: {expected_error_regex}" + ) + assert ( + errors.log_and_rethrow.call_args[1]["consequences"] + == errors.MSG_CONSEQUENCE_TERMINATING_APP + ) + assert ( + errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION + in errors.log_and_rethrow.call_args[1]["countermeasures"] ) def _run_template_with_bad_target_schema( @@ -570,13 +560,12 @@ def _run_template_with_bad_target_schema( ) def just_throw(*_, **kwargs): - raise Exception() + raise Exception(expected_why_it_happened_msg) errors.log_and_throw = MagicMock(side_effect=just_throw) - with self.assertRaises(Exception) as context: - self._run_job(template_name, template_args) - + res = self._run_job(template_name, template_args) + assert expected_why_it_happened_msg in res.output errors.log_and_throw.assert_called_once_with( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=ANY, From c5705acd3af1f642b20ad6c715d5c322d78af219 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Tue, 18 Jan 2022 18:33:09 +0200 Subject: [PATCH 3/9] comments and refactor Signed-off-by: mrMoZ1 --- .../src/vdk/plugin/impala/impala_helper.py | 2 +- .../src/vdk/plugin/impala/impala_plugin.py | 13 ------- .../src/vdk/plugin/impala/templates/errors.py | 20 ----------- .../templates/load/dimension/scd2/README.md | 2 +- .../load/fact/snapshot/00-definition.py | 36 ------------------- .../templates/load/fact/snapshot/README.md | 4 +-- .../validators/dimension_scd1_definition.py | 10 +++--- .../validators/dimension_scd2_definition.py | 12 ++++--- .../validators/fact_snapshot_definition.py | 10 +++--- .../load/validators/versioned_definition.py | 10 +++--- .../impala/templates/load/versioned/README.md | 2 +- ...tor.py => template_arguments_validator.py} | 12 +++---- .../02_run_load_dimension_scd1_template.py | 2 +- .../01_run_load_dimension_scd1_template.py | 2 +- .../02_run_load_dimension_scd1_template.py | 2 +- 
.../02_run_load_dimension_scd2_template.py | 2 +- .../01_run_load_dimension_scd2_template.py | 2 +- .../02_run_load_fact_snapshot_template.py | 2 +- .../02_run_load_fact_snapshot_template.py | 2 +- .../run_fact_snapshot_template.py | 2 +- .../02_run_load_fact_snapshot_template.py | 2 +- .../02_run_load_versioned_template.py | 2 +- .../01_run_versioned_template.py | 2 +- .../02_run_load_versioned_template.py | 2 +- .../tests/template_regression_test.py | 9 +++-- 25 files changed, 51 insertions(+), 115 deletions(-) delete mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py delete mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/{template_executor.py => template_arguments_validator.py} (87%) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py index 899d192e78..7aebeb9f6f 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_helper.py @@ -116,7 +116,7 @@ def ensure_table_format_is_parquet(self, table_name, table_description): what_happened="Cannot determine the target table file format, which is needed to load data into it.", why_it_happened="There's a bug in VDK code.", consequences="Application will exit.", - countermeasures="Report this bug to Super Collider team.", + countermeasures="Report this bug to versatile data kit team.", ) def generate_parquet_schema_from_table_schema(self, table_columns): diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py index 4d45d7b14c..4f65153eb1 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py +++ 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py @@ -136,19 +136,6 @@ def vdk_start(plugin_registry: IPluginRegistry, command_line_args: List): plugin_registry.load_plugin_with_hooks_impl(ImpalaPlugin(), "impala-plugin") -def db_connection_recover_operation(recovery_cursor: RecoveryCursor) -> None: - impala_error_handler = ImpalaErrorHandler() - - if impala_error_handler.handle_error( - recovery_cursor.get_exception(), recovery_cursor - ): - logging.getLogger(__name__).info( - "Error handled successfully! Query execution has succeeded." - ) - else: - raise recovery_cursor.get_exception() - - def get_jobs_parent_directory() -> pathlib.Path: current_dir = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) jobs_dir = current_dir.joinpath("templates") diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py deleted file mode 100644 index ab27010464..0000000000 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/errors.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2021 VMware, Inc. 
-# SPDX-License-Identifier: Apache-2.0 -from pydantic import error_wrappers -from pydantic import ValidationError - - -class TemplateParametersError(Exception): - __slots__ = "cause", "template_name" - - def __init__(self, cause: ValidationError, template_name: str) -> None: - self.cause = cause - self.template_name = template_name - - def __str__(self) -> str: - validation_errors = self.cause.errors() - no_errors = len(validation_errors) - return ( - f'{no_errors} validation error{"" if no_errors == 1 else "s"} for {self.template_name} template\n' - f"{error_wrappers.display_errors(validation_errors)}" - ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md index 12500f4f47..318a3a0680 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/README.md @@ -1,6 +1,6 @@ ### Purpose: -Template used to load raw data from Super Collider Data Lake to target 'Slowly Changing Dimension Type 2' table in Super Collider Data Warehouse. +Template used to load raw data from a Data Lake to target 'Slowly Changing Dimension Type 2' table in a Data Warehouse. ### Details: diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py deleted file mode 100644 index f9552c07b6..0000000000 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-definition.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2021 VMware, Inc. 
-# SPDX-License-Identifier: Apache-2.0 -from pydantic import BaseModel -from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.template_executor import TemplateExecutor - - -class FactDailySnapshotParams(BaseModel): - target_schema: str - target_table: str - source_schema: str - source_view: str - last_arrival_ts: str - - -class FactDailySnapshot(TemplateExecutor): - TemplateParams = FactDailySnapshotParams - - def __init__(self) -> None: - super().__init__( - template_name="load/fact/snapshot", - sql_files=[ - "00-test-if-view-matches-target.sql", - "01-insert-into-target.sql", - "02-refresh.sql", - "03-compute-stats.sql", - ], - sql_files_platform_is_responsible=[ - "02-refresh.sql", - "03-compute-stats.sql", - ], - ) - - -def run(job_input: IJobInput): - FactDailySnapshot().start(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md index bb57f400cb..d521f6a820 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/README.md @@ -1,6 +1,6 @@ ### Purpose: -This template can be used to load raw data from SuperCollider Data Lake to target 'Snapshot Periodic Fact Table' in Data Warehouse. +This template can be used to load raw data from Data Lake to target 'Snapshot Periodic Fact Table' in Data Warehouse. In summary, it appends a snapshot of records observed between time t1 and t2 from the source table to the target table, truncating all present target table records observed after t1. @@ -31,7 +31,7 @@ In order to use this template you need to ensure the following: ### Sample Usage: Say there is SDDC-related 'Snapshot Periodic Fact Table' called 'fact_sddc_daily' in 'history' schema. 
-Updating it with the latest raw data from the Super Collider Data Lake (from source view called 'vw_fact_sddc_daily' in 'default' schema) is done in the following manner: +Updating it with the latest raw data from a Data Lake (from source view called 'vw_fact_sddc_daily' in 'default' schema) is done in the following manner: ```python def run(job_input): diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py index d3e6dde2ee..cfa96634ce 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from pydantic import BaseModel from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.template_executor import TemplateExecutor +from vdk.plugin.impala.templates.template_arguments_validator import ( + TemplateArgumentsValidator, +) class SlowlyChangingDimensionTypeOverwriteParams(BaseModel): @@ -12,7 +14,7 @@ class SlowlyChangingDimensionTypeOverwriteParams(BaseModel): source_view: str -class SlowlyChangingDimensionTypeOverwrite(TemplateExecutor): +class SlowlyChangingDimensionTypeOverwrite(TemplateArgumentsValidator): TemplateParams = SlowlyChangingDimensionTypeOverwriteParams def __init__(self) -> None: @@ -31,7 +33,7 @@ def __init__(self) -> None: ) -def validate_arguments(job_input: IJobInput): - return SlowlyChangingDimensionTypeOverwrite().start( +def get_validated_arguments(job_input: IJobInput): + return SlowlyChangingDimensionTypeOverwrite().get_validated_args( job_input, job_input.get_arguments() ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py index b0314b779a..2a2c1ff195 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from pydantic import BaseModel from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.template_executor import TemplateExecutor +from vdk.plugin.impala.templates.template_arguments_validator import ( + TemplateArgumentsValidator, +) class SlowlyChangingDimensionType2Params(BaseModel): @@ -17,7 +19,7 @@ class SlowlyChangingDimensionType2Params(BaseModel): id_column: str -class SlowlyChangingDimensionType2(TemplateExecutor): +class SlowlyChangingDimensionType2(TemplateArgumentsValidator): TemplateParams = SlowlyChangingDimensionType2Params def __init__(self) -> None: @@ -36,5 +38,7 @@ def __init__(self) -> None: ) -def validate_arguments(job_input: IJobInput): - return SlowlyChangingDimensionType2().start(job_input, job_input.get_arguments()) +def get_validated_arguments(job_input: IJobInput): + return SlowlyChangingDimensionType2().get_validated_args( + job_input, job_input.get_arguments() + ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py index 43b76c80c1..8b8df41e39 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from pydantic import BaseModel from vdk.api.job_input import IJobInput -from 
vdk.plugin.impala.templates.template_executor import TemplateExecutor +from vdk.plugin.impala.templates.template_arguments_validator import ( + TemplateArgumentsValidator, +) class FactDailySnapshotParams(BaseModel): @@ -13,7 +15,7 @@ class FactDailySnapshotParams(BaseModel): last_arrival_ts: str -class FactDailySnapshot(TemplateExecutor): +class FactDailySnapshot(TemplateArgumentsValidator): TemplateParams = FactDailySnapshotParams def __init__(self) -> None: @@ -32,5 +34,5 @@ def __init__(self) -> None: ) -def validate_arguments(job_input: IJobInput): - return FactDailySnapshot().start(job_input, job_input.get_arguments()) +def get_validated_arguments(job_input: IJobInput): + return FactDailySnapshot().get_validated_args(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py index b87bf150f6..86aed0881f 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py @@ -5,7 +5,9 @@ from pydantic import BaseModel from pydantic import validator from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.template_executor import TemplateExecutor +from vdk.plugin.impala.templates.template_arguments_validator import ( + TemplateArgumentsValidator, +) class LoadVersionedParams(BaseModel): @@ -36,7 +38,7 @@ def passwords_match(cls, tracked_columns, values, **kwargs): return tracked_columns -class LoadVersioned(TemplateExecutor): +class LoadVersioned(TemplateArgumentsValidator): TemplateParams = LoadVersionedParams def __init__(self) -> None: @@ -70,5 +72,5 @@ def _validate_args(self, args: dict) -> dict: ) -def validate_arguments(job_input: IJobInput): - return LoadVersioned().start(job_input, 
job_input.get_arguments()) +def get_validated_arguments(job_input: IJobInput): + return LoadVersioned().get_validated_args(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md index aa088bfecb..e2b36f7054 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/README.md @@ -1,6 +1,6 @@ ### Purpose: -Template used to load raw data from Super Collider Data Lake to target 'Slowly Changing Dimension Type 2' table in Super Collider Data Warehouse. +Template used to load raw data from a Data Lake to target 'Slowly Changing Dimension Type 2' table in a Data Warehouse. In summary, it accumulates updates from the data source as versioned records in the target table. ### Details: diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py similarity index 87% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py index 4968ba22dc..b75c59dbaf 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_executor.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py @@ -11,12 +11,11 @@ from vdk.internal.builtin_plugins.run.job_input import JobInput from vdk.internal.core import errors from vdk.plugin.impala.impala_helper import ImpalaHelper -from vdk.plugin.impala.templates.errors import TemplateParametersError log = getLogger(__name__) -class TemplateExecutor: +class TemplateArgumentsValidator: TemplateParams: 
Type[BaseModel] def __init__( @@ -31,7 +30,7 @@ def __init__( self.sql_files = sql_files self.sql_files_platform_is_responsible = sql_files_platform_is_responsible # used to decide blamee for failure, defaults to user - def start(self, job_input: IJobInput, args: dict) -> dict: + def get_validated_args(self, job_input: IJobInput, args: dict) -> dict: args = self._validate_args(args) args["_vdk_template_insert_partition_clause"] = "" @@ -67,15 +66,12 @@ def _validate_args(self, args: dict) -> dict: try: return self.TemplateParams(**args).dict() except ValidationError as error: - wrapped_error = TemplateParametersError( - error, template_name=self.template_name - ) errors.log_and_rethrow( to_be_fixed_by=errors.ResolvableBy.USER_ERROR, log=log, what_happened="Template execution in Data Job finished with error", - why_it_happened=errors.MSG_WHY_FROM_EXCEPTION(wrapped_error), + why_it_happened=errors.MSG_WHY_FROM_EXCEPTION(error), consequences=errors.MSG_CONSEQUENCE_TERMINATING_APP, countermeasures=errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION, - exception=wrapped_error, + exception=error, ) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py index 3380b319eb..15e191ad30 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.validate_arguments(job_input) + args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", template_args=args, diff --git 
a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py index 3380b319eb..15e191ad30 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.validate_arguments(job_input) + args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py index 3380b319eb..15e191ad30 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.validate_arguments(job_input) + args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py index b2427a48bb..2012877d5f 100644 --- 
a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd2_definition.validate_arguments(job_input) + args = dimension_scd2_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd2", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py index b2427a48bb..2012877d5f 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd2_definition.validate_arguments(job_input) + args = dimension_scd2_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd2", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py index a116b72996..d22b8c374b 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.validate_arguments(job_input) + args = 
fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py index a116b72996..d22b8c374b 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.validate_arguments(job_input) + args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py index a116b72996..d22b8c374b 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.validate_arguments(job_input) + args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py 
b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py index a116b72996..d22b8c374b 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.validate_arguments(job_input) + args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py index 8e0696c6a6..67d432b3fd 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = versioned_definition.validate_arguments(job_input) + args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py index 8e0696c6a6..67d432b3fd 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = 
versioned_definition.validate_arguments(job_input) + args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py index 8e0696c6a6..67d432b3fd 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py @@ -10,7 +10,7 @@ def run(job_input: IJobInput) -> None: - args = versioned_definition.validate_arguments(job_input) + args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", template_args=args, diff --git a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py index aeadf08de5..2e5ef5e4e1 100644 --- a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py +++ b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py @@ -508,11 +508,10 @@ def just_rethrow(*_, **kwargs): assert expected_error_regex in result.output assert ( errors.log_and_rethrow.call_args[1]["what_happened"] - == "Failed executing job." 
+ == "Template execution in Data Job finished with error" ) - assert ( - errors.log_and_rethrow.call_args[1]["why_it_happened"] - == f"An exception occurred, exception message was: {expected_error_regex}" + assert errors.log_and_rethrow.call_args[1]["why_it_happened"].startswith( + f"An exception occurred, exception message was: {num_exp_errors} validation error" ) assert ( errors.log_and_rethrow.call_args[1]["consequences"] @@ -520,7 +519,7 @@ def just_rethrow(*_, **kwargs): ) assert ( errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION - in errors.log_and_rethrow.call_args[1]["countermeasures"] + == errors.log_and_rethrow.call_args[1]["countermeasures"] ) def _run_template_with_bad_target_schema( From 1b37069a691b80d825d0418d9a8d5b441515058d Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Thu, 20 Jan 2022 17:50:47 +0200 Subject: [PATCH 4/9] refactor template_regression tests and comments Signed-off-by: mrMoZ1 --- .../src/vdk/plugin/impala/impala_plugin.py | 16 ++++ .../src/vdk/plugin/impala/templates/README.md | 4 +- .../tests/template_regression_test.py | 75 +++++++++++-------- 3 files changed, 63 insertions(+), 32 deletions(-) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py index 4f65153eb1..441a08a782 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/impala_plugin.py @@ -77,18 +77,34 @@ def initialize_job(self, context: JobContext) -> None: "load/dimension/scd1", pathlib.Path(get_job_path("load/dimension/scd1")) ) + context.templates.add_template( + "scd1", pathlib.Path(get_job_path("load/dimension/scd1")) + ) + context.templates.add_template( "load/dimension/scd2", pathlib.Path(get_job_path("load/dimension/scd2")) ) + context.templates.add_template( + "scd2", pathlib.Path(get_job_path("load/dimension/scd2")) + ) + context.templates.add_template( "load/fact/snapshot", 
pathlib.Path(get_job_path("load/fact/snapshot")) ) + context.templates.add_template( + "snapshot", pathlib.Path(get_job_path("load/fact/snapshot")) + ) + context.templates.add_template( "load/versioned", pathlib.Path(get_job_path("load/versioned")) ) + context.templates.add_template( + "versioned", pathlib.Path(get_job_path("load/versioned")) + ) + @staticmethod @hookimpl(hookwrapper=True, tryfirst=True) def run_step(context: JobContext, step: Step) -> None: diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md index 8754d37def..b69adc88c8 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/README.md @@ -1,3 +1,3 @@ ### Types of data loading templates -- Slowly Changing Dimension Type 1 - [see details and usage](https://gitlab.eng.vmware.com/product-analytics/data-pipelines/vdk/tree/master/src/vacloud/vdk//templates/load/dimension/scd1/README.md) -- Snapshot Accumulating Fact Table - [see details and usage](https://gitlab.eng.vmware.com/product-analytics/data-pipelines/vdk/tree/master/src/vacloud/vdk/templates/load/fact/snapshot/README.md) +- Slowly Changing Dimension Type 1 - TBD +- Snapshot Accumulating Fact Table - TBD diff --git a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py index 2e5ef5e4e1..562bae12d2 100644 --- a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py +++ b/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py @@ -32,6 +32,7 @@ VDK_IMPALA_PORT: "21050", }, ) +@pytest.mark.usefixtures("impala_service") class TemplateRegressionTests(unittest.TestCase): def setUp(self) -> None: self.__runner = CliEntryBasedTestRunner(impala_plugin) @@ -42,7 +43,7 @@ def test_load_dimension_scd1(self) -> None: source_view = "vw_dim_org" 
target_table = "dw_dim_org" - self._run_job( + res = self._run_job( "load_dimension_scd1_template_job", { "source_schema": test_schema, @@ -51,9 +52,10 @@ def test_load_dimension_scd1(self) -> None: "target_table": target_table, }, ) - + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{source_view}") + assert actual_rs.output and expected_rs.output assert actual_rs.output == expected_rs.output def test_load_dimension_scd1_partitioned(self) -> None: @@ -61,7 +63,7 @@ def test_load_dimension_scd1_partitioned(self) -> None: source_view = "vw_dim_org_partition_test" target_table = "dw_dim_org_partitioned" - self._run_job( + res = self._run_job( "load_dimension_scd1_template_partition_job", { "source_schema": test_schema, @@ -70,9 +72,11 @@ def test_load_dimension_scd1_partitioned(self) -> None: "target_table": target_table, }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{source_view}") + assert actual_rs.output and expected_rs.output assert actual_rs.output == expected_rs.output def test_load_dimension_scd1_parameter_validation(self) -> None: @@ -106,7 +110,7 @@ def test_load_dimension_scd2(self) -> None: target_table = "dw_scmdb_people" expect_table = "ex_scmdb_people" - self._run_job( + res = self._run_job( "load_dimension_scd2_template_job", { "source_schema": test_schema, @@ -123,13 +127,14 @@ def test_load_dimension_scd2(self) -> None: "id_column": "id", }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ actual = {x[38:] for x in actual_rs.output.split("\n")} expected = {x[5:] for x in 
expected_rs.output.split("\n")} - + assert actual_rs.output and expected_rs.output self.assertSetEqual( expected, actual, f"Elements in {expect_table} and {target_table} differ." ) @@ -171,7 +176,7 @@ def test_load_versioned(self) -> None: target_table = "dim_sddc_h" expect_table = "ex_dim_sddc_h" - self._run_job( + res = self._run_job( "load_versioned_template_job", { "source_schema": test_schema, @@ -201,9 +206,11 @@ def test_load_versioned(self) -> None: "updated_at_column": "updated_at", }, ) + assert not res.exception + actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") - + assert actual_rs.output and expected_rs.output # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ actual = {x[38:] for x in actual_rs.output.split("\n")} expected = {x[12:] for x in expected_rs.output.split("\n")} @@ -218,7 +225,7 @@ def test_load_versioned_partitioned(self) -> None: target_table = "dim_sddc_h_partitioned" expect_table = "ex_dim_sddc_h_partition_test" - self._run_job( + res = self._run_job( "load_versioned_template_partition_job", { "source_schema": test_schema, @@ -248,9 +255,11 @@ def test_load_versioned_partitioned(self) -> None: "updated_at_column": "updated_at", }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + assert actual_rs.output and expected_rs.output # delete first (surrogate key) column from the two results, as those are uniquely generated and might differ actual = {x[38:] for x in actual_rs.output.split("\n")} expected = {x[12:] for x in expected_rs.output.split("\n")} @@ -360,7 +369,7 @@ def test_load_fact_snapshot(self) -> None: target_table = "dw_fact_sddc_daily" expect_table = "ex_fact_sddc_daily" - self._run_job( + res = self._run_job( 
"load_fact_snapshot_template_job", { "source_schema": test_schema, @@ -372,9 +381,11 @@ def test_load_fact_snapshot(self) -> None: "last_arrival_ts": "updated_at", }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + assert actual_rs.output and expected_rs.output actual = {x for x in actual_rs.output.split("\n")} expected = {x for x in expected_rs.output.split("\n")} @@ -389,7 +400,7 @@ def test_load_fact_snapshot_empty_source(self) -> None: target_table = "dw_fact_sddc_daily_empty_source" expect_table = "ex_fact_sddc_daily_empty_source" - self._run_job( + res = self._run_job( "load_fact_snapshot_template_job_empty_source", { "source_schema": test_schema, @@ -401,9 +412,11 @@ def test_load_fact_snapshot_empty_source(self) -> None: "last_arrival_ts": "updated_at", }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + assert actual_rs.output and expected_rs.output actual = {x for x in actual_rs.output.split("\n")} expected = {x for x in expected_rs.output.split("\n")} @@ -418,7 +431,7 @@ def test_load_fact_snapshot_partition(self) -> None: target_table = "dw_fact_sddc_daily_partition" expect_table = "ex_fact_sddc_daily_partition" - self._run_job( + res = self._run_job( "load_fact_snapshot_template_partition_job", { "source_schema": test_schema, @@ -430,9 +443,11 @@ def test_load_fact_snapshot_partition(self) -> None: "last_arrival_ts": "updated_at", }, ) + assert not res.exception actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") + assert actual_rs.output and expected_rs.output actual = {x for x in actual_rs.output.split("\n")} expected = {x for x in expected_rs.output.split("\n")} @@ -502,25 +517,25 @@ def 
_run_template_with_bad_arguments( def just_rethrow(*_, **kwargs): raise Exception(expected_error_regex) - errors.log_and_rethrow = MagicMock(side_effect=just_rethrow) - - result = self._run_job(template_name, template_args) - assert expected_error_regex in result.output - assert ( - errors.log_and_rethrow.call_args[1]["what_happened"] - == "Template execution in Data Job finished with error" - ) - assert errors.log_and_rethrow.call_args[1]["why_it_happened"].startswith( - f"An exception occurred, exception message was: {num_exp_errors} validation error" - ) - assert ( - errors.log_and_rethrow.call_args[1]["consequences"] - == errors.MSG_CONSEQUENCE_TERMINATING_APP - ) - assert ( - errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION - == errors.log_and_rethrow.call_args[1]["countermeasures"] - ) + with patch.object(errors, "log_and_rethrow") as patched_log_and_rethrow: + patched_log_and_rethrow.side_effect = just_rethrow + result = self._run_job(template_name, template_args) + assert expected_error_regex in result.output + assert ( + errors.log_and_rethrow.call_args[1]["what_happened"] + == "Template execution in Data Job finished with error" + ) + assert errors.log_and_rethrow.call_args[1]["why_it_happened"].startswith( + f"An exception occurred, exception message was: {num_exp_errors} validation error" + ) + assert ( + errors.log_and_rethrow.call_args[1]["consequences"] + == errors.MSG_CONSEQUENCE_TERMINATING_APP + ) + assert ( + errors.MSG_COUNTERMEASURE_FIX_PARENT_EXCEPTION + == errors.log_and_rethrow.call_args[1]["countermeasures"] + ) def _run_template_with_bad_target_schema( self, template_name: str, template_args: dict From 459cdf3da7e676497f792c3b2edd065822f82a63 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Fri, 21 Jan 2022 13:46:50 +0200 Subject: [PATCH 5/9] comments Signed-off-by: mrMoZ1 --- .../tests/{jobs => functional}/__init__.py | 0 .../jobs}/__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_dimension_scd1_template.py | 0 
.../load_dimension_scd1_template_job}/__init__.py | 0 .../01_run_load_dimension_scd1_template.py | 0 .../load_dimension_scd1_template_only}/__init__.py | 0 .../01_prepare_input_data.py | 2 +- .../02_run_load_dimension_scd1_template.py | 0 .../__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_dimension_scd2_template.py | 0 .../load_dimension_scd2_template_job}/__init__.py | 0 .../01_run_load_dimension_scd2_template.py | 0 .../load_dimension_scd2_template_only}/__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_fact_snapshot_template.py | 0 .../load_fact_snapshot_template_job}/__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_fact_snapshot_template.py | 0 .../__init__.py | 0 .../load_fact_snapshot_template_only}/__init__.py | 0 .../run_fact_snapshot_template.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_fact_snapshot_template.py | 0 .../__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_versioned_template.py | 0 .../jobs/load_versioned_template_job}/__init__.py | 0 .../01_run_versioned_template.py | 0 .../jobs/load_versioned_template_only}/__init__.py | 0 .../01_prepare_input_data.py | 0 .../02_run_load_versioned_template.py | 0 .../__init__.py | 2 ++ .../{ => functional}/template_regression_test.py | 12 ++++++++++-- 35 files changed, 13 insertions(+), 3 deletions(-) rename projects/vdk-plugins/vdk-impala/tests/{jobs => functional}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_dimension_scd1_template_job => functional/jobs}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_dimension_scd1_template_only => functional/jobs/load_dimension_scd1_template_job}/__init__.py (100%) 
rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_dimension_scd1_template_partition_job => functional/jobs/load_dimension_scd1_template_only}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py (98%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_dimension_scd2_template_job => functional/jobs/load_dimension_scd1_template_partition_job}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_dimension_scd2_template_only => functional/jobs/load_dimension_scd2_template_job}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_fact_snapshot_template_job => functional/jobs/load_dimension_scd2_template_only}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_fact_snapshot_template_job_empty_source => functional/jobs/load_fact_snapshot_template_job}/__init__.py (100%) rename 
projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_fact_snapshot_template_only => functional/jobs/load_fact_snapshot_template_job_empty_source}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_fact_snapshot_template_partition_job => functional/jobs/load_fact_snapshot_template_only}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_versioned_template_job => functional/jobs/load_fact_snapshot_template_partition_job}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_versioned_template_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_versioned_template_job/02_run_load_versioned_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_versioned_template_only => functional/jobs/load_versioned_template_job}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_versioned_template_only/01_run_versioned_template.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{jobs/load_versioned_template_partition_job => functional/jobs/load_versioned_template_only}/__init__.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => 
functional}/jobs/load_versioned_template_partition_job/01_prepare_input_data.py (100%) rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py (100%) create mode 100644 projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/__init__.py rename projects/vdk-plugins/vdk-impala/tests/{ => functional}/template_regression_test.py (98%) diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py 
rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py similarity index 98% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py rename to 
projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py index 28a010253e..b47cbc3ab0 100644 --- a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/01_prepare_input_data.py @@ -56,7 +56,7 @@ def run(job_input: IJobInput) -> None: INSERT OVERWRITE TABLE `{source_schema}`.`{source_view}` VALUES ( (1, "mullen@actual.com" , 2, 32, "CUSTOMER_MSP_TENANT", "actual Master Org"), (2, "johnlocke@vmware.com" , 1, 6 , "CUSTOMER_POC" , "VMware" ), - (3, "lilly.johnsonn@goofys.com", 2, 32, "CUSTOMER" , "Goofy ), + (3, "lilly.johnsonn@goofys.com", 2, 32, "CUSTOMER" , "Goofy" ), (4, "jilliandoe@uncanny.ca" , 2, 32, "PARTNER_SISO" , "Uncanny Company" ), (5, "jane.doe@vmware.com" , 2, 32, "CUSTOMER" , "VMware" ), (6, "john.doe@pharmamed.com" , 2, 32, "CUSTOMER" , "PharmaMed" ), diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/__init__.py rename to 
projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py rename to 
projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/__init__.py rename to 
projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/__init__.py similarity index 100% rename from 
projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py 
b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_job/02_run_load_versioned_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py similarity index 100% rename from 
projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_only/01_run_versioned_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/__init__.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/__init__.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/__init__.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/01_prepare_input_data.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/01_prepare_input_data.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/01_prepare_input_data.py diff --git a/projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py similarity index 100% rename from projects/vdk-plugins/vdk-impala/tests/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py rename to projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/__init__.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/__init__.py new file mode 100644 index 0000000000..50c007580a --- /dev/null +++ 
b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py similarity index 98% rename from projects/vdk-plugins/vdk-impala/tests/template_regression_test.py rename to projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py index 562bae12d2..75cdf7ec76 100644 --- a/projects/vdk-plugins/vdk-impala/tests/template_regression_test.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py @@ -76,8 +76,14 @@ def test_load_dimension_scd1_partitioned(self) -> None: actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{source_view}") + + actual = {x for x in actual_rs.output.split("\n")} + expected = {x for x in expected_rs.output.split("\n")} + assert actual_rs.output and expected_rs.output - assert actual_rs.output == expected_rs.output + self.assertSetEqual( + expected, actual, f"Elements in {source_view} and {target_table} differ." + ) def test_load_dimension_scd1_parameter_validation(self) -> None: self._run_template_with_bad_arguments( @@ -412,7 +418,9 @@ def test_load_fact_snapshot_empty_source(self) -> None: "last_arrival_ts": "updated_at", }, ) - assert not res.exception + # Expecting data job not to finish due to empty source. 
+ assert res.exception + assert "Source view returns no results" in res.exception.args[0] actual_rs = self._run_query(f"SELECT * FROM {test_schema}.{target_table}") expected_rs = self._run_query(f"SELECT * FROM {test_schema}.{expect_table}") From f6d4a5f3f62e8f436e275906214176fe2b4f5547 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Fri, 21 Jan 2022 14:01:18 +0200 Subject: [PATCH 6/9] run template tests against docker impala Signed-off-by: mrMoZ1 --- .../vdk-impala/tests/functional/template_regression_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py b/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py index 75cdf7ec76..0d4493ff9c 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/template_regression_test.py @@ -21,9 +21,6 @@ VDK_IMPALA_PORT = "VDK_IMPALA_PORT" -@pytest.mark.skip( - reason="We need to test this with a recent impala instance. 
Current test instance is too old" -) @patch.dict( os.environ, { @@ -37,6 +34,7 @@ class TemplateRegressionTests(unittest.TestCase): def setUp(self) -> None: self.__runner = CliEntryBasedTestRunner(impala_plugin) time.sleep(10) # wait for impala instance to come online + self._run_query("CREATE DATABASE IF NOT EXISTS vdkprototypes") def test_load_dimension_scd1(self) -> None: test_schema = "vdkprototypes" From 9e989142a79efc6d992e8f1e2a6de0fdc91c6207 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Wed, 26 Jan 2022 09:46:35 +0200 Subject: [PATCH 7/9] refactor, comments Signed-off-by: mrMoZ1 --- projects/vdk-plugins/vdk-impala/README.md | 4 ++ .../scd1/00-dimension-scd1-definition.py} | 18 ++------- ...sql => 01-test-if-view-matches-target.sql} | 0 ...o-target.sql => 02-insert-into-target.sql} | 0 .../scd1/{02-refresh.sql => 03-refresh.sql} | 0 ...compute-stats.sql => 04-compute-stats.sql} | 0 .../scd2/00-dimension-scd2-definition.py} | 18 ++------- ...sql => 01-test-if-view-matches-target.sql} | 0 ...o-target.sql => 02-insert-into-target.sql} | 0 .../scd2/{02-refresh.sql => 03-refresh.sql} | 0 ...compute-stats.sql => 04-compute-stats.sql} | 0 .../snapshot/00-fact-snapshot-definition.py | 26 +++++++++++++ ...sql => 01-test-if-view-matches-target.sql} | 0 ...o-target.sql => 02-insert-into-target.sql} | 0 .../{02-refresh.sql => 03-refresh.sql} | 0 ...compute-stats.sql => 04-compute-stats.sql} | 0 .../templates/load/validators/__init__.py | 2 - .../validators/fact_snapshot_definition.py | 38 ------------------- .../00-versioned-definition.py} | 20 ++-------- ...sql => 01-test-if-view-matches-target.sql} | 0 ...o-target.sql => 02-insert-into-target.sql} | 0 .../{02-refresh.sql => 03-refresh.sql} | 0 ...compute-stats.sql => 04-compute-stats.sql} | 0 .../templates/template_arguments_validator.py | 15 ++------ .../02_run_load_dimension_scd1_template.py | 4 +- .../01_run_load_dimension_scd1_template.py | 4 +- .../02_run_load_dimension_scd1_template.py | 4 +- 
.../02_run_load_dimension_scd2_template.py | 4 +- .../01_run_load_dimension_scd2_template.py | 4 +- .../02_run_load_fact_snapshot_template.py | 4 +- .../02_run_load_fact_snapshot_template.py | 4 +- .../run_fact_snapshot_template.py | 4 +- .../02_run_load_fact_snapshot_template.py | 4 +- .../02_run_load_versioned_template.py | 4 +- .../01_run_versioned_template.py | 4 +- .../02_run_load_versioned_template.py | 4 +- 36 files changed, 55 insertions(+), 134 deletions(-) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{validators/dimension_scd1_definition.py => dimension/scd1/00-dimension-scd1-definition.py} (53%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/{00-test-if-view-matches-target.sql => 01-test-if-view-matches-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/{01-insert-into-target.sql => 02-insert-into-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/{02-refresh.sql => 03-refresh.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/{03-compute-stats.sql => 04-compute-stats.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{validators/dimension_scd2_definition.py => dimension/scd2/00-dimension-scd2-definition.py} (57%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/{00-test-if-view-matches-target.sql => 01-test-if-view-matches-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/{01-insert-into-target.sql => 02-insert-into-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/{02-refresh.sql => 03-refresh.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/{03-compute-stats.sql 
=> 04-compute-stats.sql} (100%) create mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-fact-snapshot-definition.py rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/{00-test-if-view-matches-target.sql => 01-test-if-view-matches-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/{01-insert-into-target.sql => 02-insert-into-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/{02-refresh.sql => 03-refresh.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/{03-compute-stats.sql => 04-compute-stats.sql} (100%) delete mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py delete mode 100644 projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/{validators/versioned_definition.py => versioned/00-versioned-definition.py} (75%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/{00-test-if-view-matches-target.sql => 01-test-if-view-matches-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/{01-insert-into-target.sql => 02-insert-into-target.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/{02-refresh.sql => 03-refresh.sql} (100%) rename projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/{03-compute-stats.sql => 04-compute-stats.sql} (100%) diff --git a/projects/vdk-plugins/vdk-impala/README.md b/projects/vdk-plugins/vdk-impala/README.md index cf354fc921..de3a4d7e10 100644 --- a/projects/vdk-plugins/vdk-impala/README.md +++ b/projects/vdk-plugins/vdk-impala/README.md @@ -35,6 +35,10 @@ 
Then, from inside the run function in a Python step, you can use the `send_objec Run vdk config-help - search for those prefixed with "IMPALA_" to see what configuration options are available. +# Disclaimer + +This plugin is tested against a specific Impala version: the version shipped in the Impala container image declared in docker-compose.yaml. For more information on the Impala version tested against, please look up that Docker image. + # Testing Testing this plugin locally requires installing the dependencies listed in vdk-plugins/vdk-impala/requirements.txt diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-dimension-scd1-definition.py similarity index 53% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-dimension-scd1-definition.py index cfa96634ce..115a16b2c7 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd1_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-dimension-scd1-definition.py @@ -18,22 +18,10 @@ class SlowlyChangingDimensionTypeOverwrite(TemplateArgumentsValidator): TemplateParams = SlowlyChangingDimensionTypeOverwriteParams def __init__(self) -> None: - super().__init__( - template_name="scd1", - sql_files=[ - "00-test-if-view-matches-target.sql", - "01-insert-into-target.sql", - "02-refresh.sql", - "03-compute-stats.sql", - ], - sql_files_platform_is_responsible=[ - "02-refresh.sql", - "03-compute-stats.sql", - ], - ) + super().__init__() -def get_validated_arguments(job_input: IJobInput): - return SlowlyChangingDimensionTypeOverwrite().get_validated_args( +def run(job_input: IJobInput): +
SlowlyChangingDimensionTypeOverwrite().get_validated_args( job_input, job_input.get_arguments() ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-test-if-view-matches-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/00-test-if-view-matches-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-test-if-view-matches-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-insert-into-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/01-insert-into-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-insert-into-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-refresh.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/02-refresh.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-refresh.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/04-compute-stats.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/03-compute-stats.sql rename to 
projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd1/04-compute-stats.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-dimension-scd2-definition.py similarity index 57% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-dimension-scd2-definition.py index 2a2c1ff195..3885cc9a37 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/dimension_scd2_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-dimension-scd2-definition.py @@ -23,22 +23,10 @@ class SlowlyChangingDimensionType2(TemplateArgumentsValidator): TemplateParams = SlowlyChangingDimensionType2Params def __init__(self) -> None: - super().__init__( - template_name="load/dimension/scd2", - sql_files=[ - "00-test-if-view-matches-target.sql", - "01-insert-into-target.sql", - "02-refresh.sql", - "03-compute-stats.sql", - ], - sql_files_platform_is_responsible=[ - "02-refresh.sql", - "03-compute-stats.sql", - ], - ) + super().__init__() -def get_validated_arguments(job_input: IJobInput): - return SlowlyChangingDimensionType2().get_validated_args( +def run(job_input: IJobInput): + SlowlyChangingDimensionType2().get_validated_args( job_input, job_input.get_arguments() ) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-test-if-view-matches-target.sql similarity index 100% rename from 
projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/00-test-if-view-matches-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-test-if-view-matches-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-insert-into-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/01-insert-into-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-insert-into-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-refresh.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/02-refresh.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-refresh.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/04-compute-stats.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/03-compute-stats.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/dimension/scd2/04-compute-stats.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-fact-snapshot-definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-fact-snapshot-definition.py new file mode 100644 index 0000000000..501cb19cf7 --- /dev/null +++ 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-fact-snapshot-definition.py @@ -0,0 +1,26 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel +from vdk.api.job_input import IJobInput +from vdk.plugin.impala.templates.template_arguments_validator import ( + TemplateArgumentsValidator, +) + + +class FactDailySnapshotParams(BaseModel): + target_schema: str + target_table: str + source_schema: str + source_view: str + last_arrival_ts: str + + +class FactDailySnapshot(TemplateArgumentsValidator): + TemplateParams = FactDailySnapshotParams + + def __init__(self) -> None: + super().__init__() + + +def run(job_input: IJobInput): + FactDailySnapshot().get_validated_args(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-test-if-view-matches-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/00-test-if-view-matches-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-test-if-view-matches-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/01-insert-into-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-refresh.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-compute-stats.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py deleted file mode 100644 index 50c007580a..0000000000 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2021 VMware, Inc. -# SPDX-License-Identifier: Apache-2.0 diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py deleted file mode 100644 index 8b8df41e39..0000000000 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/fact_snapshot_definition.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2021 VMware, Inc. 
-# SPDX-License-Identifier: Apache-2.0 -from pydantic import BaseModel -from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.template_arguments_validator import ( - TemplateArgumentsValidator, -) - - -class FactDailySnapshotParams(BaseModel): - target_schema: str - target_table: str - source_schema: str - source_view: str - last_arrival_ts: str - - -class FactDailySnapshot(TemplateArgumentsValidator): - TemplateParams = FactDailySnapshotParams - - def __init__(self) -> None: - super().__init__( - template_name="load/fact/snapshot", - sql_files=[ - "00-test-if-view-matches-target.sql", - "01-insert-into-target.sql", - "02-refresh.sql", - "03-compute-stats.sql", - ], - sql_files_platform_is_responsible=[ - "02-refresh.sql", - "03-compute-stats.sql", - ], - ) - - -def get_validated_arguments(job_input: IJobInput): - return FactDailySnapshot().get_validated_args(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-versioned-definition.py similarity index 75% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-versioned-definition.py index 86aed0881f..44b7812dc1 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/validators/versioned_definition.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-versioned-definition.py @@ -24,7 +24,7 @@ class LoadVersionedParams(BaseModel): active_to_column: str = "active_to" active_to_max_value: str = "9999-12-31" - @validator("tracked_columns") + @validator("tracked_columns", allow_reuse=True) def passwords_match(cls, tracked_columns, values, **kwargs): value_columns = values.get("value_columns") if 
type(value_columns) == list and not tracked_columns: @@ -42,19 +42,7 @@ class LoadVersioned(TemplateArgumentsValidator): TemplateParams = LoadVersionedParams def __init__(self) -> None: - super().__init__( - template_name="load/versioned", - sql_files=[ - "00-test-if-view-matches-target.sql", - "01-insert-into-target.sql", - "02-refresh.sql", - "03-compute-stats.sql", - ], - sql_files_platform_is_responsible=[ - "02-refresh.sql", - "03-compute-stats.sql", - ], - ) + super().__init__() def _validate_args(self, args: dict) -> dict: args = super()._validate_args(args) @@ -72,5 +60,5 @@ def _validate_args(self, args: dict) -> dict: ) -def get_validated_arguments(job_input: IJobInput): - return LoadVersioned().get_validated_args(job_input, job_input.get_arguments()) +def run(job_input: IJobInput): + LoadVersioned().get_validated_args(job_input, job_input.get_arguments()) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-test-if-view-matches-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/00-test-if-view-matches-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-test-if-view-matches-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-insert-into-target.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/01-insert-into-target.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-insert-into-target.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-refresh.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/02-refresh.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-refresh.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/04-compute-stats.sql similarity index 100% rename from projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/03-compute-stats.sql rename to projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/versioned/04-compute-stats.sql diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py index b75c59dbaf..04aef7c6cc 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py @@ -18,20 +18,11 @@ class TemplateArgumentsValidator: TemplateParams: Type[BaseModel] - def __init__( - self, - template_name: str, - sql_files: List[str], - sql_files_platform_is_responsible: List[str], - ) -> None: - self.template_name = ( - template_name # FIXME: could be inferred from the template path - ) - self.sql_files = sql_files - self.sql_files_platform_is_responsible = sql_files_platform_is_responsible # used to decide blamee for failure, defaults to user + def __init__(self) -> None: + pass def get_validated_args(self, job_input: IJobInput, args: dict) -> dict: - args = self._validate_args(args) + args.update(self._validate_args(args)) args["_vdk_template_insert_partition_clause"] = "" impala_helper = ImpalaHelper(cast(JobInput, 
job_input).get_managed_connection()) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py index 15e191ad30..dd439171f1 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_job/02_run_load_dimension_scd1_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py index 15e191ad30..dd439171f1 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_only/01_run_load_dimension_scd1_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py index 15e191ad30..dd439171f1 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd1_template_partition_job/02_run_load_dimension_scd1_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import dimension_scd1_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd1_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd1", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py index 2012877d5f..a91e341f27 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_job/02_run_load_dimension_scd2_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import dimension_scd2_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd2_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd2", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py index 2012877d5f..a91e341f27 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_dimension_scd2_template_only/01_run_load_dimension_scd2_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. 
# SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import dimension_scd2_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = dimension_scd2_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/dimension/scd2", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py index d22b8c374b..6d070d1dd0 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job/02_run_load_fact_snapshot_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py index d22b8c374b..6d070d1dd0 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_job_empty_source/02_run_load_fact_snapshot_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py index d22b8c374b..6d070d1dd0 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_only/run_fact_snapshot_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. 
# SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py index d22b8c374b..6d070d1dd0 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_fact_snapshot_template_partition_job/02_run_load_fact_snapshot_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import fact_snapshot_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = fact_snapshot_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/fact/snapshot", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py index 67d432b3fd..e5e13a815e 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_job/02_run_load_versioned_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import versioned_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py index 67d432b3fd..e5e13a815e 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_only/01_run_versioned_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. 
# SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import versioned_definition __author__ = "VMware, Inc." __copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", - template_args=args, + template_args=job_input.get_arguments(), ) diff --git a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py index 67d432b3fd..e5e13a815e 100644 --- a/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py +++ b/projects/vdk-plugins/vdk-impala/tests/functional/jobs/load_versioned_template_partition_job/02_run_load_versioned_template.py @@ -1,7 +1,6 @@ # Copyright 2021 VMware, Inc. # SPDX-License-Identifier: Apache-2.0 from vdk.api.job_input import IJobInput -from vdk.plugin.impala.templates.load.validators import versioned_definition __author__ = "VMware, Inc." 
__copyright__ = ( @@ -10,8 +9,7 @@ def run(job_input: IJobInput) -> None: - args = versioned_definition.get_validated_arguments(job_input) job_input.execute_template( template_name="load/versioned", - template_args=args, + template_args=job_input.get_arguments(), ) From 36665b323d4d06996319df6fdfd285766f3ab23e Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Wed, 26 Jan 2022 10:02:32 +0200 Subject: [PATCH 8/9] codacy Signed-off-by: mrMoZ1 --- .../plugin/impala/templates/template_arguments_validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py index 04aef7c6cc..38e96c4340 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/template_arguments_validator.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from logging import getLogger from typing import cast -from typing import List from typing import Type from pydantic import BaseModel @@ -47,7 +46,7 @@ def get_validated_args(self, job_input: IJobInput, args: dict) -> dict: ) source_view_has_results = raw_source_view_has_results[0][0] if not source_view_has_results: - log.info(f"Source view returns no results. Will NOT execute template!") + log.info("Source view returns no results. Will NOT execute template!") raise Exception( "Source view returns no results. Will NOT execute template!" 
) From ac284ede7beb2128bc5c24edc9c07766996c6616 Mon Sep 17 00:00:00 2001 From: mrMoZ1 Date: Wed, 26 Jan 2022 12:00:30 +0200 Subject: [PATCH 9/9] comment Signed-off-by: mrMoZ1 --- .../templates/load/fact/snapshot/02-insert-into-target.sql | 2 +- .../plugin/impala/templates/load/fact/snapshot/03-refresh.sql | 2 +- .../impala/templates/load/fact/snapshot/04-compute-stats.sql | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql index 9e4a727bc4..af01c94a4d 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/02-insert-into-target.sql @@ -10,4 +10,4 @@ UNION ALL ( SELECT * FROM {source_schema}.{source_view} -) +); diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql index 62002e767f..a865459dc4 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql +++ b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/03-refresh.sql @@ -1 +1 @@ -REFRESH {target_schema}.{target_table} +REFRESH {target_schema}.{target_table}; diff --git a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql index ab96a798fc..00d3e0ef15 100644 --- a/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql +++ 
b/projects/vdk-plugins/vdk-impala/src/vdk/plugin/impala/templates/load/fact/snapshot/04-compute-stats.sql @@ -1 +1 @@ -COMPUTE STATS {target_schema}.{target_table} +COMPUTE STATS {target_schema}.{target_table};