From 9c67520c3a32e105b1acd85210cc1254a7de262a Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Tue, 7 May 2024 23:22:03 +0100 Subject: [PATCH] Add support for migrate-tables-ctas workflow in the cmd `databricks labs ucx migrate-tables` (#1660) ## Changes This change adds support for the `migrate-external-tables-ctas` workflow to the existing CLI command `migrate-tables`. The command now checks for external tables which cannot be migrated using sync and, if any are found, prompts the user to run the additional workflow. The relevant README documentation is also updated. Resolves #1659 ### Functionality - [X] added relevant user documentation - [ ] added new CLI command - [X] modified existing command: `databricks labs ucx ...` - [ ] added a new workflow - [ ] modified existing workflow: `...` - [ ] added a new table - [ ] modified existing table: `...` ### Tests - [ ] manually tested - [X] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached) --- README.md | 3 +++ labs.yml | 2 +- src/databricks/labs/ucx/cli.py | 11 ++++++++++- tests/unit/test_cli.py | 21 +++++++++++++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d41d6304ff..66d3301ddd 100644 --- a/README.md +++ b/README.md @@ -394,14 +394,17 @@ flowchart TB subgraph workflow[Table Migration Workflows] subgraph mt_workflow[workflow: migrate-tables] dbfs_root_delta_mt_task[migrate_dbfs_root_delta_tables] + dbfs_root_non_delta_mt_task[migrate_dbfs_root_non_delta_tables] external_tables_sync_mt_task[migrate_external_tables_sync] view_mt_task[roadmap: migrate_views] dbfs_root_delta_mt_task --> view_mt_task + dbfs_root_non_delta_mt_task --> view_mt_task external_tables_sync_mt_task --> view_mt_task end subgraph mt_ctas_wf[roadmap workflow: migrate-tables-ctas] ctas_mt_task[migrate_tables_ctas] --> view_mt_task_ctas[roadmap: migrate_views] + ctas_mt_task[migrate_hiveserde_ctas] --> view_mt_task_ctas[roadmap: migrate_views] end subgraph 
mt_serde_inplace_wf[roadmap workflow: migrate-external-hiveserde-tables-in-place-experimental] diff --git a/labs.yml b/labs.yml index efd67a2b25..fee75291da 100644 --- a/labs.yml +++ b/labs.yml @@ -207,4 +207,4 @@ commands: - name: migrate-tables description: | Trigger the migrate-tables workflow and, optionally, migrate-external-hiveserde-tables-in-place-experimental - workflow. + workflow and migrate-external-tables-ctas workflow. diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index ccb227d282..da0734deae 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -428,7 +428,7 @@ def assign_metastore( def migrate_tables(w: WorkspaceClient, prompts: Prompts, *, ctx: WorkspaceContext | None = None): """ Trigger the migrate-tables workflow and, optionally, the migrate-external-hiveserde-tables-in-place-experimental - workflow. + workflow and migrate-external-tables-ctas. """ if ctx is None: ctx = WorkspaceContext(w) @@ -445,6 +445,15 @@ def migrate_tables(w: WorkspaceClient, prompts: Prompts, *, ctx: WorkspaceContex ): deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") + external_ctas_tables = [table for table in tables if table.what == What.EXTERNAL_NO_SYNC] + if len(external_ctas_tables) > 0: + percentage_external_ctas_tables = len(external_ctas_tables) / len(tables) * 100 + if prompts.confirm( + f"Found {len(external_ctas_tables)} ({percentage_external_ctas_tables:.2f}%) external tables which cannot be migrated using sync" + f", do you want to run the migrate-external-tables-ctas workflow?" 
+ ): + deployed_workflows.run_workflow("migrate-external-tables-ctas") + if __name__ == "__main__": ucx() diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index cfb04ce14a..61193fab7c 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -73,6 +73,7 @@ def ws(): 'assessment': '123', 'migrate-tables': '456', 'migrate-external-hiveserde-tables-in-place-experimental': '789', + 'migrate-external-tables-ctas': '987', } } } @@ -487,6 +488,26 @@ def test_migrate_external_hiveserde_tables_in_place(ws): ws.jobs.run_now.assert_called_with(789) +def test_migrate_external_tables_ctas(ws): + tables_crawler = create_autospec(TablesCrawler) + table = Table( + catalog="hive_metastore", database="test", name="externalctas", object_type="UNKNOWN", table_format="EXTERNAL" + ) + tables_crawler.snapshot.return_value = [table] + ctx = WorkspaceContext(ws).replace(tables_crawler=tables_crawler) + + prompt = ( + "Found 1 (.*) external tables which cannot be migrated using sync, do you want to run the " + "migrate-external-tables-ctas workflow?" + ) + + prompts = MockPrompts({prompt: "Yes"}) + + migrate_tables(ws, prompts, ctx=ctx) + + ws.jobs.run_now.assert_called_with(987) + + def test_create_missing_principal_aws(ws): aws_resource_permissions = create_autospec(AWSResourcePermissions) ctx = WorkspaceContext(ws).replace(is_aws=True, is_azure=False, aws_resource_permissions=aws_resource_permissions)