From ab0fedb99c0009e44e8047065d1bec5d5c580691 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Sat, 27 Apr 2024 13:35:44 -0400 Subject: [PATCH 01/19] Initial commit - cli command, get ws list --- labs.yml | 4 ++++ src/databricks/labs/ucx/cli.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/labs.yml b/labs.yml index 9cfcc15c2b..896e686896 100644 --- a/labs.yml +++ b/labs.yml @@ -43,6 +43,10 @@ commands: is_account_level: true description: upload workspace config to all workspaces in the account where ucx is installed + - name: aggregate-ucx-output + is_account_level: true + description: aggregation of UCX output of multiple workspaces in the account + - name: manual-workspace-info description: only supposed to be run if cannot get admins to run `databricks labs ucx sync-workspace-info` diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 5597ae0744..88c6b43b45 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -89,6 +89,13 @@ def sync_workspace_info(a: AccountClient): ctx = AccountContext(a) ctx.account_workspaces.sync_workspace_info() +@ucx.command(is_account=True) +def aggregate_ucx_output(a: AccountClient): + """upload workspace config to all workspaces in the account where ucx is installed""" + logger.info(f"Account ID: {a.config.account_id}") + ctx = AccountContext(a) + print(ctx.account_workspaces._workspaces()) + @ucx.command(is_account=True) def create_account_groups( From b42e97ad109522c4d0e3b6c887b985bc8f4eb7dd Mon Sep 17 00:00:00 2001 From: pritishpai Date: Tue, 30 Apr 2024 16:56:25 -0400 Subject: [PATCH 02/19] Checking if assessment was run before getting ucx output --- src/databricks/labs/ucx/cli.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 88c6b43b45..ad63dbc85e 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -1,4 +1,5 @@ import json +import random import webbrowser from pathlib import Path @@ -89,12 +90,32 @@ def sync_workspace_info(a: AccountClient): ctx = AccountContext(a) ctx.account_workspaces.sync_workspace_info() + @ucx.command(is_account=True) def aggregate_ucx_output(a: AccountClient): """upload workspace config to all workspaces in the account where ucx is installed""" logger.info(f"Account ID: {a.config.account_id}") ctx = AccountContext(a) - print(ctx.account_workspaces._workspaces()) + + # TODO: Add a check for Account Admin role? 
Else access to workspaces will fail + workspaces_list_original = ctx.account_workspaces.get_workspace_list() + + # workspaces_list = random.sample(workspaces_list_original, 5) #just for testing WIP + # workspaces_list = workspaces_list_original + for workspace in workspaces_list_original: + if workspace.workspace_name == "ppai_test_workspace": + logger.info(f"Workspace Name: {workspace.workspace_name}") + + workspace_client = ctx.account_workspaces.client_for(workspace) + try: + ws_ctx = WorkspaceContext(workspace_client) + deployed_workflows = ws_ctx.deployed_workflows + if not deployed_workflows.validate_step("assessment"): + logger.info(f"NOT RUN") + else: + logger.info(f"RUN") + except Exception as e: + logger.error(f"Error in workspace for {workspace.workspace_name}: {e}") @ucx.command(is_account=True) From bf4d96db2a04041b3b3da7445393ee56b425e5f8 Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Wed, 1 May 2024 12:25:34 +0200 Subject: [PATCH 03/19] added baseline for account-level aggregation --- labs.yml | 8 ++- src/databricks/labs/ucx/account/aggregate.py | 59 +++++++++++++++++++ src/databricks/labs/ucx/account/workspaces.py | 11 +++- src/databricks/labs/ucx/cli.py | 28 ++------- src/databricks/labs/ucx/install.py | 7 ++- 5 files changed, 84 insertions(+), 29 deletions(-) create mode 100644 src/databricks/labs/ucx/account/aggregate.py diff --git a/labs.yml b/labs.yml index 896e686896..089c0efcb8 100644 --- a/labs.yml +++ b/labs.yml @@ -43,9 +43,13 @@ commands: is_account_level: true description: upload workspace config to all workspaces in the account where ucx is installed - - name: aggregate-ucx-output + - name: report-account-compatibility is_account_level: true - description: aggregation of UCX output of multiple workspaces in the account + description: aggregation of UCX output of multiple workspaces in the account. + If include_workspace_ids is not provided, it will use all workspaces present in the account. + flags: + - name: include-workspace-ids + description: List of workspace IDs to create account groups from. - name: manual-workspace-info description: only supposed to be run if cannot get admins to run `databricks labs ucx sync-workspace-info` diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py new file mode 100644 index 0000000000..6b63a3c00f --- /dev/null +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -0,0 +1,59 @@ +import collections +import logging +from dataclasses import dataclass +from functools import cached_property +from databricks.labs.ucx.account import AccountWorkspaces + +logger = logging.getLogger(__name__) + + +@dataclass +class AssessmentObject: + workspace_id: int + object_type: str + object_id: str + failures: list[str] + + +class AccountAggregate: + def __init__(self, account_workspaces: AccountWorkspaces): + self.account_workspaces = account_workspaces + + @cached_property + def _workspace_contexts(self): + # this is the intended way to import WorkspaceContext, otherwise it will cause a circular import + from databricks.labs.ucx.contexts.cli_command import WorkspaceContext # pylint: disable=import-outside-toplevel + + contexts = [] + for workspace_client in self.workspace_clients(): + contexts.append(WorkspaceContext(workspace_client)) + return contexts + + @cached_property + def _aggregate_objects(self) -> list[AssessmentObject]: + objects = [] + # this is theoretically inefficient, but the number of workspaces is expected to be small. 
If this is a + # performance bottleneck, we can optimize it later via Threads.strict() + for ctx in self._workspace_contexts: + workspace_id = ctx.workspace_client.get_workspace_id() + # view is defined in src/databricks/labs/ucx/queries/views/objects.sql + for row in ctx.sql_backend.fetch(f'SELECT * FROM {ctx.config.inventory_database}.objects'): + objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) + return objects + + def aggregate_report(self): + all_objects = 0 + incompatible_objects = 0 + failures = collections.defaultdict(list) + for obj in self._aggregate_objects: + all_objects += 1 + has_failures = False + for failure in obj.failures: + failures[failure].append(obj) + has_failures = True + if has_failures: + incompatible_objects += 1 + compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100 + logger.info(f"UC compatibility: {compatibility}% ({incompatible_objects}/{all_objects})") + for failure, objects in failures.items(): + logger.info(f"{failure}: {len(objects)} objects") \ No newline at end of file diff --git a/src/databricks/labs/ucx/account/workspaces.py b/src/databricks/labs/ucx/account/workspaces.py index 6cf9b8b95e..d4de828e3d 100644 --- a/src/databricks/labs/ucx/account/workspaces.py +++ b/src/databricks/labs/ucx/account/workspaces.py @@ -1,4 +1,8 @@ +import collections +import json import logging +from dataclasses import dataclass +from functools import cached_property from typing import ClassVar from databricks.labs.blueprint.installation import Installation @@ -18,7 +22,11 @@ def __init__(self, account_client: AccountClient, include_workspace_ids: list[in self._include_workspace_ids = include_workspace_ids if include_workspace_ids else [] def _workspaces(self): - return self._ac.workspaces.list() + for workspace in self._ac.workspaces.list(): + if self._include_workspace_ids and workspace.workspace_id not in self._include_workspace_ids: + logger.debug(f"Skipping {workspace.workspace_name} ({workspace.workspace_id}): not in include list") + continue + yield workspace def client_for(self, workspace: Workspace) -> WorkspaceClient: return self._ac.get_workspace_client(workspace) @@ -28,6 +36,7 @@ def workspace_clients(self, workspaces: list[Workspace] | None = None) -> list[W Return a list of WorkspaceClient for each configured workspace in the account :return: list[WorkspaceClient] """ + # TODO: move _can_administer() from account installer over here if workspaces is None: workspaces = self._workspaces() clients = [] diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index ad63dbc85e..57e5228447 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -1,5 +1,4 @@ import json -import random import webbrowser from pathlib import Path @@ -92,30 +91,11 @@ def sync_workspace_info(a: AccountClient): @ucx.command(is_account=True) -def aggregate_ucx_output(a: AccountClient): +def report_account_compatibility(a: AccountClient, ctx: AccountContext | None = None, **named_parameters): """upload workspace config to all workspaces in the account where ucx is installed""" - logger.info(f"Account ID: {a.config.account_id}") - ctx = AccountContext(a) - - # TODO: Add a check for Account Admin role? 
Else access to workspaces will fail - workspaces_list_original = ctx.account_workspaces.get_workspace_list() - - # workspaces_list = random.sample(workspaces_list_original, 5) #just for testing WIP - # workspaces_list = workspaces_list_original - for workspace in workspaces_list_original: - if workspace.workspace_name == "ppai_test_workspace": - logger.info(f"Workspace Name: {workspace.workspace_name}") - - workspace_client = ctx.account_workspaces.client_for(workspace) - try: - ws_ctx = WorkspaceContext(workspace_client) - deployed_workflows = ws_ctx.deployed_workflows - if not deployed_workflows.validate_step("assessment"): - logger.info(f"NOT RUN") - else: - logger.info(f"RUN") - except Exception as e: - logger.error(f"Error in workspace for {workspace.workspace_name}: {e}") + if not ctx: + ctx = AccountContext(a, named_parameters) + ctx.account_workspaces.aggregate_report() @ucx.command(is_account=True) diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index f0b0bef5fb..bebf31b546 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -10,7 +10,7 @@ from typing import Any import databricks.sdk.errors -from databricks.labs.blueprint.entrypoint import get_logger +from databricks.labs.blueprint.entrypoint import get_logger, is_in_debug from databricks.labs.blueprint.installation import Installation, SerdeError from databricks.labs.blueprint.installer import InstallState from databricks.labs.blueprint.parallel import ManyError, Threads @@ -612,6 +612,7 @@ def _get_safe_account_client(self) -> AccountClient: return AccountClient(host=host, account_id=account_id, product="ucx", product_version=__version__) def _can_administer(self, workspace: Workspace): + # TODO: move to AccountWorkspaces try: # check if user is a workspace admin ws = self.account_client.get_workspace_client(workspace) @@ -633,6 +634,7 @@ def _get_accessible_workspaces(self): """ Get all workspaces that the user has access to """ + # TODO: move to AccountWorkspaces accessible_workspaces = [] for workspace in self.account_client.workspaces.list(): if self._can_administer(workspace): @@ -684,7 +686,8 @@ def install_on_account(self): if __name__ == "__main__": logger = get_logger(__file__) - + if is_in_debug(): + logging.getLogger('databricks').setLevel(logging.DEBUG) env = dict(os.environ.items()) force_install = env.get("UCX_FORCE_INSTALL") if force_install == "account": From 08f49e23eb7662c237c398d19d995deadee05a40 Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Wed, 1 May 2024 15:52:04 +0200 Subject: [PATCH 04/19] rebase on main --- labs.yml | 4 ++-- src/databricks/labs/ucx/account/aggregate.py | 17 ++++++++++------- src/databricks/labs/ucx/account/workspaces.py | 4 ---- src/databricks/labs/ucx/cli.py | 2 +- .../labs/ucx/contexts/workspace_cli.py | 4 ++++ 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/labs.yml b/labs.yml index 089c0efcb8..8f61c6a3b8 100644 --- a/labs.yml +++ b/labs.yml @@ -46,9 +46,9 @@ commands: - name: report-account-compatibility is_account_level: true description: aggregation of UCX output of multiple workspaces in the account. - If include_workspace_ids is not provided, it will use all workspaces present in the account. + If --workspace-ids is not provided, it will use all workspaces present in the account. flags: - - name: include-workspace-ids + - name: workspace-ids description: List of workspace IDs to create account groups from. 
- name: manual-workspace-info diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 6b63a3c00f..bb22394d10 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -1,8 +1,11 @@ import collections +import json import logging from dataclasses import dataclass from functools import cached_property -from databricks.labs.ucx.account import AccountWorkspaces + +from databricks.labs.ucx.account.workspaces import AccountWorkspaces + logger = logging.getLogger(__name__) @@ -17,15 +20,15 @@ class AssessmentObject: class AccountAggregate: def __init__(self, account_workspaces: AccountWorkspaces): - self.account_workspaces = account_workspaces + self._account_workspaces = account_workspaces @cached_property def _workspace_contexts(self): - # this is the intended way to import WorkspaceContext, otherwise it will cause a circular import - from databricks.labs.ucx.contexts.cli_command import WorkspaceContext # pylint: disable=import-outside-toplevel + # pylint: disable-next=import-outside-toplevel + from databricks.labs.ucx.contexts.cli_command import WorkspaceContext contexts = [] - for workspace_client in self.workspace_clients(): + for workspace_client in self._account_workspaces.workspace_clients(): contexts.append(WorkspaceContext(workspace_client)) return contexts @@ -41,7 +44,7 @@ def _aggregate_objects(self) -> list[AssessmentObject]: objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) return objects - def aggregate_report(self): + def readiness_report(self): all_objects = 0 incompatible_objects = 0 failures = collections.defaultdict(list) @@ -56,4 +59,4 @@ def aggregate_report(self): compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100 logger.info(f"UC compatibility: {compatibility}% ({incompatible_objects}/{all_objects})") for failure, objects in failures.items(): - logger.info(f"{failure}: {len(objects)} objects") \ No newline at end of file + logger.info(f"{failure}: {len(objects)} objects") diff --git a/src/databricks/labs/ucx/account/workspaces.py b/src/databricks/labs/ucx/account/workspaces.py index d4de828e3d..4a28c823ad 100644 --- a/src/databricks/labs/ucx/account/workspaces.py +++ b/src/databricks/labs/ucx/account/workspaces.py @@ -1,8 +1,4 @@ -import collections -import json import logging -from dataclasses import dataclass -from functools import cached_property from typing import ClassVar from databricks.labs.blueprint.installation import Installation diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 57e5228447..e5f9a7a8fc 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -95,7 +95,7 @@ def report_account_compatibility(a: AccountClient, ctx: AccountContext | None = """upload workspace config to all workspaces in the account where ucx is installed""" if not ctx: ctx = AccountContext(a, named_parameters) - ctx.account_workspaces.aggregate_report() + ctx.account_aggregate.readiness_report() @ucx.command(is_account=True) diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index da257f239d..6545eefdaa 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -5,6 +5,9 @@ from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend from databricks.sdk import WorkspaceClient +from 
databricks.labs.ucx.account.aggregate import AccountAggregate +from databricks.labs.ucx.account.workspaces import AccountWorkspaces +from databricks.labs.ucx.account.metastores import AccountMetastores from databricks.labs.ucx.assessment.aws import run_command, AWSResources from databricks.labs.ucx.aws.access import AWSResourcePermissions from databricks.labs.ucx.aws.credentials import IamRoleMigration, IamRoleCreation @@ -167,3 +170,4 @@ def iam_role_creation(self): @cached_property def notebook_loader(self) -> NotebookLoader: return LocalNotebookLoader(self.path_lookup) + From 0017b328aa18aaba93c0b6ac69594de81643f6c8 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Wed, 1 May 2024 11:29:07 -0400 Subject: [PATCH 05/19] Catch NotInstalled error --- src/databricks/labs/ucx/account/aggregate.py | 23 +++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index bb22394d10..94e61a8932 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -5,9 +5,11 @@ from functools import cached_property from databricks.labs.ucx.account.workspaces import AccountWorkspaces +from databricks.labs.blueprint.installation import NotInstalled logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) @dataclass @@ -38,16 +40,23 @@ def _aggregate_objects(self) -> list[AssessmentObject]: # this is theoretically inefficient, but the number of workspaces is expected to be small. If this is a # performance bottleneck, we can optimize it later via Threads.strict() for ctx in self._workspace_contexts: - workspace_id = ctx.workspace_client.get_workspace_id() - # view is defined in src/databricks/labs/ucx/queries/views/objects.sql - for row in ctx.sql_backend.fetch(f'SELECT * FROM {ctx.config.inventory_database}.objects'): - objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) + try: + workspace_id = ctx.workspace_client.get_workspace_id() + logger.info(f"Assessing workspace {workspace_id}") + + # view is defined in src/databricks/labs/ucx/queries/views/objects.sql + for row in ctx.sql_backend.fetch(f'SELECT * FROM {ctx.config.inventory_database}.objects'): + objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) + except NotInstalled: + logger.warning(f"Workspace {ctx.workspace_client.get_workspace_id()} does not have UCX installed") return objects def readiness_report(self): + logger.info("Generating readiness report") all_objects = 0 incompatible_objects = 0 failures = collections.defaultdict(list) + for obj in self._aggregate_objects: all_objects += 1 has_failures = False @@ -58,5 +67,7 @@ def readiness_report(self): incompatible_objects += 1 compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100 logger.info(f"UC compatibility: {compatibility}% ({incompatible_objects}/{all_objects})") - for failure, objects in failures.items(): - logger.info(f"{failure}: {len(objects)} objects") + + # TODO: output failures in file + # for failure, objects in failures.items(): + # logger.info(f"{failure}: {len(objects)} objects") From fe6ee196ae5def3a47c2a0e26a8a060c920f17da Mon Sep 17 00:00:00 2001 From: pritishpai Date: Wed, 1 May 2024 14:20:14 -0400 Subject: [PATCH 06/19] Add basic unit test for readiness_report --- tests/unit/account/test_aggregate.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 
100644 tests/unit/account/test_aggregate.py diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py new file mode 100644 index 0000000000..36381079e9 --- /dev/null +++ b/tests/unit/account/test_aggregate.py @@ -0,0 +1,16 @@ +import logging +from unittest.mock import create_autospec +from databricks.sdk.service.provisioning import Workspace +from databricks.sdk import WorkspaceClient +from databricks.labs.ucx.account.aggregate import AccountAggregate +from databricks.labs.ucx.account.workspaces import AccountWorkspaces + + +def test_basic_readiness_report(acc_client, caplog): + account_ws = AccountWorkspaces(acc_client) + account_aggregate_obj = AccountAggregate(account_ws) + + with caplog.at_level(logging.INFO): + account_aggregate_obj.readiness_report() + + assert 'UC compatibility' in caplog.text From 79458a101054151ba96d7ead1b0185c499241cba Mon Sep 17 00:00:00 2001 From: pritishpai Date: Wed, 1 May 2024 16:06:40 -0400 Subject: [PATCH 07/19] Linting --- src/databricks/labs/ucx/account/aggregate.py | 8 +++----- tests/unit/account/test_aggregate.py | 3 --- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 94e61a8932..3a6830b031 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -46,7 +46,9 @@ def _aggregate_objects(self) -> list[AssessmentObject]: # view is defined in src/databricks/labs/ucx/queries/views/objects.sql for row in ctx.sql_backend.fetch(f'SELECT * FROM {ctx.config.inventory_database}.objects'): - objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) + objects.append( + AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures)) + ) except NotInstalled: logger.warning(f"Workspace {ctx.workspace_client.get_workspace_id()} does not have UCX installed") return objects @@ -67,7 +69,3 @@ def readiness_report(self): incompatible_objects += 1 compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100 logger.info(f"UC compatibility: {compatibility}% ({incompatible_objects}/{all_objects})") - - # TODO: output failures in file - # for failure, objects in failures.items(): - # logger.info(f"{failure}: {len(objects)} objects") diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index 36381079e9..c42ec96d91 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -1,7 +1,4 @@ import logging -from unittest.mock import create_autospec -from databricks.sdk.service.provisioning import Workspace -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.workspaces import AccountWorkspaces From 006fb6f1456159a5c2ee76b92f65a0a71e7dc91c Mon Sep 17 00:00:00 2001 From: Serge Smertin <259697+nfx@users.noreply.github.com> Date: Thu, 2 May 2024 18:26:01 +0200 Subject: [PATCH 08/19] Added federated query (#1609) Small refactoring to enable things like: - https://github.com/databrickslabs/ucx/issues/673 --- src/databricks/labs/ucx/account/aggregate.py | 45 ++++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 3a6830b031..cf0275ae1e 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ 
b/src/databricks/labs/ucx/account/aggregate.py @@ -1,12 +1,18 @@ import collections import json import logging +from collections.abc import Iterable from dataclasses import dataclass from functools import cached_property +from databricks.labs.lsql import Row + from databricks.labs.ucx.account.workspaces import AccountWorkspaces from databricks.labs.blueprint.installation import NotInstalled +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.queries import FromTable logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -34,23 +40,34 @@ def _workspace_contexts(self): contexts.append(WorkspaceContext(workspace_client)) return contexts - @cached_property - def _aggregate_objects(self) -> list[AssessmentObject]: - objects = [] - # this is theoretically inefficient, but the number of workspaces is expected to be small. If this is a - # performance bottleneck, we can optimize it later via Threads.strict() + def _federated_ucx_query(self, query: str) -> Iterable[tuple[int, Row]]: + """Modifies a query with a workspace-specific UCX schema and executes it on each workspace, + yielding a tuple of workspace_id and Row. This means that you don't have to specify a database in the query, + as it will be replaced with the UCX schema for each workspace. Use this method to aggregate results across + all workspaces, where UCX is installed. + + At the moment, it's done sequentially, which is theoretically inefficient, but the number of workspaces is + expected to be small. If this becomes a performance bottleneck, we can optimize it later via Threads.strict() + """ + empty_index = MigrationIndex([]) for ctx in self._workspace_contexts: + workspace_id = ctx.workspace_client.get_workspace_id() try: - workspace_id = ctx.workspace_client.get_workspace_id() - logger.info(f"Assessing workspace {workspace_id}") - - # view is defined in src/databricks/labs/ucx/queries/views/objects.sql - for row in ctx.sql_backend.fetch(f'SELECT * FROM {ctx.config.inventory_database}.objects'): - objects.append( - AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures)) - ) + logger.debug(f"Assessing workspace {workspace_id}") + # use already existing code to replace tables in the query, assuming that UCX database is in HMS + from_table = FromTable(empty_index, CurrentSessionState(schema=ctx.config.inventory_database)) + workspace_specific_query = from_table.apply(query) + for row in ctx.sql_backend.fetch(workspace_specific_query): + yield workspace_id, row except NotInstalled: - logger.warning(f"Workspace {ctx.workspace_client.get_workspace_id()} does not have UCX installed") + logger.warning(f"Workspace {workspace_id} does not have UCX installed") + + @cached_property + def _aggregate_objects(self) -> list[AssessmentObject]: + objects = [] + # view is defined in src/databricks/labs/ucx/queries/views/objects.sql + for workspace_id, row in self._federated_ucx_query('SELECT * FROM objects'): + objects.append(AssessmentObject(workspace_id, row.object_type, row.object_id, json.loads(row.failures))) return objects def readiness_report(self): From 52880b3acb835e332b6248b54745ddaeb9b2449d Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 15:11:57 -0400 Subject: [PATCH 09/19] WIP code debugging --- src/databricks/labs/ucx/account/aggregate.py | 14 +++++++-- tests/unit/account/test_aggregate.py | 30 +++++++++++++++++++- 2 files changed, 41 
insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index cf0275ae1e..466e707e0e 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -10,7 +10,7 @@ from databricks.labs.ucx.account.workspaces import AccountWorkspaces from databricks.labs.blueprint.installation import NotInstalled -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.queries import FromTable @@ -53,14 +53,20 @@ def _federated_ucx_query(self, query: str) -> Iterable[tuple[int, Row]]: for ctx in self._workspace_contexts: workspace_id = ctx.workspace_client.get_workspace_id() try: - logger.debug(f"Assessing workspace {workspace_id}") # use already existing code to replace tables in the query, assuming that UCX database is in HMS + # TODO: this worked, but changing to check CurrentSessionState + # empty_index = MigrationIndex([MigrationStatus(ctx.config.inventory_database, "objects", None, ctx.config.inventory_database, "objects", "test")]) from_table = FromTable(empty_index, CurrentSessionState(schema=ctx.config.inventory_database)) + logger.info(f"Querying Schema {ctx.config.inventory_database}") + workspace_specific_query = from_table.apply(query) for row in ctx.sql_backend.fetch(workspace_specific_query): yield workspace_id, row except NotInstalled: logger.warning(f"Workspace {workspace_id} does not have UCX installed") + # TODO: Add this exception handling + # except NotSuchTableException as e: + # logger.warning(f"Workspace {workspace_id} does not have the required table: {e.table_name}") @cached_property def _aggregate_objects(self) -> list[AssessmentObject]: @@ -86,3 +92,7 @@ def readiness_report(self): incompatible_objects += 1 compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100 logger.info(f"UC compatibility: {compatibility}% ({incompatible_objects}/{all_objects})") + + for failure, objects in failures.items(): + logger.info(f"{failure}: {len(objects)} objects") + diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index c42ec96d91..0374e23141 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -1,9 +1,16 @@ import logging +from unittest.mock import create_autospec +from databricks.sdk import Workspace, WorkspaceClient from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.workspaces import AccountWorkspaces +from databricks.labs.lsql.backends import MockBackend, SqlBackend, StatementExecutionBackend +from databricks.labs.blueprint.installation import MockInstallation +from databricks.labs.ucx.contexts.cli_command import WorkspaceContext +from databricks.sdk.service import iam, sql, jobs -def test_basic_readiness_report(acc_client, caplog): + +def test_basic_readiness_report_no_workspaces(acc_client, caplog): account_ws = AccountWorkspaces(acc_client) account_aggregate_obj = AccountAggregate(account_ws) @@ -11,3 +18,24 @@ def test_basic_readiness_report(acc_client, caplog): account_aggregate_obj.readiness_report() assert 'UC compatibility' in caplog.text + + +def test_readiness_report_ucx_installed(acc_client, caplog): + account_ws = AccountWorkspaces(acc_client) + 
acc_client.workspaces.list.return_value = [ + Workspace(workspace_name="foo", workspace_id=123, workspace_status_message="Running", deployment_name="abc")] + + account_aggregate_obj = AccountAggregate(account_ws) + ws = create_autospec(WorkspaceClient) + acc_client.get_workspace_client.return_value = ws + ws.statement_execution.execute_statement.return_value \ + = sql.ExecuteStatementResponse(status=sql.StatementStatus( + state=sql.StatementState.SUCCEEDED), + result=sql.ResultData(data_array=[ + ["jobs", "123134", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"]"""], + ["clusters", "0325-3423-dfs", "[]"]])) + + with caplog.at_level(logging.INFO): + account_aggregate_obj.readiness_report() + + assert 'UC compatibility' in caplog.text From c2041c6560d795fb3f34f291f92eaa0b98b39a96 Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Thu, 2 May 2024 21:30:25 +0200 Subject: [PATCH 10/19] fixed stuff --- src/databricks/labs/ucx/account/aggregate.py | 29 ++++++++++++------- .../labs/ucx/contexts/account_cli.py | 5 ++++ .../labs/ucx/contexts/workspace_cli.py | 3 -- tests/unit/account/test_aggregate.py | 9 ++++-- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 466e707e0e..ac81e42c18 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -4,12 +4,15 @@ from collections.abc import Iterable from dataclasses import dataclass from functools import cached_property +from typing import Callable from databricks.labs.lsql import Row +from databricks.sdk import WorkspaceClient from databricks.labs.ucx.account.workspaces import AccountWorkspaces from databricks.labs.blueprint.installation import NotInstalled +from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.queries import FromTable @@ -27,20 +30,19 @@ class AssessmentObject: class AccountAggregate: - def __init__(self, account_workspaces: AccountWorkspaces): + def __init__(self, account_workspaces: AccountWorkspaces, workspace_context_factory: Callable[[WorkspaceClient], WorkspaceContext] = WorkspaceContext): self._account_workspaces = account_workspaces + self._workspace_context_factory = workspace_context_factory @cached_property def _workspace_contexts(self): - # pylint: disable-next=import-outside-toplevel - from databricks.labs.ucx.contexts.cli_command import WorkspaceContext - contexts = [] for workspace_client in self._account_workspaces.workspace_clients(): - contexts.append(WorkspaceContext(workspace_client)) + ctx = self._workspace_context_factory(workspace_client) + contexts.append(ctx) return contexts - def _federated_ucx_query(self, query: str) -> Iterable[tuple[int, Row]]: + def _federated_ucx_query(self, query: str, table_name='objects') -> Iterable[tuple[int, Row]]: """Modifies a query with a workspace-specific UCX schema and executes it on each workspace, yielding a tuple of workspace_id and Row. This means that you don't have to specify a database in the query, as it will be replaced with the UCX schema for each workspace. 
Use this method to aggregate results across @@ -49,15 +51,20 @@ def _federated_ucx_query(self, query: str) -> Iterable[tuple[int, Row]]: At the moment, it's done sequentially, which is theoretically inefficient, but the number of workspaces is expected to be small. If this becomes a performance bottleneck, we can optimize it later via Threads.strict() """ - empty_index = MigrationIndex([]) for ctx in self._workspace_contexts: workspace_id = ctx.workspace_client.get_workspace_id() try: # use already existing code to replace tables in the query, assuming that UCX database is in HMS - # TODO: this worked, but changing to check CurrentSessionState - # empty_index = MigrationIndex([MigrationStatus(ctx.config.inventory_database, "objects", None, ctx.config.inventory_database, "objects", "test")]) - from_table = FromTable(empty_index, CurrentSessionState(schema=ctx.config.inventory_database)) - logger.info(f"Querying Schema {ctx.config.inventory_database}") + inventory_database = ctx.config.inventory_database + stub_index = MigrationIndex([MigrationStatus( + src_schema=inventory_database, + src_table=table_name, + dst_catalog='hive_metastore', + dst_schema=inventory_database, + dst_table=table_name, + )]) + from_table = FromTable(stub_index, CurrentSessionState(schema=inventory_database)) + logger.info(f"Querying Schema {inventory_database}") workspace_specific_query = from_table.apply(query) for row in ctx.sql_backend.fetch(workspace_specific_query): diff --git a/src/databricks/labs/ucx/contexts/account_cli.py b/src/databricks/labs/ucx/contexts/account_cli.py index c16212629e..238e4c460a 100644 --- a/src/databricks/labs/ucx/contexts/account_cli.py +++ b/src/databricks/labs/ucx/contexts/account_cli.py @@ -2,6 +2,8 @@ from databricks.sdk import AccountClient + +from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.metastores import AccountMetastores from databricks.labs.ucx.account.workspaces import AccountWorkspaces from databricks.labs.ucx.contexts.application import CliContext @@ -25,5 +27,8 @@ def account_workspaces(self): return AccountWorkspaces(self.account_client, self.workspace_ids) @cached_property + def account_aggregate(self): + return AccountAggregate(self.account_workspaces) + def account_metastores(self): return AccountMetastores(self.account_client) diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index 6545eefdaa..9e93b6e601 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -5,9 +5,6 @@ from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend from databricks.sdk import WorkspaceClient -from databricks.labs.ucx.account.aggregate import AccountAggregate -from databricks.labs.ucx.account.workspaces import AccountWorkspaces -from databricks.labs.ucx.account.metastores import AccountMetastores from databricks.labs.ucx.assessment.aws import run_command, AWSResources from databricks.labs.ucx.aws.access import AWSResourcePermissions from databricks.labs.ucx.aws.credentials import IamRoleMigration, IamRoleCreation diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index 0374e23141..4979e81804 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -5,9 +5,12 @@ from databricks.labs.ucx.account.workspaces import AccountWorkspaces from databricks.labs.lsql.backends import MockBackend, SqlBackend, 
StatementExecutionBackend from databricks.labs.blueprint.installation import MockInstallation -from databricks.labs.ucx.contexts.cli_command import WorkspaceContext + +from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext from databricks.sdk.service import iam, sql, jobs +from tests.unit import workspace_client_mock def test_basic_readiness_report_no_workspaces(acc_client, caplog): @@ -25,7 +28,6 @@ def test_readiness_report_ucx_installed(acc_client, caplog): acc_client.workspaces.list.return_value = [ Workspace(workspace_name="foo", workspace_id=123, workspace_status_message="Running", deployment_name="abc")] - account_aggregate_obj = AccountAggregate(account_ws) ws = create_autospec(WorkspaceClient) acc_client.get_workspace_client.return_value = ws ws.statement_execution.execute_statement.return_value \ @@ -35,6 +37,9 @@ def test_readiness_report_ucx_installed(acc_client, caplog): ["jobs", "123134", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"]"""], ["clusters", "0325-3423-dfs", "[]"]])) + ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something")) + account_aggregate_obj = AccountAggregate(account_ws, workspace_context_factory=lambda _: ctx) + with caplog.at_level(logging.INFO): account_aggregate_obj.readiness_report() From ee50170fb106fd8180758a677c8b6fac5de33e1f Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 15:34:19 -0400 Subject: [PATCH 11/19] warehouse_id --- tests/unit/account/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index 4979e81804..3fbbb275e3 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -37,7 +37,7 @@ def test_readiness_report_ucx_installed(acc_client, caplog): ["jobs", "123134", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"]"""], ["clusters", "0325-3423-dfs", "[]"]])) - ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something")) + ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something", warehouse_id="1234")) account_aggregate_obj = AccountAggregate(account_ws, workspace_context_factory=lambda _: ctx) with caplog.at_level(logging.INFO): From 3600e79842bbe7867474b5daff633d699be31210 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 16:56:50 -0400 Subject: [PATCH 12/19] unit test sql result --- src/databricks/labs/ucx/account/aggregate.py | 3 --- tests/unit/account/test_aggregate.py | 9 +++++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index ac81e42c18..67ab519008 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -71,9 +71,6 @@ def _federated_ucx_query(self, query: str, table_name='objects') -> Iterable[tup yield workspace_id, row except NotInstalled: logger.warning(f"Workspace {workspace_id} does not have UCX installed") - # TODO: Add this exception handling - # except NotSuchTableException as e: - # logger.warning(f"Workspace {workspace_id} does not have the required table: {e.table_name}") @cached_property def _aggregate_objects(self) -> list[AssessmentObject]: diff --git a/tests/unit/account/test_aggregate.py 
b/tests/unit/account/test_aggregate.py index 3fbbb275e3..b4f56b5e60 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -34,8 +34,13 @@ def test_readiness_report_ucx_installed(acc_client, caplog): = sql.ExecuteStatementResponse(status=sql.StatementStatus( state=sql.StatementState.SUCCEEDED), result=sql.ResultData(data_array=[ - ["jobs", "123134", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"]"""], - ["clusters", "0325-3423-dfs", "[]"]])) + ["a", "b", "c"], + ["b", "c", "d"]], row_count=2), + manifest=sql.ResultManifest(schema=sql.ResultSchema( + columns=[sql.ColumnInfo(name="a", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="b", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="c", type_name=sql.ColumnInfoTypeName.STRING)])), + statement_id='123') ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something", warehouse_id="1234")) account_aggregate_obj = AccountAggregate(account_ws, workspace_context_factory=lambda _: ctx) From 7a69156bac0ee1371d168a58ac4d85bde965adf5 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 17:39:20 -0400 Subject: [PATCH 13/19] Fix column names in test --- tests/unit/account/test_aggregate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index b4f56b5e60..4a4071ae01 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -37,9 +37,10 @@ def test_readiness_report_ucx_installed(acc_client, caplog): ["a", "b", "c"], ["b", "c", "d"]], row_count=2), manifest=sql.ResultManifest(schema=sql.ResultSchema( - columns=[sql.ColumnInfo(name="a", type_name=sql.ColumnInfoTypeName.STRING), - sql.ColumnInfo(name="b", type_name=sql.ColumnInfoTypeName.STRING), - sql.ColumnInfo(name="c", type_name=sql.ColumnInfoTypeName.STRING)])), + columns=[sql.ColumnInfo(name="object_type", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="object_id", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="failures", type_name=sql.ColumnInfoTypeName.STRING)], + column_count=3)), statement_id='123') ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something", warehouse_id="1234")) From cc2e40da38a27add3a56e3599f4b79666f2d8eed Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 17:43:31 -0400 Subject: [PATCH 14/19] Unit test working --- tests/unit/account/test_aggregate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index 4a4071ae01..a18a234f94 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -34,8 +34,8 @@ def test_readiness_report_ucx_installed(acc_client, caplog): = sql.ExecuteStatementResponse(status=sql.StatementStatus( state=sql.StatementState.SUCCEEDED), result=sql.ResultData(data_array=[ - ["a", "b", "c"], - ["b", "c", "d"]], row_count=2), + ["tables", "32432", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"""], + ["clusters", "234234234", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"""]], row_count=2), manifest=sql.ResultManifest(schema=sql.ResultSchema( columns=[sql.ColumnInfo(name="object_type", type_name=sql.ColumnInfoTypeName.STRING), 
sql.ColumnInfo(name="object_id", type_name=sql.ColumnInfoTypeName.STRING), From 999b26d5dd08c47392ee6a670cdd8fa26d2e8210 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Thu, 2 May 2024 17:51:38 -0400 Subject: [PATCH 15/19] Fmt fix --- src/databricks/labs/ucx/account/aggregate.py | 25 ++++++---- tests/unit/account/test_aggregate.py | 49 +++++++++++++------- 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 67ab519008..465eccc442 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -30,7 +30,11 @@ class AssessmentObject: class AccountAggregate: - def __init__(self, account_workspaces: AccountWorkspaces, workspace_context_factory: Callable[[WorkspaceClient], WorkspaceContext] = WorkspaceContext): + def __init__( + self, + account_workspaces: AccountWorkspaces, + workspace_context_factory: Callable[[WorkspaceClient], WorkspaceContext] = WorkspaceContext, + ): self._account_workspaces = account_workspaces self._workspace_context_factory = workspace_context_factory @@ -56,13 +60,17 @@ def _federated_ucx_query(self, query: str, table_name='objects') -> Iterable[tup try: # use already existing code to replace tables in the query, assuming that UCX database is in HMS inventory_database = ctx.config.inventory_database - stub_index = MigrationIndex([MigrationStatus( - src_schema=inventory_database, - src_table=table_name, - dst_catalog='hive_metastore', - dst_schema=inventory_database, - dst_table=table_name, - )]) + stub_index = MigrationIndex( + [ + MigrationStatus( + src_schema=inventory_database, + src_table=table_name, + dst_catalog='hive_metastore', + dst_schema=inventory_database, + dst_table=table_name, + ) + ] + ) from_table = FromTable(stub_index, CurrentSessionState(schema=inventory_database)) logger.info(f"Querying Schema {inventory_database}") @@ -99,4 +107,3 @@ def readiness_report(self): for failure, objects in failures.items(): logger.info(f"{failure}: {len(objects)} objects") - diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index a18a234f94..35ac116907 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -3,14 +3,11 @@ from databricks.sdk import Workspace, WorkspaceClient from databricks.labs.ucx.account.aggregate import AccountAggregate from databricks.labs.ucx.account.workspaces import AccountWorkspaces -from databricks.labs.lsql.backends import MockBackend, SqlBackend, StatementExecutionBackend -from databricks.labs.blueprint.installation import MockInstallation from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext -from databricks.sdk.service import iam, sql, jobs +from databricks.sdk.service import sql -from tests.unit import workspace_client_mock def test_basic_readiness_report_no_workspaces(acc_client, caplog): @@ -26,22 +23,40 @@ def test_basic_readiness_report_no_workspaces(acc_client, caplog): def test_readiness_report_ucx_installed(acc_client, caplog): account_ws = AccountWorkspaces(acc_client) acc_client.workspaces.list.return_value = [ - Workspace(workspace_name="foo", workspace_id=123, workspace_status_message="Running", deployment_name="abc")] + Workspace(workspace_name="foo", workspace_id=123, workspace_status_message="Running", deployment_name="abc") + ] ws = create_autospec(WorkspaceClient) acc_client.get_workspace_client.return_value = ws - 
ws.statement_execution.execute_statement.return_value \ - = sql.ExecuteStatementResponse(status=sql.StatementStatus( - state=sql.StatementState.SUCCEEDED), - result=sql.ResultData(data_array=[ - ["tables", "32432", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"""], - ["clusters", "234234234", """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]"""]], row_count=2), - manifest=sql.ResultManifest(schema=sql.ResultSchema( - columns=[sql.ColumnInfo(name="object_type", type_name=sql.ColumnInfoTypeName.STRING), - sql.ColumnInfo(name="object_id", type_name=sql.ColumnInfoTypeName.STRING), - sql.ColumnInfo(name="failures", type_name=sql.ColumnInfoTypeName.STRING)], - column_count=3)), - statement_id='123') + ws.statement_execution.execute_statement.return_value = sql.ExecuteStatementResponse( + status=sql.StatementStatus(state=sql.StatementState.SUCCEEDED), + result=sql.ResultData( + data_array=[ + [ + "tables", + "32432", + """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]""", + ], + [ + "clusters", + "234234234", + """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]""", + ], + ], + row_count=2, + ), + manifest=sql.ResultManifest( + schema=sql.ResultSchema( + columns=[ + sql.ColumnInfo(name="object_type", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="object_id", type_name=sql.ColumnInfoTypeName.STRING), + sql.ColumnInfo(name="failures", type_name=sql.ColumnInfoTypeName.STRING), + ], + column_count=3, + ) + ), + statement_id='123', + ) ctx = WorkspaceContext(ws).replace(config=WorkspaceConfig(inventory_database="something", warehouse_id="1234")) account_aggregate_obj = AccountAggregate(account_ws, workspace_context_factory=lambda _: ctx) From 6b34c743eee52c8038e0782c2fb52d5177514d3d Mon Sep 17 00:00:00 2001 From: pritishpai Date: Fri, 3 May 2024 10:35:07 -0400 Subject: [PATCH 16/19] Added asserts to test --- tests/unit/account/test_aggregate.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/unit/account/test_aggregate.py b/tests/unit/account/test_aggregate.py index 35ac116907..1591e5decc 100644 --- a/tests/unit/account/test_aggregate.py +++ b/tests/unit/account/test_aggregate.py @@ -9,7 +9,6 @@ from databricks.sdk.service import sql - def test_basic_readiness_report_no_workspaces(acc_client, caplog): account_ws = AccountWorkspaces(acc_client) account_aggregate_obj = AccountAggregate(account_ws) @@ -33,14 +32,25 @@ def test_readiness_report_ucx_installed(acc_client, caplog): result=sql.ResultData( data_array=[ [ - "tables", - "32432", - """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]""", + "jobs", + "32432123", + """["cluster type not supported : LEGACY_TABLE_ACL", + "cluster type not supported : LEGACY_SINGLE_USER"]""", ], [ - "clusters", + "jobs", "234234234", - """["cluster type not supported : LEGACY_TABLE_ACL", "cluster type not supported : LEGACY_SINGLE_USER"]""", + """["cluster type not supported : LEGACY_SINGLE_USER"]""", + ], + [ + "clusters", + "21312312", + """[]""", + ], + [ + "tables", + "34234324", + """["listTables returned null"]""", ], ], row_count=2, @@ -64,4 +74,6 @@ def test_readiness_report_ucx_installed(acc_client, caplog): with caplog.at_level(logging.INFO): account_aggregate_obj.readiness_report() - assert 'UC compatibility' in caplog.text 
+ assert 'UC compatibility: 25.0% (3/4)' in caplog.text + assert 'cluster type not supported : LEGACY_TABLE_ACL: 1 objects' in caplog.text + assert 'cluster type not supported : LEGACY_SINGLE_USER: 2 objects' in caplog.text From 0d40b5657bea9acf8c4158e537cfa763ccc82664 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Fri, 3 May 2024 11:35:44 -0400 Subject: [PATCH 17/19] Doc and ToC WIP --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index fb3c98208b..1c331757ce 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`workflows` command](#workflows-command) * [`open-remote-config` command](#open-remote-config-command) * [`installations` command](#installations-command) + * [`report-account-compatibility` command](#report-account-compatibility-command) * [Metastore related commands](#metastore-related-commands) * [`show-all-metastores` command](#show-all-metastores-command) * [`assign-metastore` command](#assign-metastore-command) @@ -573,6 +574,12 @@ the installations where the `ucx` package is installed and prints their details for administrators who want to see which users have installed `ucx` and where. It can also be used to debug issues related to multiple installations of `ucx` on the same workspace. +## `report-account-compatibility` command + +```text + +``` + [[back to top](#databricks-labs-ucx)] # Metastore related commands From 365184050cc1a93cbbf5127025acf0b24e072380 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Fri, 3 May 2024 12:59:27 -0400 Subject: [PATCH 18/19] Documentation added --- README.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c331757ce..23b276bc90 100644 --- a/README.md +++ b/README.md @@ -574,10 +574,28 @@ the installations where the `ucx` package is installed and prints their details for administrators who want to see which users have installed `ucx` and where. It can also be used to debug issues related to multiple installations of `ucx` on the same workspace. 
+[[back to top](#databricks-labs-ucx)] + + ## `report-account-compatibility` command ```text - +databricks labs ucx report-account-compatibility --profile labs-azure-account +12:56:09 INFO [databricks.sdk] Using Azure CLI authentication with AAD tokens +12:56:09 INFO [d.l.u.account.aggregate] Generating readiness report +12:56:10 INFO [databricks.sdk] Using Azure CLI authentication with AAD tokens +12:56:10 INFO [databricks.sdk] Using Azure CLI authentication with AAD tokens +12:56:15 INFO [databricks.sdk] Using Azure CLI authentication with AAD tokens +12:56:15 INFO [d.l.u.account.aggregate] Querying Schema ucx +12:56:21 WARN [d.l.u.account.aggregate] Workspace 4045495039142306 does not have UCX installed +12:56:21 INFO [d.l.u.account.aggregate] UC compatibility: 30.303030303030297% (69/99) +12:56:21 INFO [d.l.u.account.aggregate] cluster type not supported : LEGACY_TABLE_ACL: 22 objects +12:56:21 INFO [d.l.u.account.aggregate] cluster type not supported : LEGACY_SINGLE_USER: 24 objects +12:56:21 INFO [d.l.u.account.aggregate] unsupported config: spark.hadoop.javax.jdo.option.ConnectionURL: 10 objects +12:56:21 INFO [d.l.u.account.aggregate] Uses azure service principal credentials config in cluster.: 1 objects +12:56:21 INFO [d.l.u.account.aggregate] No isolation shared clusters not supported in UC: 1 objects +12:56:21 INFO [d.l.u.account.aggregate] Data is in DBFS Root: 23 objects +12:56:21 INFO [d.l.u.account.aggregate] Non-DELTA format: UNKNOWN: 5 objects ``` [[back to top](#databricks-labs-ucx)] From f2f8dd45f819c7e6dfbfcc56e36590343e088ae5 Mon Sep 17 00:00:00 2001 From: pritishpai Date: Mon, 6 May 2024 12:47:05 -0400 Subject: [PATCH 19/19] Fix callable issue and warning --- src/databricks/labs/ucx/account/aggregate.py | 3 +-- src/databricks/labs/ucx/contexts/account_cli.py | 1 + src/databricks/labs/ucx/contexts/workspace_cli.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/account/aggregate.py b/src/databricks/labs/ucx/account/aggregate.py index 465eccc442..1f8bf53a1b 100644 --- a/src/databricks/labs/ucx/account/aggregate.py +++ b/src/databricks/labs/ucx/account/aggregate.py @@ -1,10 +1,9 @@ import collections import json import logging -from collections.abc import Iterable +from collections.abc import Iterable, Callable from dataclasses import dataclass from functools import cached_property -from typing import Callable from databricks.labs.lsql import Row from databricks.sdk import WorkspaceClient diff --git a/src/databricks/labs/ucx/contexts/account_cli.py b/src/databricks/labs/ucx/contexts/account_cli.py index 238e4c460a..585f7230bc 100644 --- a/src/databricks/labs/ucx/contexts/account_cli.py +++ b/src/databricks/labs/ucx/contexts/account_cli.py @@ -30,5 +30,6 @@ def account_workspaces(self): def account_aggregate(self): return AccountAggregate(self.account_workspaces) + @cached_property def account_metastores(self): return AccountMetastores(self.account_client) diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index 9e93b6e601..da257f239d 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -167,4 +167,3 @@ def iam_role_creation(self): @cached_property def notebook_loader(self) -> NotebookLoader: return LocalNotebookLoader(self.path_lookup) -
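
The series above lands `report-account-compatibility` by having `AccountAggregate` pull rows from each workspace's UCX `objects` view and fold them into a single readiness figure: an object counts as incompatible if it carries at least one failure, and compatibility is reported as the share of objects with no failures. Below is a minimal, dependency-free sketch of that aggregation step, assuming the per-workspace rows have already been fetched; the function name `readiness_summary`, the tuple row shape, and the sample rows are illustrative and not part of the patch set.

```python
import collections
import json
from dataclasses import dataclass


@dataclass
class AssessmentObject:
    workspace_id: int
    object_type: str
    object_id: str
    failures: list[str]


def readiness_summary(rows: list[tuple[int, str, str, str]]) -> tuple[float, dict[str, int]]:
    """Aggregate (workspace_id, object_type, object_id, failures_json) rows, as produced by
    the per-workspace `objects` view, into a UC compatibility percentage and per-failure counts."""
    objects = [
        AssessmentObject(workspace_id, object_type, object_id, json.loads(failures_json))
        for workspace_id, object_type, object_id, failures_json in rows
    ]
    all_objects = 0
    incompatible_objects = 0
    failure_counts: dict[str, int] = collections.defaultdict(int)
    for obj in objects:
        all_objects += 1
        if obj.failures:  # an object with any failure counts as incompatible exactly once
            incompatible_objects += 1
        for failure in obj.failures:
            failure_counts[failure] += 1
    # same formula as readiness_report(): share of objects that carry no failures
    compatibility = (1 - incompatible_objects / all_objects if all_objects > 0 else 0) * 100
    return compatibility, dict(failure_counts)


if __name__ == "__main__":
    # hypothetical rows from two workspaces; failure strings follow the format seen in the unit test
    sample = [
        (123, "jobs", "32432123", '["cluster type not supported : LEGACY_TABLE_ACL"]'),
        (123, "clusters", "21312312", "[]"),
        (456, "tables", "34234324", '["listTables returned null"]'),
    ]
    compatibility, failures = readiness_summary(sample)
    print(f"UC compatibility: {compatibility:.1f}%")  # ~33.3%: 2 of 3 objects carry failures
    for failure, count in failures.items():
        print(f"{failure}: {count} objects")
```

As the `_federated_ucx_query` docstring added in patch 08 notes, the real implementation queries workspaces sequentially, which is acceptable while the number of workspaces stays small; `Threads.strict()` is left as the escape hatch if that ever becomes a bottleneck.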