Merge branch 'main' into feature/##1938
aminmovahed-db authored Sep 26, 2024
2 parents 0e331a7 + 1dd1a12 commit 466b13c
Showing 35 changed files with 612 additions and 205 deletions.
16 changes: 12 additions & 4 deletions README.md
@@ -397,6 +397,9 @@ which can be used for further analysis and decision-making through the [assessme
9. `assess_pipelines`: This task scans through all the pipelines and identifies those that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table.
10. `assess_azure_service_principals`: This task scans through all the cluster configurations, cluster policies, job cluster configurations, pipeline configurations, and warehouse configurations, and identifies every Azure Service Principal that has been granted access to Azure storage accounts via Spark configurations referenced in those entities. The list of all Azure Service Principals referenced in those configurations is saved in the `$inventory.azure_service_principals` table.
11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies any Azure Service Principal that has been granted access to Azure storage accounts via Spark configurations referenced in those scripts.
12. `assess_dashboards`: This task scans through all the dashboards and analyzes their embedded queries for migration problems. It also collects direct filesystem access patterns that require attention.
13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes their notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention; a rough illustration of such a pattern check follows this list.
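The direct filesystem access patterns collected by `assess_dashboards` and `assess_workflows` are things like `dbfs:/` and `/dbfs/` paths hard-coded in queries and notebooks. As a rough, hypothetical illustration only (UCX's linters parse SQL and Python properly rather than using a regex), a naive check might look like this:

```python
import re

# Toy pattern: quoted dbfs:/ or /dbfs/ paths inside a snippet of code.
DFSA_PATTERN = re.compile(r"""["'](dbfs:/[^"']+|/dbfs/[^"']+)["']""")


def find_direct_fs_access(source: str) -> list[str]:
    """Return direct filesystem paths referenced in a snippet of code."""
    return DFSA_PATTERN.findall(source)


print(find_direct_fs_access("df = spark.read.parquet('dbfs:/mnt/raw/events')"))
# ['dbfs:/mnt/raw/events']
```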


![report](docs/assessment-report.png)

@@ -712,11 +715,16 @@ in the Migration dashboard.

> Please note that this is an experimental workflow.
The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the
workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
The `experimental-workflow-linter` workflow lints accessible code from 2 sources:
- all workflows/jobs present in the workspace
- all dashboards/queries present in the workspace
The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
The linting also locates direct filesystem accesses that need to be migrated.

Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed
in the Migration dashboard.
Once the workflow completes:
- problems are stored in the `$inventory_database.workflow_problems`/`$inventory_database.query_problems` tables
- direct filesystem accesses are stored in the `$inventory_database.directfs_in_paths`/`$inventory_database.directfs_in_queries` tables
- all of the above are displayed in the Migration dashboard; a query sketch follows this list.
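For a quick look at the linter output outside the dashboard, the tables can also be queried directly. A minimal sketch, not part of UCX, assuming databricks-labs-lsql's `StatementExecutionBackend`, a placeholder warehouse id, and an inventory database named `ucx`; the `workflow_problems` columns match the dashboard query added in this commit, while the `directfs_in_paths` columns are deliberately not spelled out:

```python
from databricks.labs.lsql.backends import StatementExecutionBackend
from databricks.sdk import WorkspaceClient

ws = WorkspaceClient()
backend = StatementExecutionBackend(ws, "<warehouse-id>")  # placeholder warehouse id

# Problems emitted by the workflow linter (columns as used by the dashboard query).
for row in backend.fetch("SELECT path, code, message FROM ucx.workflow_problems LIMIT 10"):
    print(row.path, row.code, row.message)

# Direct filesystem access findings; column names intentionally left generic.
for row in backend.fetch("SELECT * FROM ucx.directfs_in_paths LIMIT 10"):
    print(row)
```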

![code compatibility problems](docs/code_compatibility_problems.png)

13 changes: 8 additions & 5 deletions pyproject.toml
@@ -46,10 +46,10 @@ classifiers = [

dependencies = ["databricks-sdk~=0.30",
"databricks-labs-lsql>=0.5,<0.13",
"databricks-labs-blueprint>=0.8,<0.9",
"databricks-labs-blueprint>=0.8,<0.10",
"PyYAML>=6.0.0,<7.0.0",
"sqlglot>=25.5.0,<25.23",
"astroid>=3.2.2"]
"astroid>=3.3.1"]

[project.optional-dependencies]
pylsp = [
@@ -74,7 +74,7 @@ dependencies = [
"black~=24.3.0",
"coverage[toml]~=7.4.4",
"mypy~=1.9.0",
"pylint~=3.2.2",
"pylint~=3.3.1",
"pylint-pytest==2.0.0a0",
"databricks-labs-pylint~=0.4.0",
"databricks-labs-pytester>=0.2.1",
@@ -209,7 +209,7 @@ fail-under = 10.0
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths='^tests/unit/source_code/samples/.*$'
ignore-paths='^tests/unit/source_code/samples/.*$'

# Files or directories matching the regular expression patterns are skipped. The
# regex matches against base names, not paths. The default value ignores Emacs
@@ -587,7 +587,10 @@ disable = [
"fixme",
"consider-using-assignment-expr",
"logging-fstring-interpolation",
"consider-using-any-or-all"
"consider-using-any-or-all",
"too-many-positional-arguments",
"unnecessary-default-type-args",
"logging-not-lazy"
]

# Enable the message, report, category or checker with the given id(s). You can
20 changes: 8 additions & 12 deletions src/databricks/labs/ucx/contexts/application.py
@@ -18,7 +18,6 @@
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.sdk import AccountClient, WorkspaceClient, core
from databricks.sdk.errors import NotFound
from databricks.sdk.service import sql

from databricks.labs.ucx.account.workspaces import WorkspaceInfo
@@ -305,18 +304,15 @@ def aws_acl(self):
)

@cached_property
def principal_locations(self):
eligible_locations = {}
try:
def principal_locations_retriever(self):
def inner():
if self.is_azure:
eligible_locations = self.azure_acl.get_eligible_locations_principals()
return self.azure_acl.get_eligible_locations_principals()
if self.is_aws:
eligible_locations = self.aws_acl.get_eligible_locations_principals()
if self.is_gcp:
raise NotImplementedError("Not implemented for GCP.")
except NotFound:
pass
return eligible_locations
return self.aws_acl.get_eligible_locations_principals()
raise NotImplementedError("Not implemented for GCP.")

return inner

@cached_property
def principal_acl(self):
@@ -326,7 +322,7 @@ def principal_acl(self):
self.installation,
self.tables_crawler,
self.mounts_crawler,
self.principal_locations,
self.principal_locations_retriever,
)

@cached_property
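The `application.py` change above replaces the eager `principal_locations` property (which swallowed `NotFound` and could return an empty dict) with `principal_locations_retriever`, a cached property that returns a zero-argument callable, so the cloud location lookup only runs when `PrincipalACL` actually asks for it. A standalone sketch of the same pattern, with hypothetical class names:

```python
from collections.abc import Callable
from functools import cached_property


class CloudAcl:
    """Hypothetical stand-in for azure_acl/aws_acl; the real lookup calls cloud APIs."""

    def get_eligible_locations_principals(self) -> dict[str, str]:
        print("resolving eligible locations...")  # only runs when the callable is invoked
        return {"abfss://landing@storageacct.dfs.core.windows.net/": "principal-a"}


class Context:
    @cached_property
    def locations_retriever(self) -> Callable[[], dict[str, str]]:
        # Return a zero-argument callable instead of the resolved value, so the
        # consumer decides when (and whether) to pay for the lookup.
        def inner() -> dict[str, str]:
            return CloudAcl().get_eligible_locations_principals()

        return inner


class PrincipalAclLike:
    """Hypothetical consumer mirroring how PrincipalACL receives the callable."""

    def __init__(self, locations: Callable[[], dict[str, str]]):
        self._locations = locations

    def grants(self) -> list[str]:
        return sorted(self._locations().values())  # lookup happens here, on first real use


ctx = Context()
print(PrincipalAclLike(ctx.locations_retriever).grants())
```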
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/contexts/workflow_task.py
@@ -71,7 +71,7 @@ def pipelines_crawler(self):

@cached_property
def table_size_crawler(self):
return TableSizeCrawler(self.sql_backend, self.inventory_database)
return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)

@cached_property
def policies_crawler(self):
6 changes: 3 additions & 3 deletions src/databricks/labs/ucx/hive_metastore/grants.py
@@ -579,7 +579,7 @@ def __init__(
installation: Installation,
tables_crawler: TablesCrawler,
mounts_crawler: Mounts,
cluster_locations: list[ComputeLocations],
cluster_locations: Callable[[], list[ComputeLocations]],
):
self._backend = backend
self._ws = ws
@@ -593,7 +593,7 @@ def get_interactive_cluster_grants(self) -> list[Grant]:
mounts = list(self._mounts_crawler.snapshot())
grants: set[Grant] = set()

for compute_location in self._compute_locations:
for compute_location in self._compute_locations():
principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type)
if len(principals) == 0:
continue
@@ -697,7 +697,7 @@ def apply_location_acl(self):
"CREATE EXTERNAL VOLUME and READ_FILES for existing eligible interactive cluster users"
)
# get the eligible location mapped for each interactive cluster
for compute_location in self._compute_locations:
for compute_location in self._compute_locations():
# get interactive cluster users
principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type)
if len(principals) == 0:
39 changes: 21 additions & 18 deletions src/databricks/labs/ucx/hive_metastore/table_size.py
@@ -1,12 +1,15 @@
import logging
from collections.abc import Iterable
from dataclasses import dataclass
from functools import partial

from databricks.labs.blueprint.parallel import Threads
from databricks.labs.lsql.backends import SqlBackend

from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.ucx.framework.utils import escape_sql_identifier
from databricks.labs.ucx.hive_metastore import TablesCrawler
from databricks.labs.ucx.hive_metastore.tables import Table

logger = logging.getLogger(__name__)

@@ -40,43 +43,43 @@ def _crawl(self) -> Iterable[TableSize]:
"""Crawls and lists tables using table crawler
Identifies DBFS root tables and calculates the size for these.
"""
tasks = []
for table in self._tables_crawler.snapshot():
if not table.kind == "TABLE":
continue
if not table.is_dbfs_root:
continue
size_in_bytes = self._safe_get_table_size(table.key)
if size_in_bytes is None:
continue # table does not exist anymore or is corrupted

yield TableSize(
catalog=table.catalog, database=table.database, name=table.name, size_in_bytes=size_in_bytes
)
tasks.append(partial(self._safe_get_table_size, table))
return Threads.strict('DBFS root table sizes', tasks)

def _try_fetch(self) -> Iterable[TableSize]:
"""Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
yield TableSize(*row)

def _safe_get_table_size(self, table_full_name: str) -> int | None:
logger.debug(f"Evaluating {table_full_name} table size.")
def _safe_get_table_size(self, table: Table) -> TableSize | None:
logger.debug(f"Evaluating {table.key} table size.")
try:
# refresh table statistics to avoid stale stats in HMS
self._backend.execute(f"ANALYZE table {escape_sql_identifier(table_full_name)} compute STATISTICS NOSCAN")
# pylint: disable-next=protected-access
return self._spark._jsparkSession.table(table_full_name).queryExecution().analyzed().stats().sizeInBytes()
self._backend.execute(f"ANALYZE table {table.safe_sql_key} compute STATISTICS NOSCAN")
jvm_df = self._spark._jsparkSession.table(table.safe_sql_key) # pylint: disable=protected-access
size_in_bytes = jvm_df.queryExecution().analyzed().stats().sizeInBytes()
return TableSize(
catalog=table.catalog,
database=table.database,
name=table.name,
size_in_bytes=size_in_bytes,
)
except Exception as e: # pylint: disable=broad-exception-caught
if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e):
logger.warning(f"Failed to evaluate {table_full_name} table size. Table not found.")
logger.warning(f"Failed to evaluate {table.key} table size. Table not found.")
return None
if "[DELTA_INVALID_FORMAT]" in str(e):
logger.warning(
f"Unable to read Delta table {table_full_name}, please check table structure and try again."
)
logger.warning(f"Unable to read Delta table {table.key}, please check table structure and try again.")
return None
if "[DELTA_MISSING_TRANSACTION_LOG]" in str(e):
logger.warning(f"Delta table {table_full_name} is corrupted: missing transaction log.")
logger.warning(f"Delta table {table.key} is corrupt: missing transaction log.")
return None
logger.error(f"Failed to evaluate {table_full_name} table size: ", exc_info=True)
logger.error(f"Failed to evaluate {table.key} table size: ", exc_info=True)

return None
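The reworked `_crawl` no longer computes sizes inline: it builds one `functools.partial` per DBFS root table and hands the list to `Threads.strict` from databricks-labs-blueprint, which runs the tasks in parallel and keeps only the non-`None` results, which is how missing or corrupt tables are skipped. A minimal sketch of the same fan-out pattern with a toy task (the task itself is made up):

```python
from functools import partial

from databricks.labs.blueprint.parallel import Threads


def safe_size(name: str, size: int) -> tuple[str, int] | None:
    # Stand-in for _safe_get_table_size: returning None skips the entry.
    return None if size < 0 else (name, size)


tables = [("hive_metastore.db.a", 10), ("hive_metastore.db.b", -1), ("hive_metastore.db.c", 7)]
tasks = [partial(safe_size, name, size) for name, size in tables]

# The first argument is just a label used in logs; errors raised by tasks make strict() fail.
results = Threads.strict("toy table sizes", tasks)
print(list(results))  # [('hive_metastore.db.a', 10), ('hive_metastore.db.c', 7)] -- order may vary
```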
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/hive_metastore/tables.py
@@ -340,7 +340,7 @@ class MigrationCount:
what_count: dict[What, int]


class TablesCrawler(CrawlerBase):
class TablesCrawler(CrawlerBase[Table]):
def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None):
"""
Initializes a TablesCrawler instance.
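`TablesCrawler` now derives from `CrawlerBase[Table]` rather than the bare `CrawlerBase`, so the row type a crawler produces is visible to the type checker. A stripped-down, hypothetical version of the idea (not UCX's actual base class):

```python
from dataclasses import dataclass
from typing import Generic, TypeVar

T = TypeVar("T")


@dataclass
class Table:
    catalog: str
    database: str
    name: str


class CrawlerBase(Generic[T]):
    def __init__(self) -> None:
        self._rows: list[T] = []

    def snapshot(self) -> list[T]:
        return self._rows


class TablesCrawler(CrawlerBase[Table]):
    pass


crawler = TablesCrawler()
tables: list[Table] = crawler.snapshot()  # the type checker now knows these are Table rows
print(tables)
```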
@@ -0,0 +1,8 @@
## Code compatibility problems

The tables below assist with verifying whether workflows and dashboards are Unity Catalog compatible. They can be filtered on path,
problem code and workflow name.
Each row:
- Points to a problem detected in the code using the code path, query or workflow & task reference and start/end line & column;
- Explains the problem with a human-readable message and a code.

@@ -0,0 +1,35 @@
/*
--title 'Workflow migration problems'
--width 6
--overrides '{"spec":{
"encodings":{
"columns": [
{"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"},
{"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"},
{"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"},
{"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"},
{"fieldName": "task_key", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/jobs/{{ workflow_id }}/tasks/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "task_key"},
{"fieldName": "start_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_line"},
{"fieldName": "start_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_col"},
{"fieldName": "end_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_line"},
{"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"}
]},
"invisibleColumns": [
{"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"},
{"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"}
]
}}'
*/
SELECT
substring_index(path, '@databricks.com/', -1) as path,
path as link,
code,
message,
job_id AS workflow_id,
job_name AS workflow_name,
task_key,
start_line,
start_col,
end_line,
end_col
FROM inventory.workflow_problems
@@ -0,0 +1,29 @@
/*
--title 'Dashboard compatibility problems'
--width 6
--overrides '{"spec":{
"encodings":{
"columns": [
{"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"},
{"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"},
{"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"},
{"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"}
]},
"invisibleColumns": [
{"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"},
{"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"},
{"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"},
{"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"}
]
}}'
*/
SELECT
dashboard_id,
dashboard_parent,
dashboard_name,
query_id,
query_parent,
query_name,
code,
message
FROM inventory.query_problems
@@ -0,0 +1,14 @@
---
height: 4
---

# Direct filesystem access problems

The table below assists with verifying whether workflows and dashboards require direct filesystem access.
As a reminder, `dbfs:/` is not supported in Unity Catalog, and more generally direct filesystem access is discouraged.
Rather, data should be accessed via Unity Catalog tables (see the illustration after this list).

Each row:
- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column;
- Provides the _lineage_, i.e. which `workflow -> task -> notebook...` execution sequence leads to that access.
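As an illustration of what gets flagged versus the Unity Catalog-friendly alternative, using PySpark with made-up locations and table names:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # provided as `spark` in Databricks notebooks

# Flagged: direct filesystem access through a dbfs:/ path (made-up location).
events = spark.read.format("delta").load("dbfs:/mnt/raw/events")

# Preferred: the same data exposed as a governed Unity Catalog table (made-up name).
events = spark.read.table("main.raw.events")
```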

