Skip to content

Commit

Permalink
Added initial version of databricks labs ucx migrate-local-code com…
Browse files Browse the repository at this point in the history
…mand (#1067)

The `databricks labs ucx migrate-local-code` command has been added to
facilitate migration of local code to a Databricks environment. This
initial version of the command is highly experimental, with support for
migrating Python and SQL files only. The `.gitignore` file has been
updated to exclude output files and specific configuration files from
being committed to the repository. This command aims to help users and
administrators manage code migration and maintain consistency across
workspaces, while also enhancing the compatibility of local code with
the Unity Catalog, a part of Databricks' offerings for data and AI.
  • Loading branch information
nfx authored Mar 21, 2024
1 parent 40b454c commit 27500c2
Show file tree
Hide file tree
Showing 23 changed files with 1,103 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,5 @@ dev/cleanup.py

.python-version
.databricks-login.json
*.out
*.out
foo
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
* [`create-catalogs-schemas` command](#create-catalogs-schemas-command)
* [`move` command](#move-command)
* [`alias` command](#alias-command)
* [Code migration commands](#code-migration-commands)
* [`migrate-local-code` command](#migrate-local-code-command)
* [Cross-workspace installations](#cross-workspace-installations)
* [`sync-workspace-info` command](#sync-workspace-info-command)
* [`manual-workspace-info` command](#manual-workspace-info-command)
Expand Down Expand Up @@ -625,6 +627,23 @@ It can also be used to debug issues related to table aliasing.

[[back to top](#databricks-labs-ucx)]

# Code migration commands

[[back to top](#databricks-labs-ucx)]

## `migrate-local-code` command

```text
databricks labs ucx migrate-local-code
```

**(Experimental)** Once [table migration](#table-migration-commands) is complete, you can run this command to
migrate all Python and SQL files in the current working directory. This command is highly experimental: at
the moment it only supports Python and SQL files, and it discards code comments and formatting during
the automated transformation process.

[[back to top](#databricks-labs-ucx)]

# Cross-workspace installations

When installing UCX across multiple workspaces, administrators need to keep UCX configurations in sync.
Expand Down
3 changes: 3 additions & 0 deletions labs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,6 @@ commands:

- name: revert-cluster-remap
description: Reverting the Re-mapping of the cluster from UC

- name: migrate-local-code
description: (Experimental) Migrate files in the current directory to be more compatible with Unity Catalog.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,12 @@ branch = true
parallel = true

[tool.coverage.report]
omit = ["src/databricks/labs/ucx/mixins/*", "*/working-copy/*", "*/fresh_wheel_file/*"]
omit = [
"src/databricks/labs/ucx/mixins/*",
"src/databricks/labs/ucx/code/lsp.py",
"*/working-copy/*",
"*/fresh_wheel_file/*"
]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
Expand Down
12 changes: 12 additions & 0 deletions src/databricks/labs/ucx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import shutil
import webbrowser
from collections.abc import Callable
from pathlib import Path

from databricks.labs.blueprint.cli import App
from databricks.labs.blueprint.entrypoint import get_logger
Expand All @@ -19,6 +20,7 @@
from databricks.labs.ucx.azure.access import AzureResourcePermissions
from databricks.labs.ucx.azure.credentials import ServicePrincipalMigration
from databricks.labs.ucx.azure.locations import ExternalLocationsMigration
from databricks.labs.ucx.code.files import Files
from databricks.labs.ucx.config import WorkspaceConfig
from databricks.labs.ucx.hive_metastore import ExternalLocations, TablesCrawler
from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema
Expand Down Expand Up @@ -547,5 +549,15 @@ def revert_cluster_remap(w: WorkspaceClient, prompts: Prompts):
cluster_details.revert_cluster_remap(cluster_list, cluster_ids)


@ucx.command
def migrate_local_code(w: WorkspaceClient, prompts: Prompts):
    """Fix the code files based on their language.

    Walks the current working directory and applies UC-migration fixers to all
    supported files (Python and SQL).
    """
    # Ask for confirmation before doing any work: Files.for_cli() builds the
    # table-migration index, which is expensive, so don't pay for it until the
    # user has agreed to proceed.
    if not prompts.confirm("Do you want to apply UC migration to all files in the current directory?"):
        return
    files = Files.for_cli(w)
    files.apply(Path.cwd())


# Entry point when this module is executed directly (e.g. during development);
# normally the CLI is invoked via `databricks labs ucx ...`.
if __name__ == "__main__":
    ucx()
Empty file.
91 changes: 91 additions & 0 deletions src/databricks/labs/ucx/code/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable
from dataclasses import dataclass

# Code mapping between LSP, PyLint, and our own diagnostics:
# | LSP | PyLint | Our |
# |---------------------------|------------|----------------|
# | Severity.ERROR | Error | Failure() |
# | Severity.WARN | Warning | Advisory() |
# | DiagnosticTag.DEPRECATED | Warning | Deprecation() |
# | Severity.INFO | Info | Advice() |
# | Severity.HINT | Convention | Convention() |
# | DiagnosticTag.UNNECESSARY | Refactor | Convention() |


@dataclass
class Advice:
code: str
message: str
start_line: int
start_col: int
end_line: int
end_col: int

def replace(
self,
code: str | None = None,
message: str | None = None,
start_line: int | None = None,
start_col: int | None = None,
end_line: int | None = None,
end_col: int | None = None,
) -> 'Advice':
return self.__class__(
code=code if code is not None else self.code,
message=message if message is not None else self.message,
start_line=start_line if start_line is not None else self.start_line,
start_col=start_col if start_col is not None else self.start_col,
end_line=end_line if end_line is not None else self.end_line,
end_col=end_col if end_col is not None else self.end_col,
)

def as_advisory(self) -> 'Advisory':
return Advisory(**self.__dict__)

def as_failure(self) -> 'Failure':
return Failure(**self.__dict__)

def as_deprecation(self) -> 'Deprecation':
return Deprecation(**self.__dict__)

def as_convention(self) -> 'Convention':
return Convention(**self.__dict__)


class Advisory(Advice):
    """A warning that does not prevent the code from running.

    Maps to LSP Severity.WARN and a PyLint Warning (see the mapping table in
    this module).
    """


class Failure(Advisory):
    """An error that prevents the code from running.

    Maps to LSP Severity.ERROR and a PyLint Error. Note that it subclasses
    Advisory, so `isinstance(x, Advisory)` also matches failures.
    """


class Deprecation(Advisory):
    """An advisory that suggests to replace the code with a newer version.

    Maps to LSP DiagnosticTag.DEPRECATED and a PyLint Warning (see the mapping
    table in this module).
    """


class Convention(Advice):
    """A suggestion for a better way to write the code.

    Maps to LSP Severity.HINT / DiagnosticTag.UNNECESSARY and to PyLint
    Convention / Refactor (see the mapping table in this module).
    """


class Linter(ABC):
    """Interface for language-specific linters that inspect code and yield advice.

    Inherits ABC so that @abstractmethod is actually enforced: without the
    ABCMeta metaclass, Linter() was instantiable and lint() silently returned
    None instead of failing fast.
    """

    @abstractmethod
    def lint(self, code: str) -> Iterable['Advice']: ...


class Fixer(ABC):
    """Interface for fixers that rewrite code to resolve a specific diagnostic.

    Inherits ABC so that @abstractmethod is actually enforced: without the
    ABCMeta metaclass, Fixer() was instantiable despite having unimplemented
    abstract methods.
    """

    @abstractmethod
    def name(self) -> str:
        """The diagnostic code this fixer handles (matched against Advice.code)."""
        ...

    @abstractmethod
    def apply(self, code: str) -> str:
        """Return the transformed code."""
        ...


class SequentialLinter(Linter):
    """Composes several linters and yields their advice in registration order."""

    def __init__(self, linters: list[Linter]):
        self._linters = linters

    def lint(self, code: str) -> Iterable[Advice]:
        # Lazily chain each delegate's findings, in the order they were registered.
        return (advice for delegate in self._linters for advice in delegate.lint(code))
68 changes: 68 additions & 0 deletions src/databricks/labs/ucx/code/files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import logging
from pathlib import Path

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.code.languages import Languages
from databricks.labs.ucx.hive_metastore.table_migrate import TablesMigrate

logger = logging.getLogger(__name__)


class Files:
    """The Files class is responsible for fixing code files based on their language."""

    def __init__(self, languages: Languages):
        self._languages = languages
        # Supported file extensions mapped to their workspace language.
        self._extensions = {".py": Language.PYTHON, ".sql": Language.SQL}

    @classmethod
    def for_cli(cls, ws: WorkspaceClient):
        """Build a Files instance wired to the workspace's table-migration index."""
        tables_migrate = TablesMigrate.for_cli(ws)
        index = tables_migrate.index()
        languages = Languages(index)
        return cls(languages)

    def apply(self, path: Path) -> bool:
        """Recursively fix all supported files under *path*.

        Returns True for directories; for a single file, returns whether any
        fix was applied to it.
        """
        if path.is_dir():
            for child in path.iterdir():
                self.apply(child)
            return True
        return self._apply_file_fix(path)

    def _apply_file_fix(self, path: Path) -> bool:
        """
        The fix method reads a file, lints it, applies fixes, and writes the fixed code back to the file.
        """
        # Single lookup covers both "unknown extension" and "no language mapped";
        # the original did a membership test, a second lookup, and a dead
        # truthiness check.
        language = self._extensions.get(path.suffix)
        if not language:
            return False
        logger.info(f"Analysing {path}")
        linter = self._languages.linter(language)
        # Read/write as UTF-8 explicitly so results don't depend on the
        # platform's locale-default encoding.
        code = path.read_text(encoding="utf-8")
        applied = False
        # Lint the code and apply every fixer that matches a diagnostic code.
        for advice in linter.lint(code):
            logger.info(f"Found: {advice}")
            fixer = self._languages.fixer(language, advice.code)
            if not fixer:
                continue
            logger.info(f"Applying fix for {advice}")
            code = fixer.apply(code)
            applied = True
        if not applied:
            return False
        logger.info(f"Overwriting {path}")
        path.write_text(code, encoding="utf-8")
        return True
44 changes: 44 additions & 0 deletions src/databricks/labs/ucx/code/languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.code.base import Fixer, Linter, SequentialLinter
from databricks.labs.ucx.code.pyspark import SparkSql
from databricks.labs.ucx.code.queries import FromTable
from databricks.labs.ucx.hive_metastore.table_migrate import Index


class Languages:
    """Per-language registry of linters and fixers, backed by the table-migration index."""

    def __init__(self, index: Index):
        self._index = index
        from_table = FromTable(index)
        # Share one SparkSql instance between linting and fixing; the original
        # constructed two identical instances for no benefit.
        spark_sql = SparkSql(from_table)
        self._linters = {
            Language.PYTHON: SequentialLinter([spark_sql]),
            Language.SQL: SequentialLinter([from_table]),
        }
        self._fixers: dict[Language, list[Fixer]] = {
            Language.PYTHON: [spark_sql],
            Language.SQL: [from_table],
        }

    def is_supported(self, language: Language) -> bool:
        """Whether both a linter and fixers are registered for *language*."""
        return language in self._linters and language in self._fixers

    def linter(self, language: Language) -> Linter:
        """Return the linter for *language*; raises ValueError if unsupported."""
        if language not in self._linters:
            raise ValueError(f"Unsupported language: {language}")
        return self._linters[language]

    def fixer(self, language: Language, diagnostic_code: str) -> Fixer | None:
        """Return the fixer whose name matches *diagnostic_code*, or None if there is none."""
        if language not in self._fixers:
            return None
        for fixer in self._fixers[language]:
            if fixer.name() == diagnostic_code:
                return fixer
        return None

    def apply_fixes(self, language: Language, code: str) -> str:
        """Lint *code* and apply every matching fixer, returning the rewritten code."""
        linter = self.linter(language)
        for advice in linter.lint(code):
            fixer = self.fixer(language, advice.code)
            if fixer:
                code = fixer.apply(code)
        return code
Loading

0 comments on commit 27500c2

Please sign in to comment.