diff --git a/README.md b/README.md index 83ac7d3..bc70ac2 100644 --- a/README.md +++ b/README.md @@ -933,6 +933,33 @@ This will print something like: You can also do `wheels.upload_to_dbfs()`, though you're not able to set any access control over it. +### Publishing upstream dependencies to Databricks Workspace without Public Internet access + +Python wheel may have dependencies that are not included in the wheel itself. These dependencies are usually other Python packages that your wheel relies on. During installation on regular Databricks Workspaces, these dependencies get automatically fetched from [Python Package Index](https://pypi.org/). + +Some Databricks Workspaces are configured with extra layers of network security, that block all access to Public Internet, including [Python Package Index](https://pypi.org/). To ensure installations working on these kinds of workspaces, developers need to explicitly upload all upstream dependencies for their applications to work correctly. + +The `upload_wheel_dependencies(prefixes)` method can be used to upload these dependencies to Databricks Workspace. This method takes a list of prefixes as an argument. It will upload all the dependencies of the wheel that have names starting with any of the provided prefixes. + +Here is an example of how you can use this method: + +```python +from databricks.sdk import WorkspaceClient +from databricks.labs.blueprint.wheels import ProductInfo + +ws = WorkspaceClient() +product_info = ProductInfo(__file__) +installation = product_info.current_installation(ws) + +with product_info.wheels(ws) as wheels: + wheel_paths = wheels.upload_wheel_dependencies(['databricks_sdk', 'pandas']) + for path in wheel_paths: + print(f'Uploaded dependency to {path}') +``` + +In this example, the `upload_wheel_dependencies(['databricks_sdk', 'pandas'])` call will upload all the dependencies of the wheel that have names starting with 'databricks_sdk' or 'pandas'. 
This method excludes any platform-specific dependencies (i.e. wheels whose names do not end with `-none-any.whl`). Also, the main wheel file itself is not uploaded. The method returns a list of paths to the uploaded dependencies on WorkspaceFS.
Use it as a context manager.""" self._tmp_dir = tempfile.TemporaryDirectory() - self._local_wheel = self._build_wheel(self._tmp_dir.name, verbose=self._verbose) + self._local_wheel = next(self._build_wheel(self._tmp_dir.name, verbose=self._verbose, no_deps=True)) return self def __exit__(self, __exc_type, __exc_value, __traceback): """Cleans up the temporary directory. Use it as a context manager.""" self._tmp_dir.cleanup() - def _build_wheel(self, tmp_dir: str, *, verbose: bool = False): + def _build_wheel(self, tmp_dir: str, *, verbose: bool = False, no_deps: bool = True, dirs_exist_ok: bool = False): """Helper to build the wheel package :param tmp_dir: str: :param *: :param verbose: bool: (Default value = False) - + :param no_deps: bool: (Default value = True) + :param dirs_exist_ok: bool: (Default value = False) """ stdout = subprocess.STDOUT stderr = subprocess.STDOUT @@ -263,18 +282,20 @@ def _build_wheel(self, tmp_dir: str, *, verbose: bool = False): checkout_root = self._product_info.checkout_root() if self._product_info.is_git_checkout() and self._product_info.is_unreleased_version(): # working copy becomes project root for building a wheel - checkout_root = self._copy_root_to(tmp_dir) + checkout_root = self._copy_root_to(tmp_dir, dirs_exist_ok) # and override the version file self._override_version_to_unreleased(checkout_root) + args = [sys.executable, "-m", "pip", "wheel", "--wheel-dir", tmp_dir, checkout_root.as_posix()] logger.debug(f"Building wheel for {checkout_root} in {tmp_dir}") + if no_deps: + args.append("--no-deps") subprocess.run( - [sys.executable, "-m", "pip", "wheel", "--no-deps", "--wheel-dir", tmp_dir, checkout_root.as_posix()], + args, check=True, stdout=stdout, stderr=stderr, ) - # get wheel name as first file in the temp directory - return next(Path(tmp_dir).glob("*.whl")) + return Path(tmp_dir).glob("*.whl") def _override_version_to_unreleased(self, tmp_dir_path: Path): """Overrides the version file to unreleased version.""" @@ -284,7 
+305,7 @@ def _override_version_to_unreleased(self, tmp_dir_path: Path): with version_file.open("w") as f: f.write(f'__version__ = "{self._product_info.version()}"') - def _copy_root_to(self, tmp_dir: str | Path): + def _copy_root_to(self, tmp_dir: str | Path, dirs_exist_ok: bool = False): """Copies the root to a temporary directory.""" checkout_root = self._product_info.checkout_root() tmp_dir_path = Path(tmp_dir) / "working-copy" @@ -299,7 +320,7 @@ def copy_ignore(_, names: list[str]): ignored_names.append(name) return ignored_names - shutil.copytree(checkout_root, tmp_dir_path, ignore=copy_ignore) + shutil.copytree(checkout_root, tmp_dir_path, ignore=copy_ignore, dirs_exist_ok=dirs_exist_ok) return tmp_dir_path diff --git a/tests/integration/test_wheels.py b/tests/integration/test_wheels.py index 518ce68..aa59da5 100644 --- a/tests/integration/test_wheels.py +++ b/tests/integration/test_wheels.py @@ -17,3 +17,6 @@ def test_upload_dbfs(ws, new_installation): with WheelsV2(new_installation, product_info) as whl: remote_wheel = whl.upload_to_dbfs() ws.dbfs.get_status(remote_wheel) + + +# TODO: to add an integration test for upload_wheel_dependencies (currently getting an access issue to the test environment) diff --git a/tests/unit/test_wheels.py b/tests/unit/test_wheels.py index cf5b4b0..5ba6db6 100644 --- a/tests/unit/test_wheels.py +++ b/tests/unit/test_wheels.py @@ -39,6 +39,17 @@ def test_build_and_upload_wheel(): assert not os.path.exists(wheels._local_wheel) +def test_build_and_dependencies_upload_wheel(): + installation = MockInstallation() + product_info = ProductInfo.from_class(MockInstallation) + + wheels = WheelsV2(installation, product_info) + with wheels: + wheel_paths = wheels.upload_wheel_dependencies(["databricks_sdk"]) + assert len(wheel_paths) == 1 + installation.assert_file_uploaded(re.compile("wheels/databricks_sdk-*")) + + def test_unreleased_version(tmp_path): if not is_in_debug(): pytest.skip("fails without `git fetch --prune --unshallow` 
configured")