diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3ce8438c638..d1782711ed8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -22,6 +22,7 @@ sdk/python/foundation-models/cohere/command_faiss_langchain.ipynb @stewart-co @k sdk/python/foundation-models/cohere/command_tools-langchain.ipynb @stewart-co @kseniia-cohere /sdk/python/foundation-models/nixtla/ @AzulGarza /sdk/python/foundation-models/healthcare-ai/ @jmerkow @ivantarapov +/sdk/python/assets/data/versioning.ipynb @ShakutaiGit #### files referenced in docs (DO NOT EDIT, except for Docs team!!!) ############################################################################################# /cli/assets/component/train.yml @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1 diff --git a/.github/workflows/sdk-assets-data-versioning.yml b/.github/workflows/sdk-assets-data-versioning.yml new file mode 100644 index 00000000000..05cb87bd8ac --- /dev/null +++ b/.github/workflows/sdk-assets-data-versioning.yml @@ -0,0 +1,94 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-assets-data-versioning +# This file is created by sdk/python/readme.py. +# Please do not edit directly. +on: + workflow_dispatch: + schedule: + - cron: "48 11/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/assets/data/** + - .github/workflows/sdk-assets-data-versioning.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh + +permissions: + id-token: write +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.10" + - name: pip install notebook reqs + run: pip install --no-cache-dir -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} + subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: validate readme + run: | + python check-readme.py "${{ github.workspace }}/sdk/python/assets/data" + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: Eagerly cache access tokens for required scopes + run: | + # Workaround for azure-cli's lack of support for ID token refresh + # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617 + + # Management + az account get-access-token --scope https://management.azure.com/.default --output none + # ML + az account get-access-token --scope https://ml.azure.com/.default --output none + - name: run assets/data/versioning.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "versioning.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python versioning.ipynb versioning.output.ipynb + working-directory: sdk/python/assets/data + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: ./.github/actions/upload-artifact + with: + name: versioning + path: sdk/python/assets/data diff --git a/sdk/python/README.md b/sdk/python/README.md index cb853bf96ac..a0e3801bc56 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -39,6 +39,7 @@ Test Status is for branch - **_main_** |assets|assets-in-registry|[share-models-components-environments](assets/assets-in-registry/share-models-components-environments.ipynb)|*no description*|[![share-models-components-environments](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-assets-in-registry-share-models-components-environments.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-assets-in-registry-share-models-components-environments.yml)| |assets|component|[component](assets/component/component.ipynb)|Create a component asset|[![component](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-component-component.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-component-component.yml)| |assets|data|[data](assets/data/data.ipynb)|Read, write and register a data asset|[![data](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-data.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-data.yml)| +|assets|data|[versioning](assets/data/versioning.ipynb)|Compute and check dataset hash in Azure ML; register if unique for efficient versioning.|[![versioning](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-versioning.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-versioning.yml)| |assets|data|[working_with_mltable](assets/data/working_with_mltable.ipynb)|Read, write and register a data asset|[![working_with_mltable](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-working_with_mltable.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-data-working_with_mltable.yml)| |assets|environment|[environment](assets/environment/environment.ipynb)|Create custom environments from docker and/or conda YAML|[![environment](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-environment-environment.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-environment-environment.yml)| |assets|model|[model](assets/model/model.ipynb)|Create model from local files, cloud files, Runs|[![model](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-model-model.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-assets-model-model.yml)| diff --git a/sdk/python/assets/data/data.ipynb b/sdk/python/assets/data/data.ipynb index 4810983dc37..f2fe40cc0ef 100644 --- a/sdk/python/assets/data/data.ipynb +++ b/sdk/python/assets/data/data.ipynb @@ -522,7 +522,7 @@ " RecurrenceTrigger,\n", " RecurrencePattern,\n", ")\n", - "from datetime import datetime\n", + "from datetime import datetime, timedelta\n", "\n", "source = Database(connection=\"azureml:my_sf_connection\", query=\"select * from my_table\")\n", "\n", @@ -579,11 +579,13 @@ ")\n", "\n", "schedule_name = \"my_simple_sdk_create_schedule_cron\"\n", + "start_time = datetime.utcnow()\n", + "end_time = start_time + timedelta(days=7) # Set end_time to 7 days later\n", "\n", "cron_trigger = CronTrigger(\n", " expression=\"15 10 * * 1\",\n", - " start_time=datetime.utcnow(),\n", - " end_time=\"2024-12-03T18:40:00\",\n", + " start_time=start_time,\n", + " end_time=end_time,\n", ")\n", "import_schedule = ImportDataSchedule(\n", " name=schedule_name, trigger=cron_trigger, import_data=my_data\n", diff --git a/sdk/python/assets/data/versioning.ipynb b/sdk/python/assets/data/versioning.ipynb new file mode 100644 index 00000000000..e3acb0ce0ea --- /dev/null +++ b/sdk/python/assets/data/versioning.ipynb @@ -0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset Versioning in Azure Machine Learning\n", + "\n", + "In this notebook, you will:\n", + "1. Compute a hash for a dataset.\n", + "2. Check if a dataset with the same hash already exists in Azure ML.\n", + "3. If not, upload the dataset to Azure Blob Storage and register it as an asset with a hash tag.\n", + "4. If it exists, retrieve the asset name, version, and tag.\n", + "\n", + "> **Note**: Ensure you update the configuration values before running the notebook. \n", + "> **Note**: If you encounter any issues, refer to the troubleshooting section at the end.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install azure-ai-ml\n", + "!pip install azure-identity\n", + "!pip install azure-storage-blob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.ai.ml.entities import Data\n", + "from azure.ai.ml.constants import AssetTypes\n", + "import hashlib\n", + "import os\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.storage.blob import BlobServiceClient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"\n", + "\n", + "asset_name = \"asset_name\" # Replace with the name you want to give your asset.\n", + "asset_description = \"asset_description\" # Provide a description of your asset.\n", + "asset_path = os.path.abspath(\n", + " \"./sample_data/\"\n", + ") # Provide the absolute path to your local DATA FOLDERS.\n", + "asset_type = \"dataset\"\n", + "\n", + "enable_blob_upload = (\n", + " False # Set to True if you want to upload the asset to Azure Blob Storage.\n", + ")\n", + "azure_storage_account_name = \"azure_storage_account_name\"\n", + "container_name = \"container_name\"\n", + "assets_folder = \"assets_folder\" # Provide a unique folder path to store the assets in Azure Blob Storage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to Azure Machine Learning Workspace\n", + "\n", + "Connect to the Azure ML workspace using `MLClient`. Ensure the configuration values in the previous cell are correct before running this code.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the Azure ML client\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(),\n", + " subscription_id,\n", + " resource_group,\n", + " workspace,\n", + ")\n", + "\n", + "if enable_blob_upload:\n", + " # Initialize the Azure Blob client\n", + " blob_client = BlobServiceClient(\n", + " account_url=f\"https://{azure_storage_account_name}.blob.core.windows.net\",\n", + " credential=DefaultAzureCredential(),\n", + " ).get_container_client(container_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Hash for Dataset\n", + "\n", + "Compute a hash for the dataset to identify its uniqueness. This will help in checking if the dataset already exists in Azure ML.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the hash of the asset folder\n", + "hash_algo = hashlib.sha256()\n", + "for root, _, files in os.walk(asset_path):\n", + " for file in sorted(files): # Sort files for consistent hash\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"rb\") as f:\n", + " for chunk in iter(lambda: f.read(4096), b\"\"):\n", + " hash_algo.update(chunk)\n", + "asset_hash = hash_algo.hexdigest()\n", + "print(f\"Computed hash: {asset_hash}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check for Existing Asset in Azure ML\n", + "\n", + "Check if a dataset with the same hash already exists in the Azure ML workspace.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check if an asset with this hash already exists in Azure ML\n", + "asset_exists = False\n", + "existing_asset_info = None\n", + "for asset in ml_client.data.list():\n", + " for asset_version_info in ml_client.data.list(name=asset.name):\n", + " if asset_version_info.tags.get(\"hash\") == asset_hash:\n", + " asset_exists = True\n", + " existing_asset_info = {\n", + " \"asset_name\": asset_version_info.name,\n", + " \"asset_version\": asset_version_info.version,\n", + " }\n", + " break\n", + " if asset_exists:\n", + " break\n", + "\n", + "if asset_exists:\n", + " print(f\"Asset with hash {asset_hash} already exists in the workspace.\")\n", + " print(\n", + " f\"Asset name: {existing_asset_info['asset_name']}, version: {existing_asset_info['asset_version']}\"\n", + " )\n", + "else:\n", + " print(\n", + " \"No existing asset found with the same hash. Uploading and registering the asset.\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload to Azure Blob Storage (If Not Exists)\n", + "\n", + "If the dataset doesn't already exist in Azure ML, upload it to Azure Blob Storage.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not asset_exists:\n", + " # Determine the latest version number\n", + " try:\n", + " latest = ml_client.data._get_latest_version(asset_name)\n", + " latest_version = str(int(latest.version) + 1) if latest else \"1\"\n", + " except Exception as e:\n", + " print(f\"Error getting latest version: {e}, setting it to 1.\")\n", + " latest_version = \"1\"\n", + "\n", + " # Upload files to Azure Blob Storage\n", + " unique_folder_path = f\"{asset_name}_{latest_version}\"\n", + " print(f\"Uploading files from {asset_path} to {unique_folder_path}\")\n", + " if enable_blob_upload:\n", + " for root, _, files in os.walk(asset_path):\n", + " for file_name in files:\n", + " file_path = os.path.join(root, file_name)\n", + " blob_path = os.path.join(\n", + " unique_folder_path, os.path.relpath(file_path, asset_path)\n", + " )\n", + " blob_client_instance = blob_client.get_blob_client(blob_path)\n", + " with open(file_path, \"rb\") as data:\n", + " blob_client_instance.upload_blob(data, overwrite=True)\n", + " print(f\"Uploaded {file_path} to {blob_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Register the Dataset in Azure ML\n", + "\n", + "After uploading to Blob Storage, register the dataset as an asset in Azure ML.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not asset_exists:\n", + " # Register the asset in Azure ML\n", + " blob_url = f\"https://{azure_storage_account_name}.blob.core.windows.net/{container_name}/{unique_folder_path}\"\n", + " data_asset = Data(\n", + " path=blob_url,\n", + " type=AssetTypes.URI_FOLDER,\n", + " description=asset_description,\n", + " name=asset_name,\n", + " tags={\"hash\": asset_hash}, # Tagging the asset with the computed hash\n", + " )\n", + "\n", + " registered_asset = ml_client.data.create_or_update(data_asset)\n", + " print(\n", + " f\"New {asset_type} registered in the workspace: {asset_name} with version {registered_asset.version}\"\n", + " )\n", + " existing_asset_info = {\n", + " f\"asset_name\": registered_asset.name,\n", + " f\"asset_version\": registered_asset.version,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results\n", + "\n", + "If the asset was uploaded and registered, or if it already existed, the asset name and version are displayed below.\n", + "\n", + "**You can use these results to add a tag to the AML job, creating a link to the dataset used.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if existing_asset_info:\n", + " print(\n", + " f\"Asset name: {existing_asset_info['asset_name']}, version: {existing_asset_info['asset_version']} found\"\n", + " )\n", + "else:\n", + " print(\"No action taken.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Troubleshooting\n", + "\n", + "1. **Permission Issues:** \n", + " - Ensure that your Azure Active Directory (AAD) account has the appropriate permissions:\n", + " - **Azure Machine Learning:** 'Contributor' or 'Owner' role for the resource group containing the Azure ML workspace.\n", + " - **Storage Account:** 'Storage Blob Data Contributor' role for the specific storage account used in your operations.\n", + "\n", + "2. **Path Error:** \n", + " - The dataset path (`asset_path`) must be an absolute path. Modify this path based on your environment to point to the correct location of the data files.\n", + "\n", + "3. **Asset Registration Issues:** \n", + " - If you encounter an error stating that an asset already exists, ensure that you have a unique asset name or update the existing asset version if necessary.\n", + "\n", + "4. **Tagging:** \n", + " - Each dataset is tagged with its computed hash. This tag is used to verify the uniqueness of the dataset in Azure ML. \n", + " Make sure to include the hash tag when registering a new dataset.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}