From f70cd95035831380e05f031f3688b4fe09ba794e Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Fri, 28 Oct 2022 11:31:24 -0700 Subject: [PATCH 01/18] Fix xdist test error. Also make a small cleanup some codes Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/client.py | 5 ++-- .../udf/_preprocessing_pyudf_manager.py | 1 + feathr_project/test/unit/utils/test_config.py | 26 ++++++++++++++----- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index dd39a70fa..c7dcd010d 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -415,7 +415,6 @@ def get_offline_features(self, output_path: Union[str, Sink], execution_configurations: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, config_file_name:str = "feature_join_conf/feature_join.conf", - udf_files = None, verbose: bool = False ): """ @@ -609,7 +608,7 @@ def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}") return False return True - + def materialize_features(self, settings: MaterializationSettings, execution_configurations: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, verbose: bool = False, allow_materialize_non_agg_feature: bool = False): """Materialize feature data @@ -621,7 +620,7 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf feature_list = settings.feature_names if len(feature_list) > 0 and not self._valid_materialize_keys(feature_list): raise RuntimeError(f"Invalid materialization features: {feature_list}, since they have different keys. Currently Feathr only supports materializing features of the same keys.") - + if not allow_materialize_non_agg_feature: # Check if there are non-aggregation features in the list for fn in feature_list: diff --git a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py index 55756ba3d..c4f102566 100644 --- a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py +++ b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py @@ -176,6 +176,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir): for feature_name in feature_names: if feature_name in features_with_preprocessing: has_py_udf_preprocessing = True + break if has_py_udf_preprocessing: pyspark_driver_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_DRIVER_FILE_NAME) diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 502a3a01d..a53e1764a 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -6,17 +6,27 @@ from feathr.utils.config import FEATHR_CONFIG_TEMPLATE, generate_config -@pytest.mark.parametrize( - "output_filepath", [None, NamedTemporaryFile().name], -) -def test__generate_config(output_filepath: str): - - config = FEATHR_CONFIG_TEMPLATE.format( +@pytest.fixture(scope="session") +def feathr_config_str() -> str: + return FEATHR_CONFIG_TEMPLATE.format( resource_prefix="test_prefix", project_name="test_project", spark_cluster="local", ) + +@pytest.mark.parametrize( + "output_filepath", [None, "config.yml"], +) +def test__generate_config( + output_filepath: str, + feathr_config_str: str, + tmp_path: Path, +): + # Use tmp_path so that the test files get cleaned up after 
the tests + if output_filepath: + output_filepath = str(tmp_path / output_filepath) + config_filepath = generate_config( resource_prefix="test_prefix", project_name="test_project", @@ -24,8 +34,10 @@ def test__generate_config(output_filepath: str): output_filepath=output_filepath, ) + # Assert if the config file was generated in the specified output path. if output_filepath: assert output_filepath == config_filepath + # Assert the generated config string is correct. with open(config_filepath, "r") as f: - assert config == f.read() + assert feathr_config_str == f.read() From 990208b066c4b5eefca9485f21f4558f09d0b902 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Mon, 31 Oct 2022 22:28:24 +0000 Subject: [PATCH 02/18] Revert "Revert 756 (#798)" This reverts commit ff438f5ed2ec11271dac8121f0f4071be0d5a279. --- docs/dev_guide/new_contributor_guide.md | 6 +- docs/quickstart_databricks.md | 6 +- docs/quickstart_synapse.md | 2 +- docs/samples/nyc_taxi_demo.ipynb | 1110 +++++++++++++++++ feathr_project/feathr/client.py | 2 +- feathr_project/feathr/datasets/__init__.py | 9 + feathr_project/feathr/datasets/constants.py | 3 + feathr_project/feathr/datasets/nyc_taxi.py | 87 ++ feathr_project/feathr/datasets/utils.py | 64 + .../spark_provider/_databricks_submission.py | 181 +-- feathr_project/feathr/utils/config.py | 61 + feathr_project/feathr/utils/job_utils.py | 218 +++- feathr_project/feathr/utils/platform.py | 45 + .../demo_data/green_tripdata_2020-04.csv | 14 - .../product_detail_mock_data.csv | 11 - .../user_observation_mock_data.csv | 35 - .../user_profile_mock_data.csv | 11 - .../user_purchase_history_mock_data.csv | 31 - .../nyc_driver_demo.ipynb | 720 ----------- feathr_project/setup.py | 3 +- feathr_project/test/samples/test_notebooks.py | 56 + .../test/unit/datasets/test_dataset_utils.py | 53 + .../test/unit/datasets/test_datasets.py | 106 ++ feathr_project/test/unit/utils/test_config.py | 31 + 24 files changed, 1907 insertions(+), 958 deletions(-) create mode 100644 docs/samples/nyc_taxi_demo.ipynb create mode 100644 feathr_project/feathr/datasets/__init__.py create mode 100644 feathr_project/feathr/datasets/constants.py create mode 100644 feathr_project/feathr/datasets/nyc_taxi.py create mode 100644 feathr_project/feathr/datasets/utils.py create mode 100644 feathr_project/feathr/utils/config.py create mode 100644 feathr_project/feathr/utils/platform.py delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb create mode 100644 
feathr_project/test/samples/test_notebooks.py create mode 100644 feathr_project/test/unit/datasets/test_dataset_utils.py create mode 100644 feathr_project/test/unit/datasets/test_datasets.py create mode 100644 feathr_project/test/unit/utils/test_config.py diff --git a/docs/dev_guide/new_contributor_guide.md b/docs/dev_guide/new_contributor_guide.md index 1856ffd84..223b7d91b 100644 --- a/docs/dev_guide/new_contributor_guide.md +++ b/docs/dev_guide/new_contributor_guide.md @@ -6,11 +6,11 @@ parent: Feathr Developer Guides # What can I contribute? All forms of contributions are welcome, including and not limited to: -* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/feathr_project/feathrcli/data/feathr_user_workspace) +* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/docs/samples) * Add tutorial, blog posts, tech talks etc * Increase media coverage and exposure * Improve user-facing documentation or developer-facing documentation -* Add testing code +* Add testing code * Add new features * Refactor and improve architecture * For any other forms of contribution and collaboration, don't hesitate to reach out to us. @@ -18,7 +18,7 @@ All forms of contributions are welcome, including and not limited to: # I am interested, how can I start? If you are new to this project, we recommend start with [`good-first-issue`](https://github.com/feathr-ai/feathr/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). -The issues are also labled with what types of programming language the task need. +The issues are also labled with what types of programming language the task need. * [`good-first-issue` and `Python`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Apython) * [`good-first-issue` and `Scala`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ascala) * [`good-first-issue` and `Java`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ajava) diff --git a/docs/quickstart_databricks.md b/docs/quickstart_databricks.md index dff5b5f0f..30eaaa835 100644 --- a/docs/quickstart_databricks.md +++ b/docs/quickstart_databricks.md @@ -5,13 +5,13 @@ title: Quick Start Guide with Databricks # Feathr Quick Start Guide with Databricks -For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb) to your Databricks cluster and just run it in the Databricks cluster. It has been pre-configured to use the current Databricks cluster to submit jobs. +For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb) to your Databricks cluster and just run it in the Databricks cluster. It has been pre-configured to use the current Databricks cluster to submit jobs. 1. Import Notebooks in your Databricks cluster: ![Import Notebooks](./images/databricks_quickstart1.png) -2. Paste the [link to Databricks getting started notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb): +2. 
Paste the [link to Databricks getting started notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb): ![Import Notebooks](./images/databricks_quickstart2.png) @@ -21,7 +21,7 @@ For Databricks, you can simply upload [this notebook](./samples/databricks/datab Although Databricks Notebooks are great tools, there are also large developer communities that prefer the usage of Visual Studio Code, where [it has native support for Python and Jupyter Notebooks](https://code.visualstudio.com/docs/datascience/jupyter-notebooks) with many great features such as syntax highlight and IntelliSense. -In [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb), there are a few lines of code like this: +In [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb), there are a few lines of code like this: ```python # Get current databricks notebook context diff --git a/docs/quickstart_synapse.md b/docs/quickstart_synapse.md index d07198d92..c310dd789 100644 --- a/docs/quickstart_synapse.md +++ b/docs/quickstart_synapse.md @@ -24,7 +24,7 @@ Feathr has native cloud integration. Here are the steps to use Feathr on Azure: 1. Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. This allows you to quickly get started with automated deployment using Azure Resource Manager template. Alternatively, if you want to set up everything manually, you can checkout the [Feathr CLI deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) to run Feathr on Azure. This allows you to understand what is going on and set up one resource at a time. -2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=feathr_project%2Ffeathrcli%2Fdata%2Ffeathr_user_workspace%2Fnyc_driver_demo.ipynb). +2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=docs%2Fsamples%2Fnyc_taxi_demo.ipynb). 3. You only need to change the specified `Resource Prefix`. ## Step 2: Install Feathr diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb new file mode 100644 index 000000000..b789e9bf2 --- /dev/null +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -0,0 +1,1110 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Quick Start Notebook\n", + "\n", + "This notebook illustrates the use of Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", + "\n", + "The major problems Feathr solves are:\n", + "\n", + "1. Create, share and manage useful features from raw source data.\n", + "2. Provide Point-in-time feature join to create training dataset to ensure no data leakage.\n", + "3. Deploy the same feature data to online store to eliminate training and inference data skew." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "Feathr has native cloud integration. 
The first step is to provision the required cloud resources if you want to use Feathr.\n", + "\n", + "Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. This allows you to quickly get started with automated deployment using an Azure Resource Manager template. For more details, please refer to [README.md](https://github.com/feathr-ai/feathr#%EF%B8%8F-running-feathr-on-cloud-with-a-few-simple-steps).\n", + "\n", + "Additionally, to run this notebook, you'll need to install the `feathr` pip package. For local spark, simply run `pip install feathr` on the machine that runs this notebook. To use Databricks or Azure Synapse Analytics, please see dependency management documents:\n", + "- [Azure Databricks dependency management](https://learn.microsoft.com/en-us/azure/databricks/libraries/)\n", + "- [Azure Synapse Analytics dependency management](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook Steps\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install Feathr and necessary dependencies\n", + "2. Create shareable features with Feathr feature definition configs\n", + "3. Create training data using point-in-time correct feature join\n", + "4. Train a prediction model and evaluate the model and features\n", + "5. Register the features to share across teams\n", + "6. Materialize feature values for online scoring\n", + "\n", + "The overall data flow is as follows:\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install Feathr and Necessary Dependencies\n", + "\n", + "Install feathr and necessary packages by running `pip install feathr[notebook]` if you haven't installed them already."
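As a quick sanity check before running the rest of the cells, a minimal sketch (not part of the original notebook, assuming only `pip` and the current Python interpreter) to install the `feathr[notebook]` extras when the package is not importable yet:

```python
import subprocess
import sys

try:
    import feathr  # noqa: F401  # Already installed; nothing to do.
except ImportError:
    # Install the notebook extras into the environment of the current interpreter.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "feathr[notebook]"])
```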
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "import glob\n", + "import json\n", + "from math import sqrt\n", + "import os\n", + "from pathlib import Path\n", + "import requests\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "from azure.identity import AzureCliCredential, DefaultAzureCredential \n", + "from azure.keyvault.secrets import SecretClient\n", + "import pandas as pd\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import RegressionEvaluator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.regression import GBTRegressor\n", + "from pyspark.sql import DataFrame, SparkSession\n", + "import pyspark.sql.functions as F\n", + "\n", + "import feathr\n", + "from feathr import (\n", + " FeathrClient,\n", + " # Feature data types\n", + " BOOLEAN, FLOAT, INT32, ValueType,\n", + " # Feature data sources\n", + " INPUT_CONTEXT, HdfsSource,\n", + " # Feature aggregations\n", + " TypedKey, WindowAggTransformation,\n", + " # Feature types and anchor\n", + " DerivedFeature, Feature, FeatureAnchor,\n", + " # Materialization\n", + " BackfillTime, MaterializationSettings, RedisSink,\n", + " # Offline feature computation\n", + " FeatureQuery, ObservationSettings,\n", + ")\n", + "from feathr.datasets import nyc_taxi\n", + "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", + "from feathr.utils.config import generate_config\n", + "from feathr.utils.job_utils import get_result_df\n", + "from feathr.utils.platform import is_databricks, is_jupyter\n", + "\n", + "print(f\"Feathr version: {feathr.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", + "\n", + "First, we define all the necessary resource key values for authentication. These values are retrieved by using [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) cloud key value store. For authentication, we use Azure CLI credential in this notebook, but you may add secrets' list and get permission for the necessary service principal instead of running `az login --use-device-code`.\n", + "\n", + "Please refer to [A note on using azure key vault to store credentials](https://github.com/feathr-ai/feathr/blob/41e7496b38c43af6d7f8f1de842f657b27840f6d/docs/how-to-guides/feathr-configuration-and-env.md#a-note-on-using-azure-key-vault-to-store-credentials) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "RESOURCE_PREFIX = None # TODO fill the value\n", + "PROJECT_NAME = \"feathr_getting_started\"\n", + "\n", + "# Data store root path. 
Could be a local file system path or Azure storage path like abfs or wasbs\n", + "DATA_STORE_PATH = TemporaryDirectory().name\n", + "\n", + "# Currently supported: 'azure_synapse', 'databricks', and 'local' \n", + "SPARK_CLUSTER = \"local\"\n", + "# TODO -- Synapse spark pool name or Databricks cluster id\n", + "CLUSTER_NAME = None\n", + "\n", + "# If set True, use an interactive browser authentication\n", + "USE_CLI_AUTH = False\n", + "\n", + "# (For the notebook test pipeline) If true, use ScrapBook package to collect the results.\n", + "SCRAP_RESULTS = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KEY_VAULT = f\"{RESOURCE_PREFIX}kv\"\n", + "KEY_VAULT_URI = f\"https://{KEY_VAULT}.vault.azure.net\"\n", + "\n", + "ADLS_PATH = f\"abfss://{RESOURCE_PREFIX}fs@{RESOURCE_PREFIX}dls.dfs.core.windows.net/feathr_project\"\n", + "\n", + "if SPARK_CLUSTER == \"azure_synapse\":\n", + " os.environ['spark_config__azure_synapse__dev_url'] = f\"https://{RESOURCE_PREFIX}syws.dev.azuresynapse.net\"\n", + " os.environ['spark_config__azure_synapse__pool_name'] = CLUSTER_NAME\n", + " os.environ['spark_config__azure_synapse__workspace_dir'] = f\"abfss://{RESOURCE_PREFIX}fs@{RESOURCE_PREFIX}dls.dfs.core.windows.net/{PROJECT_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if USE_CLI_AUTH:\n", + " !az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", + "secret_client = SecretClient(vault_url=KEY_VAULT_URI, credential=credential)\n", + "retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Redis credential\n", + "os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n", + "\n", + "if SPARK_CLUSTER == \"local\":\n", + " os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", + "\n", + "elif SPARK_CLUSTER == \"databricks\" and is_databricks():\n", + " ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", + " databricks_config = {\n", + " 'run_name': \"FEATHR_FILL_IN\",\n", + " 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n", + " 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n", + " 'spark_jar_task': {\n", + " 'main_class_name': \"FEATHR_FILL_IN\",\n", + " 'parameters': [\"FEATHR_FILL_IN\"],\n", + " },\n", + " }\n", + " os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n", + " os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n", + " os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n", + " os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Configurations\n", + "\n", + "Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n", + "\n", + "with open(config_path, 'r') as f: \n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initialize Feathr client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Prepare the NYC taxi fare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To run on a local spark, start a spark session:\n", + "if SPARK_CLUSTER == \"local\":\n", + " spark = (\n", + " SparkSession\n", + " .builder\n", + " .appName(\"feathr\")\n", + " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0\")\n", + " .config(\"spark.ui.port\", \"8080\") # Set ui port other than the default one (4040) so that feathr spark job doesn't fail. \n", + " .getOrCreate()\n", + " )\n", + " \n", + "# Else, you must already have spark session object available in databricks or synapse." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", + "\n", + "# Download the data file\n", + "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", + "df_raw.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Defining features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. 
For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", + "\n", + "* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n", + "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", + "\n", + "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n", + "\n", + "There are two types of features -- anchored features and derivated features:\n", + "\n", + "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", + "* **Derived features**: Features that are computed on top of other features.\n", + "\n", + "#### Define anchored features\n", + "\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", + "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We define f_trip_distance and f_trip_time_duration features separately\n", + "# so that we can reuse them later for the derived features.\n", + "f_trip_distance = Feature(\n", + " name=\"f_trip_distance\",\n", + " feature_type=FLOAT,\n", + " transform=\"trip_distance\",\n", + ")\n", + "f_trip_time_duration = Feature(\n", + " name=\"f_trip_time_duration\",\n", + " feature_type=FLOAT,\n", + " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", + ")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(\n", + " name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"trip_distance > 30.0\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_month\",\n", + " feature_type=INT32,\n", + " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_hour_of_day\",\n", + " feature_type=INT32,\n", + " transform=\"hour(lpep_dropoff_datetime)\",\n", + " ),\n", + "]\n", + "\n", + "# After you have defined features, bring them together to build the anchor to the source.\n", + "feature_anchor = FeatureAnchor(\n", + " name=\"feature_anchor\",\n", + " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", + " features=features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can define the source with a preprocessing python function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocessing(df: DataFrame) -> DataFrame:\n", + " import pyspark.sql.functions as F\n", + " df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n", + " return df\n", + "\n", + "batch_source = HdfsSource(\n", + " name=\"nycTaxiBatchSource\",\n", + " path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " preprocessing=preprocessing,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the features with aggregation, the supported functions are as follows:\n", + "\n", + "| Aggregation Function | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agg_key = TypedKey(\n", + " key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\",\n", + ")\n", + "\n", + "agg_window = \"90d\"\n", + "\n", + "# Anchored features with aggregations\n", + "agg_features = [\n", + " Feature(\n", + " name=\"f_location_avg_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"AVG\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + " Feature(\n", + " name=\"f_location_max_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"MAX\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + "]\n", + "\n", + "agg_feature_anchor = FeatureAnchor(\n", + " name=\"agg_feature_anchor\",\n", + " source=batch_source, # External data source for feature. 
Typically a data table.\n", + " features=agg_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Define derived features\n", + "\n", + "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "derived_features = [\n", + " DerivedFeature(\n", + " name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " ],\n", + " transform=\"f_trip_distance / f_trip_time_duration\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build features\n", + "\n", + "Finally, we build the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(\n", + " anchor_list=[feature_anchor, agg_feature_anchor],\n", + " derived_feature_list=derived_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", + "\n", + "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. \n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_FORMAT = \"parquet\"\n", + "offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" + }, + "scrolled": false + }, + "outputs": [], + "source": [ + "# Features that we want to request. 
Can use a subset of features\n", + "query = FeatureQuery(\n", + " feature_list=feature_names,\n", + " key=agg_key,\n", + ")\n", + "settings = ObservationSettings(\n", + " observation_path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")\n", + "client.get_offline_features(\n", + " observation_settings=settings,\n", + " feature_query=query,\n", + " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", + " execution_configurations=SparkExecutionConfiguration({\n", + " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", + " }),\n", + " output_path=offline_features_path,\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show feature results\n", + "df = get_result_df(\n", + " spark=spark,\n", + " client=client,\n", + " data_format=DATA_FORMAT,\n", + " res_url=offline_features_path,\n", + ")\n", + "df.select(feature_names).limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 4. Train a Prediction Model and Evaluate the Features\n", + "\n", + "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", + "\n", + "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Train and Test Data from the Offline Feature Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train / test split\n", + "train_df, test_df = (\n", + " df # Dataframe that we generated from get_offline_features call.\n", + " .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n", + " .where(F.col(\"f_trip_time_duration\") > 0)\n", + " .fillna(0)\n", + " .randomSplit([0.8, 0.2])\n", + ")\n", + "\n", + "print(f\"Num train samples: {train_df.count()}\")\n", + "print(f\"Num test samples: {test_df.count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build a ML Pipeline\n", + "\n", + "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate a feature vector column for SparkML\n", + "vector_assembler = VectorAssembler(\n", + " inputCols=[x for x in df.columns if x in feature_names],\n", + " outputCol=\"features\",\n", + ")\n", + "\n", + "# Define a model\n", + "gbt = GBTRegressor(\n", + " featuresCol=\"features\",\n", + " maxIter=100,\n", + " maxDepth=5,\n", + " maxBins=16,\n", + ")\n", + "\n", + "# Create a ML pipeline\n", + "ml_pipeline = Pipeline(stages=[\n", + " vector_assembler,\n", + " gbt,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and Evaluate the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train a model\n", + "model = ml_pipeline.fit(train_df)\n", + "\n", + "# Make predictions\n", + "predictions = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate\n", + "evaluator = RegressionEvaluator(\n", + " labelCol=\"label\",\n", + " predictionCol=\"prediction\",\n", + ")\n", + "\n", + "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", + "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", + "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", + "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", + "\n", + "predictions_pdf.plot(\n", + " x=\"index\",\n", + " y=[\"label\", \"prediction\"],\n", + " style=['-', ':'],\n", + " figsize=(20, 10),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictions_pdf.plot.scatter(\n", + " x=\"label\",\n", + " y=\"prediction\",\n", + " xlim=(0, 100),\n", + " ylim=(0, 100),\n", + " figsize=(10, 10),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Register the Features to Share Across Teams\n", + "\n", + "You can register your features in the centralized registry and share the corresponding project with other team members who want to consume those features and for further use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " client.register_features()\n", + "except KeyError:\n", + " # TODO temporarily go around the \"Already exists\" error\n", + " \n", + " client.list_registered_features(project_name=PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 6. Materialize Feature Values for Online Scoring\n", + "\n", + "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", + "\n", + "Note, only the features anchored to offline data source can be materialized." 
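As a side note, materialization is not limited to the Redis online store used below; the same settings can also target an offline store. A minimal sketch, assuming `HdfsSink` is exported by the `feathr` package (the notebook hints at it with the commented-out `adls_sink`) and using a hypothetical ADLS destination path:

```python
from feathr import HdfsSink, MaterializationSettings

# Hypothetical offline destination -- replace with your own storage location.
OFFLINE_SINK_PATH = "abfss://container@account.dfs.core.windows.net/materialized_features"

offline_settings = MaterializationSettings(
    name="nycTaxiDemoFeatureOffline.job",
    sinks=[HdfsSink(output_path=OFFLINE_SINK_PATH)],
    feature_names=["f_location_avg_fare", "f_location_max_fare"],
)
# client.materialize_features(settings=offline_settings, ...) would then write the
# feature values to the offline path instead of Redis; a BackfillTime can be added
# exactly as in the Redis example below.
```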
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get registered features\n", + "registered_features_dict = client.get_features_from_registry(PROJECT_NAME)\n", + "\n", + "observation_feature_names = []\n", + "materialized_feature_names = []\n", + "\n", + "for feature_name, feature in registered_features_dict.items():\n", + " if feature.key[0].key_column == \"NOT_NEEDED\":\n", + " observation_feature_names.append(feature_name)\n", + " else:\n", + " materialized_feature_names.append(feature_name)\n", + " \n", + "print(f\"Features that will be extracted directly from the observation: {observation_feature_names}\")\n", + "print(\"\")\n", + "print(f\"Features that will be extracted from the source data and materialized to online storage: {materialized_feature_names}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the last date from the dataset\n", + "backfill_timestamp = (\n", + " df_raw\n", + " .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n", + " .agg({TIMESTAMP_COL: \"max\"})\n", + " .collect()[0][0]\n", + ")\n", + "backfill_timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", + "showTitle": false, + "title": "" + }, + "scrolled": false + }, + "outputs": [], + "source": [ + "FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", + "\n", + "# Time range to materialize\n", + "backfill_time = BackfillTime(\n", + " start=backfill_timestamp,\n", + " end=backfill_timestamp,\n", + " step=timedelta(days=1),\n", + ")\n", + "\n", + "# Destinations:\n", + "# For online store,\n", + "redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", + "\n", + "# For offline store,\n", + "# adls_sink = HdfsSink(output_path=)\n", + "\n", + "settings = MaterializationSettings(\n", + " name=FEATURE_TABLE_NAME + \".job\", # job name\n", + " backfill_time=backfill_time,\n", + " sinks=[redis_sink], # or adls_sink\n", + " feature_names=materialized_feature_names,\n", + ")\n", + "\n", + "client.materialize_features(\n", + " settings=settings,\n", + " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, you can retrieve features for online scoring as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Note, to get a single key, you may use client.get_online_features instead\n", + "materialized_feature_values = client.multi_get_online_features(\n", + " feature_table=FEATURE_TABLE_NAME,\n", + " keys=[\"239\", \"265\"],\n", + " feature_names=materialized_feature_names,\n", + ")\n", + "materialized_feature_values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Unregister or any other cleanups." 
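One concrete cleanup step that could go in the TODO cell above (a sketch, not part of the original notebook) is removing the locally cached dataset and feature output written under `DATA_STORE_PATH`:

```python
import shutil

# DATA_STORE_PATH was created from TemporaryDirectory().name earlier in the notebook,
# so it only holds the downloaded sample data and the offline feature output.
shutil.rmtree(DATA_STORE_PATH, ignore_errors=True)
```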
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the spark session if it is a local session.\n", + "if is_jupyter():\n", + " spark.stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scrap Variables for Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SCRAP_RESULTS:\n", + " # Record results for test pipelines\n", + " import scrapbook as sb\n", + " sb.glue(\"materialized_feature_values\", materialized_feature_values)\n", + " sb.glue(\"rmse\", rmse)\n", + " sb.glue(\"mae\", mae)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "nyc_driver_demo", + "notebookOrigID": 930353059183053, + "widgets": {} + }, + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3.10.4 ('feathr')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index b14bf868e..dd39a70fa 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -486,7 +486,7 @@ def _get_offline_features_with_config(self, job_tags = {OUTPUT_PATH_TAG:feature_join_job_params.job_output_path} # set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations: - job_tags[OUTPUT_FORMAT]= execution_configurations[OUTPUT_FORMAT] + job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT] ''' - Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself. - Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. 
They are usually not spark specific (for example if we want to specify the location of the feature files, or want to diff --git a/feathr_project/feathr/datasets/__init__.py b/feathr_project/feathr/datasets/__init__.py new file mode 100644 index 000000000..a1e2e5bf3 --- /dev/null +++ b/feathr_project/feathr/datasets/__init__.py @@ -0,0 +1,9 @@ +"""Utilities for downloading sample datasets""" + +from feathr.datasets.constants import ( + NYC_TAXI_SMALL_URL +) + +__all__ = [ + "NYC_TAXI_SMALL_URL", +] diff --git a/feathr_project/feathr/datasets/constants.py b/feathr_project/feathr/datasets/constants.py new file mode 100644 index 000000000..849865570 --- /dev/null +++ b/feathr_project/feathr/datasets/constants.py @@ -0,0 +1,3 @@ +NYC_TAXI_SMALL_URL = ( + "https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv" +) diff --git a/feathr_project/feathr/datasets/nyc_taxi.py b/feathr_project/feathr/datasets/nyc_taxi.py new file mode 100644 index 000000000..ec605aae6 --- /dev/null +++ b/feathr_project/feathr/datasets/nyc_taxi.py @@ -0,0 +1,87 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from threading import local +from urllib.parse import urlparse + +import pandas as pd +from pyspark.sql import DataFrame, SparkSession + +from feathr.datasets import NYC_TAXI_SMALL_URL +from feathr.datasets.utils import maybe_download +from feathr.utils.platform import is_databricks + + +def get_pandas_df( + local_cache_path: str = None, +) -> pd.DataFrame: + """Get NYC taxi fare prediction data samples as a pandas DataFrame. + + Refs: + https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + + Args: + local_cache_path (optional): Local cache file path to download the data set. + If local_cache_path is a directory, the source file name will be added. + + Returns: + pandas DataFrame + """ + # if local_cache_path params is not provided then create a temporary folder + if local_cache_path is None: + local_cache_path = TemporaryDirectory().name + + # If local_cache_path is a directory, add the source file name. + src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) + dst_path = Path(local_cache_path) + if dst_path.suffix != src_filepath.suffix: + local_cache_path = str(dst_path.joinpath(src_filepath.name)) + + maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=local_cache_path) + + pdf = pd.read_csv(local_cache_path) + + return pdf + + +def get_spark_df( + spark: SparkSession, + local_cache_path: str, +) -> DataFrame: + """Get NYC taxi fare prediction data samples as a spark DataFrame. + + Refs: + https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + + Args: + spark: Spark session. + local_cache_path: Local cache file path to download the data set. + If local_cache_path is a directory, the source file name will be added. + + Returns: + Spark DataFrame + """ + # In spark, local_cache_path should be a persist directory or file path + if local_cache_path is None: + raise ValueError("In spark, `local_cache_path` should be a persist directory or file path.") + + # If local_cache_path is a directory, add the source file name. 
+ src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) + dst_path = Path(local_cache_path) + if dst_path.suffix != src_filepath.suffix: + local_cache_path = str(dst_path.joinpath(src_filepath.name)) + + if is_databricks(): + # Databricks uses "dbfs:/" prefix for spark paths + if not local_cache_path.startswith("dbfs:"): + local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + # Databricks uses "/dbfs/" prefix for python paths + python_local_cache_path = local_cache_path.replace("dbfs:", "/dbfs") + # TODO add "if is_synapse()" + else: + python_local_cache_path = local_cache_path + + maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=python_local_cache_path) + + df = spark.read.option("header", True).csv(local_cache_path) + + return df diff --git a/feathr_project/feathr/datasets/utils.py b/feathr_project/feathr/datasets/utils.py new file mode 100644 index 000000000..5dcfb6e87 --- /dev/null +++ b/feathr_project/feathr/datasets/utils.py @@ -0,0 +1,64 @@ +"""Dataset utilities +""" +import logging +import math +from pathlib import Path +import requests +from urllib.parse import urlparse + +from tqdm import tqdm + + +log = logging.getLogger(__name__) + + +def maybe_download(src_url: str, dst_filepath: str, expected_bytes=None) -> bool: + """Check if file exists. If not, download and return True. Else, return False. + + Refs: + https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/download_utils.py + + Args: + src_url: Source file URL. + dst_filepath: Destination file path. + expected_bytes (optional): Expected bytes of the file to verify. + + Returns: + bool: Whether the file was downloaded or not + """ + dst_filepath = Path(dst_filepath) + + if dst_filepath.is_file(): + log.info(f"File {str(dst_filepath)} already exists") + return False + + # Check dir if exists. If not, create one + dst_filepath.parent.mkdir(parents=True, exist_ok=True) + + response = requests.get(src_url, stream=True) + if response.status_code == 200: + log.info(f"Downloading {src_url}") + total_size = int(response.headers.get("content-length", 0)) + block_size = 1024 + num_iterables = math.ceil(total_size / block_size) + with open(str(dst_filepath.resolve()), "wb") as file: + for data in tqdm( + response.iter_content(block_size), + total=num_iterables, + unit="KB", + unit_scale=True, + ): + file.write(data) + + # Verify the file size + if expected_bytes is not None and expected_bytes != dst_filepath.stat().st_size: + # Delete the file since the size is not the same as the expected one. + dst_filepath.unlink() + raise IOError(f"Failed to verify {str(dst_filepath)}. 
Maybe interrupted while downloading?") + else: + return True + + else: + response.raise_for_status() + # If not HTTPError yet still cannot download + raise Exception(f"Problem downloading {src_url}") diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py index cfff0180e..cc587e999 100644 --- a/feathr_project/feathr/spark_provider/_databricks_submission.py +++ b/feathr_project/feathr/spark_provider/_databricks_submission.py @@ -1,67 +1,65 @@ -from ast import Raise +from collections import namedtuple import copy import json import os -import time -from collections import namedtuple from os.path import basename from pathlib import Path -from typing import Any, Dict, List, Optional, Union +import time +from typing import Dict, List, Optional, Union from urllib.parse import urlparse from urllib.request import urlopen -import requests from databricks_cli.dbfs.api import DbfsApi from databricks_cli.runs.api import RunsApi from databricks_cli.sdk.api_client import ApiClient -from feathr.constants import * -from feathr.spark_provider._abc import SparkJobLauncher from loguru import logger +import requests from requests.structures import CaseInsensitiveDict +from feathr.constants import * +from feathr.spark_provider._abc import SparkJobLauncher + class _FeathrDatabricksJobLauncher(SparkJobLauncher): """Class to interact with Databricks Spark cluster - This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. - For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. - This runner will only fill in necessary arguments in the JSON template. - - This class will read from the provided configs string, and do the following steps. - This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: - 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details - 2. Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) - 3. Only supports `new_cluster` type for now - 4. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field - 5. will override the name of this job + This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. + For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. + This runner will only fill in necessary arguments in the JSON template. + + This class will read from the provided configs string, and do the following steps. + This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: + 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details + 2. 
Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) + 3. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field + 4. will override the name of this job + + Args: + workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url + token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication + config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. + databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. + """ - Args: - workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url - token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. - databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. - """ def __init__( - self, - workspace_instance_url: str, - token_value: str, - config_template: Union[str,Dict], - databricks_work_dir: str = 'dbfs:/feathr_jobs', + self, + workspace_instance_url: str, + token_value: str, + config_template: Union[str, Dict], + databricks_work_dir: str = "dbfs:/feathr_jobs", ): - - # Below we will use Databricks job APIs (as well as many other APIs) to submit jobs or transfer files # For Job APIs, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs self.config_template = config_template # remove possible trailing '/' due to wrong input format - self.workspace_instance_url = workspace_instance_url.rstrip('/') + self.workspace_instance_url = workspace_instance_url.rstrip("/") self.auth_headers = CaseInsensitiveDict() # Authenticate the REST APIs. 
Documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - self.auth_headers['Accept'] = 'application/json' - self.auth_headers['Authorization'] = f'Bearer {token_value}' + self.auth_headers["Accept"] = "application/json" + self.auth_headers["Authorization"] = f"Bearer {token_value}" self.databricks_work_dir = databricks_work_dir - self.api_client = ApiClient(host=self.workspace_instance_url,token=token_value) + self.api_client = ApiClient(host=self.workspace_instance_url, token=token_value) def upload_or_get_cloud_path(self, local_path_or_http_path: str): """ @@ -77,7 +75,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): with urlopen(local_path_or_http_path) as f: # use REST API to avoid local temp file data = f.read() - files = {'file': data} + files = {"file": data} # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs r = requests.post(url=self.workspace_instance_url+'/api/2.0/dbfs/put', headers=self.auth_headers, files=files, data={'overwrite': 'true', 'path': cloud_dest_path}) @@ -90,8 +88,12 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): cloud_dest_path = local_path_or_http_path elif src_parse_result.scheme.startswith(('wasb','s3','gs')): # if the path starts with a location that's not a local path - logger.error("File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path) - raise RuntimeError(f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually.") + logger.error( + "File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path + ) + raise RuntimeError( + f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually." + ) else: # else it should be a local file path or dir if os.path.isdir(local_path_or_http_path): @@ -122,7 +124,18 @@ def _upload_local_file_to_workspace(self, local_path: str) -> str: raise RuntimeError(f"The source path: {local_path}, or the destination path: {cloud_dest_path}, is/are not valid.") from e return cloud_dest_path - def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}): + def submit_feathr_job( + self, + job_name: str, + main_jar_path: str, + main_class_name: str, + arguments: List[str], + python_files: List[str], + reference_files_path: List[str] = [], + job_tags: Dict[str, str] = None, + configuration: Dict[str, str] = {}, + properties: Dict[str, str] = {}, + ): """ submit the feathr job to databricks Refer to the databricks doc for more details on the meaning of the parameters: @@ -146,72 +159,93 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: # otherwise users might have missed the quotes in the config. 
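A behavior sketch for `upload_or_get_cloud_path`, using the launcher instance from the previous sketch; the paths are illustrative only.
# dbfs paths are returned as-is.
launcher.upload_or_get_cloud_path("dbfs:/feathr_jobs/feathr_pyspark_driver.py")
# http(s) sources are read with urlopen and uploaded into the dbfs work dir via the DBFS put API.
launcher.upload_or_get_cloud_path("https://example.com/feathr-assembly.jar")
# Local files (or every file in a local directory) are copied into dbfs via DbfsApi.
launcher.upload_or_get_cloud_path("./udfs/my_preprocessing.py")
# wasb/s3/gs paths raise RuntimeError and must be uploaded to dbfs manually.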
Treat them as dict # Note that we need to use deep copy here, in order to make `self.config_template` immutable # Otherwise, since we need to change submission_params later, which will modify `self.config_template` and cause unexpected behaviors - submission_params = copy.deepcopy(self.config_template) - - submission_params['run_name'] = job_name - if 'existing_cluster_id' not in submission_params: + submission_params = copy.deepcopy(self.config_template) + + submission_params["run_name"] = job_name + cfg = configuration.copy() + if "existing_cluster_id" in submission_params: + logger.info("Using an existing general purpose cluster to run the feathr job...") + if cfg: + logger.warning( + "Spark execution configuration will be ignored. To use job-specific spark configs, please use a new job cluster or set the configs via Databricks UI." + ) + if job_tags: + logger.warning( + "Job tags will be ignored. To assign job tags to the cluster, please use a new job cluster." + ) + elif "new_cluster" in submission_params: + logger.info("Using a new job cluster to run the feathr job...") # if users don't specify existing_cluster_id # Solving this issue: Handshake fails trying to connect from Azure Databricks to Azure PostgreSQL with SSL # https://docs.microsoft.com/en-us/answers/questions/170730/handshake-fails-trying-to-connect-from-azure-datab.html - configuration['spark.executor.extraJavaOptions'] = '-Djava.security.properties=' - configuration['spark.driver.extraJavaOptions'] = '-Djava.security.properties=' - submission_params['new_cluster']['spark_conf'] = configuration + cfg["spark.executor.extraJavaOptions"] = "-Djava.security.properties=" + cfg["spark.driver.extraJavaOptions"] = "-Djava.security.properties=" + submission_params["new_cluster"]["spark_conf"] = cfg if job_tags: - custom_tags = submission_params['new_cluster'].get('custom_tags', {}) + custom_tags = submission_params["new_cluster"].get("custom_tags", {}) for tag, value in job_tags.items(): custom_tags[tag] = value - submission_params['new_cluster']['custom_tags'] = custom_tags + submission_params["new_cluster"]["custom_tags"] = custom_tags + else: + # TODO we should fail fast -- maybe check this in config verification while initializing the client. + raise ValueError( + "No cluster specifications are found. Either 'existing_cluster_id' or 'new_cluster' should be configured via feathr config." + ) # the feathr main jar file is anyway needed regardless it's pyspark or scala spark if not main_jar_path: logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") - submission_params['libraries'][0]['maven'] = { "coordinates": FEATHR_MAVEN_ARTIFACT } + submission_params["libraries"][0]["maven"] = {"coordinates": FEATHR_MAVEN_ARTIFACT} else: - submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) + submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(main_jar_path) # see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6 if python_files: # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask # the first file is the pyspark driver code. 
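When `existing_cluster_id` is configured instead of `new_cluster`, the branch above reuses that cluster and warns that per-job spark configs and job tags are dropped. A sketch of that template variant, with a placeholder cluster id:
existing_cluster_template = {
    "run_name": "",
    "existing_cluster_id": "0000-000000-placeholder",  # placeholder cluster id
    "libraries": [{"jar": ""}],
    "spark_jar_task": {"main_class_name": "", "parameters": [""]},
}
# With this template, values passed via `configuration` and `job_tags` are ignored by
# submit_feathr_job; set them on the cluster through the Databricks UI instead.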
we only need the driver code to execute pyspark - param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])} + param_and_file_dict = { + "parameters": arguments, + "python_file": self.upload_or_get_cloud_path(python_files[0]), + } # indicates this is a pyspark job # `setdefault` method will get the value of the "spark_python_task" item, if the "spark_python_task" item does not exist, insert "spark_python_task" with the value "param_and_file_dict": - submission_params.setdefault('spark_python_task',param_and_file_dict) + submission_params.setdefault("spark_python_task", param_and_file_dict) else: # this is a scala spark job - submission_params['spark_jar_task']['parameters'] = arguments - submission_params['spark_jar_task']['main_class_name'] = main_class_name + submission_params["spark_jar_task"]["parameters"] = arguments + submission_params["spark_jar_task"]["main_class_name"] = main_class_name result = RunsApi(self.api_client).submit_run(submission_params) try: # see if we can parse the returned result - self.res_job_id = result['run_id'] + self.res_job_id = result["run_id"] except: - logger.error("Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result) + logger.error( + "Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result + ) exit(1) result = RunsApi(self.api_client).get_run(self.res_job_id) - self.job_url = result['run_page_url'] - logger.info('Feathr job Submitted Successfully. View more details here: {}', self.job_url) + self.job_url = result["run_page_url"] + logger.info("Feathr job Submitted Successfully. View more details here: {}", self.job_url) # return ID as the submission result return self.res_job_id def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: - """ Returns true if the job completed successfully - """ + """Returns true if the job completed successfully""" start_time = time.time() while (timeout_seconds is None) or (time.time() - start_time < timeout_seconds): status = self.get_status() - logger.debug('Current Spark job status: {}', status) + logger.debug("Current Spark job status: {}", status) # see all the status here: # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runlifecyclestate # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runresultstate - if status in {'SUCCESS'}: + if status in {"SUCCESS"}: return True - elif status in {'INTERNAL_ERROR', 'FAILED', 'TIMEDOUT', 'CANCELED'}: + elif status in {"INTERNAL_ERROR", "FAILED", "TIMEDOUT", "CANCELED"}: result = RunsApi(self.api_client).get_run_output(self.res_job_id) # See here for the returned fields: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-8 # print out logs and stack trace if the job has failed @@ -224,14 +258,14 @@ def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: else: time.sleep(30) else: - raise TimeoutError('Timeout waiting for Feathr job to complete') + raise TimeoutError("Timeout waiting for Feathr job to complete") def get_status(self) -> str: assert self.res_job_id is not None result = RunsApi(self.api_client).get_run(self.res_job_id) # first try to get result state. 
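Putting the pieces together, a minimal submit-and-wait sketch using the launcher sketched earlier; the main class and arguments are placeholders, not Feathr's actual job arguments.
run_id = launcher.submit_feathr_job(
    job_name="feathr_feature_join",
    main_jar_path="",          # empty -> falls back to the FEATHR_MAVEN_ARTIFACT Maven coordinate
    main_class_name="com.example.PlaceholderMainClass",  # placeholder main class
    arguments=["--placeholder-arg", "value"],            # placeholder arguments
    python_files=[],           # a non-empty list would turn this into a spark_python_task run
)
if launcher.wait_for_completion(timeout_seconds=3600):
    print("Feathr job succeeded; run page:", launcher.job_url)
else:
    print("Feathr job did not succeed; last state:", launcher.get_status())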
it might not be available, and if that's the case, try to get life_cycle_state # see result structure: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 - res_state = result['state'].get('result_state') or result['state']['life_cycle_state'] + res_state = result["state"].get("result_state") or result["state"]["life_cycle_state"] assert res_state is not None return res_state @@ -245,7 +279,6 @@ def get_job_result_uri(self) -> str: # in case users call this API even when there's no tags available return None if custom_tags is None else custom_tags[OUTPUT_PATH_TAG] - def get_job_tags(self) -> Dict[str, str]: """Get job tags @@ -256,21 +289,23 @@ def get_job_tags(self) -> Dict[str, str]: # For result structure, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 result = RunsApi(self.api_client).get_run(self.res_job_id) - if 'new_cluster' in result['cluster_spec']: - custom_tags = result['cluster_spec']['new_cluster']['custom_tags'] + if "new_cluster" in result["cluster_spec"]: + custom_tags = result["cluster_spec"]["new_cluster"]["custom_tags"] return custom_tags else: # this is not a new cluster; it's an existing cluster. - logger.warning("Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration.") + logger.warning( + "Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration." + ) return None - def download_result(self, result_path: str, local_folder: str): """ Supports downloading files from the result folder. Only support paths starts with `dbfs:/` and only support downloading files in one folder (per Spark's design, everything will be in the result folder in a flat manner) """ - if not result_path.startswith('dbfs'): - raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .') + if not result_path.startswith("dbfs"): + raise RuntimeError( + 'Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with "dbfs:" .' + ) DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder) - diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py new file mode 100644 index 000000000..9a9438567 --- /dev/null +++ b/feathr_project/feathr/utils/config.py @@ -0,0 +1,61 @@ +from tempfile import NamedTemporaryFile + + +FEATHR_CONFIG_TEMPLATE = """ +api_version: 1 + +project_config: + project_name: {project_name} + +feature_registry: + api_endpoint: 'https://{resource_prefix}webapp.azurewebsites.net/api/v1' + +spark_config: + # Currently support: 'azure_synapse', 'databricks', and 'local' + spark_cluster: {spark_cluster} + spark_result_output_parts: '1' + +offline_store: + wasb: + wasb_enabled: true + +online_store: + # You can skip this part if you don't have Redis and skip materialization later in this notebook. + redis: + host: '{resource_prefix}redis.redis.cache.windows.net' + port: 6380 + ssl_enabled: true +""" + + +def generate_config( + resource_prefix: str, + project_name: str, + spark_cluster: str, + output_filepath: str = None, +) -> str: + """Generate a feathr config yaml file + + Args: + resource_prefix: Resource name prefix. + project_name: Project name. + spark_cluster: Spark cluster to use. 
Either 'local', 'databricks', or 'azure_synapse'. + output_filepath: Output filepath. + + Returns: + str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path. + """ + + conf_str = FEATHR_CONFIG_TEMPLATE.format( + resource_prefix=resource_prefix, + project_name=project_name, + spark_cluster=spark_cluster, + ) + + if not output_filepath: + output_filepath = NamedTemporaryFile(mode="w", delete=False).name + + with open(output_filepath, "w") as conf_file: + conf_file.write(conf_str) + + return output_filepath diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 6a6bd63c0..815e26c21 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,77 +1,187 @@ -from feathr.client import FeathrClient -import os import glob -from feathr.constants import OUTPUT_FORMAT +import os +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Union + from loguru import logger import pandas as pd -import tempfile from pandas.errors import EmptyDataError +from pyspark.sql import DataFrame, SparkSession + +from feathr.client import FeathrClient +from feathr.constants import OUTPUT_FORMAT +def get_result_pandas_df( + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, +) -> pd.DataFrame: + """Download the job result dataset from cloud as a Pandas DataFrame. -def get_result_df(client: FeathrClient, format: str = None, res_url: str = None, local_folder: str = None) -> pd.DataFrame: - """Download the job result dataset from cloud as a Pandas dataframe to make it easier for the client to read. + Args: + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to `avro` if not specified. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. + local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + the function will create a temporary directory. - format: format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. Default to `avro` if not specified. - res_url: output URL to download files. Note that this will not block the job so you need to make sure the job is finished and result URL contains actual data. - local_folder: optional parameter to specify the absolute download path. if the user does not provide this, function will create a temporary directory and delete it after reading the dataframe. + Returns: + pandas DataFrame """ - # use a result url if it's provided by the user, otherwise use the one provided by the job + return get_result_df(client, data_format, res_url, local_cache_path) + + +def get_result_spark_df( + spark: SparkSession, + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, +) -> DataFrame: + """Download the job result dataset from cloud as a Spark DataFrame. + + Args: + spark: Spark session + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to `avro` if not specified. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. 
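`generate_config` (defined above) writes the filled-in template to disk and returns the path, which can be passed straight to FeathrClient. A minimal usage sketch with placeholder resource names:
from feathr import FeathrClient
from feathr.utils.config import generate_config

config_path = generate_config(
    resource_prefix="myprefix",      # placeholder Azure resource prefix
    project_name="my_project",
    spark_cluster="local",           # or "databricks" / "azure_synapse"
)
client = FeathrClient(config_path=config_path)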
+ local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + the function will create a temporary directory. + + Returns: + Spark DataFrame + """ + return get_result_df(client, data_format, res_url, local_cache_path, spark=spark) + + +def get_result_df( + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, + spark: SparkSession = None, +) -> Union[DataFrame, pd.DataFrame]: + """Download the job result dataset from cloud as a Spark DataFrame or pandas DataFrame. + + Args: + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to `avro` if not specified. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. + local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + the function will create a temporary directory. + spark (optional): Spark session. If provided, the function returns spark Dataframe. + Otherwise, it returns pd.DataFrame. + + Returns: + Either Spark or pandas DataFrame. + """ + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: - raise RuntimeError("res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI.") + raise RuntimeError( + "res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI." + ) + + if client.spark_runtime == "local": + if local_cache_path is not None: + logger.warning( + "In local spark mode, the result files are expected to be stored at a local storage and thus `local_cache_path` argument will be ignored." + ) + local_cache_path = res_url + elif client.spark_runtime == "databricks": + if res_url.startswith("dbfs:"): + logger.warning( + "Result files are already in DBFS and thus `local_cache_path` will be ignored." + ) + local_cache_path = res_url + else: + # if local_cache_path params is not provided then create a temporary folder + if local_cache_path is None: + # We'll just use the name of a local TemporaryDirectory to cache the data into DBFS. + local_cache_path = TemporaryDirectory().name + + # Databricks uses "dbfs:/" prefix for spark paths + if not local_cache_path.startswith("dbfs:"): + local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + # TODO elif azure_synapse + + if local_cache_path != res_url: + logger.info(f"{res_url} files will be downloaded into {local_cache_path}") + client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) - # use user provided format, if there isn't one, then otherwise use the one provided by the job; + # use user provided format, if there isn't one, then otherwise use the one provided by the job; # if none of them is available, "avro" is the default format. 
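A usage sketch for the download helpers above, assuming `client` points at a finished Feathr job and `spark` is an existing SparkSession:
from feathr.utils.job_utils import get_result_df, get_result_pandas_df

# pandas: downloads the result files (to a temp dir unless local_cache_path is given) and reads them.
pdf = get_result_pandas_df(client, data_format="csv")

# Spark: pass a SparkSession and the same helper returns a Spark DataFrame instead.
sdf = get_result_df(client, data_format="avro", spark=spark)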
- format: str = format or client.get_job_tags().get(OUTPUT_FORMAT, "") - if format is None or format == "": - format = "avro" + data_format: str = data_format or client.get_job_tags().get(OUTPUT_FORMAT, "") + if data_format is None or data_format == "": + data_format = "avro" - # if local_folder params is not provided then create a temporary folder - if local_folder is not None: - local_dir_path = local_folder + result_df = None + + if spark is not None: + result_df = spark.read.format(data_format).load(local_cache_path) else: - tmp_dir = tempfile.TemporaryDirectory() - local_dir_path = tmp_dir.name - - client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_dir_path) - dataframe_list = [] - # by default the result are in avro format - if format.casefold()=="parquet": - files = glob.glob(os.path.join(local_dir_path, '*.parquet')) + result_df = _read_files_to_pandas_df( + dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. + data_format=data_format, + ) + + return result_df + + +def _read_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.DataFrame: + + if data_format == "parquet": from pyarrow.parquet import ParquetDataset + + files = glob.glob(os.path.join(dir_path, "*.parquet")) ds = ParquetDataset(files) - result_df = ds.read().to_pandas() - elif format.casefold()=="delta": + return ds.read().to_pandas() + + elif data_format == "delta": from deltalake import DeltaTable - delta = DeltaTable(local_dir_path) - if not client.spark_runtime == 'azure_synapse': - # don't detect for synapse result with Delta as there's a problem with underlying system - # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 - result_df = delta.to_pyarrow_table().to_pandas() - else: - logger.info("Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse. Empty DataFrame is returned.") - result_df = pd.DataFrame() - elif format.casefold()=="avro": + + delta = DeltaTable(dir_path) + # if client.spark_runtime != "azure_synapse": + # don't detect for synapse result with Delta as there's a problem with underlying system + # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 + return delta.to_pyarrow_table().to_pandas() + # else: + # TODO -- Proper warning messages. Is this applied to all the other formats? + # raise RuntimeError( + # "Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse." 
+ # ) + + elif data_format == "avro": import pandavro as pdx - for file in glob.glob(os.path.join(local_dir_path, '*.avro')): - dataframe_list.append(pdx.read_avro(file)) - result_df = pd.concat(dataframe_list, axis=0) - elif format.casefold()=="csv": - for file in glob.glob(os.path.join(local_dir_path, '*.csv')): + + dataframe_list = [pdx.read_avro(file) for file in glob.glob(os.path.join(dir_path, "*.avro"))] + return pd.concat(dataframe_list, axis=0) + + elif data_format == "csv": + dataframe_list = [] + for file in glob.glob(os.path.join(dir_path, "*.csv")): try: - df = pd.read_csv(file, index_col=None, header=None) + dataframe_list.append(pd.read_csv(file, index_col=None, header=None)) except EmptyDataError: # in case there are empty files - df = pd.DataFrame() - dataframe_list.append(df) - result_df = pd.concat(dataframe_list, axis=0) - # Reset index to avoid duplicated indices - result_df.reset_index(drop=True) - else: - raise RuntimeError(f"{format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result.") + pass + + if dataframe_list: + # Reset index to avoid duplicated indices -- TODO don't we need reset_index when reading avro too? + return pd.concat(dataframe_list, axis=0).reset_index(drop=True) + else: + raise ValueError(f"Empty files in {dir_path}.") - - if local_folder is None: - tmp_dir.cleanup() - return result_df \ No newline at end of file + else: + raise ValueError( + f"{data_format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result." + ) diff --git a/feathr_project/feathr/utils/platform.py b/feathr_project/feathr/utils/platform.py new file mode 100644 index 000000000..8f832f22d --- /dev/null +++ b/feathr_project/feathr/utils/platform.py @@ -0,0 +1,45 @@ +"""Platform utilities. +Refs: https://github.com/microsoft/recommenders/blob/main/recommenders/utils/notebook_utils.py +""" +from pathlib import Path + + +def is_jupyter() -> bool: + """Check if the module is running on Jupyter notebook/console. + Note - there might be better way to check if the code is running on a jupyter notebook or not, + but this hacky way still works. + + Ref: + https://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook + + Returns: + bool: True if the module is running on Jupyter notebook or Jupyter console, False otherwise. + """ + try: + # Pre-loaded module `get_ipython()` tells you whether you are running inside IPython or not. + shell_name = get_ipython().__class__.__name__ + # `ZMQInteractiveShell` tells you if this is an interactive mode (notebook). + if shell_name == "ZMQInteractiveShell": + return True + else: + return False + except NameError: + return False + + +def is_databricks() -> bool: + """Check if the module is running on Databricks. + + Returns: + bool: True if the module is running on Databricks notebook, False otherwise. 
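A sketch of how these platform checks might be used to pick a data path in a notebook; the paths are illustrative assumptions.
from tempfile import TemporaryDirectory

from feathr.utils.platform import is_databricks, is_jupyter

if is_databricks():
    data_store_path = "/dbfs/feathr_demo"      # DBFS fuse mount on the Databricks driver
elif is_jupyter():
    data_store_path = TemporaryDirectory().name
else:
    data_store_path = "./feathr_demo"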
+ """ + try: + if str(Path(".").resolve()) == "/databricks/driver": + return True + else: + return False + except NameError: + return False + + +# TODO maybe add is_synapse() diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv deleted file mode 100644 index ce34f255a..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv +++ /dev/null @@ -1,14 +0,0 @@ -VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge -2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1,43,151,1,1.01,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,0 -22,2021-01-01 11:25:59,2021-01-01 11:34:44,N,1,166,239,1,2.53,10,0.5,0.5,2.81,0,,0.3,16.86,1,1,2.75 -23,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1,41,42,1,1.12,6,0.5,0.5,1,0,,0.3,8.3,1,1,0 -24,2020-12-31 23:57:51,2021-01-01 23:04:56,N,1,168,75,1,1.99,8,0.5,0.5,0,0,,0.3,9.3,2,1,0 -25,2021-01-01 17:16:36,2021-01-01 17:16:40,N,2,265,265,3,.00,-52,0,-0.5,0,0,,-0.3,-52.8,3,1,0 -12,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2,265,265,3,.00,52,0,0.5,0,0,,0.3,52.8,2,1,0 -42,2021-01-01 05:19:14,2021-01-01 00:19:21,N,5,265,265,1,.00,180,0,0,36.06,0,,0.3,216.36,1,2,0 -52,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1,75,75,6,.45,3.5,0.5,0.5,0.96,0,,0.3,5.76,1,1,0 -2,2021-01-01 00:57:46,2021-01-01 00:57:57,N,1,225,225,1,.00,2.5,0.5,0.5,0,0,,0.3,3.8,2,1,0 -32,2021-01-01 00:58:32,2021-01-01 01:32:34,N,1,225,265,1,12.19,38,0.5,0.5,2.75,0,,0.3,42.05,1,1,0 -2,2021-01-01 18:39:57,2021-01-01 18:55:25,N,1,74,60,1,5.48,18,0.5,0.5,0,0,,0.3,19.3,2,1,0 -15,2021-01-01 00:51:27,2021-01-01 00:57:20,N,1,42,41,2,.90,6,0.5,0.5,0,0,,0.3,7.3,1,1,0 -15,2021-01-01 00:29:05,2021-01-01 00:29:07,N,5,42,264,1,9.00E-02,10,0,0,2.06,0,,0.3,12.36,1,2,0 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv deleted file mode 100644 index 476ea06f3..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv +++ /dev/null @@ -1,11 +0,0 @@ -product_id,category,price,quantity,recent_sold,made_in_state,discount -1,1,22,100,0,CA,7.5 -2,2,17,300,1,CA,7.5 -3,1,40,0,2,WA,7.5 -4,1,25,100,3,WA,7.5 -5,1,33,0,2,PA,0 -6,2,19,0,2,CA,7.5 -7,2,22,200,1,WA,7.5 -8,2,59,300,0,PA,8.5 -9,0,80,100,1,WA,8.5 -10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv 
b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv deleted file mode 100644 index 38fe25ceb..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv +++ /dev/null @@ -1,35 +0,0 @@ -user_id,product_id,event_timestamp,product_rating -1,1,2021-04-01,4 -1,2,2021-04-01,4 -1,3,2021-04-01,4 -1,4,2021-04-01,4 -1,5,2021-04-01,4 -2,1,2021-04-01,5 -2,2,2021-04-01,5 -2,3,2021-04-01,5 -2,4,2021-04-01,5 -2,5,2021-04-01,5 -3,1,2021-04-01,5 -3,2,2021-04-01,5 -3,3,2021-04-01,5 -3,4,2021-04-01,5 -3,5,2021-04-01,5 -4,1,2021-04-01,1 -4,2,2021-04-01,1 -4,3,2021-04-01,1 -4,4,2021-04-01,1 -4,5,2021-04-01,1 -5,1,2021-04-01,5 -5,2,2021-04-01,5 -6,1,2021-04-01,2 -7,1,2021-04-01,5 -7,2,2021-04-01,5 -7,3,2021-04-01,5 -8,1,2021-04-01,2 -8,2,2021-04-01,2 -8,3,2021-04-01,2 -9,1,2021-04-01,5 -9,2,2021-04-01,5 -9,3,2021-04-01,5 -9,4,2021-04-01,5 -10,1,2021-04-01,3 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv deleted file mode 100644 index 6c38f51d7..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv +++ /dev/null @@ -1,11 +0,0 @@ -user_id,gender,age,gift_card_balance,number_of_credit_cards,state,tax_rate -1,1,22,100,0,CA,7.5 -2,2,17,300,1,CA,7.5 -3,1,40,0,2,WA,7.5 -4,1,25,100,3,WA,7.5 -5,1,33,0,2,PA,0 -6,2,19,0,2,CA,7.5 -7,2,22,200,1,WA,7.5 -8,2,59,300,0,PA,8.5 -9,0,80,100,1,WA,8.5 -10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv deleted file mode 100644 index 8c8481d1f..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv +++ /dev/null @@ -1,31 +0,0 @@ -user_id,purchase_date,purchase_amount -1,2021-01-01,0.33 -1,2021-03-03,574.35 -1,2021-01-03,796.07 -2,2021-01-04,342.15 -2,2021-03-05,280.46 -2,2021-01-06,664.18 -3,2021-01-07,359.02 -3,2021-01-08,357.12 -3,2021-01-09,845.40 -4,2021-01-10,103.92 -4,2021-02-21,670.12 -4,2021-02-12,698.65 -5,2021-01-13,110.52 -5,2021-01-14,931.72 -5,2021-02-15,388.14 -6,2021-01-16,822.96 -6,2021-01-17,292.39 -6,2021-01-18,524.76 -7,2021-01-19,262.00 -7,2021-03-20,715.94 -7,2021-01-21,345.70 -8,2021-01-22,379.00 -8,2021-01-23,194.96 -8,2021-01-24,862.33 -9,2021-01-25,430.41 -9,2021-01-26,398.72 -9,2021-02-27,158.52 -10,2021-01-28,550.01 -10,2021-03-02,157.88 -10,2021-03-03,528.43 \ No newline at end of file diff --git 
a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb deleted file mode 100644 index 38cec2ca9..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb +++ /dev/null @@ -1,720 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feathr Feature Store on Azure Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. It includes these steps:\n", - "\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", - "First step is to provision required cloud resources if you want to use Feathr. Feathr provides a python based client to interact with cloud resources.\n", - "\n", - "Please follow the steps [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to provision required cloud resources. Due to the complexity of the possible cloud environment, it is almost impossible to create a script that works for all the use cases. Because of this, [azure_resource_provision.sh](https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure_resource_provision.sh) is a full end to end command line to create all the required resources, and you can tailor the script as needed, while [the companion documentation](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) can be used as a complete guide for using that shell script.\n", - "\n", - "\n", - "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Install Feathr \n", - "\n", - "Install Feathr using pip:\n", - "\n", - "`pip install -U feathr pandavro scikit-learn`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resource_prefix = \"feathr_resource_prefix\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install feathr azure-cli pandavro scikit-learn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Login to Azure with a device code (You will see instructions in the output):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! az login --use-device-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all the required credentials from Azure KeyVault" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "key_vault_name=resource_prefix+\"kv\"\n", - "synapse_workspace_url=resource_prefix+\"syws\"\n", - "adls_account=resource_prefix+\"dls\"\n", - "adls_fs_name=resource_prefix+\"fs\"\n", - "purview_name=resource_prefix+\"purview\"\n", - "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", - "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", - "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", - "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", - "retrieved_secret = client.get_secret(secretName).value\n", - "\n", - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", - "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", - "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", - "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", - "\n", - "# Set the resource link\n", - "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", - "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", - "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - 
"os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password\n", - "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", - "\n", - "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - " - 'AZURE_CLIENT_ID'\n", - " - 'AZURE_TENANT_ID'\n", - " - 'AZURE_CLIENT_SECRET'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: 's3.amazonaws.com'\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: 'feathrtestdb'\n", - " jdbc_table: 'feathrtesttable'\n", - " snowflake:\n", - " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", - " user: \"feathrintegration\"\n", - " role: \"ACCOUNTADMIN\"\n", - "spark_config:\n", - " spark_cluster: 'azure_synapse'\n", - " spark_result_output_parts: '1'\n", - " azure_synapse:\n", - " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", - " pool_name: 'spark3'\n", - " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", - " executor_size: 'Small'\n", - " executor_num: 1\n", - " databricks:\n", - " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", - " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", - " work_dir: 'dbfs:/feathr_getting_started'\n", - "online_store:\n", - " redis:\n", - " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", - "\n", - "You should setup the 
environment variables in order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", - "\n", - "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", - "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. 
See the python documentation to get the details on each input column.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession, DataFrame\n", - "def feathr_udf_day_calc(df: DataFrame) -> DataFrame:\n", - " from pyspark.sql.functions import dayofweek, dayofyear, col\n", - " df = df.withColumn(\"fare_amount_cents\", col(\"fare_amount\")*100)\n", - " return df\n", - "\n", - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " preprocessing=feathr_udf_day_calc,\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "f_trip_time_duration = Feature(name=\"f_trip_time_duration\",\n", - " feature_type=INT32,\n", - " transform=\"(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_total_fare_cents\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"SUM\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_time_distance = DerivedFeature(name=\"f_trip_time_distance\",\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " f_trip_distance, f_trip_time_duration],\n", - " transform=\"f_trip_distance * f_trip_time_duration\")\n", - "\n", - "f_trip_time_rounded = DerivedFeature(name=\"f_trip_time_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_time_duration],\n", - " transform=\"f_trip_time_duration % 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_time_distance, f_trip_time_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. 
\n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if client.spark_runtime == 'databricks':\n", - " output_path = 'dbfs:/feathrazure_test.avro'\n", - "else:\n", - " output_path = feathr_output_path\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_time_rounded\", \"f_is_long_trip_distance\", \"f_location_total_fare_cents\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path)\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", - " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", - " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", - " tmp_dir = tempfile.TemporaryDirectory()\n", - " client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", - " dataframe_list = []\n", - " # assuming the result are in avro format\n", - " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", - " dataframe_list.append(pdx.read_avro(file))\n", - " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", - " tmp_dir.cleanup()\n", - " return vertical_concat_df\n", - "\n", - "df_res = get_result_df(client)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, 
y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then get the features from the online store (Redis):\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Registering and Fetching features\n", - "\n", - "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.register_features()\n", - "client.list_registered_features(project_name=\"feathr_getting_started\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.5 ('base')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/feathr_project/setup.py b/feathr_project/setup.py index 69a99351f..0a6b38d49 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -3,6 +3,7 @@ from setuptools import setup, find_packages from pathlib import Path + # Use the README.md from /docs root_path = Path(__file__).resolve().parent.parent long_description = (root_path / "docs/README.md").read_text(encoding="utf8") @@ -15,7 +16,7 @@ sys.exit(-1) VERSION = __version__ # noqa -os.environ["FEATHR_VERSION]"] = VERSION +os.environ["FEATHR_VERSION"] = VERSION extras_require=dict( dev=[ diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py new file mode 100644 index 000000000..778b157d7 --- /dev/null +++ b/feathr_project/test/samples/test_notebooks.py @@ -0,0 +1,56 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +try: + import papermill as pm + import scrapbook as sb +except ImportError: + pass # disable error while collecting tests for non-notebook environments + + +SAMPLES_DIR = ( + Path(__file__) + .parent # .../samples + .parent # .../test + .parent # .../feathr_project + .parent # .../feathr (root of the repo) + .joinpath("docs", "samples") +) +NOTEBOOK_PATHS = { + "nyc_taxi_demo": str(SAMPLES_DIR.joinpath("nyc_taxi_demo.ipynb")), +} + + +@pytest.mark.notebooks +def test__nyc_taxi_demo(tmp_path): + notebook_name = "nyc_taxi_demo" + + output_tmpdir = TemporaryDirectory() + output_notebook_path = str(tmp_path.joinpath(f"{notebook_name}.ipynb")) + + pm.execute_notebook( + input_path=NOTEBOOK_PATHS[notebook_name], + output_path=output_notebook_path, + # kernel_name="python3", + parameters=dict( + RESOURCE_PREFIX="feathrazuretest3", # Use the test resource group + PROJECT_NAME=notebook_name, + DATA_STORE_PATH=output_tmpdir.name, + SPARK_CLUSTER="local", + USE_CLI_AUTH=False, + SCRAP_RESULTS=True, + ), + ) + + # Read results from the Scrapbook and assert expected values + nb = 
sb.read_notebook(output_notebook_path) + outputs = nb.scraps + + assert outputs["materialized_feature_values"].data["239"] == pytest.approx([5707., 1480.], abs=1.) + assert outputs["materialized_feature_values"].data["265"] == pytest.approx([10000., 4160.], abs=1.) + assert outputs["rmse"].data == pytest.approx(5., abs=2.) + assert outputs["mae"].data == pytest.approx(2., abs=1.) + + # clean up + output_tmpdir.cleanup() diff --git a/feathr_project/test/unit/datasets/test_dataset_utils.py b/feathr_project/test/unit/datasets/test_dataset_utils.py new file mode 100644 index 000000000..2aabaa9a1 --- /dev/null +++ b/feathr_project/test/unit/datasets/test_dataset_utils.py @@ -0,0 +1,53 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from urllib.parse import urlparse + +import pytest + +from feathr.datasets.nyc_taxi import NYC_TAXI_SMALL_URL +from feathr.datasets.utils import maybe_download + + +@pytest.mark.parametrize( + # 3924447 is the nyc_taxi sample data's bytes + "expected_bytes", [3924447, None] +) +def test__maybe_download(expected_bytes: int): + """Test maybe_download utility function w/ nyc_taxi data cached at Azure blob.""" + + tmpdir = TemporaryDirectory() + dst_filepath = Path(tmpdir.name, "data.csv") + + # Assert the data is downloaded + assert maybe_download( + src_url=NYC_TAXI_SMALL_URL, + dst_filepath=str(dst_filepath), + expected_bytes=expected_bytes, + ) + + # Assert the downloaded file exists. + assert dst_filepath.is_file() + + # Assert the data is already exists and thus the function does not download + assert not maybe_download( + src_url=NYC_TAXI_SMALL_URL, + dst_filepath=str(dst_filepath), + expected_bytes=expected_bytes, + ) + + tmpdir.cleanup() + + +def test__maybe_download__raise_exception(): + """Test maby_download utility function to raise IOError when the expected bytes mismatches.""" + + tmpdir = TemporaryDirectory() + + with pytest.raises(IOError): + maybe_download( + src_url=NYC_TAXI_SMALL_URL, + dst_filepath=Path(tmpdir.name, "data.csv").resolve(), + expected_bytes=10, + ) + + tmpdir.cleanup() diff --git a/feathr_project/test/unit/datasets/test_datasets.py b/feathr_project/test/unit/datasets/test_datasets.py new file mode 100644 index 000000000..c1ac49a9b --- /dev/null +++ b/feathr_project/test/unit/datasets/test_datasets.py @@ -0,0 +1,106 @@ +from pathlib import Path +from unittest.mock import MagicMock + +from pyspark.sql import SparkSession +import pytest +from pytest_mock import MockerFixture + +from feathr.datasets import nyc_taxi + + +TEST_DATASET_DIR = Path(__file__).parent.parent.parent.joinpath("test_user_workspace") +NYC_TAXI_FILE_PATH = str(TEST_DATASET_DIR.joinpath("green_tripdata_2020-04_with_index.csv").resolve()) + + +@pytest.fixture(scope="module") +def spark() -> SparkSession: + """Generate a spark session for tests.""" + # Set ui port other than the default one (4040) so that feathr spark job may not fail. + spark_session = SparkSession.builder.appName("tests").config("spark.ui.port", "8080").getOrCreate() + yield spark_session + spark_session.stop() + + +@pytest.mark.parametrize( + "local_cache_path", + [ + None, # default temporary directory + NYC_TAXI_FILE_PATH, # full filepath + str(Path(NYC_TAXI_FILE_PATH).parent), # directory + ], +) +def test__nyc_taxi__get_pandas_df( + mocker: MockerFixture, + local_cache_path: str, +): + """Test if nyc_taxi.get_pandas_df returns pd.DataFrame. 
Also check if the proper modules are being called.""" + # Mock maybe_download and TempDirectory + mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") + mocked_tmpdir = MagicMock() + mocked_tmpdir.name = NYC_TAXI_FILE_PATH + mocked_TemporaryDirectory = mocker.patch("feathr.datasets.nyc_taxi.TemporaryDirectory", return_value=mocked_tmpdir) + + pdf = nyc_taxi.get_pandas_df(local_cache_path=local_cache_path) + assert len(pdf) == 35612 + + # Assert mock called + if local_cache_path: + mocked_TemporaryDirectory.assert_not_called() + else: + mocked_TemporaryDirectory.assert_called_once() + + # TODO check this is called w/ file extension added + mocked_maybe_download.assert_called_once_with(src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH) + + +@pytest.mark.parametrize( + "local_cache_path", [ + NYC_TAXI_FILE_PATH, # full filepath + str(Path(NYC_TAXI_FILE_PATH).parent), # directory + ], +) +def test__nyc_taxi__get_spark_df( + spark, + mocker: MockerFixture, + local_cache_path: str, +): + """Test if nyc_taxi.get_spark_df returns spark.sql.DataFrame.""" + # Mock maybe_download + mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") + + df = nyc_taxi.get_spark_df(spark=spark, local_cache_path=local_cache_path) + assert df.count() == 35612 + + mocked_maybe_download.assert_called_once_with( + src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH + ) + + +@pytest.mark.parametrize( + "local_cache_path", [ + NYC_TAXI_FILE_PATH, # full filepath + str(Path(NYC_TAXI_FILE_PATH).parent), # directory + ], +) +def test__nyc_taxi__get_spark_df__with_databricks( + mocker: MockerFixture, + local_cache_path: str, +): + # Mock maybe_download and spark session + mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") + mocked_is_databricks = mocker.patch("feathr.datasets.nyc_taxi.is_databricks", return_value=True) + mocked_spark = MagicMock(spec=SparkSession) + + nyc_taxi.get_spark_df(spark=mocked_spark, local_cache_path=local_cache_path) + + # Assert mock called with databricks paths + mocked_is_databricks.assert_called_once() + + expected_dst_filepath = str(Path("/dbfs", NYC_TAXI_FILE_PATH.lstrip("/"))) + mocked_maybe_download.assert_called_once_with( + src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=expected_dst_filepath + ) + + mocked_spark.read.option.return_value.csv.assert_called_once_with( + str(Path("dbfs:", NYC_TAXI_FILE_PATH.lstrip("/"))) + ) diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py new file mode 100644 index 000000000..502a3a01d --- /dev/null +++ b/feathr_project/test/unit/utils/test_config.py @@ -0,0 +1,31 @@ +from pathlib import Path +from tempfile import NamedTemporaryFile + +import pytest + +from feathr.utils.config import FEATHR_CONFIG_TEMPLATE, generate_config + + +@pytest.mark.parametrize( + "output_filepath", [None, NamedTemporaryFile().name], +) +def test__generate_config(output_filepath: str): + + config = FEATHR_CONFIG_TEMPLATE.format( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster="local", + ) + + config_filepath = generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster="local", + output_filepath=output_filepath, + ) + + if output_filepath: + assert output_filepath == config_filepath + + with open(config_filepath, "r") as f: + assert config == f.read() From 1fab5f2d766f95f4cf7e18c91cfb2b9282d7119a Mon Sep 17 00:00:00 2001 From: Jun Ki Min 
<42475935+loomlike@users.noreply.github.com> Date: Tue, 1 Nov 2022 20:18:55 +0000 Subject: [PATCH 03/18] revert 798 (revert756 - example notebook refactor). Also add job_utils unit tests Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../feathr/definition/feature_derivations.py | 6 +- .../spark_provider/_localspark_submission.py | 26 +++- feathr_project/feathr/utils/job_utils.py | 60 +++++----- feathr_project/pyproject.toml | 6 + feathr_project/test/conftest.py | 43 +++++++ .../_delta_log/00000000000000000000.json | 4 + ...45a6-a2cd-4b9a37427f86-c000.snappy.parquet | Bin 0 -> 6277 bytes ...af2d-d172-48cc-a65e-87a89526f97a-c000.avro | Bin 0 -> 1523 bytes .../mock_results/output.csv | 6 + ...4d58-a6e6-c1050f57ab99-c000.snappy.parquet | Bin 0 -> 6277 bytes ...ad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv | 5 + .../test/unit/datasets/test_datasets.py | 9 -- .../test_localspark_submission.py | 14 +++ .../test/unit/utils/test_job_utils.py | 112 ++++++++++++++++++ 14 files changed, 246 insertions(+), 45 deletions(-) create mode 100644 feathr_project/test/conftest.py create mode 100644 feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json create mode 100644 feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet create mode 100644 feathr_project/test/test_user_workspace/mock_results/output.avro/part-00000-979daf2d-d172-48cc-a65e-87a89526f97a-c000.avro create mode 100644 feathr_project/test/test_user_workspace/mock_results/output.csv create mode 100644 feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet create mode 100644 feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv create mode 100644 feathr_project/test/unit/utils/test_job_utils.py diff --git a/feathr_project/feathr/definition/feature_derivations.py b/feathr_project/feathr/definition/feature_derivations.py index 9205685ce..5717f12b2 100644 --- a/feathr_project/feathr/definition/feature_derivations.py +++ b/feathr_project/feathr/definition/feature_derivations.py @@ -36,9 +36,9 @@ def __init__(self, def validate_feature(self): """Validate the derived feature is valid""" - + input_feature_key_alias = [] - # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. + # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. # Modify the conversion to work with both schema. 
for feature in self.input_features: input_feature_key_alias.extend([x['keyColumnAlias'] for x in feature['attributes']['key']] if isinstance(feature,dict) else feature.key_alias) @@ -58,7 +58,7 @@ def to_feature_config(self) -> str: } {% endfor %} } - definition.sqlExpr: {{derived_feature.transform.to_feature_config(False)}} + definition: {{derived_feature.transform.to_feature_config(False)}} {{derived_feature.feature_type.to_feature_config()}} } """) diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py index afed9683d..50da91c08 100644 --- a/feathr_project/feathr/spark_provider/_localspark_submission.py +++ b/feathr_project/feathr/spark_provider/_localspark_submission.py @@ -1,3 +1,4 @@ +from copy import deepcopy from datetime import datetime import json import os @@ -10,7 +11,7 @@ from loguru import logger from pyspark import * -from feathr.constants import FEATHR_MAVEN_ARTIFACT +from feathr.constants import FEATHR_MAVEN_ARTIFACT, OUTPUT_PATH_TAG from feathr.spark_provider._abc import SparkJobLauncher @@ -40,6 +41,7 @@ def __init__( self.retry_sec = retry_sec self.packages = self._get_default_package() self.master = master or "local[*]" + self.job_tags = None def upload_or_get_cloud_path(self, local_path_or_http_path: str): """For Local Spark Case, no need to upload to cloud workspace.""" @@ -52,6 +54,7 @@ def submit_feathr_job( main_class_name: str, arguments: List[str] = None, python_files: List[str] = None, + job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}, **_, @@ -66,9 +69,10 @@ def submit_feathr_job( main_class_name: name of your main class arguments: all the arguments you want to pass into the spark job python_files: required .zip, .egg, or .py files of spark job + job_tags: tags of the job, for example you might want to put your user ID, or a tag with a certain information configuration: Additional configs for the spark job properties: System properties configuration - **_: Not used arguments in local spark mode, such as reference_files_path and job_tags + **_: Not used arguments in local spark mode, such as reference_files_path """ logger.warning( f"Local Spark Mode only support basic params right now and should be used only for testing purpose." 
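Note: a minimal usage sketch of the `job_tags` parameter introduced in this hunk together with the `get_job_result_uri`/`get_job_tags` accessors added in the next hunk, mirroring the unit test added later in this series. The `launcher` instance, the jar/class names, and the output path are illustrative placeholders, not part of the patch:

    from typing import Dict

    from feathr.constants import OUTPUT_PATH_TAG

    # Assumes `launcher` is an already constructed _FeathrLocalSparkJobLauncher;
    # the jar path and main class below are placeholders for a real Feathr runtime jar.
    job_tags: Dict[str, str] = {OUTPUT_PATH_TAG: "/tmp/feathr_output"}
    launcher.submit_feathr_job(
        job_name="local-demo",
        main_jar_path="path/to/feathr-runtime.jar",
        main_class_name="com.example.MainClass",
        job_tags=job_tags,
    )
    launcher.wait_for_completion(timeout_seconds=500)
    assert launcher.get_job_tags() == job_tags
    # get_job_result_uri() returns the OUTPUT_PATH_TAG value stored at submission time.
    assert launcher.get_job_result_uri() == "/tmp/feathr_output"
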
@@ -125,6 +129,8 @@ def submit_feathr_job( logger.info(f"Local Spark job submit with pid: {proc.pid}.") + self.job_tags = deepcopy(job_tags) + return proc def wait_for_completion(self, timeout_seconds: Optional[float] = 500) -> bool: @@ -198,6 +204,22 @@ def get_status(self) -> str: """Get the status of the job, only a placeholder for local spark""" return self.latest_spark_proc.returncode + def get_job_result_uri(self) -> str: + """Get job output path + + Returns: + str: output_path + """ + return self.job_tags.get(OUTPUT_PATH_TAG, None) if self.job_tags else None + + def get_job_tags(self) -> Dict[str, str]: + """Get job tags + + Returns: + Dict[str, str]: a dict of job tags + """ + return self.job_tags + def _init_args(self, job_name: str, confs: Dict[str, str]) -> List[str]: logger.info(f"Spark job: {job_name} is running on local spark with master: {self.master}.") args = [ diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 815e26c21..c804d4ca9 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,12 +1,10 @@ -import glob -import os +from multiprocessing.sharedctypes import Value from pathlib import Path from tempfile import TemporaryDirectory from typing import Union from loguru import logger import pandas as pd -from pandas.errors import EmptyDataError from pyspark.sql import DataFrame, SparkSession from feathr.client import FeathrClient @@ -87,8 +85,8 @@ def get_result_df( # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: - raise RuntimeError( - "res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI." + raise ValueError( + "`res_url` is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI." ) if client.spark_runtime == "local": @@ -99,9 +97,10 @@ def get_result_df( local_cache_path = res_url elif client.spark_runtime == "databricks": if res_url.startswith("dbfs:"): - logger.warning( - "Result files are already in DBFS and thus `local_cache_path` will be ignored." - ) + if local_cache_path is not None: + logger.warning( + "Result files are already in DBFS and thus `local_cache_path` will be ignored." + ) local_cache_path = res_url else: # if local_cache_path params is not provided then create a temporary folder @@ -112,6 +111,8 @@ def get_result_df( # Databricks uses "dbfs:/" prefix for spark paths if not local_cache_path.startswith("dbfs:"): local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + else: + logger.warning("This utility function currently supports local spark and databricks. You may encounter unexpected results on other platforms.") # TODO elif azure_synapse if local_cache_path != res_url: @@ -127,9 +128,12 @@ def get_result_df( result_df = None if spark is not None: - result_df = spark.read.format(data_format).load(local_cache_path) + if data_format == "csv": + result_df = spark.read.option("header", True).csv(local_cache_path) + else: + result_df = spark.read.format(data_format).load(local_cache_path) else: - result_df = _read_files_to_pandas_df( + result_df = _load_files_to_pandas_df( dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. 
data_format=data_format, ) @@ -137,14 +141,10 @@ def get_result_df( return result_df -def _read_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.DataFrame: +def _load_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.DataFrame: if data_format == "parquet": - from pyarrow.parquet import ParquetDataset - - files = glob.glob(os.path.join(dir_path, "*.parquet")) - ds = ParquetDataset(files) - return ds.read().to_pandas() + return pd.read_parquet(dir_path) elif data_format == "delta": from deltalake import DeltaTable @@ -162,24 +162,22 @@ def _read_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.Dat elif data_format == "avro": import pandavro as pdx - - dataframe_list = [pdx.read_avro(file) for file in glob.glob(os.path.join(dir_path, "*.avro"))] - return pd.concat(dataframe_list, axis=0) + if Path(dir_path).is_file(): + return pdx.read_avro(dir_path) + else: + try: + return pd.concat([pdx.read_avro(f) for f in Path(dir_path).glob("*.avro")]).reset_index(drop=True) + except ValueError: # No object to concat when the dir is empty + return pd.DataFrame() elif data_format == "csv": - dataframe_list = [] - for file in glob.glob(os.path.join(dir_path, "*.csv")): - try: - dataframe_list.append(pd.read_csv(file, index_col=None, header=None)) - except EmptyDataError: - # in case there are empty files - pass - - if dataframe_list: - # Reset index to avoid duplicated indices -- TODO don't we need reset_index when reading avro too? - return pd.concat(dataframe_list, axis=0).reset_index(drop=True) + if Path(dir_path).is_file(): + return pd.read_csv(dir_path) else: - raise ValueError(f"Empty files in {dir_path}.") + try: + return pd.concat([pd.read_csv(f) for f in Path(dir_path).glob("*.csv")]).reset_index(drop=True) + except ValueError: # No object to concat when the dir is empty + return pd.DataFrame() else: raise ValueError( diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index 693233dc2..0162ede04 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -9,6 +9,12 @@ known_first_party = ['feathr'] force_sort_within_sections = true multi_line_output = 3 +[tool.pytest.ini_options] +markers = [ + "notebooks: tests Jupyter notebooks", + "databricks: tests functions on a Databricks cluster", +] + [build-system] requires = [ "setuptools", diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py new file mode 100644 index 000000000..d1ecd081b --- /dev/null +++ b/feathr_project/test/conftest.py @@ -0,0 +1,43 @@ +from pathlib import Path +from pyspark.sql import SparkSession +import pytest + +from feathr import FeathrClient + + +@pytest.fixture(scope="session") +def workspace_dir() -> str: + """Workspace directory path containing data files and configs for testing.""" + return str(Path(__file__).parent.resolve().joinpath("test_user_workspace")) + + +@pytest.fixture(scope="function") +def feathr_client_local(workspace_dir) -> FeathrClient: + """Test function-scoped Feathr client""" + return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config_local.yaml"))) + + +@pytest.fixture(scope="function") +def feathr_client_databricks(workspace_dir) -> FeathrClient: + """Test function-scoped Feathr client""" + return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config.yaml"))) + + +@pytest.fixture(scope="module") +def spark() -> SparkSession: + """Generate a spark session for tests.""" + # Set ui port other than the default one (4040) so that feathr spark job may not fail. 
+ spark_session = ( + SparkSession.builder + .appName("tests") + .config("spark.jars.packages", ",".join([ + "org.apache.spark:spark-avro_2.12:3.3.0", + "io.delta:delta-core_2.12:2.1.1", + ])) + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.ui.port", "8080") + .getOrCreate() + ) + yield spark_session + spark_session.stop() diff --git a/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json b/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..855c52b51 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"a3a34f62-adf4-428f-9595-dc1a0c1055e7","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"trip_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"VendorID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpep_pickup_datetime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpep_dropoff_datetime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"store_and_fwd_flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"RatecodeID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"PULocationID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DOLocationID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"passenger_count\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"trip_distance\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"fare_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"extra\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mta_tax\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tip_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tolls_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ehail_fee\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"improvement_surcharge\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"total_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"payment_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"trip_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"congestion_surcharge\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1667325249843}} +{"add":{"path":"part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet","partitionValues":{},"size":6277,"modificationTime":1667325251596,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"trip_id\":\"0\",\"VendorID\":\"2.0\",\"lpep_pickup_datetime\":\"2020-04-01 00:00:23\",\"lpep_dropoff_datetime\":\"2020-04-01 
00:16:13\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":\"1.0\",\"PULocationID\":\"244\",\"DOLocationID\":\"169\",\"passenger_count\":\"1.0\",\"trip_distance\":\"1.0\",\"fare_amount\":\"12.0\",\"extra\":\"0.5\",\"mta_tax\":\"0.5\",\"tip_amount\":\"0.0\",\"tolls_amount\":\"0.0\",\"improvement_surcharge\":\"0.3\",\"total_amount\":\"10.3\",\"payment_type\":\"1.0\",\"trip_type\":\"1.0\",\"congestion_surcharge\":\"0.0\"},\"maxValues\":{\"trip_id\":\"4\",\"VendorID\":\"2.0\",\"lpep_pickup_datetime\":\"2020-04-01 00:45:06\",\"lpep_dropoff_datetime\":\"2020-04-01 01:04:39\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":\"1.0\",\"PULocationID\":\"75\",\"DOLocationID\":\"41\",\"passenger_count\":\"3.0\",\"trip_distance\":\"6.79\",\"fare_amount\":\"9.0\",\"extra\":\"0.5\",\"mta_tax\":\"0.5\",\"tip_amount\":\"0.0\",\"tolls_amount\":\"0.0\",\"improvement_surcharge\":\"0.3\",\"total_amount\":\"9.3\",\"payment_type\":\"2.0\",\"trip_type\":\"1.0\",\"congestion_surcharge\":\"0.0\"},\"nullCount\":{\"trip_id\":0,\"VendorID\":0,\"lpep_pickup_datetime\":0,\"lpep_dropoff_datetime\":0,\"store_and_fwd_flag\":0,\"RatecodeID\":0,\"PULocationID\":0,\"DOLocationID\":0,\"passenger_count\":0,\"trip_distance\":0,\"fare_amount\":0,\"extra\":0,\"mta_tax\":0,\"tip_amount\":0,\"tolls_amount\":0,\"ehail_fee\":5,\"improvement_surcharge\":0,\"total_amount\":0,\"payment_type\":0,\"trip_type\":0,\"congestion_surcharge\":0}}"}} +{"commitInfo":{"timestamp":1667325251731,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"6277"},"engineInfo":"Apache-Spark/3.2.2 Delta-Lake/2.1.1","txnId":"a5e436e6-dfb6-4956-9e0c-b31b883128a0"}} diff --git a/feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet b/feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1d8214c42adad9d773440876908cee73520563c6 GIT binary patch literal 6277 zcmbuEe`p)m9l+m7{vpfquUPMTT9nv`^>91qxYM0wTbQtwP_j}|6Sh*e4snsrwpLkE zB{^*t0%i+tmO=;pBaC&;AD!_ZsWBzRmuSJ-z$B&-eSj-|zdr_mau4om42HN9kAT#%r&?8;sL1{A{BkLI#Ecgpl9> z2Tz%6olvlm#D|IxjkeHkdVsE;zk4e%K?Agfkai)6GLj68UK*tfuWq~-L1c@G48=_( z2|~}?9G#(8F23{It_d2#(a%29kE7+Jd_>lc$jYEBr!_4tCm(-^^d_~mI*tu(BrT_~ z0dJEkD2h^n{_5>NzTfUp6#lZQC=gr>g2*;UFL3nNKmEfM&N0C=Wm9Oc){!KjQHQFU z$|565T6s(&f|8b*Ark3TQfWnnd4$0YAO7_h7b2DuT6@twC-&n4^xwk$-+$=#pRzyy ziS$wW@{e||w@rX^!F~P5aXMO(g+SBr$LNUVpPTny+U@l8&o>}&G)2?&*2ZwP+6%ebGn|yh6nv4AM$z7E?)}0%#~?q@;2WR* z)a-LGp%IwboqvDydD!9ewkY!%v0U=`*1z6?lyNM5S;7_h1uh}<8HmW* z)%(k=Hdy6mc(M_XaYMZ=1woL z;sE98rysob8@A-F0@N{RD&d0N)*O5>s0q-gDXu;? 
z7#Fxx5IbAgnr!s8DCWe+-LkN_Kszq6hA95@>O?znNw_4NI=PrY;Da(@%i)k+Y@q5G z*rJ%dkb#a~%NOq0csmH}a9I=QL$nAnCb2lb!qLa9=6sMoZZ+3~bd9sxtF5-gN}Juh z)@}=LhV15i*lw3kir-q~pHMr_7Tm)+d%wwqf$c5|hd9<{7k?6Z3}`{?;P`K^9> z%#v7++7e6DY=XP)@y8?8vR-*EQ_ap8vpRhg%E-e3p}khgmoxd?!{o_0c!KqXo;HfP zQswK%oSo4^*(hhq`RsFZi5qNl7^t449!sARIGxnj+!q>Ifrm#^0J zV%BAU=ai1d%{Gr{HD0V$bSI;Iwx(xl`isuKa1GqJcW|UuDio?7FFK4FJzvO787>=W zezsgGoi=8TVl7jht7K>N%Ct)YfHl3~0gROOGi+GxOxZP}gUz_PH<~ShgR1D&J&O2n zEFQZ@#Za6^Vk05a7PHfY2KX>v;EQ5tB1MKc4QeH<9;oQBdI{mL9imbG2Biriu0UYPyuyDE-ioK=tfI09Plm9%vNnkg34~^Zs|sccxO9Pz8K)wI1ok;xinjq zy;(Ec1e{)+KTHMw@xQV z$NvO!C@d~_ATtPB3yV48t~CsLf#c^PH^P`3Y`3+LjU}`&0Kxw_ zMu&j~*8(&VS0q_qK)0@60ddbANvtvKVjJI?grLTRc-Qix472L z$Pf?M(oKWSwuiz0K=NHi-hw+y-C$Q?w%J=f;%d+S**nNyJc!O%={4CHt43{S;52^} zUQeXpY|{wyD}ylfb=G@J;$J`fD$riJ3(^7)= zx%LpoUf}riu!8!mWKtoSfbO(+Z4i>_QXJFrt^S6+u3d+rzR&R&Igk-=9$?Fcm31f_ z9_C#~^WKf4`<9~(W@E#$=e-;i|I-iit`{JjcfIWazQN|rz)Nr)smI)N&$D$bQSnxE z|FPdi9hVM)@%#$Uhs6;8cN&GGvC z>hlE&PG`^088xX=dO?~sYC0UUbYZjeYy$W0L^@HeRpzp_#BgFNZxnLXMEcuj6GeR% zUs;DZc-N547pLKOajsC%PZdB2MjOM4IyjL&d+yw@4XB^^{1d#6hYv_=K1=$??LBz< zXEjc#{u$0At$&JhBlJ&cIxPDKxlY#pF^*%ne+WCX`yXx{@cjcCw}A&>IIa%y8470S;STMS z7Z(mJxzDNPTY`p6MAo8}k$&+m8DFDAnr{Bxj}w|kx#e#ioIymwgQVd6sj+3{QY2Zg zC^AS>Yx{8V4Eum)5fgWpOMz2PdBBr!cff;)NGcO0sZ5tKF%pckIG&7|OD1SQvM7k3 zsFjk!#s6(>Ds>@?Xh~xm_t#iRWWutNgyr=K37q6P&3aS>Asb|}lnHfcD(57ZBn!*s z9Frofe(4T$PYO~>OJ9;elD$$`sbo(eLb5c?Cn3-slB7XQ%Vm=ExnM)ur>lk6QguEr> zTt&$DJRjk19c(nPk6lo?2XsfFsvcI$G0(RR$8NTP-d3ABvKw^`AUIt`6{y!-pm(4K zZ<^XHQlBf%<`I~|p6_cMq=)T|%XbnJOh11;g5FEju^CN)G z+ubP$8Zh(gpw@7Fr!#e7RFrq&o^}eF^M~4r_PS9uT6WuHroB)Fu6fZBnE+{|)vxl(Eu2 literal 0 HcmV?d00001 diff --git a/feathr_project/test/test_user_workspace/mock_results/output.csv b/feathr_project/test/test_user_workspace/mock_results/output.csv new file mode 100644 index 000000000..0468eb1b6 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output.csv @@ -0,0 +1,6 @@ +trip_id,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge +0,2.0,2020-04-01 00:44:02,2020-04-01 00:52:23,N,1.0,42,41,1.0,1.68,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,1.0,1.0,0.0 +1,2.0,2020-04-01 00:24:39,2020-04-01 00:33:06,N,1.0,244,247,2.0,1.94,9.0,0.5,0.5,0.0,0.0,,0.3,10.3,2.0,1.0,0.0 +2,2.0,2020-04-01 00:45:06,2020-04-01 00:51:13,N,1.0,244,243,3.0,1.0,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0 +3,2.0,2020-04-01 00:45:06,2020-04-01 01:04:39,N,1.0,244,243,2.0,2.81,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2.0,1.0,0.0 +4,2.0,2020-04-01 00:00:23,2020-04-01 00:16:13,N,1.0,75,169,1.0,6.79,21.0,0.5,0.5,0.0,0.0,,0.3,22.3,1.0,1.0,0.0 diff --git a/feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet b/feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0e2f9d13fd463db5326ec595f13fdf51ea8a1d26 GIT binary patch literal 6277 zcmbuEe`p)m9l+m7{vpfquUPMTT9nv`^>91qxYM0wTbQtwP_j}|6Sh*e4snsrwpLkE zB{^*t0%i+tmO=;pBaC&;AD!_ZsWBzRmuSJ-z$B&-eSj-|zdr_mau4om42HN9kAT#%r&?8;sL1{A{BkLI#Ecgpl9> z2Tz%6olvlm#D|IxjkeHkdVsE;zk4e%K?Agfkai)6GLj68UK*tfuWq~-L1c@G48=_( z2|~}?9G#(8F23{It_d2#(a%29kE7+Jd_>lc$jYEBr!_4tCm(-^^d_~mI*tu(BrT_~ 
z0dJEkD2h^n{_5>NzTfUp6#lZQC=gr>g2*;UFL3nNKmEfM&N0C=Wm9Oc){!KjQHQFU z$|565T6s(&f|8b*Ark3TQfWnnd4$0YAO7_h7b2DuT6@twC-&n4^xwk$-+$=#pRzyy ziS$wW@{e||w@rX^!F~P5aXMO(g+SBr$LNUVpPTny+U@l8&o>}&G)2?&*2ZwP+6%ebGn|yh6nv4AM$z7E?)}0%#~?q@;2WR* z)a-LGp%IwboqvDydD!9ewkY!%v0U=`*1z6?lyNM5S;7_h1uh}<8HmW* z)%(k=Hdy6mc(M_XaYMZ=1woL z;sE98rysob8@A-F0@N{RD&d0N)*O5>s0q-gDXu;? z7#Fxx5IbAgnr!s8DCWe+-LkN_Kszq6hA95@>O?znNw_4NI=PrY;Da(@%i)k+Y@q5G z*rJ%dkb#a~%NOq0csmH}a9I=QL$nAnCb2lb!qLa9=6sMoZZ+3~bd9sxtF5-gN}Juh z)@}=LhV15i*lw3kir-q~pHMr_7Tm)+d%wwqf$c5|hd9<{7k?6Z3}`{?;P`K^9> z%#v7++7e6DY=XP)@y8?8vR-*EQ_ap8vpRhg%E-e3p}khgmoxd?!{o_0c!KqXo;HfP zQswK%oSo4^*(hhq`RsFZi5qNl7^t449!sARIGxnj+!q>Ifrm#^0J zV%BAU=ai1d%{Gr{HD0V$bSI;Iwx(xl`isuKa1GqJcW|UuDio?7FFK4FJzvO787>=W zezsgGoi=8TVl7jht7K>N%Ct)YfHl3~0gROOGi+GxOxZP}gUz_PH<~ShgR1D&J&O2n zEFQZ@#Za6^Vk4n8Se*4Vp#eV37xzrs|PANtX@L+Ylmo*zd>n&h$~Rr zadb;tOt2G*4X^OugSX=8IIF1a!IL4ZuB=UB!?Du9=P&Y*v*J9Q6Y5yc zlw>O22=JeAAS>SH;$Wv&QEsl(tbTjwzKOb5dHP)P@}lrIMOH4cQ)LoUr$ zWpCEZHUXy>=MPhXzx)uYTy1Sk*JJrGeMSB>$cgiXEndcch2t-AAS2#x!_dHf1yR>D z59E7@+z{vx5jU+=#AJ#%mpwyX=J+KJWWjwsg~Av+ZH{D^>syTPF?J@U7RIe5=1gW1@y?0GLo#sBofyz2!B=Us1mfN!vQGw>2zN9r;6-1BT5OH{lS z-GA(NQOBi2U_8IV^I|D8tRC^;+$rOLFVmvuO2N$kiaoi-{}*l2}TRZ2=pB0RE3 zqWXM6g45aab4E?7lwOc#jhYUJEM3^_Je$D1JCROQYn8cdEis&!${U4THIe@I*+fyF z#aGrL4&F6n^TlcSU7RZv^iu^8g3-osq7F`^&z?IsYy;{iKK}%-=uhe~Nw2 z@Q?7kl=#OvZ!rET&g+hUihVcokMO!k{lB#DoFl>$R{cLRTN$)Aj6p?$e}PF34xO6I z7jlE6qlPMH$K-KU9ZMP6kz`iQPU)$vs+}4&rc^blsL5n@TzdpAZ*ZA^kPlIN37Lfd GfcP(sf;?&f literal 0 HcmV?d00001 diff --git a/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv new file mode 100644 index 000000000..b5b08ca83 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv @@ -0,0 +1,5 @@ +0,2.0,2020-04-01 00:44:02,2020-04-01 00:52:23,N,1.0,42,41,1.0,1.68,8.0,0.5,0.5,0.0,0.0,"",0.3,9.3,1.0,1.0,0.0 +1,2.0,2020-04-01 00:24:39,2020-04-01 00:33:06,N,1.0,244,247,2.0,1.94,9.0,0.5,0.5,0.0,0.0,"",0.3,10.3,2.0,1.0,0.0 +2,2.0,2020-04-01 00:45:06,2020-04-01 00:51:13,N,1.0,244,243,3.0,1.0,6.5,0.5,0.5,0.0,0.0,"",0.3,7.8,2.0,1.0,0.0 +3,2.0,2020-04-01 00:45:06,2020-04-01 01:04:39,N,1.0,244,243,2.0,2.81,12.0,0.5,0.5,0.0,0.0,"",0.3,13.3,2.0,1.0,0.0 +4,2.0,2020-04-01 00:00:23,2020-04-01 00:16:13,N,1.0,75,169,1.0,6.79,21.0,0.5,0.5,0.0,0.0,"",0.3,22.3,1.0,1.0,0.0 diff --git a/feathr_project/test/unit/datasets/test_datasets.py b/feathr_project/test/unit/datasets/test_datasets.py index c1ac49a9b..10d89c673 100644 --- a/feathr_project/test/unit/datasets/test_datasets.py +++ b/feathr_project/test/unit/datasets/test_datasets.py @@ -12,15 +12,6 @@ NYC_TAXI_FILE_PATH = str(TEST_DATASET_DIR.joinpath("green_tripdata_2020-04_with_index.csv").resolve()) -@pytest.fixture(scope="module") -def spark() -> SparkSession: - """Generate a spark session for tests.""" - # Set ui port other than the default one (4040) so that feathr spark job may not fail. 
- spark_session = SparkSession.builder.appName("tests").config("spark.ui.port", "8080").getOrCreate() - yield spark_session - spark_session.stop() - - @pytest.mark.parametrize( "local_cache_path", [ diff --git a/feathr_project/test/unit/spark_provider/test_localspark_submission.py b/feathr_project/test/unit/spark_provider/test_localspark_submission.py index 9a9d7238b..992f2015e 100644 --- a/feathr_project/test/unit/spark_provider/test_localspark_submission.py +++ b/feathr_project/test/unit/spark_provider/test_localspark_submission.py @@ -4,6 +4,7 @@ import pytest from pytest_mock import MockerFixture +from feathr.constants import OUTPUT_PATH_TAG from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher @@ -15,9 +16,17 @@ def local_spark_job_launcher(tmp_path) -> _FeathrLocalSparkJobLauncher: ) +@pytest.mark.parametrize( + "job_tags,expected_result_uri", [ + (None, None), + ({OUTPUT_PATH_TAG: "output"}, "output"), + ] +) def test__local_spark_job_launcher__submit_feathr_job( mocker: MockerFixture, local_spark_job_launcher: _FeathrLocalSparkJobLauncher, + job_tags: Dict[str, str], + expected_result_uri: str, ): # Mock necessary components local_spark_job_launcher._init_args = MagicMock(return_value=[]) @@ -31,11 +40,16 @@ def test__local_spark_job_launcher__submit_feathr_job( job_name="unit-test", main_jar_path="", main_class_name="", + job_tags=job_tags, ) # Assert if the mocked spark process has called once mocked_spark_proc.assert_called_once() + # Assert job tags + assert local_spark_job_launcher.get_job_tags() == job_tags + assert local_spark_job_launcher.get_job_result_uri() == expected_result_uri + @pytest.mark.parametrize( "confs", [{}, {"spark.feathr.outputFormat": "parquet"}] diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py new file mode 100644 index 000000000..21392bf84 --- /dev/null +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -0,0 +1,112 @@ +# TODO with, without optional args +# TODO test with no data files exception and unsupported format exception +from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock + +import pandas as pd +import pytest +from pytest_mock import MockerFixture +from pyspark.sql import DataFrame, SparkSession + +from feathr import FeathrClient +from feathr.utils.job_utils import ( + get_result_df, + get_result_pandas_df, + get_result_spark_df, +) + + +def test__get_result_pandas_df(mocker: MockerFixture): + # Assert if the base function, get_result_df, called w/ proper args + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_pandas_df(client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path) + + +def test__get_result_spark_df(mocker: MockerFixture): + # Assert if the base function, get_result_df, called w/ proper args + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + spark = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_spark_df(spark, client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path, spark=spark) + + +# Local spark is expected to 
use a local filepath for res_url. Therefore, we mark this test to run with databricks. +@pytest.mark.databricks +def test__get_result_df__with_local_cache_path(feathr_client_databricks: FeathrClient): + # TODO Assert there is a local copy of the file in the given local_cache_path + pass + + +def test__get_result_df__exceptions(): + client = MagicMock() + client.get_job_result_uri = MagicMock(return_value=None) + + # Test ValueError when res_url is None + with pytest.raises(ValueError): + get_result_df(client) + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count to 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df( + workspace_dir: str, + feathr_client_local: FeathrClient, + data_format: str, + output_filename: str, + expected_count: int, +): + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + df = get_result_df( + client=feathr_client_local, + data_format=data_format, + res_url=res_url, + ) + assert isinstance(df, pd.DataFrame) + assert len(df) == expected_count + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count = 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df__with_spark_session( + workspace_dir: str, + feathr_client_local: FeathrClient, + spark: SparkSession, + data_format: str, + output_filename: str, + expected_count: int, +): + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + df = get_result_df( + client=feathr_client_local, + data_format=data_format, + res_url=res_url, + spark=spark, + ) + assert isinstance(df, DataFrame) + assert df.count() == expected_count From 15f4939b38b3c0bd9a0500b61404f0396eea34e0 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 10:26:40 +0800 Subject: [PATCH 04/18] Update test_azure_spark_e2e.py --- feathr_project/test/test_azure_spark_e2e.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/feathr_project/test/test_azure_spark_e2e.py b/feathr_project/test/test_azure_spark_e2e.py index 9c4ab8c5a..ae7c1cab2 100644 --- a/feathr_project/test/test_azure_spark_e2e.py +++ b/feathr_project/test/test_azure_spark_e2e.py @@ -183,7 +183,7 @@ def test_feathr_get_offline_features(): full_name="nyc_taxi.location_id") feature_query = FeatureQuery( - feature_list=["f_location_avg_fare"], key=location_id) + feature_list=["f_location_avg_fare", "f_trip_time_rounded"], key=location_id) settings = ObservationSettings( observation_path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04.csv", event_timestamp_column="lpep_dropoff_datetime", @@ -309,9 +309,9 @@ def test_feathr_materialize_to_aerospike(): # os.chdir(test_workspace_dir) now = datetime.now() # set workspace folder by time; make sure we don't have write conflict if there are many CI tests running - os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = 
''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) - os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)]) - + os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = ''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) + os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)]) + client = FeathrClient(config_path="feathr_config.yaml") batch_source = HdfsSource(name="nycTaxiBatchSource", path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04.csv", @@ -396,4 +396,4 @@ def test_feathr_materialize_to_aerospike(): if __name__ == "__main__": test_feathr_materialize_to_aerospike() test_feathr_get_offline_features_to_sql() - test_feathr_materialize_to_cosmosdb() \ No newline at end of file + test_feathr_materialize_to_cosmosdb() From 26b7a0d08fc10524aa15eac8d1dfc68d37d52142 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 11:13:13 +0800 Subject: [PATCH 05/18] Fix doc dead links (#805) This PR fixes dead links detected in latest ci run. The doc scan ci action has been updated to run on main only, as running this in PR frequently reports false alarm due to changes in CI not deployed. --- .github/workflows/document-scan.yml | 5 ++++- .../databricks/databricks_quickstart_nyc_taxi_demo.ipynb | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/document-scan.yml b/.github/workflows/document-scan.yml index 3762ca2af..291a04f44 100644 --- a/.github/workflows/document-scan.yml +++ b/.github/workflows/document-scan.yml @@ -1,6 +1,9 @@ name: Feathr Documents' Broken Link Check -on: [push] +on: + push: + branches: [main] + jobs: check-links: runs-on: ubuntu-latest diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 13187aa44..0bc099f11 100755 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. 
\n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. 
Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n",""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.4 ('feathr')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"vscode":{"interpreter":{"hash":"ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n",""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values into the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a Databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set the required Databricks credentials automatically by using a Databricks notebook context object as well as a new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommends using new job clusters for greater reliability. If you want to use an existing all-purpose cluster, you may add\n","`'existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing the `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Set up Databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if using feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a YAML file to define configurations. 
Please refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f:\n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables, using `__` to concatenate the different layers of the config file. For example, `feathr_runtime_location` in the Databricks config can be overwritten by setting the `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of the feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 in the year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derived features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. They can be defined with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing Python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","| SUM, COUNT, MAX, MIN, AVG | Numeric | Applies the numerical operation on the numeric inputs. |\n","| MAX_POOLING, MIN_POOLING, AVG_POOLING | Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers. |\n","| LATEST | Any | Returns the latest non-null value within the defined time window. |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
You may use a subset of the features.\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, the execution_configurations argument only works when using a new job cluster.\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict NYC taxi fares. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models, and evaluating them is an iterative process, where the model's performance may be used to modify the features as part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from the get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build an ML Pipeline\n","\n","Here, we use a Spark ML Pipeline to assemble the feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} From 8e401b4f840104d516a51f03c27a1961601eb754 Mon Sep 17 00:00:00 2001 From: Boli Guan Date: Tue, 1 Nov 2022 18:08:21 +0800 Subject: [PATCH 06/18] Improve UI experience and clean up ui code warnings (#801) * Add DataSourcesSelect and FlowGraph and ResizeTable components. Fix all warning and lint issues. Signed-off-by: Boli Guan * Add CardDescriptions component and fix ESlint warning. Signed-off-by: Boli Guan * Update FeatureDetails page title. Signed-off-by: Boli Guan * Rename ProjectSelect Signed-off-by: Boli Guan Signed-off-by: Boli Guan --- ui/.eslintrc | 3 +- ui/package-lock.json | 656 ++++++++++++------ ui/package.json | 8 +- ui/src/api/api.tsx | 46 +- ui/src/components/CardDescriptions/index.tsx | 32 + ui/src/components/FlowGraph/FlowGraph.tsx | 236 +++++++ ui/src/components/FlowGraph/LineageNode.tsx | 57 ++ ui/src/components/FlowGraph/index.module.less | 43 ++ ui/src/components/FlowGraph/index.ts | 5 + ui/src/components/FlowGraph/interface.ts | 30 + ui/src/components/FlowGraph/utils.ts | 192 +++++ ui/src/components/ProjectsSelect/index.tsx | 51 ++ .../components/ResizeTable/ResizableTitle.tsx | 37 + .../components/ResizeTable/ResizeHandle.tsx | 29 + ui/src/components/ResizeTable/ResizeTable.tsx | 68 ++ .../components/ResizeTable/index.module.less | 22 + ui/src/components/ResizeTable/index.tsx | 5 + ui/src/components/ResizeTable/interface.ts | 22 + ui/src/components/graph/graphNodeDetails.tsx | 107 ++- ui/src/components/header/headerWidgetMenu.tsx | 43 +- ui/src/components/sidemenu/siteMenu.tsx | 123 ++-- ui/src/models/model.ts | 14 +- .../components/DataSourceTable/index.tsx | 146 ++++ .../dataSource/components/SearchBar/index.tsx | 38 + ui/src/pages/dataSource/dataSourceDetails.tsx | 169 ++--- ui/src/pages/dataSource/dataSources.tsx | 25 +- .../feature/components/FeatureForm/index.tsx | 87 +++ .../feature/components/FeatureTable/index.tsx | 151 ++++ .../NodeDetails/FeatureNodeDetail.tsx | 44 ++ .../NodeDetails/SourceNodeDetial.tsx | 22 + .../feature/components/NodeDetails/index.tsx | 63 ++ .../feature/components/SearchBar/index.tsx | 67 ++ ui/src/pages/feature/featureDetails.tsx | 422 +++++------ ui/src/pages/feature/features.tsx | 29 +- ui/src/pages/feature/lineageGraph.tsx | 139 ++-- ui/src/pages/feature/newFeature.tsx | 11 +- .../management/components/RoleForm/index.tsx | 2 +- .../components/UserRolesTable/index.tsx | 23 +- ui/src/pages/management/management.tsx | 2 
+- ui/src/pages/management/roleManagement.tsx | 2 +- .../project/components/ProjectTable/index.tsx | 99 +++ .../project/components/SearchBar/index.tsx | 51 ++ ui/src/pages/project/projects.tsx | 23 +- ui/src/site.css | 6 +- ui/src/utils/attributesMapping.ts | 48 ++ ui/src/utils/utils.tsx | 18 +- 46 files changed, 2655 insertions(+), 861 deletions(-) create mode 100644 ui/src/components/CardDescriptions/index.tsx create mode 100644 ui/src/components/FlowGraph/FlowGraph.tsx create mode 100644 ui/src/components/FlowGraph/LineageNode.tsx create mode 100644 ui/src/components/FlowGraph/index.module.less create mode 100644 ui/src/components/FlowGraph/index.ts create mode 100644 ui/src/components/FlowGraph/interface.ts create mode 100644 ui/src/components/FlowGraph/utils.ts create mode 100644 ui/src/components/ProjectsSelect/index.tsx create mode 100644 ui/src/components/ResizeTable/ResizableTitle.tsx create mode 100644 ui/src/components/ResizeTable/ResizeHandle.tsx create mode 100644 ui/src/components/ResizeTable/ResizeTable.tsx create mode 100644 ui/src/components/ResizeTable/index.module.less create mode 100644 ui/src/components/ResizeTable/index.tsx create mode 100644 ui/src/components/ResizeTable/interface.ts create mode 100644 ui/src/pages/dataSource/components/DataSourceTable/index.tsx create mode 100644 ui/src/pages/dataSource/components/SearchBar/index.tsx create mode 100644 ui/src/pages/feature/components/FeatureForm/index.tsx create mode 100644 ui/src/pages/feature/components/FeatureTable/index.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/index.tsx create mode 100644 ui/src/pages/feature/components/SearchBar/index.tsx create mode 100644 ui/src/pages/project/components/ProjectTable/index.tsx create mode 100644 ui/src/pages/project/components/SearchBar/index.tsx create mode 100644 ui/src/utils/attributesMapping.ts diff --git a/ui/.eslintrc b/ui/.eslintrc index 43eeb60eb..c271bfa24 100644 --- a/ui/.eslintrc +++ b/ui/.eslintrc @@ -20,7 +20,8 @@ "react-app", // https://reactjs.org/docs/hooks-rules.html "plugin:react-hooks/recommended", - "plugin:prettier/recommended" + "plugin:prettier/recommended", + "plugin:json/recommended" ], "parser": "@typescript-eslint/parser", "parserOptions": { diff --git a/ui/package-lock.json b/ui/package-lock.json index b1568ad00..480dfdc62 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -11,7 +11,7 @@ "@ant-design/icons": "^4.7.0", "@azure/msal-browser": "^2.24.0", "@azure/msal-react": "^1.4.0", - "antd": "^4.20.2", + "antd": "^4.23.6", "axios": "^0.27.2", "classnames": "^2.3.2", "dagre": "^0.8.5", @@ -20,6 +20,7 @@ "react-dom": "^17.0.2", "react-flow-renderer": "^9.7.4", "react-query": "^3.38.0", + "react-resizable": "^3.0.4", "react-router-dom": "^6.3.0" }, "devDependencies": { @@ -32,6 +33,7 @@ "@types/node": "^16.11.26", "@types/react": "^17.0.43", "@types/react-dom": "^17.0.14", + "@types/react-resizable": "^3.0.3", "@typescript-eslint/eslint-plugin": "^5.30.7", "@typescript-eslint/parser": "^5.30.7", "babel-plugin-import": "^1.13.5", @@ -40,6 +42,7 @@ "eslint-config-prettier": "^8.5.0", "eslint-import-resolver-typescript": "^3.5.1", "eslint-plugin-import": "^2.26.0", + "eslint-plugin-json": "^3.1.0", "eslint-plugin-prettier": "^4.2.1", "eslint-plugin-react-hooks": "^4.6.0", "husky": "^8.0.1", @@ -94,14 +97,15 @@ "license": "MIT" }, 
"node_modules/@ant-design/react-slick": { - "version": "0.28.4", - "license": "MIT", + "version": "0.29.2", + "resolved": "https://registry.npmjs.org/@ant-design/react-slick/-/react-slick-0.29.2.tgz", + "integrity": "sha512-kgjtKmkGHa19FW21lHnAfyyH9AAoh35pBdcJ53rHmQ3O+cfFHGHnUbj/HFrRNJ5vIts09FKJVAD8RpaC+RaWfA==", "dependencies": { "@babel/runtime": "^7.10.4", "classnames": "^2.2.5", "json2mq": "^0.2.0", "lodash": "^4.17.21", - "resize-observer-polyfill": "^1.5.0" + "resize-observer-polyfill": "^1.5.1" }, "peerDependencies": { "react": ">=16.9.0" @@ -1946,10 +1950,11 @@ } }, "node_modules/@babel/runtime": { - "version": "7.17.9", - "license": "MIT", + "version": "7.20.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.20.0.tgz", + "integrity": "sha512-NDYdls71fTXoU8TZHfbBWg7DiZfNzClcKui/+kyi6ppD2L1qnWW3VV6CjtaBXSUGGhiTWJ6ereOIkUvenif66Q==", "dependencies": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.13.10" }, "engines": { "node": ">=6.9.0" @@ -3607,6 +3612,15 @@ "redux": "^4.0.0" } }, + "node_modules/@types/react-resizable": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/react-resizable/-/react-resizable-3.0.3.tgz", + "integrity": "sha512-W/QsUOZoXBAIBQNhNm95A5ohoaiUA874lWQytO2UP9dOjp5JHO9+a0cwYNabea7sA12ZDJnGVUFZxcNaNksAWA==", + "dev": true, + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/resolve": { "version": "1.17.1", "dev": true, @@ -4566,52 +4580,53 @@ } }, "node_modules/antd": { - "version": "4.20.2", - "license": "MIT", + "version": "4.23.6", + "resolved": "https://registry.npmjs.org/antd/-/antd-4.23.6.tgz", + "integrity": "sha512-AYH57cWBDe1ChtbnvG8i9dpKG4WnjE3AG0zIKpXByFNnxsr4saV6/19ihE8/ImSGpohN4E2zTXmo7R5/MyVRKQ==", "dependencies": { "@ant-design/colors": "^6.0.0", "@ant-design/icons": "^4.7.0", - "@ant-design/react-slick": "~0.28.1", - "@babel/runtime": "^7.12.5", + "@ant-design/react-slick": "~0.29.1", + "@babel/runtime": "^7.18.3", "@ctrl/tinycolor": "^3.4.0", "classnames": "^2.2.6", "copy-to-clipboard": "^3.2.0", "lodash": "^4.17.21", "memoize-one": "^6.0.0", "moment": "^2.29.2", - "rc-cascader": "~3.5.0", + "rc-cascader": "~3.7.0", "rc-checkbox": "~2.3.0", - "rc-collapse": "~3.1.0", - "rc-dialog": "~8.8.1", - "rc-drawer": "~4.4.2", - "rc-dropdown": "~3.5.0", - "rc-field-form": "~1.26.1", - "rc-image": "~5.6.0", - "rc-input": "~0.0.1-alpha.5", - "rc-input-number": "~7.3.0", - "rc-mentions": "~1.7.0", - "rc-menu": "~9.5.5", - "rc-motion": "^2.5.1", + "rc-collapse": "~3.3.0", + "rc-dialog": "~8.9.0", + "rc-drawer": "~5.1.0", + "rc-dropdown": "~4.0.0", + "rc-field-form": "~1.27.0", + "rc-image": "~5.7.0", + "rc-input": "~0.1.2", + "rc-input-number": "~7.3.9", + "rc-mentions": "~1.10.0", + "rc-menu": "~9.6.3", + "rc-motion": "^2.6.1", "rc-notification": "~4.6.0", - "rc-pagination": "~3.1.9", - "rc-picker": "~2.6.4", - "rc-progress": "~3.2.1", + "rc-pagination": "~3.1.17", + "rc-picker": "~2.6.11", + "rc-progress": "~3.3.2", "rc-rate": "~2.9.0", "rc-resize-observer": "^1.2.0", - "rc-segmented": "~2.1.0 ", - "rc-select": "~14.1.1", + "rc-segmented": "~2.1.0", + "rc-select": "~14.1.13", "rc-slider": "~10.0.0", "rc-steps": "~4.1.0", "rc-switch": "~3.2.0", - "rc-table": "~7.24.0", - "rc-tabs": "~11.13.0", - "rc-textarea": "~0.3.0", - "rc-tooltip": "~5.1.1", - "rc-tree": "~5.5.0", - "rc-tree-select": "~5.3.0", + "rc-table": "~7.26.0", + "rc-tabs": "~12.2.0", + "rc-textarea": "~0.4.5", + "rc-tooltip": "~5.2.0", + "rc-tree": "~5.7.0", + "rc-tree-select": "~5.5.0", "rc-trigger": "^5.2.10", 
"rc-upload": "~4.3.0", - "rc-util": "^5.20.0", + "rc-util": "^5.22.5", "scroll-into-view-if-needed": "^2.2.25" }, "funding": { @@ -4681,7 +4696,8 @@ }, "node_modules/array-tree-filter": { "version": "2.1.0", - "license": "MIT" + "resolved": "https://registry.npmjs.org/array-tree-filter/-/array-tree-filter-2.1.0.tgz", + "integrity": "sha512-4ROwICNlNw/Hqa9v+rk5h22KjmzB1JGTMVKP2AKJBOCgb0yL0ASf0+YvCcLNNwquOHNX48jkeZIJ3a+oOQqKcw==" }, "node_modules/array-union": { "version": "2.1.0", @@ -4749,8 +4765,9 @@ "license": "MIT" }, "node_modules/async-validator": { - "version": "4.1.1", - "license": "MIT" + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/async-validator/-/async-validator-4.2.5.tgz", + "integrity": "sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg==" }, "node_modules/asynckit": { "version": "0.4.0", @@ -6520,8 +6537,9 @@ } }, "node_modules/date-fns": { - "version": "2.28.0", - "license": "MIT", + "version": "2.29.3", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.3.tgz", + "integrity": "sha512-dDCnyH2WnnKusqvZZ6+jA1O51Ibt8ZMRNkDZdyAyK4YfbDwa/cEmuztzG5pk6hqlp9aSBPYcjOlktquahGwGeA==", "engines": { "node": ">=0.11" }, @@ -7491,6 +7509,19 @@ } } }, + "node_modules/eslint-plugin-json": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-json/-/eslint-plugin-json-3.1.0.tgz", + "integrity": "sha512-MrlG2ynFEHe7wDGwbUuFPsaT2b1uhuEFhJ+W1f1u+1C2EkXmTYJp4B1aAdQQ8M+CC3t//N/oRKiIVw14L2HR1g==", + "dev": true, + "dependencies": { + "lodash": "^4.17.21", + "vscode-json-languageservice": "^4.1.6" + }, + "engines": { + "node": ">=12.0" + } + }, "node_modules/eslint-plugin-jsx-a11y": { "version": "6.5.1", "dev": true, @@ -10877,7 +10908,8 @@ }, "node_modules/json2mq": { "version": "0.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/json2mq/-/json2mq-0.2.0.tgz", + "integrity": "sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==", "dependencies": { "string-convert": "^0.2.0" } @@ -10893,6 +10925,12 @@ "node": ">=6" } }, + "node_modules/jsonc-parser": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", + "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "dev": true + }, "node_modules/jsonfile": { "version": "6.1.0", "dev": true, @@ -13833,14 +13871,15 @@ } }, "node_modules/rc-cascader": { - "version": "3.5.0", - "license": "MIT", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/rc-cascader/-/rc-cascader-3.7.0.tgz", + "integrity": "sha512-SFtGpwmYN7RaWEAGTS4Rkc62ZV/qmQGg/tajr/7mfIkleuu8ro9Hlk6J+aA0x1YS4zlaZBtTcSaXM01QMiEV/A==", "dependencies": { "@babel/runtime": "^7.12.5", "array-tree-filter": "^2.1.0", "classnames": "^2.3.1", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.6.1" }, "peerDependencies": { @@ -13861,8 +13900,9 @@ } }, "node_modules/rc-collapse": { - "version": "3.1.4", - "license": "MIT", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/rc-collapse/-/rc-collapse-3.3.1.tgz", + "integrity": "sha512-cOJfcSe3R8vocrF8T+PgaHDrgeA1tX+lwfhwSj60NX9QVRidsILIbRNDLD6nAzmcvVC5PWiIRiR4S1OobxdhCg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -13876,8 +13916,9 @@ } }, "node_modules/rc-dialog": { - "version": "8.8.1", - "license": "MIT", + "version": "8.9.0", + "resolved": 
"https://registry.npmjs.org/rc-dialog/-/rc-dialog-8.9.0.tgz", + "integrity": "sha512-Cp0tbJnrvPchJfnwIvOMWmJ4yjX3HWFatO6oBFD1jx8QkgsQCR0p8nUWAKdd3seLJhEC39/v56kZaEjwp9muoQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -13890,11 +13931,14 @@ } }, "node_modules/rc-drawer": { - "version": "4.4.3", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/rc-drawer/-/rc-drawer-5.1.0.tgz", + "integrity": "sha512-pU3Tsn99pxGdYowXehzZbdDVE+4lDXSGb7p8vA9mSmr569oc2Izh4Zw5vLKSe/Xxn2p5MSNbLVqD4tz+pK6SOw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-util": "^5.7.0" + "rc-motion": "^2.6.1", + "rc-util": "^5.21.2" }, "peerDependencies": { "react": ">=16.9.0", @@ -13902,12 +13946,13 @@ } }, "node_modules/rc-dropdown": { - "version": "3.5.2", - "license": "MIT", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/rc-dropdown/-/rc-dropdown-4.0.1.tgz", + "integrity": "sha512-OdpXuOcme1rm45cR0Jzgfl1otzmU4vuBVb+etXM8vcaULGokAKVpKlw8p6xzspG7jGd/XxShvq+N3VNEfk/l5g==", "dependencies": { - "@babel/runtime": "^7.10.1", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", - "rc-trigger": "^5.0.4", + "rc-trigger": "^5.3.1", "rc-util": "^5.17.0" }, "peerDependencies": { @@ -13916,10 +13961,11 @@ } }, "node_modules/rc-field-form": { - "version": "1.26.3", - "license": "MIT", + "version": "1.27.3", + "resolved": "https://registry.npmjs.org/rc-field-form/-/rc-field-form-1.27.3.tgz", + "integrity": "sha512-HGqxHnmGQgkPApEcikV4qTg3BLPC82uB/cwBDftDt1pYaqitJfSl5TFTTUMKVEJVT5RqJ2Zi68ME1HmIMX2HAw==", "dependencies": { - "@babel/runtime": "^7.8.4", + "@babel/runtime": "^7.18.0", "async-validator": "^4.1.0", "rc-util": "^5.8.0" }, @@ -13932,12 +13978,13 @@ } }, "node_modules/rc-image": { - "version": "5.6.2", - "license": "MIT", + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/rc-image/-/rc-image-5.7.1.tgz", + "integrity": "sha512-QyMfdhoUfb5W14plqXSisaYwpdstcLYnB0MjX5ccIK2rydQM9sDPuekQWu500DDGR2dBaIF5vx9XbWkNFK17Fg==", "dependencies": { "@babel/runtime": "^7.11.2", "classnames": "^2.2.6", - "rc-dialog": "~8.8.0", + "rc-dialog": "~8.9.0", "rc-util": "^5.0.6" }, "peerDependencies": { @@ -13946,8 +13993,9 @@ } }, "node_modules/rc-input": { - "version": "0.0.1-alpha.7", - "license": "MIT", + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/rc-input/-/rc-input-0.1.4.tgz", + "integrity": "sha512-FqDdNz+fV2dKNgfXzcSLKvC+jEs1709t7nD+WdfjrdSaOcefpgc7BUJYadc3usaING+b7ediMTfKxuJBsEFbXA==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -13959,12 +14007,13 @@ } }, "node_modules/rc-input-number": { - "version": "7.3.4", - "license": "MIT", + "version": "7.3.9", + "resolved": "https://registry.npmjs.org/rc-input-number/-/rc-input-number-7.3.9.tgz", + "integrity": "sha512-u0+miS+SATdb6DtssYei2JJ1WuZME+nXaG6XGtR8maNyW5uGDytfDu60OTWLQEb0Anv/AcCzehldV8CKmKyQfA==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", - "rc-util": "^5.9.8" + "rc-util": "^5.23.0" }, "peerDependencies": { "react": ">=16.9.0", @@ -13972,15 +14021,16 @@ } }, "node_modules/rc-mentions": { - "version": "1.7.1", - "license": "MIT", + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/rc-mentions/-/rc-mentions-1.10.0.tgz", + "integrity": "sha512-oMlYWnwXSxP2NQVlgxOTzuG/u9BUc3ySY78K3/t7MNhJWpZzXTao+/Bic6tyZLuNCO89//hVQJBdaR2rnFQl6Q==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-menu": "~9.5.1", - "rc-textarea": "^0.3.0", + "rc-menu": "~9.6.0", + "rc-textarea": "^0.4.0", 
"rc-trigger": "^5.0.4", - "rc-util": "^5.0.1" + "rc-util": "^5.22.5" }, "peerDependencies": { "react": ">=16.9.0", @@ -13988,8 +14038,9 @@ } }, "node_modules/rc-menu": { - "version": "9.5.5", - "license": "MIT", + "version": "9.6.4", + "resolved": "https://registry.npmjs.org/rc-menu/-/rc-menu-9.6.4.tgz", + "integrity": "sha512-6DiNAjxjVIPLZXHffXxxcyE15d4isRL7iQ1ru4MqYDH2Cqc5bW96wZOdMydFtGLyDdnmEQ9jVvdCE9yliGvzkw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -14005,8 +14056,9 @@ } }, "node_modules/rc-motion": { - "version": "2.6.0", - "license": "MIT", + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/rc-motion/-/rc-motion-2.6.2.tgz", + "integrity": "sha512-4w1FaX3dtV749P8GwfS4fYnFG4Rb9pxvCYPc/b2fw1cmlHJWNNgOFIz7ysiD+eOrzJSvnLJWlNQQncpNMXwwpg==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -14035,8 +14087,9 @@ } }, "node_modules/rc-overflow": { - "version": "1.2.5", - "license": "MIT", + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc-overflow/-/rc-overflow-1.2.8.tgz", + "integrity": "sha512-QJ0UItckWPQ37ZL1dMEBAdY1dhfTXFL9k6oTTcyydVwoUNMnMqCGqnRNA98axSr/OeDKqR6DVFyi8eA5RQI/uQ==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -14049,8 +14102,9 @@ } }, "node_modules/rc-pagination": { - "version": "3.1.16", - "license": "MIT", + "version": "3.1.17", + "resolved": "https://registry.npmjs.org/rc-pagination/-/rc-pagination-3.1.17.tgz", + "integrity": "sha512-/BQ5UxcBnW28vFAcP2hfh+Xg15W0QZn8TWYwdCApchMH1H0CxiaUUcULP8uXcFM1TygcdKWdt3JqsL9cTAfdkQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1" @@ -14061,8 +14115,9 @@ } }, "node_modules/rc-picker": { - "version": "2.6.8", - "license": "MIT", + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/rc-picker/-/rc-picker-2.6.11.tgz", + "integrity": "sha512-INJ7ULu+Kj4UgqbcqE8Q+QpMw55xFf9kkyLBHJFk0ihjJpAV4glialRfqHE7k4KX2BWYPQfpILwhwR14x2EiRQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -14082,8 +14137,9 @@ } }, "node_modules/rc-progress": { - "version": "3.2.4", - "license": "MIT", + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/rc-progress/-/rc-progress-3.3.3.tgz", + "integrity": "sha512-MDVNVHzGanYtRy2KKraEaWeZLri2ZHWIRyaE1a9MQ2MuJ09m+Wxj5cfcaoaR6z5iRpHpA59YeUxAlpML8N4PJw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -14112,7 +14168,8 @@ }, "node_modules/rc-resize-observer": { "version": "1.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/rc-resize-observer/-/rc-resize-observer-1.2.0.tgz", + "integrity": "sha512-6W+UzT3PyDM0wVCEHfoW3qTHPTvbdSgiA43buiy8PzmeMnfgnDeb9NjdimMXMl3/TcrvvWl5RRVdp+NqcR47pQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -14139,8 +14196,9 @@ } }, "node_modules/rc-select": { - "version": "14.1.2", - "license": "MIT", + "version": "14.1.13", + "resolved": "https://registry.npmjs.org/rc-select/-/rc-select-14.1.13.tgz", + "integrity": "sha512-WMEsC3gTwA1dbzWOdVIXDmWyidYNLq68AwvvUlRROw790uGUly0/vmqDozXrIr0QvN/A3CEULx12o+WtLCAefg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -14206,13 +14264,14 @@ } }, "node_modules/rc-table": { - "version": "7.24.1", - "license": "MIT", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/rc-table/-/rc-table-7.26.0.tgz", + "integrity": "sha512-0cD8e6S+DTGAt5nBZQIPFYEaIukn17sfa5uFL98faHlH/whZzD8ii3dbFL4wmUDEL4BLybhYop+QUfZJ4CPvNQ==", "dependencies": { "@babel/runtime": "^7.10.1", 
"classnames": "^2.2.5", "rc-resize-observer": "^1.1.0", - "rc-util": "^5.14.0", + "rc-util": "^5.22.5", "shallowequal": "^1.1.0" }, "engines": { @@ -14224,13 +14283,15 @@ } }, "node_modules/rc-tabs": { - "version": "11.13.0", - "license": "MIT", + "version": "12.2.1", + "resolved": "https://registry.npmjs.org/rc-tabs/-/rc-tabs-12.2.1.tgz", + "integrity": "sha512-09pVv4kN8VFqp6THceEmxOW8PAShQC08hrroeVYP4Y8YBFaP1PIWdyFL01czcbyz5YZFj9flZ7aljMaAl0jLVg==", "dependencies": { "@babel/runtime": "^7.11.2", "classnames": "2.x", - "rc-dropdown": "~3.5.0", - "rc-menu": "~9.5.1", + "rc-dropdown": "~4.0.0", + "rc-menu": "~9.6.0", + "rc-motion": "^2.6.2", "rc-resize-observer": "^1.0.0", "rc-util": "^5.5.0" }, @@ -14243,13 +14304,14 @@ } }, "node_modules/rc-textarea": { - "version": "0.3.7", - "license": "MIT", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/rc-textarea/-/rc-textarea-0.4.6.tgz", + "integrity": "sha512-HEKCu8nouXXayqYelQnhQm8fdH7v92pAQvfVCz+jhIPv2PHTyBxVrmoZJMn3B8cU+wdyuvRGkshngO3/TzBn4w==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", "rc-resize-observer": "^1.0.0", - "rc-util": "^5.7.0", + "rc-util": "^5.24.4", "shallowequal": "^1.1.0" }, "peerDependencies": { @@ -14258,10 +14320,12 @@ } }, "node_modules/rc-tooltip": { - "version": "5.1.1", - "license": "MIT", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/rc-tooltip/-/rc-tooltip-5.2.2.tgz", + "integrity": "sha512-jtQzU/18S6EI3lhSGoDYhPqNpWajMtS5VV/ld1LwyfrDByQpYmw/LW6U7oFXXLukjfDHQ7Ju705A82PRNFWYhg==", "dependencies": { "@babel/runtime": "^7.11.2", + "classnames": "^2.3.1", "rc-trigger": "^5.0.0" }, "peerDependencies": { @@ -14270,14 +14334,15 @@ } }, "node_modules/rc-tree": { - "version": "5.5.0", - "license": "MIT", + "version": "5.7.0", + "resolved": "https://registry.npmjs.org/rc-tree/-/rc-tree-5.7.0.tgz", + "integrity": "sha512-F+Ewkv/UcutshnVBMISP+lPdHDlcsL+YH/MQDVWbk+QdkfID7vXiwrHMEZn31+2Rbbm21z/HPceGS8PXGMmnQg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-motion": "^2.0.1", "rc-util": "^5.16.1", - "rc-virtual-list": "^3.4.2" + "rc-virtual-list": "^3.4.8" }, "engines": { "node": ">=10.x" @@ -14288,13 +14353,14 @@ } }, "node_modules/rc-tree-select": { - "version": "5.3.0", - "license": "MIT", + "version": "5.5.3", + "resolved": "https://registry.npmjs.org/rc-tree-select/-/rc-tree-select-5.5.3.tgz", + "integrity": "sha512-gv8KyC6J7f9e50OkGk1ibF7v8vL+iaBnA8Ep/EVlMma2/tGdBQXO9xIvPjX8eQrZL5PjoeTUndNPM3cY3721ng==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.16.1" }, "peerDependencies": { @@ -14303,10 +14369,11 @@ } }, "node_modules/rc-trigger": { - "version": "5.2.18", - "license": "MIT", + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/rc-trigger/-/rc-trigger-5.3.3.tgz", + "integrity": "sha512-IC4nuTSAME7RJSgwvHCNDQrIzhvGMKf6NDu5veX+zk1MG7i1UnwTWWthcP9WHw3+FZfP3oZGvkrHFPu/EGkFKw==", "dependencies": { - "@babel/runtime": "^7.11.2", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", "rc-align": "^4.0.0", "rc-motion": "^2.0.0", @@ -14334,10 +14401,11 @@ } }, "node_modules/rc-util": { - "version": "5.21.2", - "license": "MIT", + "version": "5.24.4", + "resolved": "https://registry.npmjs.org/rc-util/-/rc-util-5.24.4.tgz", + "integrity": "sha512-2a4RQnycV9eV7lVZPEJ7QwJRPlZNc06J7CwcwZo4vIHr3PfUqtYgl1EkUV9ETAc6VRRi8XZOMFhYG63whlIC9Q==", "dependencies": { - "@babel/runtime": "^7.12.5", + "@babel/runtime": "^7.18.3", "react-is": 
"^16.12.0", "shallowequal": "^1.1.0" }, @@ -14347,9 +14415,11 @@ } }, "node_modules/rc-virtual-list": { - "version": "3.4.7", - "license": "MIT", + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/rc-virtual-list/-/rc-virtual-list-3.4.11.tgz", + "integrity": "sha512-BvUUH60kkeTBPigN5F89HtGaA5jSP4y2aM6cJ4dk9Y42I9yY+h6i08wF6UKeDcxdfOU8j3I5HxkSS/xA77J3wA==", "dependencies": { + "@babel/runtime": "^7.20.0", "classnames": "^2.2.6", "rc-resize-observer": "^1.0.0", "rc-util": "^5.15.0" @@ -14570,6 +14640,18 @@ "node": ">=0.10.0" } }, + "node_modules/react-resizable": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/react-resizable/-/react-resizable-3.0.4.tgz", + "integrity": "sha512-StnwmiESiamNzdRHbSSvA65b0ZQJ7eVQpPusrSmcpyGKzC0gojhtO62xxH6YOBmepk9dQTBi9yxidL3W4s3EBA==", + "dependencies": { + "prop-types": "15.x", + "react-draggable": "^4.0.3" + }, + "peerDependencies": { + "react": ">= 16.3" + } + }, "node_modules/react-router": { "version": "6.3.0", "license": "MIT", @@ -14746,8 +14828,9 @@ } }, "node_modules/regenerator-runtime": { - "version": "0.13.9", - "license": "MIT" + "version": "0.13.10", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz", + "integrity": "sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==" }, "node_modules/regenerator-transform": { "version": "0.15.0", @@ -15761,7 +15844,8 @@ }, "node_modules/string-convert": { "version": "0.2.1", - "license": "MIT" + "resolved": "https://registry.npmjs.org/string-convert/-/string-convert-0.2.1.tgz", + "integrity": "sha512-u/1tdPl4yQnPBjnVrmdLo9gtuLvELKsAoRapekWggdiQNvvvum+jYF329d84NAa660KQw7pB2n36KrIKVoXa3A==" }, "node_modules/string-length": { "version": "4.0.2", @@ -16735,6 +16819,43 @@ "node": ">= 0.8" } }, + "node_modules/vscode-json-languageservice": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/vscode-json-languageservice/-/vscode-json-languageservice-4.2.1.tgz", + "integrity": "sha512-xGmv9QIWs2H8obGbWg+sIPI/3/pFgj/5OWBhNzs00BkYQ9UaB2F6JJaGB/2/YOZJ3BvLXQTC4Q7muqU25QgAhA==", + "dev": true, + "dependencies": { + "jsonc-parser": "^3.0.0", + "vscode-languageserver-textdocument": "^1.0.3", + "vscode-languageserver-types": "^3.16.0", + "vscode-nls": "^5.0.0", + "vscode-uri": "^3.0.3" + } + }, + "node_modules/vscode-languageserver-textdocument": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/vscode-languageserver-textdocument/-/vscode-languageserver-textdocument-1.0.7.tgz", + "integrity": "sha512-bFJH7UQxlXT8kKeyiyu41r22jCZXG8kuuVVA33OEJn1diWOZK5n8zBSPZFHVBOu8kXZ6h0LIRhf5UnCo61J4Hg==", + "dev": true + }, + "node_modules/vscode-languageserver-types": { + "version": "3.17.2", + "resolved": "https://registry.npmjs.org/vscode-languageserver-types/-/vscode-languageserver-types-3.17.2.tgz", + "integrity": "sha512-zHhCWatviizPIq9B7Vh9uvrH6x3sK8itC84HkamnBWoDFJtzBf7SWlpLCZUit72b3os45h6RWQNC9xHRDF8dRA==", + "dev": true + }, + "node_modules/vscode-nls": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-nls/-/vscode-nls-5.2.0.tgz", + "integrity": "sha512-RAaHx7B14ZU04EU31pT+rKz2/zSl7xMsfIZuo8pd+KZO6PXtQmpevpq3vxvWNcrGbdmhM/rr5Uw5Mz+NBfhVng==", + "dev": true + }, + "node_modules/vscode-uri": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.0.6.tgz", + "integrity": "sha512-fmL7V1eiDBFRRnu+gfRWTzyPpNIHJTc4mWnFkwBUmO9U3KPgJAmTx7oxi2bl/Rh6HLdU7+4C9wlj0k2E4AdKFQ==", + "dev": true + }, "node_modules/w3c-hr-time": { "version": 
"1.0.2", "dev": true, @@ -17543,13 +17664,15 @@ "version": "4.2.1" }, "@ant-design/react-slick": { - "version": "0.28.4", + "version": "0.29.2", + "resolved": "https://registry.npmjs.org/@ant-design/react-slick/-/react-slick-0.29.2.tgz", + "integrity": "sha512-kgjtKmkGHa19FW21lHnAfyyH9AAoh35pBdcJ53rHmQ3O+cfFHGHnUbj/HFrRNJ5vIts09FKJVAD8RpaC+RaWfA==", "requires": { "@babel/runtime": "^7.10.4", "classnames": "^2.2.5", "json2mq": "^0.2.0", "lodash": "^4.17.21", - "resize-observer-polyfill": "^1.5.0" + "resize-observer-polyfill": "^1.5.1" } }, "@apideck/better-ajv-errors": { @@ -18653,9 +18776,11 @@ } }, "@babel/runtime": { - "version": "7.17.9", + "version": "7.20.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.20.0.tgz", + "integrity": "sha512-NDYdls71fTXoU8TZHfbBWg7DiZfNzClcKui/+kyi6ppD2L1qnWW3VV6CjtaBXSUGGhiTWJ6ereOIkUvenif66Q==", "requires": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.13.10" } }, "@babel/runtime-corejs3": { @@ -19755,6 +19880,15 @@ "redux": "^4.0.0" } }, + "@types/react-resizable": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/react-resizable/-/react-resizable-3.0.3.tgz", + "integrity": "sha512-W/QsUOZoXBAIBQNhNm95A5ohoaiUA874lWQytO2UP9dOjp5JHO9+a0cwYNabea7sA12ZDJnGVUFZxcNaNksAWA==", + "dev": true, + "requires": { + "@types/react": "*" + } + }, "@types/resolve": { "version": "1.17.1", "dev": true, @@ -20335,51 +20469,53 @@ } }, "antd": { - "version": "4.20.2", + "version": "4.23.6", + "resolved": "https://registry.npmjs.org/antd/-/antd-4.23.6.tgz", + "integrity": "sha512-AYH57cWBDe1ChtbnvG8i9dpKG4WnjE3AG0zIKpXByFNnxsr4saV6/19ihE8/ImSGpohN4E2zTXmo7R5/MyVRKQ==", "requires": { "@ant-design/colors": "^6.0.0", "@ant-design/icons": "^4.7.0", - "@ant-design/react-slick": "~0.28.1", - "@babel/runtime": "^7.12.5", + "@ant-design/react-slick": "~0.29.1", + "@babel/runtime": "^7.18.3", "@ctrl/tinycolor": "^3.4.0", "classnames": "^2.2.6", "copy-to-clipboard": "^3.2.0", "lodash": "^4.17.21", "memoize-one": "^6.0.0", "moment": "^2.29.2", - "rc-cascader": "~3.5.0", + "rc-cascader": "~3.7.0", "rc-checkbox": "~2.3.0", - "rc-collapse": "~3.1.0", - "rc-dialog": "~8.8.1", - "rc-drawer": "~4.4.2", - "rc-dropdown": "~3.5.0", - "rc-field-form": "~1.26.1", - "rc-image": "~5.6.0", - "rc-input": "~0.0.1-alpha.5", - "rc-input-number": "~7.3.0", - "rc-mentions": "~1.7.0", - "rc-menu": "~9.5.5", - "rc-motion": "^2.5.1", + "rc-collapse": "~3.3.0", + "rc-dialog": "~8.9.0", + "rc-drawer": "~5.1.0", + "rc-dropdown": "~4.0.0", + "rc-field-form": "~1.27.0", + "rc-image": "~5.7.0", + "rc-input": "~0.1.2", + "rc-input-number": "~7.3.9", + "rc-mentions": "~1.10.0", + "rc-menu": "~9.6.3", + "rc-motion": "^2.6.1", "rc-notification": "~4.6.0", - "rc-pagination": "~3.1.9", - "rc-picker": "~2.6.4", - "rc-progress": "~3.2.1", + "rc-pagination": "~3.1.17", + "rc-picker": "~2.6.11", + "rc-progress": "~3.3.2", "rc-rate": "~2.9.0", "rc-resize-observer": "^1.2.0", - "rc-segmented": "~2.1.0 ", - "rc-select": "~14.1.1", + "rc-segmented": "~2.1.0", + "rc-select": "~14.1.13", "rc-slider": "~10.0.0", "rc-steps": "~4.1.0", "rc-switch": "~3.2.0", - "rc-table": "~7.24.0", - "rc-tabs": "~11.13.0", - "rc-textarea": "~0.3.0", - "rc-tooltip": "~5.1.1", - "rc-tree": "~5.5.0", - "rc-tree-select": "~5.3.0", + "rc-table": "~7.26.0", + "rc-tabs": "~12.2.0", + "rc-textarea": "~0.4.5", + "rc-tooltip": "~5.2.0", + "rc-tree": "~5.7.0", + "rc-tree-select": "~5.5.0", "rc-trigger": "^5.2.10", "rc-upload": "~4.3.0", - "rc-util": "^5.20.0", + "rc-util": "^5.22.5", 
"scroll-into-view-if-needed": "^2.2.25" } }, @@ -20422,7 +20558,9 @@ } }, "array-tree-filter": { - "version": "2.1.0" + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-tree-filter/-/array-tree-filter-2.1.0.tgz", + "integrity": "sha512-4ROwICNlNw/Hqa9v+rk5h22KjmzB1JGTMVKP2AKJBOCgb0yL0ASf0+YvCcLNNwquOHNX48jkeZIJ3a+oOQqKcw==" }, "array-union": { "version": "2.1.0", @@ -20465,7 +20603,9 @@ "dev": true }, "async-validator": { - "version": "4.1.1" + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/async-validator/-/async-validator-4.2.5.tgz", + "integrity": "sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg==" }, "asynckit": { "version": "0.4.0" @@ -21606,7 +21746,9 @@ } }, "date-fns": { - "version": "2.28.0" + "version": "2.29.3", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.3.tgz", + "integrity": "sha512-dDCnyH2WnnKusqvZZ6+jA1O51Ibt8ZMRNkDZdyAyK4YfbDwa/cEmuztzG5pk6hqlp9aSBPYcjOlktquahGwGeA==" }, "dayjs": { "version": "1.11.5", @@ -22303,6 +22445,16 @@ "@typescript-eslint/experimental-utils": "^5.0.0" } }, + "eslint-plugin-json": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-json/-/eslint-plugin-json-3.1.0.tgz", + "integrity": "sha512-MrlG2ynFEHe7wDGwbUuFPsaT2b1uhuEFhJ+W1f1u+1C2EkXmTYJp4B1aAdQQ8M+CC3t//N/oRKiIVw14L2HR1g==", + "dev": true, + "requires": { + "lodash": "^4.17.21", + "vscode-json-languageservice": "^4.1.6" + } + }, "eslint-plugin-jsx-a11y": { "version": "6.5.1", "dev": true, @@ -24453,6 +24605,8 @@ }, "json2mq": { "version": "0.2.0", + "resolved": "https://registry.npmjs.org/json2mq/-/json2mq-0.2.0.tgz", + "integrity": "sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==", "requires": { "string-convert": "^0.2.0" } @@ -24461,6 +24615,12 @@ "version": "2.2.1", "dev": true }, + "jsonc-parser": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", + "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "dev": true + }, "jsonfile": { "version": "6.1.0", "dev": true, @@ -26190,13 +26350,15 @@ } }, "rc-cascader": { - "version": "3.5.0", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/rc-cascader/-/rc-cascader-3.7.0.tgz", + "integrity": "sha512-SFtGpwmYN7RaWEAGTS4Rkc62ZV/qmQGg/tajr/7mfIkleuu8ro9Hlk6J+aA0x1YS4zlaZBtTcSaXM01QMiEV/A==", "requires": { "@babel/runtime": "^7.12.5", "array-tree-filter": "^2.1.0", "classnames": "^2.3.1", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.6.1" } }, @@ -26208,7 +26370,9 @@ } }, "rc-collapse": { - "version": "3.1.4", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/rc-collapse/-/rc-collapse-3.3.1.tgz", + "integrity": "sha512-cOJfcSe3R8vocrF8T+PgaHDrgeA1tX+lwfhwSj60NX9QVRidsILIbRNDLD6nAzmcvVC5PWiIRiR4S1OobxdhCg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26218,7 +26382,9 @@ } }, "rc-dialog": { - "version": "8.8.1", + "version": "8.9.0", + "resolved": "https://registry.npmjs.org/rc-dialog/-/rc-dialog-8.9.0.tgz", + "integrity": "sha512-Cp0tbJnrvPchJfnwIvOMWmJ4yjX3HWFatO6oBFD1jx8QkgsQCR0p8nUWAKdd3seLJhEC39/v56kZaEjwp9muoQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -26227,41 +26393,52 @@ } }, "rc-drawer": { - "version": "4.4.3", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/rc-drawer/-/rc-drawer-5.1.0.tgz", + "integrity": 
"sha512-pU3Tsn99pxGdYowXehzZbdDVE+4lDXSGb7p8vA9mSmr569oc2Izh4Zw5vLKSe/Xxn2p5MSNbLVqD4tz+pK6SOw==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-util": "^5.7.0" + "rc-motion": "^2.6.1", + "rc-util": "^5.21.2" } }, "rc-dropdown": { - "version": "3.5.2", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/rc-dropdown/-/rc-dropdown-4.0.1.tgz", + "integrity": "sha512-OdpXuOcme1rm45cR0Jzgfl1otzmU4vuBVb+etXM8vcaULGokAKVpKlw8p6xzspG7jGd/XxShvq+N3VNEfk/l5g==", "requires": { - "@babel/runtime": "^7.10.1", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", - "rc-trigger": "^5.0.4", + "rc-trigger": "^5.3.1", "rc-util": "^5.17.0" } }, "rc-field-form": { - "version": "1.26.3", + "version": "1.27.3", + "resolved": "https://registry.npmjs.org/rc-field-form/-/rc-field-form-1.27.3.tgz", + "integrity": "sha512-HGqxHnmGQgkPApEcikV4qTg3BLPC82uB/cwBDftDt1pYaqitJfSl5TFTTUMKVEJVT5RqJ2Zi68ME1HmIMX2HAw==", "requires": { - "@babel/runtime": "^7.8.4", + "@babel/runtime": "^7.18.0", "async-validator": "^4.1.0", "rc-util": "^5.8.0" } }, "rc-image": { - "version": "5.6.2", + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/rc-image/-/rc-image-5.7.1.tgz", + "integrity": "sha512-QyMfdhoUfb5W14plqXSisaYwpdstcLYnB0MjX5ccIK2rydQM9sDPuekQWu500DDGR2dBaIF5vx9XbWkNFK17Fg==", "requires": { "@babel/runtime": "^7.11.2", "classnames": "^2.2.6", - "rc-dialog": "~8.8.0", + "rc-dialog": "~8.9.0", "rc-util": "^5.0.6" } }, "rc-input": { - "version": "0.0.1-alpha.7", + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/rc-input/-/rc-input-0.1.4.tgz", + "integrity": "sha512-FqDdNz+fV2dKNgfXzcSLKvC+jEs1709t7nD+WdfjrdSaOcefpgc7BUJYadc3usaING+b7ediMTfKxuJBsEFbXA==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26269,26 +26446,32 @@ } }, "rc-input-number": { - "version": "7.3.4", + "version": "7.3.9", + "resolved": "https://registry.npmjs.org/rc-input-number/-/rc-input-number-7.3.9.tgz", + "integrity": "sha512-u0+miS+SATdb6DtssYei2JJ1WuZME+nXaG6XGtR8maNyW5uGDytfDu60OTWLQEb0Anv/AcCzehldV8CKmKyQfA==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", - "rc-util": "^5.9.8" + "rc-util": "^5.23.0" } }, "rc-mentions": { - "version": "1.7.1", + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/rc-mentions/-/rc-mentions-1.10.0.tgz", + "integrity": "sha512-oMlYWnwXSxP2NQVlgxOTzuG/u9BUc3ySY78K3/t7MNhJWpZzXTao+/Bic6tyZLuNCO89//hVQJBdaR2rnFQl6Q==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-menu": "~9.5.1", - "rc-textarea": "^0.3.0", + "rc-menu": "~9.6.0", + "rc-textarea": "^0.4.0", "rc-trigger": "^5.0.4", - "rc-util": "^5.0.1" + "rc-util": "^5.22.5" } }, "rc-menu": { - "version": "9.5.5", + "version": "9.6.4", + "resolved": "https://registry.npmjs.org/rc-menu/-/rc-menu-9.6.4.tgz", + "integrity": "sha512-6DiNAjxjVIPLZXHffXxxcyE15d4isRL7iQ1ru4MqYDH2Cqc5bW96wZOdMydFtGLyDdnmEQ9jVvdCE9yliGvzkw==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26300,7 +26483,9 @@ } }, "rc-motion": { - "version": "2.6.0", + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/rc-motion/-/rc-motion-2.6.2.tgz", + "integrity": "sha512-4w1FaX3dtV749P8GwfS4fYnFG4Rb9pxvCYPc/b2fw1cmlHJWNNgOFIz7ysiD+eOrzJSvnLJWlNQQncpNMXwwpg==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26317,7 +26502,9 @@ } }, "rc-overflow": { - "version": "1.2.5", + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc-overflow/-/rc-overflow-1.2.8.tgz", + "integrity": 
"sha512-QJ0UItckWPQ37ZL1dMEBAdY1dhfTXFL9k6oTTcyydVwoUNMnMqCGqnRNA98axSr/OeDKqR6DVFyi8eA5RQI/uQ==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26326,14 +26513,18 @@ } }, "rc-pagination": { - "version": "3.1.16", + "version": "3.1.17", + "resolved": "https://registry.npmjs.org/rc-pagination/-/rc-pagination-3.1.17.tgz", + "integrity": "sha512-/BQ5UxcBnW28vFAcP2hfh+Xg15W0QZn8TWYwdCApchMH1H0CxiaUUcULP8uXcFM1TygcdKWdt3JqsL9cTAfdkQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1" } }, "rc-picker": { - "version": "2.6.8", + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/rc-picker/-/rc-picker-2.6.11.tgz", + "integrity": "sha512-INJ7ULu+Kj4UgqbcqE8Q+QpMw55xFf9kkyLBHJFk0ihjJpAV4glialRfqHE7k4KX2BWYPQfpILwhwR14x2EiRQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -26346,7 +26537,9 @@ } }, "rc-progress": { - "version": "3.2.4", + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/rc-progress/-/rc-progress-3.3.3.tgz", + "integrity": "sha512-MDVNVHzGanYtRy2KKraEaWeZLri2ZHWIRyaE1a9MQ2MuJ09m+Wxj5cfcaoaR6z5iRpHpA59YeUxAlpML8N4PJw==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -26363,6 +26556,8 @@ }, "rc-resize-observer": { "version": "1.2.0", + "resolved": "https://registry.npmjs.org/rc-resize-observer/-/rc-resize-observer-1.2.0.tgz", + "integrity": "sha512-6W+UzT3PyDM0wVCEHfoW3qTHPTvbdSgiA43buiy8PzmeMnfgnDeb9NjdimMXMl3/TcrvvWl5RRVdp+NqcR47pQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -26380,7 +26575,9 @@ } }, "rc-select": { - "version": "14.1.2", + "version": "14.1.13", + "resolved": "https://registry.npmjs.org/rc-select/-/rc-select-14.1.13.tgz", + "integrity": "sha512-WMEsC3gTwA1dbzWOdVIXDmWyidYNLq68AwvvUlRROw790uGUly0/vmqDozXrIr0QvN/A3CEULx12o+WtLCAefg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26418,67 +26615,83 @@ } }, "rc-table": { - "version": "7.24.1", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/rc-table/-/rc-table-7.26.0.tgz", + "integrity": "sha512-0cD8e6S+DTGAt5nBZQIPFYEaIukn17sfa5uFL98faHlH/whZzD8ii3dbFL4wmUDEL4BLybhYop+QUfZJ4CPvNQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", "rc-resize-observer": "^1.1.0", - "rc-util": "^5.14.0", + "rc-util": "^5.22.5", "shallowequal": "^1.1.0" } }, "rc-tabs": { - "version": "11.13.0", + "version": "12.2.1", + "resolved": "https://registry.npmjs.org/rc-tabs/-/rc-tabs-12.2.1.tgz", + "integrity": "sha512-09pVv4kN8VFqp6THceEmxOW8PAShQC08hrroeVYP4Y8YBFaP1PIWdyFL01czcbyz5YZFj9flZ7aljMaAl0jLVg==", "requires": { "@babel/runtime": "^7.11.2", "classnames": "2.x", - "rc-dropdown": "~3.5.0", - "rc-menu": "~9.5.1", + "rc-dropdown": "~4.0.0", + "rc-menu": "~9.6.0", + "rc-motion": "^2.6.2", "rc-resize-observer": "^1.0.0", "rc-util": "^5.5.0" } }, "rc-textarea": { - "version": "0.3.7", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/rc-textarea/-/rc-textarea-0.4.6.tgz", + "integrity": "sha512-HEKCu8nouXXayqYelQnhQm8fdH7v92pAQvfVCz+jhIPv2PHTyBxVrmoZJMn3B8cU+wdyuvRGkshngO3/TzBn4w==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", "rc-resize-observer": "^1.0.0", - "rc-util": "^5.7.0", + "rc-util": "^5.24.4", "shallowequal": "^1.1.0" } }, "rc-tooltip": { - "version": "5.1.1", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/rc-tooltip/-/rc-tooltip-5.2.2.tgz", + "integrity": "sha512-jtQzU/18S6EI3lhSGoDYhPqNpWajMtS5VV/ld1LwyfrDByQpYmw/LW6U7oFXXLukjfDHQ7Ju705A82PRNFWYhg==", "requires": { 
"@babel/runtime": "^7.11.2", + "classnames": "^2.3.1", "rc-trigger": "^5.0.0" } }, "rc-tree": { - "version": "5.5.0", + "version": "5.7.0", + "resolved": "https://registry.npmjs.org/rc-tree/-/rc-tree-5.7.0.tgz", + "integrity": "sha512-F+Ewkv/UcutshnVBMISP+lPdHDlcsL+YH/MQDVWbk+QdkfID7vXiwrHMEZn31+2Rbbm21z/HPceGS8PXGMmnQg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-motion": "^2.0.1", "rc-util": "^5.16.1", - "rc-virtual-list": "^3.4.2" + "rc-virtual-list": "^3.4.8" } }, "rc-tree-select": { - "version": "5.3.0", + "version": "5.5.3", + "resolved": "https://registry.npmjs.org/rc-tree-select/-/rc-tree-select-5.5.3.tgz", + "integrity": "sha512-gv8KyC6J7f9e50OkGk1ibF7v8vL+iaBnA8Ep/EVlMma2/tGdBQXO9xIvPjX8eQrZL5PjoeTUndNPM3cY3721ng==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.16.1" } }, "rc-trigger": { - "version": "5.2.18", + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/rc-trigger/-/rc-trigger-5.3.3.tgz", + "integrity": "sha512-IC4nuTSAME7RJSgwvHCNDQrIzhvGMKf6NDu5veX+zk1MG7i1UnwTWWthcP9WHw3+FZfP3oZGvkrHFPu/EGkFKw==", "requires": { - "@babel/runtime": "^7.11.2", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", "rc-align": "^4.0.0", "rc-motion": "^2.0.0", @@ -26494,16 +26707,21 @@ } }, "rc-util": { - "version": "5.21.2", + "version": "5.24.4", + "resolved": "https://registry.npmjs.org/rc-util/-/rc-util-5.24.4.tgz", + "integrity": "sha512-2a4RQnycV9eV7lVZPEJ7QwJRPlZNc06J7CwcwZo4vIHr3PfUqtYgl1EkUV9ETAc6VRRi8XZOMFhYG63whlIC9Q==", "requires": { - "@babel/runtime": "^7.12.5", + "@babel/runtime": "^7.18.3", "react-is": "^16.12.0", "shallowequal": "^1.1.0" } }, "rc-virtual-list": { - "version": "3.4.7", + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/rc-virtual-list/-/rc-virtual-list-3.4.11.tgz", + "integrity": "sha512-BvUUH60kkeTBPigN5F89HtGaA5jSP4y2aM6cJ4dk9Y42I9yY+h6i08wF6UKeDcxdfOU8j3I5HxkSS/xA77J3wA==", "requires": { + "@babel/runtime": "^7.20.0", "classnames": "^2.2.6", "rc-resize-observer": "^1.0.0", "rc-util": "^5.15.0" @@ -26639,6 +26857,15 @@ "version": "0.11.0", "dev": true }, + "react-resizable": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/react-resizable/-/react-resizable-3.0.4.tgz", + "integrity": "sha512-StnwmiESiamNzdRHbSSvA65b0ZQJ7eVQpPusrSmcpyGKzC0gojhtO62xxH6YOBmepk9dQTBi9yxidL3W4s3EBA==", + "requires": { + "prop-types": "15.x", + "react-draggable": "^4.0.3" + } + }, "react-router": { "version": "6.3.0", "requires": { @@ -26764,7 +26991,9 @@ } }, "regenerator-runtime": { - "version": "0.13.9" + "version": "0.13.10", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz", + "integrity": "sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==" }, "regenerator-transform": { "version": "0.15.0", @@ -27432,7 +27661,9 @@ "dev": true }, "string-convert": { - "version": "0.2.1" + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/string-convert/-/string-convert-0.2.1.tgz", + "integrity": "sha512-u/1tdPl4yQnPBjnVrmdLo9gtuLvELKsAoRapekWggdiQNvvvum+jYF329d84NAa660KQw7pB2n36KrIKVoXa3A==" }, "string-length": { "version": "4.0.2", @@ -28057,6 +28288,43 @@ "version": "1.1.2", "dev": true }, + "vscode-json-languageservice": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/vscode-json-languageservice/-/vscode-json-languageservice-4.2.1.tgz", + "integrity": 
"sha512-xGmv9QIWs2H8obGbWg+sIPI/3/pFgj/5OWBhNzs00BkYQ9UaB2F6JJaGB/2/YOZJ3BvLXQTC4Q7muqU25QgAhA==", + "dev": true, + "requires": { + "jsonc-parser": "^3.0.0", + "vscode-languageserver-textdocument": "^1.0.3", + "vscode-languageserver-types": "^3.16.0", + "vscode-nls": "^5.0.0", + "vscode-uri": "^3.0.3" + } + }, + "vscode-languageserver-textdocument": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/vscode-languageserver-textdocument/-/vscode-languageserver-textdocument-1.0.7.tgz", + "integrity": "sha512-bFJH7UQxlXT8kKeyiyu41r22jCZXG8kuuVVA33OEJn1diWOZK5n8zBSPZFHVBOu8kXZ6h0LIRhf5UnCo61J4Hg==", + "dev": true + }, + "vscode-languageserver-types": { + "version": "3.17.2", + "resolved": "https://registry.npmjs.org/vscode-languageserver-types/-/vscode-languageserver-types-3.17.2.tgz", + "integrity": "sha512-zHhCWatviizPIq9B7Vh9uvrH6x3sK8itC84HkamnBWoDFJtzBf7SWlpLCZUit72b3os45h6RWQNC9xHRDF8dRA==", + "dev": true + }, + "vscode-nls": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-nls/-/vscode-nls-5.2.0.tgz", + "integrity": "sha512-RAaHx7B14ZU04EU31pT+rKz2/zSl7xMsfIZuo8pd+KZO6PXtQmpevpq3vxvWNcrGbdmhM/rr5Uw5Mz+NBfhVng==", + "dev": true + }, + "vscode-uri": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.0.6.tgz", + "integrity": "sha512-fmL7V1eiDBFRRnu+gfRWTzyPpNIHJTc4mWnFkwBUmO9U3KPgJAmTx7oxi2bl/Rh6HLdU7+4C9wlj0k2E4AdKFQ==", + "dev": true + }, "w3c-hr-time": { "version": "1.0.2", "dev": true, diff --git a/ui/package.json b/ui/package.json index 0cd3f6b7e..da870fbf3 100644 --- a/ui/package.json +++ b/ui/package.json @@ -6,7 +6,7 @@ "@ant-design/icons": "^4.7.0", "@azure/msal-browser": "^2.24.0", "@azure/msal-react": "^1.4.0", - "antd": "^4.20.2", + "antd": "^4.23.6", "axios": "^0.27.2", "classnames": "^2.3.2", "dagre": "^0.8.5", @@ -15,6 +15,7 @@ "react-dom": "^17.0.2", "react-flow-renderer": "^9.7.4", "react-query": "^3.38.0", + "react-resizable": "^3.0.4", "react-router-dom": "^6.3.0" }, "devDependencies": { @@ -27,6 +28,7 @@ "@types/node": "^16.11.26", "@types/react": "^17.0.43", "@types/react-dom": "^17.0.14", + "@types/react-resizable": "^3.0.3", "@typescript-eslint/eslint-plugin": "^5.30.7", "@typescript-eslint/parser": "^5.30.7", "babel-plugin-import": "^1.13.5", @@ -35,6 +37,7 @@ "eslint-config-prettier": "^8.5.0", "eslint-import-resolver-typescript": "^3.5.1", "eslint-plugin-import": "^2.26.0", + "eslint-plugin-json": "^3.1.0", "eslint-plugin-prettier": "^4.2.1", "eslint-plugin-react-hooks": "^4.6.0", "husky": "^8.0.1", @@ -51,7 +54,8 @@ "test": "craco test", "eject": "react-scripts eject", "lint:fix": "npx eslint --fix --ext ts --ext tsx src/ ", - "format": "npx prettier --write src/**" + "format": "npx prettier --write src/**", + "lintStaged": "lint-staged" }, "browserslist": { "production": [ diff --git a/ui/src/api/api.tsx b/ui/src/api/api.tsx index a95ab2bd5..6c8b6f665 100644 --- a/ui/src/api/api.tsx +++ b/ui/src/api/api.tsx @@ -38,14 +38,18 @@ export const fetchDataSource = async ( ) => { const axios = await authAxios(msalInstance); return axios - .get( + .get( `${getApiBaseUrl()}/projects/${project}/datasources/${dataSourceId}`, { params: { project: project, datasource: dataSourceId }, } ) .then((response) => { - return response.data; + if (response.data.message || response.data.detail) { + return Promise.reject(response.data.message || response.data.detail); + } else { + return response.data; + } }); }; @@ -109,33 +113,21 @@ export const fetchFeatureLineages = async (featureId: string) => { // 
Following are place-holder code export const createFeature = async (feature: Feature) => { const axios = await authAxios(msalInstance); - return axios - .post(`${getApiBaseUrl()}/features`, feature, { - headers: { "Content-Type": "application/json;" }, - params: {}, - }) - .then((response) => { - return response; - }) - .catch((error) => { - return error.response; - }); + return axios.post(`${getApiBaseUrl()}/features`, feature, { + headers: { "Content-Type": "application/json;" }, + params: {}, + }); }; -export const updateFeature = async (feature: Feature, id: string) => { +export const updateFeature = async (feature: Feature, id?: string) => { const axios = await authAxios(msalInstance); - feature.guid = id; - return await axios - .put(`${getApiBaseUrl()}/features/${id}`, feature, { - headers: { "Content-Type": "application/json;" }, - params: {}, - }) - .then((response) => { - return response; - }) - .catch((error) => { - return error.response; - }); + if (id) { + feature.guid = id; + } + return axios.put(`${getApiBaseUrl()}/features/${feature.guid}`, feature, { + headers: { "Content-Type": "application/json;" }, + params: {}, + }); }; export const listUserRole = async () => { @@ -245,6 +237,8 @@ export const authAxios = async (msalInstance: PublicClientApplication) => { if (error.response?.status === 403) { const detail = error.response.data.detail; window.location.href = "/responseErrors/403/" + detail; + } else { + return Promise.reject(error.response.data); } //TODO: handle other response errors } diff --git a/ui/src/components/CardDescriptions/index.tsx b/ui/src/components/CardDescriptions/index.tsx new file mode 100644 index 000000000..9c0d41498 --- /dev/null +++ b/ui/src/components/CardDescriptions/index.tsx @@ -0,0 +1,32 @@ +import React from "react"; +import { Card, Descriptions } from "antd"; + +export interface CardDescriptionsProps { + title?: string; + mapping: any[]; + descriptions: any; +} + +const CardDescriptions = (props: CardDescriptionsProps) => { + const { title, mapping, descriptions } = props; + + return descriptions ? ( + + + {mapping.reduce((list: any, item) => { + const value = descriptions?.[item.key]; + if (value) { + list.push( + + {typeof value === "string" ? 
value : JSON.stringify(value)} + + ); + } + return list; + }, [])} + + + ) : null; +}; + +export default CardDescriptions; diff --git a/ui/src/components/FlowGraph/FlowGraph.tsx b/ui/src/components/FlowGraph/FlowGraph.tsx new file mode 100644 index 000000000..ef3f16033 --- /dev/null +++ b/ui/src/components/FlowGraph/FlowGraph.tsx @@ -0,0 +1,236 @@ +import React, { + MouseEvent as ReactMouseEvent, + forwardRef, + useCallback, + useEffect, + useRef, + useState, +} from "react"; +import ReactFlow, { + ConnectionLineType, + Controls, + Edge, + Node, + Elements, + getIncomers, + getOutgoers, + ReactFlowProvider, + isNode, + OnLoadParams, +} from "react-flow-renderer"; +import { Spin } from "antd"; +import { LoadingOutlined } from "@ant-design/icons"; +import { useSearchParams } from "react-router-dom"; +import cs from "classnames"; +import { FeatureLineage } from "@/models/model"; +import { isFeature, FeatureType } from "@/utils/utils"; +import LineageNode from "./LineageNode"; +import { NodeData, FlowGraphProps } from "./interface"; +import { getElements } from "./utils"; + +import styles from "./index.module.less"; + +const FlowGraphNodeTypes = { + "custom-node": LineageNode, +}; + +const defaultProps: FlowGraphProps = { + project: "", + snapGrid: [15, 15], + featureType: FeatureType.AllNodes, +}; + +const FlowGraph = (props: FlowGraphProps, ref: any) => { + const { + className, + style, + data, + loading, + height, + minHeight, + project, + nodeId, + featureType, + snapGrid, + } = { + ...defaultProps, + ...props, + }; + const [, setURLSearchParams] = useSearchParams(); + const flowRef = useRef(); + const hasReadRef = useRef(false); + const elementRef = useRef>(); + const hasHighlight = useRef(false); + const [elements, setElements] = useState>([]); + + // Reset all node highlight status + const resetHighlight = useCallback(() => { + if ( + elementRef.current && + elementRef.current.length > 0 && + hasHighlight.current + ) { + hasHighlight.current = false; + setElements((state) => { + return state.map((element) => { + if (isNode(element)) { + element.style = { + ...element.style, + opacity: 1, + }; + element.data!.active = false; + } else { + element.animated = false; + } + return element; + }); + }); + } + }, [setElements]); + + // Highlight path of selected node, including all linked up and down stream nodes + const highlightPath = useCallback( + (node: Node) => { + if (elementRef.current && elementRef.current.length > 0) { + hasHighlight.current = true; + setElements((elements) => { + const incomerIds = new Set( + getIncomers(node, elements).map((item) => item.id) + ); + const outgoerIds = new Set( + getOutgoers(node, elements).map((item) => item.id) + ); + + return elements.map((element) => { + if (isNode(element)) { + const highlight = + element.id === node.id || + incomerIds.has(element.id) || + outgoerIds.has(element.id); + element.style = { + ...element.style, + opacity: highlight ? 1 : 0.25, + }; + element.data = { + ...element.data, + active: + element.id === node.id && isFeature(element.data!.subtitle), + }; + } else { + const highlight = + element.source === node.id || element.target === node.id; + const animated = + incomerIds.has(element.source) && + (incomerIds.has(element.target) || node.id === element.target); + + element.animated = highlight || animated; + } + return element; + }); + }); + } + }, + [setElements] + ); + + // Fired when panel is clicked, reset all highlighted path, and remove the nodeId query string in url path. 
+ const onPaneClick = useCallback(() => { + resetHighlight(); + setURLSearchParams({}); + }, [resetHighlight, setURLSearchParams]); + + const onElementClick = useCallback( + (e: ReactMouseEvent, element: Node | Edge) => { + e.stopPropagation(); + if (isNode(element)) { + setURLSearchParams({ + nodeId: element.id, + featureType: element.data!.subtitle, + }); + setTimeout(() => { + highlightPath(element); + }, 0); + } + }, + [highlightPath, setURLSearchParams] + ); + + const handleInit = useCallback( + ( + project: string, + data: FeatureLineage, + featureType?: FeatureType, + nodeId?: string + ) => { + const elements = (elementRef.current = getElements( + project, + data, + featureType + )); + setElements(elements); + if (nodeId) { + const node = elements?.find( + (item) => item.id === nodeId + ) as Node; + if (node) { + highlightPath(node); + } + } + }, + [setElements, highlightPath] + ); + + // Fit the graph to the center of layout view when graph is initialized + const onLoad = (reactFlowInstance: OnLoadParams) => { + flowRef.current = reactFlowInstance; + flowRef.current?.fitView(); + }; + + useEffect(() => { + if (data) { + const type = hasHighlight.current ? FeatureType.AllNodes : featureType; + handleInit(project!, data, type, nodeId); + } + }, [data, project, nodeId, featureType, handleInit]); + + useEffect(() => { + if (elements.length > 0 && !hasReadRef.current) { + hasReadRef.current = true; + setTimeout(() => { + flowRef.current?.fitView(); + }, 0); + } + }, [elements]); + + return ( + } + > + + + + + + + ); +}; + +const FlowGraphComponent = forwardRef(FlowGraph); + +FlowGraphComponent.displayName = "FlowGraph"; + +export default FlowGraphComponent; diff --git a/ui/src/components/FlowGraph/LineageNode.tsx b/ui/src/components/FlowGraph/LineageNode.tsx new file mode 100644 index 000000000..27a99cc4f --- /dev/null +++ b/ui/src/components/FlowGraph/LineageNode.tsx @@ -0,0 +1,57 @@ +import React, { forwardRef, memo } from "react"; +import cs from "classnames"; +import { RightCircleOutlined } from "@ant-design/icons"; +import { useNavigate } from "react-router-dom"; +import { Handle, NodeProps, Position } from "react-flow-renderer"; +import { LineageNodeProps } from "./interface"; + +import styles from "./index.module.less"; + +const LineageNode = (props: LineageNodeProps, ref: any) => { + const navigate = useNavigate(); + + const { label, subtitle, version, borderColor, detialUrl, active } = + props.data; + + const nodeTitle = version ? `${label} (v${version})` : label; + const nodeSubtitle = subtitle.replace("feathr_", ""); + const nodeColorStyle = { + border: `2px solid ${borderColor}`, + }; + + const onNodeIconClick = () => { + if (detialUrl) { + navigate(detialUrl); + } + // `/projects/${project}/features/${featureId}`); + }; + + return ( +
+
+ +
+ {nodeTitle} + {active && ( + + )} +
{nodeSubtitle}
+
+ +
+
+ ); +}; + +const LineageNodeComponent = forwardRef(LineageNode); + +LineageNodeComponent.displayName = "LineageNode"; + +export default memo(LineageNodeComponent); diff --git a/ui/src/components/FlowGraph/index.module.less b/ui/src/components/FlowGraph/index.module.less new file mode 100644 index 000000000..9e69f59d7 --- /dev/null +++ b/ui/src/components/FlowGraph/index.module.less @@ -0,0 +1,43 @@ +.flowGraph { + width: 100%; +} + +.lineageNode { + height: 100%; + + &Active { + overflow: hidden; + border-radius: 0.25rem; + border-width: 2px; + border-style: solid; + --tw-border-opacity: 1; + border-color: rgba(57, 35, 150, var(--tw-border-opacity)); + --tw-bg-opacity: 1; + background-color: rgba(57, 35, 150, var(--tw-bg-opacity)); + --tw-text-opacity: 1; + color: rgba(255, 255, 255, var(--tw-text-opacity)); + opacity: 1; + } + + .box { + padding: 4px 12px 7px; + } + + .title { + font-size: 15px; + font-weight: 700; + } + + .subtitle { + font-size: 10px; + font-style: italic; + text-overflow: ellipsis; + max-width: 135px; + overflow: hidden; + white-space: nowrap; + } + + .navigate { + padding: 4px 12px 7px; + } +} diff --git a/ui/src/components/FlowGraph/index.ts b/ui/src/components/FlowGraph/index.ts new file mode 100644 index 000000000..0f6d659d8 --- /dev/null +++ b/ui/src/components/FlowGraph/index.ts @@ -0,0 +1,5 @@ +import FlowGraph from "./FlowGraph"; + +export * from "./interface"; + +export default FlowGraph; diff --git a/ui/src/components/FlowGraph/interface.ts b/ui/src/components/FlowGraph/interface.ts new file mode 100644 index 000000000..0949dbe97 --- /dev/null +++ b/ui/src/components/FlowGraph/interface.ts @@ -0,0 +1,30 @@ +import { CSSProperties } from "react"; +import { FeatureLineage } from "@/models/model"; +import { FeatureType } from "@/utils/utils"; +import { NodeProps, ReactFlowProps } from "react-flow-renderer"; + +export interface NodeData { + id: string; + label: string; + subtitle: string; + featureId: string; + version: string; + borderColor?: string; + active?: boolean; + detialUrl?: string; +} + +export interface FlowGraphProps { + className?: string; + style?: CSSProperties; + minHeight?: string | number; + height?: string | number; + loading?: boolean; + data?: FeatureLineage; + nodeId?: string; + project?: string; + snapGrid?: ReactFlowProps["snapGrid"]; + featureType?: FeatureType; +} + +export interface LineageNodeProps extends NodeProps {} diff --git a/ui/src/components/FlowGraph/utils.ts b/ui/src/components/FlowGraph/utils.ts new file mode 100644 index 000000000..141962895 --- /dev/null +++ b/ui/src/components/FlowGraph/utils.ts @@ -0,0 +1,192 @@ +import { Feature, FeatureLineage, RelationData } from "@/models/model"; +import { FeatureType, getFeatureDetailUrl } from "@/utils/utils"; +import dagre from "dagre"; +import { + Node, + Edge, + ArrowHeadType, + Position, + Elements, +} from "react-flow-renderer"; +import { NodeData } from "./interface"; + +const featureTypeColors: Record = { + feathr_source_v1: "hsl(315, 100%, 50%)", + feathr_anchor_v1: "hsl(270, 100%, 50%)", + feathr_anchor_feature_v1: "hsl(225, 100%, 50%)", + feathr_derived_feature_v1: "hsl(135, 100%, 50%)", +}; + +const DEFAULT_WIDTH = 20; +const DEFAULT_HEIGHT = 36; + +const generateNode = (project: string, data: Feature): Node => { + return { + id: data.guid, + type: "custom-node", + style: { + border: `2px solid featureTypeColors[data.typeName]`, + }, + position: { + x: 0, + y: 0, + }, + data: { + id: data.guid, + label: data.displayText, + subtitle: data.typeName, + featureId: 
data.guid, + version: data.version, + borderColor: featureTypeColors[data.typeName], + detialUrl: getFeatureDetailUrl(project, data), + }, + }; +}; + +const generateEdge = ( + data: RelationData, + entityMap: Record +): Edge => { + let { fromEntityId: from, toEntityId: to, relationshipType } = data; + + if (relationshipType === "Consumes") { + [from, to] = [to, from]; + } + const sourceNode = entityMap?.[from]; + const targetNode = entityMap?.[to]; + + return { + id: `e-${from}_${to}`, + source: from, + target: to, + arrowHeadType: ArrowHeadType.ArrowClosed, + data: { + sourceTypeName: sourceNode?.typeName, + targetTypeName: targetNode?.typeName, + }, + }; +}; + +export const getLineageNodes = ( + project: string, + lineageData: FeatureLineage, + featureType: FeatureType +): Node[] => { + const { guidEntityMap } = lineageData; + if (!guidEntityMap) { + return []; + } + + return Object.values(guidEntityMap).reduce( + (nodes: Node[], item: Feature) => { + if ( + item.typeName !== "feathr_workspace_v1" && + (featureType === FeatureType.AllNodes || + item.typeName === featureType || + (featureType === FeatureType.AnchorFeature && + item.typeName === FeatureType.Anchor)) + ) { + nodes.push(generateNode(project, item)); + } + return nodes; + }, + [] as Node[] + ); +}; + +export const getLineageEdge = ( + lineageData: FeatureLineage, + featureType: FeatureType +): Edge[] => { + if (!lineageData.relations || !lineageData.guidEntityMap) { + return []; + } + + return lineageData.relations.reduce((edges: Edge[], item) => { + if (["Consumes", "Contains", "Produces"].includes(item.relationshipType)) { + const edge = generateEdge(item, lineageData.guidEntityMap!); + if ( + edges.findIndex((item) => item.id === edge.id) === -1 && + edge.data.sourceTypeName !== "feathr_workspace_v1" && + (featureType === FeatureType.AllNodes || + (featureType === FeatureType.AnchorFeature && + edge.data.sourceTypeName === FeatureType.Anchor && + edge.data.targetTypeName === FeatureType.AnchorFeature)) + ) { + edges.push(edge); + } + } + + return edges; + }, [] as Edge[]); +}; + +export const getElements = ( + project: string, + lineageData: FeatureLineage, + featureType: FeatureType = FeatureType.AllNodes, + direction = "LR" +) => { + const elements: Elements = []; + + const dagreGraph = new dagre.graphlib.Graph({ compound: true }); + + dagreGraph.setDefaultEdgeLabel(() => ({})); + dagreGraph.setGraph({ rankdir: direction }); + + const isHorizontal = direction === "LR"; + + const nodes = getLineageNodes(project, lineageData, featureType); + let edges = getLineageEdge(lineageData, featureType); + + const anchorEdges = edges.filter((item) => { + return ( + item.data.sourceTypeName === FeatureType.Anchor && + item.data.targetTypeName === FeatureType.AnchorFeature + ); + }); + + edges = edges.reduce((data: any, item) => { + const anchorEdge = anchorEdges.find((i: any) => i.target === item.target); + if (anchorEdge) { + if ( + !( + item.data.sourceTypeName === FeatureType.Source && + item.data.targetTypeName === FeatureType.AnchorFeature + ) + ) { + data.push(item); + } + } else { + data.push(item); + } + return data; + }, []); + + nodes.forEach((item) => { + dagreGraph.setNode(item.id, { + label: item.data!.label, + node: item, + width: item.data!.label.length * 8 + DEFAULT_WIDTH, + height: item.style?.height || DEFAULT_HEIGHT, + }); + elements.push(item); + }); + + edges?.forEach((item: any) => { + dagreGraph.setEdge(item.source, item.target); + elements.push(item); + }); + + dagre.layout(dagreGraph); + + 
nodes.forEach((item) => { + const nodeWithPosition = dagreGraph.node(item.id); + item.targetPosition = isHorizontal ? Position.Left : Position.Top; + item.sourcePosition = isHorizontal ? Position.Right : Position.Bottom; + item.position.x = nodeWithPosition.x; + item.position.y = nodeWithPosition.y - DEFAULT_HEIGHT / 2; + }); + + return elements; +}; diff --git a/ui/src/components/ProjectsSelect/index.tsx b/ui/src/components/ProjectsSelect/index.tsx new file mode 100644 index 000000000..ca5fddf9f --- /dev/null +++ b/ui/src/components/ProjectsSelect/index.tsx @@ -0,0 +1,51 @@ +import React from "react"; +import { Select } from "antd"; +import { fetchProjects } from "@/api"; +import { useQuery } from "react-query"; + +export interface ProjectsSelectProps { + width?: number; + defaultValue?: string; + onChange?: (value: string) => void; +} + +const ProjectsSelect = (props: ProjectsSelectProps) => { + const { width = 350, defaultValue, onChange, ...restProps } = props; + + const { isLoading, data: options } = useQuery< + { value: string; label: string }[] + >( + ["projectsSelect"], + async () => { + try { + const result = await fetchProjects(); + return result.map((item) => ({ + value: item, + label: item, + })); + } catch (e) { + return Promise.reject(e); + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + + + + + + + + + + + + + + + + ); +}; + +const FeatureFormComponent = forwardRef(FeatureForm); + +FeatureFormComponent.displayName = "FeatureFormComponent"; + +export default FeatureFormComponent; diff --git a/ui/src/pages/feature/components/FeatureTable/index.tsx b/ui/src/pages/feature/components/FeatureTable/index.tsx new file mode 100644 index 000000000..69e9c1ae6 --- /dev/null +++ b/ui/src/pages/feature/components/FeatureTable/index.tsx @@ -0,0 +1,151 @@ +import React, { forwardRef, useRef } from "react"; +import { Button } from "antd"; +import { useQuery } from "react-query"; +import { useNavigate } from "react-router-dom"; +import { Feature } from "@/models/model"; +import { fetchFeatures } from "@/api"; +import ResizeTable, { ResizeColumnType } from "@/components/ResizeTable"; + +export interface DataSourceTableProps { + project?: string; + keyword?: string; +} + +export interface SearchModel { + scope?: string; + roleName?: string; +} + +const DataSourceTable = (props: DataSourceTableProps, ref: any) => { + const navigate = useNavigate(); + + const { project, keyword } = props; + + const projectRef = useRef(project); + + const getDetialUrl = (guid: string) => { + return `/projects/${projectRef.current}/features/${guid}`; + }; + + const columns: ResizeColumnType[] = [ + { + key: "name", + title: "Name", + ellipsis: true, + width: 200, + render: (record: Feature) => { + return ( + + ); + }, + }, + { + key: "type", + title: "Type", + ellipsis: true, + width: 120, + render: (record: Feature) => { + return record.typeName.replace(/feathr_|_v1/gi, ""); + }, + }, + { + key: "transformation", + title: "Transformation", + width: 220, + render: (record: Feature) => { + const { transformExpr, defExpr } = record.attributes.transformation; + return transformExpr || defExpr; + }, + }, + { + key: "entitykey", + title: "Entity Key", + ellipsis: true, + width: 120, + render: (record: Feature) => { + const key = record.attributes.key && record.attributes.key[0]; + if ("NOT_NEEDED" !== key.keyColumn) { + return `${key.keyColumn} (${key.keyColumnType})`; + } else { + return "N/A"; + } + }, + }, + { + key: "aggregation", + title: "Aggregation", + ellipsis: true, + width: 
150, + render: (record: Feature) => { + const { transformation } = record.attributes; + return ( + <> + {transformation.aggFunc && `Type: ${transformation.aggFunc}`} +
+ {transformation.aggFunc && `Window: ${transformation.window}`} + + ); + }, + }, + { + title: "Action", + fixed: "right", + width: 100, + resize: false, + render: (record: Feature) => { + return ( + + ); + }, + }, + ]; + + const { isLoading, data: tableData } = useQuery( + ["dataSources", project, keyword], + async () => { + if (project) { + projectRef.current = project; + return await fetchFeatures(project, 1, 10, keyword || ""); + } else { + return []; + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +const DataSourceTableComponent = forwardRef( + DataSourceTable +); + +DataSourceTableComponent.displayName = "DataSourceTableComponent"; + +export default DataSourceTableComponent; diff --git a/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx b/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx new file mode 100644 index 000000000..868c866a7 --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx @@ -0,0 +1,44 @@ +import React from "react"; +import { Space } from "antd"; +import { Feature } from "@/models/model"; +import CardDescriptions from "@/components/CardDescriptions"; +import { + TransformationMap, + FeatureKeyMap, + TypeMap, +} from "@/utils/attributesMapping"; + +export interface FeatureNodeDetialProps { + feature: Feature; +} + +const FeatureNodeDetial = (props: FeatureNodeDetialProps) => { + const { feature } = props; + + const { attributes } = feature; + const { transformation, key, type } = attributes; + const FeatureKey = key?.[0]; + + return ( + + + + + + ); +}; + +export default FeatureNodeDetial; diff --git a/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx b/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx new file mode 100644 index 000000000..fbf5be158 --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx @@ -0,0 +1,22 @@ +import React from "react"; +import { DataSource } from "@/models/model"; +import { SourceAttributesMap } from "@/utils/attributesMapping"; +import CardDescriptions from "@/components/CardDescriptions"; + +export interface SourceNodeDetialProps { + source: DataSource; +} + +const SourceNodeDetial = (props: SourceNodeDetialProps) => { + const { source } = props; + const { attributes } = source; + return ( + + ); +}; + +export default SourceNodeDetial; diff --git a/ui/src/pages/feature/components/NodeDetails/index.tsx b/ui/src/pages/feature/components/NodeDetails/index.tsx new file mode 100644 index 000000000..8a3391cfd --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/index.tsx @@ -0,0 +1,63 @@ +import React from "react"; +import { useParams, useSearchParams } from "react-router-dom"; +import { fetchFeature, fetchDataSource } from "@/api"; +import { LoadingOutlined } from "@ant-design/icons"; +import { useQuery } from "react-query"; +import { Spin, Typography } from "antd"; +import { FeatureType } from "@/utils/utils"; +import FeatureNodeDetail from "./FeatureNodeDetail"; +import SourceNodeDetial from "./SourceNodeDetial"; + +const { Paragraph } = Typography; + +const NodeDetails = () => { + const [searchParams] = useSearchParams(); + const { project } = useParams(); + const nodeId = searchParams.get("nodeId") as string; + const featureType = searchParams.get("featureType") as string; + + const isSource = featureType === FeatureType.Source; + const isFeature = + featureType === FeatureType.AnchorFeature || + featureType === FeatureType.DerivedFeature; + + const { 
isLoading, data } = useQuery( + ["nodeDetails", project, nodeId], + async () => { + if (isSource || isFeature) { + const api = isSource ? fetchDataSource : fetchFeature; + return await api(project!, nodeId); + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + } + > +
+ {data ? ( + isSource ? ( + + ) : ( + + ) + ) : ( + !isLoading && ( + + Click on source or feature node to show metadata and metric + details + + ) + )} +
+
+ ); +}; + +export default NodeDetails; diff --git a/ui/src/pages/feature/components/SearchBar/index.tsx b/ui/src/pages/feature/components/SearchBar/index.tsx new file mode 100644 index 000000000..1a32f28b2 --- /dev/null +++ b/ui/src/pages/feature/components/SearchBar/index.tsx @@ -0,0 +1,67 @@ +import React, { useRef } from "react"; +import { Form, Input, Button } from "antd"; +import { useNavigate } from "react-router-dom"; +import ProjectsSelect from "@/components/ProjectsSelect"; + +export interface SearchValue { + project?: string; + keyword?: string; +} + +export interface SearchBarProps { + defaultValues?: SearchValue; + onSearch?: (values: SearchValue) => void; +} + +const { Item } = Form; + +const SearchBar = (props: SearchBarProps) => { + const [form] = Form.useForm(); + + const navigate = useNavigate(); + + const { defaultValues, onSearch } = props; + + const timeRef = useRef(null); + + const onChangeKeyword = () => { + clearTimeout(timeRef.current); + timeRef.current = setTimeout(() => { + form.submit(); + }, 350); + }; + + return ( +
+
+ + + + + + +
+ +
+ ); +}; + +export default SearchBar; diff --git a/ui/src/pages/feature/featureDetails.tsx b/ui/src/pages/feature/featureDetails.tsx index 549e5e3f7..fdecb7505 100644 --- a/ui/src/pages/feature/featureDetails.tsx +++ b/ui/src/pages/feature/featureDetails.tsx @@ -1,218 +1,116 @@ -import React, { useEffect, useState } from "react"; -import { Alert, Button, Card, Col, Row, Space, Spin, Typography } from "antd"; +import React, { useEffect, useRef, useState } from "react"; +import { + Alert, + Button, + PageHeader, + Breadcrumb, + Space, + Card, + Spin, + Descriptions, +} from "antd"; import { LoadingOutlined } from "@ant-design/icons"; -import { useNavigate, useParams } from "react-router-dom"; -import { QueryStatus, useQuery } from "react-query"; +import { Link, useNavigate, useParams } from "react-router-dom"; +import { useQuery } from "react-query"; import { AxiosError } from "axios"; -import { fetchFeature } from "../../api"; -import { Feature, InputFeature } from "../../models/model"; -import { FeatureLineage } from "../../models/model"; -import { fetchFeatureLineages } from "../../api"; -import { Elements } from "react-flow-renderer"; -import Graph from "../../components/graph/graph"; -import { getElements } from "../../components/graph/utils"; - -const { Title } = Typography; - -type FeatureKeyProps = { feature: Feature }; -const FeatureKey = ({ feature }: FeatureKeyProps) => { - const keys = feature.attributes.key; - return ( - <> - {keys && keys.length > 0 && ( - - - Entity Key -
-

Full Name: {keys[0].fullName}

-

Key Column: {keys[0].keyColumn}

-

Description: {keys[0].description}

-

Key Column Alias: {keys[0].keyColumnAlias}

-

Key Column Type: {keys[0].keyColumnType}

-
-
- - )} - - ); -}; - -type FeatureTypeProps = { feature: Feature }; -const FeatureType = ({ feature }: FeatureTypeProps) => { - const type = feature.attributes.type; - return ( - <> - {type && ( - - - Type -
-

Dimension Type: {type.dimensionType}

-

Tensor Category: {type.tensorCategory}

-

Type: {type.type}

-

Value Type: {type.valType}

-
-
- - )} - - ); -}; - -type FeatureTransformationProps = { feature: Feature }; -const FeatureTransformation = ({ feature }: FeatureTransformationProps) => { - const transformation = feature.attributes.transformation; - return ( - <> - {transformation && ( - - - Transformation -
- {transformation.transformExpr && ( -

Expression: {transformation.transformExpr}

- )} - {transformation.filter &&

Filter: {transformation.filter}

} - {transformation.aggFunc && ( -

Aggregation: {transformation.aggFunc}

- )} - {transformation.limit &&

Limit: {transformation.limit}

} - {transformation.groupBy && ( -

Group By: {transformation.groupBy}

- )} - {transformation.window &&

Window: {transformation.window}

} - {transformation.defExpr && ( -

Expression: {transformation.defExpr}

- )} -
-
- - )} - - ); -}; +import { fetchFeature, fetchFeatureLineages } from "@/api"; +import { Feature, InputFeature, FeatureLineage } from "@/models/model"; +import FlowGraph from "@/components/FlowGraph"; +import CardDescriptions from "@/components/CardDescriptions"; +import { + FeatureKeyMap, + TransformationMap, + TypeMap, +} from "@/utils/attributesMapping"; + +const contentStyle = { marginRight: 16 }; type InputAnchorFeaturesProps = { project: string; feature: Feature }; -const InputAnchorFeatures = ({ - project, - feature, -}: InputAnchorFeaturesProps) => { - const navigate = useNavigate(); - const inputAnchorFeatures = feature.attributes.inputAnchorFeatures; - return ( - <> - {inputAnchorFeatures && inputAnchorFeatures.length > 0 && ( - - - Input Anchor Features - {inputAnchorFeatures.map((input_feature) => ( - - ))} - - - )} - - ); + +const InputAnchorFeatures = (props: InputAnchorFeaturesProps) => { + const { project, feature } = props; + + const { inputAnchorFeatures } = feature.attributes; + + return inputAnchorFeatures?.length > 0 ? ( + + + {inputAnchorFeatures.map((input_feature) => ( + + + {input_feature.uniqueAttributes.qualifiedName} + + + ))} + + + ) : null; }; type InputDerivedFeaturesProps = { project: string; feature: Feature }; -const InputDerivedFeatures = ({ - project, - feature, -}: InputDerivedFeaturesProps) => { - const navigate = useNavigate(); - const inputDerivedFeatures = feature.attributes.inputDerivedFeatures; - return ( - <> - {inputDerivedFeatures && inputDerivedFeatures.length > 0 && ( - - - Input Derived Features - {inputDerivedFeatures.map((input_feature: InputFeature) => ( - - ))} - - - )} - - ); + +const InputDerivedFeatures = (props: InputDerivedFeaturesProps) => { + const { project, feature } = props; + + const { inputDerivedFeatures } = feature.attributes; + + return inputDerivedFeatures?.length ? ( + + + {inputDerivedFeatures.map((input_feature: InputFeature) => ( + + + {input_feature.uniqueAttributes.qualifiedName} + + + ))} + + + ) : null; }; const FeatureLineageGraph = () => { - const { featureId } = useParams() as Params; + const { project, featureId } = useParams() as Params; const [lineageData, setLineageData] = useState({ - guidEntityMap: null, - relations: null, + guidEntityMap: {}, + relations: [], }); - const [elements, SetElements] = useState([]); + const [loading, setLoading] = useState(false); + const mountedRef = useRef(true); + useEffect(() => { const fetchLineageData = async () => { setLoading(true); const data = await fetchFeatureLineages(featureId); - setLineageData(data); - setLoading(false); + if (mountedRef.current) { + setLineageData(data); + setLoading(false); + } }; fetchLineageData(); }, [featureId]); - // Generate graph data on client side, invoked after graphData or featureType is changed useEffect(() => { - const generateGraphData = async () => { - SetElements(getElements(lineageData, "all_nodes")!); + mountedRef.current = true; + return () => { + mountedRef.current = false; }; - - generateGraphData(); - }, [lineageData]); - - return ( - <> - {loading ? ( - } /> - ) : ( - - - Lineage - - - - )} - - ); + }, []); + + return !loading ? 
( + + + + ) : null; }; type Params = { @@ -222,87 +120,77 @@ type Params = { const FeatureDetails = () => { const { project, featureId } = useParams() as Params; const navigate = useNavigate(); - const loadingIcon = ; - const { status, error, data } = useQuery( + + const { + isLoading, + error, + data = { attributes: {} } as Feature, + } = useQuery( ["featureId", featureId], - () => fetchFeature(project, featureId) + () => fetchFeature(project, featureId), + { + retry: false, + refetchOnWindowFocus: false, + } ); + const { attributes } = data; + const { transformation, key, type, name } = attributes; + const FeatureKey = key?.[0]; - const openLineageWindow = () => { - const lineageUrl = `/projects/${project}/lineage`; - navigate(lineageUrl); - }; - - const render = (status: QueryStatus): JSX.Element => { - switch (status) { - case "error": - return ( - - - - ); - case "idle": - return ( - - - - ); - case "loading": - return ( - - - - ); - case "success": - if (data === undefined) { - return ( - - - - ); - } else { - return ( - <> - - - {data.attributes.name} -
- - - -
-
- - - - - - - - -
-
- - ); + return ( +
+ + + Features + + Feature Details + } - } - }; - - return
{render(status)}
; + extra={[ + , + ]} + > + } + > + + {error && } + + + + + + + + +
+
+ ); }; export default FeatureDetails; diff --git a/ui/src/pages/feature/features.tsx b/ui/src/pages/feature/features.tsx index 275cde11f..9ace6ead6 100644 --- a/ui/src/pages/feature/features.tsx +++ b/ui/src/pages/feature/features.tsx @@ -1,20 +1,27 @@ -import { Button, Card, Space, Typography } from "antd"; -import { useNavigate, useSearchParams } from "react-router-dom"; -import FeatureList from "../../components/featureList"; - -const { Title } = Typography; +import { useState } from "react"; +import { PageHeader } from "antd"; +import { useSearchParams } from "react-router-dom"; +import SearchBar, { SearchValue } from "./components/SearchBar"; +import FeatureTable from "./components/FeatureTable"; const Features = () => { const [searchParams] = useSearchParams(); - const project = (searchParams.get("project") as string) ?? ""; - const keyword = (searchParams.get("keyword") as string) ?? ""; + + const [search, setProject] = useState({ + project: searchParams.get("project") || undefined, + keyword: searchParams.get("keyword") || undefined, + }); + + const onSearch = (values: SearchValue) => { + setProject(values); + }; return (
- - Features - - + + + +
); }; diff --git a/ui/src/pages/feature/lineageGraph.tsx b/ui/src/pages/feature/lineageGraph.tsx index ac75dff91..d8b1473df 100644 --- a/ui/src/pages/feature/lineageGraph.tsx +++ b/ui/src/pages/feature/lineageGraph.tsx @@ -1,17 +1,17 @@ -import React, { useEffect, useState } from "react"; -import { Card, Col, Radio, Row, Spin, Tabs, Typography } from "antd"; +import React, { useEffect, useRef, useState } from "react"; +import { PageHeader, Row, Col, Radio, Tabs } from "antd"; import { useParams, useSearchParams } from "react-router-dom"; -import { Elements } from "react-flow-renderer"; -import Graph from "../../components/graph/graph"; -import { fetchProjectLineages } from "../../api"; -import { FeatureLineage } from "../../models/model"; -import { LoadingOutlined } from "@ant-design/icons"; -import GraphNodeDetails from "../../components/graph/graphNodeDetails"; -import { getElements } from "../../components/graph/utils"; -import { FeatureType } from "../../utils/utils"; +import FlowGraph from "@/components/FlowGraph"; +import { fetchProjectLineages } from "@/api"; +import { FeatureLineage } from "@/models/model"; +import { FeatureType } from "@/utils/utils"; +import NodeDetails from "./components/NodeDetails"; -const { Title } = Typography; -const { TabPane } = Tabs; +const items = [ + { label: "Metadata", key: "1", children: }, + { label: "Metrics", key: "2", children:

Under construction

}, // key must be set + { label: "Jobs", key: "3", children: 

Under construction

}, +]; type Params = { project: string; @@ -22,90 +22,75 @@ const LineageGraph = () => { const nodeId = searchParams.get("nodeId") as string; const [lineageData, setLineageData] = useState({ - guidEntityMap: null, - relations: null, + guidEntityMap: {}, + relations: [], }); + const [loading, setLoading] = useState(false); - const [elements, SetElements] = useState([]); - const [featureType, setFeatureType] = useState("all_nodes"); + + const [featureType, setFeatureType] = useState( + FeatureType.AllNodes + ); + + const mountedRef = useRef(true); // Fetch lineage data from server side, invoked immediately after component is mounted useEffect(() => { const fetchLineageData = async () => { setLoading(true); const data = await fetchProjectLineages(project); - setLineageData(data); - setLoading(false); + if (mountedRef.current) { + setLineageData(data); + setLoading(false); + } }; fetchLineageData(); }, [project]); - // Generate graph data on client side, invoked after graphData or featureType is changed + const toggleFeatureType = (type: FeatureType) => { + setFeatureType(type); + }; + useEffect(() => { - const generateGraphData = async () => { - SetElements(getElements(lineageData, featureType)!); + mountedRef.current = true; + return () => { + mountedRef.current = false; }; - - generateGraphData(); - }, [lineageData, featureType]); - - const toggleFeatureType = (type: string) => { - setFeatureType((prevType: string | null) => { - if (prevType === type) { - return null; - } - return type; - }); - }; + }, []); return (
- - Lineage {project} -
- toggleFeatureType(e.target.value)} - > - All Nodes - Source - Anchor - - Anchor Feature - - - Derived Feature - - -
-
- {loading ? ( - } + + toggleFeatureType(e.target.value)} + > + All Nodes + Source + + Anchor Feature + + + Derived Feature + + + + + - ) : ( - - - - - - - - - - -

Under construction

-
- -

Under construction

-
-
- -
- )} -
-
+ + + + + +
); }; diff --git a/ui/src/pages/feature/newFeature.tsx b/ui/src/pages/feature/newFeature.tsx index d51dd2aa0..50afd64c3 100644 --- a/ui/src/pages/feature/newFeature.tsx +++ b/ui/src/pages/feature/newFeature.tsx @@ -1,16 +1,13 @@ import React from "react"; -import { Card, Typography } from "antd"; -import FeatureForm from "../../components/featureForm"; - -const { Title } = Typography; +import { PageHeader } from "antd"; +import FeatureForm from "./components/FeatureForm"; const NewFeature = () => { return (
- - Create Feature + - +
); }; diff --git a/ui/src/pages/management/components/RoleForm/index.tsx b/ui/src/pages/management/components/RoleForm/index.tsx index 9e073abd8..0a77b1610 100644 --- a/ui/src/pages/management/components/RoleForm/index.tsx +++ b/ui/src/pages/management/components/RoleForm/index.tsx @@ -1,6 +1,6 @@ import React, { forwardRef, useCallback, useEffect, useState } from "react"; import { Form, Select, Input, Button, message } from "antd"; -import { listUserRole, addUserRole } from "../../../../api"; +import { listUserRole, addUserRole } from "@/api"; export interface RoleFormProps { getRole?: (isAdmin: boolean) => void; diff --git a/ui/src/pages/management/components/UserRolesTable/index.tsx b/ui/src/pages/management/components/UserRolesTable/index.tsx index d264b2691..9a72f1539 100644 --- a/ui/src/pages/management/components/UserRolesTable/index.tsx +++ b/ui/src/pages/management/components/UserRolesTable/index.tsx @@ -6,12 +6,12 @@ import React, { useRef, useState, } from "react"; -import { Table, Tag, Button, message, Popconfirm } from "antd"; +import { Tag, Button, message, Popconfirm } from "antd"; import { DeleteOutlined } from "@ant-design/icons"; -import { ColumnsType } from "antd/lib/table"; import dayjs from "dayjs"; -import { UserRole } from "../../../../models/model"; -import { listUserRole, deleteUserRole } from "../../../../api"; +import { UserRole } from "@/models/model"; +import { listUserRole, deleteUserRole } from "@/api"; +import ResizeTable, { ResizeColumnType } from "@/components/ResizeTable"; export interface UserRolesTableProps {} @@ -74,25 +74,32 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { } }; - const columns: ColumnsType = [ + const columns: ResizeColumnType[] = [ { + key: "scope", title: "Scope (Project / Global)", dataIndex: "scope", ellipsis: true, + width: 330, + minWidth: 190, }, { title: "Role", dataIndex: "roleName", + ellipsis: true, width: 120, }, { title: "User", dataIndex: "userName", ellipsis: true, + width: 300, + minWidth: 100, }, { title: "Permissions", dataIndex: "access", + ellipsis: true, width: 240, render: (col: string[]) => { return col.map((tag) => { @@ -110,6 +117,7 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { title: "Reason", dataIndex: "createReason", ellipsis: true, + width: 300, }, { title: "Create By", @@ -138,6 +146,7 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { title: "Action", fixed: "right", width: 130, + resize: false, render: (col: string, record: UserRole) => { return ( { }, [fetchData]); return ( - ); }; diff --git a/ui/src/pages/management/management.tsx b/ui/src/pages/management/management.tsx index 79ba50daa..882048b1f 100644 --- a/ui/src/pages/management/management.tsx +++ b/ui/src/pages/management/management.tsx @@ -18,7 +18,7 @@ const Management = () => { return (
- + - + {showAlert && ( { + const navigate = useNavigate(); + + const { project } = props; + + const columns: ResizeColumnType[] = [ + { + key: "name", + title: "Name", + dataIndex: "name", + resize: false, + }, + { + key: "action", + title: "Action", + width: 130, + resize: false, + render: (record: Project) => { + const { name } = record; + return ( + + + + + ); + }, + }, + ]; + + const { isLoading, data: tableData } = useQuery( + ["Projects", project], + async () => { + const reuslt = await fetchProjects(); + + return reuslt.reduce((list, item: string) => { + const text = project?.trim().toLocaleLowerCase(); + if (!text || item.includes(text)) { + list.push({ name: item }); + } + return list; + }, [] as Project[]); + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +const ProjectTableComponent = forwardRef( + ProjectTable +); + +ProjectTableComponent.displayName = "ProjectTableComponent"; + +export default ProjectTableComponent; diff --git a/ui/src/pages/project/components/SearchBar/index.tsx b/ui/src/pages/project/components/SearchBar/index.tsx new file mode 100644 index 000000000..4ac3cd29d --- /dev/null +++ b/ui/src/pages/project/components/SearchBar/index.tsx @@ -0,0 +1,51 @@ +import React, { forwardRef, useRef } from "react"; +import { Form, Input } from "antd"; + +export interface SearchBarProps { + onSearch: (values: any) => void; +} + +const { Item } = Form; + +const SearchBar = (props: SearchBarProps, ref: any) => { + const [form] = Form.useForm(); + + const { onSearch } = props; + + const timeRef = useRef(null); + + const onChangeKeyword = () => { + clearTimeout(timeRef.current); + timeRef.current = setTimeout(() => { + form.submit(); + }, 350); + }; + + return ( +
+
+ + + + +
+ ); +}; + +const SearchBarComponent = forwardRef(SearchBar); + +SearchBarComponent.displayName = "SearchBarComponent"; + +export default SearchBarComponent; diff --git a/ui/src/pages/project/projects.tsx b/ui/src/pages/project/projects.tsx index 03cbf3d48..932915089 100644 --- a/ui/src/pages/project/projects.tsx +++ b/ui/src/pages/project/projects.tsx @@ -1,16 +1,21 @@ -import React from "react"; -import { Card, Typography } from "antd"; -import ProjectList from "../../components/projectList"; - -const { Title } = Typography; +import React, { useState } from "react"; +import { PageHeader } from "antd"; +import ProjectTable from "./components/ProjectTable"; +import SearchBar from "./components/SearchBar"; const Projects = () => { + const [project, setProject] = useState(""); + + const onSearch = ({ project }: { project: string }) => { + setProject(project); + }; + return (
- - Projects - - + + + +
); }; diff --git a/ui/src/site.css b/ui/src/site.css index cea4439f7..e1a42a944 100644 --- a/ui/src/site.css +++ b/ui/src/site.css @@ -4,8 +4,6 @@ } .card { - margin-top: 15px; - margin-right: 15px; box-shadow: 5px 8px 15px 5px rgba(208, 216, 243, 0.6); border-radius: 8px; } @@ -61,3 +59,7 @@ .dataSource-container { column-count: 1; } + +.display-flex { + display: "flex"; +} diff --git a/ui/src/utils/attributesMapping.ts b/ui/src/utils/attributesMapping.ts new file mode 100644 index 000000000..09e7459b7 --- /dev/null +++ b/ui/src/utils/attributesMapping.ts @@ -0,0 +1,48 @@ +import { + FeatureTransformation, + FeatureKey, + FeatureType, + DataSourceAttributes, +} from "@/models/model"; + +export const TransformationMap: Array<{ + label: string; + key: keyof FeatureTransformation; +}> = [ + { label: "Expression", key: "transformExpr" }, + { label: "Filter", key: "filter" }, + { label: "Aggregation", key: "aggFunc" }, + { label: "Limit", key: "limit" }, + { label: "Group By", key: "groupBy" }, + { label: "Window", key: "window" }, + { label: "Expression", key: "defExpr" }, +]; + +export const FeatureKeyMap: Array<{ label: string; key: keyof FeatureKey }> = [ + { label: "Full name", key: "fullName" }, + { label: "Description", key: "description" }, + { label: "Key column", key: "keyColumn" }, + { label: "Key column alias", key: "keyColumnAlias" }, + { label: "Key column type", key: "keyColumnType" }, +]; + +export const TypeMap: Array<{ label: string; key: keyof FeatureType }> = [ + { label: "Dimension Type", key: "dimensionType" }, + { label: "Tensor Category", key: "tensorCategory" }, + { label: "Type", key: "type" }, + { label: "Value Type", key: "valType" }, +]; + +export const SourceAttributesMap: Array<{ + label: string; + key: keyof DataSourceAttributes; +}> = [ + { label: "Name", key: "name" }, + { label: "Type", key: "type" }, + { label: "Path", key: "path" }, + { label: "Preprocessing", key: "preprocessing" }, + { label: "Event Timestamp Column", key: "event_timestamp_column" }, + { label: "Timestamp Forma", key: "timestamp_format" }, + { label: "Qualified Name", key: "qualified_name" }, + { label: "Tags", key: "tags" }, +]; diff --git a/ui/src/utils/utils.tsx b/ui/src/utils/utils.tsx index 85bfd8f42..9cd2c959b 100644 --- a/ui/src/utils/utils.tsx +++ b/ui/src/utils/utils.tsx @@ -1,3 +1,4 @@ +import { Feature } from "@/models/model"; import { Configuration, PublicClientApplication } from "@azure/msal-browser"; export const getMsalConfig = () => { @@ -16,8 +17,6 @@ export const getMsalConfig = () => { redirectUri: window.location.origin, }, }; - console.log("clientId = ", clientId); - console.log("authority = ", authority); return new PublicClientApplication(msalConfig); }; @@ -33,6 +32,19 @@ export const enum FeatureType { export const isFeature = (featureType: string) => { return ( featureType === FeatureType.AnchorFeature || - featureType === FeatureType.DerivedFeature + featureType === FeatureType.DerivedFeature || + featureType === FeatureType.Source ); }; + +export const getFeatureDetailUrl = (project: string, feature: Feature) => { + switch (feature.typeName) { + case FeatureType.Source: + return `/projects/${project}/dataSources/${feature.guid}`; + case FeatureType.AnchorFeature: + case FeatureType.DerivedFeature: + return `/projects/${project}/features/${feature.guid}`; + default: + return; + } +}; From afd930903be8368c4f0e71596e7ca5bd6ccb3c73 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 19:19:44 +0800 Subject: [PATCH 07/18] Add release instructions for 
Release Candidate (#809) * Add release instructions for Release Candidate * Add a section for release versioning * Add a section for overall process triggered by the release manager --- .../dev_guide/feathr_overall_release_guide.md | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/docs/dev_guide/feathr_overall_release_guide.md b/docs/dev_guide/feathr_overall_release_guide.md index d0b16611a..5d6301a49 100644 --- a/docs/dev_guide/feathr_overall_release_guide.md +++ b/docs/dev_guide/feathr_overall_release_guide.md @@ -10,62 +10,87 @@ This document describes all the release process for the development team. ## Prerequisites -- Make sure the CI tests are passing so there are no surprises on the release day. +- Make sure the CI tests are passing prior to bug bash. - Make sure all the active PRs related to the release are merged. - ## When to Release -- For each major and minor version release, please follow these steps. -- For patch versions, there should be no releases. +The release process is triggered by the release manager. The release manager will decide when to release with following steps: + +1. Ensure Prerequisites are met. +2. Creation of Release Candidate(rc) on GitHub. +3. Bug Bash. +4. Creation of Release on GitHub. +5. Post Release announcement. + +## Release Versioning + +- Major and minor version: X.Y.Z +- Release Candidate: X.Y.Z-rcN ## Writing Release Note Write a release note following past examples [here](https://github.com/feathr-ai/feathr/releases). Read through the [commit log](https://github.com/feathr-ai/feathr/commits/main) to identify the commits after last release to include in the release note. Here are the major things to include -- highlights of the release -- improvements and changes of this release -- new contributors of this release +- Highlights of the release +- Improvements and changes of this release +- New contributors of this release ## Code Changes -Before the release is made, the version needs to be updated in following places + +Before the release candidate or release is made, the version needs to be updated in following places + - [build.sbt](https://github.com/feathr-ai/feathr/blob/main/build.sbt#L3) - For Maven release version - [version.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathr/version.py#L1) - For Feathr version - [conf.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/docs/conf.py#L27) - For documentation version -- [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/test/test_user_workspace/feathr_config.yaml#L84) - To set the spark runtime location for Azure Synapse and Azure Databricks used by test suite. Please update all .yaml files under this path. -- [azure_resource_provision.json](https://github.com/feathr-ai/feathr/blob/main/docs/how-to-guides/azure_resource_provision.json#L114) - To set the deployment template to pull the latest release image. -- [constants.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathr/constants.py#L31) - To set the default maven artifact version (Only needed when maven version is **NOT** the same as python sdk version) +- [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/test/test_user_workspace/feathr_config.yaml#L84) - To set the spark runtime location for Azure Synapse and Azure Databricks used by test suite. Please update all .yaml files under this path. 
- [package.json](https://github.com/feathr-ai/feathr/blob/main/ui/package.json#L3) - For Feathr UI version +Following file should only be updated for release, which means should be skipped for release candidate. + +- [azure_resource_provision.json](https://github.com/feathr-ai/feathr/blob/main/docs/how-to-guides/azure_resource_provision.json#L114) - To set the deployment template to pull the latest release image. + +## Release Branches + +Each major and minor release should have a release branch. The release branch should be named as `releases/vX.Y.Z` or `releases/vX.Y.Z-rcN` where `X.Y.Z` is the release version. The release branch should be created from the `main` branch. See past release branches [here](https://github.com/feathr-ai/feathr/branches/all?query=releases). + +## Release Tags + +Once the release branch is created, a release tag should be created from the release branch. The release tag should be named as `vX.Y.Z` or `vX.Y.Z-rcN` where `X.Y.Z` is the release version. See past release tags [here](https://github.com/feathr-ai/feathr/tags). + ## Triggering automated release pipelines -Our goal is to automate the release process as much as possible. So far, we have automated the following steps -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/docker-publish.yml) to build and publish for our UI and API container to [dockerhub](https://hub.docker.com/r/feathrfeaturestore/feathr-registry/tags). - **Triggers** - Nightly, branch with name pattern "releases/*" -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-pypi.yml) for publishing Python package to [PyPi](https://pypi.org/project/feathr/). +Once the release branch and release tag are created, the release pipelines will be triggered automatically. The release pipelines will build the release artifacts and publish them to Maven and PyPI. + +1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/docker-publish.yml) to build and publish for Feathr Registry docker images to [DockerHub](https://hub.docker.com/r/feathrfeaturestore/feathr-registry/tags). - **Triggers** - branch with name pattern "releases/*" + **Triggers** - Nightly or branch with name pattern "releases/*" -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) for publishing the jar to [maven/sonatype repository](https://oss.sonatype.org/). +2. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-pypi.yml) for publishing Python package to [PyPi](https://pypi.org/project/feathr/). -**PLEASE NOTE: To trigger the above workflows as part of release, create a new branch with pattern releases/v0.x.0**. See past release branches [here](https://github.com/feathr-ai/feathr/branches/all?query=releases). + **Triggers** - branch with name pattern "releases/*" +3. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) for publishing the jar to [maven/sonatype repository](https://oss.sonatype.org/). ## Upload Feathr Jar Run the command to generate the Java jar. 
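For reference, a minimal sketch of the branch/tag and jar-build steps above, assuming the release is cut from `main` with git and the jar is produced by the standard `sbt assembly` target (the version numbers below are placeholders):

```bash
# Cut the release branch from main (use releases/vX.Y.Z-rcN for a release candidate).
git checkout main && git pull
git checkout -b releases/v0.9.0
git push origin releases/v0.9.0

# Create the release tag from the release branch.
git tag v0.9.0
git push origin v0.9.0

# Build the Feathr jar; the assembly is written under target/scala-2.12/.
sbt assembly
```

Pushing the `releases/*` branch is what triggers the automated workflows listed above.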
After the jar is generated, please upload to [Azure storage](https://ms.portal.azure.com/#view/Microsoft_Azure_Storage/ContainerMenuBlade/~/overview/storageAccountId/%2Fsubscriptions%2Fa6c2a7cc-d67e-4a1a-b765-983f08c0423a%2FresourceGroups%2Fazurefeathrintegration%2Fproviders%2FMicrosoft.Storage%2FstorageAccounts%2Fazurefeathrstorage/path/public/etag/%220x8D9E6F64D62D599%22/defaultEncryptionScope/%24account-encryption-key/denyEncryptionScopeOverride//defaultId//publicAccessVal/Container) for faster access. ## Release PyPi + The automated workflow should take care of this, you can check under [actions](https://github.com/feathr-ai/feathr/actions/workflows/publish-to-pypi.yml) to see the triggered run and results. For manual steps, see [Python Package Release Guide](https://feathr-ai.github.io/feathr/dev_guide/python_package_release.html) ## Updating docker image for API and Registry + The automated workflow should take care of this as well, you can check under [actions](https://github.com/feathr-ai/feathr/actions/workflows/docker-publish.yml) to see the triggered run and results. For manual steps, see [Feathr Registry docker image](https://feathr-ai.github.io/feathr/dev_guide/build-and-push-feathr-registry-docker-image.html) ## Release Maven + The automated workflow should take care of this too, you can check under [actions](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) to see the triggered run and results. For manual steps, see [Feathr Developer Guide for publishing to maven](https://feathr-ai.github.io/feathr/dev_guide/publish_to_maven.html) ## Testing + Run one of the sample [notebooks](https://github.com/feathr-ai/feathr/blob/main/docs/samples/azure_synapse/product_recommendation_demo.ipynb) as it uses the latest package from Maven and PyPi. 
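As a quick sanity check before running the notebook, one can verify that the new package resolves from PyPI (a sketch; the version string is a placeholder, and pip normalizes the `-rcN` suffix to `rcN`):

```bash
# Install the freshly published release candidate and confirm its version.
pip install "feathr==0.9.0rc1"
python -c "from feathr.version import __version__; print(__version__)"
```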
## Announcement From 8899f185c8c5d0e713d14ac86c56e4929be4717f Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 20:26:40 +0800 Subject: [PATCH 08/18] Bump version to 0.9.0-rc1 (#810) --- build.sbt | 2 +- docs/how-to-guides/local-spark-provider.md | 2 +- feathr_project/feathr/version.py | 2 +- feathr_project/test/test_user_workspace/feathr_config.yaml | 4 ++-- .../test_user_workspace/feathr_config_registry_purview.yaml | 4 ++-- .../feathr_config_registry_purview_rbac.yaml | 4 ++-- .../test/test_user_workspace/feathr_config_registry_sql.yaml | 4 ++-- .../test_user_workspace/feathr_config_registry_sql_rbac.yaml | 4 ++-- ui/package.json | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index 2919ddae6..2ad413ba2 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ import sbt.Keys.publishLocalConfiguration ThisBuild / resolvers += Resolver.mavenLocal ThisBuild / scalaVersion := "2.12.15" -ThisBuild / version := "0.8.0" +ThisBuild / version := "0.9.0-rc1" ThisBuild / organization := "com.linkedin.feathr" ThisBuild / organizationName := "linkedin" val sparkVersion = "3.1.3" diff --git a/docs/how-to-guides/local-spark-provider.md b/docs/how-to-guides/local-spark-provider.md index 433af64f3..0069322b8 100644 --- a/docs/how-to-guides/local-spark-provider.md +++ b/docs/how-to-guides/local-spark-provider.md @@ -36,7 +36,7 @@ A spark-submit script will auto generated in your workspace under `debug` folder spark-submit \ --master local[*] \ --name project_feathr_local_spark_test \ - --packages "org.apache.spark:spark-avro_2.12:3.3.0,com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,org.apache.logging.log4j:log4j-core:2.17.2,com.typesafe:config:1.3.4,com.fasterxml.jackson.core:jackson-databind:2.12.6.1,org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7,org.apache.hadoop:hadoop-common:2.7.7,org.apache.avro:avro:1.8.2,org.apache.xbean:xbean-asm6-shaded:4.10,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3,com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21,org.apache.kafka:kafka-clients:3.1.0,com.google.guava:guava:31.1-jre,it.unimi.dsi:fastutil:8.1.1,org.mvel:mvel2:2.2.8.Final,com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.3,com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.6,com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.12.6,com.jasonclawson:jackson-dataformat-hocon:1.1.0,com.redislabs:spark-redis_2.12:3.1.0,org.apache.xbean:xbean-asm6-shaded:4.10,com.google.protobuf:protobuf-java:3.19.4,net.snowflake:snowflake-jdbc:3.13.18,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,org.apache.commons:commons-lang3:3.12.0,org.xerial:sqlite-jdbc:3.36.0.3,com.github.changvvb:jackson-module-caseclass_2.12:1.1.1,com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.11.1,org.eclipse.jetty:jetty-util:9.3.24.v20180605,commons-io:commons-io:2.6,org.apache.hadoop:hadoop-azure:2.7.4,com.microsoft.azure:azure-storage:8.6.4,com.linkedin.feathr:feathr_2.12:0.8.0" \ + --packages 
"org.apache.spark:spark-avro_2.12:3.3.0,com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,org.apache.logging.log4j:log4j-core:2.17.2,com.typesafe:config:1.3.4,com.fasterxml.jackson.core:jackson-databind:2.12.6.1,org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7,org.apache.hadoop:hadoop-common:2.7.7,org.apache.avro:avro:1.8.2,org.apache.xbean:xbean-asm6-shaded:4.10,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3,com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21,org.apache.kafka:kafka-clients:3.1.0,com.google.guava:guava:31.1-jre,it.unimi.dsi:fastutil:8.1.1,org.mvel:mvel2:2.2.8.Final,com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.3,com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.6,com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.12.6,com.jasonclawson:jackson-dataformat-hocon:1.1.0,com.redislabs:spark-redis_2.12:3.1.0,org.apache.xbean:xbean-asm6-shaded:4.10,com.google.protobuf:protobuf-java:3.19.4,net.snowflake:snowflake-jdbc:3.13.18,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,org.apache.commons:commons-lang3:3.12.0,org.xerial:sqlite-jdbc:3.36.0.3,com.github.changvvb:jackson-module-caseclass_2.12:1.1.1,com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.11.1,org.eclipse.jetty:jetty-util:9.3.24.v20180605,commons-io:commons-io:2.6,org.apache.hadoop:hadoop-azure:2.7.4,com.microsoft.azure:azure-storage:8.6.4,com.linkedin.feathr:feathr_2.12:0.9.0-rc1" \ --conf "spark.driver.extraClassPath=../target/scala-2.12/classes:jars/config-1.3.4.jar:jars/jackson-dataformat-hocon-1.1.0.jar:jars/jackson-module-caseclass_2.12-1.1.1.jar:jars/mvel2-2.2.8.Final.jar:jars/fastutil-8.1.1.jar" \ --conf "spark.hadoop.fs.wasbs.impl=org.apache.hadoop.fs.azure.NativeAzureFileSystem" \ --class com.linkedin.feathr.offline.job.FeatureJoinJob \ diff --git a/feathr_project/feathr/version.py b/feathr_project/feathr/version.py index 807119de6..f31e00e36 100644 --- a/feathr_project/feathr/version.py +++ b/feathr_project/feathr/version.py @@ -1 +1 @@ -__version__ = "0.8.0" \ No newline at end of file +__version__ = "0.9.0-rc1" \ No newline at end of file diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index e67c803ef..f463785d5 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -82,7 +82,7 @@ spark_config: # Feathr Job configuration. Support local paths, path start with http(s)://, and paths start with abfs(s):// # this is the default location so end users don't have to compile the runtime again. # feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: # workspace instance workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' @@ -93,7 +93,7 @@ spark_config: # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/ work_dir: 'dbfs:/feathr_getting_started' # this is the default location so end users don't have to compile the runtime again. 
- feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml index f716da0b4..b6e3aacde 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml index c842bc702..ffef212d2 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml index dcb73d827..8f6691725 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: 
"../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml index 29c6889e8..03c5f75f1 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/ui/package.json b/ui/package.json index da870fbf3..b4e1ce820 100644 --- a/ui/package.json +++ b/ui/package.json @@ -1,6 +1,6 @@ { "name": "feathr-ui", - "version": "0.8.0", + "version": "0.9.0-rc1", "private": true, "dependencies": { "@ant-design/icons": "^4.7.0", From 995f5091570524f3f8484a20244928d289eb8989 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Wed, 2 Nov 2022 22:12:25 +0000 Subject: [PATCH 09/18] Fix tests to use mocks and fix get_result_df's databricks behavior Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/utils/job_utils.py | 49 ++++---- feathr_project/pyproject.toml | 1 - feathr_project/test/conftest.py | 12 +- .../test_user_workspace/feathr_config.yaml | 2 +- feathr_project/test/unit/utils/test_config.py | 1 - .../test/unit/utils/test_job_utils.py | 112 +++++++++++++++--- 6 files changed, 130 insertions(+), 47 deletions(-) diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index c804d4ca9..1d33855b5 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,6 +1,5 @@ -from multiprocessing.sharedctypes import Value from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile 
import NamedTemporaryFile from typing import Union from loguru import logger @@ -9,6 +8,7 @@ from feathr.client import FeathrClient from feathr.constants import OUTPUT_FORMAT +from feathr.utils.platform import is_databricks def get_result_pandas_df( @@ -82,6 +82,9 @@ def get_result_df( Returns: Either Spark or pandas DataFrame. """ + if is_databricks() and client.spark_runtime != "databricks": + raise RuntimeError(f"The function is called from Databricks but the client.spark_runtime is {client.spark_runtime}.") + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: @@ -95,22 +98,22 @@ def get_result_df( "In local spark mode, the result files are expected to be stored at a local storage and thus `local_cache_path` argument will be ignored." ) local_cache_path = res_url + elif client.spark_runtime == "databricks": - if res_url.startswith("dbfs:"): + if not res_url.startswith("dbfs:"): + raise ValueError( + f"In Databricks, the result files are expected to be stored at a DBFS storage but res_url = {res_url}." + ) + + if is_databricks(): # Check if the function is being called from Databricks if local_cache_path is not None: logger.warning( "Result files are already in DBFS and thus `local_cache_path` will be ignored." ) local_cache_path = res_url - else: - # if local_cache_path params is not provided then create a temporary folder - if local_cache_path is None: - # We'll just use the name of a local TemporaryDirectory to cache the data into DBFS. - local_cache_path = TemporaryDirectory().name - - # Databricks uses "dbfs:/" prefix for spark paths - if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + elif local_cache_path is None: # Download the result from dbfs to local + local_cache_path = NamedTemporaryFile(delete=False).name + else: logger.warning("This utility function currently supports local spark and databricks. You may encounter unexpected results on other platforms.") # TODO elif azure_synapse @@ -127,16 +130,20 @@ def get_result_df( result_df = None - if spark is not None: - if data_format == "csv": - result_df = spark.read.option("header", True).csv(local_cache_path) + try: + if spark is not None: + if data_format == "csv": + result_df = spark.read.option("header", True).csv(local_cache_path) + else: + result_df = spark.read.format(data_format).load(local_cache_path) else: - result_df = spark.read.format(data_format).load(local_cache_path) - else: - result_df = _load_files_to_pandas_df( - dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. - data_format=data_format, - ) + result_df = _load_files_to_pandas_df( + dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. 
+ data_format=data_format, + ) + except Exception as e: + logger.error(f"Failed to load result files from {local_cache_path} with format {data_format}.") + raise e return result_df diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index 0162ede04..5b7b2fc11 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -12,7 +12,6 @@ multi_line_output = 3 [tool.pytest.ini_options] markers = [ "notebooks: tests Jupyter notebooks", - "databricks: tests functions on a Databricks cluster", ] [build-system] diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index d1ecd081b..b8ee3f345 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -12,14 +12,10 @@ def workspace_dir() -> str: @pytest.fixture(scope="function") -def feathr_client_local(workspace_dir) -> FeathrClient: - """Test function-scoped Feathr client""" - return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config_local.yaml"))) - - -@pytest.fixture(scope="function") -def feathr_client_databricks(workspace_dir) -> FeathrClient: - """Test function-scoped Feathr client""" +def feathr_client(workspace_dir) -> FeathrClient: + """Test function-scoped Feathr client. + Note, cluster target (local, databricks, synapse) maybe overriden by the environment variables set at test machine. + """ return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config.yaml"))) diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index f463785d5..921148728 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -85,7 +85,7 @@ spark_config: feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: # workspace instance - workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' + workspace_instance_url: 'https://adb-4121774437039026.6.azuredatabricks.net' workspace_token_value: '' # config string including run time information, spark version, machine size, etc. 
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index a53e1764a..52adcae39 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,5 +1,4 @@ from pathlib import Path -from tempfile import NamedTemporaryFile import pytest diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 21392bf84..a692d9e40 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -2,6 +2,7 @@ # TODO test with no data files exception and unsupported format exception from pathlib import Path from tempfile import NamedTemporaryFile +from typing import Type from unittest.mock import MagicMock import pandas as pd @@ -18,7 +19,7 @@ def test__get_result_pandas_df(mocker: MockerFixture): - # Assert if the base function, get_result_df, called w/ proper args + """Test if the base function, get_result_df, called w/ proper args""" mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") client = MagicMock() data_format = "some_data_format" @@ -29,7 +30,7 @@ def test__get_result_pandas_df(mocker: MockerFixture): def test__get_result_spark_df(mocker: MockerFixture): - # Assert if the base function, get_result_df, called w/ proper args + """Test if the base function, get_result_df, called w/ proper args""" mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") client = MagicMock() spark = MagicMock() @@ -40,19 +41,80 @@ def test__get_result_spark_df(mocker: MockerFixture): mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path, spark=spark) -# Local spark is expected to use a local filepath for res_url. Therefore, we mark this test to run with databricks. -@pytest.mark.databricks -def test__get_result_df__with_local_cache_path(feathr_client_databricks: FeathrClient): - # TODO Assert there is a local copy of the file in the given local_cache_path - pass +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,local_cache_path,expected_local_cache_path", [ + # For local spark results, res_url must be a local path and local_cache_path will be ignored. + (False, "local", "some_res_url", None, "some_res_url"), + (False, "local", "some_res_url", "some_local_cache_path", "some_res_url"), + # For databricks results, res_url must be a dbfs path. + # If the function is called in databricks, local_cache_path will be ignored. 
+ (True, "databricks", "dbfs:/some_res_url", None, "/dbfs/some_res_url"), + (True, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "/dbfs/some_res_url"), + (False, "databricks", "dbfs:/some_res_url", None, "mocked_temp_path"), + (False, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "some_local_cache_path"), + ] +) +def test__get_result_df__with_local_cache_path( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + local_cache_path: str, + expected_local_cache_path: str, +): + """Test local_cache_path is used if provided""" + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime + client.feathr_spark_launcher.download_result = MagicMock() + mocked_load_files_to_pandas_df = mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + + # Mock is_databricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + + # Mock temporary file module + mocked_named_temporary_file = MagicMock() + mocked_named_temporary_file.name = expected_local_cache_path + mocker.patch("feathr.utils.job_utils.NamedTemporaryFile", return_value=mocked_named_temporary_file) + + data_format = "csv" + get_result_df(client, data_format=data_format, res_url=res_url, local_cache_path=local_cache_path) + + mocked_load_files_to_pandas_df.assert_called_once_with( + dir_path=expected_local_cache_path, + data_format=data_format, + ) -def test__get_result_df__exceptions(): +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,expected_error", [ + (True, "local", None, RuntimeError), # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + # Test ValueError when res_url is None + (False, "local", None, ValueError), + (True, "databricks", None, ValueError), + # Test ValueError when res_url is not a dbfs path but client.spark_runtime is databricks + (False, "databricks", "some_local_path", ValueError), + # Test ValueError when res_url does not exists or not able to access. 
+ (False, "local", "some_doesnt_exist_path", Exception), + ] +) +def test__get_result_df__exceptions( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + expected_error: Type[Exception], +): + """Test exceptions""" + # Mock client client = MagicMock() - client.get_job_result_uri = MagicMock(return_value=None) + client.get_job_result_uri = MagicMock(return_value=res_url) + client.spark_runtime = spark_runtime + + # Mock is_databricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) - # Test ValueError when res_url is None - with pytest.raises(ValueError): + with pytest.raises(expected_error): get_result_df(client) @@ -67,17 +129,27 @@ def test__get_result_df__exceptions(): ) def test__get_result_df( workspace_dir: str, - feathr_client_local: FeathrClient, + feathr_client: FeathrClient, data_format: str, output_filename: str, expected_count: int, ): + """Test get_result_df returns a pandas DataFrame""" # Note: make sure the output file exists in the test_user_workspace res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock feathr_spark_launcher.download_result + feathr_client.feathr_spark_launcher.download_result = MagicMock() + + if feathr_client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + df = get_result_df( - client=feathr_client_local, + client=feathr_client, data_format=data_format, res_url=res_url, + local_cache_path=local_cache_path, ) assert isinstance(df, pd.DataFrame) assert len(df) == expected_count @@ -94,19 +166,29 @@ def test__get_result_df( ) def test__get_result_df__with_spark_session( workspace_dir: str, - feathr_client_local: FeathrClient, + feathr_client: FeathrClient, spark: SparkSession, data_format: str, output_filename: str, expected_count: int, ): + """Test get_result_df returns a Spark DataFrame""" # Note: make sure the output file exists in the test_user_workspace res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock feathr_spark_launcher.download_result + feathr_client.feathr_spark_launcher.download_result = MagicMock() + + if feathr_client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + df = get_result_df( - client=feathr_client_local, + client=feathr_client, data_format=data_format, res_url=res_url, spark=spark, + local_cache_path=local_cache_path, ) assert isinstance(df, DataFrame) assert df.count() == expected_count From 6198506558732d6bea11f4f912b5c07f5075b5b9 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 14:55:06 +0000 Subject: [PATCH 10/18] Fix temp file to temp dir Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/utils/job_utils.py | 6 +++--- feathr_project/test/unit/utils/test_job_utils.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 1d33855b5..12f27c2cb 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,5 +1,5 @@ from pathlib import Path -from tempfile import NamedTemporaryFile +from tempfile import TemporaryDirectory from typing import Union from loguru import logger @@ -74,7 +74,7 @@ def get_result_df( Default to `avro` if not specified. res_url: Result URL to download files from. 

Note that this will not block the job so you need to make sure the job is finished and the result URL contains actual data. - local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + local_cache_path (optional): Specify the absolute download directory. If the user does not provide this, the function will create a temporary directory. spark (optional): Spark session. If provided, the function returns spark Dataframe. Otherwise, it returns pd.DataFrame. @@ -112,7 +112,7 @@ def get_result_df( ) local_cache_path = res_url elif local_cache_path is None: # Download the result from dbfs to local - local_cache_path = NamedTemporaryFile(delete=False).name + local_cache_path = TemporaryDirectory().name else: logger.warning("This utility function currently supports local spark and databricks. You may encounter unexpected results on other platforms.") diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index a692d9e40..1e005855e 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -1,7 +1,6 @@ # TODO with, without optional args # TODO test with no data files exception and unsupported format exception from pathlib import Path -from tempfile import NamedTemporaryFile from typing import Type from unittest.mock import MagicMock @@ -73,9 +72,9 @@ def test__get_result_df__with_local_cache_path( mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) # Mock temporary file module - mocked_named_temporary_file = MagicMock() - mocked_named_temporary_file.name = expected_local_cache_path - mocker.patch("feathr.utils.job_utils.NamedTemporaryFile", return_value=mocked_named_temporary_file) + mocked_named_temporary_dir = MagicMock() + mocked_named_temporary_dir.name = expected_local_cache_path + mocker.patch("feathr.utils.job_utils.TemporaryDirectory", return_value=mocked_named_temporary_dir) data_format = "csv" get_result_df(client, data_format=data_format, res_url=res_url, local_cache_path=local_cache_path) From ae9095c10ac2b06d971366a7969494cad2f0bf85 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 16:17:21 +0000 Subject: [PATCH 11/18] Check out the feature_derivations.py from main (it was temporarily changed to work around previous issues) Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/definition/feature_derivations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feathr_project/feathr/definition/feature_derivations.py b/feathr_project/feathr/definition/feature_derivations.py index 5717f12b2..9205685ce 100644 --- a/feathr_project/feathr/definition/feature_derivations.py +++ b/feathr_project/feathr/definition/feature_derivations.py @@ -36,9 +36,9 @@ def __init__(self, def validate_feature(self): """Validate the derived feature is valid""" - + input_feature_key_alias = [] - # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. + # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. # Modify the conversion to work with both schema. 

for feature in self.input_features: input_feature_key_alias.extend([x['keyColumnAlias'] for x in feature['attributes']['key']] if isinstance(feature,dict) else feature.key_alias) @@ -58,7 +58,7 @@ def to_feature_config(self) -> str: } {% endfor %} } - definition: {{derived_feature.transform.to_feature_config(False)}} + definition.sqlExpr: {{derived_feature.transform.to_feature_config(False)}} {{derived_feature.feature_type.to_feature_config()}} } """) From 59bd65caf3a21bd661091d442cf972287fa9797a Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 17:03:28 +0000 Subject: [PATCH 12/18] Remove old databricks sample notebook. Change pip install feathr from the github main branch to pickup the latest changes always Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 2 +- ...atabricks_quickstart_nyc_taxi_driver.ipynb | 1444 ----------------- 2 files changed, 1 insertion(+), 1445 deletions(-) delete mode 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 0bc099f11..1c8b193d9 100755 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n",""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n","!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n",""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict NYC taxi fares. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models, and evaluating them is an iterative process where the models' performance may be used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build an ML Pipeline\n","\n","Here, we use a Spark ML Pipeline to assemble the feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online stores such as Redis.\n","\n","Note that only features anchored to an offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb deleted file mode 100644 index ffd6e64d8..000000000 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb +++ /dev/null @@ -1,1444 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "384e5e16-7213-4186-9d04-09d03b155534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Feathr Feature Store on Databricks Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n", - "\n", - "The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n", - "\n", - "- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n", - "- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n", - "\n", - "However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). 
The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f00b9d0b-94d1-418f-89b9-25bbacb8b068", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "! pip install feathr pandavro scikit-learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n", - "import json\n", - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Get the required databricks credentials automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Get current databricks notebook context\n", - "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - "host_name = ctx.tags().get(\"browserHostName\").get()\n", - "host_token = ctx.apiToken().get()\n", - "cluster_id = ctx.tags().get(\"clusterId\").get()\n", - "\n", - "\n", - "\n", - "# databricks_config = 
{'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n", - "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n", - "os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n", - "# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n", - "os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n", - "os.environ['project_config__project_name']='feathr_getting_started'\n", - "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=\"\"\n", - "redis_host=\"\"\n", - "redis_password=\"\"\n", - "redis_ssl=\"\"\n", - "\n", - "# Set the resource link\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Configure required credentials (skip if you don't use those):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started2'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: ''\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: ''\n", - " jdbc_table: ''\n", - " snowflake:\n", - " snowflake_enabled: false\n", - " url: \".snowflakecomputing.com\"\n", - " user: \"\"\n", - " role: \"\"\n", - "spark_config:\n", - " # choice for spark runtime. 
Currently support: azure_synapse, databricks\n", - " # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n", - " spark_cluster: \"databricks\"\n", - " spark_result_output_parts: \"1\"\n", - "\n", - "online_store:\n", - " redis:\n", - " host: '.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https://.azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "16420730-582e-4e11-a343-efc0ddd35108", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "46f863c4-bb81-434a-a448-6b585031a221", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "05633bc3-9118-449b-9562-45fc437576c2", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_distance],\n", - " transform=\"f_trip_distance * 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_distance_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "output_path = 'dbfs:/feathrazure_test.avro'\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path\n", - " )\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "51f078e3-3f8f-4f10-b7f1-499ac8a9ff07", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "23c797b2-ac1a-4cf3-b0ed-c05216de3f37", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - 
"errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "from feathr.utils.job_utils import get_result_df\n", - "df_res = get_result_df(client, format=\"avro\", res_url = output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b9be042e-eb12-46b9-9d91-a0e5dd0c704f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "84745f36-5bac-49c0-903b-38828b923c7c", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and 
materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can then get the features from the online store (Redis):" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "nyc_driver_demo", - "notebookOrigID": 930353059183053, - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3.8.10 ('logistics')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "vscode": { - "interpreter": { - "hash": "6d25d3d1f1809ed0384c3d8e0cd4f1df57fe7bb936ead67f035c6ff1494f4e23" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 125cc3a4dbdb7769c9ad73355add741dd7955593 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Mon, 7 Nov 2022 23:02:05 +0000 Subject: [PATCH 13/18] Fix config and get_result_df for synapse --- .github/workflows/pull_request_push_test.yml | 6 +- .gitignore | 3 + .../databricks_quickstart_nyc_taxi_demo.ipynb | 1207 ++++++++++++++++- docs/samples/nyc_taxi_demo.ipynb | 178 ++- feathr_project/feathr/datasets/nyc_taxi.py | 2 +- feathr_project/feathr/utils/config.py | 206 ++- feathr_project/feathr/utils/job_utils.py | 12 +- feathr_project/pyproject.toml | 2 +- feathr_project/test/conftest.py | 15 + feathr_project/test/samples/test_notebooks.py | 14 +- feathr_project/test/unit/utils/test_config.py | 97 +- .../test/unit/utils/test_job_utils.py | 7 +- 12 files changed, 1585 insertions(+), 164 deletions(-) mode change 100755 => 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 0eb0e059b..3c9c7dd91 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -22,7 +22,7 @@ on: - "docs/**" - "ui/**" - "**/README.md" - + schedule: # Runs daily at 1 PM UTC (9 PM CST), will send notification to TEAMS_WEBHOOK - cron: '00 13 * * *' @@ -127,7 +127,7 @@ jobs: 
SQL1_USER: ${{secrets.SQL1_USER}} SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | - # run only test with databricks. run in 4 parallel jobs + # run only test with databricks. run in 6 parallel jobs pytest -n 6 feathr_project/test/ azure_synapse_test: @@ -196,7 +196,7 @@ jobs: SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | # skip databricks related test as we just ran the test; also seperate databricks and synapse test to make sure there's no write conflict - # run in 4 parallel jobs to make the time shorter + # run in 6 parallel jobs to make the time shorter pytest -n 6 feathr_project/test/ local_spark_test: diff --git a/.gitignore b/.gitignore index 976c0b239..4fe490c96 100644 --- a/.gitignore +++ b/.gitignore @@ -213,3 +213,6 @@ null/* project/.bloop metals.sbt .bsp/sbt.json + +# Feathr output debug folder +**/debug/ diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb old mode 100755 new mode 100644 index 1c8b193d9..4dc58eaf7 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1,1206 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n","!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n",""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for the databricks config can be overwritten by setting the `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of the feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derived features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "843d3142-24ca-4bd1-9e31-b55163804fe3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n", + "dbutils.widgets.text(\"REDIS_KEY\", \"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Feature Store on Databricks Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", + "\n", + "This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n", + "- This notebook skips feature registry which requires running Azure Purview. \n", + "- To make the online feature query work, you will need to configure the Redis endpoint. \n", + "\n", + "The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c2ce58c7-9263-469a-bbb7-43364ddb07b8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prerequisite\n", + "\n", + "To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n", + "\n", + "To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)." 
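On Databricks, a notebook-scoped installation is typically done with the `%pip` magic rather than a shell `pip` call. A minimal sketch, assuming a recent Databricks Runtime where `%pip` is available, installing the same package as the cell below:

    %pip install "git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project"

A released version via `%pip install feathr` would work as well, mirroring the note in the install cell below.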
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4609d7ad-ad74-40fc-b97e-f440a0fa0737", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n", + "!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c81fa80c-bca6-4ae5-84ad-659a036977bd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Notebook Steps\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install Feathr and necessary dependencies.\n", + "1. Create shareable features with Feathr feature definition configs.\n", + "1. Create training data using point-in-time correct feature join\n", + "1. Train and evaluate a prediction model.\n", + "1. Materialize feature values for online scoring.\n", + "\n", + "The overall data flow is as follows:\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import RegressionEvaluator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.regression import GBTRegressor\n", + "from pyspark.sql import DataFrame\n", + "import pyspark.sql.functions as F\n", + "\n", + "import feathr\n", + "from feathr import (\n", + " FeathrClient,\n", + " # Feature data types\n", + " BOOLEAN,\n", + " FLOAT,\n", + " INT32,\n", + " ValueType,\n", + " # Feature data sources\n", + " INPUT_CONTEXT,\n", + " HdfsSource,\n", + " # Feature aggregations\n", + " TypedKey,\n", + " WindowAggTransformation,\n", + " # Feature types and anchor\n", + " DerivedFeature,\n", + " Feature,\n", + " FeatureAnchor,\n", + " # Materialization\n", + " BackfillTime,\n", + " MaterializationSettings,\n", + " RedisSink,\n", + " # Offline feature computation\n", + " FeatureQuery,\n", + " ObservationSettings,\n", + ")\n", + "from feathr.datasets import nyc_taxi\n", + "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", + "from feathr.utils.config import generate_config\n", + "from feathr.utils.job_utils import get_result_df\n", + "\n", + "\n", + "print(\n", + " f\"\"\"Feathr version: {feathr.__version__}\n", + "Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ab35fa01-b392-457e-8fde-7e445a3c39b5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", + "\n", + "In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n", + "Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "09f93a9f-7b33-4d91-8f31-ee3b20991696", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n", + "PROJECT_NAME = \"feathr_getting_started\"\n", + "\n", + "REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n", + "\n", + "# Use a databricks cluster\n", + "SPARK_CLUSTER = \"databricks\"\n", + "\n", + "# Databricks file system path\n", + "DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Redis credential\n", + "os.environ[\"REDIS_PASSWORD\"] = REDIS_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Configurations\n", + "\n", + "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field.\n", + "\n", + "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_cluster=SPARK_CLUSTER,\n", + " # You may set an existing cluster id here, but Databricks recommend to use new clusters for greater reliability.\n", + " cluster_name=None, # Set None to create a new job cluster\n", + ")\n", + "\n", + "with open(config_path, \"r\") as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "58d22dc1-7590-494d-94ca-3e2488c31c8e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." 
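For instance, a minimal sketch of such an override, set before the `FeathrClient` is created in the next cells (the jar path below is purely illustrative, not a location defined in this notebook):

    import os

    # Override the Databricks Feathr runtime jar location using the `section__subsection__key` convention.
    os.environ["spark_config__databricks__feathr_runtime_location"] = "dbfs:/FileStore/jars/feathr_runtime.jar"  # placeholder path

Because environment variables overwrite the generated yaml, this is a convenient way to adjust a single field without editing the config file.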
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### View the NYC taxi fare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", + "\n", + "# Download the data file\n", + "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", + "df_raw.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Defining features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", + "\n", + "* The feature key (a.k.a. entity id) identifies the subject of the feature, e.g. a user_id or location_id.\n", + "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 in year 2022.\n", + "\n", + "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n", + "\n", + "There are two types of features -- anchored features and derived features:\n", + "\n", + "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", + "* **Derived features**: Features that are computed on top of other features.\n", + "\n", + "#### Define anchored features\n", + "\n", + "Anchored features require a feature source that describes the raw data from which the feature values are computed. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or a `feathr.source.Source` object."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "75b8d2ed-84df-4446-ae07-5f715434f3ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", + "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "93abbcc2-562b-47e4-ad4c-1fedd7cc64df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We define f_trip_distance and f_trip_time_duration features separately\n", + "# so that we can reuse them later for the derived features.\n", + "f_trip_distance = Feature(\n", + " name=\"f_trip_distance\",\n", + " feature_type=FLOAT,\n", + " transform=\"trip_distance\",\n", + ")\n", + "f_trip_time_duration = Feature(\n", + " name=\"f_trip_time_duration\",\n", + " feature_type=FLOAT,\n", + " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", + ")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(\n", + " name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"trip_distance > 30.0\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_month\",\n", + " feature_type=INT32,\n", + " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_hour_of_day\",\n", + " feature_type=INT32,\n", + " transform=\"hour(lpep_dropoff_datetime)\",\n", + " ),\n", + "]\n", + "\n", + "# After you have defined features, bring them together to build the anchor to the source.\n", + "feature_anchor = FeatureAnchor(\n", + " name=\"feature_anchor\",\n", + " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", + " features=features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can define the source with a preprocessing python function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def preprocessing(df: DataFrame) -> DataFrame:\n", + "    import pyspark.sql.functions as F\n", + "\n", + "    df = df.withColumn(\n", + "        \"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\")\n", + "    )\n", + "    return df\n", + "\n", + "\n", + "batch_source = HdfsSource(\n", + "    name=\"nycTaxiBatchSource\",\n", + "    path=DATA_FILE_PATH,\n", + "    event_timestamp_column=TIMESTAMP_COL,\n", + "    preprocessing=preprocessing,\n", + "    timestamp_format=TIMESTAMP_FORMAT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "46f863c4-bb81-434a-a448-6b585031a221", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For features with aggregation, the supported functions are as follows:\n", + "\n", + "| Aggregation Function | Input Type | Description |\n", + "| --- | --- | --- |\n", + "| SUM, COUNT, MAX, MIN, AVG | Numeric | Applies the numerical operation on the numeric inputs. |\n", + "| MAX_POOLING, MIN_POOLING, AVG_POOLING | Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers. |\n", + "| LATEST | Any | Returns the latest non-null values within the defined time window. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "agg_key = TypedKey(\n", + "    key_column=\"DOLocationID\",\n", + "    key_column_type=ValueType.INT32,\n", + "    description=\"location id in NYC\",\n", + "    full_name=\"nyc_taxi.location_id\",\n", + ")\n", + "\n", + "agg_window = \"90d\"\n", + "\n", + "# Anchored features with aggregations\n", + "agg_features = [\n", + "    Feature(\n", + "        name=\"f_location_avg_fare\",\n", + "        key=agg_key,\n", + "        feature_type=FLOAT,\n", + "        transform=WindowAggTransformation(\n", + "            agg_expr=\"fare_amount_cents\",\n", + "            agg_func=\"AVG\",\n", + "            window=agg_window,\n", + "        ),\n", + "    ),\n", + "    Feature(\n", + "        name=\"f_location_max_fare\",\n", + "        key=agg_key,\n", + "        feature_type=FLOAT,\n", + "        transform=WindowAggTransformation(\n", + "            agg_expr=\"fare_amount_cents\",\n", + "            agg_func=\"MAX\",\n", + "            window=agg_window,\n", + "        ),\n", + "    ),\n", + "]\n", + "\n", + "agg_feature_anchor = FeatureAnchor(\n", + "    name=\"agg_feature_anchor\",\n", + "    source=batch_source,  # External data source for feature. 
Typically a data table.\n", + " features=agg_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Define derived features\n", + "\n", + "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "05633bc3-9118-449b-9562-45fc437576c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "derived_features = [\n", + " DerivedFeature(\n", + " name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " ],\n", + " transform=\"f_trip_distance / f_trip_time_duration\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build features\n", + "\n", + "Finally, we build the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(\n", + " anchor_list=[feature_anchor, agg_feature_anchor],\n", + " derived_feature_list=derived_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", + "\n", + "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "02feabc9-2f2f-43e8-898d-b28082798e98", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FORMAT = \"parquet\"\n", + "offline_features_path = str(\n", + "    Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "67e81466-c736-47ba-b122-e640642c01cf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Features that we want to request. Can use a subset of features\n", + "query = FeatureQuery(\n", + "    feature_list=feature_names,\n", + "    key=agg_key,\n", + ")\n", + "settings = ObservationSettings(\n", + "    observation_path=DATA_FILE_PATH,\n", + "    event_timestamp_column=TIMESTAMP_COL,\n", + "    timestamp_format=TIMESTAMP_FORMAT,\n", + ")\n", + "client.get_offline_features(\n", + "    observation_settings=settings,\n", + "    feature_query=query,\n", + "    # Note, execution_configurations argument only works when using a new job cluster\n", + "    # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", + "    execution_configurations=SparkExecutionConfiguration(\n", + "        {\n", + "            \"spark.feathr.outputFormat\": DATA_FORMAT,\n", + "        }\n", + "    ),\n", + "    output_path=offline_features_path,\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9871af55-25eb-41ee-a58a-fda74b1a174e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Show feature results\n", + "df = get_result_df(\n", + "    spark=spark,\n", + "    client=client,\n", + "    data_format=\"parquet\",\n", + "    res_url=offline_features_path,\n", + ")\n", + "df.select(feature_names).limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 4. Train and Evaluate a Prediction Model\n", + "\n", + "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", + "\n", + "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance may be used to modify the features as a part of the modeling process."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load Train and Test Data from the Offline Feature Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bd2cdc83-0920-46e8-9454-e5e6e7832ce0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train / test split\n", + "train_df, test_df = (\n", + " df.withColumn( # Dataframe that we generated from get_offline_features call.\n", + " \"label\", F.col(\"fare_amount\").cast(\"double\")\n", + " )\n", + " .where(F.col(\"f_trip_time_duration\") > 0)\n", + " .fillna(0)\n", + " .randomSplit([0.8, 0.2])\n", + ")\n", + "\n", + "print(f\"Num train samples: {train_df.count()}\")\n", + "print(f\"Num test samples: {test_df.count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build a ML Pipeline\n", + "\n", + "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "2a254361-63e9-45b2-8c19-40549762eacb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Generate a feature vector column for SparkML\n", + "vector_assembler = VectorAssembler(\n", + " inputCols=[x for x in df.columns if x in feature_names],\n", + " outputCol=\"features\",\n", + ")\n", + "\n", + "# Define a model\n", + "gbt = GBTRegressor(\n", + " featuresCol=\"features\",\n", + " maxIter=100,\n", + " maxDepth=5,\n", + " maxBins=16,\n", + ")\n", + "\n", + "# Create a ML pipeline\n", + "ml_pipeline = Pipeline(\n", + " stages=[\n", + " vector_assembler,\n", + " gbt,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Train and Evaluate the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train a model\n", + "model = ml_pipeline.fit(train_df)\n", + "\n", + "# Make predictions\n", + "predictions = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "1f9b584c-6228-4a02-a6c3-9b8dd2b78091", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Evaluate\n", + "evaluator = RegressionEvaluator(\n", + " labelCol=\"label\",\n", + " predictionCol=\"prediction\",\n", + ")\n", + "\n", + "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", + "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", + "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"inputWidgets": {}, + "nuid": "25c33abd-6e87-437d-a6a1-86435f065a1e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", + "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", + "\n", + "predictions_pdf.plot(\n", + " x=\"index\",\n", + " y=[\"label\", \"prediction\"],\n", + " style=[\"-\", \":\"],\n", + " figsize=(20, 10),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "664d78cc-4a92-430c-9e05-565ba904558e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "predictions_pdf.plot.scatter(\n", + " x=\"label\",\n", + " y=\"prediction\",\n", + " xlim=(0, 100),\n", + " ylim=(0, 100),\n", + " figsize=(10, 10),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8a56d165-c813-4ce0-8ae6-9f4d313c463d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 5. Materialize Feature Values for Online Scoring\n", + "\n", + "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", + "\n", + "Note, only the features anchored to offline data source can be materialized." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "751fa72e-8f94-40a1-994e-3e8315b51d37", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "materialized_feature_names = [feature.name for feature in agg_features]\n", + "materialized_feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", + "\n", + " # Get the last date from the dataset\n", + " backfill_timestamp = (\n", + " df_raw.select(\n", + " F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL)\n", + " )\n", + " .agg({TIMESTAMP_COL: \"max\"})\n", + " .collect()[0][0]\n", + " )\n", + "\n", + " # Time range to materialize\n", + " backfill_time = BackfillTime(\n", + " start=backfill_timestamp,\n", + " end=backfill_timestamp,\n", + " step=timedelta(days=1),\n", + " )\n", + "\n", + " # Destinations:\n", + " # For online store,\n", + " redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", + "\n", + " # For offline store,\n", + " # adls_sink = HdfsSink(output_path=)\n", + "\n", + " settings = MaterializationSettings(\n", + " name=FEATURE_TABLE_NAME + \".job\", # job name\n", + " backfill_time=backfill_time,\n", + " sinks=[redis_sink], # or adls_sink\n", + " feature_names=materialized_feature_names,\n", + " )\n", + "\n", + " client.materialize_features(\n", + " settings=settings,\n", + " # Note, execution_configurations argument only works when using a new job cluster\n", + " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", + " )\n", + "\n", + " client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Now, you can retrieve features for online scoring as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "424bc9eb-a47f-4b46-be69-8218d55e66ad", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " # Note, to get a single key, you may use client.get_online_features instead\n", + " materialized_feature_values = client.multi_get_online_features(\n", + " feature_table=FEATURE_TABLE_NAME,\n", + " keys=[\"239\", \"265\"],\n", + " feature_names=materialized_feature_names,\n", + " )\n", + " materialized_feature_values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3596dc71-a363-4b6a-a169-215c89978558", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b5fb292e-bbb6-4dd7-8e79-c62d9533e820", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Remove temporary files\n", + "dbutils.fs.rm(\"dbfs:/tmp/\", recurse=True)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "databricks_quickstart_nyc_taxi_demo", + "notebookOrigID": 2365994027381987, + "widgets": { + "REDIS_KEY": { + "currentValue": "", + "nuid": "d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "REDIS_KEY", + "options": { + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "RESOURCE_PREFIX": { + "currentValue": "", + "nuid": "87a26035-86fc-4dbd-8dd0-dc546c1c63c1", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "RESOURCE_PREFIX", + "options": { + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3.10.4 ('feathr')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index b789e9bf2..06b5cb340 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -89,18 +89,12 @@ }, "outputs": [], "source": [ - "from datetime import datetime, timedelta\n", - "import glob\n", - "import json\n", + "from datetime import timedelta\n", "from math import sqrt\n", "import os\n", "from pathlib import Path\n", - "import requests\n", "from tempfile import TemporaryDirectory\n", "\n", - "from azure.identity import AzureCliCredential, DefaultAzureCredential \n", - "from azure.keyvault.secrets import SecretClient\n", - "import pandas as pd\n", "from pyspark.ml import Pipeline\n", 
"from pyspark.ml.evaluation import RegressionEvaluator\n", "from pyspark.ml.feature import VectorAssembler\n", @@ -154,39 +148,44 @@ }, "outputs": [], "source": [ - "RESOURCE_PREFIX = None # TODO fill the value\n", + "RESOURCE_PREFIX = None # TODO fill the value used to deploy the resources via ARM template\n", "PROJECT_NAME = \"feathr_getting_started\"\n", "\n", - "# Data store root path. Could be a local file system path or Azure storage path like abfs or wasbs\n", - "DATA_STORE_PATH = TemporaryDirectory().name\n", - "\n", "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", - "# TODO -- Synapse spark pool name or Databricks cluster id\n", - "CLUSTER_NAME = None\n", "\n", - "# If set True, use an interactive browser authentication\n", + "# If \"azure_synapse\":\n", + "AZURE_SYNAPSE_SPARK_POOL = None # Set Synapse spark pool name to use an existing cluster\n", + "\n", + "# If \"databricks\":\n", + "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", + "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", + "\n", + "# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n", + "DATA_STORE_PATH = TemporaryDirectory().name\n", + "\n", + "# If set True, use an interactive browser authentication to get the redis password.\n", "USE_CLI_AUTH = False\n", "\n", + "REGISTER_FEATURES = False\n", + "\n", "# (For the notebook test pipeline) If true, use ScrapBook package to collect the results.\n", "SCRAP_RESULTS = False" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "KEY_VAULT = f\"{RESOURCE_PREFIX}kv\"\n", - "KEY_VAULT_URI = f\"https://{KEY_VAULT}.vault.azure.net\"\n", + "To use Databricks as the feathr client's target platform, you may need to set a databricks token to an environment variable like:\n", + "\n", + "`export DATABRICKS_WORKSPACE_TOKEN_VALUE=your-token`\n", + "\n", + "or in the notebook cell,\n", "\n", - "ADLS_PATH = f\"abfss://{RESOURCE_PREFIX}fs@{RESOURCE_PREFIX}dls.dfs.core.windows.net/feathr_project\"\n", + "`os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = your-token`\n", "\n", - "if SPARK_CLUSTER == \"azure_synapse\":\n", - " os.environ['spark_config__azure_synapse__dev_url'] = f\"https://{resource_prefix}syws.dev.azuresynapse.net\"\n", - " os.environ['spark_config__azure_synapse__pool_name'] = CLUSTER_NAME\n", - " os.environ['spark_config__azure_synapse__workspace_dir'] = f\"abfss://{adls_fs_name}@{resource_prefix}dls.dfs.core.windows.net/{PROJECT_NAME}\"" + "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context." 
] }, { @@ -195,22 +194,19 @@ "metadata": {}, "outputs": [], "source": [ - "if USE_CLI_AUTH:\n", - " !az login --use-device-code" + "# Force to use dbfs if the notebook is running on Databricks\n", + "if is_databricks() and not DATA_STORE_PATH.startswith(\"dbfs:\"):\n", + " DATA_STORE_PATH = f\"dbfs:/{DATA_STORE_PATH.lstrip('/')}\"" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", - "secret_client = SecretClient(vault_url=KEY_VAULT_URI, credential=credential)\n", - "retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value" + "if USE_CLI_AUTH:\n", + " !az login --use-device-code" ] }, { @@ -219,27 +215,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Redis credential\n", - "os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n", - "\n", - "if SPARK_CLUSTER == \"local\":\n", - " os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", - "\n", - "elif SPARK_CLUSTER == \"databricks\" and is_databricks():\n", - " ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - " databricks_config = {\n", - " 'run_name': \"FEATHR_FILL_IN\",\n", - " 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n", - " 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n", - " 'spark_jar_task': {\n", - " 'main_class_name': \"FEATHR_FILL_IN\",\n", - " 'parameters': [\"FEATHR_FILL_IN\"],\n", - " },\n", - " }\n", - " os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n", - " os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n", - " os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n", - " os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()\n" + "# Redis password\n", + "if 'REDIS_PASSWORD' not in os.environ:\n", + " # Try to get all the required credentials from Azure Key Vault\n", + " from azure.identity import AzureCliCredential, DefaultAzureCredential \n", + " from azure.keyvault.secrets import SecretClient\n", + "\n", + " vault_url = f\"https://{RESOURCE_PREFIX}kv.vault.azure.net\"\n", + " credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", + " secret_client = SecretClient(vault_url=vault_url, credential=credential)\n", + " retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value\n", + " os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n" ] }, { @@ -271,7 +257,14 @@ }, "outputs": [], "source": [ - "config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n", + "config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_cluster=SPARK_CLUSTER,\n", + " # cluster name will be ignored in \"local\" spark.\n", + " cluster_name=AZURE_SYNAPSE_SPARK_POOL if SPARK_CLUSTER == \"azure_synapse\" else DATABRICKS_CLUSTER_ID,\n", + " databricks_url=DATABRICKS_URL,\n", + ")\n", "\n", "with open(config_path, 'r') as f: \n", " print(f.read())" @@ -334,8 +327,8 @@ "metadata": {}, "outputs": [], "source": [ - "# To run on a local spark, start a spark session:\n", - "if SPARK_CLUSTER == \"local\":\n", + "# If the notebook is runnong on Jupyter, start a 
spark session:\n", + "if is_jupyter():\n", " spark = (\n", " SparkSession\n", " .builder\n", @@ -345,7 +338,7 @@ " .getOrCreate()\n", " )\n", " \n", - "# Else, you must already have spark session object available in databricks or synapse." + "# Else, you must already have a spark session object available in databricks or synapse." ] }, { @@ -472,7 +465,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can define the source with a preprocessing python function." + "We can define the source with a preprocessing python function. In order to make the source data accessible from the target spark cluster, we upload the data file into either DBFS or Azure Blob Storage if needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define data source path\n", + "if client.spark_runtime == \"local\" or (client.spark_runtime == \"databricks\" and is_databricks()):\n", + " # In local mode, we can use the same data path as the source.\n", + " # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n", + " data_source_path = DATA_FILE_PATH\n", + "else:\n", + " # Otherwise, upload the local file to dbfs.\n", + " data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) " ] }, { @@ -488,7 +497,7 @@ "\n", "batch_source = HdfsSource(\n", " name=\"nycTaxiBatchSource\",\n", - " path=DATA_FILE_PATH,\n", + " path=data_source_path,\n", " event_timestamp_column=TIMESTAMP_COL,\n", " preprocessing=preprocessing,\n", " timestamp_format=TIMESTAMP_FORMAT,\n", @@ -692,7 +701,7 @@ " key=agg_key,\n", ")\n", "settings = ObservationSettings(\n", - " observation_path=DATA_FILE_PATH,\n", + " observation_path=data_source_path,\n", " event_timestamp_column=TIMESTAMP_COL,\n", " timestamp_format=TIMESTAMP_FORMAT,\n", ")\n", @@ -889,12 +898,14 @@ "metadata": {}, "outputs": [], "source": [ - "try:\n", - " client.register_features()\n", - "except KeyError:\n", - " # TODO temporarily go around the \"Already exists\" error\n", - " \n", - " client.list_registered_features(project_name=PROJECT_NAME)" + "if REGISTER_FEATURES:\n", + " try:\n", + " client.register_features()\n", + " except KeyError:\n", + " # TODO temporarily go around the \"Already exists\" error\n", + " pass \n", + " print(client.list_registered_features(project_name=PROJECT_NAME))\n", + " # You can get the actual features too by calling client.get_features_from_registry(PROJECT_NAME)" ] }, { @@ -915,29 +926,6 @@ "Note, only the features anchored to offline data source can be materialized." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get registered features\n", - "registered_features_dict = client.get_features_from_registry(PROJECT_NAME)\n", - "\n", - "observation_feature_names = []\n", - "materialized_feature_names = []\n", - "\n", - "for feature_name, feature in registered_features_dict.items():\n", - " if feature.key[0].key_column == \"NOT_NEEDED\":\n", - " observation_feature_names.append(feature_name)\n", - " else:\n", - " materialized_feature_names.append(feature_name)\n", - " \n", - "print(f\"Features that will be extracted directly from the observation: {observation_feature_names}\")\n", - "print(\"\")\n", - "print(f\"Features that will be extracted from the source data and materialized to online storage: {materialized_feature_names}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -988,7 +976,7 @@ " name=FEATURE_TABLE_NAME + \".job\", # job name\n", " backfill_time=backfill_time,\n", " sinks=[redis_sink], # or adls_sink\n", - " feature_names=materialized_feature_names,\n", + " feature_names=[feature.name for feature in agg_features],\n", ")\n", "\n", "client.materialize_features(\n", @@ -1016,7 +1004,7 @@ "materialized_feature_values = client.multi_get_online_features(\n", " feature_table=FEATURE_TABLE_NAME,\n", " keys=[\"239\", \"265\"],\n", - " feature_names=materialized_feature_names,\n", + " feature_names=[feature.name for feature in agg_features],\n", ")\n", "materialized_feature_values" ] @@ -1034,7 +1022,7 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO: Unregister or any other cleanups." + "# TODO: Unregister, delete cached files or do any other cleanups." ] }, { @@ -1083,7 +1071,7 @@ }, "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3.10.4 ('feathr')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1101,7 +1089,7 @@ }, "vscode": { "interpreter": { - "hash": "ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2" + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" } } }, diff --git a/feathr_project/feathr/datasets/nyc_taxi.py b/feathr_project/feathr/datasets/nyc_taxi.py index ec605aae6..e00fa7150 100644 --- a/feathr_project/feathr/datasets/nyc_taxi.py +++ b/feathr_project/feathr/datasets/nyc_taxi.py @@ -73,7 +73,7 @@ def get_spark_df( if is_databricks(): # Databricks uses "dbfs:/" prefix for spark paths if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + local_cache_path = f"dbfs:/{local_cache_path.lstrip('/')}" # Databricks uses "/dbfs/" prefix for python paths python_local_cache_path = local_cache_path.replace("dbfs:", "/dbfs") # TODO add "if is_synapse()" diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 9a9438567..47ac84679 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -1,61 +1,209 @@ +from copy import deepcopy +import os +import json from tempfile import NamedTemporaryFile +from typing import Dict +import yaml +from feathr.utils.platform import is_databricks -FEATHR_CONFIG_TEMPLATE = """ -api_version: 1 -project_config: - project_name: {project_name} +DEFAULT_FEATHR_CONFIG = { + "api_version": 1, + "project_config": {}, # "project_name" + "feature_registry": {}, # "api_endpoint" + "spark_config": { + # "spark_cluster". 
Currently support 'azure_synapse', 'databricks', and 'local' + "spark_result_output_parts": "1", + }, + "offline_store": { + "adls": {"adls_enabled": "true"}, + "wasb": {"wasb_enabled": "true"}, + }, + "online_store": { + "redis": { + # "host" + "port": "6380", + "ssl_enabled": "true", + } + } +} -feature_registry: - api_endpoint: 'https://{resource_prefix}webapp.azurewebsites.net/api/v1' -spark_config: - # Currently support: 'azure_synapse', 'databricks', and 'local' - spark_cluster: {spark_cluster} - spark_result_output_parts: '1' +# New databricks job cluster config +DEFAULT_DATABRICKS_CLUSTER_CONFIG = { + "spark_version": "11.2.x-scala2.12", + "node_type_id": "Standard_D3_v2", + "num_workers": 2, + "spark_conf": { + "FEATHR_FILL_IN": "FEATHR_FILL_IN", + # Exclude conflicting packages if use feathr <= v0.8.0: + "spark.jars.excludes": "commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api", + }, +} -offline_store: - wasb: - wasb_enabled: true -online_store: - # You can skip this part if you don't have Redis and skip materialization later in this notebook. - redis: - host: '{resource_prefix}redis.redis.cache.windows.net' - port: 6380 - ssl_enabled: true -""" +# New Azure Synapse spark pool config +DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { + "executor_size": "Small", + "executor_num": 2, +} def generate_config( resource_prefix: str, project_name: str, spark_cluster: str, + cluster_name: str = None, + databricks_url: str = None, output_filepath: str = None, + use_env_vars: bool = True, ) -> str: - """Generate a feathr config yaml file + """Generate a feathr config yaml file. Note, if environment variables are set, they will be used instead of the + provided arguments. + + Some credential variables are intentionally not included in the argument and the outut config file + to avoid leaking secrets. E.g. DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD. + Those values should be passed via the environment variables regardless of the `use_env_vars` flag. + + Note: + This utility function assumes Azure resources are deployed using the Azure Resource Manager (ARM) template, + and infers resource names based on the given `resource_prefix`. If you deploy resources manually, you may need + to create the config file manually. Args: resource_prefix: Resource name prefix. project_name: Project name. spark_cluster: Spark cluster to use. Either 'local', 'databricks', or 'azure_synapse'. - output_filepath: Output filepath. + cluster_name (optional): Synapse spark pool name or Databricks cluster id if applicable. + If not provided, a new (job) cluster will be created and used. + databricks_url (optional): Databricks workspace url if applicable. + output_filepath (optional): Output filepath. + use_env_vars (optional): Whether to use environment variables if they are set. Returns: str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path. 
""" + if use_env_vars: + spark_cluster = os.getenv("SPARK_CONFIG__SPARK_CLUSTER", spark_cluster) - conf_str = FEATHR_CONFIG_TEMPLATE.format( - resource_prefix=resource_prefix, - project_name=project_name, - spark_cluster=spark_cluster, - ) + config = deepcopy(DEFAULT_FEATHR_CONFIG) + config["project_config"]["project_name"] = project_name + config["feature_registry"]["api_endpoint"] = f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" + config["spark_config"]["spark_cluster"] = spark_cluster + config["online_store"]["redis"]["host"] = f"{resource_prefix}redis.redis.cache.windows.net" + + # Set platform specific configurations + if spark_cluster == "local": + _set_local_spark_config() + elif spark_cluster == "azure_synapse": + _set_azure_synapse_config( + config=config, + resource_prefix=resource_prefix, + project_name=project_name, + cluster_name=cluster_name, + use_env_vars=use_env_vars, + ) + elif spark_cluster == "databricks": + _set_databricks_config( + config=config, + project_name=project_name, + workspace_url=databricks_url, + cluster_name=cluster_name, + use_env_vars=use_env_vars, + ) if not output_filepath: output_filepath = NamedTemporaryFile(mode="w", delete=False).name - with open(output_filepath, "w") as conf_file: - conf_file.write(conf_str) + with open(output_filepath, "w") as f: + yaml.dump(config, f, default_flow_style=False) return output_filepath + + +def _set_local_spark_config(): + """Set environment variables for local spark cluster.""" + os.environ["SPARK_LOCAL_IP"] = os.getenv( + "SPARK_LOCAL_IP", + "127.0.0.1", + ) + + +def _set_azure_synapse_config( + config: Dict, + resource_prefix: str, + project_name: str, + cluster_name: str = None, + use_env_vars: bool = True, +): + """Set environment variables for Azure Synapse spark cluster. + One may need to set ADLS_KEY""" + + dev_url = f"https://{resource_prefix}syws.dev.azuresynapse.net" + workspace_dir = f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + + if use_env_vars: + dev_url = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL", dev_url) + cluster_name = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME", cluster_name) + workspace_dir = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR", workspace_dir) + + if not cluster_name: + raise ValueError("Azure Synapse spark pool name is not provided.") + + config["spark_config"]["azure_synapse"] = { + "dev_url": dev_url, + "pool_name": cluster_name, + "workspace_dir": workspace_dir, + **DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG, + } + + +def _set_databricks_config( + config: Dict, + project_name: str, + workspace_url: str, + cluster_name: str = None, + use_env_vars: bool = True, +): + if is_databricks(): + # If this functions is being called in Databricks, we may use the context to override the provided arguments. 
+ ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext() + workspace_url = "https://" + ctx.tags().get("browserHostName").get() + workspace_token = ctx.apiToken().get() + else: + workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", None) + + work_dir = f"dbfs:/{project_name}" + databricks_config = { + "run_name": "FEATHR_FILL_IN", + "libraries": [{"jar": "FEATHR_FILL_IN"}], + "spark_jar_task": { + "main_class_name": "FEATHR_FILL_IN", + "parameters": ["FEATHR_FILL_IN"], + }, + } + if cluster_name is None: + databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG + else: + databricks_config["existing_cluster_id"] = cluster_name + config_template = json.dumps(databricks_config) + + if use_env_vars: + work_dir = os.getenv("SPARK_CONFIG__DATABRICKS__WORK_DIR", work_dir) + workspace_url = os.getenv("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL", workspace_url) + workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", workspace_token) + config_template = os.getenv("SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE", config_template) + + if not workspace_url: + raise ValueError("Databricks workspace url is not provided.") + + if not workspace_token: + raise ValueError("Databricks workspace token is not provided.") + + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = workspace_token + config["spark_config"]["databricks"] = { + "work_dir": work_dir, + "workspace_instance_url": workspace_url, + "config_template": config_template, + } diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 12f27c2cb..6f5814e43 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -101,9 +101,10 @@ def get_result_df( elif client.spark_runtime == "databricks": if not res_url.startswith("dbfs:"): - raise ValueError( - f"In Databricks, the result files are expected to be stored at a DBFS storage but res_url = {res_url}." + logger.warning( + f"In Databricks, the result files are expected to be stored in DBFS, but the res_url {res_url} is not a dbfs path. Prefixing it with 'dbfs:/'" ) + res_url = f"dbfs:/{res_url.lstrip('/')}" if is_databricks(): # Check if the function is being called from Databricks if local_cache_path is not None: @@ -111,12 +112,9 @@ def get_result_df( "Result files are already in DBFS and thus `local_cache_path` will be ignored." ) local_cache_path = res_url - elif local_cache_path is None: # Download the result from dbfs to local - local_cache_path = TemporaryDirectory().name - else: - logger.warning("This utility function currently supports local spark and databricks. 
You may encounter unexpected results on other platforms.") - # TODO elif azure_synapse + if local_cache_path is None: + local_cache_path = TemporaryDirectory().name if local_cache_path != res_url: logger.info(f"{res_url} files will be downloaded into {local_cache_path}") diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index 5b7b2fc11..338a0eed3 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -11,7 +11,7 @@ multi_line_output = 3 [tool.pytest.ini_options] markers = [ - "notebooks: tests Jupyter notebooks", + "notebooks: tests Jupyter notebooks" ] [build-system] diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index b8ee3f345..52b10cf89 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -5,6 +5,21 @@ from feathr import FeathrClient +def pytest_addoption(parser): + """Pytest command line argument options. + E.g. + `python -m pytest feathr_project/test/ --resource-prefix your_feathr_resource_prefix` + """ + parser.addoption( + "--resource-prefix", action="store", default="feathrazuretest3", help="Test Azure resource prefix" + ) + + +@pytest.fixture +def resource_prefix(request): + return request.config.getoption("--resource-prefix") + + @pytest.fixture(scope="session") def workspace_dir() -> str: """Workspace directory path containing data files and configs for testing.""" diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py index 778b157d7..f87cbff2e 100644 --- a/feathr_project/test/samples/test_notebooks.py +++ b/feathr_project/test/samples/test_notebooks.py @@ -23,22 +23,25 @@ @pytest.mark.notebooks -def test__nyc_taxi_demo(tmp_path): +def test__nyc_taxi_demo(resource_prefix, tmp_path): notebook_name = "nyc_taxi_demo" output_tmpdir = TemporaryDirectory() output_notebook_path = str(tmp_path.joinpath(f"{notebook_name}.ipynb")) + print(f"Running {notebook_name} notebook as {output_notebook_path}") + pm.execute_notebook( input_path=NOTEBOOK_PATHS[notebook_name], output_path=output_notebook_path, # kernel_name="python3", parameters=dict( - RESOURCE_PREFIX="feathrazuretest3", # Use the test resource group + RESOURCE_PREFIX=resource_prefix, PROJECT_NAME=notebook_name, DATA_STORE_PATH=output_tmpdir.name, SPARK_CLUSTER="local", USE_CLI_AUTH=False, + REGISTER_FEATURES=False, SCRAP_RESULTS=True, ), ) @@ -47,10 +50,7 @@ def test__nyc_taxi_demo(tmp_path): nb = sb.read_notebook(output_notebook_path) outputs = nb.scraps - assert outputs["materialized_feature_values"].data["239"] == pytest.approx([5707., 1480.], abs=1.) - assert outputs["materialized_feature_values"].data["265"] == pytest.approx([10000., 4160.], abs=1.) + assert outputs["materialized_feature_values"].data["239"] == pytest.approx([1480., 5707.], abs=1.) + assert outputs["materialized_feature_values"].data["265"] == pytest.approx([4160., 10000.], abs=1.) assert outputs["rmse"].data == pytest.approx(5., abs=2.) assert outputs["mae"].data == pytest.approx(2., abs=1.) 
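For context on what the scrap assertions above are reading: papermill executes the notebook with the injected parameters, and the notebook is expected to record its results with scrapbook so the test can load them back from the output notebook. A rough sketch of the notebook side, assuming the variable names used in the assertions:

    import scrapbook as sb

    # Inside the sample notebook, typically guarded by the SCRAP_RESULTS parameter:
    if SCRAP_RESULTS:
        sb.glue("materialized_feature_values", materialized_feature_values)  # dict keyed by entity id
        sb.glue("rmse", rmse)  # float metric
        sb.glue("mae", mae)    # float metric

    # The test then reads the recorded values back:
    #   nb = sb.read_notebook(output_notebook_path)
    #   nb.scraps["rmse"].data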
- - # clean up - output_tmpdir.cleanup() diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 52adcae39..7b3395bc9 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,36 +1,38 @@ +from copy import deepcopy +import os from pathlib import Path +import yaml import pytest -from feathr.utils.config import FEATHR_CONFIG_TEMPLATE, generate_config - - -@pytest.fixture(scope="session") -def feathr_config_str() -> str: - return FEATHR_CONFIG_TEMPLATE.format( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", - ) +from feathr import FeathrClient +from feathr.utils.config import generate_config @pytest.mark.parametrize( "output_filepath", [None, "config.yml"], ) -def test__generate_config( +def test__generate_config__output_filepath( output_filepath: str, - feathr_config_str: str, tmp_path: Path, ): + resource_prefix = "test_prefix" + project_name = "test_project" + spark_cluster = "local" + # Use tmp_path so that the test files get cleaned up after the tests if output_filepath: output_filepath = str(tmp_path / output_filepath) + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + config_filepath = generate_config( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", + resource_prefix=resource_prefix, + project_name=project_name, + spark_cluster=spark_cluster, output_filepath=output_filepath, + use_env_vars=False, ) # Assert if the config file was generated in the specified output path. @@ -39,4 +41,67 @@ def test__generate_config( # Assert the generated config string is correct. with open(config_filepath, "r") as f: - assert feathr_config_str == f.read() + config = yaml.safe_load(f) + + assert config["project_config"]["project_name"] == project_name + assert config["feature_registry"]["api_endpoint"] == f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" + assert config["spark_config"]["spark_cluster"] == spark_cluster + assert config["online_store"]["redis"]["host"] == f"{resource_prefix}redis.redis.cache.windows.net" + + +@pytest.mark.parametrize( + "spark_cluster,cluster_name,databricks_url", + [ + ("local", None, None), + ("databricks", None, "https://test_url"), + ("azure_synapse", "some_spark_pool", None), + ] +) +def test__generate_config__spark_cluster( + spark_cluster: str, + cluster_name: str, + databricks_url: str, +): + """Test if spark cluster specific configs are generated without errors. + TODO - For now, this test doesn't check if the config values are correct. 
+ """ + + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster=spark_cluster, + cluster_name=cluster_name, + databricks_url=databricks_url, + use_env_vars=False, + ) + + +@pytest.mark.parametrize( + "spark_cluster,cluster_name,databricks_url", + [ + ("databricks", "some_cluster_id", None), + ("azure_synapse", None, "https://test_url"), + ] +) +def test__generate_config__exceptions( + spark_cluster: str, + cluster_name: str, + databricks_url: str, +): + """Test if exceptions are raised when databricks url and token are not provided.""" + + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + + with pytest.raises(ValueError): + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster=spark_cluster, + cluster_name=cluster_name, + databricks_url=databricks_url, + use_env_vars=False, + ) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 1e005855e..136bc3545 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -91,10 +91,6 @@ def test__get_result_df__with_local_cache_path( # Test ValueError when res_url is None (False, "local", None, ValueError), (True, "databricks", None, ValueError), - # Test ValueError when res_url is not a dbfs path but client.spark_runtime is databricks - (False, "databricks", "some_local_path", ValueError), - # Test ValueError when res_url does not exists or not able to access. - (False, "local", "some_doesnt_exist_path", Exception), ] ) def test__get_result_df__exceptions( @@ -113,6 +109,9 @@ def test__get_result_df__exceptions( # Mock is_data_bricks mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + # Mock _load_files_to_pandas_df + mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + with pytest.raises(expected_error): get_result_df(client) From 1f3894a0db20f4240d9ff40630b8cea4a83563a2 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Tue, 8 Nov 2022 04:11:53 +0000 Subject: [PATCH 14/18] Fix generate_config to accept all the feathr env var config name Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 15 +- docs/samples/nyc_taxi_demo.ipynb | 8 +- feathr_project/feathr/utils/config.py | 241 +++++++++++------- feathr_project/test/unit/utils/test_config.py | 42 ++- 4 files changed, 181 insertions(+), 125 deletions(-) diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 4dc58eaf7..e562ec5db 100644 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -240,6 +240,15 @@ "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -256,9 +265,11 @@ "config_path = generate_config(\n", " resource_prefix=RESOURCE_PREFIX,\n", " project_name=PROJECT_NAME,\n", - " spark_cluster=SPARK_CLUSTER,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", " # You may set an existing cluster id here, but Databricks recommend to use new clusters for greater reliability.\n", - " cluster_name=None, # Set None to create a new job cluster\n", + " databricks_cluster_id=None, # Set None to create a new job cluster\n", + " databricks_workspace_token_value=ctx.apiToken().get(),\n", + " spark_config__databricks__workspace_instance_url=f\"https://{ctx.tags().get('browserHostName').get()}\",\n", ")\n", "\n", "with open(config_path, \"r\") as f:\n", diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 06b5cb340..10e189251 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -154,9 +154,6 @@ "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", "\n", - "# If \"azure_synapse\":\n", - "AZURE_SYNAPSE_SPARK_POOL = None # Set Synapse spark pool name to use an existing cluster\n", - "\n", "# If \"databricks\":\n", "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", @@ -261,9 +258,8 @@ " resource_prefix=RESOURCE_PREFIX,\n", " project_name=PROJECT_NAME,\n", " spark_cluster=SPARK_CLUSTER,\n", - " # cluster name will be ignored in \"local\" spark.\n", - " cluster_name=AZURE_SYNAPSE_SPARK_POOL if SPARK_CLUSTER == \"azure_synapse\" else DATABRICKS_CLUSTER_ID,\n", - " databricks_url=DATABRICKS_URL,\n", + " databricks_cluster_id=DATABRICKS_CLUSTER_ID if SPARK_CLUSTER == \"databricks\" else None,\n", + " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", ")\n", "\n", "with open(config_path, 'r') as f: \n", diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 47ac84679..27b41e8c7 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -1,3 +1,4 @@ +import collections.abc from copy import deepcopy import os import json @@ -13,7 +14,7 @@ "project_config": {}, # "project_name" "feature_registry": {}, # "api_endpoint" "spark_config": { - # "spark_cluster". Currently support 'azure_synapse', 'databricks', and 'local' + "spark_cluster": "local", # Currently support 'azure_synapse', 'databricks', and 'local' "spark_result_output_parts": "1", }, "offline_store": { @@ -47,72 +48,97 @@ DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { "executor_size": "Small", "executor_num": 2, + "pool_name": "spark3", } def generate_config( resource_prefix: str, project_name: str, - spark_cluster: str, - cluster_name: str = None, - databricks_url: str = None, output_filepath: str = None, + databricks_workspace_token_value: str = None, + databricks_cluster_id: str = None, + redis_password: str = None, + adls_key: str = None, use_env_vars: bool = True, + **kwargs, ) -> str: - """Generate a feathr config yaml file. Note, if environment variables are set, they will be used instead of the - provided arguments. + """Generate a feathr config yaml file. 
+ Note, `use_env_vars` argument gives an option to either use environment variables for generating the config file + or not. Feathr client will use environment variables anyway if they are set. - Some credential variables are intentionally not included in the argument and the outut config file - to avoid leaking secrets. E.g. DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD. - Those values should be passed via the environment variables regardless of the `use_env_vars` flag. + Keyword arguments follow the same naming convention as the feathr config. E.g. to set Databricks as the target + cluster, use `spark_config__spark_cluster="databricks"`. + See https://feathr-ai.github.io/feathr/quickstart_synapse.html#step-4-update-feathr-config for more details. Note: This utility function assumes Azure resources are deployed using the Azure Resource Manager (ARM) template, and infers resource names based on the given `resource_prefix`. If you deploy resources manually, you may need - to create the config file manually. + to pass each resource url manually, e.g. `spark_config__azure_synapse__dev_url="your-resource-url"`. Args: - resource_prefix: Resource name prefix. - project_name: Project name. - spark_cluster: Spark cluster to use. Either 'local', 'databricks', or 'azure_synapse'. - cluster_name (optional): Synapse spark pool name or Databricks cluster id if applicable. - If not provided, a new (job) cluster will be created and used. - databricks_url (optional): Databricks workspace url if applicable. + resource_prefix: Resource name prefix used when deploying Feathr resources by using ARM template. + project_name: Feathr project name. + cluster_name (optional): Databricks cluster or Azure Synapse spark pool name to use an existing one. output_filepath (optional): Output filepath. use_env_vars (optional): Whether to use environment variables if they are set. + databricks_workspace_token_value (optional): Databricks workspace token. If provided, the value will be stored + as the environment variable. + databricks_cluster_id (optional): Databricks cluster id to use an existing cluster. + redis_password (optional): Redis password. If provided, the value will be stored as the environment variable. + adls_key (optional): ADLS key. If provided, the value will be stored as the environment variable. Returns: - str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path. + str: Generated config file path. This will be identical to `output_filepath` if provided. 
""" - if use_env_vars: - spark_cluster = os.getenv("SPARK_CONFIG__SPARK_CLUSTER", spark_cluster) - + # Set keys + if databricks_workspace_token_value: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = databricks_workspace_token_value + if redis_password: + os.environ["REDIS_PASSWORD"] = redis_password + if adls_key: + os.environ["ADLS_KEY"] = adls_key + + # Set configs config = deepcopy(DEFAULT_FEATHR_CONFIG) config["project_config"]["project_name"] = project_name config["feature_registry"]["api_endpoint"] = f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" - config["spark_config"]["spark_cluster"] = spark_cluster config["online_store"]["redis"]["host"] = f"{resource_prefix}redis.redis.cache.windows.net" + # Update configs using kwargs + new_config = _config_kwargs_to_dict(**kwargs) + _update_config(config, new_config) + # Set platform specific configurations - if spark_cluster == "local": + if config["spark_config"]["spark_cluster"] == "local": _set_local_spark_config() - elif spark_cluster == "azure_synapse": + elif config["spark_config"]["spark_cluster"] == "azure_synapse": _set_azure_synapse_config( config=config, resource_prefix=resource_prefix, project_name=project_name, - cluster_name=cluster_name, - use_env_vars=use_env_vars, ) - elif spark_cluster == "databricks": + elif config["spark_config"]["spark_cluster"] == "databricks": _set_databricks_config( config=config, project_name=project_name, - workspace_url=databricks_url, - cluster_name=cluster_name, - use_env_vars=use_env_vars, + cluster_id=databricks_cluster_id, ) + # Maybe update configs with environment variables + if use_env_vars: + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__SPARK_CLUSTER") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORK_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE") + + # Verify config + _verify_config(config) + + # Write config to file if not output_filepath: output_filepath = NamedTemporaryFile(mode="w", delete=False).name @@ -134,76 +160,107 @@ def _set_azure_synapse_config( config: Dict, resource_prefix: str, project_name: str, - cluster_name: str = None, - use_env_vars: bool = True, ): - """Set environment variables for Azure Synapse spark cluster. 
- One may need to set ADLS_KEY""" + """Set environment variables for Azure Synapse spark cluster.""" - dev_url = f"https://{resource_prefix}syws.dev.azuresynapse.net" - workspace_dir = f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + if "azure_synapse" not in config["spark_config"]: + config["spark_config"]["azure_synapse"] = dict() - if use_env_vars: - dev_url = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL", dev_url) - cluster_name = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME", cluster_name) - workspace_dir = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR", workspace_dir) - - if not cluster_name: - raise ValueError("Azure Synapse spark pool name is not provided.") - - config["spark_config"]["azure_synapse"] = { - "dev_url": dev_url, - "pool_name": cluster_name, - "workspace_dir": workspace_dir, - **DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG, - } + if "dev_url" not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"]["dev_url"] = f"https://{resource_prefix}syws.dev.azuresynapse.net" + + if "workspace_dir" not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"]["workspace_dir"] =\ + f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + + for k, v in DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG.items(): + if k not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"][k] = v def _set_databricks_config( config: Dict, project_name: str, - workspace_url: str, - cluster_name: str = None, - use_env_vars: bool = True, + cluster_id: str = None, ): - if is_databricks(): - # If this functions is being called in Databricks, we may use the context to override the provided arguments. 
- ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext() - workspace_url = "https://" + ctx.tags().get("browserHostName").get() - workspace_token = ctx.apiToken().get() - else: - workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", None) - - work_dir = f"dbfs:/{project_name}" - databricks_config = { - "run_name": "FEATHR_FILL_IN", - "libraries": [{"jar": "FEATHR_FILL_IN"}], - "spark_jar_task": { - "main_class_name": "FEATHR_FILL_IN", - "parameters": ["FEATHR_FILL_IN"], - }, - } - if cluster_name is None: - databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG - else: - databricks_config["existing_cluster_id"] = cluster_name - config_template = json.dumps(databricks_config) + if "databricks" not in config["spark_config"]: + config["spark_config"]["databricks"] = dict() + + if "work_dir" not in config["spark_config"]["databricks"]: + config["spark_config"]["databricks"]["work_dir"] = f"dbfs:/{project_name}" + + if "config_template" not in config["spark_config"]["databricks"]: + databricks_config = { + "run_name": "FEATHR_FILL_IN", + "libraries": [{"jar": "FEATHR_FILL_IN"}], + "spark_jar_task": { + "main_class_name": "FEATHR_FILL_IN", + "parameters": ["FEATHR_FILL_IN"], + }, + } + if cluster_id is None: + databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG + else: + databricks_config["existing_cluster_id"] = cluster_id - if use_env_vars: - work_dir = os.getenv("SPARK_CONFIG__DATABRICKS__WORK_DIR", work_dir) - workspace_url = os.getenv("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL", workspace_url) - workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", workspace_token) - config_template = os.getenv("SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE", config_template) - - if not workspace_url: - raise ValueError("Databricks workspace url is not provided.") - - if not workspace_token: - raise ValueError("Databricks workspace token is not provided.") - - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = workspace_token - config["spark_config"]["databricks"] = { - "work_dir": work_dir, - "workspace_instance_url": workspace_url, - "config_template": config_template, - } + config["spark_config"]["databricks"]["config_template"] = json.dumps(databricks_config) + + +def _config_kwargs_to_dict(**kwargs) -> Dict: + """Parse config's keyword arguments to dictionary. + e.g. `spark_config__spark_cluster="local"` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. 
+ """ + config = dict() + + for conf_key, conf_value in kwargs.items(): + if conf_value is None: + continue + + conf = config + keys = conf_key.split("__") + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + conf[keys[-1]] = conf_value + + return config + + +def _update_config(config: Dict, new_config: Dict): + """Update config dictionary with the values in `new_config`.""" + for k, v in new_config.items(): + if k in config and isinstance(v, collections.abc.Mapping): + _update_config(config[k], v) + else: + config[k] = v + + +def _verify_config(config: Dict): + """Verify config.""" + if config["spark_config"]["spark_cluster"] == "azure_synapse": + if "ADLS_KEY" not in os.environ: + raise ValueError("ADLS_KEY must be set in environment variables") + + elif config["spark_config"]["spark_cluster"] == "databricks": + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + raise ValueError("Databricks workspace token is not provided.") + elif "workspace_instance_url" not in config["spark_config"]["databricks"]: + raise ValueError("Databricks workspace url is not provided.") + + +def _maybe_update_config_with_env_var(config: Dict, env_var_name: str): + """Update config dictionary with the values in environment variables. + e.g. `SPARK_CONFIG__SPARK_CLUSTER` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. + """ + if env_var_name not in os.environ: + return + + keys = env_var_name.lower().split("__") + conf = config + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + + conf[keys[-1]] = os.environ[env_var_name] diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 7b3395bc9..6a5119981 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -18,19 +18,14 @@ def test__generate_config__output_filepath( ): resource_prefix = "test_prefix" project_name = "test_project" - spark_cluster = "local" # Use tmp_path so that the test files get cleaned up after the tests if output_filepath: output_filepath = str(tmp_path / output_filepath) - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" - config_filepath = generate_config( resource_prefix=resource_prefix, project_name=project_name, - spark_cluster=spark_cluster, output_filepath=output_filepath, use_env_vars=False, ) @@ -45,63 +40,60 @@ def test__generate_config__output_filepath( assert config["project_config"]["project_name"] == project_name assert config["feature_registry"]["api_endpoint"] == f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" - assert config["spark_config"]["spark_cluster"] == spark_cluster + assert config["spark_config"]["spark_cluster"] == "local" assert config["online_store"]["redis"]["host"] == f"{resource_prefix}redis.redis.cache.windows.net" @pytest.mark.parametrize( - "spark_cluster,cluster_name,databricks_url", + "spark_cluster,env_key,databricks_url", [ ("local", None, None), - ("databricks", None, "https://test_url"), - ("azure_synapse", "some_spark_pool", None), + ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", "https://test_url"), + ("azure_synapse", "ADLS_KEY", None), ] ) def test__generate_config__spark_cluster( spark_cluster: str, - cluster_name: str, + env_key: str, databricks_url: str, ): """Test if spark cluster specific configs are generated without errors. - TODO - For now, this test doesn't check if the config values are correct. 
+ TODO - For now, this test doesn't check if the config values are correctly working with the actual Feathr client. """ - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + if env_key and env_key not in os.environ: + os.environ[env_key] = "test_value" generate_config( resource_prefix="test_prefix", project_name="test_project", - spark_cluster=spark_cluster, - cluster_name=cluster_name, - databricks_url=databricks_url, + spark_config__spark_cluster=spark_cluster, + spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, ) @pytest.mark.parametrize( - "spark_cluster,cluster_name,databricks_url", + "spark_cluster,env_key,databricks_url", [ - ("databricks", "some_cluster_id", None), - ("azure_synapse", None, "https://test_url"), + ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", None), ] ) def test__generate_config__exceptions( spark_cluster: str, - cluster_name: str, + env_key: str, databricks_url: str, ): """Test if exceptions are raised when databricks url and token are not provided.""" - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + if env_key and env_key not in os.environ: + os.environ[env_key] = "test_value" with pytest.raises(ValueError): generate_config( resource_prefix="test_prefix", project_name="test_project", - spark_cluster=spark_cluster, - cluster_name=cluster_name, - databricks_url=databricks_url, + spark_config__spark_cluster=spark_cluster, + spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, ) From 8a610ac926239cb708484db999e569ebf18dcbab Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Wed, 9 Nov 2022 07:01:24 +0000 Subject: [PATCH 15/18] Add more pytests Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 3 +- docs/samples/nyc_taxi_demo.ipynb | 54 +++++-- feathr_project/feathr/utils/config.py | 42 ++++-- feathr_project/feathr/utils/job_utils.py | 11 +- feathr_project/test/conftest.py | 9 +- feathr_project/test/samples/test_notebooks.py | 6 +- feathr_project/test/unit/utils/test_config.py | 135 ++++++++++++++---- .../test/unit/utils/test_job_utils.py | 50 +++++-- 8 files changed, 233 insertions(+), 77 deletions(-) diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index e562ec5db..65e305e8f 100644 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -113,8 +113,7 @@ }, "outputs": [], "source": [ - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", + "from datetime import timedelta\n", "import os\n", "from pathlib import Path\n", "\n", diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 10e189251..4cb6b5b4f 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -154,13 +154,20 @@ "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", "\n", - "# If \"databricks\":\n", + "# TODO fill values to use databricks cluster:\n", "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", "\n", + "# TODO fill values to use Azure Synapse 
cluster:\n", + "AZURE_SYNAPSE_SPARK_POOL = None # Set Azure Synapse Spark pool name\n", + "AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n", + "\n", "# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n", "DATA_STORE_PATH = TemporaryDirectory().name\n", "\n", + "# Feathr config file path to use an existing file\n", + "FEATHR_CONFIG_PATH = None\n", + "\n", "# If set True, use an interactive browser authentication to get the redis password.\n", "USE_CLI_AUTH = False\n", "\n", @@ -182,7 +189,27 @@ "\n", "`os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = your-token`\n", "\n", - "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context." + "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context.\n", + "\n", + "On the other hand, to use Azure Synapse cluster, you have to specify the synapse workspace storage key:\n", + "\n", + "`export ADLS_KEY=your-key`\n", + "\n", + "or in the notebook cell,\n", + "\n", + "`os.environ[\"ADLS_KEY\"] = your-key`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SPARK_CLUSTER == \"azure_synapse\" and not os.environ.get(\"ADLS_KEY\"):\n", + " os.environ[\"ADLS_KEY\"] = add_your_key_here\n", + "elif SPARK_CLUSTER == \"databricks\" and not os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"):\n", + " os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = add_your_token_here" ] }, { @@ -254,13 +281,18 @@ }, "outputs": [], "source": [ - "config_path = generate_config(\n", - " resource_prefix=RESOURCE_PREFIX,\n", - " project_name=PROJECT_NAME,\n", - " spark_cluster=SPARK_CLUSTER,\n", - " databricks_cluster_id=DATABRICKS_CLUSTER_ID if SPARK_CLUSTER == \"databricks\" else None,\n", - " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", - ")\n", + "if FEATHR_CONFIG_PATH:\n", + " config_path = FEATHR_CONFIG_PATH\n", + "else:\n", + " config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", + " spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,\n", + " spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,\n", + " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", + " databricks_cluster_id=DATABRICKS_CLUSTER_ID,\n", + " )\n", "\n", "with open(config_path, 'r') as f: \n", " print(f.read())" @@ -334,7 +366,7 @@ " .getOrCreate()\n", " )\n", " \n", - "# Else, you must already have a spark session object available in databricks or synapse." + "# Else, you must already have a spark session object available in databricks or synapse notebooks." 
] }, { @@ -476,7 +508,7 @@ " # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n", " data_source_path = DATA_FILE_PATH\n", "else:\n", - " # Otherwise, upload the local file to dbfs.\n", + " # Otherwise, upload the local file to the cloud storage (either dbfs or adls).\n", " data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) " ] }, diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 27b41e8c7..9a5f5fd89 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -48,7 +48,6 @@ DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { "executor_size": "Small", "executor_num": 2, - "pool_name": "spark3", } @@ -161,20 +160,19 @@ def _set_azure_synapse_config( resource_prefix: str, project_name: str, ): - """Set environment variables for Azure Synapse spark cluster.""" + """Set configs for Azure Synapse spark cluster.""" - if "azure_synapse" not in config["spark_config"]: - config["spark_config"]["azure_synapse"] = dict() + config["spark_config"]["azure_synapse"] = config["spark_config"].get("azure_synapse", {}) - if "dev_url" not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get("dev_url"): config["spark_config"]["azure_synapse"]["dev_url"] = f"https://{resource_prefix}syws.dev.azuresynapse.net" - if "workspace_dir" not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get("workspace_dir"): config["spark_config"]["azure_synapse"]["workspace_dir"] =\ f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" for k, v in DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG.items(): - if k not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get(k): config["spark_config"]["azure_synapse"][k] = v @@ -183,13 +181,14 @@ def _set_databricks_config( project_name: str, cluster_id: str = None, ): - if "databricks" not in config["spark_config"]: - config["spark_config"]["databricks"] = dict() + """Set configs for Databricks spark cluster.""" - if "work_dir" not in config["spark_config"]["databricks"]: + config["spark_config"]["databricks"] = config["spark_config"].get("databricks", {}) + + if not config["spark_config"]["databricks"].get("work_dir"): config["spark_config"]["databricks"]["work_dir"] = f"dbfs:/{project_name}" - if "config_template" not in config["spark_config"]["databricks"]: + if not config["spark_config"]["databricks"].get("config_template"): databricks_config = { "run_name": "FEATHR_FILL_IN", "libraries": [{"jar": "FEATHR_FILL_IN"}], @@ -239,13 +238,26 @@ def _update_config(config: Dict, new_config: Dict): def _verify_config(config: Dict): """Verify config.""" if config["spark_config"]["spark_cluster"] == "azure_synapse": - if "ADLS_KEY" not in os.environ: + if not os.environ.get("ADLS_KEY"): raise ValueError("ADLS_KEY must be set in environment variables") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") and + config["spark_config"]["azure_synapse"].get("dev_url") is None + ): + raise ValueError("Azure Synapse dev endpoint is not provided.") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") and + config["spark_config"]["azure_synapse"].get("pool_name") is None + ): + raise ValueError("Azure Synapse pool name is not provided.") elif config["spark_config"]["spark_cluster"] == "databricks": - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + if 
not os.environ.get("DATABRICKS_WORKSPACE_TOKEN_VALUE"): raise ValueError("Databricks workspace token is not provided.") - elif "workspace_instance_url" not in config["spark_config"]["databricks"]: + elif ( + not os.environ.get("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") and + config["spark_config"]["databricks"].get("workspace_instance_url") is None + ): raise ValueError("Databricks workspace url is not provided.") @@ -253,7 +265,7 @@ def _maybe_update_config_with_env_var(config: Dict, env_var_name: str): """Update config dictionary with the values in environment variables. e.g. `SPARK_CONFIG__SPARK_CLUSTER` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. """ - if env_var_name not in os.environ: + if not os.environ.get(env_var_name): return keys = env_var_name.lower().split("__") diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 6f5814e43..fbc16b1ff 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -120,11 +120,12 @@ def get_result_df( logger.info(f"{res_url} files will be downloaded into {local_cache_path}") client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) - # use user provided format, if there isn't one, then otherwise use the one provided by the job; - # if none of them is available, "avro" is the default format. - data_format: str = data_format or client.get_job_tags().get(OUTPUT_FORMAT, "") - if data_format is None or data_format == "": - data_format = "avro" + # Use the provided format or one in the job tags. + if data_format is None: + if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): + data_format = client.get_job_tags().get(OUTPUT_FORMAT) + else: + raise ValueError("Cannot determine the data format. 
Please provide the data_format argument.") result_df = None diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index 52b10cf89..c2699e871 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -11,13 +11,16 @@ def pytest_addoption(parser): `python -m pytest feathr_project/test/ --resource-prefix your_feathr_resource_prefix` """ parser.addoption( - "--resource-prefix", action="store", default="feathrazuretest3", help="Test Azure resource prefix" + "--config-path", + action="store", + default=str(Path(__file__).parent.resolve().joinpath("test_user_workspace", "feathr_config.yaml")), + help="Test config path", ) @pytest.fixture -def resource_prefix(request): - return request.config.getoption("--resource-prefix") +def config_path(request): + return request.config.getoption("--config-path") @pytest.fixture(scope="session") diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py index f87cbff2e..c8d1cbefc 100644 --- a/feathr_project/test/samples/test_notebooks.py +++ b/feathr_project/test/samples/test_notebooks.py @@ -23,7 +23,7 @@ @pytest.mark.notebooks -def test__nyc_taxi_demo(resource_prefix, tmp_path): +def test__nyc_taxi_demo(config_path, tmp_path): notebook_name = "nyc_taxi_demo" output_tmpdir = TemporaryDirectory() @@ -36,10 +36,8 @@ def test__nyc_taxi_demo(resource_prefix, tmp_path): output_path=output_notebook_path, # kernel_name="python3", parameters=dict( - RESOURCE_PREFIX=resource_prefix, - PROJECT_NAME=notebook_name, + FEATHR_CONFIG_PATH=config_path, DATA_STORE_PATH=output_tmpdir.name, - SPARK_CLUSTER="local", USE_CLI_AUTH=False, REGISTER_FEATURES=False, SCRAP_RESULTS=True, diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 6a5119981..770980e12 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,11 +1,13 @@ from copy import deepcopy import os from pathlib import Path +from unittest.mock import MagicMock import yaml import pytest +from pytest_mock import MockerFixture -from feathr import FeathrClient +import feathr.utils.config from feathr.utils.config import generate_config @@ -45,55 +47,134 @@ def test__generate_config__output_filepath( @pytest.mark.parametrize( - "spark_cluster,env_key,databricks_url", + "spark_cluster,env_key,kwargs", [ - ("local", None, None), - ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", "https://test_url"), - ("azure_synapse", "ADLS_KEY", None), + ("local", None, dict()), + ( + "databricks", + "DATABRICKS_WORKSPACE_TOKEN_VALUE", + dict(spark_config__databricks__workspace_instance_url="databricks_url"), + ), + ( + "azure_synapse", + "ADLS_KEY", + dict( + spark_config__azure_synapse__dev_url="synapse_url", + spark_config__azure_synapse__pool_name="pool_name", + ), + ), ] ) def test__generate_config__spark_cluster( + mocker: MockerFixture, spark_cluster: str, env_key: str, - databricks_url: str, + kwargs: str, ): """Test if spark cluster specific configs are generated without errors. TODO - For now, this test doesn't check if the config values are correctly working with the actual Feathr client. 
""" - - if env_key and env_key not in os.environ: - os.environ[env_key] = "test_value" + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", {env_key: "some_value"}) generate_config( resource_prefix="test_prefix", project_name="test_project", spark_config__spark_cluster=spark_cluster, - spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, + **kwargs, ) @pytest.mark.parametrize( - "spark_cluster,env_key,databricks_url", + "adls_key,pool_name,expected_error", [ - ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", None), + ("some_key", "some_name", None), + (None, "some_name", ValueError), + ("some_key", None, ValueError), ] ) -def test__generate_config__exceptions( - spark_cluster: str, - env_key: str, - databricks_url: str, +def test__generate_config__azure_synapse_exceptions( + mocker: MockerFixture, + adls_key: str, + pool_name: str, + expected_error: Exception, +): + """Test if exceptions are raised when databricks url and token are not provided.""" + + # Either env vars or argument should yield the same result + for environ in [{"ADLS_KEY": adls_key}, { + "ADLS_KEY": adls_key, + "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME": pool_name, + }]: + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", environ) + + # Test either using env vars or arguments + if "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME" in environ: + kwargs = dict() + else: + kwargs = dict(spark_config__azure_synapse__pool_name=pool_name) + + if expected_error is None: + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="azure_synapse", + **kwargs, + ) + else: + with pytest.raises(ValueError): + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="azure_synapse", + **kwargs, + ) + + +@pytest.mark.parametrize( + "databricks_token,workspace_url,expected_error", + [ + ("some_token", "some_url", None), + (None, "some_url", ValueError), + ("some_token", None, ValueError), + ] +) +def test__generate_config__databricks_exceptions( + mocker: MockerFixture, + databricks_token: str, + workspace_url: str, + expected_error: Exception, ): """Test if exceptions are raised when databricks url and token are not provided.""" - if env_key and env_key not in os.environ: - os.environ[env_key] = "test_value" - - with pytest.raises(ValueError): - generate_config( - resource_prefix="test_prefix", - project_name="test_project", - spark_config__spark_cluster=spark_cluster, - spark_config__databricks__workspace_instance_url=databricks_url, - use_env_vars=False, - ) + # Either env vars or argument should yield the same result + for environ in [{"DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token}, { + "DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token, + "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL": workspace_url, + }]: + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", environ) + + # Test either using env vars or arguments + if "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL" in environ: + kwargs = dict() + else: + kwargs = dict(spark_config__databricks__workspace_instance_url=workspace_url) + + if expected_error is None: + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) + else: + with pytest.raises(ValueError): + 
generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 136bc3545..9f82be66e 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -10,6 +10,7 @@ from pyspark.sql import DataFrame, SparkSession from feathr import FeathrClient +from feathr.constants import OUTPUT_FORMAT, OUTPUT_PATH_TAG from feathr.utils.job_utils import ( get_result_df, get_result_pandas_df, @@ -86,11 +87,24 @@ def test__get_result_df__with_local_cache_path( @pytest.mark.parametrize( - "is_databricks,spark_runtime,res_url,expected_error", [ - (True, "local", None, RuntimeError), # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + "is_databricks,spark_runtime,res_url,data_format,expected_error", [ + # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + (True, "local", "some_url", "some_format", RuntimeError), + (True, "azure_synapse", "some_url", "some_format", RuntimeError), + (True, "databricks", "some_url", "some_format", None), + (False, "local", "some_url", "some_format", None), + (False, "azure_synapse", "some_url", "some_format", None), + (False, "databricks", "some_url", "some_format", None), # Test ValueError when res_url is None - (False, "local", None, ValueError), - (True, "databricks", None, ValueError), + (True, "databricks", None, "some_format", ValueError), + (False, "local", None, "some_format", ValueError), + (False, "azure_synapse", None, "some_format", ValueError), + (False, "databricks", None, "some_format", ValueError), + # Test ValueError when data_format is None + (True, "databricks", "some_url", None, ValueError), + (False, "local", "some_url", None, ValueError), + (False, "azure_synapse", "some_url", None, ValueError), + (False, "databricks", "some_url", None, ValueError), ] ) def test__get_result_df__exceptions( @@ -98,13 +112,10 @@ def test__get_result_df__exceptions( is_databricks: bool, spark_runtime: str, res_url: str, + data_format: str, expected_error: Type[Exception], ): """Test exceptions""" - # Mock client - client = MagicMock() - client.get_job_result_uri = MagicMock(return_value=res_url) - client.spark_runtime = spark_runtime # Mock is_data_bricks mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) @@ -112,8 +123,27 @@ def test__get_result_df__exceptions( # Mock _load_files_to_pandas_df mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") - with pytest.raises(expected_error): - get_result_df(client) + # Either job tags or argument should yield the same result + for job_tag in [None, {OUTPUT_FORMAT: data_format, OUTPUT_PATH_TAG: res_url}]: + # Mock client + client = MagicMock() + client.get_job_result_uri = MagicMock(return_value=res_url) + client.get_job_tags = MagicMock(return_value=job_tag) + client.spark_runtime = spark_runtime + + if expected_error is None: + get_result_df( + client=client, + res_url=None if job_tag else res_url, + data_format=None if job_tag else data_format, + ) + else: + with pytest.raises(expected_error): + get_result_df( + client=client, + res_url=None if job_tag else res_url, + data_format=None if job_tag else data_format, + ) @pytest.mark.parametrize( From 4c50485a725685534e7caa7bf646cb64c4800123 Mon Sep 17 00:00:00 2001 From: Jun Ki Min 
<42475935+loomlike@users.noreply.github.com> Date: Wed, 9 Nov 2022 20:50:49 +0000 Subject: [PATCH 16/18] Use None as default dataformat in the job_utils. Instead, set 'avro' as a default output format to the job tags from the client Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- docs/samples/nyc_taxi_demo.ipynb | 6 ++- feathr_project/feathr/client.py | 2 + feathr_project/feathr/utils/job_utils.py | 42 +++++++++---------- .../test/test_input_output_sources.py | 19 ++++----- 4 files changed, 33 insertions(+), 36 deletions(-) diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 4cb6b5b4f..b80bac374 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -361,11 +361,13 @@ " SparkSession\n", " .builder\n", " .appName(\"feathr\")\n", - " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0\")\n", + " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0,io.delta:delta-core_2.12:2.1.1\")\n", + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n", + " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n", " .config(\"spark.ui.port\", \"8080\") # Set ui port other than the default one (4040) so that feathr spark job doesn't fail. \n", " .getOrCreate()\n", " )\n", - " \n", + "\n", "# Else, you must already have a spark session object available in databricks or synapse notebooks." ] }, diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index cd080f871..741317428 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -487,6 +487,8 @@ def _get_offline_features_with_config(self, # set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations: job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT] + else: + job_tags[OUTPUT_FORMAT] = "avro" ''' - Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself. - Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. They are usually not spark specific (for example if we want to specify the location of the feature files, or want to diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index fbc16b1ff..d9c73c355 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -22,9 +22,9 @@ def get_result_pandas_df( Args: client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. 
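
Note: for local runs the session builder above is what makes both output formats readable back into Spark: the Avro package plus the Delta package with its SQL extension and catalog. The same cell as plain Python (package versions are the ones pinned in the notebook and assume a matching PySpark 3.3 installation):

from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("feathr")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.3.0,io.delta:delta-core_2.12:2.1.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.ui.port", "8080")  # use a port other than the default 4040 so the feathr spark job doesn't fail
    .getOrCreate()
)
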
+ the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download path. if the user does not provide this, the function will create a temporary directory. @@ -47,9 +47,9 @@ def get_result_spark_df( spark: Spark session client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download path. if the user does not provide this, the function will create a temporary directory. @@ -71,9 +71,9 @@ def get_result_df( Args: client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download directory. if the user does not provide this, the function will create a temporary directory. spark (optional): Spark session. If provided, the function returns spark Dataframe. @@ -82,9 +82,22 @@ def get_result_df( Returns: Either Spark or pandas DataFrame. """ + if data_format is None: + # May use data format from the job tags + if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): + data_format = client.get_job_tags().get(OUTPUT_FORMAT) + else: + raise ValueError("Cannot determine the data format. Please provide the data_format argument.") + + data_format = data_format.lower() + if is_databricks() and client.spark_runtime != "databricks": raise RuntimeError(f"The function is called from Databricks but the client.spark_runtime is {client.spark_runtime}.") + # TODO Loading Synapse Delta table result into pandas has a bug: https://github.com/delta-io/delta-rs/issues/582 + if not spark and client.spark_runtime == "azure_synapse" and data_format == "delta": + raise RuntimeError(f"Loading Delta table result from Azure Synapse into pandas DataFrame is not supported. You maybe able to use spark DataFrame to load the result instead.") + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: @@ -120,15 +133,7 @@ def get_result_df( logger.info(f"{res_url} files will be downloaded into {local_cache_path}") client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) - # Use the provided format or one in the job tags. - if data_format is None: - if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): - data_format = client.get_job_tags().get(OUTPUT_FORMAT) - else: - raise ValueError("Cannot determine the data format. 
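
Note: because of the delta-rs limitation referenced above (https://github.com/delta-io/delta-rs/issues/582), a Delta-format result produced on Azure Synapse can no longer be loaded into pandas and now fails fast with RuntimeError; passing a Spark session avoids the pandas path entirely. A usage sketch, assuming `client` is a FeathrClient configured for the azure_synapse runtime, the job has finished, and `spark` has the Delta extension configured as shown earlier:

from feathr.utils.job_utils import get_result_df

# Pandas path: raises RuntimeError for azure_synapse + delta.
# df = get_result_df(client=client, data_format="delta")

# Spark path: supported; the result URL is resolved from the finished job.
sdf = get_result_df(client=client, data_format="delta", spark=spark)
sdf.show(5)
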
Please provide the data_format argument.") - result_df = None - try: if spark is not None: if data_format == "csv": @@ -154,17 +159,8 @@ def _load_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.Dat elif data_format == "delta": from deltalake import DeltaTable - delta = DeltaTable(dir_path) - # if client.spark_runtime != "azure_synapse": - # don't detect for synapse result with Delta as there's a problem with underlying system - # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 return delta.to_pyarrow_table().to_pandas() - # else: - # TODO -- Proper warning messages. Is this applied to all the other formats? - # raise RuntimeError( - # "Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse." - # ) elif data_format == "avro": import pandavro as pdx diff --git a/feathr_project/test/test_input_output_sources.py b/feathr_project/test/test_input_output_sources.py index f4af85678..ba4b3921a 100644 --- a/feathr_project/test/test_input_output_sources.py +++ b/feathr_project/test/test_input_output_sources.py @@ -10,6 +10,7 @@ from test_fixture import basic_test_setup from test_utils.constants import Constants + # test parquet file read/write without an extension name def test_feathr_get_offline_features_with_parquet(): """ @@ -38,7 +39,7 @@ def test_feathr_get_offline_features_with_parquet(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), ".parquet"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -47,14 +48,12 @@ def test_feathr_get_offline_features_with_parquet(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # download result and just assert the returned result is not empty res_df = get_result_df(client) assert res_df.shape[0] > 0 - - # test delta lake read/write without an extension name def test_feathr_get_offline_features_with_delta_lake(): """ @@ -83,7 +82,7 @@ def test_feathr_get_offline_features_with_delta_lake(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), "_deltalake"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -92,15 +91,13 @@ def test_feathr_get_offline_features_with_delta_lake(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # wait for a few secs for the resource to come up in the databricks API time.sleep(5) - # download result and just assert the returned result is not empty - res_df = get_result_df(client) - + # download result and just assert the returned result is not empty + # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 result_format: str = client.get_job_tags().get(OUTPUT_FORMAT, "") if not (client.spark_runtime == 'azure_synapse' and result_format == 'delta'): - # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 + res_df = get_result_df(client) assert res_df.shape[0] > 0 - From c049958b18910f9fb6e32d58087b8b1bb0704893 
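
Note: a condensed sketch of the per-format loading that _load_files_to_pandas_df performs after the download step. The delta branch matches the patch; the parquet, avro, and csv branches are simplified here to a single file (or a pyarrow-readable directory) rather than the full set of downloaded part files, and the helper name is illustrative:

import pandas as pd


def load_downloaded_result(dir_path: str, data_format: str = "avro") -> pd.DataFrame:
    data_format = data_format.lower()
    if data_format == "parquet":
        return pd.read_parquet(dir_path)  # pyarrow can read a directory of part files
    elif data_format == "delta":
        from deltalake import DeltaTable  # delta-rs, as used in the patch
        return DeltaTable(dir_path).to_pyarrow_table().to_pandas()
    elif data_format == "avro":
        import pandavro as pdx
        return pdx.from_avro(dir_path)  # simplified: assumes a single .avro file
    elif data_format == "csv":
        return pd.read_csv(dir_path)  # simplified: assumes a single .csv file
    else:
        raise ValueError(f"Unsupported data format: {data_format}")
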
Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 10 Nov 2022 00:00:43 +0000 Subject: [PATCH 17/18] Change feathr client to mocked object Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../test/unit/utils/test_job_utils.py | 74 ++++++++++--------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 9f82be66e..0909fb56e 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -157,30 +157,35 @@ def test__get_result_df__exceptions( ) def test__get_result_df( workspace_dir: str, - feathr_client: FeathrClient, data_format: str, output_filename: str, expected_count: int, ): """Test get_result_df returns pandas DataFrame""" - # Note: make sure the output file exists in the test_user_workspace - res_url = str(Path(workspace_dir, "mock_results", output_filename)) - local_cache_path = res_url + for spark_runtime in ["local", "databricks", "azure_synapse"]: + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url - # Mock feathr_spark_launcher.download_result - feathr_client.feathr_spark_launcher.download_result = MagicMock() + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime - if feathr_client.spark_runtime == "databricks": - res_url = f"dbfs:/{res_url}" + # Mock feathr_spark_launcher.download_result + if client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + if client.spark_runtime == "azure_synapse" and data_format == "delta": + # TODO currently pass the delta table test on Synapse result due to the delta table package bug. 
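
Note: the switch from the feathr_client fixture to a plain MagicMock works because get_result_df only inspects a handful of client members; everything else is auto-mocked. The minimal stand-in needs no workspace config or credentials:

from unittest.mock import MagicMock

client = MagicMock()
client.spark_runtime = "local"  # or "databricks" / "azure_synapse"

# Nested members such as client.feathr_spark_launcher.download_result are created on first
# access and act as no-ops. Note that client.get_job_tags() would also return a (truthy)
# MagicMock, which is why these tests always pass data_format and res_url explicitly.
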
+ continue - df = get_result_df( - client=feathr_client, - data_format=data_format, - res_url=res_url, - local_cache_path=local_cache_path, - ) - assert isinstance(df, pd.DataFrame) - assert len(df) == expected_count + df = get_result_df( + client=client, + data_format=data_format, + res_url=res_url, + local_cache_path=local_cache_path, + ) + assert isinstance(df, pd.DataFrame) + assert len(df) == expected_count @pytest.mark.parametrize( @@ -194,29 +199,30 @@ def test__get_result_df( ) def test__get_result_df__with_spark_session( workspace_dir: str, - feathr_client: FeathrClient, spark: SparkSession, data_format: str, output_filename: str, expected_count: int, ): """Test get_result_df returns spark DataFrame""" - # Note: make sure the output file exists in the test_user_workspace - res_url = str(Path(workspace_dir, "mock_results", output_filename)) - local_cache_path = res_url - - # Mock feathr_spark_launcher.download_result - feathr_client.feathr_spark_launcher.download_result = MagicMock() + for spark_runtime in ["local", "databricks", "azure_synapse"]: + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url - if feathr_client.spark_runtime == "databricks": - res_url = f"dbfs:/{res_url}" + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime - df = get_result_df( - client=feathr_client, - data_format=data_format, - res_url=res_url, - spark=spark, - local_cache_path=local_cache_path, - ) - assert isinstance(df, DataFrame) - assert df.count() == expected_count + if client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + + df = get_result_df( + client=client, + data_format=data_format, + res_url=res_url, + spark=spark, + local_cache_path=local_cache_path, + ) + assert isinstance(df, DataFrame) + assert df.count() == expected_count From 190377c1b52e98d3ef1b5a0d5516c52af94f22ab Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Tue, 22 Nov 2022 20:04:10 +0000 Subject: [PATCH 18/18] Change timeout to 1000s in the notebook Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- docs/samples/nyc_taxi_demo.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 81a11a460..31754950e 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -751,7 +751,7 @@ " output_path=offline_features_path,\n", ")\n", "\n", - "client.wait_job_to_finish(timeout_sec=500)" + "client.wait_job_to_finish(timeout_sec=1000)" ] }, { @@ -1020,7 +1020,7 @@ " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", ")\n", "\n", - "client.wait_job_to_finish(timeout_sec=500)" + "client.wait_job_to_finish(timeout_sec=1000)" ] }, {
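
Note: with the larger timeout, the read-back cells that follow can rely on the finished job's metadata. A sketch of that step (it assumes `client` and `spark` from the earlier notebook cells, and that the preceding job set "spark.feathr.outputFormat" so get_result_df can pick up the output format from the job tags and the result URL from the submitted job):

from feathr.utils.job_utils import get_result_df

client.wait_job_to_finish(timeout_sec=1000)

# Format and result path fall back to the job tags / job result URI when not passed explicitly.
result_df = get_result_df(client=client, spark=spark)
result_df.show(5)
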