diff --git a/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv
new file mode 100644
index 000000000..b5b08ca83
--- /dev/null
+++ b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv
@@ -0,0 +1,5 @@
+0,2.0,2020-04-01 00:44:02,2020-04-01 00:52:23,N,1.0,42,41,1.0,1.68,8.0,0.5,0.5,0.0,0.0,"",0.3,9.3,1.0,1.0,0.0
+1,2.0,2020-04-01 00:24:39,2020-04-01 00:33:06,N,1.0,244,247,2.0,1.94,9.0,0.5,0.5,0.0,0.0,"",0.3,10.3,2.0,1.0,0.0
+2,2.0,2020-04-01 00:45:06,2020-04-01 00:51:13,N,1.0,244,243,3.0,1.0,6.5,0.5,0.5,0.0,0.0,"",0.3,7.8,2.0,1.0,0.0
+3,2.0,2020-04-01 00:45:06,2020-04-01 01:04:39,N,1.0,244,243,2.0,2.81,12.0,0.5,0.5,0.0,0.0,"",0.3,13.3,2.0,1.0,0.0
+4,2.0,2020-04-01 00:00:23,2020-04-01 00:16:13,N,1.0,75,169,1.0,6.79,21.0,0.5,0.5,0.0,0.0,"",0.3,22.3,1.0,1.0,0.0
diff --git a/feathr_project/test/unit/datasets/test_datasets.py b/feathr_project/test/unit/datasets/test_datasets.py
index c1ac49a9b..10d89c673 100644
--- a/feathr_project/test/unit/datasets/test_datasets.py
+++ b/feathr_project/test/unit/datasets/test_datasets.py
@@ -12,15 +12,6 @@
 NYC_TAXI_FILE_PATH = str(TEST_DATASET_DIR.joinpath("green_tripdata_2020-04_with_index.csv").resolve())
 
 
-@pytest.fixture(scope="module")
-def spark() -> SparkSession:
-    """Generate a spark session for tests."""
-    # Set ui port other than the default one (4040) so that feathr spark job may not fail. 
- spark_session = SparkSession.builder.appName("tests").config("spark.ui.port", "8080").getOrCreate() - yield spark_session - spark_session.stop() - - @pytest.mark.parametrize( "local_cache_path", [ diff --git a/feathr_project/test/unit/spark_provider/test_localspark_submission.py b/feathr_project/test/unit/spark_provider/test_localspark_submission.py index 9a9d7238b..992f2015e 100644 --- a/feathr_project/test/unit/spark_provider/test_localspark_submission.py +++ b/feathr_project/test/unit/spark_provider/test_localspark_submission.py @@ -4,6 +4,7 @@ import pytest from pytest_mock import MockerFixture +from feathr.constants import OUTPUT_PATH_TAG from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher @@ -15,9 +16,17 @@ def local_spark_job_launcher(tmp_path) -> _FeathrLocalSparkJobLauncher: ) +@pytest.mark.parametrize( + "job_tags,expected_result_uri", [ + (None, None), + ({OUTPUT_PATH_TAG: "output"}, "output"), + ] +) def test__local_spark_job_launcher__submit_feathr_job( mocker: MockerFixture, local_spark_job_launcher: _FeathrLocalSparkJobLauncher, + job_tags: Dict[str, str], + expected_result_uri: str, ): # Mock necessary components local_spark_job_launcher._init_args = MagicMock(return_value=[]) @@ -31,11 +40,16 @@ def test__local_spark_job_launcher__submit_feathr_job( job_name="unit-test", main_jar_path="", main_class_name="", + job_tags=job_tags, ) # Assert if the mocked spark process has called once mocked_spark_proc.assert_called_once() + # Assert job tags + assert local_spark_job_launcher.get_job_tags() == job_tags + assert local_spark_job_launcher.get_job_result_uri() == expected_result_uri + @pytest.mark.parametrize( "confs", [{}, {"spark.feathr.outputFormat": "parquet"}] diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py new file mode 100644 index 000000000..21392bf84 --- /dev/null +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -0,0 +1,112 @@ +# TODO with, without optional args +# TODO test with no data files exception and unsupported format exception +from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock + +import pandas as pd +import pytest +from pytest_mock import MockerFixture +from pyspark.sql import DataFrame, SparkSession + +from feathr import FeathrClient +from feathr.utils.job_utils import ( + get_result_df, + get_result_pandas_df, + get_result_spark_df, +) + + +def test__get_result_pandas_df(mocker: MockerFixture): + # Assert if the base function, get_result_df, called w/ proper args + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_pandas_df(client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path) + + +def test__get_result_spark_df(mocker: MockerFixture): + # Assert if the base function, get_result_df, called w/ proper args + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + spark = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_spark_df(spark, client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path, spark=spark) + + +# Local spark is expected to 
use a local filepath for res_url. Therefore, we mark this test to run with databricks. +@pytest.mark.databricks +def test__get_result_df__with_local_cache_path(feathr_client_databricks: FeathrClient): + # TODO Assert there is a local copy of the file in the given local_cache_path + pass + + +def test__get_result_df__exceptions(): + client = MagicMock() + client.get_job_result_uri = MagicMock(return_value=None) + + # Test ValueError when res_url is None + with pytest.raises(ValueError): + get_result_df(client) + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count to 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df( + workspace_dir: str, + feathr_client_local: FeathrClient, + data_format: str, + output_filename: str, + expected_count: int, +): + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + df = get_result_df( + client=feathr_client_local, + data_format=data_format, + res_url=res_url, + ) + assert isinstance(df, pd.DataFrame) + assert len(df) == expected_count + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count = 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df__with_spark_session( + workspace_dir: str, + feathr_client_local: FeathrClient, + spark: SparkSession, + data_format: str, + output_filename: str, + expected_count: int, +): + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + df = get_result_df( + client=feathr_client_local, + data_format=data_format, + res_url=res_url, + spark=spark, + ) + assert isinstance(df, DataFrame) + assert df.count() == expected_count From 15f4939b38b3c0bd9a0500b61404f0396eea34e0 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 10:26:40 +0800 Subject: [PATCH 04/18] Update test_azure_spark_e2e.py --- feathr_project/test/test_azure_spark_e2e.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/feathr_project/test/test_azure_spark_e2e.py b/feathr_project/test/test_azure_spark_e2e.py index 9c4ab8c5a..ae7c1cab2 100644 --- a/feathr_project/test/test_azure_spark_e2e.py +++ b/feathr_project/test/test_azure_spark_e2e.py @@ -183,7 +183,7 @@ def test_feathr_get_offline_features(): full_name="nyc_taxi.location_id") feature_query = FeatureQuery( - feature_list=["f_location_avg_fare"], key=location_id) + feature_list=["f_location_avg_fare", "f_trip_time_rounded"], key=location_id) settings = ObservationSettings( observation_path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04.csv", event_timestamp_column="lpep_dropoff_datetime", @@ -309,9 +309,9 @@ def test_feathr_materialize_to_aerospike(): # os.chdir(test_workspace_dir) now = datetime.now() # set workspace folder by time; make sure we don't have write conflict if there are many CI tests running - os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = 
''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) - os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)]) - + os.environ['SPARK_CONFIG__DATABRICKS__WORK_DIR'] = ''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) + os.environ['SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR'] = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_github_ci','_', str(now.minute), '_', str(now.second) ,'_', str(now.microsecond)]) + client = FeathrClient(config_path="feathr_config.yaml") batch_source = HdfsSource(name="nycTaxiBatchSource", path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04.csv", @@ -396,4 +396,4 @@ def test_feathr_materialize_to_aerospike(): if __name__ == "__main__": test_feathr_materialize_to_aerospike() test_feathr_get_offline_features_to_sql() - test_feathr_materialize_to_cosmosdb() \ No newline at end of file + test_feathr_materialize_to_cosmosdb() From 26b7a0d08fc10524aa15eac8d1dfc68d37d52142 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 11:13:13 +0800 Subject: [PATCH 05/18] Fix doc dead links (#805) This PR fixes dead links detected in latest ci run. The doc scan ci action has been updated to run on main only, as running this in PR frequently reports false alarm due to changes in CI not deployed. --- .github/workflows/document-scan.yml | 5 ++++- .../databricks/databricks_quickstart_nyc_taxi_demo.ipynb | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/document-scan.yml b/.github/workflows/document-scan.yml index 3762ca2af..291a04f44 100644 --- a/.github/workflows/document-scan.yml +++ b/.github/workflows/document-scan.yml @@ -1,6 +1,9 @@ name: Feathr Documents' Broken Link Check -on: [push] +on: + push: + branches: [main] + jobs: check-links: runs-on: ubuntu-latest diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 13187aa44..0bc099f11 100755 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. 
\n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. 
Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n"," "]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.4 ('feathr')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"vscode":{"interpreter":{"hash":"ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n","
"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing Python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the numerical operation to the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers.|\n","|LATEST| Any |Returns the latest non-null values within the defined time window. |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models, and evaluating them is an iterative process where the models' performance may be used to modify the features as part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build an ML Pipeline\n","\n","Here, we use a Spark ML Pipeline to assemble feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} From 8e401b4f840104d516a51f03c27a1961601eb754 Mon Sep 17 00:00:00 2001 From: Boli Guan
Date: Tue, 1 Nov 2022 18:08:21 +0800 Subject: [PATCH 06/18] Improve UI experience and clean up ui code warnings (#801) * Add DataSourcesSelect and FlowGraph and ResizeTable components. Fix all warning and lint issues. Signed-off-by: Boli Guan * Add CardDescriptions component and fix ESlint warning. Signed-off-by: Boli Guan * Update FeatureDetails page title. Signed-off-by: Boli Guan * Rename ProjectSelect Signed-off-by: Boli Guan Signed-off-by: Boli Guan --- ui/.eslintrc | 3 +- ui/package-lock.json | 656 ++++++++++++------ ui/package.json | 8 +- ui/src/api/api.tsx | 46 +- ui/src/components/CardDescriptions/index.tsx | 32 + ui/src/components/FlowGraph/FlowGraph.tsx | 236 +++++++ ui/src/components/FlowGraph/LineageNode.tsx | 57 ++ ui/src/components/FlowGraph/index.module.less | 43 ++ ui/src/components/FlowGraph/index.ts | 5 + ui/src/components/FlowGraph/interface.ts | 30 + ui/src/components/FlowGraph/utils.ts | 192 +++++ ui/src/components/ProjectsSelect/index.tsx | 51 ++ .../components/ResizeTable/ResizableTitle.tsx | 37 + .../components/ResizeTable/ResizeHandle.tsx | 29 + ui/src/components/ResizeTable/ResizeTable.tsx | 68 ++ .../components/ResizeTable/index.module.less | 22 + ui/src/components/ResizeTable/index.tsx | 5 + ui/src/components/ResizeTable/interface.ts | 22 + ui/src/components/graph/graphNodeDetails.tsx | 107 ++- ui/src/components/header/headerWidgetMenu.tsx | 43 +- ui/src/components/sidemenu/siteMenu.tsx | 123 ++-- ui/src/models/model.ts | 14 +- .../components/DataSourceTable/index.tsx | 146 ++++ .../dataSource/components/SearchBar/index.tsx | 38 + ui/src/pages/dataSource/dataSourceDetails.tsx | 169 ++--- ui/src/pages/dataSource/dataSources.tsx | 25 +- .../feature/components/FeatureForm/index.tsx | 87 +++ .../feature/components/FeatureTable/index.tsx | 151 ++++ .../NodeDetails/FeatureNodeDetail.tsx | 44 ++ .../NodeDetails/SourceNodeDetial.tsx | 22 + .../feature/components/NodeDetails/index.tsx | 63 ++ .../feature/components/SearchBar/index.tsx | 67 ++ ui/src/pages/feature/featureDetails.tsx | 422 +++++------ ui/src/pages/feature/features.tsx | 29 +- ui/src/pages/feature/lineageGraph.tsx | 139 ++-- ui/src/pages/feature/newFeature.tsx | 11 +- .../management/components/RoleForm/index.tsx | 2 +- .../components/UserRolesTable/index.tsx | 23 +- ui/src/pages/management/management.tsx | 2 +- ui/src/pages/management/roleManagement.tsx | 2 +- .../project/components/ProjectTable/index.tsx | 99 +++ .../project/components/SearchBar/index.tsx | 51 ++ ui/src/pages/project/projects.tsx | 23 +- ui/src/site.css | 6 +- ui/src/utils/attributesMapping.ts | 48 ++ ui/src/utils/utils.tsx | 18 +- 46 files changed, 2655 insertions(+), 861 deletions(-) create mode 100644 ui/src/components/CardDescriptions/index.tsx create mode 100644 ui/src/components/FlowGraph/FlowGraph.tsx create mode 100644 ui/src/components/FlowGraph/LineageNode.tsx create mode 100644 ui/src/components/FlowGraph/index.module.less create mode 100644 ui/src/components/FlowGraph/index.ts create mode 100644 ui/src/components/FlowGraph/interface.ts create mode 100644 ui/src/components/FlowGraph/utils.ts create mode 100644 ui/src/components/ProjectsSelect/index.tsx create mode 100644 ui/src/components/ResizeTable/ResizableTitle.tsx create mode 100644 ui/src/components/ResizeTable/ResizeHandle.tsx create mode 100644 ui/src/components/ResizeTable/ResizeTable.tsx create mode 100644 ui/src/components/ResizeTable/index.module.less create mode 100644 ui/src/components/ResizeTable/index.tsx create mode 100644 
ui/src/components/ResizeTable/interface.ts create mode 100644 ui/src/pages/dataSource/components/DataSourceTable/index.tsx create mode 100644 ui/src/pages/dataSource/components/SearchBar/index.tsx create mode 100644 ui/src/pages/feature/components/FeatureForm/index.tsx create mode 100644 ui/src/pages/feature/components/FeatureTable/index.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx create mode 100644 ui/src/pages/feature/components/NodeDetails/index.tsx create mode 100644 ui/src/pages/feature/components/SearchBar/index.tsx create mode 100644 ui/src/pages/project/components/ProjectTable/index.tsx create mode 100644 ui/src/pages/project/components/SearchBar/index.tsx create mode 100644 ui/src/utils/attributesMapping.ts diff --git a/ui/.eslintrc b/ui/.eslintrc index 43eeb60eb..c271bfa24 100644 --- a/ui/.eslintrc +++ b/ui/.eslintrc @@ -20,7 +20,8 @@ "react-app", // https://reactjs.org/docs/hooks-rules.html "plugin:react-hooks/recommended", - "plugin:prettier/recommended" + "plugin:prettier/recommended", + "plugin:json/recommended" ], "parser": "@typescript-eslint/parser", "parserOptions": { diff --git a/ui/package-lock.json b/ui/package-lock.json index b1568ad00..480dfdc62 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -11,7 +11,7 @@ "@ant-design/icons": "^4.7.0", "@azure/msal-browser": "^2.24.0", "@azure/msal-react": "^1.4.0", - "antd": "^4.20.2", + "antd": "^4.23.6", "axios": "^0.27.2", "classnames": "^2.3.2", "dagre": "^0.8.5", @@ -20,6 +20,7 @@ "react-dom": "^17.0.2", "react-flow-renderer": "^9.7.4", "react-query": "^3.38.0", + "react-resizable": "^3.0.4", "react-router-dom": "^6.3.0" }, "devDependencies": { @@ -32,6 +33,7 @@ "@types/node": "^16.11.26", "@types/react": "^17.0.43", "@types/react-dom": "^17.0.14", + "@types/react-resizable": "^3.0.3", "@typescript-eslint/eslint-plugin": "^5.30.7", "@typescript-eslint/parser": "^5.30.7", "babel-plugin-import": "^1.13.5", @@ -40,6 +42,7 @@ "eslint-config-prettier": "^8.5.0", "eslint-import-resolver-typescript": "^3.5.1", "eslint-plugin-import": "^2.26.0", + "eslint-plugin-json": "^3.1.0", "eslint-plugin-prettier": "^4.2.1", "eslint-plugin-react-hooks": "^4.6.0", "husky": "^8.0.1", @@ -94,14 +97,15 @@ "license": "MIT" }, "node_modules/@ant-design/react-slick": { - "version": "0.28.4", - "license": "MIT", + "version": "0.29.2", + "resolved": "https://registry.npmjs.org/@ant-design/react-slick/-/react-slick-0.29.2.tgz", + "integrity": "sha512-kgjtKmkGHa19FW21lHnAfyyH9AAoh35pBdcJ53rHmQ3O+cfFHGHnUbj/HFrRNJ5vIts09FKJVAD8RpaC+RaWfA==", "dependencies": { "@babel/runtime": "^7.10.4", "classnames": "^2.2.5", "json2mq": "^0.2.0", "lodash": "^4.17.21", - "resize-observer-polyfill": "^1.5.0" + "resize-observer-polyfill": "^1.5.1" }, "peerDependencies": { "react": ">=16.9.0" @@ -1946,10 +1950,11 @@ } }, "node_modules/@babel/runtime": { - "version": "7.17.9", - "license": "MIT", + "version": "7.20.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.20.0.tgz", + "integrity": "sha512-NDYdls71fTXoU8TZHfbBWg7DiZfNzClcKui/+kyi6ppD2L1qnWW3VV6CjtaBXSUGGhiTWJ6ereOIkUvenif66Q==", "dependencies": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.13.10" }, "engines": { "node": ">=6.9.0" @@ -3607,6 +3612,15 @@ "redux": "^4.0.0" } }, + "node_modules/@types/react-resizable": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/react-resizable/-/react-resizable-3.0.3.tgz", + 
"integrity": "sha512-W/QsUOZoXBAIBQNhNm95A5ohoaiUA874lWQytO2UP9dOjp5JHO9+a0cwYNabea7sA12ZDJnGVUFZxcNaNksAWA==", + "dev": true, + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/resolve": { "version": "1.17.1", "dev": true, @@ -4566,52 +4580,53 @@ } }, "node_modules/antd": { - "version": "4.20.2", - "license": "MIT", + "version": "4.23.6", + "resolved": "https://registry.npmjs.org/antd/-/antd-4.23.6.tgz", + "integrity": "sha512-AYH57cWBDe1ChtbnvG8i9dpKG4WnjE3AG0zIKpXByFNnxsr4saV6/19ihE8/ImSGpohN4E2zTXmo7R5/MyVRKQ==", "dependencies": { "@ant-design/colors": "^6.0.0", "@ant-design/icons": "^4.7.0", - "@ant-design/react-slick": "~0.28.1", - "@babel/runtime": "^7.12.5", + "@ant-design/react-slick": "~0.29.1", + "@babel/runtime": "^7.18.3", "@ctrl/tinycolor": "^3.4.0", "classnames": "^2.2.6", "copy-to-clipboard": "^3.2.0", "lodash": "^4.17.21", "memoize-one": "^6.0.0", "moment": "^2.29.2", - "rc-cascader": "~3.5.0", + "rc-cascader": "~3.7.0", "rc-checkbox": "~2.3.0", - "rc-collapse": "~3.1.0", - "rc-dialog": "~8.8.1", - "rc-drawer": "~4.4.2", - "rc-dropdown": "~3.5.0", - "rc-field-form": "~1.26.1", - "rc-image": "~5.6.0", - "rc-input": "~0.0.1-alpha.5", - "rc-input-number": "~7.3.0", - "rc-mentions": "~1.7.0", - "rc-menu": "~9.5.5", - "rc-motion": "^2.5.1", + "rc-collapse": "~3.3.0", + "rc-dialog": "~8.9.0", + "rc-drawer": "~5.1.0", + "rc-dropdown": "~4.0.0", + "rc-field-form": "~1.27.0", + "rc-image": "~5.7.0", + "rc-input": "~0.1.2", + "rc-input-number": "~7.3.9", + "rc-mentions": "~1.10.0", + "rc-menu": "~9.6.3", + "rc-motion": "^2.6.1", "rc-notification": "~4.6.0", - "rc-pagination": "~3.1.9", - "rc-picker": "~2.6.4", - "rc-progress": "~3.2.1", + "rc-pagination": "~3.1.17", + "rc-picker": "~2.6.11", + "rc-progress": "~3.3.2", "rc-rate": "~2.9.0", "rc-resize-observer": "^1.2.0", - "rc-segmented": "~2.1.0 ", - "rc-select": "~14.1.1", + "rc-segmented": "~2.1.0", + "rc-select": "~14.1.13", "rc-slider": "~10.0.0", "rc-steps": "~4.1.0", "rc-switch": "~3.2.0", - "rc-table": "~7.24.0", - "rc-tabs": "~11.13.0", - "rc-textarea": "~0.3.0", - "rc-tooltip": "~5.1.1", - "rc-tree": "~5.5.0", - "rc-tree-select": "~5.3.0", + "rc-table": "~7.26.0", + "rc-tabs": "~12.2.0", + "rc-textarea": "~0.4.5", + "rc-tooltip": "~5.2.0", + "rc-tree": "~5.7.0", + "rc-tree-select": "~5.5.0", "rc-trigger": "^5.2.10", "rc-upload": "~4.3.0", - "rc-util": "^5.20.0", + "rc-util": "^5.22.5", "scroll-into-view-if-needed": "^2.2.25" }, "funding": { @@ -4681,7 +4696,8 @@ }, "node_modules/array-tree-filter": { "version": "2.1.0", - "license": "MIT" + "resolved": "https://registry.npmjs.org/array-tree-filter/-/array-tree-filter-2.1.0.tgz", + "integrity": "sha512-4ROwICNlNw/Hqa9v+rk5h22KjmzB1JGTMVKP2AKJBOCgb0yL0ASf0+YvCcLNNwquOHNX48jkeZIJ3a+oOQqKcw==" }, "node_modules/array-union": { "version": "2.1.0", @@ -4749,8 +4765,9 @@ "license": "MIT" }, "node_modules/async-validator": { - "version": "4.1.1", - "license": "MIT" + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/async-validator/-/async-validator-4.2.5.tgz", + "integrity": "sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg==" }, "node_modules/asynckit": { "version": "0.4.0", @@ -6520,8 +6537,9 @@ } }, "node_modules/date-fns": { - "version": "2.28.0", - "license": "MIT", + "version": "2.29.3", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.3.tgz", + "integrity": "sha512-dDCnyH2WnnKusqvZZ6+jA1O51Ibt8ZMRNkDZdyAyK4YfbDwa/cEmuztzG5pk6hqlp9aSBPYcjOlktquahGwGeA==", "engines": { "node": 
">=0.11" }, @@ -7491,6 +7509,19 @@ } } }, + "node_modules/eslint-plugin-json": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-json/-/eslint-plugin-json-3.1.0.tgz", + "integrity": "sha512-MrlG2ynFEHe7wDGwbUuFPsaT2b1uhuEFhJ+W1f1u+1C2EkXmTYJp4B1aAdQQ8M+CC3t//N/oRKiIVw14L2HR1g==", + "dev": true, + "dependencies": { + "lodash": "^4.17.21", + "vscode-json-languageservice": "^4.1.6" + }, + "engines": { + "node": ">=12.0" + } + }, "node_modules/eslint-plugin-jsx-a11y": { "version": "6.5.1", "dev": true, @@ -10877,7 +10908,8 @@ }, "node_modules/json2mq": { "version": "0.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/json2mq/-/json2mq-0.2.0.tgz", + "integrity": "sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==", "dependencies": { "string-convert": "^0.2.0" } @@ -10893,6 +10925,12 @@ "node": ">=6" } }, + "node_modules/jsonc-parser": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", + "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "dev": true + }, "node_modules/jsonfile": { "version": "6.1.0", "dev": true, @@ -13833,14 +13871,15 @@ } }, "node_modules/rc-cascader": { - "version": "3.5.0", - "license": "MIT", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/rc-cascader/-/rc-cascader-3.7.0.tgz", + "integrity": "sha512-SFtGpwmYN7RaWEAGTS4Rkc62ZV/qmQGg/tajr/7mfIkleuu8ro9Hlk6J+aA0x1YS4zlaZBtTcSaXM01QMiEV/A==", "dependencies": { "@babel/runtime": "^7.12.5", "array-tree-filter": "^2.1.0", "classnames": "^2.3.1", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.6.1" }, "peerDependencies": { @@ -13861,8 +13900,9 @@ } }, "node_modules/rc-collapse": { - "version": "3.1.4", - "license": "MIT", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/rc-collapse/-/rc-collapse-3.3.1.tgz", + "integrity": "sha512-cOJfcSe3R8vocrF8T+PgaHDrgeA1tX+lwfhwSj60NX9QVRidsILIbRNDLD6nAzmcvVC5PWiIRiR4S1OobxdhCg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -13876,8 +13916,9 @@ } }, "node_modules/rc-dialog": { - "version": "8.8.1", - "license": "MIT", + "version": "8.9.0", + "resolved": "https://registry.npmjs.org/rc-dialog/-/rc-dialog-8.9.0.tgz", + "integrity": "sha512-Cp0tbJnrvPchJfnwIvOMWmJ4yjX3HWFatO6oBFD1jx8QkgsQCR0p8nUWAKdd3seLJhEC39/v56kZaEjwp9muoQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -13890,11 +13931,14 @@ } }, "node_modules/rc-drawer": { - "version": "4.4.3", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/rc-drawer/-/rc-drawer-5.1.0.tgz", + "integrity": "sha512-pU3Tsn99pxGdYowXehzZbdDVE+4lDXSGb7p8vA9mSmr569oc2Izh4Zw5vLKSe/Xxn2p5MSNbLVqD4tz+pK6SOw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-util": "^5.7.0" + "rc-motion": "^2.6.1", + "rc-util": "^5.21.2" }, "peerDependencies": { "react": ">=16.9.0", @@ -13902,12 +13946,13 @@ } }, "node_modules/rc-dropdown": { - "version": "3.5.2", - "license": "MIT", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/rc-dropdown/-/rc-dropdown-4.0.1.tgz", + "integrity": "sha512-OdpXuOcme1rm45cR0Jzgfl1otzmU4vuBVb+etXM8vcaULGokAKVpKlw8p6xzspG7jGd/XxShvq+N3VNEfk/l5g==", "dependencies": { - "@babel/runtime": "^7.10.1", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", - "rc-trigger": "^5.0.4", + "rc-trigger": "^5.3.1", "rc-util": "^5.17.0" }, "peerDependencies": { @@ -13916,10 
+13961,11 @@ } }, "node_modules/rc-field-form": { - "version": "1.26.3", - "license": "MIT", + "version": "1.27.3", + "resolved": "https://registry.npmjs.org/rc-field-form/-/rc-field-form-1.27.3.tgz", + "integrity": "sha512-HGqxHnmGQgkPApEcikV4qTg3BLPC82uB/cwBDftDt1pYaqitJfSl5TFTTUMKVEJVT5RqJ2Zi68ME1HmIMX2HAw==", "dependencies": { - "@babel/runtime": "^7.8.4", + "@babel/runtime": "^7.18.0", "async-validator": "^4.1.0", "rc-util": "^5.8.0" }, @@ -13932,12 +13978,13 @@ } }, "node_modules/rc-image": { - "version": "5.6.2", - "license": "MIT", + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/rc-image/-/rc-image-5.7.1.tgz", + "integrity": "sha512-QyMfdhoUfb5W14plqXSisaYwpdstcLYnB0MjX5ccIK2rydQM9sDPuekQWu500DDGR2dBaIF5vx9XbWkNFK17Fg==", "dependencies": { "@babel/runtime": "^7.11.2", "classnames": "^2.2.6", - "rc-dialog": "~8.8.0", + "rc-dialog": "~8.9.0", "rc-util": "^5.0.6" }, "peerDependencies": { @@ -13946,8 +13993,9 @@ } }, "node_modules/rc-input": { - "version": "0.0.1-alpha.7", - "license": "MIT", + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/rc-input/-/rc-input-0.1.4.tgz", + "integrity": "sha512-FqDdNz+fV2dKNgfXzcSLKvC+jEs1709t7nD+WdfjrdSaOcefpgc7BUJYadc3usaING+b7ediMTfKxuJBsEFbXA==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -13959,12 +14007,13 @@ } }, "node_modules/rc-input-number": { - "version": "7.3.4", - "license": "MIT", + "version": "7.3.9", + "resolved": "https://registry.npmjs.org/rc-input-number/-/rc-input-number-7.3.9.tgz", + "integrity": "sha512-u0+miS+SATdb6DtssYei2JJ1WuZME+nXaG6XGtR8maNyW5uGDytfDu60OTWLQEb0Anv/AcCzehldV8CKmKyQfA==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", - "rc-util": "^5.9.8" + "rc-util": "^5.23.0" }, "peerDependencies": { "react": ">=16.9.0", @@ -13972,15 +14021,16 @@ } }, "node_modules/rc-mentions": { - "version": "1.7.1", - "license": "MIT", + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/rc-mentions/-/rc-mentions-1.10.0.tgz", + "integrity": "sha512-oMlYWnwXSxP2NQVlgxOTzuG/u9BUc3ySY78K3/t7MNhJWpZzXTao+/Bic6tyZLuNCO89//hVQJBdaR2rnFQl6Q==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-menu": "~9.5.1", - "rc-textarea": "^0.3.0", + "rc-menu": "~9.6.0", + "rc-textarea": "^0.4.0", "rc-trigger": "^5.0.4", - "rc-util": "^5.0.1" + "rc-util": "^5.22.5" }, "peerDependencies": { "react": ">=16.9.0", @@ -13988,8 +14038,9 @@ } }, "node_modules/rc-menu": { - "version": "9.5.5", - "license": "MIT", + "version": "9.6.4", + "resolved": "https://registry.npmjs.org/rc-menu/-/rc-menu-9.6.4.tgz", + "integrity": "sha512-6DiNAjxjVIPLZXHffXxxcyE15d4isRL7iQ1ru4MqYDH2Cqc5bW96wZOdMydFtGLyDdnmEQ9jVvdCE9yliGvzkw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -14005,8 +14056,9 @@ } }, "node_modules/rc-motion": { - "version": "2.6.0", - "license": "MIT", + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/rc-motion/-/rc-motion-2.6.2.tgz", + "integrity": "sha512-4w1FaX3dtV749P8GwfS4fYnFG4Rb9pxvCYPc/b2fw1cmlHJWNNgOFIz7ysiD+eOrzJSvnLJWlNQQncpNMXwwpg==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -14035,8 +14087,9 @@ } }, "node_modules/rc-overflow": { - "version": "1.2.5", - "license": "MIT", + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc-overflow/-/rc-overflow-1.2.8.tgz", + "integrity": "sha512-QJ0UItckWPQ37ZL1dMEBAdY1dhfTXFL9k6oTTcyydVwoUNMnMqCGqnRNA98axSr/OeDKqR6DVFyi8eA5RQI/uQ==", "dependencies": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ 
-14049,8 +14102,9 @@ } }, "node_modules/rc-pagination": { - "version": "3.1.16", - "license": "MIT", + "version": "3.1.17", + "resolved": "https://registry.npmjs.org/rc-pagination/-/rc-pagination-3.1.17.tgz", + "integrity": "sha512-/BQ5UxcBnW28vFAcP2hfh+Xg15W0QZn8TWYwdCApchMH1H0CxiaUUcULP8uXcFM1TygcdKWdt3JqsL9cTAfdkQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1" @@ -14061,8 +14115,9 @@ } }, "node_modules/rc-picker": { - "version": "2.6.8", - "license": "MIT", + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/rc-picker/-/rc-picker-2.6.11.tgz", + "integrity": "sha512-INJ7ULu+Kj4UgqbcqE8Q+QpMw55xFf9kkyLBHJFk0ihjJpAV4glialRfqHE7k4KX2BWYPQfpILwhwR14x2EiRQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -14082,8 +14137,9 @@ } }, "node_modules/rc-progress": { - "version": "3.2.4", - "license": "MIT", + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/rc-progress/-/rc-progress-3.3.3.tgz", + "integrity": "sha512-MDVNVHzGanYtRy2KKraEaWeZLri2ZHWIRyaE1a9MQ2MuJ09m+Wxj5cfcaoaR6z5iRpHpA59YeUxAlpML8N4PJw==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -14112,7 +14168,8 @@ }, "node_modules/rc-resize-observer": { "version": "1.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/rc-resize-observer/-/rc-resize-observer-1.2.0.tgz", + "integrity": "sha512-6W+UzT3PyDM0wVCEHfoW3qTHPTvbdSgiA43buiy8PzmeMnfgnDeb9NjdimMXMl3/TcrvvWl5RRVdp+NqcR47pQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -14139,8 +14196,9 @@ } }, "node_modules/rc-select": { - "version": "14.1.2", - "license": "MIT", + "version": "14.1.13", + "resolved": "https://registry.npmjs.org/rc-select/-/rc-select-14.1.13.tgz", + "integrity": "sha512-WMEsC3gTwA1dbzWOdVIXDmWyidYNLq68AwvvUlRROw790uGUly0/vmqDozXrIr0QvN/A3CEULx12o+WtLCAefg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -14206,13 +14264,14 @@ } }, "node_modules/rc-table": { - "version": "7.24.1", - "license": "MIT", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/rc-table/-/rc-table-7.26.0.tgz", + "integrity": "sha512-0cD8e6S+DTGAt5nBZQIPFYEaIukn17sfa5uFL98faHlH/whZzD8ii3dbFL4wmUDEL4BLybhYop+QUfZJ4CPvNQ==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", "rc-resize-observer": "^1.1.0", - "rc-util": "^5.14.0", + "rc-util": "^5.22.5", "shallowequal": "^1.1.0" }, "engines": { @@ -14224,13 +14283,15 @@ } }, "node_modules/rc-tabs": { - "version": "11.13.0", - "license": "MIT", + "version": "12.2.1", + "resolved": "https://registry.npmjs.org/rc-tabs/-/rc-tabs-12.2.1.tgz", + "integrity": "sha512-09pVv4kN8VFqp6THceEmxOW8PAShQC08hrroeVYP4Y8YBFaP1PIWdyFL01czcbyz5YZFj9flZ7aljMaAl0jLVg==", "dependencies": { "@babel/runtime": "^7.11.2", "classnames": "2.x", - "rc-dropdown": "~3.5.0", - "rc-menu": "~9.5.1", + "rc-dropdown": "~4.0.0", + "rc-menu": "~9.6.0", + "rc-motion": "^2.6.2", "rc-resize-observer": "^1.0.0", "rc-util": "^5.5.0" }, @@ -14243,13 +14304,14 @@ } }, "node_modules/rc-textarea": { - "version": "0.3.7", - "license": "MIT", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/rc-textarea/-/rc-textarea-0.4.6.tgz", + "integrity": "sha512-HEKCu8nouXXayqYelQnhQm8fdH7v92pAQvfVCz+jhIPv2PHTyBxVrmoZJMn3B8cU+wdyuvRGkshngO3/TzBn4w==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", "rc-resize-observer": "^1.0.0", - "rc-util": "^5.7.0", + "rc-util": "^5.24.4", "shallowequal": "^1.1.0" }, "peerDependencies": { @@ -14258,10 +14320,12 @@ } }, 
"node_modules/rc-tooltip": { - "version": "5.1.1", - "license": "MIT", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/rc-tooltip/-/rc-tooltip-5.2.2.tgz", + "integrity": "sha512-jtQzU/18S6EI3lhSGoDYhPqNpWajMtS5VV/ld1LwyfrDByQpYmw/LW6U7oFXXLukjfDHQ7Ju705A82PRNFWYhg==", "dependencies": { "@babel/runtime": "^7.11.2", + "classnames": "^2.3.1", "rc-trigger": "^5.0.0" }, "peerDependencies": { @@ -14270,14 +14334,15 @@ } }, "node_modules/rc-tree": { - "version": "5.5.0", - "license": "MIT", + "version": "5.7.0", + "resolved": "https://registry.npmjs.org/rc-tree/-/rc-tree-5.7.0.tgz", + "integrity": "sha512-F+Ewkv/UcutshnVBMISP+lPdHDlcsL+YH/MQDVWbk+QdkfID7vXiwrHMEZn31+2Rbbm21z/HPceGS8PXGMmnQg==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-motion": "^2.0.1", "rc-util": "^5.16.1", - "rc-virtual-list": "^3.4.2" + "rc-virtual-list": "^3.4.8" }, "engines": { "node": ">=10.x" @@ -14288,13 +14353,14 @@ } }, "node_modules/rc-tree-select": { - "version": "5.3.0", - "license": "MIT", + "version": "5.5.3", + "resolved": "https://registry.npmjs.org/rc-tree-select/-/rc-tree-select-5.5.3.tgz", + "integrity": "sha512-gv8KyC6J7f9e50OkGk1ibF7v8vL+iaBnA8Ep/EVlMma2/tGdBQXO9xIvPjX8eQrZL5PjoeTUndNPM3cY3721ng==", "dependencies": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.16.1" }, "peerDependencies": { @@ -14303,10 +14369,11 @@ } }, "node_modules/rc-trigger": { - "version": "5.2.18", - "license": "MIT", + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/rc-trigger/-/rc-trigger-5.3.3.tgz", + "integrity": "sha512-IC4nuTSAME7RJSgwvHCNDQrIzhvGMKf6NDu5veX+zk1MG7i1UnwTWWthcP9WHw3+FZfP3oZGvkrHFPu/EGkFKw==", "dependencies": { - "@babel/runtime": "^7.11.2", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", "rc-align": "^4.0.0", "rc-motion": "^2.0.0", @@ -14334,10 +14401,11 @@ } }, "node_modules/rc-util": { - "version": "5.21.2", - "license": "MIT", + "version": "5.24.4", + "resolved": "https://registry.npmjs.org/rc-util/-/rc-util-5.24.4.tgz", + "integrity": "sha512-2a4RQnycV9eV7lVZPEJ7QwJRPlZNc06J7CwcwZo4vIHr3PfUqtYgl1EkUV9ETAc6VRRi8XZOMFhYG63whlIC9Q==", "dependencies": { - "@babel/runtime": "^7.12.5", + "@babel/runtime": "^7.18.3", "react-is": "^16.12.0", "shallowequal": "^1.1.0" }, @@ -14347,9 +14415,11 @@ } }, "node_modules/rc-virtual-list": { - "version": "3.4.7", - "license": "MIT", + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/rc-virtual-list/-/rc-virtual-list-3.4.11.tgz", + "integrity": "sha512-BvUUH60kkeTBPigN5F89HtGaA5jSP4y2aM6cJ4dk9Y42I9yY+h6i08wF6UKeDcxdfOU8j3I5HxkSS/xA77J3wA==", "dependencies": { + "@babel/runtime": "^7.20.0", "classnames": "^2.2.6", "rc-resize-observer": "^1.0.0", "rc-util": "^5.15.0" @@ -14570,6 +14640,18 @@ "node": ">=0.10.0" } }, + "node_modules/react-resizable": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/react-resizable/-/react-resizable-3.0.4.tgz", + "integrity": "sha512-StnwmiESiamNzdRHbSSvA65b0ZQJ7eVQpPusrSmcpyGKzC0gojhtO62xxH6YOBmepk9dQTBi9yxidL3W4s3EBA==", + "dependencies": { + "prop-types": "15.x", + "react-draggable": "^4.0.3" + }, + "peerDependencies": { + "react": ">= 16.3" + } + }, "node_modules/react-router": { "version": "6.3.0", "license": "MIT", @@ -14746,8 +14828,9 @@ } }, "node_modules/regenerator-runtime": { - "version": "0.13.9", - "license": "MIT" + "version": "0.13.10", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz", + "integrity": 
"sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==" }, "node_modules/regenerator-transform": { "version": "0.15.0", @@ -15761,7 +15844,8 @@ }, "node_modules/string-convert": { "version": "0.2.1", - "license": "MIT" + "resolved": "https://registry.npmjs.org/string-convert/-/string-convert-0.2.1.tgz", + "integrity": "sha512-u/1tdPl4yQnPBjnVrmdLo9gtuLvELKsAoRapekWggdiQNvvvum+jYF329d84NAa660KQw7pB2n36KrIKVoXa3A==" }, "node_modules/string-length": { "version": "4.0.2", @@ -16735,6 +16819,43 @@ "node": ">= 0.8" } }, + "node_modules/vscode-json-languageservice": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/vscode-json-languageservice/-/vscode-json-languageservice-4.2.1.tgz", + "integrity": "sha512-xGmv9QIWs2H8obGbWg+sIPI/3/pFgj/5OWBhNzs00BkYQ9UaB2F6JJaGB/2/YOZJ3BvLXQTC4Q7muqU25QgAhA==", + "dev": true, + "dependencies": { + "jsonc-parser": "^3.0.0", + "vscode-languageserver-textdocument": "^1.0.3", + "vscode-languageserver-types": "^3.16.0", + "vscode-nls": "^5.0.0", + "vscode-uri": "^3.0.3" + } + }, + "node_modules/vscode-languageserver-textdocument": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/vscode-languageserver-textdocument/-/vscode-languageserver-textdocument-1.0.7.tgz", + "integrity": "sha512-bFJH7UQxlXT8kKeyiyu41r22jCZXG8kuuVVA33OEJn1diWOZK5n8zBSPZFHVBOu8kXZ6h0LIRhf5UnCo61J4Hg==", + "dev": true + }, + "node_modules/vscode-languageserver-types": { + "version": "3.17.2", + "resolved": "https://registry.npmjs.org/vscode-languageserver-types/-/vscode-languageserver-types-3.17.2.tgz", + "integrity": "sha512-zHhCWatviizPIq9B7Vh9uvrH6x3sK8itC84HkamnBWoDFJtzBf7SWlpLCZUit72b3os45h6RWQNC9xHRDF8dRA==", + "dev": true + }, + "node_modules/vscode-nls": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-nls/-/vscode-nls-5.2.0.tgz", + "integrity": "sha512-RAaHx7B14ZU04EU31pT+rKz2/zSl7xMsfIZuo8pd+KZO6PXtQmpevpq3vxvWNcrGbdmhM/rr5Uw5Mz+NBfhVng==", + "dev": true + }, + "node_modules/vscode-uri": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.0.6.tgz", + "integrity": "sha512-fmL7V1eiDBFRRnu+gfRWTzyPpNIHJTc4mWnFkwBUmO9U3KPgJAmTx7oxi2bl/Rh6HLdU7+4C9wlj0k2E4AdKFQ==", + "dev": true + }, "node_modules/w3c-hr-time": { "version": "1.0.2", "dev": true, @@ -17543,13 +17664,15 @@ "version": "4.2.1" }, "@ant-design/react-slick": { - "version": "0.28.4", + "version": "0.29.2", + "resolved": "https://registry.npmjs.org/@ant-design/react-slick/-/react-slick-0.29.2.tgz", + "integrity": "sha512-kgjtKmkGHa19FW21lHnAfyyH9AAoh35pBdcJ53rHmQ3O+cfFHGHnUbj/HFrRNJ5vIts09FKJVAD8RpaC+RaWfA==", "requires": { "@babel/runtime": "^7.10.4", "classnames": "^2.2.5", "json2mq": "^0.2.0", "lodash": "^4.17.21", - "resize-observer-polyfill": "^1.5.0" + "resize-observer-polyfill": "^1.5.1" } }, "@apideck/better-ajv-errors": { @@ -18653,9 +18776,11 @@ } }, "@babel/runtime": { - "version": "7.17.9", + "version": "7.20.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.20.0.tgz", + "integrity": "sha512-NDYdls71fTXoU8TZHfbBWg7DiZfNzClcKui/+kyi6ppD2L1qnWW3VV6CjtaBXSUGGhiTWJ6ereOIkUvenif66Q==", "requires": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.13.10" } }, "@babel/runtime-corejs3": { @@ -19755,6 +19880,15 @@ "redux": "^4.0.0" } }, + "@types/react-resizable": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/react-resizable/-/react-resizable-3.0.3.tgz", + "integrity": 
"sha512-W/QsUOZoXBAIBQNhNm95A5ohoaiUA874lWQytO2UP9dOjp5JHO9+a0cwYNabea7sA12ZDJnGVUFZxcNaNksAWA==", + "dev": true, + "requires": { + "@types/react": "*" + } + }, "@types/resolve": { "version": "1.17.1", "dev": true, @@ -20335,51 +20469,53 @@ } }, "antd": { - "version": "4.20.2", + "version": "4.23.6", + "resolved": "https://registry.npmjs.org/antd/-/antd-4.23.6.tgz", + "integrity": "sha512-AYH57cWBDe1ChtbnvG8i9dpKG4WnjE3AG0zIKpXByFNnxsr4saV6/19ihE8/ImSGpohN4E2zTXmo7R5/MyVRKQ==", "requires": { "@ant-design/colors": "^6.0.0", "@ant-design/icons": "^4.7.0", - "@ant-design/react-slick": "~0.28.1", - "@babel/runtime": "^7.12.5", + "@ant-design/react-slick": "~0.29.1", + "@babel/runtime": "^7.18.3", "@ctrl/tinycolor": "^3.4.0", "classnames": "^2.2.6", "copy-to-clipboard": "^3.2.0", "lodash": "^4.17.21", "memoize-one": "^6.0.0", "moment": "^2.29.2", - "rc-cascader": "~3.5.0", + "rc-cascader": "~3.7.0", "rc-checkbox": "~2.3.0", - "rc-collapse": "~3.1.0", - "rc-dialog": "~8.8.1", - "rc-drawer": "~4.4.2", - "rc-dropdown": "~3.5.0", - "rc-field-form": "~1.26.1", - "rc-image": "~5.6.0", - "rc-input": "~0.0.1-alpha.5", - "rc-input-number": "~7.3.0", - "rc-mentions": "~1.7.0", - "rc-menu": "~9.5.5", - "rc-motion": "^2.5.1", + "rc-collapse": "~3.3.0", + "rc-dialog": "~8.9.0", + "rc-drawer": "~5.1.0", + "rc-dropdown": "~4.0.0", + "rc-field-form": "~1.27.0", + "rc-image": "~5.7.0", + "rc-input": "~0.1.2", + "rc-input-number": "~7.3.9", + "rc-mentions": "~1.10.0", + "rc-menu": "~9.6.3", + "rc-motion": "^2.6.1", "rc-notification": "~4.6.0", - "rc-pagination": "~3.1.9", - "rc-picker": "~2.6.4", - "rc-progress": "~3.2.1", + "rc-pagination": "~3.1.17", + "rc-picker": "~2.6.11", + "rc-progress": "~3.3.2", "rc-rate": "~2.9.0", "rc-resize-observer": "^1.2.0", - "rc-segmented": "~2.1.0 ", - "rc-select": "~14.1.1", + "rc-segmented": "~2.1.0", + "rc-select": "~14.1.13", "rc-slider": "~10.0.0", "rc-steps": "~4.1.0", "rc-switch": "~3.2.0", - "rc-table": "~7.24.0", - "rc-tabs": "~11.13.0", - "rc-textarea": "~0.3.0", - "rc-tooltip": "~5.1.1", - "rc-tree": "~5.5.0", - "rc-tree-select": "~5.3.0", + "rc-table": "~7.26.0", + "rc-tabs": "~12.2.0", + "rc-textarea": "~0.4.5", + "rc-tooltip": "~5.2.0", + "rc-tree": "~5.7.0", + "rc-tree-select": "~5.5.0", "rc-trigger": "^5.2.10", "rc-upload": "~4.3.0", - "rc-util": "^5.20.0", + "rc-util": "^5.22.5", "scroll-into-view-if-needed": "^2.2.25" } }, @@ -20422,7 +20558,9 @@ } }, "array-tree-filter": { - "version": "2.1.0" + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-tree-filter/-/array-tree-filter-2.1.0.tgz", + "integrity": "sha512-4ROwICNlNw/Hqa9v+rk5h22KjmzB1JGTMVKP2AKJBOCgb0yL0ASf0+YvCcLNNwquOHNX48jkeZIJ3a+oOQqKcw==" }, "array-union": { "version": "2.1.0", @@ -20465,7 +20603,9 @@ "dev": true }, "async-validator": { - "version": "4.1.1" + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/async-validator/-/async-validator-4.2.5.tgz", + "integrity": "sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg==" }, "asynckit": { "version": "0.4.0" @@ -21606,7 +21746,9 @@ } }, "date-fns": { - "version": "2.28.0" + "version": "2.29.3", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.3.tgz", + "integrity": "sha512-dDCnyH2WnnKusqvZZ6+jA1O51Ibt8ZMRNkDZdyAyK4YfbDwa/cEmuztzG5pk6hqlp9aSBPYcjOlktquahGwGeA==" }, "dayjs": { "version": "1.11.5", @@ -22303,6 +22445,16 @@ "@typescript-eslint/experimental-utils": "^5.0.0" } }, + "eslint-plugin-json": { + "version": "3.1.0", + "resolved": 
"https://registry.npmjs.org/eslint-plugin-json/-/eslint-plugin-json-3.1.0.tgz", + "integrity": "sha512-MrlG2ynFEHe7wDGwbUuFPsaT2b1uhuEFhJ+W1f1u+1C2EkXmTYJp4B1aAdQQ8M+CC3t//N/oRKiIVw14L2HR1g==", + "dev": true, + "requires": { + "lodash": "^4.17.21", + "vscode-json-languageservice": "^4.1.6" + } + }, "eslint-plugin-jsx-a11y": { "version": "6.5.1", "dev": true, @@ -24453,6 +24605,8 @@ }, "json2mq": { "version": "0.2.0", + "resolved": "https://registry.npmjs.org/json2mq/-/json2mq-0.2.0.tgz", + "integrity": "sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==", "requires": { "string-convert": "^0.2.0" } @@ -24461,6 +24615,12 @@ "version": "2.2.1", "dev": true }, + "jsonc-parser": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", + "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "dev": true + }, "jsonfile": { "version": "6.1.0", "dev": true, @@ -26190,13 +26350,15 @@ } }, "rc-cascader": { - "version": "3.5.0", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/rc-cascader/-/rc-cascader-3.7.0.tgz", + "integrity": "sha512-SFtGpwmYN7RaWEAGTS4Rkc62ZV/qmQGg/tajr/7mfIkleuu8ro9Hlk6J+aA0x1YS4zlaZBtTcSaXM01QMiEV/A==", "requires": { "@babel/runtime": "^7.12.5", "array-tree-filter": "^2.1.0", "classnames": "^2.3.1", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.6.1" } }, @@ -26208,7 +26370,9 @@ } }, "rc-collapse": { - "version": "3.1.4", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/rc-collapse/-/rc-collapse-3.3.1.tgz", + "integrity": "sha512-cOJfcSe3R8vocrF8T+PgaHDrgeA1tX+lwfhwSj60NX9QVRidsILIbRNDLD6nAzmcvVC5PWiIRiR4S1OobxdhCg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26218,7 +26382,9 @@ } }, "rc-dialog": { - "version": "8.8.1", + "version": "8.9.0", + "resolved": "https://registry.npmjs.org/rc-dialog/-/rc-dialog-8.9.0.tgz", + "integrity": "sha512-Cp0tbJnrvPchJfnwIvOMWmJ4yjX3HWFatO6oBFD1jx8QkgsQCR0p8nUWAKdd3seLJhEC39/v56kZaEjwp9muoQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", @@ -26227,41 +26393,52 @@ } }, "rc-drawer": { - "version": "4.4.3", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/rc-drawer/-/rc-drawer-5.1.0.tgz", + "integrity": "sha512-pU3Tsn99pxGdYowXehzZbdDVE+4lDXSGb7p8vA9mSmr569oc2Izh4Zw5vLKSe/Xxn2p5MSNbLVqD4tz+pK6SOw==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-util": "^5.7.0" + "rc-motion": "^2.6.1", + "rc-util": "^5.21.2" } }, "rc-dropdown": { - "version": "3.5.2", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/rc-dropdown/-/rc-dropdown-4.0.1.tgz", + "integrity": "sha512-OdpXuOcme1rm45cR0Jzgfl1otzmU4vuBVb+etXM8vcaULGokAKVpKlw8p6xzspG7jGd/XxShvq+N3VNEfk/l5g==", "requires": { - "@babel/runtime": "^7.10.1", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", - "rc-trigger": "^5.0.4", + "rc-trigger": "^5.3.1", "rc-util": "^5.17.0" } }, "rc-field-form": { - "version": "1.26.3", + "version": "1.27.3", + "resolved": "https://registry.npmjs.org/rc-field-form/-/rc-field-form-1.27.3.tgz", + "integrity": "sha512-HGqxHnmGQgkPApEcikV4qTg3BLPC82uB/cwBDftDt1pYaqitJfSl5TFTTUMKVEJVT5RqJ2Zi68ME1HmIMX2HAw==", "requires": { - "@babel/runtime": "^7.8.4", + "@babel/runtime": "^7.18.0", "async-validator": "^4.1.0", "rc-util": "^5.8.0" } }, "rc-image": { - "version": "5.6.2", + "version": "5.7.1", + "resolved": 
"https://registry.npmjs.org/rc-image/-/rc-image-5.7.1.tgz", + "integrity": "sha512-QyMfdhoUfb5W14plqXSisaYwpdstcLYnB0MjX5ccIK2rydQM9sDPuekQWu500DDGR2dBaIF5vx9XbWkNFK17Fg==", "requires": { "@babel/runtime": "^7.11.2", "classnames": "^2.2.6", - "rc-dialog": "~8.8.0", + "rc-dialog": "~8.9.0", "rc-util": "^5.0.6" } }, "rc-input": { - "version": "0.0.1-alpha.7", + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/rc-input/-/rc-input-0.1.4.tgz", + "integrity": "sha512-FqDdNz+fV2dKNgfXzcSLKvC+jEs1709t7nD+WdfjrdSaOcefpgc7BUJYadc3usaING+b7ediMTfKxuJBsEFbXA==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26269,26 +26446,32 @@ } }, "rc-input-number": { - "version": "7.3.4", + "version": "7.3.9", + "resolved": "https://registry.npmjs.org/rc-input-number/-/rc-input-number-7.3.9.tgz", + "integrity": "sha512-u0+miS+SATdb6DtssYei2JJ1WuZME+nXaG6XGtR8maNyW5uGDytfDu60OTWLQEb0Anv/AcCzehldV8CKmKyQfA==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", - "rc-util": "^5.9.8" + "rc-util": "^5.23.0" } }, "rc-mentions": { - "version": "1.7.1", + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/rc-mentions/-/rc-mentions-1.10.0.tgz", + "integrity": "sha512-oMlYWnwXSxP2NQVlgxOTzuG/u9BUc3ySY78K3/t7MNhJWpZzXTao+/Bic6tyZLuNCO89//hVQJBdaR2rnFQl6Q==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.6", - "rc-menu": "~9.5.1", - "rc-textarea": "^0.3.0", + "rc-menu": "~9.6.0", + "rc-textarea": "^0.4.0", "rc-trigger": "^5.0.4", - "rc-util": "^5.0.1" + "rc-util": "^5.22.5" } }, "rc-menu": { - "version": "9.5.5", + "version": "9.6.4", + "resolved": "https://registry.npmjs.org/rc-menu/-/rc-menu-9.6.4.tgz", + "integrity": "sha512-6DiNAjxjVIPLZXHffXxxcyE15d4isRL7iQ1ru4MqYDH2Cqc5bW96wZOdMydFtGLyDdnmEQ9jVvdCE9yliGvzkw==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26300,7 +26483,9 @@ } }, "rc-motion": { - "version": "2.6.0", + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/rc-motion/-/rc-motion-2.6.2.tgz", + "integrity": "sha512-4w1FaX3dtV749P8GwfS4fYnFG4Rb9pxvCYPc/b2fw1cmlHJWNNgOFIz7ysiD+eOrzJSvnLJWlNQQncpNMXwwpg==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26317,7 +26502,9 @@ } }, "rc-overflow": { - "version": "1.2.5", + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc-overflow/-/rc-overflow-1.2.8.tgz", + "integrity": "sha512-QJ0UItckWPQ37ZL1dMEBAdY1dhfTXFL9k6oTTcyydVwoUNMnMqCGqnRNA98axSr/OeDKqR6DVFyi8eA5RQI/uQ==", "requires": { "@babel/runtime": "^7.11.1", "classnames": "^2.2.1", @@ -26326,14 +26513,18 @@ } }, "rc-pagination": { - "version": "3.1.16", + "version": "3.1.17", + "resolved": "https://registry.npmjs.org/rc-pagination/-/rc-pagination-3.1.17.tgz", + "integrity": "sha512-/BQ5UxcBnW28vFAcP2hfh+Xg15W0QZn8TWYwdCApchMH1H0CxiaUUcULP8uXcFM1TygcdKWdt3JqsL9cTAfdkQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1" } }, "rc-picker": { - "version": "2.6.8", + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/rc-picker/-/rc-picker-2.6.11.tgz", + "integrity": "sha512-INJ7ULu+Kj4UgqbcqE8Q+QpMw55xFf9kkyLBHJFk0ihjJpAV4glialRfqHE7k4KX2BWYPQfpILwhwR14x2EiRQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -26346,7 +26537,9 @@ } }, "rc-progress": { - "version": "3.2.4", + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/rc-progress/-/rc-progress-3.3.3.tgz", + "integrity": "sha512-MDVNVHzGanYtRy2KKraEaWeZLri2ZHWIRyaE1a9MQ2MuJ09m+Wxj5cfcaoaR6z5iRpHpA59YeUxAlpML8N4PJw==", "requires": { "@babel/runtime": 
"^7.10.1", "classnames": "^2.2.6", @@ -26363,6 +26556,8 @@ }, "rc-resize-observer": { "version": "1.2.0", + "resolved": "https://registry.npmjs.org/rc-resize-observer/-/rc-resize-observer-1.2.0.tgz", + "integrity": "sha512-6W+UzT3PyDM0wVCEHfoW3qTHPTvbdSgiA43buiy8PzmeMnfgnDeb9NjdimMXMl3/TcrvvWl5RRVdp+NqcR47pQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", @@ -26380,7 +26575,9 @@ } }, "rc-select": { - "version": "14.1.2", + "version": "14.1.13", + "resolved": "https://registry.npmjs.org/rc-select/-/rc-select-14.1.13.tgz", + "integrity": "sha512-WMEsC3gTwA1dbzWOdVIXDmWyidYNLq68AwvvUlRROw790uGUly0/vmqDozXrIr0QvN/A3CEULx12o+WtLCAefg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", @@ -26418,67 +26615,83 @@ } }, "rc-table": { - "version": "7.24.1", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/rc-table/-/rc-table-7.26.0.tgz", + "integrity": "sha512-0cD8e6S+DTGAt5nBZQIPFYEaIukn17sfa5uFL98faHlH/whZzD8ii3dbFL4wmUDEL4BLybhYop+QUfZJ4CPvNQ==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.5", "rc-resize-observer": "^1.1.0", - "rc-util": "^5.14.0", + "rc-util": "^5.22.5", "shallowequal": "^1.1.0" } }, "rc-tabs": { - "version": "11.13.0", + "version": "12.2.1", + "resolved": "https://registry.npmjs.org/rc-tabs/-/rc-tabs-12.2.1.tgz", + "integrity": "sha512-09pVv4kN8VFqp6THceEmxOW8PAShQC08hrroeVYP4Y8YBFaP1PIWdyFL01czcbyz5YZFj9flZ7aljMaAl0jLVg==", "requires": { "@babel/runtime": "^7.11.2", "classnames": "2.x", - "rc-dropdown": "~3.5.0", - "rc-menu": "~9.5.1", + "rc-dropdown": "~4.0.0", + "rc-menu": "~9.6.0", + "rc-motion": "^2.6.2", "rc-resize-observer": "^1.0.0", "rc-util": "^5.5.0" } }, "rc-textarea": { - "version": "0.3.7", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/rc-textarea/-/rc-textarea-0.4.6.tgz", + "integrity": "sha512-HEKCu8nouXXayqYelQnhQm8fdH7v92pAQvfVCz+jhIPv2PHTyBxVrmoZJMn3B8cU+wdyuvRGkshngO3/TzBn4w==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "^2.2.1", "rc-resize-observer": "^1.0.0", - "rc-util": "^5.7.0", + "rc-util": "^5.24.4", "shallowequal": "^1.1.0" } }, "rc-tooltip": { - "version": "5.1.1", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/rc-tooltip/-/rc-tooltip-5.2.2.tgz", + "integrity": "sha512-jtQzU/18S6EI3lhSGoDYhPqNpWajMtS5VV/ld1LwyfrDByQpYmw/LW6U7oFXXLukjfDHQ7Ju705A82PRNFWYhg==", "requires": { "@babel/runtime": "^7.11.2", + "classnames": "^2.3.1", "rc-trigger": "^5.0.0" } }, "rc-tree": { - "version": "5.5.0", + "version": "5.7.0", + "resolved": "https://registry.npmjs.org/rc-tree/-/rc-tree-5.7.0.tgz", + "integrity": "sha512-F+Ewkv/UcutshnVBMISP+lPdHDlcsL+YH/MQDVWbk+QdkfID7vXiwrHMEZn31+2Rbbm21z/HPceGS8PXGMmnQg==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-motion": "^2.0.1", "rc-util": "^5.16.1", - "rc-virtual-list": "^3.4.2" + "rc-virtual-list": "^3.4.8" } }, "rc-tree-select": { - "version": "5.3.0", + "version": "5.5.3", + "resolved": "https://registry.npmjs.org/rc-tree-select/-/rc-tree-select-5.5.3.tgz", + "integrity": "sha512-gv8KyC6J7f9e50OkGk1ibF7v8vL+iaBnA8Ep/EVlMma2/tGdBQXO9xIvPjX8eQrZL5PjoeTUndNPM3cY3721ng==", "requires": { "@babel/runtime": "^7.10.1", "classnames": "2.x", "rc-select": "~14.1.0", - "rc-tree": "~5.5.0", + "rc-tree": "~5.7.0", "rc-util": "^5.16.1" } }, "rc-trigger": { - "version": "5.2.18", + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/rc-trigger/-/rc-trigger-5.3.3.tgz", + "integrity": "sha512-IC4nuTSAME7RJSgwvHCNDQrIzhvGMKf6NDu5veX+zk1MG7i1UnwTWWthcP9WHw3+FZfP3oZGvkrHFPu/EGkFKw==", 
"requires": { - "@babel/runtime": "^7.11.2", + "@babel/runtime": "^7.18.3", "classnames": "^2.2.6", "rc-align": "^4.0.0", "rc-motion": "^2.0.0", @@ -26494,16 +26707,21 @@ } }, "rc-util": { - "version": "5.21.2", + "version": "5.24.4", + "resolved": "https://registry.npmjs.org/rc-util/-/rc-util-5.24.4.tgz", + "integrity": "sha512-2a4RQnycV9eV7lVZPEJ7QwJRPlZNc06J7CwcwZo4vIHr3PfUqtYgl1EkUV9ETAc6VRRi8XZOMFhYG63whlIC9Q==", "requires": { - "@babel/runtime": "^7.12.5", + "@babel/runtime": "^7.18.3", "react-is": "^16.12.0", "shallowequal": "^1.1.0" } }, "rc-virtual-list": { - "version": "3.4.7", + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/rc-virtual-list/-/rc-virtual-list-3.4.11.tgz", + "integrity": "sha512-BvUUH60kkeTBPigN5F89HtGaA5jSP4y2aM6cJ4dk9Y42I9yY+h6i08wF6UKeDcxdfOU8j3I5HxkSS/xA77J3wA==", "requires": { + "@babel/runtime": "^7.20.0", "classnames": "^2.2.6", "rc-resize-observer": "^1.0.0", "rc-util": "^5.15.0" @@ -26639,6 +26857,15 @@ "version": "0.11.0", "dev": true }, + "react-resizable": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/react-resizable/-/react-resizable-3.0.4.tgz", + "integrity": "sha512-StnwmiESiamNzdRHbSSvA65b0ZQJ7eVQpPusrSmcpyGKzC0gojhtO62xxH6YOBmepk9dQTBi9yxidL3W4s3EBA==", + "requires": { + "prop-types": "15.x", + "react-draggable": "^4.0.3" + } + }, "react-router": { "version": "6.3.0", "requires": { @@ -26764,7 +26991,9 @@ } }, "regenerator-runtime": { - "version": "0.13.9" + "version": "0.13.10", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz", + "integrity": "sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==" }, "regenerator-transform": { "version": "0.15.0", @@ -27432,7 +27661,9 @@ "dev": true }, "string-convert": { - "version": "0.2.1" + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/string-convert/-/string-convert-0.2.1.tgz", + "integrity": "sha512-u/1tdPl4yQnPBjnVrmdLo9gtuLvELKsAoRapekWggdiQNvvvum+jYF329d84NAa660KQw7pB2n36KrIKVoXa3A==" }, "string-length": { "version": "4.0.2", @@ -28057,6 +28288,43 @@ "version": "1.1.2", "dev": true }, + "vscode-json-languageservice": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/vscode-json-languageservice/-/vscode-json-languageservice-4.2.1.tgz", + "integrity": "sha512-xGmv9QIWs2H8obGbWg+sIPI/3/pFgj/5OWBhNzs00BkYQ9UaB2F6JJaGB/2/YOZJ3BvLXQTC4Q7muqU25QgAhA==", + "dev": true, + "requires": { + "jsonc-parser": "^3.0.0", + "vscode-languageserver-textdocument": "^1.0.3", + "vscode-languageserver-types": "^3.16.0", + "vscode-nls": "^5.0.0", + "vscode-uri": "^3.0.3" + } + }, + "vscode-languageserver-textdocument": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/vscode-languageserver-textdocument/-/vscode-languageserver-textdocument-1.0.7.tgz", + "integrity": "sha512-bFJH7UQxlXT8kKeyiyu41r22jCZXG8kuuVVA33OEJn1diWOZK5n8zBSPZFHVBOu8kXZ6h0LIRhf5UnCo61J4Hg==", + "dev": true + }, + "vscode-languageserver-types": { + "version": "3.17.2", + "resolved": "https://registry.npmjs.org/vscode-languageserver-types/-/vscode-languageserver-types-3.17.2.tgz", + "integrity": "sha512-zHhCWatviizPIq9B7Vh9uvrH6x3sK8itC84HkamnBWoDFJtzBf7SWlpLCZUit72b3os45h6RWQNC9xHRDF8dRA==", + "dev": true + }, + "vscode-nls": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-nls/-/vscode-nls-5.2.0.tgz", + "integrity": "sha512-RAaHx7B14ZU04EU31pT+rKz2/zSl7xMsfIZuo8pd+KZO6PXtQmpevpq3vxvWNcrGbdmhM/rr5Uw5Mz+NBfhVng==", + "dev": true + }, + "vscode-uri": { + "version": 
"3.0.6", + "resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.0.6.tgz", + "integrity": "sha512-fmL7V1eiDBFRRnu+gfRWTzyPpNIHJTc4mWnFkwBUmO9U3KPgJAmTx7oxi2bl/Rh6HLdU7+4C9wlj0k2E4AdKFQ==", + "dev": true + }, "w3c-hr-time": { "version": "1.0.2", "dev": true, diff --git a/ui/package.json b/ui/package.json index 0cd3f6b7e..da870fbf3 100644 --- a/ui/package.json +++ b/ui/package.json @@ -6,7 +6,7 @@ "@ant-design/icons": "^4.7.0", "@azure/msal-browser": "^2.24.0", "@azure/msal-react": "^1.4.0", - "antd": "^4.20.2", + "antd": "^4.23.6", "axios": "^0.27.2", "classnames": "^2.3.2", "dagre": "^0.8.5", @@ -15,6 +15,7 @@ "react-dom": "^17.0.2", "react-flow-renderer": "^9.7.4", "react-query": "^3.38.0", + "react-resizable": "^3.0.4", "react-router-dom": "^6.3.0" }, "devDependencies": { @@ -27,6 +28,7 @@ "@types/node": "^16.11.26", "@types/react": "^17.0.43", "@types/react-dom": "^17.0.14", + "@types/react-resizable": "^3.0.3", "@typescript-eslint/eslint-plugin": "^5.30.7", "@typescript-eslint/parser": "^5.30.7", "babel-plugin-import": "^1.13.5", @@ -35,6 +37,7 @@ "eslint-config-prettier": "^8.5.0", "eslint-import-resolver-typescript": "^3.5.1", "eslint-plugin-import": "^2.26.0", + "eslint-plugin-json": "^3.1.0", "eslint-plugin-prettier": "^4.2.1", "eslint-plugin-react-hooks": "^4.6.0", "husky": "^8.0.1", @@ -51,7 +54,8 @@ "test": "craco test", "eject": "react-scripts eject", "lint:fix": "npx eslint --fix --ext ts --ext tsx src/ ", - "format": "npx prettier --write src/**" + "format": "npx prettier --write src/**", + "lintStaged": "lint-staged" }, "browserslist": { "production": [ diff --git a/ui/src/api/api.tsx b/ui/src/api/api.tsx index a95ab2bd5..6c8b6f665 100644 --- a/ui/src/api/api.tsx +++ b/ui/src/api/api.tsx @@ -38,14 +38,18 @@ export const fetchDataSource = async ( ) => { const axios = await authAxios(msalInstance); return axios - .get ( + .get ( `${getApiBaseUrl()}/projects/${project}/datasources/${dataSourceId}`, { params: { project: project, datasource: dataSourceId }, } ) .then((response) => { - return response.data; + if (response.data.message || response.data.detail) { + return Promise.reject(response.data.message || response.data.detail); + } else { + return response.data; + } }); }; @@ -109,33 +113,21 @@ export const fetchFeatureLineages = async (featureId: string) => { // Following are place-holder code export const createFeature = async (feature: Feature) => { const axios = await authAxios(msalInstance); - return axios - .post(`${getApiBaseUrl()}/features`, feature, { - headers: { "Content-Type": "application/json;" }, - params: {}, - }) - .then((response) => { - return response; - }) - .catch((error) => { - return error.response; - }); + return axios.post(`${getApiBaseUrl()}/features`, feature, { + headers: { "Content-Type": "application/json;" }, + params: {}, + }); }; -export const updateFeature = async (feature: Feature, id: string) => { +export const updateFeature = async (feature: Feature, id?: string) => { const axios = await authAxios(msalInstance); - feature.guid = id; - return await axios - .put(`${getApiBaseUrl()}/features/${id}`, feature, { - headers: { "Content-Type": "application/json;" }, - params: {}, - }) - .then((response) => { - return response; - }) - .catch((error) => { - return error.response; - }); + if (id) { + feature.guid = id; + } + return axios.put(`${getApiBaseUrl()}/features/${feature.guid}`, feature, { + headers: { "Content-Type": "application/json;" }, + params: {}, + }); }; export const listUserRole = async () => { @@ -245,6 +237,8 @@ 
export const authAxios = async (msalInstance: PublicClientApplication) => { if (error.response?.status === 403) { const detail = error.response.data.detail; window.location.href = "/responseErrors/403/" + detail; + } else { + return Promise.reject(error.response.data); } //TODO: handle other response errors } diff --git a/ui/src/components/CardDescriptions/index.tsx b/ui/src/components/CardDescriptions/index.tsx new file mode 100644 index 000000000..9c0d41498 --- /dev/null +++ b/ui/src/components/CardDescriptions/index.tsx @@ -0,0 +1,32 @@ +import React from "react"; +import { Card, Descriptions } from "antd"; + +export interface CardDescriptionsProps { + title?: string; + mapping: any[]; + descriptions: any; +} + +const CardDescriptions = (props: CardDescriptionsProps) => { + const { title, mapping, descriptions } = props; + + return descriptions ? ( + + + ) : null; +}; + +export default CardDescriptions; diff --git a/ui/src/components/FlowGraph/FlowGraph.tsx b/ui/src/components/FlowGraph/FlowGraph.tsx new file mode 100644 index 000000000..ef3f16033 --- /dev/null +++ b/ui/src/components/FlowGraph/FlowGraph.tsx @@ -0,0 +1,236 @@ +import React, { + MouseEvent as ReactMouseEvent, + forwardRef, + useCallback, + useEffect, + useRef, + useState, +} from "react"; +import ReactFlow, { + ConnectionLineType, + Controls, + Edge, + Node, + Elements, + getIncomers, + getOutgoers, + ReactFlowProvider, + isNode, + OnLoadParams, +} from "react-flow-renderer"; +import { Spin } from "antd"; +import { LoadingOutlined } from "@ant-design/icons"; +import { useSearchParams } from "react-router-dom"; +import cs from "classnames"; +import { FeatureLineage } from "@/models/model"; +import { isFeature, FeatureType } from "@/utils/utils"; +import LineageNode from "./LineageNode"; +import { NodeData, FlowGraphProps } from "./interface"; +import { getElements } from "./utils"; + +import styles from "./index.module.less"; + +const FlowGraphNodeTypes = { + "custom-node": LineageNode, +}; + +const defaultProps: FlowGraphProps = { + project: "", + snapGrid: [15, 15], + featureType: FeatureType.AllNodes, +}; + +const FlowGraph = (props: FlowGraphProps, ref: any) => { + const { + className, + style, + data, + loading, + height, + minHeight, + project, + nodeId, + featureType, + snapGrid, + } = { + ...defaultProps, + ...props, + }; + const [, setURLSearchParams] = useSearchParams(); + const flowRef = useRef+ {mapping.reduce((list: any, item) => { + const value = descriptions?.[item.key]; + if (value) { + list.push( + ++ {typeof value === "string" ? 
value : JSON.stringify(value)} + + ); + } + return list; + }, [])} +(); + const hasReadRef = useRef (false); + const elementRef = useRef >(); + const hasHighlight = useRef (false); + const [elements, setElements] = useState >([]); + + // Reset all node highlight status + const resetHighlight = useCallback(() => { + if ( + elementRef.current && + elementRef.current.length > 0 && + hasHighlight.current + ) { + hasHighlight.current = false; + setElements((state) => { + return state.map((element) => { + if (isNode(element)) { + element.style = { + ...element.style, + opacity: 1, + }; + element.data!.active = false; + } else { + element.animated = false; + } + return element; + }); + }); + } + }, [setElements]); + + // Highlight path of selected node, including all linked up and down stream nodes + const highlightPath = useCallback( + (node: Node ) => { + if (elementRef.current && elementRef.current.length > 0) { + hasHighlight.current = true; + setElements((elements) => { + const incomerIds = new Set( + getIncomers(node, elements).map((item) => item.id) + ); + const outgoerIds = new Set( + getOutgoers(node, elements).map((item) => item.id) + ); + + return elements.map((element) => { + if (isNode(element)) { + const highlight = + element.id === node.id || + incomerIds.has(element.id) || + outgoerIds.has(element.id); + element.style = { + ...element.style, + opacity: highlight ? 1 : 0.25, + }; + element.data = { + ...element.data, + active: + element.id === node.id && isFeature(element.data!.subtitle), + }; + } else { + const highlight = + element.source === node.id || element.target === node.id; + const animated = + incomerIds.has(element.source) && + (incomerIds.has(element.target) || node.id === element.target); + + element.animated = highlight || animated; + } + return element; + }); + }); + } + }, + [setElements] + ); + + // Fired when panel is clicked, reset all highlighted path, and remove the nodeId query string in url path. + const onPaneClick = useCallback(() => { + resetHighlight(); + setURLSearchParams({}); + }, [resetHighlight, setURLSearchParams]); + + const onElementClick = useCallback( + (e: ReactMouseEvent, element: Node | Edge) => { + e.stopPropagation(); + if (isNode(element)) { + setURLSearchParams({ + nodeId: element.id, + featureType: element.data!.subtitle, + }); + setTimeout(() => { + highlightPath(element); + }, 0); + } + }, + [highlightPath, setURLSearchParams] + ); + + const handleInit = useCallback( + ( + project: string, + data: FeatureLineage, + featureType?: FeatureType, + nodeId?: string + ) => { + const elements = (elementRef.current = getElements( + project, + data, + featureType + )); + setElements(elements); + if (nodeId) { + const node = elements?.find( + (item) => item.id === nodeId + ) as Node ; + if (node) { + highlightPath(node); + } + } + }, + [setElements, highlightPath] + ); + + // Fit the graph to the center of layout view when graph is initialized + const onLoad = (reactFlowInstance: OnLoadParams) => { + flowRef.current = reactFlowInstance; + flowRef.current?.fitView(); + }; + + useEffect(() => { + if (data) { + const type = hasHighlight.current ? 
FeatureType.AllNodes : featureType; + handleInit(project!, data, type, nodeId); + } + }, [data, project, nodeId, featureType, handleInit]); + + useEffect(() => { + if (elements.length > 0 && !hasReadRef.current) { + hasReadRef.current = true; + setTimeout(() => { + flowRef.current?.fitView(); + }, 0); + } + }, [elements]); + + return ( + } + > + + + + ); +}; + +const FlowGraphComponent = forwardRef+ ++ (FlowGraph); + +FlowGraphComponent.displayName = "FlowGraph"; + +export default FlowGraphComponent; diff --git a/ui/src/components/FlowGraph/LineageNode.tsx b/ui/src/components/FlowGraph/LineageNode.tsx new file mode 100644 index 000000000..27a99cc4f --- /dev/null +++ b/ui/src/components/FlowGraph/LineageNode.tsx @@ -0,0 +1,57 @@ +import React, { forwardRef, memo } from "react"; +import cs from "classnames"; +import { RightCircleOutlined } from "@ant-design/icons"; +import { useNavigate } from "react-router-dom"; +import { Handle, NodeProps, Position } from "react-flow-renderer"; +import { LineageNodeProps } from "./interface"; + +import styles from "./index.module.less"; + +const LineageNode = (props: LineageNodeProps, ref: any) => { + const navigate = useNavigate(); + + const { label, subtitle, version, borderColor, detialUrl, active } = + props.data; + + const nodeTitle = version ? `${label} (v${version})` : label; + const nodeSubtitle = subtitle.replace("feathr_", ""); + const nodeColorStyle = { + border: `2px solid ${borderColor}`, + }; + + const onNodeIconClick = () => { + if (detialUrl) { + navigate(detialUrl); + } + // `/projects/${project}/features/${featureId}`); + }; + + return ( + ++ ); +}; + +const LineageNodeComponent = forwardRef+++ + {nodeTitle} + {active && ( +++ )} + {nodeSubtitle}++ (LineageNode); + +LineageNodeComponent.displayName = "LineageNode"; + +export default memo(LineageNodeComponent); diff --git a/ui/src/components/FlowGraph/index.module.less b/ui/src/components/FlowGraph/index.module.less new file mode 100644 index 000000000..9e69f59d7 --- /dev/null +++ b/ui/src/components/FlowGraph/index.module.less @@ -0,0 +1,43 @@ +.flowGraph { + width: 100%; +} + +.lineageNode { + height: 100%; + + &Active { + overflow: hidden; + border-radius: 0.25rem; + border-width: 2px; + border-style: solid; + --tw-border-opacity: 1; + border-color: rgba(57, 35, 150, var(--tw-border-opacity)); + --tw-bg-opacity: 1; + background-color: rgba(57, 35, 150, var(--tw-bg-opacity)); + --tw-text-opacity: 1; + color: rgba(255, 255, 255, var(--tw-text-opacity)); + opacity: 1; + } + + .box { + padding: 4px 12px 7px; + } + + .title { + font-size: 15px; + font-weight: 700; + } + + .subtitle { + font-size: 10px; + font-style: italic; + text-overflow: ellipsis; + max-width: 135px; + overflow: hidden; + white-space: nowrap; + } + + .navigate { + padding: 4px 12px 7px; + } +} diff --git a/ui/src/components/FlowGraph/index.ts b/ui/src/components/FlowGraph/index.ts new file mode 100644 index 000000000..0f6d659d8 --- /dev/null +++ b/ui/src/components/FlowGraph/index.ts @@ -0,0 +1,5 @@ +import FlowGraph from "./FlowGraph"; + +export * from "./interface"; + +export default FlowGraph; diff --git a/ui/src/components/FlowGraph/interface.ts b/ui/src/components/FlowGraph/interface.ts new file mode 100644 index 000000000..0949dbe97 --- /dev/null +++ b/ui/src/components/FlowGraph/interface.ts @@ -0,0 +1,30 @@ +import { CSSProperties } from "react"; +import { FeatureLineage } from "@/models/model"; +import { FeatureType } from "@/utils/utils"; +import { NodeProps, ReactFlowProps } from "react-flow-renderer"; + 
+export interface NodeData { + id: string; + label: string; + subtitle: string; + featureId: string; + version: string; + borderColor?: string; + active?: boolean; + detialUrl?: string; +} + +export interface FlowGraphProps { + className?: string; + style?: CSSProperties; + minHeight?: string | number; + height?: string | number; + loading?: boolean; + data?: FeatureLineage; + nodeId?: string; + project?: string; + snapGrid?: ReactFlowProps["snapGrid"]; + featureType?: FeatureType; +} + +export interface LineageNodeProps extends NodeProps {} diff --git a/ui/src/components/FlowGraph/utils.ts b/ui/src/components/FlowGraph/utils.ts new file mode 100644 index 000000000..141962895 --- /dev/null +++ b/ui/src/components/FlowGraph/utils.ts @@ -0,0 +1,192 @@ +import { Feature, FeatureLineage, RelationData } from "@/models/model"; +import { FeatureType, getFeatureDetailUrl } from "@/utils/utils"; +import dagre from "dagre"; +import { + Node, + Edge, + ArrowHeadType, + Position, + Elements, +} from "react-flow-renderer"; +import { NodeData } from "./interface"; + +const featureTypeColors: Record = { + feathr_source_v1: "hsl(315, 100%, 50%)", + feathr_anchor_v1: "hsl(270, 100%, 50%)", + feathr_anchor_feature_v1: "hsl(225, 100%, 50%)", + feathr_derived_feature_v1: "hsl(135, 100%, 50%)", +}; + +const DEFAULT_WIDTH = 20; +const DEFAULT_HEIGHT = 36; + +const generateNode = (project: string, data: Feature): Node => { + return { + id: data.guid, + type: "custom-node", + style: { + border: `2px solid featureTypeColors[data.typeName]`, + }, + position: { + x: 0, + y: 0, + }, + data: { + id: data.guid, + label: data.displayText, + subtitle: data.typeName, + featureId: data.guid, + version: data.version, + borderColor: featureTypeColors[data.typeName], + detialUrl: getFeatureDetailUrl(project, data), + }, + }; +}; + +const generateEdge = ( + data: RelationData, + entityMap: Record +): Edge => { + let { fromEntityId: from, toEntityId: to, relationshipType } = data; + + if (relationshipType === "Consumes") { + [from, to] = [to, from]; + } + const sourceNode = entityMap?.[from]; + const targetNode = entityMap?.[to]; + + return { + id: `e-${from}_${to}`, + source: from, + target: to, + arrowHeadType: ArrowHeadType.ArrowClosed, + data: { + sourceTypeName: sourceNode?.typeName, + targetTypeName: targetNode?.typeName, + }, + }; +}; + +export const getLineageNodes = ( + project: string, + lineageData: FeatureLineage, + featureType: FeatureType +): Node [] => { + const { guidEntityMap } = lineageData; + if (!guidEntityMap) { + return []; + } + + return Object.values(guidEntityMap).reduce( + (nodes: Node [], item: Feature) => { + if ( + item.typeName !== "feathr_workspace_v1" && + (featureType === FeatureType.AllNodes || + item.typeName === featureType || + (featureType === FeatureType.AnchorFeature && + item.typeName === FeatureType.Anchor)) + ) { + nodes.push(generateNode(project, item)); + } + return nodes; + }, + [] as Node [] + ); +}; + +export const getLineageEdge = ( + lineageData: FeatureLineage, + featureType: FeatureType +): Edge[] => { + if (!lineageData.relations || !lineageData.guidEntityMap) { + return []; + } + + return lineageData.relations.reduce((edges: Edge[], item) => { + if (["Consumes", "Contains", "Produces"].includes(item.relationshipType)) { + const edge = generateEdge(item, lineageData.guidEntityMap!); + if ( + edges.findIndex((item) => item.id === edge.id) === -1 && + edge.data.sourceTypeName !== "feathr_workspace_v1" && + (featureType === FeatureType.AllNodes || + (featureType === 
FeatureType.AnchorFeature && + edge.data.sourceTypeName === FeatureType.Anchor && + edge.data.targetTypeName === FeatureType.AnchorFeature)) + ) { + edges.push(edge); + } + } + + return edges; + }, [] as Edge[]); +}; + +export const getElements = ( + project: string, + lineageData: FeatureLineage, + featureType: FeatureType = FeatureType.AllNodes, + direction = "LR" +) => { + const elements: Elements = []; + + const dagreGraph = new dagre.graphlib.Graph({ compound: true }); + + dagreGraph.setDefaultEdgeLabel(() => ({})); + dagreGraph.setGraph({ rankdir: direction }); + + const isHorizontal = direction === "LR"; + + const nodes = getLineageNodes(project, lineageData, featureType); + let edges = getLineageEdge(lineageData, featureType); + + const anchorEdges = edges.filter((item) => { + return ( + item.data.sourceTypeName === FeatureType.Anchor && + item.data.targetTypeName === FeatureType.AnchorFeature + ); + }); + + edges = edges.reduce((data: any, item) => { + const anchorEdge = anchorEdges.find((i: any) => i.target === item.target); + if (anchorEdge) { + if ( + !( + item.data.sourceTypeName === FeatureType.Source && + item.data.targetTypeName === FeatureType.AnchorFeature + ) + ) { + data.push(item); + } + } else { + data.push(item); + } + return data; + }, []); + + nodes.forEach((item) => { + dagreGraph.setNode(item.id, { + label: item.data!.label, + node: item, + width: item.data!.label.length * 8 + DEFAULT_WIDTH, + height: item.style?.height || DEFAULT_HEIGHT, + }); + elements.push(item); + }); + + edges?.forEach((item: any) => { + dagreGraph.setEdge(item.source, item.target); + elements.push(item); + }); + + dagre.layout(dagreGraph); + + nodes.forEach((item) => { + const nodeWithPosition = dagreGraph.node(item.id); + item.targetPosition = isHorizontal ? Position.Left : Position.Top; + item.sourcePosition = isHorizontal ? 
Position.Right : Position.Bottom; + item.position.x = nodeWithPosition.x; + item.position.y = nodeWithPosition.y - DEFAULT_HEIGHT / 2; + }); + + return elements; +}; diff --git a/ui/src/components/ProjectsSelect/index.tsx b/ui/src/components/ProjectsSelect/index.tsx new file mode 100644 index 000000000..ca5fddf9f --- /dev/null +++ b/ui/src/components/ProjectsSelect/index.tsx @@ -0,0 +1,51 @@ +import React from "react"; +import { Select } from "antd"; +import { fetchProjects } from "@/api"; +import { useQuery } from "react-query"; + +export interface ProjectsSelectProps { + width?: number; + defaultValue?: string; + onChange?: (value: string) => void; +} + +const ProjectsSelect = (props: ProjectsSelectProps) => { + const { width = 350, defaultValue, onChange, ...restProps } = props; + + const { isLoading, data: options } = useQuery< + { value: string; label: string }[] + >( + ["projectsSelect"], + async () => { + try { + const result = await fetchProjects(); + return result.map((item) => ({ + value: item, + label: item, + })); + } catch (e) { + return Promise.reject(e); + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +export default ProjectsSelect; diff --git a/ui/src/components/ResizeTable/ResizableTitle.tsx b/ui/src/components/ResizeTable/ResizableTitle.tsx new file mode 100644 index 000000000..863db673f --- /dev/null +++ b/ui/src/components/ResizeTable/ResizableTitle.tsx @@ -0,0 +1,37 @@ +import React from "react"; +import { Resizable } from "react-resizable"; +import ResizeHandle from "./ResizeHandle"; +import { ResizableTitleProps } from "./interface"; + +import styles from "./index.module.less"; + +const ResizableTitle = (props: ResizableTitleProps) => { + const { onResize, width, minWidth, ...restProps } = props; + + if (!width) { + return ; + } + + return ( + } + onResize={onResize} + minConstraints={minWidth ? 
[minWidth, minWidth] : undefined} + onResizeStart={() => { + document.body.classList.add(styles.colResize); + }} + onResizeStop={() => { + document.body.classList.remove(styles.colResize); + }} + draggableOpts={{ + enableUserSelectHack: false, + }} + > + + + ); +}; + +export default ResizableTitle; diff --git a/ui/src/components/ResizeTable/ResizeHandle.tsx b/ui/src/components/ResizeTable/ResizeHandle.tsx new file mode 100644 index 000000000..df2baaf0d --- /dev/null +++ b/ui/src/components/ResizeTable/ResizeHandle.tsx @@ -0,0 +1,29 @@ +import React, { forwardRef, LegacyRef } from "react"; + +import { ResizeHandleProps } from "./interface"; + +const ResizeHandle = ( + props: ResizeHandleProps, + ref: LegacyRef +) => { + const { handleAxis, ...restProps } = props; + + return ( + { + e.stopPropagation(); + }} + /> + ); +}; + +const ResizeHandleComponent = forwardRef ( + ResizeHandle +); + +ResizeHandleComponent.displayName = "ResizeHandleComponent"; + +export default ResizeHandleComponent; diff --git a/ui/src/components/ResizeTable/ResizeTable.tsx b/ui/src/components/ResizeTable/ResizeTable.tsx new file mode 100644 index 000000000..d90e9aa6d --- /dev/null +++ b/ui/src/components/ResizeTable/ResizeTable.tsx @@ -0,0 +1,68 @@ +import React, { useCallback, useMemo, useState } from "react"; +import { Table } from "antd"; +import cs from "classnames"; +import ResizableTitle from "./ResizableTitle"; +import { + ResizableTitleProps, + ResizeTableProps, + ResizeColumnType, +} from "./interface"; + +import styles from "./index.module.less"; + +const ResizeTable = (props: ResizeTableProps ) => { + const { + className, + columns: originColumns = [], + components: originComponents, + ...rest + } = props; + + const handleResize = useCallback((index) => { + return ((e, { size }) => { + setColumns((prevColumns) => { + const nextColumns = [...prevColumns!]; + nextColumns[index] = { ...nextColumns[index], width: size.width }; + return nextColumns; + }); + }) as ResizableTitleProps["onResize"]; + }, []); + + const [columns, setColumns] = useState( + originColumns?.map((column, index) => { + if (column.width && column.resize !== false) { + return { + ...column, + onHeaderCell: (col: ResizeColumnType ) => ({ + width: col.width, + minWidth: col.minWidth, + onResize: handleResize(index), + }), + }; + } + + return column; + }) + ); + + const components = useMemo(() => { + return { + ...originComponents, + header: { + ...originComponents?.header, + cell: ResizableTitle, + }, + }; + }, [originComponents]); + + return ( + + ); +}; + +export default ResizeTable; diff --git a/ui/src/components/ResizeTable/index.module.less b/ui/src/components/ResizeTable/index.module.less new file mode 100644 index 000000000..1fcde7c27 --- /dev/null +++ b/ui/src/components/ResizeTable/index.module.less @@ -0,0 +1,22 @@ +.colResize { + cursor: col-resize; + user-select: none; +} + +.resizableTable { + :global { + .react-resizable { + position: relative; + background-clip: padding-box; + } + .react-resizable-handle { + position: absolute; + width: 10px; + height: 100%; + bottom: 0; + right: -5px; + cursor: col-resize; + z-index: 1; + } + } +} diff --git a/ui/src/components/ResizeTable/index.tsx b/ui/src/components/ResizeTable/index.tsx new file mode 100644 index 000000000..26b5e7f18 --- /dev/null +++ b/ui/src/components/ResizeTable/index.tsx @@ -0,0 +1,5 @@ +import ResizeTable from "./ResizeTable"; + +export * from "./interface"; + +export default ResizeTable; diff --git a/ui/src/components/ResizeTable/interface.ts 
b/ui/src/components/ResizeTable/interface.ts new file mode 100644 index 000000000..c8535ecaf --- /dev/null +++ b/ui/src/components/ResizeTable/interface.ts @@ -0,0 +1,22 @@ +import { TableProps } from "antd"; +import type { ColumnType } from "antd/es/table"; +import { ResizeHandle, ResizableProps } from "react-resizable"; + +export interface ResizeTableProps
extends Omit , "columns"> { + columns?: ResizeColumnType []; +} + +export interface ResizeColumnType extends ColumnType { + resize?: boolean; + minWidth?: number; +} + +export interface ResizableTitleProps { + onResize?: ResizableProps["onResize"]; + width?: ResizableProps["width"]; + minWidth?: number; +} + +export interface ResizeHandleProps { + handleAxis?: ResizeHandle; +} diff --git a/ui/src/components/graph/graphNodeDetails.tsx b/ui/src/components/graph/graphNodeDetails.tsx index 2be5b1ec5..7aa003c6d 100644 --- a/ui/src/components/graph/graphNodeDetails.tsx +++ b/ui/src/components/graph/graphNodeDetails.tsx @@ -1,10 +1,10 @@ import React, { useEffect, useState } from "react"; import { useParams, useSearchParams } from "react-router-dom"; -import { fetchFeature } from "../../api"; -import { Feature } from "../../models/model"; +import { fetchFeature } from "@/api"; +import { Feature } from "@/models/model"; import { LoadingOutlined } from "@ant-design/icons"; import { Card, Spin, Typography } from "antd"; -import { isFeature } from "../../utils/utils"; +import { isFeature } from "@/utils/utils"; const { Title } = Typography; @@ -36,66 +36,57 @@ const GraphNodeDetails = () => { }, [featureType, project, nodeId]); return ( - <> - {loading ? ( - } /> - ) : ( - - {!feature && ( -+ {feature.attributes.transformation.defExpr && ( +Click on feature node to show metadata and metric details
+} + > + {!feature && ( + Click on feature node to show metadata and metric details
+ )} + {feature?.attributes.transformation && ( ++ {feature.attributes.transformation.transformExpr && ( + Expression: {feature.attributes.transformation.transformExpr}
+ )} + {feature.attributes.transformation.filter && ( + <p>Filter: {feature.attributes.transformation.filter}</p>
+ )} + {feature.attributes.transformation.aggFunc && ( +Aggregation: {feature.attributes.transformation.aggFunc}
)} - {feature?.attributes.transformation && ( -- + {feature.attributes.transformation.limit && ( +Transformation - {feature.attributes.transformation.transformExpr && ( -- Expression: {feature.attributes.transformation.transformExpr} -
- )} - {feature.attributes.transformation.filter && ( -Filter {feature.attributes.transformation.filter}
- )} - {feature.attributes.transformation.aggFunc && ( -Aggregation: {feature.attributes.transformation.aggFunc}
- )} - {feature.attributes.transformation.limit && ( -Limit: {feature.attributes.transformation.limit}
- )} - {feature.attributes.transformation.groupBy && ( -Group By: {feature.attributes.transformation.groupBy}
- )} - {feature.attributes.transformation.window && ( -Window: {feature.attributes.transformation.window}
- )} - {feature.attributes.transformation.defExpr && ( -Expression: {feature.attributes.transformation.defExpr}
- )} -Limit: {feature.attributes.transformation.limit}
)} - {feature?.attributes.key && feature.attributes.key.length > 0 && ( -- + {feature.attributes.transformation.groupBy && ( +Entity Key -Full name: {feature.attributes.key[0].fullName}
-Description: {feature.attributes.key[0].description}
-Key column: {feature.attributes.key[0].keyColumn}
-- Key column alias: {feature.attributes.key[0].keyColumnAlias} -
-Key column type: {feature.attributes.key[0].keyColumnType}
-Group By: {feature.attributes.transformation.groupBy}
)} - {feature?.attributes.type && ( -- + {feature.attributes.transformation.window && ( +Type -Dimension Type: {feature.attributes.type.dimensionType}
-Tensor Category: {feature.attributes.type.tensorCategory}
-Type: {feature.attributes.type.type}
-Value Type: {feature.attributes.type.valType}
-Window: {feature.attributes.transformation.window}
)} -Expression: {feature.attributes.transformation.defExpr}
+ )} + + )} + {feature?.attributes.key && feature.attributes.key.length > 0 && ( ++ + )} + {feature?.attributes.type && ( +Full name: {feature.attributes.key[0].fullName}
+Description: {feature.attributes.key[0].description}
+Key column: {feature.attributes.key[0].keyColumn}
+Key column alias: {feature.attributes.key[0].keyColumnAlias}
+Key column type: {feature.attributes.key[0].keyColumnType}
++ )} - > + ); }; diff --git a/ui/src/components/header/headerWidgetMenu.tsx b/ui/src/components/header/headerWidgetMenu.tsx index ca0bf8e38..4cb7753d8 100644 --- a/ui/src/components/header/headerWidgetMenu.tsx +++ b/ui/src/components/header/headerWidgetMenu.tsx @@ -1,6 +1,6 @@ import React from "react"; import { LogoutOutlined } from "@ant-design/icons"; -import { Menu } from "antd"; +import { Menu, MenuProps } from "antd"; import { IPublicClientApplication } from "@azure/msal-browser"; type Props = { instance: IPublicClientApplication }; @@ -9,32 +9,27 @@ const HeaderWidgetMenu = ({ instance }: Props) => { { key: "logout", icon:Type +Dimension Type: {feature.attributes.type.dimensionType}
+Tensor Category: {feature.attributes.type.tensorCategory}
+Type: {feature.attributes.type.type}
+Value Type: {feature.attributes.type.valType}
+, - value: "Logout", - callback: () => { - instance.logoutRedirect().catch((e) => { - console.error(e); - }); - }, + label: "Logout", }, ]; - // @ts-ignore - const onClick = ({ key }) => { - const item = menuItems.find((i) => i.key === key); - if (item && item.callback) item.callback(); + + const logout = () => { + instance.logoutRedirect().catch((e) => { + console.error(e); + }); + }; + + const onClick: MenuProps["onClick"] = ({ key }) => { + switch (key) { + case "logout": + logout(); + break; + default: + break; + } }; - return ( - - ); + + return ; }; export default HeaderWidgetMenu; diff --git a/ui/src/components/sidemenu/siteMenu.tsx b/ui/src/components/sidemenu/siteMenu.tsx index 0ea68fd6e..2159ebaac 100644 --- a/ui/src/components/sidemenu/siteMenu.tsx +++ b/ui/src/components/sidemenu/siteMenu.tsx @@ -8,16 +8,70 @@ import { ProjectOutlined, RocketOutlined, } from "@ant-design/icons"; -import { Link } from "react-router-dom"; +import { Link, useLocation } from "react-router-dom"; +import { useEffect, useState } from "react"; const { Title } = Typography; const { Sider } = Layout; +const menuItems = [ + { + key: "", + icon: , + label: Home, + }, + { + key: "projects", + icon: , + label: Projects, + }, + { + key: "datasources", + icon: , + label: Data Sources, + }, + { + key: "features", + icon: , + label: Features, + }, + { + key: "jobs", + icon: , + label: Jobs, + }, + { + key: "monitoring", + icon: , + label: Monitoring, + }, +]; + +const enableRBAC = window.environment?.enableRBAC; +const showManagement = enableRBAC + ? enableRBAC + : process.env.REACT_APP_ENABLE_RBAC; + +if (showManagement !== "true") { + menuItems.push({ + key: "management", + icon: , + label: Management, + }); +} + +const getMenuKey = (pathname: string) => { + return pathname.split("/")[1].toLocaleLowerCase(); +}; + const SideMenu = () => { - const enableRBAC = window.environment?.enableRBAC; - const showManagement = enableRBAC - ? 
enableRBAC - : process.env.REACT_APP_ENABLE_RBAC; + const location = useLocation(); + + const [current, setcurrent] = useState (getMenuKey(location.pathname)); + + useEffect(() => { + setcurrent(getMenuKey(location.pathname)); + }, [location.pathname]); return ( @@ -35,62 +89,9 @@ const SideMenu = () => { + selectedKeys={[current]} + items={menuItems} + /> ); }; diff --git a/ui/src/models/model.ts b/ui/src/models/model.ts index e18387d07..c45d0632f 100644 --- a/ui/src/models/model.ts +++ b/ui/src/models/model.ts @@ -80,11 +80,21 @@ export interface DataSourceAttributes { tags: string[]; timestampFormat: string; type: string; + qualified_name: string; + timestamp_format: string; + event_timestamp_column: string; +} + +export interface RelationData { + fromEntityId: string; + relationshipId: string; + relationshipType: string; + toEntityId: string; } export interface FeatureLineage { - guidEntityMap: any; - relations: any; + guidEntityMap: Record; + relations: RelationData[]; } export interface UserRole { diff --git a/ui/src/pages/dataSource/components/DataSourceTable/index.tsx b/ui/src/pages/dataSource/components/DataSourceTable/index.tsx new file mode 100644 index 000000000..951bd39fd --- /dev/null +++ b/ui/src/pages/dataSource/components/DataSourceTable/index.tsx @@ -0,0 +1,146 @@ +import React, { forwardRef, useRef } from "react"; +import { Button } from "antd"; +import { useQuery } from "react-query"; +import { useNavigate } from "react-router-dom"; +import { DataSource } from "@/models/model"; +import { fetchDataSources } from "@/api"; +import ResizeTable, { ResizeColumnType } from "@/components/ResizeTable"; + +export interface DataSourceTableProps { + project?: string; +} + +export interface SearchModel { + scope?: string; + roleName?: string; +} + +const DataSourceTable = (props: DataSourceTableProps, ref: any) => { + const navigate = useNavigate(); + + const { project } = props; + + const projectRef = useRef(project); + + const getDetialUrl = (guid: string) => { + return `/projects/${projectRef.current}/dataSources/${guid}`; + }; + + const columns: ResizeColumnType [] = [ + { + key: "name", + title: "Name", + ellipsis: true, + width: 200, + render: (record: DataSource) => { + return ( + + ); + }, + }, + { + key: "type", + title: "Type", + ellipsis: true, + width: 80, + render: (record: DataSource) => { + return record.attributes.type; + }, + }, + { + key: "path", + title: "Path", + width: 220, + render: (record: DataSource) => { + return record.attributes.path; + }, + }, + { + key: "preprocessing", + title: "Preprocessing", + ellipsis: true, + width: 190, + render: (record: DataSource) => { + return record.attributes.preprocessing; + }, + }, + { + key: "eventTimestampColumn", + title: "Event Timestamp Column", + ellipsis: true, + width: 190, + render: (record: DataSource) => { + return record.attributes.eventTimestampColumn; + }, + }, + { + key: "timestampFormat", + title: "Timestamp Format", + ellipsis: true, + width: 190, + render: (record: DataSource) => { + return record.attributes.timestampFormat; + }, + }, + { + title: "Action", + fixed: "right", + width: 130, + resize: false, + render: (record: DataSource) => { + return ( + + ); + }, + }, + ]; + + const { isLoading, data: tableData } = useQuery ( + ["dataSources", project], + async () => { + if (project) { + projectRef.current = project; + return await fetchDataSources(project); + } else { + return []; + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +const 
DataSourceTableComponent = forwardRef ( + DataSourceTable +); + +DataSourceTableComponent.displayName = "DataSourceTableComponent"; + +export default DataSourceTableComponent; diff --git a/ui/src/pages/dataSource/components/SearchBar/index.tsx b/ui/src/pages/dataSource/components/SearchBar/index.tsx new file mode 100644 index 000000000..9577bae35 --- /dev/null +++ b/ui/src/pages/dataSource/components/SearchBar/index.tsx @@ -0,0 +1,38 @@ +import React from "react"; +import { Form } from "antd"; +import ProjectsSelect from "@/components/ProjectsSelect"; + +export interface SearchBarProps { + defaultProject?: string; + onSearch: (values: any) => void; +} + +const { Item } = Form; + +const SearchBar = (props: SearchBarProps) => { + const [form] = Form.useForm(); + + const { defaultProject, onSearch } = props; + + return ( + + ++ ); +}; + +export default SearchBar; diff --git a/ui/src/pages/dataSource/dataSourceDetails.tsx b/ui/src/pages/dataSource/dataSourceDetails.tsx index 2548644b2..af82fe0d7 100644 --- a/ui/src/pages/dataSource/dataSourceDetails.tsx +++ b/ui/src/pages/dataSource/dataSourceDetails.tsx @@ -1,112 +1,85 @@ import React from "react"; import { LoadingOutlined } from "@ant-design/icons"; import { useNavigate, useParams } from "react-router-dom"; -import { Alert, Button, Card, Col, Row, Spin, Typography } from "antd"; -import { QueryStatus, useQuery } from "react-query"; +import { Alert, Space, Breadcrumb, PageHeader, Spin, Button } from "antd"; +import { Link } from "react-router-dom"; +import { useQuery } from "react-query"; import { AxiosError } from "axios"; -import { fetchDataSource } from "../../api"; -import { DataSource, DataSourceAttributes } from "../../models/model"; - -const { Title } = Typography; - -type DataSourceKeyProps = { dataSource: DataSource }; -const DataSourceKey = ({ dataSource }: DataSourceKeyProps) => { - const keys = dataSource.attributes; - return ( - <> - {keys && ( -- - - - )} - > - ); -}; - -type Params = { - project: string; - dataSourceId: string; -}; +import { fetchDataSource } from "@/api"; +import { DataSource } from "@/models/model"; +import { SourceAttributesMap } from "@/utils/attributesMapping"; +import CardDescriptions from "@/components/CardDescriptions"; const DataSourceDetails = () => { - const { project, dataSourceId } = useParams() as Params; const navigate = useNavigate(); - const loadingIcon =Data Source Attributes ---Name: {keys.name}
-Type: {keys.type}
-Path: {keys.path}
-Preprocessing: {keys.preprocessing}
-Event Timestamp Column: {keys.eventTimestampColumn}
-Timestamp Format: {keys.timestampFormat}
-Qualified Name: {keys.qualifiedName}
-Tags: {JSON.stringify(keys.tags)}
-; - const { status, error, data } = useQuery ( + + const { project = "", dataSourceId = "" } = useParams(); + + const { + isLoading, + error, + data = { attributes: {} } as DataSource, + } = useQuery ( ["dataSourceId", dataSourceId], - () => fetchDataSource(project, dataSourceId) + () => fetchDataSource(project, dataSourceId), + { + retry: false, + refetchOnWindowFocus: false, + } ); - const render = (status: QueryStatus): JSX.Element => { - switch (status) { - case "error": - return ( - - - ); - case "idle": - return ( -- - - ); - case "loading": - return ( -- - - ); - case "success": - if (data === undefined) { - return ( -- - - ); - } else { - return ( - <> - -- - - > - ); - } - } - }; + const { attributes } = data; - return{data.attributes.name} ----
-- {render(status)}; + return ( +++ ); }; export default DataSourceDetails; diff --git a/ui/src/pages/dataSource/dataSources.tsx b/ui/src/pages/dataSource/dataSources.tsx index 6d84aa0af..c36db0b12 100644 --- a/ui/src/pages/dataSource/dataSources.tsx +++ b/ui/src/pages/dataSource/dataSources.tsx @@ -1,20 +1,27 @@ -import { Card, Typography } from "antd"; +import { PageHeader } from "antd"; +import { useState } from "react"; import { useSearchParams } from "react-router-dom"; -import DataSourceList from "../../components/dataSourceList"; -const { Title } = Typography; +import DataSourceTable from "./components/DataSourceTable"; +import SearchBar from "./components/SearchBar"; const DataSources = () => { const [searchParams] = useSearchParams(); - const project = (searchParams.get("project") as string) ?? ""; - const keyword = (searchParams.get("keyword") as string) ?? ""; + + const [project, setProject] = useState+ ++ Data Sources + +Data Source Attributes + + } + extra={[ + , + , + ]} + > +} + > + + {error && + +} + + ( + searchParams.get("project") || undefined + ); + + const onSearch = ({ project }: { project: string }) => { + setProject(project); + }; return ( -); }; diff --git a/ui/src/pages/feature/components/FeatureForm/index.tsx b/ui/src/pages/feature/components/FeatureForm/index.tsx new file mode 100644 index 000000000..02f33fe8d --- /dev/null +++ b/ui/src/pages/feature/components/FeatureForm/index.tsx @@ -0,0 +1,87 @@ +import React, { forwardRef, useEffect, useState } from "react"; +import { Button, Form, Input, message } from "antd"; +import { useNavigate } from "react-router-dom"; +import { createFeature, updateFeature } from "@/api"; +import { FeatureAttributes, Feature } from "@/models/model"; + +export interface FeatureFormProps { + isNew: boolean; + editMode: boolean; + feature?: FeatureAttributes; +} + +const FeatureForm = (props: FeatureFormProps, ref: any) => { + const navigate = useNavigate(); + + const { isNew, editMode, feature } = props; + + const [createLoading, setCreateLoading] = useState- +Data Sources -- + + + (false); + + const [form] = Form.useForm(); + + const handleFinish = async (values: Feature) => { + setCreateLoading(true); + try { + if (isNew) { + await createFeature(values); + message.success("New feature created"); + } else if (feature?.qualifiedName) { + values.guid = feature.qualifiedName; + await updateFeature(values); + message.success("Feature is updated successfully"); + } + navigate("/features"); + } catch (err: any) { + message.error(err.detail || err.message, 8); + } finally { + setCreateLoading(false); + } + }; + + useEffect(() => { + if (feature) { + form.setFieldsValue(feature); + } + }, [feature, form]); + + return ( + <> + + + ++ + ++ + ++ + ++ + + + > + ); +}; + +const FeatureFormComponent = forwardRef(FeatureForm); + +FeatureFormComponent.displayName = "FeatureFormComponent"; + +export default FeatureFormComponent; diff --git a/ui/src/pages/feature/components/FeatureTable/index.tsx b/ui/src/pages/feature/components/FeatureTable/index.tsx new file mode 100644 index 000000000..69e9c1ae6 --- /dev/null +++ b/ui/src/pages/feature/components/FeatureTable/index.tsx @@ -0,0 +1,151 @@ +import React, { forwardRef, useRef } from "react"; +import { Button } from "antd"; +import { useQuery } from "react-query"; +import { useNavigate } from "react-router-dom"; +import { Feature } from "@/models/model"; +import { fetchFeatures } from "@/api"; +import ResizeTable, { ResizeColumnType } from "@/components/ResizeTable"; + +export interface 
DataSourceTableProps { + project?: string; + keyword?: string; +} + +export interface SearchModel { + scope?: string; + roleName?: string; +} + +const DataSourceTable = (props: DataSourceTableProps, ref: any) => { + const navigate = useNavigate(); + + const { project, keyword } = props; + + const projectRef = useRef(project); + + const getDetialUrl = (guid: string) => { + return `/projects/${projectRef.current}/features/${guid}`; + }; + + const columns: ResizeColumnType [] = [ + { + key: "name", + title: "Name", + ellipsis: true, + width: 200, + render: (record: Feature) => { + return ( + + ); + }, + }, + { + key: "type", + title: "Type", + ellipsis: true, + width: 120, + render: (record: Feature) => { + return record.typeName.replace(/feathr_|_v1/gi, ""); + }, + }, + { + key: "transformation", + title: "Transformation", + width: 220, + render: (record: Feature) => { + const { transformExpr, defExpr } = record.attributes.transformation; + return transformExpr || defExpr; + }, + }, + { + key: "entitykey", + title: "Entity Key", + ellipsis: true, + width: 120, + render: (record: Feature) => { + const key = record.attributes.key && record.attributes.key[0]; + if ("NOT_NEEDED" !== key.keyColumn) { + return `${key.keyColumn} (${key.keyColumnType})`; + } else { + return "N/A"; + } + }, + }, + { + key: "aggregation", + title: "Aggregation", + ellipsis: true, + width: 150, + render: (record: Feature) => { + const { transformation } = record.attributes; + return ( + <> + {transformation.aggFunc && `Type: ${transformation.aggFunc}`} +
+ {transformation.aggFunc && `Window: ${transformation.window}`} + > + ); + }, + }, + { + title: "Action", + fixed: "right", + width: 100, + resize: false, + render: (record: Feature) => { + return ( + + ); + }, + }, + ]; + + const { isLoading, data: tableData } = useQuery( + ["dataSources", project, keyword], + async () => { + if (project) { + projectRef.current = project; + return await fetchFeatures(project, 1, 10, keyword || ""); + } else { + return []; + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +const DataSourceTableComponent = forwardRef ( + DataSourceTable +); + +DataSourceTableComponent.displayName = "DataSourceTableComponent"; + +export default DataSourceTableComponent; diff --git a/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx b/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx new file mode 100644 index 000000000..868c866a7 --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/FeatureNodeDetail.tsx @@ -0,0 +1,44 @@ +import React from "react"; +import { Space } from "antd"; +import { Feature } from "@/models/model"; +import CardDescriptions from "@/components/CardDescriptions"; +import { + TransformationMap, + FeatureKeyMap, + TypeMap, +} from "@/utils/attributesMapping"; + +export interface FeatureNodeDetialProps { + feature: Feature; +} + +const FeatureNodeDetial = (props: FeatureNodeDetialProps) => { + const { feature } = props; + + const { attributes } = feature; + const { transformation, key, type } = attributes; + const FeatureKey = key?.[0]; + + return ( + + + ); +}; + +export default FeatureNodeDetial; diff --git a/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx b/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx new file mode 100644 index 000000000..fbf5be158 --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/SourceNodeDetial.tsx @@ -0,0 +1,22 @@ +import React from "react"; +import { DataSource } from "@/models/model"; +import { SourceAttributesMap } from "@/utils/attributesMapping"; +import CardDescriptions from "@/components/CardDescriptions"; + +export interface SourceNodeDetialProps { + source: DataSource; +} + +const SourceNodeDetial = (props: SourceNodeDetialProps) => { + const { source } = props; + const { attributes } = source; + return ( ++ + + + ); +}; + +export default SourceNodeDetial; diff --git a/ui/src/pages/feature/components/NodeDetails/index.tsx b/ui/src/pages/feature/components/NodeDetails/index.tsx new file mode 100644 index 000000000..8a3391cfd --- /dev/null +++ b/ui/src/pages/feature/components/NodeDetails/index.tsx @@ -0,0 +1,63 @@ +import React from "react"; +import { useParams, useSearchParams } from "react-router-dom"; +import { fetchFeature, fetchDataSource } from "@/api"; +import { LoadingOutlined } from "@ant-design/icons"; +import { useQuery } from "react-query"; +import { Spin, Typography } from "antd"; +import { FeatureType } from "@/utils/utils"; +import FeatureNodeDetail from "./FeatureNodeDetail"; +import SourceNodeDetial from "./SourceNodeDetial"; + +const { Paragraph } = Typography; + +const NodeDetails = () => { + const [searchParams] = useSearchParams(); + const { project } = useParams(); + const nodeId = searchParams.get("nodeId") as string; + const featureType = searchParams.get("featureType") as string; + + const isSource = featureType === FeatureType.Source; + const isFeature = + featureType === FeatureType.AnchorFeature || + featureType === FeatureType.DerivedFeature; + + const { 
isLoading, data } = useQuery ( + ["nodeDetails", project, nodeId], + async () => { + if (isSource || isFeature) { + const api = isSource ? fetchDataSource : fetchFeature; + return await api(project!, nodeId); + } + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + } + > + + {data ? ( + isSource ? ( ++ + ); +}; + +export default NodeDetails; diff --git a/ui/src/pages/feature/components/SearchBar/index.tsx b/ui/src/pages/feature/components/SearchBar/index.tsx new file mode 100644 index 000000000..1a32f28b2 --- /dev/null +++ b/ui/src/pages/feature/components/SearchBar/index.tsx @@ -0,0 +1,67 @@ +import React, { useRef } from "react"; +import { Form, Input, Button } from "antd"; +import { useNavigate } from "react-router-dom"; +import ProjectsSelect from "@/components/ProjectsSelect"; + +export interface SearchValue { + project?: string; + keyword?: string; +} + +export interface SearchBarProps { + defaultValues?: SearchValue; + onSearch?: (values: SearchValue) => void; +} + +const { Item } = Form; + +const SearchBar = (props: SearchBarProps) => { + const [form] = Form.useForm(); + + const navigate = useNavigate(); + + const { defaultValues, onSearch } = props; + + const timeRef = useRef+ ) : ( + + ) + ) : ( + !isLoading && ( + + Click on source or feature node to show metadata and metric + details + + ) + )} +(null); + + const onChangeKeyword = () => { + clearTimeout(timeRef.current); + timeRef.current = setTimeout(() => { + form.submit(); + }, 350); + }; + + return ( + + + ++ ); +}; + +export default SearchBar; diff --git a/ui/src/pages/feature/featureDetails.tsx b/ui/src/pages/feature/featureDetails.tsx index 549e5e3f7..fdecb7505 100644 --- a/ui/src/pages/feature/featureDetails.tsx +++ b/ui/src/pages/feature/featureDetails.tsx @@ -1,218 +1,116 @@ -import React, { useEffect, useState } from "react"; -import { Alert, Button, Card, Col, Row, Space, Spin, Typography } from "antd"; +import React, { useEffect, useRef, useState } from "react"; +import { + Alert, + Button, + PageHeader, + Breadcrumb, + Space, + Card, + Spin, + Descriptions, +} from "antd"; import { LoadingOutlined } from "@ant-design/icons"; -import { useNavigate, useParams } from "react-router-dom"; -import { QueryStatus, useQuery } from "react-query"; +import { Link, useNavigate, useParams } from "react-router-dom"; +import { useQuery } from "react-query"; import { AxiosError } from "axios"; -import { fetchFeature } from "../../api"; -import { Feature, InputFeature } from "../../models/model"; -import { FeatureLineage } from "../../models/model"; -import { fetchFeatureLineages } from "../../api"; -import { Elements } from "react-flow-renderer"; -import Graph from "../../components/graph/graph"; -import { getElements } from "../../components/graph/utils"; - -const { Title } = Typography; - -type FeatureKeyProps = { feature: Feature }; -const FeatureKey = ({ feature }: FeatureKeyProps) => { - const keys = feature.attributes.key; - return ( - <> - {keys && keys.length > 0 && ( -- - - - )} - > - ); -}; - -type FeatureTypeProps = { feature: Feature }; -const FeatureType = ({ feature }: FeatureTypeProps) => { - const type = feature.attributes.type; - return ( - <> - {type && ( -Entity Key ---Full Name: {keys[0].fullName}
-Key Column: {keys[0].keyColumn}
-Description: {keys[0].description}
-Key Column Alias: {keys[0].keyColumnAlias}
-Key Column Type: {keys[0].keyColumnType}
-- - - - )} - > - ); -}; - -type FeatureTransformationProps = { feature: Feature }; -const FeatureTransformation = ({ feature }: FeatureTransformationProps) => { - const transformation = feature.attributes.transformation; - return ( - <> - {transformation && ( -Type ---Dimension Type: {type.dimensionType}
-Tensor Category: {type.tensorCategory}
-Type: {type.type}
-Value Type: {type.valType}
-- - - - )} - > - ); -}; +import { fetchFeature, fetchFeatureLineages } from "@/api"; +import { Feature, InputFeature, FeatureLineage } from "@/models/model"; +import FlowGraph from "@/components/FlowGraph"; +import CardDescriptions from "@/components/CardDescriptions"; +import { + FeatureKeyMap, + TransformationMap, + TypeMap, +} from "@/utils/attributesMapping"; + +const contentStyle = { marginRight: 16 }; type InputAnchorFeaturesProps = { project: string; feature: Feature }; -const InputAnchorFeatures = ({ - project, - feature, -}: InputAnchorFeaturesProps) => { - const navigate = useNavigate(); - const inputAnchorFeatures = feature.attributes.inputAnchorFeatures; - return ( - <> - {inputAnchorFeatures && inputAnchorFeatures.length > 0 && ( -Transformation -- {transformation.transformExpr && ( --Expression: {transformation.transformExpr}
- )} - {transformation.filter &&Filter: {transformation.filter}
} - {transformation.aggFunc && ( -Aggregation: {transformation.aggFunc}
- )} - {transformation.limit &&Limit: {transformation.limit}
} - {transformation.groupBy && ( -Group By: {transformation.groupBy}
- )} - {transformation.window &&Window: {transformation.window}
} - {transformation.defExpr && ( -Expression: {transformation.defExpr}
- )} -- - - - )} - > - ); + +const InputAnchorFeatures = (props: InputAnchorFeaturesProps) => { + const { project, feature } = props; + + const { inputAnchorFeatures } = feature.attributes; + + return inputAnchorFeatures?.length > 0 ? ( +Input Anchor Features - {inputAnchorFeatures.map((input_feature) => ( - - ))} -+ + ) : null; }; type InputDerivedFeaturesProps = { project: string; feature: Feature }; -const InputDerivedFeatures = ({ - project, - feature, -}: InputDerivedFeaturesProps) => { - const navigate = useNavigate(); - const inputDerivedFeatures = feature.attributes.inputDerivedFeatures; - return ( - <> - {inputDerivedFeatures && inputDerivedFeatures.length > 0 && ( -+ {inputAnchorFeatures.map((input_feature) => ( + ++ + {input_feature.uniqueAttributes.qualifiedName} + + + ))} +- - - - )} - > - ); + +const InputDerivedFeatures = (props: InputDerivedFeaturesProps) => { + const { project, feature } = props; + + const { inputDerivedFeatures } = feature.attributes; + + return inputDerivedFeatures?.length ? ( +Input Derived Features - {inputDerivedFeatures.map((input_feature: InputFeature) => ( - - ))} -+ + ) : null; }; const FeatureLineageGraph = () => { - const { featureId } = useParams() as Params; + const { project, featureId } = useParams() as Params; const [lineageData, setLineageData] = useState+ {inputDerivedFeatures.map((input_feature: InputFeature) => ( + ++ + {input_feature.uniqueAttributes.qualifiedName} + + + ))} +({ - guidEntityMap: null, - relations: null, + guidEntityMap: {}, + relations: [], }); - const [elements, SetElements] = useState ([]); + const [loading, setLoading] = useState (false); + const mountedRef = useRef (true); + useEffect(() => { const fetchLineageData = async () => { setLoading(true); const data = await fetchFeatureLineages(featureId); - setLineageData(data); - setLoading(false); + if (mountedRef.current) { + setLineageData(data); + setLoading(false); + } }; fetchLineageData(); }, [featureId]); - // Generate graph data on client side, invoked after graphData or featureType is changed useEffect(() => { - const generateGraphData = async () => { - SetElements(getElements(lineageData, "all_nodes")!); + mountedRef.current = true; + return () => { + mountedRef.current = false; }; - - generateGraphData(); - }, [lineageData]); - - return ( - <> - {loading ? ( - } /> - ) : ( - - - - - )} - > - ); + }, []); + + return !loading ? ( +Lineage -- + + ) : null; }; type Params = { @@ -222,87 +120,77 @@ type Params = { const FeatureDetails = () => { const { project, featureId } = useParams() as Params; const navigate = useNavigate(); - const loadingIcon =+ ; - const { status, error, data } = useQuery ( + + const { + isLoading, + error, + data = { attributes: {} } as Feature, + } = useQuery ( ["featureId", featureId], - () => fetchFeature(project, featureId) + () => fetchFeature(project, featureId), + { + retry: false, + refetchOnWindowFocus: false, + } ); + const { attributes } = data; + const { transformation, key, type, name } = attributes; + const FeatureKey = key?.[0]; - const openLineageWindow = () => { - const lineageUrl = `/projects/${project}/lineage`; - navigate(lineageUrl); - }; - - const render = (status: QueryStatus): JSX.Element => { - switch (status) { - case "error": - return ( - - - ); - case "idle": - return ( -- - - ); - case "loading": - return ( -- - - ); - case "success": - if (data === undefined) { - return ( -- - - ); - } else { - return ( - <> - -- - - > - ); + return ( +{data.attributes.name} ---- - ----
-- - - - - - ++ ); }; export default FeatureDetails; diff --git a/ui/src/pages/feature/features.tsx b/ui/src/pages/feature/features.tsx index 275cde11f..9ace6ead6 100644 --- a/ui/src/pages/feature/features.tsx +++ b/ui/src/pages/feature/features.tsx @@ -1,20 +1,27 @@ -import { Button, Card, Space, Typography } from "antd"; -import { useNavigate, useSearchParams } from "react-router-dom"; -import FeatureList from "../../components/featureList"; - -const { Title } = Typography; +import { useState } from "react"; +import { PageHeader } from "antd"; +import { useSearchParams } from "react-router-dom"; +import SearchBar, { SearchValue } from "./components/SearchBar"; +import FeatureTable from "./components/FeatureTable"; const Features = () => { const [searchParams] = useSearchParams(); - const project = (searchParams.get("project") as string) ?? ""; - const keyword = (searchParams.get("keyword") as string) ?? ""; + + const [search, setProject] = useState+ ++ Features + +Feature Details + } - } - }; - - return{render(status)}; + extra={[ + , + ]} + > +} + > + + {error && + +} + + + + + + + ({ + project: searchParams.get("project") || undefined, + keyword: searchParams.get("keyword") || undefined, + }); + + const onSearch = (values: SearchValue) => { + setProject(values); + }; return ( -); }; diff --git a/ui/src/pages/feature/lineageGraph.tsx b/ui/src/pages/feature/lineageGraph.tsx index ac75dff91..d8b1473df 100644 --- a/ui/src/pages/feature/lineageGraph.tsx +++ b/ui/src/pages/feature/lineageGraph.tsx @@ -1,17 +1,17 @@ -import React, { useEffect, useState } from "react"; -import { Card, Col, Radio, Row, Spin, Tabs, Typography } from "antd"; +import React, { useEffect, useRef, useState } from "react"; +import { PageHeader, Row, Col, Radio, Tabs } from "antd"; import { useParams, useSearchParams } from "react-router-dom"; -import { Elements } from "react-flow-renderer"; -import Graph from "../../components/graph/graph"; -import { fetchProjectLineages } from "../../api"; -import { FeatureLineage } from "../../models/model"; -import { LoadingOutlined } from "@ant-design/icons"; -import GraphNodeDetails from "../../components/graph/graphNodeDetails"; -import { getElements } from "../../components/graph/utils"; -import { FeatureType } from "../../utils/utils"; +import FlowGraph from "@/components/FlowGraph"; +import { fetchProjectLineages } from "@/api"; +import { FeatureLineage } from "@/models/model"; +import { FeatureType } from "@/utils/utils"; +import NodeDetails from "./components/NodeDetails"; -const { Title } = Typography; -const { TabPane } = Tabs; +const items = [ + { label: "Metadata", key: "1", children:- +Features -- + + + }, + { label: "Metrics", key: "2", children: Under construction
}, // key is required
+  { label: "Jobs", key: "3", children: <div>Under construction</div>
}, +]; type Params = { project: string; @@ -22,90 +22,75 @@ const LineageGraph = () => { const nodeId = searchParams.get("nodeId") as string; const [lineageData, setLineageData] = useState({ - guidEntityMap: null, - relations: null, + guidEntityMap: {}, + relations: [], }); + const [loading, setLoading] = useState (false); - const [elements, SetElements] = useState ([]); - const [featureType, setFeatureType] = useState ("all_nodes"); + + const [featureType, setFeatureType] = useState ( + FeatureType.AllNodes + ); + + const mountedRef = useRef (true); // Fetch lineage data from server side, invoked immediately after component is mounted useEffect(() => { const fetchLineageData = async () => { setLoading(true); const data = await fetchProjectLineages(project); - setLineageData(data); - setLoading(false); + if (mountedRef.current) { + setLineageData(data); + setLoading(false); + } }; fetchLineageData(); }, [project]); - // Generate graph data on client side, invoked after graphData or featureType is changed + const toggleFeatureType = (type: FeatureType) => { + setFeatureType(type); + }; + useEffect(() => { - const generateGraphData = async () => { - SetElements(getElements(lineageData, featureType)!); + mountedRef.current = true; + return () => { + mountedRef.current = false; }; - - generateGraphData(); - }, [lineageData, featureType]); - - const toggleFeatureType = (type: string) => { - setFeatureType((prevType: string | null) => { - if (prevType === type) { - return null; - } - return type; - }); - }; + }, []); return ( -); }; diff --git a/ui/src/pages/feature/newFeature.tsx b/ui/src/pages/feature/newFeature.tsx index d51dd2aa0..50afd64c3 100644 --- a/ui/src/pages/feature/newFeature.tsx +++ b/ui/src/pages/feature/newFeature.tsx @@ -1,16 +1,13 @@ import React from "react"; -import { Card, Typography } from "antd"; -import FeatureForm from "../../components/featureForm"; - -const { Title } = Typography; +import { PageHeader } from "antd"; +import FeatureForm from "./components/FeatureForm"; const NewFeature = () => { return (- + +Lineage {project} ---toggleFeatureType(e.target.value)} - > - -All Nodes -Source -Anchor -- Anchor Feature - -- Derived Feature - -- {loading ? ( --} + + toggleFeatureType(e.target.value)} + > + +All Nodes +Source ++ Anchor Feature + ++ Derived Feature + ++
+ - ) : ( - -
- )} -- - - - - - -- -- - -Under construction
-- -Under construction
-+ + + + -); }; diff --git a/ui/src/pages/management/components/RoleForm/index.tsx b/ui/src/pages/management/components/RoleForm/index.tsx index 9e073abd8..0a77b1610 100644 --- a/ui/src/pages/management/components/RoleForm/index.tsx +++ b/ui/src/pages/management/components/RoleForm/index.tsx @@ -1,6 +1,6 @@ import React, { forwardRef, useCallback, useEffect, useState } from "react"; import { Form, Select, Input, Button, message } from "antd"; -import { listUserRole, addUserRole } from "../../../../api"; +import { listUserRole, addUserRole } from "@/api"; export interface RoleFormProps { getRole?: (isAdmin: boolean) => void; diff --git a/ui/src/pages/management/components/UserRolesTable/index.tsx b/ui/src/pages/management/components/UserRolesTable/index.tsx index d264b2691..9a72f1539 100644 --- a/ui/src/pages/management/components/UserRolesTable/index.tsx +++ b/ui/src/pages/management/components/UserRolesTable/index.tsx @@ -6,12 +6,12 @@ import React, { useRef, useState, } from "react"; -import { Table, Tag, Button, message, Popconfirm } from "antd"; +import { Tag, Button, message, Popconfirm } from "antd"; import { DeleteOutlined } from "@ant-design/icons"; -import { ColumnsType } from "antd/lib/table"; import dayjs from "dayjs"; -import { UserRole } from "../../../../models/model"; -import { listUserRole, deleteUserRole } from "../../../../api"; +import { UserRole } from "@/models/model"; +import { listUserRole, deleteUserRole } from "@/api"; +import ResizeTable, { ResizeColumnType } from "@/components/ResizeTable"; export interface UserRolesTableProps {} @@ -74,25 +74,32 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { } }; - const columns: ColumnsType- +Create Feature +- = [ + const columns: ResizeColumnType [] = [ { + key: "scope", title: "Scope (Project / Global)", dataIndex: "scope", ellipsis: true, + width: 330, + minWidth: 190, }, { title: "Role", dataIndex: "roleName", + ellipsis: true, width: 120, }, { title: "User", dataIndex: "userName", ellipsis: true, + width: 300, + minWidth: 100, }, { title: "Permissions", dataIndex: "access", + ellipsis: true, width: 240, render: (col: string[]) => { return col.map((tag) => { @@ -110,6 +117,7 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { title: "Reason", dataIndex: "createReason", ellipsis: true, + width: 300, }, { title: "Create By", @@ -138,6 +146,7 @@ const UserRolesTable = (props: UserRolesTableProps, ref: any) => { title: "Action", fixed: "right", width: 130, + resize: false, render: (col: string, record: UserRole) => { return ( { }, [fetchData]); return ( - ); }; diff --git a/ui/src/pages/management/management.tsx b/ui/src/pages/management/management.tsx index 79ba50daa..882048b1f 100644 --- a/ui/src/pages/management/management.tsx +++ b/ui/src/pages/management/management.tsx @@ -18,7 +18,7 @@ const Management = () => { return (
- + - + {showAlert && ( { + const navigate = useNavigate(); + + const { project } = props; + + const columns: ResizeColumnType [] = [ + { + key: "name", + title: "Name", + dataIndex: "name", + resize: false, + }, + { + key: "action", + title: "Action", + width: 130, + resize: false, + render: (record: Project) => { + const { name } = record; + return ( + + + + + ); + }, + }, + ]; + + const { isLoading, data: tableData } = useQuery( + ["Projects", project], + async () => { + const reuslt = await fetchProjects(); + + return reuslt.reduce((list, item: string) => { + const text = project?.trim().toLocaleLowerCase(); + if (!text || item.includes(text)) { + list.push({ name: item }); + } + return list; + }, [] as Project[]); + }, + { + retry: false, + refetchOnWindowFocus: false, + } + ); + + return ( + + ); +}; + +const ProjectTableComponent = forwardRef ( + ProjectTable +); + +ProjectTableComponent.displayName = "ProjectTableComponent"; + +export default ProjectTableComponent; diff --git a/ui/src/pages/project/components/SearchBar/index.tsx b/ui/src/pages/project/components/SearchBar/index.tsx new file mode 100644 index 000000000..4ac3cd29d --- /dev/null +++ b/ui/src/pages/project/components/SearchBar/index.tsx @@ -0,0 +1,51 @@ +import React, { forwardRef, useRef } from "react"; +import { Form, Input } from "antd"; + +export interface SearchBarProps { + onSearch: (values: any) => void; +} + +const { Item } = Form; + +const SearchBar = (props: SearchBarProps, ref: any) => { + const [form] = Form.useForm(); + + const { onSearch } = props; + + const timeRef = useRef (null); + + const onChangeKeyword = () => { + clearTimeout(timeRef.current); + timeRef.current = setTimeout(() => { + form.submit(); + }, 350); + }; + + return ( + + ++ ); +}; + +const SearchBarComponent = forwardRef(SearchBar); + +SearchBarComponent.displayName = "SearchBarComponent"; + +export default SearchBarComponent; diff --git a/ui/src/pages/project/projects.tsx b/ui/src/pages/project/projects.tsx index 03cbf3d48..932915089 100644 --- a/ui/src/pages/project/projects.tsx +++ b/ui/src/pages/project/projects.tsx @@ -1,16 +1,21 @@ -import React from "react"; -import { Card, Typography } from "antd"; -import ProjectList from "../../components/projectList"; - -const { Title } = Typography; +import React, { useState } from "react"; +import { PageHeader } from "antd"; +import ProjectTable from "./components/ProjectTable"; +import SearchBar from "./components/SearchBar"; const Projects = () => { + const [project, setProject] = useState (""); + + const onSearch = ({ project }: { project: string }) => { + setProject(project); + }; + return ( -); }; diff --git a/ui/src/site.css b/ui/src/site.css index cea4439f7..e1a42a944 100644 --- a/ui/src/site.css +++ b/ui/src/site.css @@ -4,8 +4,6 @@ } .card { - margin-top: 15px; - margin-right: 15px; box-shadow: 5px 8px 15px 5px rgba(208, 216, 243, 0.6); border-radius: 8px; } @@ -61,3 +59,7 @@ .dataSource-container { column-count: 1; } + +.display-flex { + display: "flex"; +} diff --git a/ui/src/utils/attributesMapping.ts b/ui/src/utils/attributesMapping.ts new file mode 100644 index 000000000..09e7459b7 --- /dev/null +++ b/ui/src/utils/attributesMapping.ts @@ -0,0 +1,48 @@ +import { + FeatureTransformation, + FeatureKey, + FeatureType, + DataSourceAttributes, +} from "@/models/model"; + +export const TransformationMap: Array<{ + label: string; + key: keyof FeatureTransformation; +}> = [ + { label: "Expression", key: "transformExpr" }, + { label: "Filter", key: "filter" }, + { label: "Aggregation", 
key: "aggFunc" }, + { label: "Limit", key: "limit" }, + { label: "Group By", key: "groupBy" }, + { label: "Window", key: "window" }, + { label: "Expression", key: "defExpr" }, +]; + +export const FeatureKeyMap: Array<{ label: string; key: keyof FeatureKey }> = [ + { label: "Full name", key: "fullName" }, + { label: "Description", key: "description" }, + { label: "Key column", key: "keyColumn" }, + { label: "Key column alias", key: "keyColumnAlias" }, + { label: "Key column type", key: "keyColumnType" }, +]; + +export const TypeMap: Array<{ label: string; key: keyof FeatureType }> = [ + { label: "Dimension Type", key: "dimensionType" }, + { label: "Tensor Category", key: "tensorCategory" }, + { label: "Type", key: "type" }, + { label: "Value Type", key: "valType" }, +]; + +export const SourceAttributesMap: Array<{ + label: string; + key: keyof DataSourceAttributes; +}> = [ + { label: "Name", key: "name" }, + { label: "Type", key: "type" }, + { label: "Path", key: "path" }, + { label: "Preprocessing", key: "preprocessing" }, + { label: "Event Timestamp Column", key: "event_timestamp_column" }, + { label: "Timestamp Forma", key: "timestamp_format" }, + { label: "Qualified Name", key: "qualified_name" }, + { label: "Tags", key: "tags" }, +]; diff --git a/ui/src/utils/utils.tsx b/ui/src/utils/utils.tsx index 85bfd8f42..9cd2c959b 100644 --- a/ui/src/utils/utils.tsx +++ b/ui/src/utils/utils.tsx @@ -1,3 +1,4 @@ +import { Feature } from "@/models/model"; import { Configuration, PublicClientApplication } from "@azure/msal-browser"; export const getMsalConfig = () => { @@ -16,8 +17,6 @@ export const getMsalConfig = () => { redirectUri: window.location.origin, }, }; - console.log("clientId = ", clientId); - console.log("authority = ", authority); return new PublicClientApplication(msalConfig); }; @@ -33,6 +32,19 @@ export const enum FeatureType { export const isFeature = (featureType: string) => { return ( featureType === FeatureType.AnchorFeature || - featureType === FeatureType.DerivedFeature + featureType === FeatureType.DerivedFeature || + featureType === FeatureType.Source ); }; + +export const getFeatureDetailUrl = (project: string, feature: Feature) => { + switch (feature.typeName) { + case FeatureType.Source: + return `/projects/${project}/dataSources/${feature.guid}`; + case FeatureType.AnchorFeature: + case FeatureType.DerivedFeature: + return `/projects/${project}/features/${feature.guid}`; + default: + return; + } +}; From afd930903be8368c4f0e71596e7ca5bd6ccb3c73 Mon Sep 17 00:00:00 2001 From: Blair Chen- +Projects -- + + + Date: Tue, 1 Nov 2022 19:19:44 +0800 Subject: [PATCH 07/18] Add release instructions for Release Candidate (#809) * Add release instructions for Release Candidate * Add a section for release versioning * Add a section for overall process triggered by the release manager --- .../dev_guide/feathr_overall_release_guide.md | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/docs/dev_guide/feathr_overall_release_guide.md b/docs/dev_guide/feathr_overall_release_guide.md index d0b16611a..5d6301a49 100644 --- a/docs/dev_guide/feathr_overall_release_guide.md +++ b/docs/dev_guide/feathr_overall_release_guide.md @@ -10,62 +10,87 @@ This document describes all the release process for the development team. ## Prerequisites -- Make sure the CI tests are passing so there are no surprises on the release day. +- Make sure the CI tests are passing prior to bug bash. - Make sure all the active PRs related to the release are merged. 
- ## When to Release -- For each major and minor version release, please follow these steps. -- For patch versions, there should be no releases. +The release process is triggered by the release manager. The release manager will decide when to release with following steps: + +1. Ensure Prerequisites are met. +2. Creation of Release Candidate(rc) on GitHub. +3. Bug Bash. +4. Creation of Release on GitHub. +5. Post Release announcement. + +## Release Versioning + +- Major and minor version: X.Y.Z +- Release Candidate: X.Y.Z-rcN ## Writing Release Note Write a release note following past examples [here](https://github.com/feathr-ai/feathr/releases). Read through the [commit log](https://github.com/feathr-ai/feathr/commits/main) to identify the commits after last release to include in the release note. Here are the major things to include -- highlights of the release -- improvements and changes of this release -- new contributors of this release +- Highlights of the release +- Improvements and changes of this release +- New contributors of this release ## Code Changes -Before the release is made, the version needs to be updated in following places + +Before the release candidate or release is made, the version needs to be updated in following places + - [build.sbt](https://github.com/feathr-ai/feathr/blob/main/build.sbt#L3) - For Maven release version - [version.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathr/version.py#L1) - For Feathr version - [conf.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/docs/conf.py#L27) - For documentation version -- [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/test/test_user_workspace/feathr_config.yaml#L84) - To set the spark runtime location for Azure Synapse and Azure Databricks used by test suite. Please update all .yaml files under this path. -- [azure_resource_provision.json](https://github.com/feathr-ai/feathr/blob/main/docs/how-to-guides/azure_resource_provision.json#L114) - To set the deployment template to pull the latest release image. -- [constants.py](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathr/constants.py#L31) - To set the default maven artifact version (Only needed when maven version is **NOT** the same as python sdk version) +- [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/test/test_user_workspace/feathr_config.yaml#L84) - To set the spark runtime location for Azure Synapse and Azure Databricks used by test suite. Please update all .yaml files under this path. - [package.json](https://github.com/feathr-ai/feathr/blob/main/ui/package.json#L3) - For Feathr UI version +Following file should only be updated for release, which means should be skipped for release candidate. + +- [azure_resource_provision.json](https://github.com/feathr-ai/feathr/blob/main/docs/how-to-guides/azure_resource_provision.json#L114) - To set the deployment template to pull the latest release image. + +## Release Branches + +Each major and minor release should have a release branch. The release branch should be named as `releases/vX.Y.Z` or `releases/vX.Y.Z-rcN` where `X.Y.Z` is the release version. The release branch should be created from the `main` branch. See past release branches [here](https://github.com/feathr-ai/feathr/branches/all?query=releases). + +## Release Tags + +Once the release branch is created, a release tag should be created from the release branch. 
The release tag should be named as `vX.Y.Z` or `vX.Y.Z-rcN` where `X.Y.Z` is the release version. See past release tags [here](https://github.com/feathr-ai/feathr/tags). + ## Triggering automated release pipelines -Our goal is to automate the release process as much as possible. So far, we have automated the following steps -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/docker-publish.yml) to build and publish for our UI and API container to [dockerhub](https://hub.docker.com/r/feathrfeaturestore/feathr-registry/tags). - **Triggers** - Nightly, branch with name pattern "releases/*" -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-pypi.yml) for publishing Python package to [PyPi](https://pypi.org/project/feathr/). +Once the release branch and release tag are created, the release pipelines will be triggered automatically. The release pipelines will build the release artifacts and publish them to Maven and PyPI. + +1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/docker-publish.yml) to build and publish for Feathr Registry docker images to [DockerHub](https://hub.docker.com/r/feathrfeaturestore/feathr-registry/tags). - **Triggers** - branch with name pattern "releases/*" + **Triggers** - Nightly or branch with name pattern "releases/*" -1. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) for publishing the jar to [maven/sonatype repository](https://oss.sonatype.org/). +2. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-pypi.yml) for publishing Python package to [PyPi](https://pypi.org/project/feathr/). -**PLEASE NOTE: To trigger the above workflows as part of release, create a new branch with pattern releases/v0.x.0**. See past release branches [here](https://github.com/feathr-ai/feathr/branches/all?query=releases). + **Triggers** - branch with name pattern "releases/*" +3. Automated [workflow](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) for publishing the jar to [maven/sonatype repository](https://oss.sonatype.org/). ## Upload Feathr Jar Run the command to generate the Java jar. After the jar is generated, please upload to [Azure storage](https://ms.portal.azure.com/#view/Microsoft_Azure_Storage/ContainerMenuBlade/~/overview/storageAccountId/%2Fsubscriptions%2Fa6c2a7cc-d67e-4a1a-b765-983f08c0423a%2FresourceGroups%2Fazurefeathrintegration%2Fproviders%2FMicrosoft.Storage%2FstorageAccounts%2Fazurefeathrstorage/path/public/etag/%220x8D9E6F64D62D599%22/defaultEncryptionScope/%24account-encryption-key/denyEncryptionScopeOverride//defaultId//publicAccessVal/Container) for faster access. ## Release PyPi + The automated workflow should take care of this, you can check under [actions](https://github.com/feathr-ai/feathr/actions/workflows/publish-to-pypi.yml) to see the triggered run and results. For manual steps, see [Python Package Release Guide](https://feathr-ai.github.io/feathr/dev_guide/python_package_release.html) ## Updating docker image for API and Registry + The automated workflow should take care of this as well, you can check under [actions](https://github.com/feathr-ai/feathr/actions/workflows/docker-publish.yml) to see the triggered run and results. 
For manual steps, see [Feathr Registry docker image](https://feathr-ai.github.io/feathr/dev_guide/build-and-push-feathr-registry-docker-image.html) ## Release Maven + The automated workflow should take of this too, you can check under [actions](https://github.com/feathr-ai/feathr/blob/main/.github/workflows/publish-to-maven.yml) to see the triggered run and results. For manual steps, see [Feathr Developer Guide for publishing to maven](https://feathr-ai.github.io/feathr/dev_guide/publish_to_maven.html) ## Testing + Run one of the sample [notebook](https://github.com/feathr-ai/feathr/blob/main/docs/samples/azure_synapse/product_recommendation_demo.ipynb) as it uses the latest package from Maven and PyPi. ## Announcement From 8899f185c8c5d0e713d14ac86c56e4929be4717f Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Tue, 1 Nov 2022 20:26:40 +0800 Subject: [PATCH 08/18] Bump version to 0.9.0-rc1 (#810) --- build.sbt | 2 +- docs/how-to-guides/local-spark-provider.md | 2 +- feathr_project/feathr/version.py | 2 +- feathr_project/test/test_user_workspace/feathr_config.yaml | 4 ++-- .../test_user_workspace/feathr_config_registry_purview.yaml | 4 ++-- .../feathr_config_registry_purview_rbac.yaml | 4 ++-- .../test/test_user_workspace/feathr_config_registry_sql.yaml | 4 ++-- .../test_user_workspace/feathr_config_registry_sql_rbac.yaml | 4 ++-- ui/package.json | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index 2919ddae6..2ad413ba2 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ import sbt.Keys.publishLocalConfiguration ThisBuild / resolvers += Resolver.mavenLocal ThisBuild / scalaVersion := "2.12.15" -ThisBuild / version := "0.8.0" +ThisBuild / version := "0.9.0-rc1" ThisBuild / organization := "com.linkedin.feathr" ThisBuild / organizationName := "linkedin" val sparkVersion = "3.1.3" diff --git a/docs/how-to-guides/local-spark-provider.md b/docs/how-to-guides/local-spark-provider.md index 433af64f3..0069322b8 100644 --- a/docs/how-to-guides/local-spark-provider.md +++ b/docs/how-to-guides/local-spark-provider.md @@ -36,7 +36,7 @@ A spark-submit script will auto generated in your workspace under `debug` folder spark-submit \ --master local[*] \ --name project_feathr_local_spark_test \ - --packages 
"org.apache.spark:spark-avro_2.12:3.3.0,com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,org.apache.logging.log4j:log4j-core:2.17.2,com.typesafe:config:1.3.4,com.fasterxml.jackson.core:jackson-databind:2.12.6.1,org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7,org.apache.hadoop:hadoop-common:2.7.7,org.apache.avro:avro:1.8.2,org.apache.xbean:xbean-asm6-shaded:4.10,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3,com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21,org.apache.kafka:kafka-clients:3.1.0,com.google.guava:guava:31.1-jre,it.unimi.dsi:fastutil:8.1.1,org.mvel:mvel2:2.2.8.Final,com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.3,com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.6,com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.12.6,com.jasonclawson:jackson-dataformat-hocon:1.1.0,com.redislabs:spark-redis_2.12:3.1.0,org.apache.xbean:xbean-asm6-shaded:4.10,com.google.protobuf:protobuf-java:3.19.4,net.snowflake:snowflake-jdbc:3.13.18,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,org.apache.commons:commons-lang3:3.12.0,org.xerial:sqlite-jdbc:3.36.0.3,com.github.changvvb:jackson-module-caseclass_2.12:1.1.1,com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.11.1,org.eclipse.jetty:jetty-util:9.3.24.v20180605,commons-io:commons-io:2.6,org.apache.hadoop:hadoop-azure:2.7.4,com.microsoft.azure:azure-storage:8.6.4,com.linkedin.feathr:feathr_2.12:0.8.0" \ + --packages "org.apache.spark:spark-avro_2.12:3.3.0,com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,org.apache.logging.log4j:log4j-core:2.17.2,com.typesafe:config:1.3.4,com.fasterxml.jackson.core:jackson-databind:2.12.6.1,org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7,org.apache.hadoop:hadoop-common:2.7.7,org.apache.avro:avro:1.8.2,org.apache.xbean:xbean-asm6-shaded:4.10,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3,com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21,org.apache.kafka:kafka-clients:3.1.0,com.google.guava:guava:31.1-jre,it.unimi.dsi:fastutil:8.1.1,org.mvel:mvel2:2.2.8.Final,com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.3,com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.12.6,com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.12.6,com.jasonclawson:jackson-dataformat-hocon:1.1.0,com.redislabs:spark-redis_2.12:3.1.0,org.apache.xbean:xbean-asm6-shaded:4.10,com.google.protobuf:protobuf-java:3.19.4,net.snowflake:snowflake-jdbc:3.13.18,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2,org.apache.commons:commons-lang3:3.12.0,org.xerial:sqlite-jdbc:3.36.0.3,com.github.changvvb:jackson-module-caseclass_2.12:1.1.1,com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.11.1,org.eclipse.jetty:jetty-util:9.3.24.v20180605,commons-io:commons-io:2.6,org.apache.hadoop:hadoop-azure:2.7.4,com.microsoft.azure:azure-storage:8.6.4,com.linkedin.feathr:feathr_2.12:0.9.0-rc1" \ --conf "spark.driver.extraClassPath=../target/scala-2.12/classes:jars/config-1.3.4.jar:jars/jackson-dataformat-hocon-1.1.0.jar:jars/jackson-module-caseclass_2.12-1.1.1.jar:jars/mvel2-2.2.8.Final.jar:jars/fastutil-8.1.1.jar" \ --conf "spark.hadoop.fs.wasbs.impl=org.apache.hadoop.fs.azure.NativeAzureFileSystem" \ --class com.linkedin.feathr.offline.job.FeatureJoinJob \ diff --git a/feathr_project/feathr/version.py b/feathr_project/feathr/version.py index 807119de6..f31e00e36 100644 --- a/feathr_project/feathr/version.py +++ b/feathr_project/feathr/version.py @@ -1 +1 @@ -__version__ = "0.8.0" \ No 
newline at end of file +__version__ = "0.9.0-rc1" \ No newline at end of file diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index e67c803ef..f463785d5 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -82,7 +82,7 @@ spark_config: # Feathr Job configuration. Support local paths, path start with http(s)://, and paths start with abfs(s):// # this is the default location so end users don't have to compile the runtime again. # feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: # workspace instance workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' @@ -93,7 +93,7 @@ spark_config: # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/ work_dir: 'dbfs:/feathr_getting_started' # this is the default location so end users don't have to compile the runtime again. - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml index f716da0b4..b6e3aacde 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_purview.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml index c842bc702..ffef212d2 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_purview_rbac.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" 
databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml index dcb73d827..8f6691725 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_sql.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml b/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml index 29c6889e8..03c5f75f1 100644 --- a/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config_registry_sql_rbac.yaml @@ -25,13 +25,13 @@ spark_config: workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' executor_size: 'Small' executor_num: 1 - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":1,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} work_dir: 'dbfs:/feathr_getting_started' - feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.8.0.jar" + feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" online_store: redis: diff --git a/ui/package.json b/ui/package.json index da870fbf3..b4e1ce820 100644 --- 
a/ui/package.json +++ b/ui/package.json @@ -1,6 +1,6 @@ { "name": "feathr-ui", - "version": "0.8.0", + "version": "0.9.0-rc1", "private": true, "dependencies": { "@ant-design/icons": "^4.7.0", From 995f5091570524f3f8484a20244928d289eb8989 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Wed, 2 Nov 2022 22:12:25 +0000 Subject: [PATCH 09/18] Fix tests to use mocks and fix get_result_df's databricks behavior Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/utils/job_utils.py | 49 ++++---- feathr_project/pyproject.toml | 1 - feathr_project/test/conftest.py | 12 +- .../test_user_workspace/feathr_config.yaml | 2 +- feathr_project/test/unit/utils/test_config.py | 1 - .../test/unit/utils/test_job_utils.py | 112 +++++++++++++++--- 6 files changed, 130 insertions(+), 47 deletions(-) diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index c804d4ca9..1d33855b5 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,6 +1,5 @@ -from multiprocessing.sharedctypes import Value from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import NamedTemporaryFile from typing import Union from loguru import logger @@ -9,6 +8,7 @@ from feathr.client import FeathrClient from feathr.constants import OUTPUT_FORMAT +from feathr.utils.platform import is_databricks def get_result_pandas_df( @@ -82,6 +82,9 @@ def get_result_df( Returns: Either Spark or pandas DataFrame. """ + if is_databricks() and client.spark_runtime != "databricks": + raise RuntimeError(f"The function is called from Databricks but the client.spark_runtime is {client.spark_runtime}.") + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: @@ -95,22 +98,22 @@ def get_result_df( "In local spark mode, the result files are expected to be stored at a local storage and thus `local_cache_path` argument will be ignored." ) local_cache_path = res_url + elif client.spark_runtime == "databricks": - if res_url.startswith("dbfs:"): + if not res_url.startswith("dbfs:"): + raise ValueError( + f"In Databricks, the result files are expected to be stored at a DBFS storage but res_url = {res_url}." + ) + + if is_databricks(): # Check if the function is being called from Databricks if local_cache_path is not None: logger.warning( "Result files are already in DBFS and thus `local_cache_path` will be ignored." ) local_cache_path = res_url - else: - # if local_cache_path params is not provided then create a temporary folder - if local_cache_path is None: - # We'll just use the name of a local TemporaryDirectory to cache the data into DBFS. - local_cache_path = TemporaryDirectory().name - - # Databricks uses "dbfs:/" prefix for spark paths - if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + elif local_cache_path is None: # Download the result from dbfs to local + local_cache_path = NamedTemporaryFile(delete=False).name + else: logger.warning("This utility function currently supports local spark and databricks. 
You may encounter unexpected results on other platforms.") # TODO elif azure_synapse @@ -127,16 +130,20 @@ def get_result_df( result_df = None - if spark is not None: - if data_format == "csv": - result_df = spark.read.option("header", True).csv(local_cache_path) + try: + if spark is not None: + if data_format == "csv": + result_df = spark.read.option("header", True).csv(local_cache_path) + else: + result_df = spark.read.format(data_format).load(local_cache_path) else: - result_df = spark.read.format(data_format).load(local_cache_path) - else: - result_df = _load_files_to_pandas_df( - dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. - data_format=data_format, - ) + result_df = _load_files_to_pandas_df( + dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. + data_format=data_format, + ) + except Exception as e: + logger.error(f"Failed to load result files from {local_cache_path} with format {data_format}.") + raise e return result_df diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index 0162ede04..5b7b2fc11 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -12,7 +12,6 @@ multi_line_output = 3 [tool.pytest.ini_options] markers = [ "notebooks: tests Jupyter notebooks", - "databricks: tests functions on a Databricks cluster", ] [build-system] diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index d1ecd081b..b8ee3f345 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -12,14 +12,10 @@ def workspace_dir() -> str: @pytest.fixture(scope="function") -def feathr_client_local(workspace_dir) -> FeathrClient: - """Test function-scoped Feathr client""" - return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config_local.yaml"))) - - -@pytest.fixture(scope="function") -def feathr_client_databricks(workspace_dir) -> FeathrClient: - """Test function-scoped Feathr client""" +def feathr_client(workspace_dir) -> FeathrClient: + """Test function-scoped Feathr client. + Note, cluster target (local, databricks, synapse) maybe overriden by the environment variables set at test machine. + """ return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config.yaml"))) diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index f463785d5..921148728 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -85,7 +85,7 @@ spark_config: feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0-rc1.jar" databricks: # workspace instance - workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' + workspace_instance_url: 'https://adb-4121774437039026.6.azuredatabricks.net' workspace_token_value: '' # config string including run time information, spark version, machine size, etc. 
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index a53e1764a..52adcae39 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,5 +1,4 @@ from pathlib import Path -from tempfile import NamedTemporaryFile import pytest diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 21392bf84..a692d9e40 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -2,6 +2,7 @@ # TODO test with no data files exception and unsupported format exception from pathlib import Path from tempfile import NamedTemporaryFile +from typing import Type from unittest.mock import MagicMock import pandas as pd @@ -18,7 +19,7 @@ def test__get_result_pandas_df(mocker: MockerFixture): - # Assert if the base function, get_result_df, called w/ proper args + """Test if the base function, get_result_df, called w/ proper args""" mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") client = MagicMock() data_format = "some_data_format" @@ -29,7 +30,7 @@ def test__get_result_pandas_df(mocker: MockerFixture): def test__get_result_spark_df(mocker: MockerFixture): - # Assert if the base function, get_result_df, called w/ proper args + """Test if the base function, get_result_df, called w/ proper args""" mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") client = MagicMock() spark = MagicMock() @@ -40,19 +41,80 @@ def test__get_result_spark_df(mocker: MockerFixture): mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path, spark=spark) -# Local spark is expected to use a local filepath for res_url. Therefore, we mark this test to run with databricks. -@pytest.mark.databricks -def test__get_result_df__with_local_cache_path(feathr_client_databricks: FeathrClient): - # TODO Assert there is a local copy of the file in the given local_cache_path - pass +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,local_cache_path,expected_local_cache_path", [ + # For local spark results, res_url must be a local path and local_cache_path will be ignored. + (False, "local", "some_res_url", None, "some_res_url"), + (False, "local", "some_res_url", "some_local_cache_path", "some_res_url"), + # For databricks results, res_url must be a dbfs path. + # If the function is called in databricks, local_cache_path will be ignored. 
+ (True, "databricks", "dbfs:/some_res_url", None, "/dbfs/some_res_url"), + (True, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "/dbfs/some_res_url"), + (False, "databricks", "dbfs:/some_res_url", None, "mocked_temp_path"), + (False, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "some_local_cache_path"), + ] +) +def test__get_result_df__with_local_cache_path( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + local_cache_path: str, + expected_local_cache_path: str, +): + """Test local_cache_path is used if provided""" + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime + client.feathr_spark_launcher.download_result = MagicMock() + mocked_load_files_to_pandas_df = mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + + # Mock is_databricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + + # Mock temporary file module + mocked_named_temporary_file = MagicMock() + mocked_named_temporary_file.name = expected_local_cache_path + mocker.patch("feathr.utils.job_utils.NamedTemporaryFile", return_value=mocked_named_temporary_file) + + data_format = "csv" + get_result_df(client, data_format=data_format, res_url=res_url, local_cache_path=local_cache_path) + + mocked_load_files_to_pandas_df.assert_called_once_with( + dir_path=expected_local_cache_path, + data_format=data_format, + ) -def test__get_result_df__exceptions(): +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,expected_error", [ + (True, "local", None, RuntimeError), # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + # Test ValueError when res_url is None + (False, "local", None, ValueError), + (True, "databricks", None, ValueError), + # Test ValueError when res_url is not a dbfs path but client.spark_runtime is databricks + (False, "databricks", "some_local_path", ValueError), + # Test ValueError when res_url does not exists or not able to access. 
+ (False, "local", "some_doesnt_exist_path", Exception), + ] +) +def test__get_result_df__exceptions( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + expected_error: Type[Exception], +): + """Test exceptions""" + # Mock client client = MagicMock() - client.get_job_result_uri = MagicMock(return_value=None) + client.get_job_result_uri = MagicMock(return_value=res_url) + client.spark_runtime = spark_runtime + + # Mock is_data_bricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) - # Test ValueError when res_url is None - with pytest.raises(ValueError): + with pytest.raises(expected_error): get_result_df(client) @@ -67,17 +129,27 @@ def test__get_result_df__exceptions(): ) def test__get_result_df( workspace_dir: str, - feathr_client_local: FeathrClient, + feathr_client: FeathrClient, data_format: str, output_filename: str, expected_count: int, ): + """Test get_result_df returns pandas DataFrame""" # Note: make sure the output file exists in the test_user_workspace res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock feathr_spark_launcher.download_result + feathr_client.feathr_spark_launcher.download_result = MagicMock() + + if feathr_client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + df = get_result_df( - client=feathr_client_local, + client=feathr_client, data_format=data_format, res_url=res_url, + local_cache_path=local_cache_path, ) assert isinstance(df, pd.DataFrame) assert len(df) == expected_count @@ -94,19 +166,29 @@ def test__get_result_df( ) def test__get_result_df__with_spark_session( workspace_dir: str, - feathr_client_local: FeathrClient, + feathr_client: FeathrClient, spark: SparkSession, data_format: str, output_filename: str, expected_count: int, ): + """Test get_result_df returns spark DataFrame""" # Note: make sure the output file exists in the test_user_workspace res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock feathr_spark_launcher.download_result + feathr_client.feathr_spark_launcher.download_result = MagicMock() + + if feathr_client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + df = get_result_df( - client=feathr_client_local, + client=feathr_client, data_format=data_format, res_url=res_url, spark=spark, + local_cache_path=local_cache_path, ) assert isinstance(df, DataFrame) assert df.count() == expected_count From 6198506558732d6bea11f4f912b5c07f5075b5b9 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 14:55:06 +0000 Subject: [PATCH 10/18] fix tem file to dir Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/utils/job_utils.py | 6 +++--- feathr_project/test/unit/utils/test_job_utils.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 1d33855b5..12f27c2cb 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,5 +1,5 @@ from pathlib import Path -from tempfile import NamedTemporaryFile +from tempfile import TemporaryDirectory from typing import Union from loguru import logger @@ -74,7 +74,7 @@ def get_result_df( Default to `avro` if not specified. res_url: Result URL to download files from. 
Note that this will not block the job so you need to make sure the job is finished and the result URL contains actual data. - local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + local_cache_path (optional): Specify the absolute download directory. if the user does not provide this, the function will create a temporary directory. spark (optional): Spark session. If provided, the function returns spark Dataframe. Otherwise, it returns pd.DataFrame. @@ -112,7 +112,7 @@ def get_result_df( ) local_cache_path = res_url elif local_cache_path is None: # Download the result from dbfs to local - local_cache_path = NamedTemporaryFile(delete=False).name + local_cache_path = TemporaryDirectory().name else: logger.warning("This utility function currently supports local spark and databricks. You may encounter unexpected results on other platforms.") diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index a692d9e40..1e005855e 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -1,7 +1,6 @@ # TODO with, without optional args # TODO test with no data files exception and unsupported format exception from pathlib import Path -from tempfile import NamedTemporaryFile from typing import Type from unittest.mock import MagicMock @@ -73,9 +72,9 @@ def test__get_result_df__with_local_cache_path( mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) # Mock temporary file module - mocked_named_temporary_file = MagicMock() - mocked_named_temporary_file.name = expected_local_cache_path - mocker.patch("feathr.utils.job_utils.NamedTemporaryFile", return_value=mocked_named_temporary_file) + mocked_named_temporary_dir = MagicMock() + mocked_named_temporary_dir.name = expected_local_cache_path + mocker.patch("feathr.utils.job_utils.TemporaryDirectory", return_value=mocked_named_temporary_dir) data_format = "csv" get_result_df(client, data_format=data_format, res_url=res_url, local_cache_path=local_cache_path) From ae9095c10ac2b06d971366a7969494cad2f0bf85 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 16:17:21 +0000 Subject: [PATCH 11/18] checkout the feature_derivations.py from main (it was temporally changed to goaround previous issues) Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/definition/feature_derivations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feathr_project/feathr/definition/feature_derivations.py b/feathr_project/feathr/definition/feature_derivations.py index 5717f12b2..9205685ce 100644 --- a/feathr_project/feathr/definition/feature_derivations.py +++ b/feathr_project/feathr/definition/feature_derivations.py @@ -36,9 +36,9 @@ def __init__(self, def validate_feature(self): """Validate the derived feature is valid""" - + input_feature_key_alias = [] - # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. + # for new entity in Purview, the attributes are Camel cases, while the old logic works as snake cases. # Modify the conversion to work with both schema. 
for feature in self.input_features: input_feature_key_alias.extend([x['keyColumnAlias'] for x in feature['attributes']['key']] if isinstance(feature,dict) else feature.key_alias) @@ -58,7 +58,7 @@ def to_feature_config(self) -> str: } {% endfor %} } - definition: {{derived_feature.transform.to_feature_config(False)}} + definition.sqlExpr: {{derived_feature.transform.to_feature_config(False)}} {{derived_feature.feature_type.to_feature_config()}} } """) From 59bd65caf3a21bd661091d442cf972287fa9797a Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 3 Nov 2022 17:03:28 +0000 Subject: [PATCH 12/18] Remove old databricks sample notebook. Change pip install feathr from the github main branch to pickup the latest changes always Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 2 +- ...atabricks_quickstart_nyc_taxi_driver.ipynb | 1444 ----------------- 2 files changed, 1 insertion(+), 1445 deletions(-) delete mode 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 0bc099f11..1c8b193d9 100755 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["!pip install feathr"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n"," "]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n","!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n","
"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb deleted file mode 100644 index ffd6e64d8..000000000 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb +++ /dev/null @@ -1,1444 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "384e5e16-7213-4186-9d04-09d03b155534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Feathr Feature Store on Databricks Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n", - "\n", - "The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n", - "\n", - "- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n", - "- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n", - "\n", - "However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). 
The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f00b9d0b-94d1-418f-89b9-25bbacb8b068", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "! pip install feathr pandavro scikit-learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n", - "import json\n", - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Get the required databricks credentials automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Get current databricks notebook context\n", - "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - "host_name = ctx.tags().get(\"browserHostName\").get()\n", - "host_token = ctx.apiToken().get()\n", - "cluster_id = ctx.tags().get(\"clusterId\").get()\n", - "\n", - "\n", - "\n", - "# databricks_config = 
{'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n", - "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n", - "os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n", - "# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n", - "os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n", - "os.environ['project_config__project_name']='feathr_getting_started'\n", - "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=\"\"\n", - "redis_host=\"\"\n", - "redis_password=\"\"\n", - "redis_ssl=\"\"\n", - "\n", - "# Set the resource link\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Configure required credentials (skip if you don't use those):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started2'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: ''\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: ''\n", - " jdbc_table: ''\n", - " snowflake:\n", - " snowflake_enabled: false\n", - " url: \"
.snowflakecomputing.com\"\n", - " user: \" \"\n", - " role: \" \"\n", - "spark_config:\n", - " # choice for spark runtime. Currently support: azure_synapse, databricks\n", - " # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n", - " spark_cluster: \"databricks\"\n", - " spark_result_output_parts: \"1\"\n", - "\n", - "online_store:\n", - " redis:\n", - " host: ' .redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https:// .azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. 
the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "16420730-582e-4e11-a343-efc0ddd35108", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "46f863c4-bb81-434a-a448-6b585031a221", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "05633bc3-9118-449b-9562-45fc437576c2", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_distance],\n", - " transform=\"f_trip_distance * 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_distance_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "output_path = 'dbfs:/feathrazure_test.avro'\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path\n", - " )\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "51f078e3-3f8f-4f10-b7f1-499ac8a9ff07", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "23c797b2-ac1a-4cf3-b0ed-c05216de3f37", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - 
"errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "from feathr.utils.job_utils import get_result_df\n", - "df_res = get_result_df(client, format=\"avro\", res_url = output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b9be042e-eb12-46b9-9d91-a0e5dd0c704f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "84745f36-5bac-49c0-903b-38828b923c7c", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and 
materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can then get the features from the online store (Redis):" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "nyc_driver_demo", - "notebookOrigID": 930353059183053, - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3.8.10 ('logistics')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "vscode": { - "interpreter": { - "hash": "6d25d3d1f1809ed0384c3d8e0cd4f1df57fe7bb936ead67f035c6ff1494f4e23" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 125cc3a4dbdb7769c9ad73355add741dd7955593 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Mon, 7 Nov 2022 23:02:05 +0000 Subject: [PATCH 13/18] Fix config and get_result_df for synapse --- .github/workflows/pull_request_push_test.yml | 6 +- .gitignore | 3 + .../databricks_quickstart_nyc_taxi_demo.ipynb | 1207 ++++++++++++++++- docs/samples/nyc_taxi_demo.ipynb | 178 ++- feathr_project/feathr/datasets/nyc_taxi.py | 2 +- feathr_project/feathr/utils/config.py | 206 ++- feathr_project/feathr/utils/job_utils.py | 12 +- feathr_project/pyproject.toml | 2 +- feathr_project/test/conftest.py | 15 + feathr_project/test/samples/test_notebooks.py | 14 +- feathr_project/test/unit/utils/test_config.py | 97 +- .../test/unit/utils/test_job_utils.py | 7 +- 12 files changed, 1585 insertions(+), 164 deletions(-) mode change 100755 => 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 0eb0e059b..3c9c7dd91 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -22,7 +22,7 @@ on: - "docs/**" - "ui/**" - "**/README.md" - + schedule: # Runs daily at 1 PM UTC (9 PM CST), will send notification to TEAMS_WEBHOOK - cron: '00 13 * * *' @@ -127,7 +127,7 @@ jobs: 
SQL1_USER: ${{secrets.SQL1_USER}} SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | - # run only test with databricks. run in 4 parallel jobs + # run only test with databricks. run in 6 parallel jobs pytest -n 6 feathr_project/test/ azure_synapse_test: @@ -196,7 +196,7 @@ jobs: SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | # skip databricks related test as we just ran the test; also seperate databricks and synapse test to make sure there's no write conflict - # run in 4 parallel jobs to make the time shorter + # run in 6 parallel jobs to make the time shorter pytest -n 6 feathr_project/test/ local_spark_test: diff --git a/.gitignore b/.gitignore index 976c0b239..4fe490c96 100644 --- a/.gitignore +++ b/.gitignore @@ -213,3 +213,6 @@ null/* project/.bloop metals.sbt .bsp/sbt.json + +# Feathr output debug folder +**/debug/ diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb old mode 100755 new mode 100644 index 1c8b193d9..4dc58eaf7 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1 +1,1206 @@ -{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3","showTitle":false,"title":""}},"outputs":[],"source":["dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n","dbutils.widgets.text(\"REDIS_KEY\", \"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n","\n","This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n","- This notebook skips feature registry which requires running Azure Purview. \n","- To make the online feature query work, you will need to configure the Redis endpoint. \n","\n","The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8","showTitle":false,"title":""}},"source":["## Prerequisite\n","\n","To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n","\n","To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. 
For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737","showTitle":false,"title":""}},"outputs":[],"source":["# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n","!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd","showTitle":false,"title":""}},"source":["## Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install Feathr and necessary dependencies.\n","1. Create shareable features with Feathr feature definition configs.\n","1. Create training data using point-in-time correct feature join\n","1. Train and evaluate a prediction model.\n","1. Materialize feature values for online scoring.\n","\n","The overall data flow is as follows:\n","\n"," "]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[],"source":["from datetime import datetime, timedelta\n","import glob\n","import json\n","from math import sqrt\n","import os\n","from pathlib import Path\n","import requests\n","from tempfile import TemporaryDirectory\n","\n","from azure.identity import AzureCliCredential, DefaultAzureCredential \n","from azure.keyvault.secrets import SecretClient\n","import pandas as pd\n","from pyspark.ml import Pipeline\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import GBTRegressor\n","from pyspark.sql import DataFrame, SparkSession\n","import pyspark.sql.functions as F\n","\n","import feathr\n","from feathr import (\n"," FeathrClient,\n"," # Feature data types\n"," BOOLEAN, FLOAT, INT32, ValueType,\n"," # Feature data sources\n"," INPUT_CONTEXT, HdfsSource,\n"," # Feature aggregations\n"," TypedKey, WindowAggTransformation,\n"," # Feature types and anchor\n"," DerivedFeature, Feature, FeatureAnchor,\n"," # Materialization\n"," BackfillTime, MaterializationSettings, RedisSink,\n"," # Offline feature computation\n"," FeatureQuery, ObservationSettings,\n",")\n","from feathr.datasets import nyc_taxi\n","from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n","from feathr.utils.config import generate_config\n","from feathr.utils.job_utils import get_result_df\n","\n","\n","print(f\"\"\"Feathr version: {feathr.__version__}\n","Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5","showTitle":false,"title":""}},"source":["## 2. Create Shareable Features with Feathr Feature Definition Configs\n","\n","In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. 
Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n","Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696","showTitle":false,"title":""}},"outputs":[],"source":["RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n","PROJECT_NAME = \"feathr_getting_started\"\n","\n","REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n","\n","# Use a databricks cluster\n","SPARK_CLUSTER = \"databricks\"\n","\n","# Databricks file system path\n","DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\""]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n","\n","Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n","`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[],"source":["# Redis credential\n","os.environ['REDIS_PASSWORD'] = REDIS_KEY\n","\n","# Setup databricks env configs\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","databricks_config = {\n"," 'run_name': \"FEATHR_FILL_IN\",\n"," # To use an existing all-purpose cluster:\n"," # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n"," # To use a new job cluster:\n"," 'new_cluster': {\n"," 'spark_version': \"11.2.x-scala2.12\",\n"," 'node_type_id': \"Standard_D3_v2\",\n"," 'num_workers':1,\n"," 'spark_conf': {\n"," 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n"," # Exclude conflicting packages if use feathr <= v0.8.0:\n"," 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n"," },\n"," },\n"," 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n"," 'spark_jar_task': {\n"," 'main_class_name': \"FEATHR_FILL_IN\",\n"," 'parameters': [\"FEATHR_FILL_IN\"],\n"," },\n","}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n","os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["### Configurations\n","\n","Feathr uses a yaml file to define configurations. 
Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[],"source":["config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n","\n","with open(config_path, 'r') as f: \n"," print(f.read())"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e","showTitle":false,"title":""}},"source":["All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["### Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=config_path)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["### View the NYC taxi fare dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n","\n","# Download the data file\n","df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n","df_raw.limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["### Defining features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n","\n","* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n","* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n","\n","Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n","\n","There are two types of features -- anchored features and derivated features:\n","\n","* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. 
\n","* **Derived features**: Features that are computed on top of other features.\n","\n","#### Define anchored features\n","\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea","showTitle":false,"title":""}},"outputs":[],"source":["TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n","TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df","showTitle":false,"title":""}},"outputs":[],"source":["# We define f_trip_distance and f_trip_time_duration features separately\n","# so that we can reuse them later for the derived features.\n","f_trip_distance = Feature(\n"," name=\"f_trip_distance\",\n"," feature_type=FLOAT,\n"," transform=\"trip_distance\",\n",")\n","f_trip_time_duration = Feature(\n"," name=\"f_trip_time_duration\",\n"," feature_type=FLOAT,\n"," transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n",")\n","\n","features = [\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," Feature(\n"," name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"trip_distance > 30.0\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_day_of_month\",\n"," feature_type=INT32,\n"," transform=\"dayofmonth(lpep_dropoff_datetime)\",\n"," ),\n"," Feature(\n"," name=\"f_hour_of_day\",\n"," feature_type=INT32,\n"," transform=\"hour(lpep_dropoff_datetime)\",\n"," ),\n","]\n","\n","# After you have defined features, bring them together to build the anchor to the source.\n","feature_anchor = FeatureAnchor(\n"," name=\"feature_anchor\",\n"," source=INPUT_CONTEXT, # Pass through source, i.e. 
observation data.\n"," features=features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["We can define the source with a preprocessing python function."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[],"source":["def preprocessing(df: DataFrame) -> DataFrame:\n"," import pyspark.sql.functions as F\n"," df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n"," return df\n","\n","batch_source = HdfsSource(\n"," name=\"nycTaxiBatchSource\",\n"," path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," preprocessing=preprocessing,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["For the features with aggregation, the supported functions are as follows:\n","\n","| Aggregation Function | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[],"source":["agg_key = TypedKey(\n"," key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\",\n",")\n","\n","agg_window = \"90d\"\n","\n","# Anchored features with aggregations\n","agg_features = [\n"," Feature(\n"," name=\"f_location_avg_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"AVG\",\n"," window=agg_window,\n"," ),\n"," ),\n"," Feature(\n"," name=\"f_location_max_fare\",\n"," key=agg_key,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(\n"," agg_expr=\"fare_amount_cents\",\n"," agg_func=\"MAX\",\n"," window=agg_window,\n"," ),\n"," ),\n","]\n","\n","agg_feature_anchor = FeatureAnchor(\n"," name=\"agg_feature_anchor\",\n"," source=batch_source, # External data source for feature. 
Typically a data table.\n"," features=agg_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["#### Define derived features\n","\n","We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[],"source":["derived_features = [\n"," DerivedFeature(\n"," name=\"f_trip_time_distance\",\n"," feature_type=FLOAT,\n"," input_features=[\n"," f_trip_distance,\n"," f_trip_time_duration,\n"," ],\n"," transform=\"f_trip_distance / f_trip_time_duration\",\n"," )\n","]"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["### Build features\n","\n","Finally, we build the features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(\n"," anchor_list=[feature_anchor, agg_feature_anchor],\n"," derived_feature_list=derived_features,\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## 3. Create Training Data Using Point-in-Time Correct Feature Join\n","\n","After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02feabc9-2f2f-43e8-898d-b28082798e98","showTitle":false,"title":""}},"outputs":[],"source":["feature_names = [feature.name for feature in features + agg_features + derived_features]\n","feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[],"source":["DATA_FORMAT = \"parquet\"\n","offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"67e81466-c736-47ba-b122-e640642c01cf","showTitle":false,"title":""}},"outputs":[],"source":["# Features that we want to request. 
Can use a subset of features\n","query = FeatureQuery(\n"," feature_list=feature_names,\n"," key=agg_key,\n",")\n","settings = ObservationSettings(\n"," observation_path=DATA_FILE_PATH,\n"," event_timestamp_column=TIMESTAMP_COL,\n"," timestamp_format=TIMESTAMP_FORMAT,\n",")\n","client.get_offline_features(\n"," observation_settings=settings,\n"," feature_query=query,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n"," execution_configurations=SparkExecutionConfiguration({\n"," \"spark.feathr.outputFormat\": DATA_FORMAT,\n"," }),\n"," output_path=offline_features_path,\n",")\n","\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e","showTitle":false,"title":""}},"outputs":[],"source":["# Show feature results\n","df = get_result_df(\n"," spark=spark,\n"," client=client,\n"," data_format=\"parquet\",\n"," res_url=offline_features_path,\n",")\n","df.select(feature_names).limit(5).toPandas()"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## 4. Train and Evaluate a Prediction Model\n","\n","After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n","\n","Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["### Load Train and Test Data from the Offline Feature Values"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0","showTitle":false,"title":""}},"outputs":[],"source":["# Train / test split\n","train_df, test_df = (\n"," df # Dataframe that we generated from get_offline_features call.\n"," .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n"," .where(F.col(\"f_trip_time_duration\") > 0)\n"," .fillna(0)\n"," .randomSplit([0.8, 0.2])\n",")\n","\n","print(f\"Num train samples: {train_df.count()}\")\n","print(f\"Num test samples: {test_df.count()}\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["### Build a ML Pipeline\n","\n","Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2a254361-63e9-45b2-8c19-40549762eacb","showTitle":false,"title":""}},"outputs":[],"source":["# Generate a feature vector column for SparkML\n","vector_assembler = VectorAssembler(\n"," inputCols=[x for x in df.columns if x in feature_names],\n"," outputCol=\"features\",\n",")\n","\n","# Define a model\n","gbt = GBTRegressor(\n"," 
featuresCol=\"features\",\n"," maxIter=100,\n"," maxDepth=5,\n"," maxBins=16,\n",")\n","\n","# Create a ML pipeline\n","ml_pipeline = Pipeline(stages=[\n"," vector_assembler,\n"," gbt,\n","])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["### Train and Evaluate the Model"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[],"source":["# Train a model\n","model = ml_pipeline.fit(train_df)\n","\n","# Make predictions\n","predictions = model.transform(test_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091","showTitle":false,"title":""}},"outputs":[],"source":["# Evaluate\n","evaluator = RegressionEvaluator(\n"," labelCol=\"label\",\n"," predictionCol=\"prediction\",\n",")\n","\n","rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n","mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n","print(f\"RMSE: {rmse}\\nMAE: {mae}\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e","showTitle":false,"title":""}},"outputs":[],"source":["# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n","predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n","\n","predictions_pdf.plot(\n"," x=\"index\",\n"," y=[\"label\", \"prediction\"],\n"," style=['-', ':'],\n"," figsize=(20, 10),\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"664d78cc-4a92-430c-9e05-565ba904558e","showTitle":false,"title":""}},"outputs":[],"source":["predictions_pdf.plot.scatter(\n"," x=\"label\",\n"," y=\"prediction\",\n"," xlim=(0, 100),\n"," ylim=(0, 100),\n"," figsize=(10, 10),\n",")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d","showTitle":false,"title":""}},"source":["## 5. 
Materialize Feature Values for Online Scoring\n","\n","While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n","\n","Note, only the features anchored to offline data source can be materialized."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37","showTitle":false,"title":""}},"outputs":[],"source":["materialized_feature_names = [feature.name for feature in agg_features]\n","materialized_feature_names"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n","\n"," # Get the last date from the dataset\n"," backfill_timestamp = (\n"," df_raw\n"," .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n"," .agg({TIMESTAMP_COL: \"max\"})\n"," .collect()[0][0]\n"," )\n","\n"," # Time range to materialize\n"," backfill_time = BackfillTime(\n"," start=backfill_timestamp,\n"," end=backfill_timestamp,\n"," step=timedelta(days=1),\n"," )\n","\n"," # Destinations:\n"," # For online store,\n"," redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n","\n"," # For offline store,\n"," # adls_sink = HdfsSink(output_path=)\n","\n"," settings = MaterializationSettings(\n"," name=FEATURE_TABLE_NAME + \".job\", # job name\n"," backfill_time=backfill_time,\n"," sinks=[redis_sink], # or adls_sink\n"," feature_names=materialized_feature_names,\n"," )\n","\n"," client.materialize_features(\n"," settings=settings,\n"," # Note, execution_configurations argument only works when using a new job cluster\n"," execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n"," )\n","\n"," client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9","showTitle":false,"title":""}},"source":["Now, you can retrieve features for online scoring as follows:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad","showTitle":false,"title":""}},"outputs":[],"source":["if REDIS_KEY and RESOURCE_PREFIX:\n"," # Note, to get a single key, you may use client.get_online_features instead\n"," materialized_feature_values = client.multi_get_online_features(\n"," feature_table=FEATURE_TABLE_NAME,\n"," keys=[\"239\", \"265\"],\n"," feature_names=materialized_feature_names,\n"," )\n"," materialized_feature_values"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3596dc71-a363-4b6a-a169-215c89978558","showTitle":false,"title":""}},"source":["## Cleanup"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820","showTitle":false,"title":""}},"outputs":[],"source":["# Remove temporary files\n","dbutils.fs.rm(\"dbfs:/tmp/\", 
recurse=True)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"databricks_quickstart_nyc_taxi_demo","notebookOrigID":2365994027381987,"widgets":{"REDIS_KEY":{"currentValue":"","nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca","widgetInfo":{"defaultValue":"","label":null,"name":"REDIS_KEY","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}},"RESOURCE_PREFIX":{"currentValue":"","nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1","widgetInfo":{"defaultValue":"","label":null,"name":"RESOURCE_PREFIX","options":{"validationRegex":null,"widgetType":"text"},"widgetType":"text"}}}},"kernelspec":{"display_name":"Python 3.10.8 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.8"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "843d3142-24ca-4bd1-9e31-b55163804fe3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n", + "dbutils.widgets.text(\"REDIS_KEY\", \"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Feature Store on Databricks Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", + "\n", + "This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n", + "- This notebook skips feature registry which requires running Azure Purview. \n", + "- To make the online feature query work, you will need to configure the Redis endpoint. \n", + "\n", + "The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c2ce58c7-9263-469a-bbb7-43364ddb07b8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prerequisite\n", + "\n", + "To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n", + "\n", + "To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4609d7ad-ad74-40fc-b97e-f440a0fa0737", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n", + "!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c81fa80c-bca6-4ae5-84ad-659a036977bd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Notebook Steps\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install Feathr and necessary dependencies.\n", + "1. Create shareable features with Feathr feature definition configs.\n", + "1. Create training data using point-in-time correct feature join\n", + "1. Train and evaluate a prediction model.\n", + "1. Materialize feature values for online scoring.\n", + "\n", + "The overall data flow is as follows:\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import RegressionEvaluator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.regression import GBTRegressor\n", + "from pyspark.sql import DataFrame\n", + "import pyspark.sql.functions as F\n", + "\n", + "import feathr\n", + "from feathr import (\n", + " FeathrClient,\n", + " # Feature data types\n", + " BOOLEAN,\n", + " FLOAT,\n", + " INT32,\n", + " ValueType,\n", + " # Feature data sources\n", + " INPUT_CONTEXT,\n", + " HdfsSource,\n", + " # Feature aggregations\n", + " TypedKey,\n", + " WindowAggTransformation,\n", + " # Feature types and anchor\n", + " DerivedFeature,\n", + " Feature,\n", + " FeatureAnchor,\n", + " # Materialization\n", + " BackfillTime,\n", + " MaterializationSettings,\n", + " RedisSink,\n", + " # Offline feature computation\n", + " FeatureQuery,\n", + " ObservationSettings,\n", + ")\n", + "from feathr.datasets import nyc_taxi\n", + "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", + "from feathr.utils.config import generate_config\n", + "from feathr.utils.job_utils import get_result_df\n", + "\n", + "\n", + "print(\n", + " f\"\"\"Feathr version: {feathr.__version__}\n", + "Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ab35fa01-b392-457e-8fde-7e445a3c39b5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", + "\n", + "In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n", + "Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "09f93a9f-7b33-4d91-8f31-ee3b20991696", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n", + "PROJECT_NAME = \"feathr_getting_started\"\n", + "\n", + "REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n", + "\n", + "# Use a databricks cluster\n", + "SPARK_CLUSTER = \"databricks\"\n", + "\n", + "# Databricks file system path\n", + "DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Redis credential\n", + "os.environ[\"REDIS_PASSWORD\"] = REDIS_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Configurations\n", + "\n", + "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field.\n", + "\n", + "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_cluster=SPARK_CLUSTER,\n", + " # You may set an existing cluster id here, but Databricks recommend to use new clusters for greater reliability.\n", + " cluster_name=None, # Set None to create a new job cluster\n", + ")\n", + "\n", + "with open(config_path, \"r\") as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "58d22dc1-7590-494d-94ca-3e2488c31c8e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." 
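To make the override rule concrete, here is a minimal sketch; the jar path is a placeholder, not a real released artifact:

```python
import os

# Config layers are joined with "__": spark_config -> databricks -> feathr_runtime_location.
# Setting this before the FeathrClient is created takes precedence over the generated yaml value.
os.environ["spark_config__databricks__feathr_runtime_location"] = "dbfs:/FileStore/jars/feathr_runtime.jar"
```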
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### View the NYC taxi fare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", + "\n", + "# Download the data file\n", + "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", + "df_raw.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Defining features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", + "\n", + "* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n", + "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", + "\n", + "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n", + "\n", + "There are two types of features -- anchored features and derivated features:\n", + "\n", + "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", + "* **Derived features**: Features that are computed on top of other features.\n", + "\n", + "#### Define anchored features\n", + "\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "75b8d2ed-84df-4446-ae07-5f715434f3ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", + "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "93abbcc2-562b-47e4-ad4c-1fedd7cc64df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We define f_trip_distance and f_trip_time_duration features separately\n", + "# so that we can reuse them later for the derived features.\n", + "f_trip_distance = Feature(\n", + " name=\"f_trip_distance\",\n", + " feature_type=FLOAT,\n", + " transform=\"trip_distance\",\n", + ")\n", + "f_trip_time_duration = Feature(\n", + " name=\"f_trip_time_duration\",\n", + " feature_type=FLOAT,\n", + " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", + ")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(\n", + " name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"trip_distance > 30.0\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_month\",\n", + " feature_type=INT32,\n", + " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_hour_of_day\",\n", + " feature_type=INT32,\n", + " transform=\"hour(lpep_dropoff_datetime)\",\n", + " ),\n", + "]\n", + "\n", + "# After you have defined features, bring them together to build the anchor to the source.\n", + "feature_anchor = FeatureAnchor(\n", + " name=\"feature_anchor\",\n", + " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", + " features=features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can define the source with a preprocessing python function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def preprocessing(df: DataFrame) -> DataFrame:\n", + " import pyspark.sql.functions as F\n", + "\n", + " df = df.withColumn(\n", + " \"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\")\n", + " )\n", + " return df\n", + "\n", + "\n", + "batch_source = HdfsSource(\n", + " name=\"nycTaxiBatchSource\",\n", + " path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " preprocessing=preprocessing,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "46f863c4-bb81-434a-a448-6b585031a221", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For the features with aggregation, the supported functions are as follows:\n", + "\n", + "| Aggregation Function | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "agg_key = TypedKey(\n", + " key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\",\n", + ")\n", + "\n", + "agg_window = \"90d\"\n", + "\n", + "# Anchored features with aggregations\n", + "agg_features = [\n", + " Feature(\n", + " name=\"f_location_avg_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"AVG\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + " Feature(\n", + " name=\"f_location_max_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"MAX\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + "]\n", + "\n", + "agg_feature_anchor = FeatureAnchor(\n", + " name=\"agg_feature_anchor\",\n", + " source=batch_source, # External data source for feature. 
Typically a data table.\n", + " features=agg_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Define derived features\n", + "\n", + "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "05633bc3-9118-449b-9562-45fc437576c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "derived_features = [\n", + " DerivedFeature(\n", + " name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " ],\n", + " transform=\"f_trip_distance / f_trip_time_duration\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build features\n", + "\n", + "Finally, we build the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(\n", + " anchor_list=[feature_anchor, agg_feature_anchor],\n", + " derived_feature_list=derived_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", + "\n", + "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "02feabc9-2f2f-43e8-898d-b28082798e98", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FORMAT = \"parquet\"\n", + "offline_features_path = str(\n", + " Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "67e81466-c736-47ba-b122-e640642c01cf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Features that we want to request. Can use a subset of features\n", + "query = FeatureQuery(\n", + " feature_list=feature_names,\n", + " key=agg_key,\n", + ")\n", + "settings = ObservationSettings(\n", + " observation_path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")\n", + "client.get_offline_features(\n", + " observation_settings=settings,\n", + " feature_query=query,\n", + " # Note, execution_configurations argument only works when using a new job cluster\n", + " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", + " execution_configurations=SparkExecutionConfiguration(\n", + " {\n", + " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", + " }\n", + " ),\n", + " output_path=offline_features_path,\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9871af55-25eb-41ee-a58a-fda74b1a174e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Show feature results\n", + "df = get_result_df(\n", + " spark=spark,\n", + " client=client,\n", + " data_format=\"parquet\",\n", + " res_url=offline_features_path,\n", + ")\n", + "df.select(feature_names).limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 4. Train and Evaluate a Prediction Model\n", + "\n", + "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", + "\n", + "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load Train and Test Data from the Offline Feature Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bd2cdc83-0920-46e8-9454-e5e6e7832ce0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train / test split\n", + "train_df, test_df = (\n", + " df.withColumn( # Dataframe that we generated from get_offline_features call.\n", + " \"label\", F.col(\"fare_amount\").cast(\"double\")\n", + " )\n", + " .where(F.col(\"f_trip_time_duration\") > 0)\n", + " .fillna(0)\n", + " .randomSplit([0.8, 0.2])\n", + ")\n", + "\n", + "print(f\"Num train samples: {train_df.count()}\")\n", + "print(f\"Num test samples: {test_df.count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build a ML Pipeline\n", + "\n", + "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "2a254361-63e9-45b2-8c19-40549762eacb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Generate a feature vector column for SparkML\n", + "vector_assembler = VectorAssembler(\n", + " inputCols=[x for x in df.columns if x in feature_names],\n", + " outputCol=\"features\",\n", + ")\n", + "\n", + "# Define a model\n", + "gbt = GBTRegressor(\n", + " featuresCol=\"features\",\n", + " maxIter=100,\n", + " maxDepth=5,\n", + " maxBins=16,\n", + ")\n", + "\n", + "# Create a ML pipeline\n", + "ml_pipeline = Pipeline(\n", + " stages=[\n", + " vector_assembler,\n", + " gbt,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Train and Evaluate the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train a model\n", + "model = ml_pipeline.fit(train_df)\n", + "\n", + "# Make predictions\n", + "predictions = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "1f9b584c-6228-4a02-a6c3-9b8dd2b78091", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Evaluate\n", + "evaluator = RegressionEvaluator(\n", + " labelCol=\"label\",\n", + " predictionCol=\"prediction\",\n", + ")\n", + "\n", + "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", + "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", + "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"inputWidgets": {}, + "nuid": "25c33abd-6e87-437d-a6a1-86435f065a1e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", + "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", + "\n", + "predictions_pdf.plot(\n", + " x=\"index\",\n", + " y=[\"label\", \"prediction\"],\n", + " style=[\"-\", \":\"],\n", + " figsize=(20, 10),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "664d78cc-4a92-430c-9e05-565ba904558e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "predictions_pdf.plot.scatter(\n", + " x=\"label\",\n", + " y=\"prediction\",\n", + " xlim=(0, 100),\n", + " ylim=(0, 100),\n", + " figsize=(10, 10),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8a56d165-c813-4ce0-8ae6-9f4d313c463d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 5. Materialize Feature Values for Online Scoring\n", + "\n", + "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", + "\n", + "Note, only the features anchored to offline data source can be materialized." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "751fa72e-8f94-40a1-994e-3e8315b51d37", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "materialized_feature_names = [feature.name for feature in agg_features]\n", + "materialized_feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", + "\n", + " # Get the last date from the dataset\n", + " backfill_timestamp = (\n", + " df_raw.select(\n", + " F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL)\n", + " )\n", + " .agg({TIMESTAMP_COL: \"max\"})\n", + " .collect()[0][0]\n", + " )\n", + "\n", + " # Time range to materialize\n", + " backfill_time = BackfillTime(\n", + " start=backfill_timestamp,\n", + " end=backfill_timestamp,\n", + " step=timedelta(days=1),\n", + " )\n", + "\n", + " # Destinations:\n", + " # For online store,\n", + " redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", + "\n", + " # For offline store,\n", + " # adls_sink = HdfsSink(output_path=)\n", + "\n", + " settings = MaterializationSettings(\n", + " name=FEATURE_TABLE_NAME + \".job\", # job name\n", + " backfill_time=backfill_time,\n", + " sinks=[redis_sink], # or adls_sink\n", + " feature_names=materialized_feature_names,\n", + " )\n", + "\n", + " client.materialize_features(\n", + " settings=settings,\n", + " # Note, execution_configurations argument only works when using a new job cluster\n", + " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", + " )\n", + "\n", + " client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Now, you can retrieve features for online scoring as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "424bc9eb-a47f-4b46-be69-8218d55e66ad", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " # Note, to get a single key, you may use client.get_online_features instead\n", + " materialized_feature_values = client.multi_get_online_features(\n", + " feature_table=FEATURE_TABLE_NAME,\n", + " keys=[\"239\", \"265\"],\n", + " feature_names=materialized_feature_names,\n", + " )\n", + " materialized_feature_values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3596dc71-a363-4b6a-a169-215c89978558", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b5fb292e-bbb6-4dd7-8e79-c62d9533e820", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Remove temporary files\n", + "dbutils.fs.rm(\"dbfs:/tmp/\", recurse=True)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "databricks_quickstart_nyc_taxi_demo", + "notebookOrigID": 2365994027381987, + "widgets": { + "REDIS_KEY": { + "currentValue": "", + "nuid": "d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "REDIS_KEY", + "options": { + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "RESOURCE_PREFIX": { + "currentValue": "", + "nuid": "87a26035-86fc-4dbd-8dd0-dc546c1c63c1", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "RESOURCE_PREFIX", + "options": { + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3.10.4 ('feathr')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index b789e9bf2..06b5cb340 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -89,18 +89,12 @@ }, "outputs": [], "source": [ - "from datetime import datetime, timedelta\n", - "import glob\n", - "import json\n", + "from datetime import timedelta\n", "from math import sqrt\n", "import os\n", "from pathlib import Path\n", - "import requests\n", "from tempfile import TemporaryDirectory\n", "\n", - "from azure.identity import AzureCliCredential, DefaultAzureCredential \n", - "from azure.keyvault.secrets import SecretClient\n", - "import pandas as pd\n", "from pyspark.ml import Pipeline\n", 
"from pyspark.ml.evaluation import RegressionEvaluator\n", "from pyspark.ml.feature import VectorAssembler\n", @@ -154,39 +148,44 @@ }, "outputs": [], "source": [ - "RESOURCE_PREFIX = None # TODO fill the value\n", + "RESOURCE_PREFIX = None # TODO fill the value used to deploy the resources via ARM template\n", "PROJECT_NAME = \"feathr_getting_started\"\n", "\n", - "# Data store root path. Could be a local file system path or Azure storage path like abfs or wasbs\n", - "DATA_STORE_PATH = TemporaryDirectory().name\n", - "\n", "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", - "# TODO -- Synapse spark pool name or Databricks cluster id\n", - "CLUSTER_NAME = None\n", "\n", - "# If set True, use an interactive browser authentication\n", + "# If \"azure_synapse\":\n", + "AZURE_SYNAPSE_SPARK_POOL = None # Set Synapse spark pool name to use an existing cluster\n", + "\n", + "# If \"databricks\":\n", + "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", + "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", + "\n", + "# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n", + "DATA_STORE_PATH = TemporaryDirectory().name\n", + "\n", + "# If set True, use an interactive browser authentication to get the redis password.\n", "USE_CLI_AUTH = False\n", "\n", + "REGISTER_FEATURES = False\n", + "\n", "# (For the notebook test pipeline) If true, use ScrapBook package to collect the results.\n", "SCRAP_RESULTS = False" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "KEY_VAULT = f\"{RESOURCE_PREFIX}kv\"\n", - "KEY_VAULT_URI = f\"https://{KEY_VAULT}.vault.azure.net\"\n", + "To use Databricks as the feathr client's target platform, you may need to set a databricks token to an environment variable like:\n", + "\n", + "`export DATABRICKS_WORKSPACE_TOKEN_VALUE=your-token`\n", + "\n", + "or in the notebook cell,\n", "\n", - "ADLS_PATH = f\"abfss://{RESOURCE_PREFIX}fs@{RESOURCE_PREFIX}dls.dfs.core.windows.net/feathr_project\"\n", + "`os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = your-token`\n", "\n", - "if SPARK_CLUSTER == \"azure_synapse\":\n", - " os.environ['spark_config__azure_synapse__dev_url'] = f\"https://{resource_prefix}syws.dev.azuresynapse.net\"\n", - " os.environ['spark_config__azure_synapse__pool_name'] = CLUSTER_NAME\n", - " os.environ['spark_config__azure_synapse__workspace_dir'] = f\"abfss://{adls_fs_name}@{resource_prefix}dls.dfs.core.windows.net/{PROJECT_NAME}\"" + "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context." 
] }, { @@ -195,22 +194,19 @@ "metadata": {}, "outputs": [], "source": [ - "if USE_CLI_AUTH:\n", - " !az login --use-device-code" + "# Force to use dbfs if the notebook is running on Databricks\n", + "if is_databricks() and not DATA_STORE_PATH.startswith(\"dbfs:\"):\n", + " DATA_STORE_PATH = f\"dbfs:/{DATA_STORE_PATH.lstrip('/')}\"" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", - "secret_client = SecretClient(vault_url=KEY_VAULT_URI, credential=credential)\n", - "retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value" + "if USE_CLI_AUTH:\n", + " !az login --use-device-code" ] }, { @@ -219,27 +215,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Redis credential\n", - "os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n", - "\n", - "if SPARK_CLUSTER == \"local\":\n", - " os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", - "\n", - "elif SPARK_CLUSTER == \"databricks\" and is_databricks():\n", - " ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - " databricks_config = {\n", - " 'run_name': \"FEATHR_FILL_IN\",\n", - " 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n", - " 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n", - " 'spark_jar_task': {\n", - " 'main_class_name': \"FEATHR_FILL_IN\",\n", - " 'parameters': [\"FEATHR_FILL_IN\"],\n", - " },\n", - " }\n", - " os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n", - " os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n", - " os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n", - " os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()\n" + "# Redis password\n", + "if 'REDIS_PASSWORD' not in os.environ:\n", + " # Try to get all the required credentials from Azure Key Vault\n", + " from azure.identity import AzureCliCredential, DefaultAzureCredential \n", + " from azure.keyvault.secrets import SecretClient\n", + "\n", + " vault_url = f\"https://{RESOURCE_PREFIX}kv.vault.azure.net\"\n", + " credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", + " secret_client = SecretClient(vault_url=vault_url, credential=credential)\n", + " retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value\n", + " os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n" ] }, { @@ -271,7 +257,14 @@ }, "outputs": [], "source": [ - "config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n", + "config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_cluster=SPARK_CLUSTER,\n", + " # cluster name will be ignored in \"local\" spark.\n", + " cluster_name=AZURE_SYNAPSE_SPARK_POOL if SPARK_CLUSTER == \"azure_synapse\" else DATABRICKS_CLUSTER_ID,\n", + " databricks_url=DATABRICKS_URL,\n", + ")\n", "\n", "with open(config_path, 'r') as f: \n", " print(f.read())" @@ -334,8 +327,8 @@ "metadata": {}, "outputs": [], "source": [ - "# To run on a local spark, start a spark session:\n", - "if SPARK_CLUSTER == \"local\":\n", + "# If the notebook is runnong on Jupyter, start a 
spark session:\n", + "if is_jupyter():\n", " spark = (\n", " SparkSession\n", " .builder\n", @@ -345,7 +338,7 @@ " .getOrCreate()\n", " )\n", " \n", - "# Else, you must already have spark session object available in databricks or synapse." + "# Else, you must already have a spark session object available in databricks or synapse." ] }, { @@ -472,7 +465,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can define the source with a preprocessing python function." + "We can define the source with a preprocessing python function. In order to make the source data accessible from the target spark cluster, we upload the data file into either DBFS or Azure Blob Storage if needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define data source path\n", + "if client.spark_runtime == \"local\" or (client.spark_runtime == \"databricks\" and is_databricks()):\n", + " # In local mode, we can use the same data path as the source.\n", + " # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n", + " data_source_path = DATA_FILE_PATH\n", + "else:\n", + " # Otherwise, upload the local file to dbfs.\n", + " data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) " ] }, { @@ -488,7 +497,7 @@ "\n", "batch_source = HdfsSource(\n", " name=\"nycTaxiBatchSource\",\n", - " path=DATA_FILE_PATH,\n", + " path=data_source_path,\n", " event_timestamp_column=TIMESTAMP_COL,\n", " preprocessing=preprocessing,\n", " timestamp_format=TIMESTAMP_FORMAT,\n", @@ -692,7 +701,7 @@ " key=agg_key,\n", ")\n", "settings = ObservationSettings(\n", - " observation_path=DATA_FILE_PATH,\n", + " observation_path=data_source_path,\n", " event_timestamp_column=TIMESTAMP_COL,\n", " timestamp_format=TIMESTAMP_FORMAT,\n", ")\n", @@ -889,12 +898,14 @@ "metadata": {}, "outputs": [], "source": [ - "try:\n", - " client.register_features()\n", - "except KeyError:\n", - " # TODO temporarily go around the \"Already exists\" error\n", - " \n", - " client.list_registered_features(project_name=PROJECT_NAME)" + "if REGISTER_FEATURES:\n", + " try:\n", + " client.register_features()\n", + " except KeyError:\n", + " # TODO temporarily go around the \"Already exists\" error\n", + " pass \n", + " print(client.list_registered_features(project_name=PROJECT_NAME))\n", + " # You can get the actual features too by calling client.get_features_from_registry(PROJECT_NAME)" ] }, { @@ -915,29 +926,6 @@ "Note, only the features anchored to offline data source can be materialized." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get registered features\n", - "registered_features_dict = client.get_features_from_registry(PROJECT_NAME)\n", - "\n", - "observation_feature_names = []\n", - "materialized_feature_names = []\n", - "\n", - "for feature_name, feature in registered_features_dict.items():\n", - " if feature.key[0].key_column == \"NOT_NEEDED\":\n", - " observation_feature_names.append(feature_name)\n", - " else:\n", - " materialized_feature_names.append(feature_name)\n", - " \n", - "print(f\"Features that will be extracted directly from the observation: {observation_feature_names}\")\n", - "print(\"\")\n", - "print(f\"Features that will be extracted from the source data and materialized to online storage: {materialized_feature_names}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -988,7 +976,7 @@ " name=FEATURE_TABLE_NAME + \".job\", # job name\n", " backfill_time=backfill_time,\n", " sinks=[redis_sink], # or adls_sink\n", - " feature_names=materialized_feature_names,\n", + " feature_names=[feature.name for feature in agg_features],\n", ")\n", "\n", "client.materialize_features(\n", @@ -1016,7 +1004,7 @@ "materialized_feature_values = client.multi_get_online_features(\n", " feature_table=FEATURE_TABLE_NAME,\n", " keys=[\"239\", \"265\"],\n", - " feature_names=materialized_feature_names,\n", + " feature_names=[feature.name for feature in agg_features],\n", ")\n", "materialized_feature_values" ] @@ -1034,7 +1022,7 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO: Unregister or any other cleanups." + "# TODO: Unregister, delete cached files or do any other cleanups." ] }, { @@ -1083,7 +1071,7 @@ }, "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3.10.4 ('feathr')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1101,7 +1089,7 @@ }, "vscode": { "interpreter": { - "hash": "ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2" + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" } } }, diff --git a/feathr_project/feathr/datasets/nyc_taxi.py b/feathr_project/feathr/datasets/nyc_taxi.py index ec605aae6..e00fa7150 100644 --- a/feathr_project/feathr/datasets/nyc_taxi.py +++ b/feathr_project/feathr/datasets/nyc_taxi.py @@ -73,7 +73,7 @@ def get_spark_df( if is_databricks(): # Databricks uses "dbfs:/" prefix for spark paths if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) + local_cache_path = f"dbfs:/{local_cache_path.lstrip('/')}" # Databricks uses "/dbfs/" prefix for python paths python_local_cache_path = local_cache_path.replace("dbfs:", "/dbfs") # TODO add "if is_synapse()" diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 9a9438567..47ac84679 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -1,61 +1,209 @@ +from copy import deepcopy +import os +import json from tempfile import NamedTemporaryFile +from typing import Dict +import yaml +from feathr.utils.platform import is_databricks -FEATHR_CONFIG_TEMPLATE = """ -api_version: 1 -project_config: - project_name: {project_name} +DEFAULT_FEATHR_CONFIG = { + "api_version": 1, + "project_config": {}, # "project_name" + "feature_registry": {}, # "api_endpoint" + "spark_config": { + # "spark_cluster". 
Currently support 'azure_synapse', 'databricks', and 'local' + "spark_result_output_parts": "1", + }, + "offline_store": { + "adls": {"adls_enabled": "true"}, + "wasb": {"wasb_enabled": "true"}, + }, + "online_store": { + "redis": { + # "host" + "port": "6380", + "ssl_enabled": "true", + } + } +} -feature_registry: - api_endpoint: 'https://{resource_prefix}webapp.azurewebsites.net/api/v1' -spark_config: - # Currently support: 'azure_synapse', 'databricks', and 'local' - spark_cluster: {spark_cluster} - spark_result_output_parts: '1' +# New databricks job cluster config +DEFAULT_DATABRICKS_CLUSTER_CONFIG = { + "spark_version": "11.2.x-scala2.12", + "node_type_id": "Standard_D3_v2", + "num_workers": 2, + "spark_conf": { + "FEATHR_FILL_IN": "FEATHR_FILL_IN", + # Exclude conflicting packages if use feathr <= v0.8.0: + "spark.jars.excludes": "commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api", + }, +} -offline_store: - wasb: - wasb_enabled: true -online_store: - # You can skip this part if you don't have Redis and skip materialization later in this notebook. - redis: - host: '{resource_prefix}redis.redis.cache.windows.net' - port: 6380 - ssl_enabled: true -""" +# New Azure Synapse spark pool config +DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { + "executor_size": "Small", + "executor_num": 2, +} def generate_config( resource_prefix: str, project_name: str, spark_cluster: str, + cluster_name: str = None, + databricks_url: str = None, output_filepath: str = None, + use_env_vars: bool = True, ) -> str: - """Generate a feathr config yaml file + """Generate a feathr config yaml file. Note, if environment variables are set, they will be used instead of the + provided arguments. + + Some credential variables are intentionally not included in the argument and the outut config file + to avoid leaking secrets. E.g. DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD. + Those values should be passed via the environment variables regardless of the `use_env_vars` flag. + + Note: + This utility function assumes Azure resources are deployed using the Azure Resource Manager (ARM) template, + and infers resource names based on the given `resource_prefix`. If you deploy resources manually, you may need + to create the config file manually. Args: resource_prefix: Resource name prefix. project_name: Project name. spark_cluster: Spark cluster to use. Either 'local', 'databricks', or 'azure_synapse'. - output_filepath: Output filepath. + cluster_name (optional): Synapse spark pool name or Databricks cluster id if applicable. + If not provided, a new (job) cluster will be created and used. + databricks_url (optional): Databricks workspace url if applicable. + output_filepath (optional): Output filepath. + use_env_vars (optional): Whether to use environment variables if they are set. Returns: str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path. 
""" + if use_env_vars: + spark_cluster = os.getenv("SPARK_CONFIG__SPARK_CLUSTER", spark_cluster) - conf_str = FEATHR_CONFIG_TEMPLATE.format( - resource_prefix=resource_prefix, - project_name=project_name, - spark_cluster=spark_cluster, - ) + config = deepcopy(DEFAULT_FEATHR_CONFIG) + config["project_config"]["project_name"] = project_name + config["feature_registry"]["api_endpoint"] = f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" + config["spark_config"]["spark_cluster"] = spark_cluster + config["online_store"]["redis"]["host"] = f"{resource_prefix}redis.redis.cache.windows.net" + + # Set platform specific configurations + if spark_cluster == "local": + _set_local_spark_config() + elif spark_cluster == "azure_synapse": + _set_azure_synapse_config( + config=config, + resource_prefix=resource_prefix, + project_name=project_name, + cluster_name=cluster_name, + use_env_vars=use_env_vars, + ) + elif spark_cluster == "databricks": + _set_databricks_config( + config=config, + project_name=project_name, + workspace_url=databricks_url, + cluster_name=cluster_name, + use_env_vars=use_env_vars, + ) if not output_filepath: output_filepath = NamedTemporaryFile(mode="w", delete=False).name - with open(output_filepath, "w") as conf_file: - conf_file.write(conf_str) + with open(output_filepath, "w") as f: + yaml.dump(config, f, default_flow_style=False) return output_filepath + + +def _set_local_spark_config(): + """Set environment variables for local spark cluster.""" + os.environ["SPARK_LOCAL_IP"] = os.getenv( + "SPARK_LOCAL_IP", + "127.0.0.1", + ) + + +def _set_azure_synapse_config( + config: Dict, + resource_prefix: str, + project_name: str, + cluster_name: str = None, + use_env_vars: bool = True, +): + """Set environment variables for Azure Synapse spark cluster. + One may need to set ADLS_KEY""" + + dev_url = f"https://{resource_prefix}syws.dev.azuresynapse.net" + workspace_dir = f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + + if use_env_vars: + dev_url = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL", dev_url) + cluster_name = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME", cluster_name) + workspace_dir = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR", workspace_dir) + + if not cluster_name: + raise ValueError("Azure Synapse spark pool name is not provided.") + + config["spark_config"]["azure_synapse"] = { + "dev_url": dev_url, + "pool_name": cluster_name, + "workspace_dir": workspace_dir, + **DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG, + } + + +def _set_databricks_config( + config: Dict, + project_name: str, + workspace_url: str, + cluster_name: str = None, + use_env_vars: bool = True, +): + if is_databricks(): + # If this functions is being called in Databricks, we may use the context to override the provided arguments. 
+ ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext() + workspace_url = "https://" + ctx.tags().get("browserHostName").get() + workspace_token = ctx.apiToken().get() + else: + workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", None) + + work_dir = f"dbfs:/{project_name}" + databricks_config = { + "run_name": "FEATHR_FILL_IN", + "libraries": [{"jar": "FEATHR_FILL_IN"}], + "spark_jar_task": { + "main_class_name": "FEATHR_FILL_IN", + "parameters": ["FEATHR_FILL_IN"], + }, + } + if cluster_name is None: + databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG + else: + databricks_config["existing_cluster_id"] = cluster_name + config_template = json.dumps(databricks_config) + + if use_env_vars: + work_dir = os.getenv("SPARK_CONFIG__DATABRICKS__WORK_DIR", work_dir) + workspace_url = os.getenv("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL", workspace_url) + workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", workspace_token) + config_template = os.getenv("SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE", config_template) + + if not workspace_url: + raise ValueError("Databricks workspace url is not provided.") + + if not workspace_token: + raise ValueError("Databricks workspace token is not provided.") + + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = workspace_token + config["spark_config"]["databricks"] = { + "work_dir": work_dir, + "workspace_instance_url": workspace_url, + "config_template": config_template, + } diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 12f27c2cb..6f5814e43 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -101,9 +101,10 @@ def get_result_df( elif client.spark_runtime == "databricks": if not res_url.startswith("dbfs:"): - raise ValueError( - f"In Databricks, the result files are expected to be stored at a DBFS storage but res_url = {res_url}." + logger.warning( + f"In Databricks, the result files are expected to be stored in DBFS, but the res_url {res_url} is not a dbfs path. Prefixing it with 'dbfs:/'" ) + res_url = f"dbfs:/{res_url.lstrip('/')}" if is_databricks(): # Check if the function is being called from Databricks if local_cache_path is not None: @@ -111,12 +112,9 @@ def get_result_df( "Result files are already in DBFS and thus `local_cache_path` will be ignored." ) local_cache_path = res_url - elif local_cache_path is None: # Download the result from dbfs to local - local_cache_path = TemporaryDirectory().name - else: - logger.warning("This utility function currently supports local spark and databricks. 
You may encounter unexpected results on other platforms.") - # TODO elif azure_synapse + if local_cache_path is None: + local_cache_path = TemporaryDirectory().name if local_cache_path != res_url: logger.info(f"{res_url} files will be downloaded into {local_cache_path}") diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index 5b7b2fc11..338a0eed3 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -11,7 +11,7 @@ multi_line_output = 3 [tool.pytest.ini_options] markers = [ - "notebooks: tests Jupyter notebooks", + "notebooks: tests Jupyter notebooks" ] [build-system] diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index b8ee3f345..52b10cf89 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -5,6 +5,21 @@ from feathr import FeathrClient +def pytest_addoption(parser): + """Pytest command line argument options. + E.g. + `python -m pytest feathr_project/test/ --resource-prefix your_feathr_resource_prefix` + """ + parser.addoption( + "--resource-prefix", action="store", default="feathrazuretest3", help="Test Azure resource prefix" + ) + + +@pytest.fixture +def resource_prefix(request): + return request.config.getoption("--resource-prefix") + + @pytest.fixture(scope="session") def workspace_dir() -> str: """Workspace directory path containing data files and configs for testing.""" diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py index 778b157d7..f87cbff2e 100644 --- a/feathr_project/test/samples/test_notebooks.py +++ b/feathr_project/test/samples/test_notebooks.py @@ -23,22 +23,25 @@ @pytest.mark.notebooks -def test__nyc_taxi_demo(tmp_path): +def test__nyc_taxi_demo(resource_prefix, tmp_path): notebook_name = "nyc_taxi_demo" output_tmpdir = TemporaryDirectory() output_notebook_path = str(tmp_path.joinpath(f"{notebook_name}.ipynb")) + print(f"Running {notebook_name} notebook as {output_notebook_path}") + pm.execute_notebook( input_path=NOTEBOOK_PATHS[notebook_name], output_path=output_notebook_path, # kernel_name="python3", parameters=dict( - RESOURCE_PREFIX="feathrazuretest3", # Use the test resource group + RESOURCE_PREFIX=resource_prefix, PROJECT_NAME=notebook_name, DATA_STORE_PATH=output_tmpdir.name, SPARK_CLUSTER="local", USE_CLI_AUTH=False, + REGISTER_FEATURES=False, SCRAP_RESULTS=True, ), ) @@ -47,10 +50,7 @@ def test__nyc_taxi_demo(tmp_path): nb = sb.read_notebook(output_notebook_path) outputs = nb.scraps - assert outputs["materialized_feature_values"].data["239"] == pytest.approx([5707., 1480.], abs=1.) - assert outputs["materialized_feature_values"].data["265"] == pytest.approx([10000., 4160.], abs=1.) + assert outputs["materialized_feature_values"].data["239"] == pytest.approx([1480., 5707.], abs=1.) + assert outputs["materialized_feature_values"].data["265"] == pytest.approx([4160., 10000.], abs=1.) assert outputs["rmse"].data == pytest.approx(5., abs=2.) assert outputs["mae"].data == pytest.approx(2., abs=1.) 
- - # clean up - output_tmpdir.cleanup() diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 52adcae39..7b3395bc9 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,36 +1,38 @@ +from copy import deepcopy +import os from pathlib import Path +import yaml import pytest -from feathr.utils.config import FEATHR_CONFIG_TEMPLATE, generate_config - - -@pytest.fixture(scope="session") -def feathr_config_str() -> str: - return FEATHR_CONFIG_TEMPLATE.format( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", - ) +from feathr import FeathrClient +from feathr.utils.config import generate_config @pytest.mark.parametrize( "output_filepath", [None, "config.yml"], ) -def test__generate_config( +def test__generate_config__output_filepath( output_filepath: str, - feathr_config_str: str, tmp_path: Path, ): + resource_prefix = "test_prefix" + project_name = "test_project" + spark_cluster = "local" + # Use tmp_path so that the test files get cleaned up after the tests if output_filepath: output_filepath = str(tmp_path / output_filepath) + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + config_filepath = generate_config( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", + resource_prefix=resource_prefix, + project_name=project_name, + spark_cluster=spark_cluster, output_filepath=output_filepath, + use_env_vars=False, ) # Assert if the config file was generated in the specified output path. @@ -39,4 +41,67 @@ def test__generate_config( # Assert the generated config string is correct. with open(config_filepath, "r") as f: - assert feathr_config_str == f.read() + config = yaml.safe_load(f) + + assert config["project_config"]["project_name"] == project_name + assert config["feature_registry"]["api_endpoint"] == f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" + assert config["spark_config"]["spark_cluster"] == spark_cluster + assert config["online_store"]["redis"]["host"] == f"{resource_prefix}redis.redis.cache.windows.net" + + +@pytest.mark.parametrize( + "spark_cluster,cluster_name,databricks_url", + [ + ("local", None, None), + ("databricks", None, "https://test_url"), + ("azure_synapse", "some_spark_pool", None), + ] +) +def test__generate_config__spark_cluster( + spark_cluster: str, + cluster_name: str, + databricks_url: str, +): + """Test if spark cluster specific configs are generated without errors. + TODO - For now, this test doesn't check if the config values are correct. 
+ """ + + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster=spark_cluster, + cluster_name=cluster_name, + databricks_url=databricks_url, + use_env_vars=False, + ) + + +@pytest.mark.parametrize( + "spark_cluster,cluster_name,databricks_url", + [ + ("databricks", "some_cluster_id", None), + ("azure_synapse", None, "https://test_url"), + ] +) +def test__generate_config__exceptions( + spark_cluster: str, + cluster_name: str, + databricks_url: str, +): + """Test if exceptions are raised when databricks url and token are not provided.""" + + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + + with pytest.raises(ValueError): + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_cluster=spark_cluster, + cluster_name=cluster_name, + databricks_url=databricks_url, + use_env_vars=False, + ) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 1e005855e..136bc3545 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -91,10 +91,6 @@ def test__get_result_df__with_local_cache_path( # Test ValueError when res_url is None (False, "local", None, ValueError), (True, "databricks", None, ValueError), - # Test ValueError when res_url is not a dbfs path but client.spark_runtime is databricks - (False, "databricks", "some_local_path", ValueError), - # Test ValueError when res_url does not exists or not able to access. - (False, "local", "some_doesnt_exist_path", Exception), ] ) def test__get_result_df__exceptions( @@ -113,6 +109,9 @@ def test__get_result_df__exceptions( # Mock is_data_bricks mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + # Mock _load_files_to_pandas_df + mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + with pytest.raises(expected_error): get_result_df(client) From 1f3894a0db20f4240d9ff40630b8cea4a83563a2 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Tue, 8 Nov 2022 04:11:53 +0000 Subject: [PATCH 14/18] Fix generate_config to accept all the feathr env var config name Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 15 +- docs/samples/nyc_taxi_demo.ipynb | 8 +- feathr_project/feathr/utils/config.py | 241 +++++++++++------- feathr_project/test/unit/utils/test_config.py | 42 ++- 4 files changed, 181 insertions(+), 125 deletions(-) diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index 4dc58eaf7..e562ec5db 100644 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -240,6 +240,15 @@ "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -256,9 +265,11 @@ "config_path = generate_config(\n", " resource_prefix=RESOURCE_PREFIX,\n", " project_name=PROJECT_NAME,\n", - " spark_cluster=SPARK_CLUSTER,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", " # You may set an existing cluster id here, but Databricks recommend to use new clusters for greater reliability.\n", - " cluster_name=None, # Set None to create a new job cluster\n", + " databricks_cluster_id=None, # Set None to create a new job cluster\n", + " databricks_workspace_token_value=ctx.apiToken().get(),\n", + " spark_config__databricks__workspace_instance_url=f\"https://{ctx.tags().get('browserHostName').get()}\",\n", ")\n", "\n", "with open(config_path, \"r\") as f:\n", diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 06b5cb340..10e189251 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -154,9 +154,6 @@ "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", "\n", - "# If \"azure_synapse\":\n", - "AZURE_SYNAPSE_SPARK_POOL = None # Set Synapse spark pool name to use an existing cluster\n", - "\n", "# If \"databricks\":\n", "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", @@ -261,9 +258,8 @@ " resource_prefix=RESOURCE_PREFIX,\n", " project_name=PROJECT_NAME,\n", " spark_cluster=SPARK_CLUSTER,\n", - " # cluster name will be ignored in \"local\" spark.\n", - " cluster_name=AZURE_SYNAPSE_SPARK_POOL if SPARK_CLUSTER == \"azure_synapse\" else DATABRICKS_CLUSTER_ID,\n", - " databricks_url=DATABRICKS_URL,\n", + " databricks_cluster_id=DATABRICKS_CLUSTER_ID if SPARK_CLUSTER == \"databricks\" else None,\n", + " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", ")\n", "\n", "with open(config_path, 'r') as f: \n", diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 47ac84679..27b41e8c7 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -1,3 +1,4 @@ +import collections.abc from copy import deepcopy import os import json @@ -13,7 +14,7 @@ "project_config": {}, # "project_name" "feature_registry": {}, # "api_endpoint" "spark_config": { - # "spark_cluster". Currently support 'azure_synapse', 'databricks', and 'local' + "spark_cluster": "local", # Currently support 'azure_synapse', 'databricks', and 'local' "spark_result_output_parts": "1", }, "offline_store": { @@ -47,72 +48,97 @@ DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { "executor_size": "Small", "executor_num": 2, + "pool_name": "spark3", } def generate_config( resource_prefix: str, project_name: str, - spark_cluster: str, - cluster_name: str = None, - databricks_url: str = None, output_filepath: str = None, + databricks_workspace_token_value: str = None, + databricks_cluster_id: str = None, + redis_password: str = None, + adls_key: str = None, use_env_vars: bool = True, + **kwargs, ) -> str: - """Generate a feathr config yaml file. Note, if environment variables are set, they will be used instead of the - provided arguments. + """Generate a feathr config yaml file. 
+    Note that the `use_env_vars` argument controls whether environment variables are used when generating the
+    config file. The Feathr client will still read environment variables at runtime if they are set.
-    Some credential variables are intentionally not included in the argument and the outut config file
-    to avoid leaking secrets. E.g. DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD.
-    Those values should be passed via the environment variables regardless of the `use_env_vars` flag.
+    Keyword arguments follow the same naming convention as the feathr config. E.g. to set Databricks as the target
+    cluster, use `spark_config__spark_cluster="databricks"`.
+    See https://feathr-ai.github.io/feathr/quickstart_synapse.html#step-4-update-feathr-config for more details.

     Note: This utility function assumes Azure resources are deployed using the Azure Resource Manager (ARM) template,
     and infers resource names based on the given `resource_prefix`. If you deploy resources manually, you may need
-    to create the config file manually.
+    to pass each resource URL manually, e.g. `spark_config__azure_synapse__dev_url="your-resource-url"`.

     Args:
-        resource_prefix: Resource name prefix.
-        project_name: Project name.
-        spark_cluster: Spark cluster to use. Either 'local', 'databricks', or 'azure_synapse'.
-        cluster_name (optional): Synapse spark pool name or Databricks cluster id if applicable.
-            If not provided, a new (job) cluster will be created and used.
-        databricks_url (optional): Databricks workspace url if applicable.
+        resource_prefix: Resource name prefix used when deploying Feathr resources with the ARM template.
+        project_name: Feathr project name.
         output_filepath (optional): Output filepath.
         use_env_vars (optional): Whether to use environment variables if they are set.
+        databricks_workspace_token_value (optional): Databricks workspace token. If provided, the value will be stored
+            as the environment variable.
+        databricks_cluster_id (optional): Databricks cluster id to use an existing cluster.
+        redis_password (optional): Redis password. If provided, the value will be stored as the environment variable.
+        adls_key (optional): ADLS key. If provided, the value will be stored as the environment variable.

     Returns:
-        str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path.
+        str: Generated config file path. This will be identical to `output_filepath` if provided.
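+
+    Example (illustrative only; the resource names, URL, and token below are placeholders):
+        >>> config_path = generate_config(
+        ...     resource_prefix="myprefix",
+        ...     project_name="my_project",
+        ...     spark_config__spark_cluster="databricks",
+        ...     spark_config__databricks__workspace_instance_url="https://my-workspace.azuredatabricks.net",
+        ...     databricks_workspace_token_value="my-token",
+        ...     databricks_cluster_id=None,  # None generates a new job cluster config
+        ... )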
""" - if use_env_vars: - spark_cluster = os.getenv("SPARK_CONFIG__SPARK_CLUSTER", spark_cluster) - + # Set keys + if databricks_workspace_token_value: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = databricks_workspace_token_value + if redis_password: + os.environ["REDIS_PASSWORD"] = redis_password + if adls_key: + os.environ["ADLS_KEY"] = adls_key + + # Set configs config = deepcopy(DEFAULT_FEATHR_CONFIG) config["project_config"]["project_name"] = project_name config["feature_registry"]["api_endpoint"] = f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" - config["spark_config"]["spark_cluster"] = spark_cluster config["online_store"]["redis"]["host"] = f"{resource_prefix}redis.redis.cache.windows.net" + # Update configs using kwargs + new_config = _config_kwargs_to_dict(**kwargs) + _update_config(config, new_config) + # Set platform specific configurations - if spark_cluster == "local": + if config["spark_config"]["spark_cluster"] == "local": _set_local_spark_config() - elif spark_cluster == "azure_synapse": + elif config["spark_config"]["spark_cluster"] == "azure_synapse": _set_azure_synapse_config( config=config, resource_prefix=resource_prefix, project_name=project_name, - cluster_name=cluster_name, - use_env_vars=use_env_vars, ) - elif spark_cluster == "databricks": + elif config["spark_config"]["spark_cluster"] == "databricks": _set_databricks_config( config=config, project_name=project_name, - workspace_url=databricks_url, - cluster_name=cluster_name, - use_env_vars=use_env_vars, + cluster_id=databricks_cluster_id, ) + # Maybe update configs with environment variables + if use_env_vars: + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__SPARK_CLUSTER") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORK_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE") + + # Verify config + _verify_config(config) + + # Write config to file if not output_filepath: output_filepath = NamedTemporaryFile(mode="w", delete=False).name @@ -134,76 +160,107 @@ def _set_azure_synapse_config( config: Dict, resource_prefix: str, project_name: str, - cluster_name: str = None, - use_env_vars: bool = True, ): - """Set environment variables for Azure Synapse spark cluster. 
- One may need to set ADLS_KEY""" + """Set environment variables for Azure Synapse spark cluster.""" - dev_url = f"https://{resource_prefix}syws.dev.azuresynapse.net" - workspace_dir = f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + if "azure_synapse" not in config["spark_config"]: + config["spark_config"]["azure_synapse"] = dict() - if use_env_vars: - dev_url = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL", dev_url) - cluster_name = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME", cluster_name) - workspace_dir = os.getenv("SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR", workspace_dir) - - if not cluster_name: - raise ValueError("Azure Synapse spark pool name is not provided.") - - config["spark_config"]["azure_synapse"] = { - "dev_url": dev_url, - "pool_name": cluster_name, - "workspace_dir": workspace_dir, - **DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG, - } + if "dev_url" not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"]["dev_url"] = f"https://{resource_prefix}syws.dev.azuresynapse.net" + + if "workspace_dir" not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"]["workspace_dir"] =\ + f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + + for k, v in DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG.items(): + if k not in config["spark_config"]["azure_synapse"]: + config["spark_config"]["azure_synapse"][k] = v def _set_databricks_config( config: Dict, project_name: str, - workspace_url: str, - cluster_name: str = None, - use_env_vars: bool = True, + cluster_id: str = None, ): - if is_databricks(): - # If this functions is being called in Databricks, we may use the context to override the provided arguments. 
- ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext() - workspace_url = "https://" + ctx.tags().get("browserHostName").get() - workspace_token = ctx.apiToken().get() - else: - workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", None) - - work_dir = f"dbfs:/{project_name}" - databricks_config = { - "run_name": "FEATHR_FILL_IN", - "libraries": [{"jar": "FEATHR_FILL_IN"}], - "spark_jar_task": { - "main_class_name": "FEATHR_FILL_IN", - "parameters": ["FEATHR_FILL_IN"], - }, - } - if cluster_name is None: - databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG - else: - databricks_config["existing_cluster_id"] = cluster_name - config_template = json.dumps(databricks_config) + if "databricks" not in config["spark_config"]: + config["spark_config"]["databricks"] = dict() + + if "work_dir" not in config["spark_config"]["databricks"]: + config["spark_config"]["databricks"]["work_dir"] = f"dbfs:/{project_name}" + + if "config_template" not in config["spark_config"]["databricks"]: + databricks_config = { + "run_name": "FEATHR_FILL_IN", + "libraries": [{"jar": "FEATHR_FILL_IN"}], + "spark_jar_task": { + "main_class_name": "FEATHR_FILL_IN", + "parameters": ["FEATHR_FILL_IN"], + }, + } + if cluster_id is None: + databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG + else: + databricks_config["existing_cluster_id"] = cluster_id - if use_env_vars: - work_dir = os.getenv("SPARK_CONFIG__DATABRICKS__WORK_DIR", work_dir) - workspace_url = os.getenv("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL", workspace_url) - workspace_token = os.getenv("DATABRICKS_WORKSPACE_TOKEN_VALUE", workspace_token) - config_template = os.getenv("SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE", config_template) - - if not workspace_url: - raise ValueError("Databricks workspace url is not provided.") - - if not workspace_token: - raise ValueError("Databricks workspace token is not provided.") - - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = workspace_token - config["spark_config"]["databricks"] = { - "work_dir": work_dir, - "workspace_instance_url": workspace_url, - "config_template": config_template, - } + config["spark_config"]["databricks"]["config_template"] = json.dumps(databricks_config) + + +def _config_kwargs_to_dict(**kwargs) -> Dict: + """Parse config's keyword arguments to dictionary. + e.g. `spark_config__spark_cluster="local"` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. 
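+    Nested sections are expressed with double underscores: for example (illustrative values),
+    `spark_config__azure_synapse__pool_name="spark3"` is parsed to
+    `{"spark_config": {"azure_synapse": {"pool_name": "spark3"}}}`.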
+ """ + config = dict() + + for conf_key, conf_value in kwargs.items(): + if conf_value is None: + continue + + conf = config + keys = conf_key.split("__") + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + conf[keys[-1]] = conf_value + + return config + + +def _update_config(config: Dict, new_config: Dict): + """Update config dictionary with the values in `new_config`.""" + for k, v in new_config.items(): + if k in config and isinstance(v, collections.abc.Mapping): + _update_config(config[k], v) + else: + config[k] = v + + +def _verify_config(config: Dict): + """Verify config.""" + if config["spark_config"]["spark_cluster"] == "azure_synapse": + if "ADLS_KEY" not in os.environ: + raise ValueError("ADLS_KEY must be set in environment variables") + + elif config["spark_config"]["spark_cluster"] == "databricks": + if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + raise ValueError("Databricks workspace token is not provided.") + elif "workspace_instance_url" not in config["spark_config"]["databricks"]: + raise ValueError("Databricks workspace url is not provided.") + + +def _maybe_update_config_with_env_var(config: Dict, env_var_name: str): + """Update config dictionary with the values in environment variables. + e.g. `SPARK_CONFIG__SPARK_CLUSTER` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. + """ + if env_var_name not in os.environ: + return + + keys = env_var_name.lower().split("__") + conf = config + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + + conf[keys[-1]] = os.environ[env_var_name] diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 7b3395bc9..6a5119981 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -18,19 +18,14 @@ def test__generate_config__output_filepath( ): resource_prefix = "test_prefix" project_name = "test_project" - spark_cluster = "local" # Use tmp_path so that the test files get cleaned up after the tests if output_filepath: output_filepath = str(tmp_path / output_filepath) - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" - config_filepath = generate_config( resource_prefix=resource_prefix, project_name=project_name, - spark_cluster=spark_cluster, output_filepath=output_filepath, use_env_vars=False, ) @@ -45,63 +40,60 @@ def test__generate_config__output_filepath( assert config["project_config"]["project_name"] == project_name assert config["feature_registry"]["api_endpoint"] == f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" - assert config["spark_config"]["spark_cluster"] == spark_cluster + assert config["spark_config"]["spark_cluster"] == "local" assert config["online_store"]["redis"]["host"] == f"{resource_prefix}redis.redis.cache.windows.net" @pytest.mark.parametrize( - "spark_cluster,cluster_name,databricks_url", + "spark_cluster,env_key,databricks_url", [ ("local", None, None), - ("databricks", None, "https://test_url"), - ("azure_synapse", "some_spark_pool", None), + ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", "https://test_url"), + ("azure_synapse", "ADLS_KEY", None), ] ) def test__generate_config__spark_cluster( spark_cluster: str, - cluster_name: str, + env_key: str, databricks_url: str, ): """Test if spark cluster specific configs are generated without errors. - TODO - For now, this test doesn't check if the config values are correct. 
+ TODO - For now, this test doesn't check if the config values are correctly working with the actual Feathr client. """ - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + if env_key and env_key not in os.environ: + os.environ[env_key] = "test_value" generate_config( resource_prefix="test_prefix", project_name="test_project", - spark_cluster=spark_cluster, - cluster_name=cluster_name, - databricks_url=databricks_url, + spark_config__spark_cluster=spark_cluster, + spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, ) @pytest.mark.parametrize( - "spark_cluster,cluster_name,databricks_url", + "spark_cluster,env_key,databricks_url", [ - ("databricks", "some_cluster_id", None), - ("azure_synapse", None, "https://test_url"), + ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", None), ] ) def test__generate_config__exceptions( spark_cluster: str, - cluster_name: str, + env_key: str, databricks_url: str, ): """Test if exceptions are raised when databricks url and token are not provided.""" - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: - os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "test_token" + if env_key and env_key not in os.environ: + os.environ[env_key] = "test_value" with pytest.raises(ValueError): generate_config( resource_prefix="test_prefix", project_name="test_project", - spark_cluster=spark_cluster, - cluster_name=cluster_name, - databricks_url=databricks_url, + spark_config__spark_cluster=spark_cluster, + spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, ) From 8a610ac926239cb708484db999e569ebf18dcbab Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Wed, 9 Nov 2022 07:01:24 +0000 Subject: [PATCH 15/18] Add more pytests Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../databricks_quickstart_nyc_taxi_demo.ipynb | 3 +- docs/samples/nyc_taxi_demo.ipynb | 54 +++++-- feathr_project/feathr/utils/config.py | 42 ++++-- feathr_project/feathr/utils/job_utils.py | 11 +- feathr_project/test/conftest.py | 9 +- feathr_project/test/samples/test_notebooks.py | 6 +- feathr_project/test/unit/utils/test_config.py | 135 ++++++++++++++---- .../test/unit/utils/test_job_utils.py | 50 +++++-- 8 files changed, 233 insertions(+), 77 deletions(-) diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb index e562ec5db..65e305e8f 100644 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -113,8 +113,7 @@ }, "outputs": [], "source": [ - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", + "from datetime import timedelta\n", "import os\n", "from pathlib import Path\n", "\n", diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 10e189251..4cb6b5b4f 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -154,13 +154,20 @@ "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", "SPARK_CLUSTER = \"local\"\n", "\n", - "# If \"databricks\":\n", + "# TODO fill values to use databricks cluster:\n", "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", "\n", + "# TODO fill values to use Azure Synapse 
cluster:\n", + "AZURE_SYNAPSE_SPARK_POOL = None # Set Azure Synapse Spark pool name\n", + "AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n", + "\n", "# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n", "DATA_STORE_PATH = TemporaryDirectory().name\n", "\n", + "# Feathr config file path to use an existing file\n", + "FEATHR_CONFIG_PATH = None\n", + "\n", "# If set True, use an interactive browser authentication to get the redis password.\n", "USE_CLI_AUTH = False\n", "\n", @@ -182,7 +189,27 @@ "\n", "`os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = your-token`\n", "\n", - "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context." + "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context.\n", + "\n", + "On the other hand, to use Azure Synapse cluster, you have to specify the synapse workspace storage key:\n", + "\n", + "`export ADLS_KEY=your-key`\n", + "\n", + "or in the notebook cell,\n", + "\n", + "`os.environ[\"ADLS_KEY\"] = your-key`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SPARK_CLUSTER == \"azure_synapse\" and not os.environ.get(\"ADLS_KEY\"):\n", + " os.environ[\"ADLS_KEY\"] = add_your_key_here\n", + "elif SPARK_CLUSTER == \"databricks\" and not os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"):\n", + " os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = add_your_token_here" ] }, { @@ -254,13 +281,18 @@ }, "outputs": [], "source": [ - "config_path = generate_config(\n", - " resource_prefix=RESOURCE_PREFIX,\n", - " project_name=PROJECT_NAME,\n", - " spark_cluster=SPARK_CLUSTER,\n", - " databricks_cluster_id=DATABRICKS_CLUSTER_ID if SPARK_CLUSTER == \"databricks\" else None,\n", - " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", - ")\n", + "if FEATHR_CONFIG_PATH:\n", + " config_path = FEATHR_CONFIG_PATH\n", + "else:\n", + " config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", + " spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,\n", + " spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,\n", + " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", + " databricks_cluster_id=DATABRICKS_CLUSTER_ID,\n", + " )\n", "\n", "with open(config_path, 'r') as f: \n", " print(f.read())" @@ -334,7 +366,7 @@ " .getOrCreate()\n", " )\n", " \n", - "# Else, you must already have a spark session object available in databricks or synapse." + "# Else, you must already have a spark session object available in databricks or synapse notebooks." 
] }, { @@ -476,7 +508,7 @@ " # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n", " data_source_path = DATA_FILE_PATH\n", "else:\n", - " # Otherwise, upload the local file to dbfs.\n", + " # Otherwise, upload the local file to the cloud storage (either dbfs or adls).\n", " data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) " ] }, diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py index 27b41e8c7..9a5f5fd89 100644 --- a/feathr_project/feathr/utils/config.py +++ b/feathr_project/feathr/utils/config.py @@ -48,7 +48,6 @@ DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { "executor_size": "Small", "executor_num": 2, - "pool_name": "spark3", } @@ -161,20 +160,19 @@ def _set_azure_synapse_config( resource_prefix: str, project_name: str, ): - """Set environment variables for Azure Synapse spark cluster.""" + """Set configs for Azure Synapse spark cluster.""" - if "azure_synapse" not in config["spark_config"]: - config["spark_config"]["azure_synapse"] = dict() + config["spark_config"]["azure_synapse"] = config["spark_config"].get("azure_synapse", {}) - if "dev_url" not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get("dev_url"): config["spark_config"]["azure_synapse"]["dev_url"] = f"https://{resource_prefix}syws.dev.azuresynapse.net" - if "workspace_dir" not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get("workspace_dir"): config["spark_config"]["azure_synapse"]["workspace_dir"] =\ f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" for k, v in DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG.items(): - if k not in config["spark_config"]["azure_synapse"]: + if not config["spark_config"]["azure_synapse"].get(k): config["spark_config"]["azure_synapse"][k] = v @@ -183,13 +181,14 @@ def _set_databricks_config( project_name: str, cluster_id: str = None, ): - if "databricks" not in config["spark_config"]: - config["spark_config"]["databricks"] = dict() + """Set configs for Databricks spark cluster.""" - if "work_dir" not in config["spark_config"]["databricks"]: + config["spark_config"]["databricks"] = config["spark_config"].get("databricks", {}) + + if not config["spark_config"]["databricks"].get("work_dir"): config["spark_config"]["databricks"]["work_dir"] = f"dbfs:/{project_name}" - if "config_template" not in config["spark_config"]["databricks"]: + if not config["spark_config"]["databricks"].get("config_template"): databricks_config = { "run_name": "FEATHR_FILL_IN", "libraries": [{"jar": "FEATHR_FILL_IN"}], @@ -239,13 +238,26 @@ def _update_config(config: Dict, new_config: Dict): def _verify_config(config: Dict): """Verify config.""" if config["spark_config"]["spark_cluster"] == "azure_synapse": - if "ADLS_KEY" not in os.environ: + if not os.environ.get("ADLS_KEY"): raise ValueError("ADLS_KEY must be set in environment variables") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") and + config["spark_config"]["azure_synapse"].get("dev_url") is None + ): + raise ValueError("Azure Synapse dev endpoint is not provided.") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") and + config["spark_config"]["azure_synapse"].get("pool_name") is None + ): + raise ValueError("Azure Synapse pool name is not provided.") elif config["spark_config"]["spark_cluster"] == "databricks": - if "DATABRICKS_WORKSPACE_TOKEN_VALUE" not in os.environ: + if 
not os.environ.get("DATABRICKS_WORKSPACE_TOKEN_VALUE"): raise ValueError("Databricks workspace token is not provided.") - elif "workspace_instance_url" not in config["spark_config"]["databricks"]: + elif ( + not os.environ.get("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") and + config["spark_config"]["databricks"].get("workspace_instance_url") is None + ): raise ValueError("Databricks workspace url is not provided.") @@ -253,7 +265,7 @@ def _maybe_update_config_with_env_var(config: Dict, env_var_name: str): """Update config dictionary with the values in environment variables. e.g. `SPARK_CONFIG__SPARK_CLUSTER` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. """ - if env_var_name not in os.environ: + if not os.environ.get(env_var_name): return keys = env_var_name.lower().split("__") diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 6f5814e43..fbc16b1ff 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -120,11 +120,12 @@ def get_result_df( logger.info(f"{res_url} files will be downloaded into {local_cache_path}") client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) - # use user provided format, if there isn't one, then otherwise use the one provided by the job; - # if none of them is available, "avro" is the default format. - data_format: str = data_format or client.get_job_tags().get(OUTPUT_FORMAT, "") - if data_format is None or data_format == "": - data_format = "avro" + # Use the provided format or one in the job tags. + if data_format is None: + if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): + data_format = client.get_job_tags().get(OUTPUT_FORMAT) + else: + raise ValueError("Cannot determine the data format. 
Please provide the data_format argument.") result_df = None diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py index 52b10cf89..c2699e871 100644 --- a/feathr_project/test/conftest.py +++ b/feathr_project/test/conftest.py @@ -11,13 +11,16 @@ def pytest_addoption(parser): `python -m pytest feathr_project/test/ --resource-prefix your_feathr_resource_prefix` """ parser.addoption( - "--resource-prefix", action="store", default="feathrazuretest3", help="Test Azure resource prefix" + "--config-path", + action="store", + default=str(Path(__file__).parent.resolve().joinpath("test_user_workspace", "feathr_config.yaml")), + help="Test config path", ) @pytest.fixture -def resource_prefix(request): - return request.config.getoption("--resource-prefix") +def config_path(request): + return request.config.getoption("--config-path") @pytest.fixture(scope="session") diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py index f87cbff2e..c8d1cbefc 100644 --- a/feathr_project/test/samples/test_notebooks.py +++ b/feathr_project/test/samples/test_notebooks.py @@ -23,7 +23,7 @@ @pytest.mark.notebooks -def test__nyc_taxi_demo(resource_prefix, tmp_path): +def test__nyc_taxi_demo(config_path, tmp_path): notebook_name = "nyc_taxi_demo" output_tmpdir = TemporaryDirectory() @@ -36,10 +36,8 @@ def test__nyc_taxi_demo(resource_prefix, tmp_path): output_path=output_notebook_path, # kernel_name="python3", parameters=dict( - RESOURCE_PREFIX=resource_prefix, - PROJECT_NAME=notebook_name, + FEATHR_CONFIG_PATH=config_path, DATA_STORE_PATH=output_tmpdir.name, - SPARK_CLUSTER="local", USE_CLI_AUTH=False, REGISTER_FEATURES=False, SCRAP_RESULTS=True, diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py index 6a5119981..770980e12 100644 --- a/feathr_project/test/unit/utils/test_config.py +++ b/feathr_project/test/unit/utils/test_config.py @@ -1,11 +1,13 @@ from copy import deepcopy import os from pathlib import Path +from unittest.mock import MagicMock import yaml import pytest +from pytest_mock import MockerFixture -from feathr import FeathrClient +import feathr.utils.config from feathr.utils.config import generate_config @@ -45,55 +47,134 @@ def test__generate_config__output_filepath( @pytest.mark.parametrize( - "spark_cluster,env_key,databricks_url", + "spark_cluster,env_key,kwargs", [ - ("local", None, None), - ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", "https://test_url"), - ("azure_synapse", "ADLS_KEY", None), + ("local", None, dict()), + ( + "databricks", + "DATABRICKS_WORKSPACE_TOKEN_VALUE", + dict(spark_config__databricks__workspace_instance_url="databricks_url"), + ), + ( + "azure_synapse", + "ADLS_KEY", + dict( + spark_config__azure_synapse__dev_url="synapse_url", + spark_config__azure_synapse__pool_name="pool_name", + ), + ), ] ) def test__generate_config__spark_cluster( + mocker: MockerFixture, spark_cluster: str, env_key: str, - databricks_url: str, + kwargs: str, ): """Test if spark cluster specific configs are generated without errors. TODO - For now, this test doesn't check if the config values are correctly working with the actual Feathr client. 
""" - - if env_key and env_key not in os.environ: - os.environ[env_key] = "test_value" + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", {env_key: "some_value"}) generate_config( resource_prefix="test_prefix", project_name="test_project", spark_config__spark_cluster=spark_cluster, - spark_config__databricks__workspace_instance_url=databricks_url, use_env_vars=False, + **kwargs, ) @pytest.mark.parametrize( - "spark_cluster,env_key,databricks_url", + "adls_key,pool_name,expected_error", [ - ("databricks", "DATABRICKS_WORKSPACE_TOKEN_VALUE", None), + ("some_key", "some_name", None), + (None, "some_name", ValueError), + ("some_key", None, ValueError), ] ) -def test__generate_config__exceptions( - spark_cluster: str, - env_key: str, - databricks_url: str, +def test__generate_config__azure_synapse_exceptions( + mocker: MockerFixture, + adls_key: str, + pool_name: str, + expected_error: Exception, +): + """Test if exceptions are raised when databricks url and token are not provided.""" + + # Either env vars or argument should yield the same result + for environ in [{"ADLS_KEY": adls_key}, { + "ADLS_KEY": adls_key, + "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME": pool_name, + }]: + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", environ) + + # Test either using env vars or arguments + if "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME" in environ: + kwargs = dict() + else: + kwargs = dict(spark_config__azure_synapse__pool_name=pool_name) + + if expected_error is None: + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="azure_synapse", + **kwargs, + ) + else: + with pytest.raises(ValueError): + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="azure_synapse", + **kwargs, + ) + + +@pytest.mark.parametrize( + "databricks_token,workspace_url,expected_error", + [ + ("some_token", "some_url", None), + (None, "some_url", ValueError), + ("some_token", None, ValueError), + ] +) +def test__generate_config__databricks_exceptions( + mocker: MockerFixture, + databricks_token: str, + workspace_url: str, + expected_error: Exception, ): """Test if exceptions are raised when databricks url and token are not provided.""" - if env_key and env_key not in os.environ: - os.environ[env_key] = "test_value" - - with pytest.raises(ValueError): - generate_config( - resource_prefix="test_prefix", - project_name="test_project", - spark_config__spark_cluster=spark_cluster, - spark_config__databricks__workspace_instance_url=databricks_url, - use_env_vars=False, - ) + # Either env vars or argument should yield the same result + for environ in [{"DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token}, { + "DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token, + "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL": workspace_url, + }]: + # Mock the os.environ to return the specified env vars + mocker.patch.object(feathr.utils.config.os, "environ", environ) + + # Test either using env vars or arguments + if "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL" in environ: + kwargs = dict() + else: + kwargs = dict(spark_config__databricks__workspace_instance_url=workspace_url) + + if expected_error is None: + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) + else: + with pytest.raises(ValueError): + 
generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py index 136bc3545..9f82be66e 100644 --- a/feathr_project/test/unit/utils/test_job_utils.py +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -10,6 +10,7 @@ from pyspark.sql import DataFrame, SparkSession from feathr import FeathrClient +from feathr.constants import OUTPUT_FORMAT, OUTPUT_PATH_TAG from feathr.utils.job_utils import ( get_result_df, get_result_pandas_df, @@ -86,11 +87,24 @@ def test__get_result_df__with_local_cache_path( @pytest.mark.parametrize( - "is_databricks,spark_runtime,res_url,expected_error", [ - (True, "local", None, RuntimeError), # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + "is_databricks,spark_runtime,res_url,data_format,expected_error", [ + # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + (True, "local", "some_url", "some_format", RuntimeError), + (True, "azure_synapse", "some_url", "some_format", RuntimeError), + (True, "databricks", "some_url", "some_format", None), + (False, "local", "some_url", "some_format", None), + (False, "azure_synapse", "some_url", "some_format", None), + (False, "databricks", "some_url", "some_format", None), # Test ValueError when res_url is None - (False, "local", None, ValueError), - (True, "databricks", None, ValueError), + (True, "databricks", None, "some_format", ValueError), + (False, "local", None, "some_format", ValueError), + (False, "azure_synapse", None, "some_format", ValueError), + (False, "databricks", None, "some_format", ValueError), + # Test ValueError when data_format is None + (True, "databricks", "some_url", None, ValueError), + (False, "local", "some_url", None, ValueError), + (False, "azure_synapse", "some_url", None, ValueError), + (False, "databricks", "some_url", None, ValueError), ] ) def test__get_result_df__exceptions( @@ -98,13 +112,10 @@ def test__get_result_df__exceptions( is_databricks: bool, spark_runtime: str, res_url: str, + data_format: str, expected_error: Type[Exception], ): """Test exceptions""" - # Mock client - client = MagicMock() - client.get_job_result_uri = MagicMock(return_value=res_url) - client.spark_runtime = spark_runtime # Mock is_data_bricks mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) @@ -112,8 +123,27 @@ def test__get_result_df__exceptions( # Mock _load_files_to_pandas_df mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") - with pytest.raises(expected_error): - get_result_df(client) + # Either job tags or argument should yield the same result + for job_tag in [None, {OUTPUT_FORMAT: data_format, OUTPUT_PATH_TAG: res_url}]: + # Mock client + client = MagicMock() + client.get_job_result_uri = MagicMock(return_value=res_url) + client.get_job_tags = MagicMock(return_value=job_tag) + client.spark_runtime = spark_runtime + + if expected_error is None: + get_result_df( + client=client, + res_url=None if job_tag else res_url, + data_format=None if job_tag else data_format, + ) + else: + with pytest.raises(expected_error): + get_result_df( + client=client, + res_url=None if job_tag else res_url, + data_format=None if job_tag else data_format, + ) @pytest.mark.parametrize( From 4c50485a725685534e7caa7bf646cb64c4800123 Mon Sep 17 00:00:00 2001 From: Jun Ki Min 
<42475935+loomlike@users.noreply.github.com> Date: Wed, 9 Nov 2022 20:50:49 +0000 Subject: [PATCH 16/18] Use None as default dataformat in the job_utils. Instead, set 'avro' as a default output format to the job tags from the client Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- docs/samples/nyc_taxi_demo.ipynb | 6 ++- feathr_project/feathr/client.py | 2 + feathr_project/feathr/utils/job_utils.py | 42 +++++++++---------- .../test/test_input_output_sources.py | 19 ++++----- 4 files changed, 33 insertions(+), 36 deletions(-) diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index 4cb6b5b4f..b80bac374 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -361,11 +361,13 @@ " SparkSession\n", " .builder\n", " .appName(\"feathr\")\n", - " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0\")\n", + " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0,io.delta:delta-core_2.12:2.1.1\")\n", + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n", + " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n", " .config(\"spark.ui.port\", \"8080\") # Set ui port other than the default one (4040) so that feathr spark job doesn't fail. \n", " .getOrCreate()\n", " )\n", - " \n", + "\n", "# Else, you must already have a spark session object available in databricks or synapse notebooks." ] }, diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index cd080f871..741317428 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -487,6 +487,8 @@ def _get_offline_features_with_config(self, # set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations: job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT] + else: + job_tags[OUTPUT_FORMAT] = "avro" ''' - Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself. - Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. They are usually not spark specific (for example if we want to specify the location of the feature files, or want to diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index fbc16b1ff..d9c73c355 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -22,9 +22,9 @@ def get_result_pandas_df( Args: client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. 
+ the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download path. if the user does not provide this, the function will create a temporary directory. @@ -47,9 +47,9 @@ def get_result_spark_df( spark: Spark session client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download path. if the user does not provide this, the function will create a temporary directory. @@ -71,9 +71,9 @@ def get_result_df( Args: client: Feathr client data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. + Default to use client's job tags if exists. res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. local_cache_path (optional): Specify the absolute download directory. if the user does not provide this, the function will create a temporary directory. spark (optional): Spark session. If provided, the function returns spark Dataframe. @@ -82,9 +82,22 @@ def get_result_df( Returns: Either Spark or pandas DataFrame. """ + if data_format is None: + # May use data format from the job tags + if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): + data_format = client.get_job_tags().get(OUTPUT_FORMAT) + else: + raise ValueError("Cannot determine the data format. Please provide the data_format argument.") + + data_format = data_format.lower() + if is_databricks() and client.spark_runtime != "databricks": raise RuntimeError(f"The function is called from Databricks but the client.spark_runtime is {client.spark_runtime}.") + # TODO Loading Synapse Delta table result into pandas has a bug: https://github.com/delta-io/delta-rs/issues/582 + if not spark and client.spark_runtime == "azure_synapse" and data_format == "delta": + raise RuntimeError(f"Loading Delta table result from Azure Synapse into pandas DataFrame is not supported. You maybe able to use spark DataFrame to load the result instead.") + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: @@ -120,15 +133,7 @@ def get_result_df( logger.info(f"{res_url} files will be downloaded into {local_cache_path}") client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) - # Use the provided format or one in the job tags. - if data_format is None: - if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): - data_format = client.get_job_tags().get(OUTPUT_FORMAT) - else: - raise ValueError("Cannot determine the data format. 
Please provide the data_format argument.") - result_df = None - try: if spark is not None: if data_format == "csv": @@ -154,17 +159,8 @@ def _load_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.Dat elif data_format == "delta": from deltalake import DeltaTable - delta = DeltaTable(dir_path) - # if client.spark_runtime != "azure_synapse": - # don't detect for synapse result with Delta as there's a problem with underlying system - # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 return delta.to_pyarrow_table().to_pandas() - # else: - # TODO -- Proper warning messages. Is this applied to all the other formats? - # raise RuntimeError( - # "Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse." - # ) elif data_format == "avro": import pandavro as pdx diff --git a/feathr_project/test/test_input_output_sources.py b/feathr_project/test/test_input_output_sources.py index f4af85678..ba4b3921a 100644 --- a/feathr_project/test/test_input_output_sources.py +++ b/feathr_project/test/test_input_output_sources.py @@ -10,6 +10,7 @@ from test_fixture import basic_test_setup from test_utils.constants import Constants + # test parquet file read/write without an extension name def test_feathr_get_offline_features_with_parquet(): """ @@ -38,7 +39,7 @@ def test_feathr_get_offline_features_with_parquet(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), ".parquet"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -47,14 +48,12 @@ def test_feathr_get_offline_features_with_parquet(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # download result and just assert the returned result is not empty res_df = get_result_df(client) assert res_df.shape[0] > 0 - - # test delta lake read/write without an extension name def test_feathr_get_offline_features_with_delta_lake(): """ @@ -83,7 +82,7 @@ def test_feathr_get_offline_features_with_delta_lake(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), "_deltalake"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -92,15 +91,13 @@ def test_feathr_get_offline_features_with_delta_lake(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # wait for a few secs for the resource to come up in the databricks API time.sleep(5) - # download result and just assert the returned result is not empty - res_df = get_result_df(client) - + # download result and just assert the returned result is not empty + # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 result_format: str = client.get_job_tags().get(OUTPUT_FORMAT, "") if not (client.spark_runtime == 'azure_synapse' and result_format == 'delta'): - # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 + res_df = get_result_df(client) assert res_df.shape[0] > 0 - From c049958b18910f9fb6e32d58087b8b1bb0704893 
From c049958b18910f9fb6e32d58087b8b1bb0704893 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Thu, 10 Nov 2022 00:00:43 +0000
Subject: [PATCH 17/18] Change feathr client to mocked object

Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
---
 .../test/unit/utils/test_job_utils.py        | 74 ++++++++++---------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py
index 9f82be66e..0909fb56e 100644
--- a/feathr_project/test/unit/utils/test_job_utils.py
+++ b/feathr_project/test/unit/utils/test_job_utils.py
@@ -157,30 +157,35 @@ def test__get_result_df__exceptions(
 )
 def test__get_result_df(
     workspace_dir: str,
-    feathr_client: FeathrClient,
     data_format: str,
     output_filename: str,
     expected_count: int,
 ):
     """Test get_result_df returns pandas DataFrame"""
-    # Note: make sure the output file exists in the test_user_workspace
-    res_url = str(Path(workspace_dir, "mock_results", output_filename))
-    local_cache_path = res_url
+    for spark_runtime in ["local", "databricks", "azure_synapse"]:
+        # Note: make sure the output file exists in the test_user_workspace
+        res_url = str(Path(workspace_dir, "mock_results", output_filename))
+        local_cache_path = res_url
 
-    # Mock feathr_spark_launcher.download_result
-    feathr_client.feathr_spark_launcher.download_result = MagicMock()
+        # Mock client
+        client = MagicMock()
+        client.spark_runtime = spark_runtime
 
-    if feathr_client.spark_runtime == "databricks":
-        res_url = f"dbfs:/{res_url}"
+        # Mock feathr_spark_launcher.download_result
+        if client.spark_runtime == "databricks":
+            res_url = f"dbfs:/{res_url}"
+        if client.spark_runtime == "azure_synapse" and data_format == "delta":
+            # TODO currently pass the delta table test on Synapse result due to the delta table package bug.
+            continue
 
-    df = get_result_df(
-        client=feathr_client,
-        data_format=data_format,
-        res_url=res_url,
-        local_cache_path=local_cache_path,
-    )
-    assert isinstance(df, pd.DataFrame)
-    assert len(df) == expected_count
+        df = get_result_df(
+            client=client,
+            data_format=data_format,
+            res_url=res_url,
+            local_cache_path=local_cache_path,
+        )
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == expected_count
 
 
 @pytest.mark.parametrize(
@@ -194,29 +199,30 @@ def test__get_result_df(
 )
 def test__get_result_df__with_spark_session(
     workspace_dir: str,
-    feathr_client: FeathrClient,
     spark: SparkSession,
     data_format: str,
     output_filename: str,
     expected_count: int,
 ):
     """Test get_result_df returns spark DataFrame"""
-    # Note: make sure the output file exists in the test_user_workspace
-    res_url = str(Path(workspace_dir, "mock_results", output_filename))
-    local_cache_path = res_url
-
-    # Mock feathr_spark_launcher.download_result
-    feathr_client.feathr_spark_launcher.download_result = MagicMock()
+    for spark_runtime in ["local", "databricks", "azure_synapse"]:
+        # Note: make sure the output file exists in the test_user_workspace
+        res_url = str(Path(workspace_dir, "mock_results", output_filename))
+        local_cache_path = res_url
 
-    if feathr_client.spark_runtime == "databricks":
-        res_url = f"dbfs:/{res_url}"
+        # Mock client
+        client = MagicMock()
+        client.spark_runtime = spark_runtime
 
-    df = get_result_df(
-        client=feathr_client,
-        data_format=data_format,
-        res_url=res_url,
-        spark=spark,
-        local_cache_path=local_cache_path,
-    )
-    assert isinstance(df, DataFrame)
-    assert df.count() == expected_count
+        if client.spark_runtime == "databricks":
+            res_url = f"dbfs:/{res_url}"
+
+        df = get_result_df(
+            client=client,
+            data_format=data_format,
+            res_url=res_url,
+            spark=spark,
+            local_cache_path=local_cache_path,
+        )
+        assert isinstance(df, DataFrame)
+        assert df.count() == expected_count

From 190377c1b52e98d3ef1b5a0d5516c52af94f22ab Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Tue, 22 Nov 2022 20:04:10 +0000
Subject: [PATCH 18/18] Change timeout to 1000s in the notebook

Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
---
 docs/samples/nyc_taxi_demo.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb
index 81a11a460..31754950e 100644
--- a/docs/samples/nyc_taxi_demo.ipynb
+++ b/docs/samples/nyc_taxi_demo.ipynb
@@ -751,7 +751,7 @@
     "    output_path=offline_features_path,\n",
     ")\n",
     "\n",
-    "client.wait_job_to_finish(timeout_sec=500)"
+    "client.wait_job_to_finish(timeout_sec=1000)"
    ]
   },
   {
@@ -1020,7 +1020,7 @@
     "    execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n",
     ")\n",
     "\n",
-    "client.wait_job_to_finish(timeout_sec=500)"
+    "client.wait_job_to_finish(timeout_sec=1000)"
    ]
   },
   {