PLAT-183981 [aep-cmle:assignements:databricks] Adapt codebase to support AWS alongside Azure cloud storage providers #27

Open · wants to merge 2 commits into base: main
4 changes: 2 additions & 2 deletions conf/config.ini
@@ -20,8 +20,8 @@ compression_type=gzip
model_name=cmle_propensity_model

[DataRobot]
datarobot_key =
datarobot_endpoint =
datarobot_key =
datarobot_endpoint =

[AWS]
s3_bucket_name=
19 changes: 6 additions & 13 deletions notebooks/databricks/CommonInclude.py
@@ -49,8 +49,7 @@ def getDataPatched(
environment = config.get("Platform", "environment")
client_id = config.get("Authentication", "client_id")
client_secret = config.get("Authentication", "client_secret")
private_key_path = config.get("Authentication", "private_key_path")
tech_account_id = config.get("Authentication", "tech_acct_id")
scopes = config.get("Authentication", "scopes")
dataset_id = config.get("Platform", "dataset_id")
featurized_dataset_id = config.get("Platform", "featurized_dataset_id")
scoring_dataset_id = config.get("Platform", "scoring_dataset_id")
@@ -60,9 +59,6 @@
compression_type = config.get("Cloud", "compression_type")
model_name = config.get("Cloud", "model_name")

if not os.path.exists(private_key_path):
raise Exception(f"Looking for private key file under {private_key_path} but key not found, please verify path")


# COMMAND ----------

@@ -82,9 +78,8 @@

aepp.configure(
org_id=ims_org_id,
tech_id=tech_account_id,
scopes=scopes,
secret=client_secret,
path_to_key=private_key_path,
client_id=client_id,
environment=environment,
sandbox=sandbox_name,
@@ -201,7 +196,6 @@ def get_dataset_ids_by_name(cat_conn, name):

# COMMAND ----------

from adlfs import AzureBlobFileSystem
from fsspec import AbstractFileSystem

def get_export_time(fs: AbstractFileSystem, container_name: str, base_path: str, dataset_id: str):
@@ -219,6 +213,9 @@ def get_export_time(fs: AbstractFileSystem, container_name: str, base_path: str,


# COMMAND ----------
from aepp import flowservice

flow_conn = flowservice.FlowService()

connector = aepp.connector.AdobeRequest(
config_object=aepp.config.config_object,
@@ -230,11 +227,7 @@ def get_export_time(fs: AbstractFileSystem, container_name: str, base_path: str,
aepp.config.endpoints["global"]
+ "/data/foundation/connectors/landingzone/credentials")

dlz_credentials = connector.getData(endpoint=dlz_endpoint, params={"type": "dlz_destination"})
dlz_container = dlz_credentials["containerName"]
dlz_sas_token = dlz_credentials["SASToken"]
dlz_storage_account = dlz_credentials["storageAccountName"]
dlz_sas_uri = dlz_credentials["SASUri"]
dlz_credentials = flow_conn.getLandingZoneCredential()

# COMMAND ----------

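For reference, `getLandingZoneCredential()` returns a different payload depending on the DLZ provider. A rough sketch of the fields the code in this PR relies on (values are placeholders, not real credentials):

```python
# Illustrative only -- key names taken from the code in this PR, values are placeholders.
azure_dlz_credentials = {
    "containerName": "<dlz-container>",
    "SASToken": "<sas-token>",
    "storageAccountName": "<storage-account>",
}

aws_dlz_credentials = {
    "dlzProvider": "Amazon S3",
    "credentials": {
        "awsAccessKeyId": "<access-key-id>",
        "awsSecretAccessKey": "<secret-access-key>",
        "awsSessionToken": "<session-token>",
    },
    "dlzPath": {"bucketName": "<dlz-bucket>", "dlzFolder": "<dlz-folder>"},
}
```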
4 changes: 3 additions & 1 deletion notebooks/databricks/RunMe.py
@@ -84,7 +84,9 @@

pypi_packages = [
"PyGreSQL==5.2.5",
"adlfs==2023.8.0",
"adlfs==2023.9.0",
"fsspec==2023.9.0",
"s3fs==2023.9.0",
"aepp==0.3.1.post5",
"mmh3==4.0.1",
"rstr==3.2.1",
87 changes: 60 additions & 27 deletions notebooks/databricks/Week2Notebook.py
@@ -1349,49 +1349,82 @@ def get_or_create_query_template(template_spec):
# COMMAND ----------

# MAGIC %md
# MAGIC Now that a run of our Data Flow has executed successfully, we're all set! We can do a sanity check to verify that the data indeed made its way into the DLZ. For that, we recommend setting up [Azure Storage Explorer](https://azure.microsoft.com/en-us/products/storage/storage-explorer) to connect to your DLZ container using [this guide](https://experienceleague.adobe.com/docs/experience-platform/destinations/catalog/cloud-storage/data-landing-zone.html?lang=en). To get the credentials, you can execute the code below to get the SAS URL needed:
# MAGIC Now that a run of our Data Flow has executed successfully, we're all set! We can do a sanity check to verify that the data indeed made its way into the DLZ. Depending on whether the DLZ was provisioned on AWS or Azure, we use a generic approach to list the directory structure.

# COMMAND ----------
import aepp
import fsspec
from aepp import flowservice

# TODO: use functionality in aepp once released
from aepp import connector

connector = connector.AdobeRequest(
config_object=aepp.config.config_object,
header=aepp.config.header,
loggingEnabled=False,
logger=None,
)
def getDLZFSPath(credentials: dict):
if 'dlzProvider' in credentials.keys() and 'Amazon S3' in credentials['dlzProvider']:
aws_credentials = {
'key' : credentials['credentials']['awsAccessKeyId'],
'secret' : credentials['credentials']['awsSecretAccessKey'],
'token' : credentials['credentials']['awsSessionToken']
}
return fsspec.filesystem('s3', **aws_credentials), credentials['dlzPath']['bucketName']
else:
abs_credentials = {
'account_name' : credentials['storageAccountName'],
'sas_token' : credentials['SASToken']
}
return fsspec.filesystem('abfss', **abs_credentials), credentials['containerName']

def listDLZ(fs, container, prefix):
entries = fs.ls(container, detail=True)
entries_sorted = sorted(entries, key=lambda x: x['type'], reverse=True) # Directories first
for i, entry in enumerate(entries_sorted):
entry_name = entry['name'].split('/')[-1]
if entry['type'] == 'directory':
entry_name += '/'
connector = '|-- ' if i < len(entries_sorted) - 1 else '└- '
print(f"{prefix}{connector}{entry_name}")
if entry['type'] == 'directory':
new_prefix = prefix + ('| ' if i < len(entries_sorted) - 1 else ' ')
listDLZ(fs, entry['name'], new_prefix)

endpoint = aepp.config.endpoints["global"] + "/data/foundation/connectors/landingzone/credentials"

dlz_credentials = connector.getData(endpoint=endpoint, params={
"type": "dlz_destination"
})
dlz_container = dlz_credentials["containerName"]
dlz_sas_token = dlz_credentials["SASToken"]
dlz_storage_account = dlz_credentials["storageAccountName"]
dlz_sas_uri = dlz_credentials["SASUri"]
print(f"DLZ container: {dlz_container}")
print(f"DLZ storage account: {dlz_storage_account}")
print(f"DLZ SAS URL: {dlz_sas_uri}")
flow_conn = flowservice.FlowService()
credentials = flow_conn.getLandingZoneCredential(dlz_type='dlz_destination')

fs, container = getDLZFSPath(credentials)
listDLZ(fs, container, '')

# COMMAND ----------

# MAGIC %md
# MAGIC Once setup you should be able to see your featurized data as a set of Parquet files under the following directory structure: `cmle/egress/$DATASETID/exportTime=$TIMESTAMP` - see screenshot below.
# MAGIC Once set up, you should be able to see your featurized data as a set of Parquet files under the directory structure `cmle/egress/$DATASETID/exportTime=$TIMESTAMP`, shown here as a tree:
# MAGIC ```
# MAGIC |-- _$azuretmpfolder$/
# MAGIC └- cmle/
# MAGIC └- egress/
# MAGIC └- 66018d8312377d2c68545bac/
# MAGIC └- exportTime=20240405230601/
# MAGIC |-- part-00000-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102384-1-c000.gz.parquet
# MAGIC |-- part-00001-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102385-1-c000.gz.parquet
# MAGIC |-- part-00002-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102386-1-c000.gz.parquet
# MAGIC |-- part-00003-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102387-1-c000.gz.parquet
# MAGIC |-- part-00004-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102388-1-c000.gz.parquet
# MAGIC |-- part-00005-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102389-1-c000.gz.parquet
# MAGIC |-- part-00006-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102390-1-c000.gz.parquet
# MAGIC |-- part-00007-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102391-1-c000.gz.parquet
# MAGIC |-- part-00008-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102392-1-c000.gz.parquet
# MAGIC |-- part-00009-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102393-1-c000.gz.parquet
# MAGIC |-- part-00010-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102394-1-c000.gz.parquet
# MAGIC |-- part-00011-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102395-1-c000.gz.parquet
# MAGIC |-- part-00012-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102396-1-c000.gz.parquet
# MAGIC |-- part-00013-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102397-1-c000.gz.parquet
# MAGIC |-- part-00014-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102398-1-c000.gz.parquet
# MAGIC └- part-00015-tid-6351713407229798623-174d2b9b-87e8-4c29-8a76-ec05b444f26a-102399-1-c000.gz.parquet
# MAGIC ```

# COMMAND ----------

print(f"Featurized data in DLZ should be available under {export_path}/{created_dataset_id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ![DLZ](/files/static/7cf4bf44-5482-4426-a3b3-842be2f737b1/media/CMLE-Notebooks-Week2-ExportedDataset.png)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4.5 Saving the featurized dataset to the configuration

98 changes: 84 additions & 14 deletions notebooks/databricks/Week3Notebook.py
@@ -48,35 +48,105 @@

# COMMAND ----------

from adlfs import AzureBlobFileSystem
import fsspec
from fsspec import AbstractFileSystem

abfs = AzureBlobFileSystem(account_name=dlz_storage_account, sas_token=dlz_sas_token)
export_time = get_export_time(abfs, dlz_container, export_path, featurized_dataset_id)
def getDLZFSPath(credentials: dict):
if 'dlzProvider' in credentials.keys() and 'Amazon S3' in credentials['dlzProvider']:
aws_credentials = {
'key' : credentials['credentials']['awsAccessKeyId'],
'secret' : credentials['credentials']['awsSecretAccessKey'],
'token' : credentials['credentials']['awsSessionToken']
}
return fsspec.filesystem('s3', **aws_credentials), credentials['dlzPath']['bucketName']
else:
abs_credentials = {
'account_name' : credentials['storageAccountName'],
'sas_token' : credentials['SASToken']
}
return fsspec.filesystem('abfss', **abs_credentials), credentials['containerName']


def get_export_time(fs: AbstractFileSystem, container_name: str, base_path: str, dataset_id: str):
featurized_data_base_path = f"{container_name}/{base_path}/{dataset_id}"
featurized_data_export_paths = fs.ls(featurized_data_base_path)

if len(featurized_data_export_paths) == 0:
raise Exception(f"Found no exports for featurized data from dataset ID {dataset_id} under path {featurized_data_base_path}")
elif len(featurized_data_export_paths) > 1:
print(f"Found {len(featurized_data_export_paths)} exports from dataset dataset ID {dataset_id} under path {featurized_data_base_path}, using most recent one")

featurized_data_export_path = featurized_data_export_paths[-1]
featurized_data_export_time = featurized_data_export_path.strip().split("/")[-1].split("=")[-1]
return featurized_data_export_time


fs, container = getDLZFSPath(res)


export_time = get_export_time(fs, container, export_path, featurized_dataset_id)
print(f"Using featurized data export time of {export_time}")

# COMMAND ----------

# MAGIC %md
# MAGIC At this point we're ready to read this data. We're using Spark since it could be pretty large as we're not doing any sampling. Spark needs the following properties to be able to authenticate using SAS:
# MAGIC At this point we're ready to read this data. We're using Spark since the data could be pretty large, as we're not doing any sampling.
# MAGIC
# MAGIC Depending on how the account was provisioned, the Data Landing Zone can be backed by either Azure or AWS.
# MAGIC In the case of Azure, the following properties are used to authenticate using SAS:
# MAGIC - `fs.azure.account.auth.type.$ACCOUNT.dfs.core.windows.net` should be set to `SAS`.
# MAGIC - `fs.azure.sas.token.provider.type.$ACCOUNT.dfs.core.windows.net` should be set to `org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider`.
# MAGIC - `fs.azure.sas.fixed.token.$ACCOUNT.dfs.core.windows.net` should be set to the SAS token retrieved earlier.
# MAGIC
# MAGIC Let's put that in practice and create a Spark dataframe containing the entire featurized data:

# COMMAND ----------
# MAGIC
# MAGIC In the case of AWS, the following properties are used to access data stored in S3:
# MAGIC - `fs.s3a.access.key` and `spark.hadoop.fs.s3a.access.key` should be set to the S3 access key
# MAGIC - `fs.s3a.secret.key` and `spark.hadoop.fs.s3a.secret.key` should be set to the S3 secret key
# MAGIC - `fs.s3a.session.token` and `spark.hadoop.fs.s3a.session.token` should be set to the S3 session token
# MAGIC - `fs.s3a.aws.credentials.provider` and `spark.hadoop.fs.s3a.aws.credentials.provider` should be set to `org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider`
# MAGIC - `fs.s3.impl` and `spark.hadoop.fs.s3.impl` should be set to `org.apache.hadoop.fs.s3a.S3AFileSystem`
# MAGIC
# MAGIC
# MAGIC The above properties are derived from the landing zone credentials; the following utility method sets them up:
# COMMAND ----------

def configureSparkSessionAndGetPath(credentials):
if 'dlzProvider' in credentials.keys() and 'Amazon S3' in credentials['dlzProvider']:
aws_key = credentials['credentials']['awsAccessKeyId']
aws_secret = credentials['credentials']['awsSecretAccessKey']
aws_token = credentials['credentials']['awsSessionToken']
        aws_bucket = credentials['dlzPath']['bucketName']
dlz_folder = credentials['dlzPath']['dlzFolder']
spark.conf.set("fs.s3a.access.key", aws_key)
spark.conf.set("fs.s3a.secret.key", aws_secret)
spark.conf.set("fs.s3a.session.token", aws_token)
spark.conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
spark.conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
spark.conf.set("spark.hadoop.fs.s3a.access.key", aws_key)
spark.conf.set("spark.hadoop.fs.s3a.secret.key", aws_secret)
spark.conf.set("fs.s3a.session.token", aws_token)
return f"s3a://${aws_buket}/{dlz_folder}/"
else:
dlz_storage_account = credentials['storageAccountName']
dlz_sas_token = credentials['SASToken']
dlz_container = credentials['containerName']
spark.conf.set(f"fs.azure.account.auth.type.{dlz_storage_account}.dfs.core.windows.net", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{dlz_storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{dlz_storage_account}.dfs.core.windows.net", dlz_sas_token)
return f"abfss://{dlz_container}@{dlz_storage_account}.dfs.core.windows.net/"

spark.conf.set(f"fs.azure.account.auth.type.{dlz_storage_account}.dfs.core.windows.net", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{dlz_storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{dlz_storage_account}.dfs.core.windows.net", dlz_sas_token)
# init the Spark session for the provisioned DLZ and get the base path (s3a://bucket_name/folder or abfss://container@account/)
cloud_base_path = configureSparkSessionAndGetPath(credentials)

protocol = "abfss"
input_path = f"{protocol}://{dlz_container}@{dlz_storage_account}.dfs.core.windows.net/{export_path}/{featurized_dataset_id}/exportTime={export_time}/"
input_path = cloud_base_path + f"{export_path}/{featurized_dataset_id}/exportTime={export_time}/"

dlz_input_df = spark.read.parquet(input_path).na.fill(0)
# Let's put that in practice and create a Spark DataFrame containing the entire featurized data:
dlz_input_df = spark.read.parquet(input_path)
dlz_input_df.printSchema()
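As an optional follow-up (not part of this diff), a quick sanity check on the loaded DataFrame might look like:

```python
# Optional check: count rows and peek at a few records of the featurized data.
print(f"Loaded {dlz_input_df.count()} rows from {input_path}")
dlz_input_df.show(5, truncate=False)
```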


# COMMAND ----------

# MAGIC %md