Commit: Merge branch 'main' into feature/datacoves3.3

Mayra Peña authored and committed on Feb 22, 2025
2 parents 1eec64a + 874aa78, commit 6ec4d92
Showing 74 changed files with 934 additions and 215 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/10_integrate_dbt_changes.yml
@@ -79,6 +79,10 @@ jobs:
##### Governance Checks
# This first runs dbt but creates empty tables; that is enough to run the hooks and fail fast

# We need to run the observe model so that the post-hook works
- name: Run Observe Model
run: "dbt build --fail-fast -s L1_inlets.observe"

# There is an issue with --empty and dynamic tables, so we need to exclude them
- name: Governance run of dynamic tables
run: "dbt build --fail-fast -s config.materialized:dynamic_table -s test_failures --defer --state logs"
21 changes: 21 additions & 0 deletions .github/workflows/30_deploy_changes_to_production.yml
@@ -47,6 +47,8 @@ jobs:
DATACOVES__MAIN__USER: ${{ vars.DATACOVES__MAIN__USER }}
DATACOVES__MAIN__PASSWORD: ${{ secrets.DATACOVES__MAIN__PASSWORD }}

DBT_ARTIFACTS_BUCKET: "convexa-local"

# This is used by Datacoves to drop the staging database for blue/green
# deployments. Most likely you don't want to set this; we use it for demos.
DATACOVES__DROP_DB_ON_FAIL: ${{ vars.DATACOVES__DROP_DB_ON_FAIL }}
@@ -104,6 +106,25 @@ jobs:
- name: Upload dbt artifacts
run: "dbt run-operation upload_artifacts"

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-west-2

- name: Upload artifacts to S3 for Datahub
run: |
# Check and upload each file individually
for file in target/sources.json target/manifest.json target/catalog.json target/run_results.json; do
if [ -f "$file" ]; then
echo "Uploading $file to S3..."
aws s3 cp "$file" "s3://${DBT_ARTIFACTS_BUCKET}/dbt_artifacts/$(basename $file)"
else
echo "File $file does not exist, skipping..."
fi
done
- uses: fregante/setup-git-user@v2
- name: Bump dbt project and git project version
run: "../automate/dbt/bump_dbt_project.sh"
8 changes: 3 additions & 5 deletions load/dlt/loans_data.py
@@ -1,4 +1,4 @@
#!/usr/bin/env -S uv run --verbose --cache-dir /tmp/.uv_cache
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
# "dlt[snowflake, parquet]==1.5.0",
@@ -28,7 +28,7 @@ def zip_coordinates():

@dlt.source
def loans_data():
return personal_loans, zip_coordinates
return [personal_loans, zip_coordinates]

if __name__ == "__main__":
datacoves_snowflake = dlt.destinations.snowflake(
@@ -46,8 +46,6 @@ def loans_data():
dataset_name="loans"
)

load_info = pipeline.run([
loans_data()
])
load_info = pipeline.run(loans_data())

print(load_info)
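For reference, a `@dlt.source` is expected to return one or more resources; returning them as a list makes that explicit, and `pipeline.run()` can take the source object directly. A minimal self-contained sketch of the same pattern, using a local DuckDB destination and hypothetical rows (the real script loads loan data into Snowflake):

```python
import dlt

@dlt.resource
def personal_loans():
    # Hypothetical stand-in rows; the real resource reads loan source data.
    yield [{"loan_id": 1, "amount": 1000}]

@dlt.resource
def zip_coordinates():
    yield [{"zip": "90210", "lat": 34.09, "lon": -118.41}]

@dlt.source
def loans_data():
    # Return a list of resources, as the updated script does.
    return [personal_loans, zip_coordinates]

if __name__ == "__main__":
    pipeline = dlt.pipeline(pipeline_name="loans", destination="duckdb", dataset_name="loans")
    # Pass the source directly, mirroring load_info = pipeline.run(loans_data())
    print(pipeline.run(loans_data()))
```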
6 changes: 3 additions & 3 deletions load/dlt/us_population.py
@@ -1,4 +1,4 @@
#!/usr/bin/env -S uv run --cache-dir /tmp/.uv_cache
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
# "dlt[snowflake, parquet]==1.5.0",
@@ -30,7 +30,7 @@ def us_population_source():
)

pipeline = dlt.pipeline(
progress = "enlighten",
progress = "log",
pipeline_name = "loans",
destination = datacoves_snowflake,
pipelines_dir = pipelines_dir,
@@ -40,7 +40,7 @@ def us_population_source():
)

load_info = pipeline.run([
us_population()
us_population_source()
])

print(load_info)
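Two fixes in this file: the old `us_population()` call referenced a name that does not match the source function shown in the hunk header (`us_population_source`), so it appears it would have failed at runtime; and the progress collector switches from `enlighten` to `log`. The latter is likely because this pipeline runs non-interactively (for example from Airflow), where dlt's `log` collector writes periodic progress lines to the log instead of drawing a terminal progress bar that needs a TTY.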
1 change: 1 addition & 0 deletions observe/osmosis/.env.sample
@@ -0,0 +1 @@
OPENAI_API_KEY=abc....
22 changes: 22 additions & 0 deletions observe/osmosis/readme.md
@@ -0,0 +1,22 @@
# dbt-osmosis

## Set OpenAI key from .env file
`set -a && source /config/workspace/observe/osmosis/.env && set +a`

`set -a` auto-exports every variable assigned while sourcing, so `OPENAI_API_KEY` becomes visible to the `uvx` processes below; `set +a` turns auto-export back off.

## Run Refactor
Runs the organize step, which syncs YAML files with the database schema.
With `--synthesize`, this also uses OpenAI to add descriptions to columns.

`uvx --with=dbt-snowflake~=1.8.0 --from 'dbt-osmosis[openai]' dbt-osmosis yaml refactor --synthesize`

Run without OpenAI

`uvx --with=dbt-snowflake~=1.8.0 dbt-osmosis yaml refactor`

You can pass `--fqn` with the name of a folder in `models/` to limit the refactor

`dbt-osmosis yaml refactor --fqn L3_coves`

Separate sub-folders with a `.`

`dbt-osmosis yaml refactor --fqn L3_coves.loan_analytics`
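As a hypothetical illustration of what a refactor pass does (model and column names below are invented), assuming a `stg_personal_loans` model whose YAML is missing a column that exists in the warehouse:

```yaml
# Hypothetical result for one model; names are illustrative only.
models:
  - name: stg_personal_loans
    columns:
      - name: loan_id          # already documented, left as-is
        description: "Primary key of the loan."
      - name: interest_rate    # discovered in the database and appended by osmosis
        description: "Annual interest rate."  # drafted by OpenAI when --synthesize is used
```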
10 changes: 5 additions & 5 deletions orchestrate/dags/daily_loan_run.py
@@ -12,7 +12,7 @@ def daily_loan_run():

@task_group(group_id="extract_and_load_airbyte", tooltip="Airbyte Extract and Load")
def extract_and_load_airbyte():

@task
def sync_airbyte():
from airflow.providers.airbyte.operators.airbyte import AirbyteTriggerSyncOperator
@@ -21,14 +21,14 @@ def sync_airbyte():
connection_id="ac02ea96-58a1-4061-be67-78900bb5aaf6",
airbyte_conn_id="airbyte_connection",
).execute({})

sync_airbyte()

tg_extract_and_load_airbyte = extract_and_load_airbyte()

@task_group(group_id="extract_and_load_fivetran", tooltip="Fivetran Extract and Load")
def extract_and_load_fivetran():

@task
def trigger_fivetran():
from fivetran_provider_async.operators import FivetranOperator
@@ -52,7 +52,7 @@ def sensor_fivetran():
trigger = trigger_fivetran()
sensor = sensor_fivetran()
trigger >> sensor # Set dependency

tg_extract_and_load_fivetran = extract_and_load_fivetran()

@task_group(group_id="extract_and_load_dlt", tooltip="dlt Extract and Load")
@@ -66,7 +66,7 @@ def load_us_population():

tg_extract_and_load_dlt = extract_and_load_dlt()

@task.datacoves_dbt(connection_id="main")
@task.datacoves_dbt(connection_id="main")
def transform():
return "dbt build -s 'tag:daily_run_airbyte+ tag:daily_run_fivetran+ -t prd'"

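The hunk ends before the final dependency wiring, so the sketch below is a hypothetical reconstruction of how these pieces are typically chained: the dbt transform runs only after all three extract-and-load task groups finish.

```python
# Hypothetical wiring, not shown in this diff: fan in the three task groups
# before the dbt transform task.
[
    tg_extract_and_load_airbyte,
    tg_extract_and_load_fivetran,
    tg_extract_and_load_dlt,
] >> transform()
```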
20 changes: 20 additions & 0 deletions orchestrate/dags/default_args_dag.py
@@ -0,0 +1,20 @@
from airflow.decorators import dag
from orchestrate.utils.default_args import default_args
from operators.datacoves.dbt import DatacovesDbtOperator


@dag(
default_args=default_args,
description="Daily dbt run",
schedule="0 12 * * *",
tags=["transform"],
catchup=False,
)
def default_args_dag():
run_dbt = DatacovesDbtOperator(
task_id="run_dbt", bash_command="dbt run -s country_codes"
)


dag = default_args_dag()
11 changes: 11 additions & 0 deletions orchestrate/utils/default_args.py
@@ -0,0 +1,11 @@
# utils/default_args.py
from datetime import datetime, timedelta

default_args = {
'owner': '[email protected]',
'email_on_failure': False,
'email_on_retry': False,
'retries': 3,
'retry_delay': timedelta(minutes=5),
'start_date': datetime(2023, 12, 1),
}
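Airflow merges these values into every task of a DAG that passes `default_args=default_args`; a task-level argument wins over the shared one. A minimal sketch with a hypothetical DAG and override:

```python
from airflow.decorators import dag, task
from orchestrate.utils.default_args import default_args


@dag(default_args=default_args, schedule=None, catchup=False)
def override_example():  # hypothetical DAG, for illustration only
    @task(retries=0)  # task-level retries=0 overrides the shared retries=3
    def noop():
        pass

    noop()


dag = override_example()
```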
4 changes: 2 additions & 2 deletions secure/databases.yml
@@ -9,11 +9,11 @@
schemas:
- L1_ACCOUNT_USAGE
- L1_COUNTRY_DATA
- L1_GOOGLE_ANALYTICS_4
- L1_DBT_ARTIFACTS
- L1_COVID19_EPIDEMIOLOGICAL_DATA
- L1_GOOGLE_ANALYTICS_4
- L1_LOANS
- L1_OBSERVE
- L1_US_POPULATION

- L2_COUNTRY_DEMOGRAPHICS
- L2_COVID_OBSERVATIONS
51 changes: 25 additions & 26 deletions secure/roles.yml
@@ -16,16 +16,15 @@

- z_schema_seeds

- z_schema_l1_dbt_artifacts
- z_schema_dbt_test__audit

- z_schema_l1_account_usage
- z_schema_l1_country_data
- z_schema_l1_google_analytics_4
- z_schema_l1_dbt_artifacts
- z_schema_l1_covid19_epidemiological_data
- z_schema_l1_google_analytics_4
- z_schema_l1_loans
- z_schema_l1_observe

- z_schema_dbt_test__audit
- z_schema_l1_us_population

- z_schema_l2_country_demographics
- z_schema_l2_covid_observations
@@ -355,66 +354,66 @@
read:
- raw.snapshots

# INLETS
- z_schema_l1_account_usage:
- z_schema_resources:
privileges:
schemas:
read:
- balboa.l1_account_usage
- balboa_apps.resources

- z_schema_l1_country_data:
- z_schema_seeds:
privileges:
schemas:
read:
- balboa.l1_country_data
- balboa.seeds

- z_schema_l1_google_analytics_4:
- z_schema_dbt_test__audit:
privileges:
schemas:
read:
- balboa.l1_google_analytics_4
- balboa.dbt_test__audit

- z_schema_l1_covid19_epidemiological_data:
# INLETS
- z_schema_l1_account_usage:
privileges:
schemas:
read:
- balboa.l1_covid19_epidemiological_data
- balboa.l1_account_usage

- z_schema_l1_dbt_artifacts:
- z_schema_l1_country_data:
privileges:
schemas:
read:
- balboa.l1_dbt_artifacts
- balboa.l1_country_data

- z_schema_l1_loans:
- z_schema_l1_covid19_epidemiological_data:
privileges:
schemas:
read:
- balboa.l1_loans
- balboa.l1_covid19_epidemiological_data

- z_schema_l1_observe:
- z_schema_l1_google_analytics_4:
privileges:
schemas:
read:
- balboa.l1_observe
- balboa.l1_google_analytics_4

- z_schema_resources:
- z_schema_l1_loans:
privileges:
schemas:
read:
- balboa_apps.resources
- balboa.l1_loans

- z_schema_seeds:
- z_schema_l1_observe:
privileges:
schemas:
read:
- balboa.seeds
- balboa.l1_observe

- z_schema_dbt_test__audit:
- z_schema_l1_us_population:
privileges:
schemas:
read:
- balboa.dbt_test__audit
- balboa.l1_us_population

# BAYS
- z_schema_l2_country_demographics:
2 changes: 2 additions & 0 deletions secure/users.yml
@@ -29,6 +29,8 @@
can_login: true
member_of:
- analyst
- accountadmin
- securityadmin

- sebastian:
can_login: true
1 change: 1 addition & 0 deletions training_and_demos/uv_slack/.env.sample
@@ -0,0 +1 @@
SLACK_API_TOKEN=xyz...
31 changes: 31 additions & 0 deletions training_and_demos/uv_slack/msg_slack.py
@@ -0,0 +1,31 @@
#!/usr/bin/env -S uv run --cache-dir /tmp/.uv_cache
# /// script
# dependencies = [
# "slack_sdk==3.34.0",
# "python-dotenv"
# ]
# ///
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from dotenv import load_dotenv
import os

load_dotenv()
SLACK_TOKEN = os.getenv("SLACK_API_TOKEN")

def send_hello_world():
# Initialize the Slack client with your bot token
client = WebClient(token=SLACK_TOKEN)
print("TEST")
try:
response = client.chat_postMessage(
channel='#bot-aks-notifications',
text='Hello, World! 👋'
)
print(f"Message sent successfully: {response['ts']}")

except SlackApiError as e:
print(f"Error sending message: {e.response['error']}")

if __name__ == "__main__":
send_hello_world()
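Assuming `.env.sample` is copied to `.env` with a real token, the uv shebang lets the script run directly; uv resolves the inline `slack_sdk` and `python-dotenv` dependencies on first run:

```bash
cd training_and_demos/uv_slack
cp .env.sample .env   # put a real token in .env first
chmod +x msg_slack.py
./msg_slack.py
```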
4 changes: 2 additions & 2 deletions transform/.dbt_coves/config.yml
@@ -4,8 +4,8 @@ generate:
# schemas: # List of schema names where to look for source tables
# - RAW
sources_destination: "models/L1_inlets/{{schema}}/_{{schema}}.yml" # Where sources yml files will be generated
models_destination: "models/L1_inlets/{{schema}}/{{relation}}.sql" # Where models sql files will be generated
model_props_destination: "models/L1_inlets/{{schema}}/{{relation}}.yml" # Where models yml files will be generated
models_destination: "models/L1_inlets/{{schema}}/stg_{{relation}}.sql" # Where models sql files will be generated
model_props_destination: "models/L1_inlets/{{schema}}/stg_{{relation}}.yml" # Where models yml files will be generated
update_strategy: update # Action to perform when a property file exists. Options: update, recreate, fail, ask
templates_folder: ".dbt_coves/templates" # Folder where source generation jinja templates are located.
flatten_json_fields: "no" # Action to perform when a VARIANT / JSON field is encountered
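As an illustration (schema and table names are hypothetical), generating from a source table `LOANS.PERSONAL_LOANS` would now produce files like the following, with the exact casing depending on how `{{schema}}` and `{{relation}}` are rendered:

```
models/L1_inlets/loans/_loans.yml               # sources yml (unchanged)
models/L1_inlets/loans/stg_personal_loans.sql   # model sql, now with the stg_ prefix
models/L1_inlets/loans/stg_personal_loans.yml   # model properties, same prefix
```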
2 changes: 1 addition & 1 deletion transform/.dbt_coves/templates/staging_model_props.yml
@@ -1,7 +1,7 @@
version: 2

models:
- name: {{model}}
- name: stg_{{ model | lower }}
description: ''
columns:
{%- for cols in nested.values() %}
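For reference, a hypothetical render of this template for a model named `PERSONAL_LOANS` with two invented columns shows the effect of the new `stg_` prefix and the `lower` filter:

```yaml
version: 2

models:
  - name: stg_personal_loans
    description: ''
    columns:
      - name: loan_id
        description: ''
      - name: loan_amount
        description: ''
```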