Merge pull request #23 from databricks/sql-templates
Add examples based on default-sql & dbt-sql templates
lennartkats-db authored Apr 5, 2024
2 parents 200965d + 24678f5 commit 239f7d2
Showing 28 changed files with 621 additions and 0 deletions.
14 changes: 14 additions & 0 deletions dbt_sql/.gitignore
@@ -0,0 +1,14 @@
# DABs
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md

# dbt
target/
dbt_packages/
logs/
3 changes: 3 additions & 0 deletions dbt_sql/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
6 changes: 6 additions & 0 deletions dbt_sql/.vscode/extensions.json
@@ -0,0 +1,6 @@
{
  "recommendations": [
    "redhat.vscode-yaml",
    "innoverio.vscode-dbt-power-user"
  ]
}
33 changes: 33 additions & 0 deletions dbt_sql/.vscode/settings.json
@@ -0,0 +1,33 @@
{
  "python.analysis.stubPath": ".vscode",
  "databricks.python.envFile": "${workspaceFolder}/.env",
  "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
  "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
  "python.testing.pytestArgs": [
    "."
  ],
  "python.testing.unittestEnabled": false,
  "python.testing.pytestEnabled": true,
  "python.analysis.extraPaths": ["src"],
  "files.exclude": {
    "**/*.egg-info": true,
    "**/__pycache__": true,
    ".pytest_cache": true
  },
  "python.envFile": "${workspaceFolder}/.databricks/.databricks.env",
  "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
  "sqltools.connections": [
    {
      "connectionMethod": "VS Code Extension (beta)",
      "catalog": "hive_metastore",
      "previewLimit": 50,
      "driver": "Databricks",
      "name": "databricks",
      "path": "/sql/1.0/warehouses/abcdef1234567890"
    }
  ],
  "sqltools.autoConnectTo": "",
  "[jinja-sql]": {
    "editor.defaultFormatter": "innoverio.vscode-dbt-power-user"
  }
}
138 changes: 138 additions & 0 deletions dbt_sql/README.md
@@ -0,0 +1,138 @@
# dbt_sql

The 'dbt_sql' project was generated using the dbt template for
Databricks Asset Bundles. It follows the standard dbt project structure
and has an additional `resources` directory to define Databricks resources such as jobs
that run dbt models.

* Learn more about dbt and its standard project structure here: https://docs.getdbt.com/docs/build/projects.
* Learn more about Databricks Asset Bundles here: https://docs.databricks.com/en/dev-tools/bundles/index.html

The remainder of this file includes instructions for local development (using dbt)
and deployment to production (using Databricks Asset Bundles).

## Development setup

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2. Authenticate to your Databricks workspace, if you have not done so already:
```
$ databricks configure
```
3. Install dbt
To install dbt, you need a recent version of Python. For the instructions below,
we assume `python3` refers to the Python version you want to use. On some systems,
you may need to refer to a different Python version, e.g. `python` or `/usr/bin/python`.
Run these instructions from the `dbt_sql` directory. We recommend making
use of a Python virtual environment and installing dbt as follows:
```
$ python3 -m venv .venv
$ . .venv/bin/activate
$ pip install -r requirements-dev.txt
```
4. Initialize your dbt profile
Use `dbt init` to initialize your profile.
```
$ dbt init
```
Note that dbt authentication uses personal access tokens by default
(see https://docs.databricks.com/dev-tools/auth/pat.html).
You can use OAuth as an alternative, but this currently requires manual configuration.
See https://github.com/databricks/dbt-databricks/blob/main/docs/oauth.md
for general instructions, or https://community.databricks.com/t5/technical-blog/using-dbt-core-with-oauth-on-azure-databricks/ba-p/46605
for advice on setting up OAuth for Azure Databricks.
To set up additional profiles, such as a 'prod' profile,
see https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles.
5. Activate dbt so it can be used from the terminal
```
$ . .venv/bin/activate
```
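The profile written by `dbt init` can also be edited by hand. As a hedged sketch only (the host, warehouse path, and schema below are placeholders, and `auth_type: oauth` assumes a recent version of the dbt-databricks adapter), a local profile entry that uses OAuth instead of a personal access token might look like:

```yaml
# ~/.dbt/profiles.yml -- illustrative only; all values below are placeholders
dbt_sql:
  target: dev
  outputs:
    dev:
      type: databricks
      host: myworkspace.databricks.com
      http_path: /sql/1.0/warehouses/abcdef1234567890
      catalog: main
      schema: your_dev_schema
      auth_type: oauth   # browser-based OAuth; omit to use a personal access token instead
      threads: 4
```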
## Local development with dbt
Use `dbt` to [run this project locally using a SQL warehouse](https://docs.databricks.com/partners/prep/dbt.html):
```
$ dbt seed
$ dbt run
```
(If you get an error that the `dbt` command could not be found, you may need
to re-run the last step of the development setup above to re-activate
your Python virtual environment.)
To evaluate just a single model, defined in a file called `orders.sql`, use:
```
$ dbt run --model orders
```
Use `dbt test` to run tests generated from yml files such as `models/schema.yml`
and any SQL tests from `tests/`:
```
$ dbt test
```
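For reference, a minimal sketch of such a yml file, using the `orders_daily` model from this template (the particular tests chosen here are illustrative):

```yaml
# src/models/example/schema.yml -- illustrative tests for the orders_daily model
version: 2
models:
  - name: orders_daily
    columns:
      - name: order_date
        tests:
          - not_null
          - unique
```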
## Production setup
Your production dbt profiles are defined in dbt_profiles/profiles.yml.
These profiles define the default catalog, schema, and any other
target-specific settings. Read more about dbt profiles on Databricks at
https://docs.databricks.com/en/workflows/jobs/how-to/use-dbt-in-workflows.html#advanced-run-dbt-with-a-custom-profile.
The target workspaces for staging and prod are defined in databricks.yml.
You can manually deploy based on these configurations (see below),
or use CI/CD to automate deployment. See
https://docs.databricks.com/dev-tools/bundles/ci-cd.html for documentation
on CI/CD setup.
## Manually deploying to Databricks with Databricks Asset Bundles
Databricks Asset Bundles can be used to deploy to Databricks and to execute
dbt commands as a job using Databricks Workflows. See
https://docs.databricks.com/dev-tools/bundles/index.html to learn more.
Use the Databricks CLI to deploy a development copy of this project to a workspace:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `--target` parameter
is optional here.)
This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] dbt_sql_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.
You can also deploy to your production target directly from the command-line.
The warehouse, catalog, and schema for that target are configured in databricks.yml.
When deploying to this target, note that the default job at resources/dbt_sql_job.yml
has a schedule set that runs every day. The schedule is paused when deploying in development mode
(see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
To deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
## IDE support
Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Third-party extensions
related to dbt may further enhance your dbt development experience!
30 changes: 30 additions & 0 deletions dbt_sql/databricks.yml
@@ -0,0 +1,30 @@
# This file defines the structure of this project and how it is deployed
# to production using Databricks Asset Bundles.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: dbt_sql

include:
  - resources/*.yml

# Deployment targets.
# The default schema, catalog, etc. for dbt are defined in dbt_profiles/profiles.yml
targets:
  dev:
    default: true
    # We use 'mode: development' to indicate this is a personal development copy.
    # Any job schedules and triggers are paused by default.
    mode: development
    workspace:
      host: https://myworkspace.databricks.com

  prod:
    mode: production
    workspace:
      host: https://myworkspace.databricks.com
      # We always use /Users/[email protected] for all resources to make sure we only have a single copy.
      root_path: /Users/[email protected]/.bundle/${bundle.name}/${bundle.target}
    run_as:
      # This runs as [email protected] in production. We could also use a service principal here
      # using service_principal_name (see the Databricks documentation).
      user_name: [email protected]
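As the comment above suggests, production jobs can instead run as a service principal. A hedged sketch of that variant (the application ID below is a placeholder):

```yaml
run_as:
  # Replace with the application ID of your service principal.
  service_principal_name: "00000000-0000-0000-0000-000000000000"
```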
33 changes: 33 additions & 0 deletions dbt_sql/dbt_profiles/profiles.yml
@@ -0,0 +1,33 @@

# This file defines dbt profiles for deployed dbt jobs.
# Note that for local development you should create your own local profile
# (see README.md).
dbt_sql:
  target: dev # default target
  outputs:

    dev:
      type: databricks
      method: http
      catalog: main
      schema: "{{ var('dev_schema') }}"

      http_path: /sql/1.0/warehouses/abcdef1234567890

      # The workspace host / token are provided by Databricks
      # see databricks.yml for the host used for 'dev'
      host: "{{ env_var('DBT_HOST') }}"
      token: "{{ env_var('DBT_ACCESS_TOKEN') }}"

    prod:
      type: databricks
      method: http
      catalog: main
      schema: default

      http_path: /sql/1.0/warehouses/abcdef1234567890

      # The workspace host / token are provided by Databricks
      # see databricks.yml for the host used for 'prod'
      host: "{{ env_var('DBT_HOST') }}"
      token: "{{ env_var('DBT_ACCESS_TOKEN') }}"
32 changes: 32 additions & 0 deletions dbt_sql/dbt_project.yml
@@ -0,0 +1,32 @@
name: 'dbt_sql'
version: '1.0.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project.
profile: 'dbt_sql'

# These configurations specify where dbt should look for different types of files.
# For Databricks asset bundles, we put everything in src, as you may have
# non-dbt resources in your project.
model-paths: ["src/models"]
analysis-paths: ["src/analyses"]
test-paths: ["src/tests"]
seed-paths: ["src/seeds"]
macro-paths: ["src/macros"]
snapshot-paths: ["src/snapshots"]

clean-targets: # directories to be removed by `dbt clean`
- "target"
- "dbt_packages"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# In this example config, we tell dbt to build all models in the example/
# directory as views by default. These settings can be overridden in the
# individual model files using the `{{ config(...) }}` macro.
models:
  dbt_sql:
    # Config indicated by + and applies to all files under models/example/
    example:
      +materialized: view
23 changes: 23 additions & 0 deletions dbt_sql/profile_template.yml
@@ -0,0 +1,23 @@
# This file defines prompts with defaults for dbt initialization.
# It is used when the `dbt init` command is invoked.
#
fixed:
  type: databricks
prompts:
  host:
    default: myworkspace.databricks.com
  token:
    hint: 'personal access token to use, dapiXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    hide_input: true
  http_path:
    hint: 'HTTP path of SQL warehouse to use'
    default: /sql/1.0/warehouses/abcdef1234567890
  catalog:
    hint: 'initial catalog'
    default: main
  schema:
    hint: 'personal schema where dbt will build objects during development, example: user_name'
  threads:
    hint: 'threads to use during development, 1 or more'
    type: 'int'
    default: 4
3 changes: 3 additions & 0 deletions dbt_sql/requirements-dev.txt
@@ -0,0 +1,3 @@
## requirements-dev.txt: dependencies for local development.

dbt-databricks>=1.0.0,<2.0.0
40 changes: 40 additions & 0 deletions dbt_sql/resources/dbt_sql_job.yml
@@ -0,0 +1,40 @@
resources:
  jobs:
    dbt_sql_job:
      name: dbt_sql_job

      schedule:
        # Run every day at 9:27 AM
        quartz_cron_expression: '21 27 9 * * ?'
        timezone_id: UTC

      email_notifications:
        on_failure:
          - [email protected]

      tasks:
        - task_key: dbt

          dbt_task:
            project_directory: ../
            # The default schema, catalog, etc. are defined in ../dbt_profiles/profiles.yml
            profiles_directory: dbt_profiles/
            commands:
              - 'dbt deps --target=${bundle.target}'
              - 'dbt seed --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"'
              - 'dbt run --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"'

          libraries:
            - pypi:
                package: dbt-databricks>=1.0.0,<2.0.0

          new_cluster:
            spark_version: 13.3.x-scala2.12
            node_type_id: i3.xlarge
            data_security_mode: SINGLE_USER
            num_workers: 0
            spark_conf:
              spark.master: "local[*, 4]"
              spark.databricks.cluster.profile: singleNode
            custom_tags:
              ResourceClass: SingleNode
Empty file added dbt_sql/src/analyses/.gitkeep
Empty file.
Empty file added dbt_sql/src/macros/.gitkeep
Empty file.
17 changes: 17 additions & 0 deletions dbt_sql/src/models/example/orders_daily.sql
@@ -0,0 +1,17 @@

-- This model file defines a materialized view called 'orders_daily'
--
-- Read more about materialized views at https://docs.getdbt.com/reference/resource-configs/databricks-configs#materialized-views-and-streaming-tables
-- Current limitation: a "full refresh" is needed in case the definition below is changed; see https://github.com/databricks/dbt-databricks/issues/561.
{{ config(materialized = 'materialized_view') }}

select order_date, count(*) AS number_of_orders

from {{ ref('orders_raw') }}

-- During development, only process a smaller range of data
{% if target.name != 'prod' %}
where order_date >= '2019-08-01' and order_date < '2019-09-01'
{% endif %}

group by order_date
