v0.0.6 Query metrics from history #32

Merged · 26 commits · merged Mar 5, 2024

Commits
db0218b  git commit -m "Fix setSchema and query execution from directory issue… (anhhchu, Jan 15, 2024)
9ccf81e  Rebuild v0.0.4 wheel file (anhhchu, Jan 16, 2024)
f8937cd  Feature: Enable pro and classic warehouse creation (anhhchu, Jan 19, 2024)
b676f01  Feature: Enable pro and classic warehouse creation (anhhchu, Jan 19, 2024)
d5396de  Fix method call in SQLWarehouseUtils (anhhchu, Jan 19, 2024)
6f590c8  Replace warehouse status check with WorkspaceClient().warehouses.star… (anhhchu, Jan 19, 2024)
8ac21b2  Merge branch 'goodwillpunning:main' into main (anhhchu, Jan 19, 2024)
80cb6e9  Replace beaker v0.0.5 (anhhchu, Jan 19, 2024)
96a4a99  Add option to specify warehouse type in new SQL warehouse provisioning (anhhchu, Jan 19, 2024)
ac120de  Merge branch 'main' into feature-warehouse_type (anhhchu, Jan 19, 2024)
9ccba33  Merge pull request #1 from anhhchu/feature-warehouse_type (anhhchu, Jan 22, 2024)
9b02b08  Add auth option to workspace client (anhhchu, Jan 22, 2024)
fbf138d  Merge branch 'feature-warehouse_type' (anhhchu, Jan 22, 2024)
d4ddee2  Pull query metrics from history using /api/2.0/sql/history/queries (anhhchu, Jan 24, 2024)
211d9a8  Add query history, update examples to reflect the changes (anhhchu, Jan 27, 2024)
89c0b2a  Resolve conflict (anhhchu, Jan 27, 2024)
ef4de94  Merge pull request #2 from anhhchu/feature_query_metrics_history (anhhchu, Jan 27, 2024)
741026b  Add warehouse_type option (anhhchu, Jan 27, 2024)
adf0319  Remove warehouse_type in metrics (anhhchu, Jan 27, 2024)
edce81d  Update v0.0.6 (anhhchu, Feb 3, 2024)
d3c098e  Add query metrics history extraction (anhhchu, Feb 26, 2024)
6184a6f  Fix orig file format parsing (anhhchu, Feb 26, 2024)
181887f  Update benchmark (anhhchu, Feb 27, 2024)
81f3da2  Add stop warehouse function (anhhchu, Feb 28, 2024)
37aa481  Update format of metrics pandas dataframe (anhhchu, Mar 1, 2024)
2bd7ebe  Remove commented out function (anhhchu, Mar 4, 2024)
Files changed
6 changes: 6 additions & 0 deletions .gitignore
@@ -11,3 +11,9 @@
 # Build helpers
 *beaker.egg-info*
 build/*
+
+myvenv/
+
+*/.env
+.env
+
2 changes: 1 addition & 1 deletion README.md
@@ -84,7 +84,7 @@ metrics = benchmark.execute()
 print(metrics)
 ```
 
-`metrics` is a list of dict. Each dict is the result of a single query execution.
+`metrics` is a pandas DataFrame; each row is the result of a single query execution.
 
 If you want to examine the results as a spark DataFrame and your environment has the capability of creating a spark session, you can use spark_fixture.
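
Read together with the example changes below, the documented flow is roughly this (a minimal sketch assembled from names that appear elsewhere in this PR; connection setup is elided, and a Spark-capable environment is assumed):

```python
from beaker import benchmark, spark_fixture

bm = benchmark.Benchmark()
# ...set name, hostname, token, warehouse http_path, and query as in the examples below...

# execute() now returns a pandas DataFrame with one row per query execution
metrics_pdf = bm.execute()

# Convert the metrics to a Spark DataFrame and register a temp view
metrics_df = spark_fixture.metrics_to_df_view(metrics_pdf, "metrics_view")
metrics_df.display()
```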
Binary file modified (not shown): dist/beaker-0.0.5-py3-none-any.whl
Binary file modified (not shown): dist/beaker-0.0.5.tar.gz
Binary file added (not shown): dist/beaker-0.0.6-py3-none-any.whl
Binary file added (not shown): dist/beaker-0.0.6.tar.gz
Binary file removed (not shown): examples/beaker_getting_started.dbc
55 changes: 38 additions & 17 deletions examples/beaker_getting_started.py
@@ -1,5 +1,7 @@
 # Databricks notebook source
-# MAGIC %pip install databricks-sql-connector
+# MAGIC %pip install databricks-sql-connector -q
+# MAGIC %pip install databricks-sdk -q
+# MAGIC dbutils.library.restartPython()
 
 # COMMAND ----------
 
@@ -31,6 +33,10 @@
 
 # COMMAND ----------
 
+importlib.reload(spark_fixture)
+
+# COMMAND ----------
+
 # MAGIC %md
 # MAGIC ## Create a new Benchmark Test
 
@@ -47,21 +53,24 @@
 # COMMAND ----------
 
 # Change hostname and http_path to your dbsql warehouse
-hostname = "your-dbsql-hostname"
-http_path = "your-dbsql-http-path"
+hostname = spark.conf.get('spark.databricks.workspaceUrl')
+# Extract token from dbutils
+pat = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
+# OR Add the appropriate scope and key for your token if configured in databricks secrets
+# pat = dbutils.secrets.get(scope="your-scope", key="your-token")
 
-# Add the appropriate scope and key for your token
-pat = dbutils.secrets.get(scope="your-scope", key="your-token")
+# warehouse http path example, replace with your own
+http_path = "/sql/1.0/warehouses/475b94ddc7cd5211"
 
 # COMMAND ----------
 
-# Define connection parameters
+# Use the builder pattern to add parameters for connecting to the warehouse
 bm.setName(name="simple_test")
 bm.setHostname(hostname=hostname)
-bm.setWarehouseToken(token=pat)
 bm.setWarehouse(http_path=http_path)
 bm.setConcurrency(concurrency=1)
+bm.setWarehouseToken(token=pat)
 
 # Define the query to execute and target Catalog
 query_str = """
@@ -75,11 +84,7 @@
 # COMMAND ----------
 
 # Run the benchmark!
-metrics = bm.execute()
-
-# COMMAND ----------
-
-metrics
+metrics_pdf = bm.execute()
 
 # COMMAND ----------
 
@@ -88,7 +93,7 @@
 
 # COMMAND ----------
 
-df_simple_test = spark_fixture.metrics_to_df_view(metrics, "simple_test_vw")
+df_simple_test = spark_fixture.metrics_to_df_view(metrics_pdf, "simple_test_vw")
 df_simple_test.display()
 
 # COMMAND ----------
 
@@ -109,10 +114,19 @@
 
 # COMMAND ----------
 
+hostname = spark.conf.get('spark.databricks.workspaceUrl')
+pat = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
+query_str = """
+SELECT count(*)
+FROM delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`
+WHERE passenger_count > 2
+"""
+
 new_warehouse_config = {
     "type": "warehouse",
     "runtime": "latest",
-    "size": "Large",
+    "size": "2X-Small",
+    "warehouse": "serverless",
     "min_num_clusters": 1,
     "max_num_clusters": 3,
     "enable_photon": True,
@@ -132,8 +146,8 @@
 # benchmark.preWarmTables(tables=["table_a", "table_b", "table_c"])
 
 # Run the benchmark!
-metrics = bm.execute()
-print(metrics)
+metrics_pdf = bm.execute()
+display(metrics_pdf)
 
 # COMMAND ----------
 
@@ -189,7 +203,14 @@
 
 # COMMAND ----------
 
-metrics = bm.execute()
-print(metrics)
+metrics_pdf = bm.execute()
+# Create a spark dataframe of the returned metrics pandas dataframe
+metrics_df = spark_fixture.metrics_to_df_view(metrics_pdf, view_name="metrics_view")
+
+# COMMAND ----------
+
+# MAGIC %sql select * from metrics_view
 
 # COMMAND ----------

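This notebook exercises only the "serverless" path, but the PR also enables pro and classic warehouse creation (commits f8937cd and 96a4a99). A hedged guess at the equivalent config, assuming the "warehouse" key accepts the other two types; the call that consumes the dict is collapsed in the diff above:

```python
# Assumed variant of the config shown above; "pro" / "classic" as accepted
# values is an inference from the warehouse-type commits, not from this diff.
new_warehouse_config = {
    "type": "warehouse",
    "runtime": "latest",
    "size": "2X-Small",
    "warehouse": "pro",  # or "classic"
    "min_num_clusters": 1,
    "max_num_clusters": 3,
    "enable_photon": True,
}
```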
47 changes: 33 additions & 14 deletions examples/beaker_standalone.py
100755 → 100644
@@ -4,44 +4,63 @@
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
-sys.path.append("../src")
+from dotenv import load_dotenv
+load_dotenv()
 
-from beaker import benchmark
+sys.path.append("../src")
 
-bm = benchmark.Benchmark()
+from beaker import benchmark, sqlwarehouseutils
 
 hostname = os.getenv("DATABRICKS_HOST")
 http_path = os.getenv("DATABRICKS_HTTP_PATH")
 # Don't put tokens in plaintext in code
 access_token = os.getenv("DATABRICKS_ACCESS_TOKEN")
+catalog_name = os.getenv("CATALOG")
+schema_name = os.getenv("SCHEMA")
 
 
+bm = benchmark.Benchmark()
 bm.setName(name="simple_test")
 bm.setHostname(hostname=hostname)
-bm.setWarehouse(http_path=http_path)
-bm.setConcurrency(concurrency=2)
 bm.setWarehouseToken(token=access_token)
+bm.setWarehouse(http_path=http_path)
+bm.setConcurrency(concurrency=1)
 
 print("---- Specify query in code ------")
 query_str = """
 SELECT count(*)
 FROM delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`
-WHERE passenger_count > 2
+WHERE passenger_count > 2;
 """
 bm.setQuery(query=query_str)
 bm.setCatalog(catalog="hive_metastore")
+bm.setSchema(schema="default")
 
-metrics = bm.execute()
-print(metrics)
+metrics_pdf = bm.execute()
+print(metrics_pdf)
 
 
 print("---- Specify a single query file ------")
+bm.query_file_format = "semicolon-delimited"
 bm.setQueryFile("queries/q1.sql")
-metrics = bm.execute()
-print(metrics)
+metrics_pdf = bm.execute()
+print(metrics_pdf)
 
 
-print("---- Specify a query directory ------")
+print("---- Specify a query directory semicolon format------")
+bm.query_file_format = "semicolon-delimited"
 bm.setQueryFileDir("queries")
-metrics = bm.execute()
-print(metrics)
+metrics_pdf = bm.execute()
+print(metrics_pdf)
 
 
+print("---- Specify a query directory original format------")
+bm.query_file_format = "original"
+bm.setQueryFileDir("queries_orig")
+metrics_pdf = bm.execute()
+print(metrics_pdf)
 
 
 print("---- Close connection ------")
 bm.sql_warehouse.close_connection()
+# res = bm.stop_warehouse("c0688d9c9c6d1091")
+# print(res)
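
The commented-out tail of this file exercises the new stop-warehouse call from commit 81f3da2. Uncommented, and assuming the signature shown in the comment (a warehouse ID string), the teardown would look like:

```python
# Stop the warehouse once the benchmark is done (ID is the example placeholder).
res = bm.stop_warehouse("c0688d9c9c6d1091")
print(res)
```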
40 changes: 40 additions & 0 deletions examples/beaker_standalone_tpch.py
@@ -0,0 +1,40 @@
+import os, sys
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+from dotenv import load_dotenv
+load_dotenv()
+
+sys.path.append("../src")
+
+from beaker import benchmark, sqlwarehouseutils
+
+hostname = os.getenv("DATABRICKS_HOST")
+http_path = os.getenv("DATABRICKS_HTTP_PATH")
+# Don't put tokens in plaintext in code
+access_token = os.getenv("DATABRICKS_ACCESS_TOKEN")
+catalog_name = "samples"
+schema_name = "tpch"
+
+bm = benchmark.Benchmark()
+bm.setName(name="simple_test")
+bm.setHostname(hostname=hostname)
+bm.setWarehouseToken(token=access_token)
+bm.setWarehouse(http_path=http_path)
+bm.setConcurrency(concurrency=1)
+
+print("---- Test prewarm table ------")
+bm.setCatalog(catalog_name)
+bm.setSchema(schema_name)
+tables = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
+# bm.preWarmTables(tables=tables)
+
+bm.query_file_format = "original"
+bm.setQueryFileDir("tpch")
+metrics_pdf = bm.execute()
+print(metrics_pdf)
+
+print("---- Close connection ------")
+bm.sql_warehouse.close_connection()