-
Notifications
You must be signed in to change notification settings - Fork 124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor: move query and wait logic to separate module #720
Changes from 2 commits
d3600c2
97a94c8
357f63f
7d6195b
4ddc5ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
# Copyright (c) 2017 pandas-gbq Authors All rights reserved. | ||
# Use of this source code is governed by a BSD-style | ||
# license that can be found in the LICENSE file. | ||
|
||
from __future__ import annotations | ||
|
||
import concurrent.futures | ||
import logging | ||
from typing import Optional | ||
|
||
from google.cloud import bigquery | ||
|
||
import pandas_gbq.exceptions | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# On-demand BigQuery queries cost $6.25 per TB; the first 1 TB per month is free.
# See https://cloud.google.com/bigquery/pricing for more details.
QUERY_PRICE_FOR_TB = 6.25 / 2**40  # USD per byte: the per-TB price divided by 2**40 bytes
|
||
|
||
# http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size | ||
def sizeof_fmt(num, suffix="B"):
    """Render a byte count as a human-readable string such as ``"1.5 KB"``.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest prefix ("Y") is reached, then formatted with one decimal place.

    Args:
        num: Quantity to format (may be negative).
        suffix: Unit appended after the magnitude prefix; defaults to "B".

    Returns:
        str: ``"<value> <prefix><suffix>"``, e.g. ``"1.0 TB"``.
    """
    step = 1024.0
    scaled = num
    for prefix in ("", "K", "M", "G", "T", "P", "E", "Z"):
        if abs(scaled) < step:
            break
        scaled /= step
    else:
        # Every smaller prefix was exhausted; report in the largest unit.
        prefix = "Y"
    return "%3.1f %s%s" % (scaled, prefix, suffix)
|
||
|
||
def _wait_for_query_job( | ||
connector, | ||
client: bigquery.Client, | ||
query_reply: bigquery.QueryJob, | ||
timeout_ms: Optional[float], | ||
): | ||
"""Wait for query to complete, pausing occasionally to update progress. | ||
|
||
Args: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see this method has been moved out to be a helper function, maybe we should also update the description as the parameters have changed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated docstrings here and added docstrings to the |
||
query_reply (QueryJob): | ||
A query job which has started. | ||
|
||
timeout_ms (Optional[int]): | ||
How long to wait before cancelling the query. | ||
""" | ||
# Wait at most 10 seconds so we can show progress. | ||
# TODO(https://github.com/googleapis/python-bigquery-pandas/issues/327): | ||
# Include a tqdm progress bar here instead of a stream of log messages. | ||
timeout_sec = 10.0 | ||
if timeout_ms: | ||
timeout_sec = min(timeout_sec, timeout_ms / 1000.0) | ||
|
||
while query_reply.state != "DONE": | ||
connector.log_elapsed_seconds(" Elapsed", "s. Waiting...") | ||
|
||
if timeout_ms and timeout_ms < connector.get_elapsed_seconds() * 1000: | ||
client.cancel_job(query_reply.job_id, location=query_reply.location) | ||
raise pandas_gbq.exceptions.QueryTimeout( | ||
"Query timeout: {} ms".format(timeout_ms) | ||
) | ||
|
||
try: | ||
query_reply.result(timeout=timeout_sec) | ||
except concurrent.futures.TimeoutError: | ||
# Use our own timeout logic | ||
pass | ||
except connector.http_error as ex: | ||
connector.process_http_error(ex) | ||
|
||
|
||
def query_and_wait(
    connector,
    client: bigquery.Client,
    query: str,
    *,
    job_config: bigquery.QueryJobConfig,
    location: Optional[str],
    project_id: Optional[str],
    max_results: Optional[int],
    timeout_ms: Optional[int],
):
    """Start a query job, wait for it to finish, and return its results.

    Args:
        connector:
            Object providing ``private_key``, the ``http_error`` exception
            type and ``process_http_error``; it is also handed to
            ``_wait_for_query_job`` for progress logging.
        client (bigquery.Client):
            Client used to start the query and passed on to
            ``_wait_for_query_job``.
        query (str):
            Query text passed to ``client.query``.
        job_config (bigquery.QueryJobConfig):
            Job configuration passed to ``client.query``.
        location (Optional[str]):
            Location passed to ``client.query``.
        project_id (Optional[str]):
            Passed to ``client.query`` as ``project``.
        max_results (Optional[int]):
            Passed to ``QueryJob.result`` when fetching the results.
        timeout_ms (Optional[int]):
            How long to wait before the query is cancelled.

    Returns:
        The value returned by ``QueryJob.result(max_results=max_results)``.

    Raises:
        pandas_gbq.exceptions.AccessDenied:
            If starting the query raises ``RefreshError`` or ``ValueError``.
    """
    from google.auth.exceptions import RefreshError

    try:
        logger.debug("Requesting query... ")
        query_reply = client.query(
            query,
            job_config=job_config,
            location=location,
            project=project_id,
        )
        logger.debug("Query running...")
    except (RefreshError, ValueError) as ex:
        if connector.private_key:
            raise pandas_gbq.exceptions.AccessDenied(
                f"The service account credentials are not valid: {ex}"
            ) from ex
        raise pandas_gbq.exceptions.AccessDenied(
            "The credentials have been revoked or expired, "
            f"please re-run the application to re-authorize: {ex}"
        ) from ex
    except connector.http_error as ex:
        # NOTE(review): process_http_error is presumably expected to raise;
        # if it ever returns, query_reply is unbound below -- confirm.
        connector.process_http_error(ex)

    job_id = query_reply.job_id
    logger.debug("Job ID: %s", job_id)

    # Wait (and cancel on timeout) with the same client that started the job,
    # rather than reaching back into the connector for one.
    _wait_for_query_job(connector, client, query_reply, timeout_ms)

    if query_reply.cache_hit:
        logger.debug("Query done.\nCache hit.\n")
    else:
        bytes_processed = query_reply.total_bytes_processed or 0
        bytes_billed = query_reply.total_bytes_billed or 0
        logger.debug(
            "Query done.\nProcessed: {} Billed: {}".format(
                sizeof_fmt(bytes_processed),
                sizeof_fmt(bytes_billed),
            )
        )
        logger.debug(
            "Standard price: ${:,.2f} USD\n".format(bytes_billed * QUERY_PRICE_FOR_TB)
        )

    # As of google-cloud-bigquery 2.3.0, QueryJob.result() uses
    # getQueryResults() instead of tabledata.list, which returns the correct
    # response with DML/DDL queries.
    try:
        return query_reply.result(max_results=max_results)
    except connector.http_error as ex:
        connector.process_http_error(ex)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: should we keep the `pass` here? Other error classes do not have this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're correct. `pass` is not needed because there is a docstring. I'll remove it.