Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BigQuery: Add tqdm progress bar for downloads #7552

Merged
merged 12 commits into from
Mar 28, 2019
30 changes: 29 additions & 1 deletion bigquery/google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
except ImportError: # pragma: NO COVER
pandas = None

try:
import tqdm
except ImportError: # pragma: NO COVER
tqdm = None

from google.api_core.page_iterator import HTTPIterator

import google.cloud._helpers
Expand Down Expand Up @@ -1334,8 +1339,31 @@ def _to_dataframe_tabledata_list(self, dtypes):
"""Use (slower, but free) tabledata.list to construct a DataFrame."""
column_names = [field.name for field in self.schema]
frames = []

# report progress if tqdm installed
progress_bar = None
if tqdm is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add a progress_bar_type (string) parameter to to_dataframe and check that it also is not None.

It should default to None for now, until we update the magics module to support turning it off in the Context and also update the magics to use the tqdm_notebook option instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should default to None for now.

Actually, I think defaulting to 'tqdm' is fine. When @alixhami tests notebooks, he can set google.cloud.bigquery.table.tqdm = None like you do in your tests.

Copy link
Contributor

@alixhami alixhami Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should default to None; otherwise it will throw errors for users who don't have tqdm installed and aren't using the parameter.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in the latest commits.

try:
progress_bar = tqdm.tqdm(
desc="Downloading", total=self.total_rows, unit="rows"
)
except (KeyError, TypeError):
# Protect ourselves from any tqdm errors. In case of
# unexpected tqdm behavior, just fall back to showing
# no progress bar.
pass

for page in iter(self.pages):
frames.append(self._to_dataframe_dtypes(page, column_names, dtypes))
current_frame = self._to_dataframe_dtypes(page, column_names, dtypes)
frames.append(current_frame)

if progress_bar is not None:
# In some cases, the number of total rows is not populated
# until the first page of rows is fetched. Update the
# progress bar's total to keep an accurate count.
progress_bar.total = progress_bar.total or self.total_rows
progress_bar.update(len(current_frame))

return pandas.concat(frames)

def _to_dataframe_bqstorage(self, bqstorage_client, dtypes):
Expand Down
1 change: 1 addition & 0 deletions bigquery/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
# Exclude PyArrow dependency from Windows Python 2.7.
'pyarrow: platform_system != "Windows" or python_version >= "3.4"':
'pyarrow>=0.4.1',
'tqdm': 'tqdm >= 4.31.1',
'fastparquet': ['fastparquet', 'python-snappy'],
}

Expand Down
77 changes: 77 additions & 0 deletions bigquery/tests/unit/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
except (ImportError, AttributeError): # pragma: NO COVER
pandas = None

try:
from tqdm import tqdm
except (ImportError, AttributeError): # pragma: NO COVER
tqdm = None

from google.cloud.bigquery.dataset import DatasetReference


Expand Down Expand Up @@ -1413,6 +1418,78 @@ def test_to_dataframe(self):
self.assertEqual(df.name.dtype.name, "object")
self.assertEqual(df.age.dtype.name, "int64")

@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(tqdm is None, "Requires `tqdm`")
@mock.patch("tqdm.tqdm")
def test_to_dataframe_progress_bar(self, tqdm_mock):
    """Verify that ``to_dataframe`` creates and advances a tqdm progress bar.

    ``tqdm.tqdm`` is replaced by a mock, so nothing is rendered; the test
    only checks that the progress bar is constructed, that its ``update``
    method is invoked, and that the rows returned by the fake API request
    still end up in the resulting DataFrame.
    """
    from google.cloud.bigquery.table import RowIterator
    from google.cloud.bigquery.table import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = RowIterator(_mock_client(), api_request, path, schema)
    df = row_iterator.to_dataframe()

    tqdm_mock.assert_called()
    # Inspect ``return_value`` rather than calling the mock again, so the
    # assertion itself does not add another recorded call to ``tqdm_mock``.
    tqdm_mock.return_value.update.assert_called()
    # Reporting progress must not affect the downloaded data.
    self.assertEqual(len(df), 4)

@unittest.skipIf(pandas is None, "Requires `pandas`")
@mock.patch("google.cloud.bigquery.table.tqdm", new=None)
def test_to_dataframe_no_tqdm(self):
    """Check ``to_dataframe`` when the table module's ``tqdm`` is ``None``.

    The ``tqdm`` name inside ``google.cloud.bigquery.table`` is patched to
    ``None`` for the duration of the test; all four rows returned by the
    fake API request must still be present in the DataFrame.
    """
    from google.cloud.bigquery.table import RowIterator
    from google.cloud.bigquery.table import SchemaField

    columns = (("name", "STRING"), ("age", "INTEGER"))
    fields = [
        SchemaField(column, type_name, mode="REQUIRED")
        for column, type_name in columns
    ]
    people = (
        ("Phred Phlyntstone", "32"),
        ("Bharney Rhubble", "33"),
        ("Wylma Phlyntstone", "29"),
        ("Bhettye Rhubble", "27"),
    )
    # Each row uses the {"f": [{"v": ...}, ...]} layout, one cell per field.
    payload = {
        "rows": [{"f": [{"v": name}, {"v": age}]} for name, age in people]
    }
    fetch = mock.Mock(return_value=payload)
    iterator = RowIterator(_mock_client(), fetch, "/foo", fields)

    frame = iterator.to_dataframe()

    self.assertEqual(len(frame), 4)  # all should be well

@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(tqdm is None, "Requires `tqdm`")
@mock.patch("tqdm.tqdm", new=None)  # will raise TypeError on call
def test_to_dataframe_tqdm_error(self):
    """Check ``to_dataframe`` when constructing the progress bar fails.

    ``tqdm.tqdm`` is patched to ``None``, so calling it raises
    ``TypeError``; the test expects the DataFrame to contain all four
    rows returned by the fake API request regardless.
    """
    from google.cloud.bigquery.table import RowIterator
    from google.cloud.bigquery.table import SchemaField

    def as_row(name, age):
        # Build one row in the {"f": [{"v": ...}, ...]} layout.
        return {"f": [{"v": name}, {"v": age}]}

    name_field = SchemaField("name", "STRING", mode="REQUIRED")
    age_field = SchemaField("age", "INTEGER", mode="REQUIRED")
    response = {
        "rows": [
            as_row("Phred Phlyntstone", "32"),
            as_row("Bharney Rhubble", "33"),
            as_row("Wylma Phlyntstone", "29"),
            as_row("Bhettye Rhubble", "27"),
        ]
    }
    request_stub = mock.Mock(return_value=response)
    result = RowIterator(
        _mock_client(), request_stub, "/foo", [name_field, age_field]
    ).to_dataframe()

    self.assertEqual(len(result), 4)  # all should be well

@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_to_dataframe_w_empty_results(self):
from google.cloud.bigquery.table import RowIterator
Expand Down