ENH: Use tz-aware dtype for timestamp columns (#269)
ENH: Use tz-aware dtype for timestamp columns in all supported pandas versions

Adds a table documenting this behavior to the "reading" how-to guides.
tswast authored Apr 3, 2019
1 parent 0e1ebf5 commit 141b2b4
Showing 5 changed files with 132 additions and 47 deletions.
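The user-visible effect of this change is the dtype of ``TIMESTAMP`` result columns. A minimal sketch (the project ID below is a placeholder, and already-configured credentials are assumed):

.. code-block:: python

    import pandas_gbq

    # Any TIMESTAMP column now comes back with a tz-aware dtype.
    df = pandas_gbq.read_gbq(
        "SELECT CURRENT_TIMESTAMP() AS ts",
        project_id="my-project",  # placeholder project ID
        dialect="standard",
    )
    print(df.dtypes)  # ts    datetime64[ns, UTC]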
9 changes: 9 additions & 0 deletions docs/source/changelog.rst
@@ -12,6 +12,12 @@ Changelog
version. This is required to use new functionality such as the BigQuery
Storage API. (:issue:`267`)

Documentation
~~~~~~~~~~~~~

- Document :ref:`BigQuery data type to pandas dtype conversion
<reading-dtypes>` for ``read_gbq``. (:issue:`269`)

Dependency updates
~~~~~~~~~~~~~~~~~~

@@ -27,11 +33,14 @@ Internal changes

Enhancements
~~~~~~~~~~~~

- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
with the rest being populated using the DataFrame dtypes (:issue:`218`)
(contributed by @johnpaton)
- Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if
available (contributed by @daureg)
- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns',
tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`269`)

.. _changelog-0.9.0:

64 changes: 47 additions & 17 deletions docs/source/reading.rst
@@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table

.. code-block:: python
# Insert your BigQuery Project ID Here
# Can be found in the Google web console
import pandas_gbq
# TODO: Set your BigQuery Project ID.
projectid = "xxxxxxxx"
data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid)
data_frame = pandas_gbq.read_gbq(
'SELECT * FROM `test_dataset.test_table`',
project_id=projectid)
.. note::

A project ID is sometimes optional if it can be inferred during
authentication, but it is required when authenticating with user
credentials. You can find your project ID in the `Google Cloud console
<https://console.cloud.google.com>`__.

You can define which column from BigQuery to use as an index in the
destination DataFrame as well as a preferred column order as follows:

.. code-block:: python
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
index_col='index_column_name',
col_order=['col1', 'col2', 'col3'], projectid)
data_frame = pandas_gbq.read_gbq(
'SELECT * FROM `test_dataset.test_table`',
project_id=projectid,
index_col='index_column_name',
col_order=['col1', 'col2', 'col3'])
You can specify the query config as a parameter to use additional options of
@@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here
"useQueryCache": False
}
}
data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
configuration=configuration, projectid)
data_frame = read_gbq(
'SELECT * FROM `test_dataset.test_table`',
project_id=projectid,
configuration=configuration)
.. note::
The ``dialect`` argument can be used to indicate whether to use
BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The
default value is ``'standard'``. For more information on BigQuery's standard
SQL, see `BigQuery SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.

You can find your project id in the `Google developers console
<https://console.developers.google.com>`__.
.. code-block:: python
data_frame = pandas_gbq.read_gbq(
'SELECT * FROM [test_dataset.test_table]',
project_id=projectid,
dialect='legacy')
.. note::
The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change
in a subsequent release to ``'standard'``. For more information
on BigQuery's standard SQL, see `BigQuery SQL Reference
<https://cloud.google.com/bigquery/sql-reference/>`__
.. _reading-dtypes:

Inferring the DataFrame's dtypes
--------------------------------

The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema.

================== =====================================================================
BigQuery Data Type dtype
================== =====================================================================
FLOAT              float
TIMESTAMP          :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
DATETIME           datetime64[ns]
TIME               datetime64[ns]
DATE               datetime64[ns]
================== =====================================================================
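Because ``TIMESTAMP`` values come back localized to UTC, converting them to another zone or dropping the timezone is a one-liner. A small sketch, with a hand-built frame standing in for a ``read_gbq`` result:

.. code-block:: python

    import pandas

    # Stand-in for a read_gbq result with a TIMESTAMP column `ts`.
    df = pandas.DataFrame(
        {"ts": pandas.to_datetime(["2004-09-15 05:00:00"]).tz_localize("UTC")}
    )
    print(df["ts"].dt.tz_convert("America/Los_Angeles"))  # convert to another zone
    print(df["ts"].dt.tz_localize(None))  # drop tz info -> datetime64[ns]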
43 changes: 34 additions & 9 deletions pandas_gbq/gbq.py
@@ -494,6 +494,9 @@ def run_query(self, query, **kwargs):
if df.empty:
df = _cast_empty_df_dtypes(schema_fields, df)

# Ensure any TIMESTAMP columns are tz-aware.
df = _localize_df(schema_fields, df)

logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
return df

@@ -644,17 +647,21 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):


def _bqschema_to_nullsafe_dtypes(schema_fields):
# Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
# default dtype choice.
#
# See:
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
# #missing-data-casting-rules-and-indexing
"""Specify explicit dtypes based on BigQuery schema.
This function only specifies a dtype when the dtype allows nulls.
Otherwise, use pandas's default dtype choice.
See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
#missing-data-casting-rules-and-indexing
"""
# If you update this mapping, also update the table at
# `docs/source/reading.rst`.
dtype_map = {
"FLOAT": np.dtype(float),
# Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
# support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
# https://github.com/pandas-dev/pandas/issues/12513
# pandas doesn't support timezone-aware dtype in DataFrame/Series
# constructors. It's more idiomatic to localize after construction.
# https://github.com/pandas-dev/pandas/issues/25843
"TIMESTAMP": "datetime64[ns]",
"TIME": "datetime64[ns]",
"DATE": "datetime64[ns]",
@@ -702,6 +709,24 @@ def _cast_empty_df_dtypes(schema_fields, df):
return df


def _localize_df(schema_fields, df):
"""Localize any TIMESTAMP columns to tz-aware type.
In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
dtype in Series/DataFrame construction, so localize those columns after
the DataFrame is constructed.
"""
for field in schema_fields:
column = str(field["name"])
if field["mode"].upper() == "REPEATED":
continue

if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
df[column] = df[column].dt.tz_localize("UTC")

return df


def read_gbq(
query,
project_id=None,
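The pattern used here — build the column as naive ``datetime64[ns]``, then attach the UTC zone afterwards — can be reproduced outside of ``read_gbq``. A rough sketch with a hypothetical column name:

.. code-block:: python

    import pandas

    # Construct naive, then localize, mirroring _localize_df above.
    df = pandas.DataFrame(
        {"created_at": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"
    )
    if df["created_at"].dt.tz is None:
        df["created_at"] = df["created_at"].dt.tz_localize("UTC")
    print(df.dtypes)  # created_at    datetime64[ns, UTC]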
37 changes: 22 additions & 15 deletions tests/system/test_gbq.py
@@ -310,13 +310,15 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
credentials=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(
df,
DataFrame(
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
dtype="datetime64[ns]",
),
expected = DataFrame(
{"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
dtype="datetime64[ns]",
)
if expected["unix_epoch"].dt.tz is None:
expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize(
"UTC"
)
tm.assert_frame_equal(df, expected)

def test_should_properly_handle_arbitrary_timestamp(self, project_id):
query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp'
@@ -326,13 +328,15 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
credentials=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(
df,
DataFrame(
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
dtype="datetime64[ns]",
),
expected = DataFrame(
{"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
dtype="datetime64[ns]",
)
if expected["valid_timestamp"].dt.tz is None:
expected["valid_timestamp"] = expected[
"valid_timestamp"
].dt.tz_localize("UTC")
tm.assert_frame_equal(df, expected)

def test_should_properly_handle_datetime_unix_epoch(self, project_id):
query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
@@ -368,7 +372,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
"expression, is_expected_dtype",
[
("current_date()", pandas.api.types.is_datetime64_ns_dtype),
("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype),
("current_timestamp()", pandas.api.types.is_datetime64tz_dtype),
("current_datetime()", pandas.api.types.is_datetime64_ns_dtype),
("TRUE", pandas.api.types.is_bool_dtype),
("FALSE", pandas.api.types.is_bool_dtype),
@@ -402,9 +406,11 @@ def test_should_properly_handle_null_timestamp(self, project_id):
credentials=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(
df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize(
"UTC"
)
tm.assert_frame_equal(df, expected)

def test_should_properly_handle_null_datetime(self, project_id):
query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
@@ -594,6 +600,7 @@ def test_zero_rows(self, project_id):
expected_result = DataFrame(
empty_columns, columns=["title", "id", "is_bot", "ts"]
)
expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
tm.assert_frame_equal(df, expected_result, check_index_type=False)

def test_one_row_one_column(self, project_id):
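The expected frames in these system tests follow one pattern: build naive, then localize to UTC only when the column is not already tz-aware. A condensed sketch (the ``ts`` column name is illustrative):

.. code-block:: python

    import pandas
    import pandas.util.testing as tm

    expected = pandas.DataFrame(
        {"ts": ["2004-09-15T05:00:00.000000Z"]}, dtype="datetime64[ns]"
    )
    # Localize only if needed, so the check also holds where the column
    # already parses as tz-aware.
    if expected["ts"].dt.tz is None:
        expected["ts"] = expected["ts"].dt.tz_localize("UTC")

    result = expected.copy()  # stands in for a read_gbq result
    tm.assert_frame_equal(result, expected)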
26 changes: 20 additions & 6 deletions tests/unit/test_gbq.py
@@ -1,21 +1,26 @@
# -*- coding: utf-8 -*-

import pandas.util.testing as tm
import pytest
try:
import mock
except ImportError: # pragma: NO COVER
from unittest import mock

import numpy
from pandas import DataFrame
import pandas.util.testing as tm
import pkg_resources
import pytest

import pandas_gbq.exceptions
from pandas_gbq import gbq

try:
import mock
except ImportError: # pragma: NO COVER
from unittest import mock

pytestmark = pytest.mark.filter_warnings(
"ignore:credentials from Google Cloud SDK"
)
pandas_installed_version = pkg_resources.get_distribution(
"pandas"
).parsed_version


@pytest.fixture
@@ -90,6 +95,7 @@ def no_auth(monkeypatch):
("INTEGER", None), # Can't handle NULL
("BOOLEAN", None), # Can't handle NULL
("FLOAT", numpy.dtype(float)),
# TIMESTAMP will be localized after DataFrame construction.
("TIMESTAMP", "datetime64[ns]"),
("DATETIME", "datetime64[ns]"),
],
@@ -200,6 +206,10 @@ def test_to_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
assert len(recwarn) == 0


@pytest.mark.skipif(
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
reason="Requires pandas 0.24+",
)
def test_to_gbq_with_private_key_new_pandas_warns_deprecation(
min_bq_version, monkeypatch
):
@@ -413,6 +423,10 @@ def test_read_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
assert len(recwarn) == 0


@pytest.mark.skipif(
pandas_installed_version < pkg_resources.parse_version("0.24.0"),
reason="Requires pandas 0.24+",
)
def test_read_gbq_with_private_key_new_pandas_warns_deprecation(
min_bq_version, monkeypatch
):
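The skip marker added to these unit tests can be factored into a reusable decorator. A small sketch (the ``requires_new_pandas`` name is not part of the change):

.. code-block:: python

    import pkg_resources
    import pytest

    pandas_installed_version = pkg_resources.get_distribution(
        "pandas"
    ).parsed_version

    # Reusable marker for tests that need the tz-aware DatetimeTZDtype
    # support introduced in pandas 0.24.
    requires_new_pandas = pytest.mark.skipif(
        pandas_installed_version < pkg_resources.parse_version("0.24.0"),
        reason="Requires pandas 0.24+",
    )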
