In gbq.to_gbq allow the DataFrame column order to differ from schema
closes #11359

Author: Anthonios Partheniou <[email protected]>

Closes #14202 from parthea/gbq-verify-schema-less-scrict and squashes the following commits:

7e6a073 [Anthonios Partheniou] In gbq.to_gbq allow the DataFrame column order to differ from schema #11359
parthea authored and jreback committed Sep 13, 2016
1 parent 461e0e9 commit f363236
Showing 4 changed files with 78 additions and 9 deletions.
3 changes: 1 addition & 2 deletions doc/source/io.rst
@@ -4579,8 +4579,7 @@ a ``TableCreationError`` if the destination table already exists.

If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
be written to the table using the defined table schema and column types. The
-dataframe must match the destination table in column order, structure, and
-data types.
+dataframe must match the destination table in structure and data types.
If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
different schema, a delay of 2 minutes will be forced to ensure that the new schema
has propagated in the Google environment. See
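As an aside on the relaxed ``'append'`` behaviour documented above, a minimal sketch of the new flexibility (the dataset, table and project names below are invented):

    import pandas as pd
    from pandas.io import gbq

    df = pd.DataFrame({'A': [1.0, 2.0], 'B': ['x', 'y']})

    # First write creates 'my_dataset.my_table' with columns in DataFrame order.
    gbq.to_gbq(df, 'my_dataset.my_table', 'my-project-id')

    # Appending a DataFrame whose columns are ordered differently now succeeds,
    # as long as the column names and data types still match the table schema.
    gbq.to_gbq(df[['B', 'A']], 'my_dataset.my_table', 'my-project-id',
               if_exists='append')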
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
Google BigQuery Enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+The :func:`pandas.io.gbq.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`).

.. _whatsnew_0190.errstate:

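For the ``dialect`` enhancement mentioned in the whatsnew entry above, a quick sketch (the project id is hypothetical; ``'legacy'`` remains the default):

    from pandas.io import gbq

    # Opt in to BigQuery standard SQL instead of the default legacy SQL.
    df = gbq.read_gbq('SELECT 1 AS x', project_id='my-project-id',
                      dialect='standard')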
18 changes: 11 additions & 7 deletions pandas/io/gbq.py
@@ -547,12 +547,17 @@ def verify_schema(self, dataset_id, table_id, schema):
        from apiclient.errors import HttpError

        try:
-            return (self.service.tables().get(
+            remote_schema = self.service.tables().get(
                projectId=self.project_id,
                datasetId=dataset_id,
-                tableId=table_id
-            ).execute()['schema']) == schema
+                tableId=table_id).execute()['schema']
+
+            fields_remote = set([json.dumps(field_remote)
+                                 for field_remote in remote_schema['fields']])
+            fields_local = set(json.dumps(field_local)
+                               for field_local in schema['fields'])
+
+            return fields_remote == fields_local
        except HttpError as ex:
            self.process_http_error(ex)
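The set comparison above is what makes the schema check order-insensitive: each field dict is serialised with ``json.dumps`` and the serialised fields are compared as sets. A self-contained sketch of the same idea, with invented example schemas:

    import json

    remote_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                {'name': 'B', 'type': 'STRING'}]}
    local_schema = {'fields': [{'name': 'B', 'type': 'STRING'},
                               {'name': 'A', 'type': 'FLOAT'}]}

    # Field dicts with identical contents serialise to identical strings here,
    # so comparing the sets ignores column order but not names or types.
    fields_remote = set(json.dumps(field) for field in remote_schema['fields'])
    fields_local = set(json.dumps(field) for field in local_schema['fields'])

    print(fields_remote == fields_local)  # True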

@@ -819,10 +824,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
                dataset_id, table_id, table_schema)
        elif if_exists == 'append':
            if not connector.verify_schema(dataset_id, table_id, table_schema):
-                raise InvalidSchema("Please verify that the column order, "
-                                    "structure and data types in the "
-                                    "DataFrame match the schema of the "
-                                    "destination table.")
+                raise InvalidSchema("Please verify that the structure and "
+                                    "data types in the DataFrame match the "
+                                    "schema of the destination table.")
    else:
        table.create(table_id, table_schema)

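With the relaxed check, ``if_exists='append'`` still rejects genuine mismatches. A sketch of the failure mode (hypothetical names; assumes 'my_dataset.my_table' already exists with column 'A' typed FLOAT):

    import pandas as pd
    from pandas.io import gbq

    df_bad = pd.DataFrame({'A': ['not', 'a', 'float']})  # maps to STRING, not FLOAT

    try:
        gbq.to_gbq(df_bad, 'my_dataset.my_table', 'my-project-id',
                   if_exists='append')
    except gbq.InvalidSchema:
        print('DataFrame structure/data types do not match the destination table')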
65 changes: 65 additions & 0 deletions pandas/io/tests/test_gbq.py
@@ -743,6 +743,8 @@ def setUp(self):
                                    private_key=_get_private_key_path())
        self.table = gbq._Table(_get_project_id(), DATASET_ID + "1",
                                private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id(),
+                                    private_key=_get_private_key_path())

    @classmethod
    def tearDownClass(cls):
@@ -906,6 +908,69 @@ def test_list_table(self):
                        'Expected table list to contain table {0}'
                        .format(destination_table))

+    def test_verify_schema_allows_flexible_column_order(self):
+        destination_table = TABLE_ID + "10"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertTrue(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected schema to match')
+
+    def test_verify_schema_fails_different_data_type(self):
+        destination_table = TABLE_ID + "11"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'STRING'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_verify_schema_fails_different_structure(self):
+        destination_table = TABLE_ID + "12"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B2', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_upload_data_flexible_column_order(self):
+        destination_table = DESTINATION_TABLE + "13"
+
+        test_size = 10
+        df = make_mixed_dataframe_v2(test_size)
+
+        # Initialize table with sample data
+        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
+                   private_key=_get_private_key_path())
+
+        df_columns_reversed = df[df.columns[::-1]]
+
+        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
+                   if_exists='append', private_key=_get_private_key_path())
+
    def test_list_dataset(self):
        dataset_id = DATASET_ID + "1"
        self.assertTrue(dataset_id in self.dataset.datasets(),
