In gbq.to_gbq allow the DataFrame column order to differ from schema
closes #11359

Author: Anthonios Partheniou <[email protected]>

Closes #14202 from parthea/gbq-verify-schema-less-scrict and squashes the following commits:

7e6a073 [Anthonios Partheniou] In gbq.to_gbq allow the DataFrame column order to differ from schema #11359
parthea authored and jreback committed Sep 13, 2016
1 parent 461e0e9 commit f363236
Showing 4 changed files with 78 additions and 9 deletions.
3 changes: 1 addition & 2 deletions doc/source/io.rst
@@ -4579,8 +4579,7 @@ a ``TableCreationError`` if the destination table already exists.

If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
be written to the table using the defined table schema and column types. The
-dataframe must match the destination table in column order, structure, and
-data types.
+dataframe must match the destination table in structure and data types.
If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
different schema, a delay of 2 minutes will be forced to ensure that the new schema
has propagated in the Google environment. See
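As an aside on the relaxed ``'append'`` behaviour documented above, a minimal sketch of the new flexibility (the dataset, table and project names below are invented):

    import pandas as pd
    from pandas.io import gbq

    df = pd.DataFrame({'A': [1.0, 2.0], 'B': ['x', 'y']})

    # First write creates 'my_dataset.my_table' with columns in DataFrame order.
    gbq.to_gbq(df, 'my_dataset.my_table', 'my-project-id')

    # Appending a DataFrame whose columns are ordered differently now succeeds,
    # as long as the column names and data types still match the table schema.
    gbq.to_gbq(df[['B', 'A']], 'my_dataset.my_table', 'my-project-id',
               if_exists='append')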
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
Google BigQuery Enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+The :func:`pandas.io.gbq.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`).

.. _whatsnew_0190.errstate:

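For the ``dialect`` enhancement mentioned in the whatsnew entry above, a quick sketch (the project id is hypothetical; ``'legacy'`` remains the default):

    from pandas.io import gbq

    # Opt in to BigQuery standard SQL instead of the default legacy SQL.
    df = gbq.read_gbq('SELECT 1 AS x', project_id='my-project-id',
                      dialect='standard')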
18 changes: 11 additions & 7 deletions pandas/io/gbq.py
@@ -547,12 +547,17 @@ def verify_schema(self, dataset_id, table_id, schema):
        from apiclient.errors import HttpError

        try:
-            return (self.service.tables().get(
+            remote_schema = self.service.tables().get(
                projectId=self.project_id,
                datasetId=dataset_id,
-                tableId=table_id
-            ).execute()['schema']) == schema
+                tableId=table_id).execute()['schema']
+
+            fields_remote = set([json.dumps(field_remote)
+                                 for field_remote in remote_schema['fields']])
+            fields_local = set(json.dumps(field_local)
+                               for field_local in schema['fields'])
+
+            return fields_remote == fields_local
        except HttpError as ex:
            self.process_http_error(ex)
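The set comparison above is what makes the schema check order-insensitive: each field dict is serialised with ``json.dumps`` and the serialised fields are compared as sets. A self-contained sketch of the same idea, with invented example schemas:

    import json

    remote_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                {'name': 'B', 'type': 'STRING'}]}
    local_schema = {'fields': [{'name': 'B', 'type': 'STRING'},
                               {'name': 'A', 'type': 'FLOAT'}]}

    # Field dicts with identical contents serialise to identical strings here,
    # so comparing the sets ignores column order but not names or types.
    fields_remote = set(json.dumps(field) for field in remote_schema['fields'])
    fields_local = set(json.dumps(field) for field in local_schema['fields'])

    print(fields_remote == fields_local)  # True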

@@ -819,10 +824,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
                dataset_id, table_id, table_schema)
        elif if_exists == 'append':
            if not connector.verify_schema(dataset_id, table_id, table_schema):
-                raise InvalidSchema("Please verify that the column order, "
-                                    "structure and data types in the "
-                                    "DataFrame match the schema of the "
-                                    "destination table.")
+                raise InvalidSchema("Please verify that the structure and "
+                                    "data types in the DataFrame match the "
+                                    "schema of the destination table.")
    else:
        table.create(table_id, table_schema)

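With the relaxed check, ``if_exists='append'`` still rejects genuine mismatches. A sketch of the failure mode (hypothetical names; assumes 'my_dataset.my_table' already exists with column 'A' typed FLOAT):

    import pandas as pd
    from pandas.io import gbq

    df_bad = pd.DataFrame({'A': ['not', 'a', 'float']})  # maps to STRING, not FLOAT

    try:
        gbq.to_gbq(df_bad, 'my_dataset.my_table', 'my-project-id',
                   if_exists='append')
    except gbq.InvalidSchema:
        print('DataFrame structure/data types do not match the destination table')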
65 changes: 65 additions & 0 deletions pandas/io/tests/test_gbq.py
@@ -743,6 +743,8 @@ def setUp(self):
                                    private_key=_get_private_key_path())
        self.table = gbq._Table(_get_project_id(), DATASET_ID + "1",
                                private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id(),
+                                    private_key=_get_private_key_path())

    @classmethod
    def tearDownClass(cls):
@@ -906,6 +908,69 @@ def test_list_table(self):
                        'Expected table list to contain table {0}'
                        .format(destination_table))

+    def test_verify_schema_allows_flexible_column_order(self):
+        destination_table = TABLE_ID + "10"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertTrue(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected schema to match')
+
+    def test_verify_schema_fails_different_data_type(self):
+        destination_table = TABLE_ID + "11"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'STRING'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_verify_schema_fails_different_structure(self):
+        destination_table = TABLE_ID + "12"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B2', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_upload_data_flexible_column_order(self):
+        destination_table = DESTINATION_TABLE + "13"
+
+        test_size = 10
+        df = make_mixed_dataframe_v2(test_size)
+
+        # Initialize table with sample data
+        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
+                   private_key=_get_private_key_path())
+
+        df_columns_reversed = df[df.columns[::-1]]
+
+        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
+                   if_exists='append', private_key=_get_private_key_path())
+
    def test_list_dataset(self):
        dataset_id = DATASET_ID + "1"
        self.assertTrue(dataset_id in self.dataset.datasets(),
