When appending to a table, load if the dataframe contains a subset of the existing schema #24
@@ -6,13 +6,15 @@ Changelog

- Resolve issue where the optional ``--noauth_local_webserver`` command line argument would not be propagated during the authentication process. (:issue:`35`)
- Drop support for Python 3.4 (:issue:`40`)
+- The dataframe passed to ``.to_gbq(..., if_exists='append')`` now needs to contain only a subset of the fields in the BigQuery schema. To support this, ``schema_is_subset`` tests whether a local dataframe is a subset of the BigQuery schema and ``schema`` returns the remote schema. (:issue:`24`)

0.1.6 / 2017-05-03
------------------

- All gbq errors will simply be subclasses of ``ValueError`` and no longer inherit from the deprecated ``PandasError``.

-0.1.4 / 2017-03-17
+0.1.5 / 2017-04-20

Review comment: move to latest (0.1.7)

Review comment: the issue reference is misformatted, see the other entries

------------------

- ``InvalidIndexColumn`` will be raised instead of ``InvalidColumnOrder`` in ``read_gbq`` when the index column specified does not exist in the BigQuery schema. (:issue:`6`)
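To illustrate the append behaviour the new changelog entry describes, here is a minimal sketch (not part of the diff); the project, dataset, and table names are placeholders, and the destination table is assumed to already exist with columns A, B, and C:

```python
import pandas as pd
from pandas_gbq import gbq  # import mirrors the test module below

# Dataframe covering only a subset (A, B) of the table's columns (A, B, C)
df = pd.DataFrame({'A': [1.0, 2.0], 'B': [3.0, 4.0]})

# Previously this raised InvalidSchema unless the schemas matched exactly;
# with schema_is_subset the append succeeds because {A, B} is a subset
# of {A, B, C}.
gbq.to_gbq(df, 'my_dataset.my_table', 'my-project-id', if_exists='append')
```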
@@ -557,7 +557,25 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):

        self._print("\n")

-    def verify_schema(self, dataset_id, table_id, schema):
+    def schema(self, dataset_id, table_id):

Review comment: can you update the

        """Retrieve the schema of the table

        Obtain from BigQuery the field names and field types
        for the table defined by the parameters

        Parameters
        ----------
        dataset_id : str
            Name of the BigQuery dataset for the table
        table_id : str
            Name of the BigQuery table

        Returns
        -------
        list of dicts
            Fields representing the schema
        """

        try:
            from googleapiclient.errors import HttpError
        except:

@@ -573,15 +591,67 @@ def verify_schema(self, dataset_id, table_id, schema):
                              'type': field_remote['type']}
                             for field_remote in remote_schema['fields']]

-            fields_remote = set([json.dumps(field_remote)
-                                 for field_remote in remote_fields])
-            fields_local = set(json.dumps(field_local)
-                               for field_local in schema['fields'])
-
-            return fields_remote == fields_local
+            return remote_fields
        except HttpError as ex:
            self.process_http_error(ex)
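As a quick illustration (not part of the diff) of what the new ``schema`` method returns — the connector, dataset, and table names are placeholders:

```python
# Assuming `connector` is a GbqConnector and the table has two fields:
fields = connector.schema('my_dataset', 'my_table')
# e.g. [{'name': 'A', 'type': 'FLOAT'}, {'name': 'B', 'type': 'STRING'}]
```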
    def verify_schema(self, dataset_id, table_id, schema):
        """Indicate whether schemas match exactly

        Compare the BigQuery table identified in the parameters with
        the schema passed in and indicate whether they contain the
        same fields. Order is not considered.

        Parameters
        ----------
        dataset_id : str
            Name of the BigQuery dataset for the table
        table_id : str
            Name of the BigQuery table
        schema : list(dict)
            Schema for comparison. Each item should have
            a 'name' and a 'type'

        Returns
        -------
        bool
            Whether the schemas match
        """

        fields_remote = sorted(self.schema(dataset_id, table_id),
                               key=lambda x: x['name'])
        fields_local = sorted(schema['fields'], key=lambda x: x['name'])

        return fields_remote == fields_local
    def schema_is_subset(self, dataset_id, table_id, schema):
        """Indicate whether the schema to be uploaded is a subset

        Compare the BigQuery table identified in the parameters with
        the schema passed in and indicate whether every field in the
        latter is present in the former. Order is not considered.

        Parameters
        ----------
        dataset_id : str
            Name of the BigQuery dataset for the table
        table_id : str
            Name of the BigQuery table
        schema : list(dict)
            Schema for comparison. Each item should have
            a 'name' and a 'type'

        Returns
        -------
        bool
            Whether the passed schema is a subset
        """

        fields_remote = self.schema(dataset_id, table_id)
        fields_local = schema['fields']

Review comment: these routines should prob just take the result of

        return all(field in fields_remote for field in fields_local)
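A sketch (not part of the diff) contrasting the two checks, assuming `connector` is a GbqConnector and the remote table has fields A (FLOAT) and B (FLOAT):

```python
local = {'fields': [{'name': 'A', 'type': 'FLOAT'}]}

# verify_schema demands an exact field-for-field match, so a missing
# column fails the check...
connector.verify_schema('my_dataset', 'my_table', local)     # False

# ...while schema_is_subset only requires every local field to exist
# remotely with the same type, which is what appending needs.
connector.schema_is_subset('my_dataset', 'my_table', local)  # True
```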
    def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
        delay = 0

@@ -844,7 +914,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
        connector.delete_and_recreate_table(
            dataset_id, table_id, table_schema)
    elif if_exists == 'append':
-        if not connector.verify_schema(dataset_id, table_id, table_schema):
+        if not connector.schema_is_subset(dataset_id,
+                                          table_id,
+                                          table_schema):
            raise InvalidSchema("Please verify that the structure and "
                                "data types in the DataFrame match the "
                                "schema of the destination table.")
@@ -1071,6 +1071,31 @@ def test_upload_data_if_table_exists_append(self):
            _get_project_id(), if_exists='append',
            private_key=_get_private_key_path())

    def test_upload_subset_columns_if_table_exists_append(self):
        # Issue 24: Upload is successful if dataframe has columns
        # which are a subset of the current schema
        test_id = "16"

Review comment: can you add a comment with the issue ref number (this PR number is ok)

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_subset_cols = df.iloc[:, :2]

        # Initialize table with sample data
        gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
                   chunksize=10000, private_key=_get_private_key_path())

        # Test the if_exists parameter with value 'append'
        gbq.to_gbq(df_subset_cols,
                   self.destination_table + test_id, _get_project_id(),
                   if_exists='append', private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

Review comment: We have a function

Review comment: pls update as @MaximilianR suggests (look at other tests to see how this is done)

Review comment: @MaximilianR hmm,
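The truncated review comments above appear to point at an existing helper for waiting on eventually-consistent query results instead of a fixed ``sleep(30)``. Purely as a hypothetical illustration of that idea (this helper is not from the PR or the library):

```python
import time

def wait_until(predicate, timeout=60, interval=5):
    # Hypothetical polling helper: re-check a condition until it holds
    # or the timeout elapses, rather than sleeping a fixed 30 seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False
```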
        result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                              .format(self.destination_table + test_id),
                              project_id=_get_project_id(),
                              private_key=_get_private_key_path())
        assert result['num_rows'][0] == test_size * 2

    def test_upload_data_if_table_exists_replace(self):
        test_id = "4"
        test_size = 10
@@ -1258,6 +1283,66 @@ def test_verify_schema_ignores_field_mode(self):
        assert self.sut.verify_schema(
            self.dataset_prefix + "1", TABLE_ID + test_id, test_schema_2)

    def test_retrieve_schema(self):
        # Issue #24: schema function returns the schema in BigQuery
        test_id = "15"

Review comment: comment of issue number

        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                  {'name': 'B', 'type': 'FLOAT'},
                                  {'name': 'C', 'type': 'STRING'},
                                  {'name': 'D', 'type': 'TIMESTAMP'}]}

        self.table.create(TABLE_ID + test_id, test_schema)
        actual = self.sut.schema(self.dataset_prefix + "1", TABLE_ID + test_id)
        expected = test_schema['fields']
        assert expected == actual, 'Expected schema used to create table'

Review comment: can you add some tests that exercise schema_is_subset

    def test_schema_is_subset_passes_if_subset(self):
        # Issue #24: schema_is_subset indicates whether the schema of the
        # dataframe is a subset of the schema of the bigquery table
        test_id = '16'

        table_name = TABLE_ID + test_id
        dataset = self.dataset_prefix + '1'

        table_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                   {'name': 'B', 'type': 'FLOAT'},
                                   {'name': 'C', 'type': 'STRING'}]}
        tested_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                    {'name': 'B', 'type': 'FLOAT'}]}

        self.table.create(table_name, table_schema)

        assert self.sut.schema_is_subset(
            dataset, table_name, tested_schema) is True

    def test_schema_is_subset_fails_if_not_subset(self):
        # For pull request #24
        test_id = '17'

        table_name = TABLE_ID + test_id
        dataset = self.dataset_prefix + '1'

        table_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                   {'name': 'B', 'type': 'FLOAT'},
                                   {'name': 'C', 'type': 'STRING'}]}
        tested_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                                    {'name': 'C', 'type': 'FLOAT'}]}

        self.table.create(table_name, table_schema)

        assert self.sut.schema_is_subset(
            dataset, table_name, tested_schema) is False

    def test_list_dataset(self):
        dataset_id = self.dataset_prefix + "1"
        assert dataset_id in self.dataset.datasets()
Review comment: did you change this? there isn't a 1.5 release