From be5c8b1ede9a2d762fd5574c32587d125eca4713 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Tue, 17 Mar 2020 09:37:20 +0100 Subject: [PATCH] chore: deprecate pandas code paths that don't use pyarrow (#48) * feat: deprecate code paths without pyarrow * Emit warning if loading dataframe data w/o pyarrow * Issue a warning in table.to dataframe() w/o pyarrow --- google/cloud/bigquery/__init__.py | 3 +++ google/cloud/bigquery/client.py | 10 ++++++++ google/cloud/bigquery/exceptions.py | 17 ++++++++++++ google/cloud/bigquery/table.py | 9 +++++++ tests/unit/test_client.py | 40 ++++++++++++++++++++++++++++- tests/unit/test_table.py | 32 +++++++++++++++++++++++ 6 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 google/cloud/bigquery/exceptions.py diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 3982c1175..63d71694c 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -38,6 +38,7 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import StandardSqlDataTypes +from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -142,6 +143,8 @@ "WriteDisposition", # EncryptionConfiguration "EncryptionConfiguration", + # Errors and warnings + "PyarrowMissingWarning", ] diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 343217ae6..6fe474218 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -59,6 +59,7 @@ from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference +from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery import job from google.cloud.bigquery.model import Model from google.cloud.bigquery.model import ModelReference @@ -1848,6 +1849,15 @@ def load_table_from_dataframe( parquet_compression=parquet_compression, ) else: + if not pyarrow: + warnings.warn( + "Loading dataframe data without pyarrow installed is " + "deprecated and will become unsupported in the future. " + "Please install the pyarrow package.", + PyarrowMissingWarning, + stacklevel=2, + ) + if job_config.schema: warnings.warn( "job_config.schema is set, but not used to assist in " diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py new file mode 100644 index 000000000..93490ef97 --- /dev/null +++ b/google/cloud/bigquery/exceptions.py @@ -0,0 +1,17 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class PyarrowMissingWarning(DeprecationWarning): + pass diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index f7b575536..e674f237d 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -54,6 +54,7 @@ from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields +from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration @@ -1739,6 +1740,14 @@ def to_dataframe( for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column]) return df + else: + warnings.warn( + "Converting to a dataframe without pyarrow installed is " + "often slower and will become unsupported in the future. " + "Please install the pyarrow package.", + PyarrowMissingWarning, + stacklevel=2, + ) # The bqstorage_client is only used if pyarrow is available, so the # rest of this method only needs to account for tabledata.list. diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index b8dfbbad1..e4bc6af75 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -6580,6 +6580,42 @@ def test_load_table_from_dataframe_unknown_table(self): job_config=mock.ANY, ) + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") + def test_load_table_from_dataframe_no_pyarrow_warning(self): + from google.cloud.bigquery.client import PyarrowMissingWarning + + client = self._make_client() + + # Pick at least one column type that translates to Pandas dtype + # "object". A string column matches that. + records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] + dataframe = pandas.DataFrame(records) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) + pyarrow_patch_helpers = mock.patch( + "google.cloud.bigquery._pandas_helpers.pyarrow", None + ) + catch_warnings = warnings.catch_warnings(record=True) + + with get_table_patch, load_patch, pyarrow_patch, pyarrow_patch_helpers, catch_warnings as warned: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, location=self.LOCATION + ) + + matches = [ + warning for warning in warned if warning.category is PyarrowMissingWarning + ] + assert matches, "A missing pyarrow deprecation warning was not raised." + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") def test_load_table_from_dataframe_no_schema_warning_wo_pyarrow(self): @@ -6854,7 +6890,9 @@ def test_load_table_from_dataframe_w_schema_wo_pyarrow(self): assert warned # there should be at least one warning for warning in warned: assert "pyarrow" in str(warning) - assert warning.category in (DeprecationWarning, PendingDeprecationWarning) + assert issubclass( + warning.category, (DeprecationWarning, PendingDeprecationWarning) + ) load_table_from_file.assert_called_once_with( client, diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index c1611c084..5bcd60986 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2239,6 +2239,38 @@ def test_to_dataframe(self): self.assertEqual(df.name.dtype.name, "object") self.assertEqual(df.age.dtype.name, "int64") + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_warning_wo_pyarrow(self): + from google.cloud.bigquery.client import PyarrowMissingWarning + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + no_pyarrow_patch = mock.patch("google.cloud.bigquery.table.pyarrow", new=None) + catch_warnings = warnings.catch_warnings(record=True) + + with no_pyarrow_patch, catch_warnings as warned: + df = row_iterator.to_dataframe() + + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 2) + matches = [ + warning for warning in warned if warning.category is PyarrowMissingWarning + ] + self.assertTrue( + matches, msg="A missing pyarrow deprecation warning was not raised." + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm_gui")