Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: ensure BIGNUMERIC type is used if scale > 9 in Decimal values #844

Merged
merged 1 commit into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ def arrow_type_to_bigquery_field(
return schema.SchemaField(name, "TIMESTAMP")

detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None)

# We need a special case for values that might fit in Arrow decimal128 but
# not with the scale/precision that is used in BigQuery's NUMERIC type.
# See: https://github.com/googleapis/python-bigquery/issues/1650
if detected_type == "NUMERIC" and type_.scale > 9:
detected_type = "BIGNUMERIC"

if detected_type is not None:
return schema.SchemaField(name, detected_type)

Expand Down
26 changes: 26 additions & 0 deletions tests/unit/schema/test_pandas_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import collections
import datetime
import decimal
import operator

from google.cloud.bigquery import schema
Expand Down Expand Up @@ -46,6 +47,29 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
),
],
),
# Need to fall back to Arrow to avoid data loss and disambiguate
# NUMERIC from BIGNUMERIC. We don't want to pick too small of a
# type and lose precision. See:
# https://github.com/googleapis/python-bigquery/issues/1650
#
(
"bignumeric_column",
[
# Start with a lower-precision Decimal to make sure we
# aren't trying to determine the type from just one value.
decimal.Decimal("1.25"),
decimal.Decimal("0.1234567891"),
],
),
(
"numeric_column",
[
# Minimum value greater than 0 that can be handled: 1e-9
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric_types
decimal.Decimal("0.000000001"),
decimal.Decimal("-0.000000001"),
],
),
]
)
dataframe = pandas.DataFrame(df_data).set_index("str_index", drop=True)
Expand All @@ -64,6 +88,8 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
schema.SchemaField("boolean_column", "BOOLEAN", "NULLABLE"),
schema.SchemaField("datetime_column", "DATETIME", "NULLABLE"),
schema.SchemaField("timestamp_column", "TIMESTAMP", "NULLABLE"),
schema.SchemaField("bignumeric_column", "BIGNUMERIC", "NULLABLE"),
schema.SchemaField("numeric_column", "NUMERIC", "NULLABLE"),
)
assert returned_schema == expected_schema

Expand Down
Loading