Fix: Update dataframe.to_gbq to dedup column names. #286

Merged · 11 commits · Dec 28, 2023
14 changes: 8 additions & 6 deletions bigframes/dataframe.py
@@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
     def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
         """Create query text representing this dataframe for I/O."""
         array_value = self._block.expr
+
+        new_col_labels, new_idx_labels = utils.get_standardized_ids(
+            self._block.column_labels, self.index.names
+        )
+
         columns = list(self._block.value_columns)
-        column_labels = list(self._block.column_labels)
+        column_labels = new_col_labels
         # This code drops unnamed indexes to keep consistent with the behavior of
         # most pandas write APIs. The exception is `pandas.to_csv`, which keeps
         # unnamed indexes as `Unnamed: 0`.
         # TODO(chelsealin): check if works for multiple indexes.
         if index and self.index.name is not None:
             columns.extend(self._block.index_columns)
-            column_labels.extend(self.index.names)
+            column_labels.extend(new_idx_labels)
         else:
             array_value = array_value.drop_columns(self._block.index_columns)

         # Make columns in SQL reflect _labels_ not _ids_. Note: This may use
         # the arbitrary unicode column labels feature in BigQuery, which is
         # currently (June 2023) in preview.
-        # TODO(swast): Handle duplicate and NULL labels.
         id_overrides = {
-            col_id: col_label
-            for col_id, col_label in zip(columns, column_labels)
-            if col_label and isinstance(col_label, str)
+            col_id: col_label for col_id, col_label in zip(columns, column_labels)
         }

         if ordering_id is not None:
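For context, here is a minimal, hypothetical sketch of the kind of label standardization the new `utils.get_standardized_ids` call is expected to perform: duplicate labels get a numeric suffix and `None` labels get a placeholder name, matching what the tests below assert. The helper name `standardize_labels` and the exact rules are assumptions for illustration; the real bigframes implementation may differ.

```python
from typing import Hashable, List, Optional, Sequence, Tuple

# Placeholder label the tests below observe for columns named None.
UNNAMED_COLUMN = "bigframes_unnamed_column"


def standardize_labels(
    col_labels: Sequence[Optional[Hashable]],
    idx_labels: Sequence[Optional[Hashable]],
) -> Tuple[List[str], List[str]]:
    """Return stringified labels with None replaced and duplicates suffixed.

    Hypothetical stand-in for utils.get_standardized_ids; not the actual
    bigframes implementation.
    """
    seen: dict = {}
    standardized: List[str] = []
    for label in list(col_labels) + list(idx_labels):
        name = str(label) if label is not None else UNNAMED_COLUMN
        count = seen.get(name, 0)
        seen[name] = count + 1
        # First occurrence keeps its name; later ones get _1, _2, ...
        standardized.append(name if count == 0 else f"{name}_{count}")
    return standardized[: len(col_labels)], standardized[len(col_labels) :]
```

Under this sketch, `standardize_labels(["int64_col", "int64_col", None], ["rowindex"])` yields `(["int64_col", "int64_col_1", "bigframes_unnamed_column"], ["rowindex"])`, which is the shape of output `_create_io_query` now feeds into `id_overrides`.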
44 changes: 44 additions & 0 deletions tests/system/small/test_dataframe_io.py
@@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
)


def test_to_gbq_w_duplicate_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """Test the `to_gbq` API when dealing with duplicate column names."""
    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"

    # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
    # becoming 'int64_col_1' after deduplication.
    scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
    scalars_df_index.to_gbq(destination_table, if_exists="replace")

    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()

    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
    )
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_too"],
        bf_result["int64_col_1"],
        check_names=False,
    )


def test_to_gbq_w_None_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """Test the `to_gbq` API with None as a column name."""
    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"

    scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
    scalars_df_index.to_gbq(destination_table, if_exists="replace")

    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()

    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
    )
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_too"],
        bf_result["bigframes_unnamed_column"],
        check_names=False,
    )


def test_to_gbq_w_invalid_destination_table(scalars_df_index):
    with pytest.raises(ValueError):
        scalars_df_index.to_gbq("table_id")
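Outside the test suite, the behavior these tests pin down would look roughly like the sketch below. The project, dataset, and table names are placeholders, and the column names are borrowed from the test fixtures above; this is an illustrative usage sketch, not part of the PR.

```python
import bigframes.pandas as bpd

# Placeholder source table assumed to have "rowindex", "int64_col", and "int64_too".
df = bpd.read_gbq("my-project.my_dataset.scalars", index_col="rowindex")

# Force a duplicate label, as the first test above does.
df = df.rename(columns={"int64_too": "int64_col"})

# With this change, to_gbq writes the frame with deduplicated column names.
df.to_gbq("my-project.my_dataset.scalars_copy", if_exists="replace")

result = bpd.read_gbq("my-project.my_dataset.scalars_copy", index_col="rowindex")
print(list(result.columns))  # expect "int64_col" and "int64_col_1"
```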