From 00f3ffd627125e513ffee74e7fb5bb10810bf939 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 21 Dec 2023 01:22:52 +0000 Subject: [PATCH 1/9] Fix: Update dataframe.to_gbq to convert non-string column names to string, and dedup column names. --- bigframes/core/utils.py | 12 ++++++- bigframes/dataframe.py | 15 +++++---- tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++ tests/unit/core/test_bf_utils.py | 22 +++++++++++++ 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index dc7c709011..526b7a9bb8 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import typing -from typing import Hashable, Iterable, List +from typing import Hashable, Iterable, List, Optional, Union +from ibis.backends.bigquery.compiler import _NAME_REGEX import pandas as pd import typing_extensions @@ -69,6 +70,15 @@ def split_index( return (None, index) +def gen_valid_names( + names: Iterable[Hashable], default_name: Optional[str] = None +) -> List[Union[str, None]]: + return [ + default_name if name is None else "_".join(_NAME_REGEX.findall(str(name))) + for name in names + ] + + def get_standardized_ids( col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () ) -> tuple[list[str], list[str]]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 98aa8f1185..f030315275 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2705,26 +2705,29 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr + + new_col_labels, new_idx_labels = utils.get_standardized_ids( + utils.gen_valid_names(self._block.column_labels), + utils.gen_valid_names(self.index.names), + ) + columns = list(self._block.value_columns) - column_labels = list(self._block.column_labels) + column_labels = new_col_labels # This code drops unnamed indexes to keep consistent with the behavior of # most pandas write APIs. The exception is `pandas.to_csv`, which keeps # unnamed indexes as `Unnamed: 0`. # TODO(chelsealin): check if works for multiple indexes. if index and self.index.name is not None: columns.extend(self._block.index_columns) - column_labels.extend(self.index.names) + column_labels.extend(new_idx_labels) else: array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. - # TODO(swast): Handle duplicate and NULL labels. 
id_overrides = {
-            col_id: col_label
-            for col_id, col_label in zip(columns, column_labels)
-            if col_label and isinstance(col_label, str)
+            col_id: col_label for col_id, col_label in zip(columns, column_labels)
         }
 
     if ordering_id is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index fb9fb7bb89..1f613e6509 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -265,6 +265,50 @@ def test_to_gbq_if_exists(
     )
 
 
+def test_to_gbq_w_duplicate_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API when dealing with duplicate column names."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+
+    # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
+    # becoming 'int64_col_1' after deduplication.
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["int64_col_1"],
+        check_names=False,
+    )
+
+
+def test_to_gbq_w_None_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API with None as a column name."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
+
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["bigframes_unnamed_column"],
+        check_names=False,
+    )
+
+
 def test_to_gbq_w_invalid_destination_table(scalars_df_index):
     with pytest.raises(ValueError):
         scalars_df_index.to_gbq("table_id")
diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py
index 10ce1fd09e..7c07706116 100644
--- a/tests/unit/core/test_bf_utils.py
+++ b/tests/unit/core/test_bf_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
+
 from bigframes.core import utils
 
 
@@ -31,6 +33,26 @@ def test_get_standardized_ids_columns():
     assert idx_ids == []
 
 
+@pytest.mark.parametrize(
+    "names, default_name, expected",
+    [
+        (
+            ["aaaa", "aa#$%^&", "HKJ3::>,."],
+            None,
+            ["aaaa", "aa#_%_&", "HKJ3::>"],
+        ),
+        (
+            [None, "aa#$%^&", "HKJ3::>,."],
+            "DefaultName",
+            ["DefaultName", "aa#_%_&", "HKJ3::>"],
+        ),
+        ([None, "aa#$%^&", "HKJ3::>,....."], None, [None, "aa#_%_&", "HKJ3::>"]),
+    ],
+)
+def test_gen_valid_names(names, default_name, expected):
+    assert utils.gen_valid_names(names, default_name) == expected
+
+
 def test_get_standardized_ids_indexes():
     col_labels = ["duplicate"]
     idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"]
From 085618a2b66143794c9fcc44ddb92b47f5b48a4a Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Tue, 26 Dec 2023 19:05:34 +0000
Subject: [PATCH 2/9] remove escaping.
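
The `_NAME_REGEX` escaping introduced in the previous commit is dropped:
`to_gbq` now routes every label through `get_standardized_ids`, which already
stringifies non-string labels, substitutes a placeholder for `None`, and
de-duplicates collisions. A minimal sketch of that normalization contract
(illustrative only -- the helper below is not the library code; the
`bigframes_unnamed_column` placeholder is taken from the tests above):

    from typing import Hashable, Iterable, List

    UNNAMED_COLUMN = "bigframes_unnamed_column"

    def standardize_labels(labels: Iterable[Hashable]) -> List[str]:
        # Stringify, default None, then suffix duplicates with a counter.
        seen: dict = {}
        out: List[str] = []
        for label in labels:
            name = UNNAMED_COLUMN if label is None else str(label)
            count = seen.get(name, 0)
            seen[name] = count + 1
            out.append(name if count == 0 else f"{name}_{count}")
        return out

    assert standardize_labels(["int64_col", "int64_col", None]) == [
        "int64_col", "int64_col_1", "bigframes_unnamed_column"
    ]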
--- bigframes/core/utils.py | 12 +----------- bigframes/dataframe.py | 3 +-- tests/unit/core/test_bf_utils.py | 20 -------------------- 3 files changed, 2 insertions(+), 33 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 526b7a9bb8..dc7c709011 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import typing -from typing import Hashable, Iterable, List, Optional, Union +from typing import Hashable, Iterable, List -from ibis.backends.bigquery.compiler import _NAME_REGEX import pandas as pd import typing_extensions @@ -70,15 +69,6 @@ def split_index( return (None, index) -def gen_valid_names( - names: Iterable[Hashable], default_name: Optional[str] = None -) -> List[Union[str, None]]: - return [ - default_name if name is None else "_".join(_NAME_REGEX.findall(str(name))) - for name in names - ] - - def get_standardized_ids( col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () ) -> tuple[list[str], list[str]]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f030315275..af673746c3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2707,8 +2707,7 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: array_value = self._block.expr new_col_labels, new_idx_labels = utils.get_standardized_ids( - utils.gen_valid_names(self._block.column_labels), - utils.gen_valid_names(self.index.names), + self._block.column_labels, self.index.names ) columns = list(self._block.value_columns) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 7c07706116..66f5b0eed3 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -33,26 +33,6 @@ def test_get_standardized_ids_columns(): assert idx_ids == [] -@pytest.mark.parametrize( - "names, default_name, expected", - [ - ( - ["aaaa", "aa#$%^&", "HKJ3::>,."], - None, - ["aaaa", "aa#_%_&", "HKJ3::>"], - ), - ( - [None, "aa#$%^&", "HKJ3::>,."], - "DefaultName", - ["DefaultName", "aa#_%_&", "HKJ3::>"], - ), - ([None, "aa#$%^&", "HKJ3::>,....."], None, [None, "aa#_%_&", "HKJ3::>"]), - ], -) -def test_gen_valid_names(names, default_name, expected): - assert utils.gen_valid_names(names, default_name) == expected - - def test_get_standardized_ids_indexes(): col_labels = ["duplicate"] idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] From 9aea94c020ffacdc2abe2c78358081ef6571060d Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 26 Dec 2023 19:07:17 +0000 Subject: [PATCH 3/9] remove unused import --- tests/unit/core/test_bf_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 66f5b0eed3..10ce1fd09e 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytest - from bigframes.core import utils From 65de270003e6cfa49abb758a6537007f73aebf54 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 00:33:37 +0000 Subject: [PATCH 4/9] fix: make `Series.str.replace` work for simple strings (#285) --- bigframes/operations/__init__.py | 2 +- tests/system/small/operations/test_strings.py | 2 ++ .../bigframes_vendored/pandas/core/series.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 753870a42d..678774978a 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -385,7 +385,7 @@ def _as_ibis(self, x: ibis_types.Value): ibis_types.StringValue, ibis_types.literal(self._pat) ) repl_str_value = typing.cast( - ibis_types.StringValue, ibis_types.literal(self._pat) + ibis_types.StringValue, ibis_types.literal(self._repl) ) return typing.cast(ibis_types.StringValue, x).replace( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 27a35134d4..79f92c94b4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -94,6 +94,8 @@ def test_str_extract(scalars_dfs, pat): (".*", "blah", True, 0, True), ("h.l", "blah", False, 0, True), (re.compile("(?i).e.."), "blah", None, 0, True), + ("H", "h", True, 0, False), + (", ", "__", True, 0, False), ], ) def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index d054684598..366f32c77e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2304,6 +2304,24 @@ def str(self): NAs stay NA unless handled otherwise by a particular method. Patterned after Python’s string methods, with some inspiration from R’s stringr package. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: string + + >>> s.str.lower() + 0 a_str_series + dtype: string + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: string + Returns: bigframes.operations.strings.StringMethods: An accessor containing string methods. 
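
A note on the one-line fix in `bigframes/operations/__init__.py` above: the
replacement literal was previously built from `self._pat` instead of
`self._repl`, so a simple (non-regex) `Series.str.replace` substituted the
pattern with itself, i.e. a silent no-op. A minimal sketch of the before and
after semantics in plain Python, standing in for the ibis expressions (not
the library code):

    def replace_buggy(value: str, pat: str, repl: str) -> str:
        return value.replace(pat, pat)  # wrong literal: pat replaces itself

    def replace_fixed(value: str, pat: str, repl: str) -> str:
        return value.replace(pat, repl)  # uses the actual replacement string

    assert replace_buggy("Hello, World!", "H", "h") == "Hello, World!"
    assert replace_fixed("Hello, World!", "H", "h") == "hello, World!"

The new test cases `("H", "h", True, 0, False)` and `(", ", "__", True, 0,
False)` exercise exactly this non-regex path.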
From 0b262c33f4d544041ad651d28fc48ada2fafddc0 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Dec 2023 18:35:18 -0800 Subject: [PATCH 5/9] feat: specific pyarrow mappings for decimal, bytes types (#283) * feat: new bytes, json, decimal type mappings * amend tests to reflect new types * add implicit type conversion for df.replace * more type casting tests * skip pandas 1.x for more tests --------- Co-authored-by: Tim Swast --- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 2 +- bigframes/core/compile/compiled.py | 14 +- bigframes/core/groupby/__init__.py | 7 +- bigframes/dataframe.py | 20 +-- bigframes/dtypes.py | 139 ++++++++++++++------ bigframes/series.py | 24 +++- tests/system/large/ml/test_compose.py | 4 +- tests/system/large/ml/test_core.py | 1 + tests/system/small/ml/test_core.py | 3 +- tests/system/small/ml/test_imported.py | 2 + tests/system/small/ml/test_llm.py | 10 +- tests/system/small/ml/test_preprocessing.py | 16 +++ tests/system/small/ml/test_remote.py | 1 + tests/system/small/test_dataframe.py | 37 +++--- tests/system/small/test_dataframe_io.py | 18 ++- tests/system/small/test_multiindex.py | 7 +- tests/system/small/test_series.py | 15 ++- tests/system/small/test_session.py | 3 + tests/system/utils.py | 36 ++++- tests/unit/test_dtypes.py | 7 +- 21 files changed, 267 insertions(+), 101 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 6654892287..c6867c1a33 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -131,7 +131,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' supports multi-index") xvalues = block.index_columns[0] - if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise ValueError("Can only interpolate on numeric index.") for column in original_columns: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3163aa5b09..779d11b371 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1063,7 +1063,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]: stats: list[agg_ops.AggregateOp] = [agg_ops.count_op] if dtype not in bigframes.dtypes.UNORDERED_DTYPES: stats += [agg_ops.min_op, agg_ops.max_op] - if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: # prod op tends to cause overflows # Also, var_op is redundant as can be derived from std diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d6183228d1..199c8db785 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast(ibis_types.Value, self._column_names[key]) + return typing.cast( + ibis_types.Value, + bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -1177,7 +1180,14 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. 
table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.dtypes.ibis_value_to_canonical_type( + column.resolve(self._table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ) + for column in columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 3ee46ef675..66ba901649 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -359,7 +359,8 @@ def _convert_index(self, dataframe: df.DataFrame): def _raise_on_non_numeric(self, op: str): if not all( - dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes + dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + for dtype in self._block.dtypes ): raise NotImplementedError( f"'{op}' does not support non-numeric columns. " @@ -371,7 +372,9 @@ def _raise_on_non_numeric(self, op: str): def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]: valid_agg_cols: list[str] = [] for col_id in self._selected_cols: - is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES + is_numeric = ( + self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) if is_numeric or not numeric_only: valid_agg_cols.append(col_id) return valid_agg_cols diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index af673746c3..88924135ff 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1800,7 +1800,7 @@ def agg( ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self.dtypes ): raise NotImplementedError( @@ -1867,7 +1867,7 @@ def melt( ) def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(keep_bool=False) + df_numeric = self._drop_non_numeric(permissive=False) if len(df_numeric.columns) == 0: raise NotImplementedError( f"df.describe() currently only supports numeric values. 
{constants.FEEDBACK_LINK}" @@ -2005,10 +2005,12 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) - def _drop_non_numeric(self, keep_bool=True) -> DataFrame: - types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) - if not keep_bool: - types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES) + def _drop_non_numeric(self, permissive=True) -> DataFrame: + types_to_keep = ( + set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) + if permissive + else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) + ) non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) @@ -2026,7 +2028,7 @@ def _drop_non_bool(self) -> DataFrame: def _raise_on_non_numeric(self, op: str): if not all( - dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self._block.dtypes ): raise NotImplementedError( @@ -2301,7 +2303,7 @@ def notna(self) -> DataFrame: def cumsum(self): is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): @@ -2313,7 +2315,7 @@ def cumsum(self): def cumprod(self) -> DataFrame: is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 891c372a10..b754acea2e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -15,6 +15,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" import datetime +import decimal import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -30,6 +31,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -40,9 +42,6 @@ pd.ArrowDtype, ] -# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) -NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] - # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -57,6 +56,9 @@ "timestamp[us][pyarrow]", "date32[day][pyarrow]", "time64[us][pyarrow]", + "decimal128(38, 9)[pyarrow]", + "decimal256(38, 9)[pyarrow]", + "binary[pyarrow]", ] # Type hints for Ibis data types supported by BigQuery DataFrame @@ -72,8 +74,17 @@ BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()] -# Several operations are restricted to these types. 
-NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] +# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) +# Pandas is inconsistent, so two definitions are provided, each used in different contexts +NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [ + pd.Float64Dtype(), + pd.Int64Dtype(), +] +NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + [ + pd.BooleanDtype(), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), +] # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame ReadOnlyIbisDtype = Union[ @@ -97,6 +108,15 @@ ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), ), + (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), + ( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + pd.ArrowDtype(pa.decimal128(38, 9)), + ), + ( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + pd.ArrowDtype(pa.decimal256(76, 38)), + ), ) BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { @@ -112,6 +132,9 @@ ibis_dtypes.time: pa.time64("us"), ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), + ibis_dtypes.binary: pa.binary(), + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), } ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} @@ -125,10 +148,6 @@ ) IBIS_TO_BIGFRAMES.update( { - ibis_dtypes.binary: np.dtype("O"), - ibis_dtypes.json: np.dtype("O"), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"), ibis_dtypes.GeoSpatial( geotype="geography", srid=4326, nullable=True ): gpd.array.GeometryDtype(), @@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype( # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) if isinstance(ibis_dtype, ibis_dtypes.Array): - return np.dtype("O") + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) @@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype( def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if isinstance(ibis_dtype, ibis_dtypes.Array): - return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + return pa.list_( + ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True)) + ) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pa.struct( @@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: This is useful in cases where multiple types correspond to the same BigFrames dtype. """ ibis_type = value.type() + name = value.get_name() + if ibis_type.is_json(): + value = vendored_ibis_ops.ToJsonString(value).to_expr() + return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(value.get_name()) - - -def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: - """Converts an Ibis table expression to canonical types. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. 
-    """
-    casted_columns = []
-    for column_name in table.columns:
-        column = typing.cast(ibis_types.Value, table[column_name])
-        casted_columns.append(ibis_value_to_canonical_type(column))
-    return table.select(*casted_columns)
+    return value.cast(nullable_type).name(name)
 
 
 def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
@@ -386,15 +399,35 @@ def cast_ibis_value(
             ibis_dtypes.bool,
             ibis_dtypes.float64,
             ibis_dtypes.string,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.float64: (
+            ibis_dtypes.string,
+            ibis_dtypes.int64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.string: (
+            ibis_dtypes.int64,
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+            ibis_dtypes.binary,
         ),
-        ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64),
-        ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64),
         ibis_dtypes.date: (ibis_dtypes.string,),
-        ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,),
-        ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,),
+        ibis_dtypes.Decimal(precision=38, scale=9): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.Decimal(precision=76, scale=38): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+        ),
         ibis_dtypes.time: (),
         ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
         ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
+        ibis_dtypes.binary: (ibis_dtypes.string,),
     }
 
     value = ibis_value_to_canonical_type(value)
@@ -458,30 +491,62 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
     return False
 
 
+# Note: pyarrow binary (BigQuery BYTES) scalars surface as Python `bytes`; see the pa.binary() branch below.
 def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
     """Determine whether a scalar's type matches a given pyarrow type."""
     if pa_type == pa.time64("us"):
         return isinstance(scalar, datetime.time)
-    if pa_type == pa.timestamp("us"):
+    elif pa_type == pa.timestamp("us"):
         if isinstance(scalar, datetime.datetime):
             return not scalar.tzinfo
         if isinstance(scalar, pd.Timestamp):
             return not scalar.tzinfo
-    if pa_type == pa.timestamp("us", tz="UTC"):
+    elif pa_type == pa.timestamp("us", tz="UTC"):
         if isinstance(scalar, datetime.datetime):
             return scalar.tzinfo == datetime.timezone.utc
         if isinstance(scalar, pd.Timestamp):
             return scalar.tzinfo == datetime.timezone.utc
-    if pa_type == pa.date32():
+    elif pa_type == pa.date32():
         return isinstance(scalar, datetime.date)
+    elif pa_type == pa.binary():
+        return isinstance(scalar, bytes)
+    elif pa_type == pa.decimal128(38, 9):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
+    elif pa_type == pa.decimal256(76, 38):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
     return False
 
 
-def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
-    """Whether scalar can be compare to items of dtype (though maybe requiring coercion)"""
+def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion). 
Returns the datatype that must be used for the comparison""" if is_dtype(scalar, dtype): - return True + return dtype elif pd.api.types.is_numeric_dtype(dtype): - return pd.api.types.is_number(scalar) - else: - return False + # Implicit conversion currently only supported for numeric types + if pd.api.types.is_bool(scalar): + return lcd_type(pd.BooleanDtype(), dtype) + if pd.api.types.is_float(scalar): + return lcd_type(pd.Float64Dtype(), dtype) + if pd.api.types.is_integer(scalar): + return lcd_type(pd.Int64Dtype(), dtype) + if isinstance(scalar, decimal.Decimal): + # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC + return lcd_type(pd.ArrowDtype(pa.decimal128(76, 38)), dtype) + return None + + +def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: + # Implicit conversion currently only supported for numeric types + hierarchy: list[Dtype] = [ + pd.BooleanDtype(), + pd.Int64Dtype(), + pd.Float64Dtype(), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), + ] + if (dtype1 not in hierarchy) or (dtype2 not in hierarchy): + return None + lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2)) + return hierarchy[lcd_index] diff --git a/bigframes/series.py b/bigframes/series.py index 6837c1c7f8..eefd2b755d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -16,6 +16,7 @@ from __future__ import annotations +import functools import itertools import numbers import textwrap @@ -455,7 +456,7 @@ def replace( else: # Scalar replace_list = [to_replace] replace_list = [ - i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype) + i for i in replace_list if bigframes.dtypes.is_compatible(i, self.dtype) ] return self._simple_replace(replace_list, value) if replace_list else self @@ -472,11 +473,15 @@ def _regex_replace(self, to_replace: str, value: str): return Series(block.select_column(result_col)) def _simple_replace(self, to_replace_list: typing.Sequence, value): - if not bigframes.dtypes.is_dtype(value, self.dtype): + result_type = bigframes.dtypes.is_compatible(value, self.dtype) + if not result_type: raise NotImplementedError( f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) + if result_type != self.dtype: + return self.astype(result_type)._simple_replace(to_replace_list, value) + block, cond = self._block.apply_unary_op( self._value_column, ops.IsInOp(to_replace_list) ) @@ -490,15 +495,26 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): tuples = [] + lcd_types: list[typing.Optional[bigframes.dtypes.Dtype]] = [] for key, value in mapping.items(): - if not bigframes.dtypes.is_comparable(key, self.dtype): + lcd_type = bigframes.dtypes.is_compatible(key, self.dtype) + if not lcd_type: continue if not bigframes.dtypes.is_dtype(value, self.dtype): raise NotImplementedError( f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) tuples.append((key, value)) + lcd_types.append(lcd_type) + result_dtype = functools.reduce( + lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) if (t1 and t2) else None, + lcd_types, + ) + if not result_dtype: + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible mapping {mapping} as mixed-type columns not supported. 
{constants.FEEDBACK_LINK}" + ) block, result = self._block.apply_unary_op( self._value_column, ops.MapOp(tuple(tuples)) ) @@ -782,7 +798,7 @@ def _central_moment(self, n: int) -> float: def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: if _is_list_like(func): - if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise NotImplementedError( f"Multiple aggregations only supported on numeric series. {constants.FEEDBACK_LINK}" ) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0c280e5d02..6ea4f72489 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -72,7 +72,7 @@ def test_columntransformer_standalone_fit_and_transform( expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): @@ -123,4 +123,4 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index 3b30d7eb1d..df387e6ee1 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -184,4 +184,5 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): expected, check_exact=False, rtol=0.1, + check_dtype=False, ) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index eece5ef21d..f39815aec2 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -292,11 +292,12 @@ def test_model_predict_with_unnamed_index( def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): - predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), ) + predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() pd.testing.assert_frame_equal( predictions[["predicted_body_mass_g"]].sort_index(), expected, diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 9008e85a0b..8ffd9924e9 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -51,6 +51,7 @@ def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) @@ -90,6 +91,7 @@ def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 267a2ed9c1..fd1b803eea 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np import pytest from bigframes.ml import llm @@ -202,8 +201,7 @@ def test_embedding_generator_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -215,8 +213,7 @@ def test_embedding_generator_multilingual_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -228,5 +225,4 @@ def test_embedding_generator_predict_series_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 45548acca3..c3bd7f3b87 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -15,6 +15,7 @@ import math import pandas as pd +import pyarrow as pa import bigframes.ml.preprocessing @@ -453,6 +454,9 @@ def test_one_hot_encoder_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -482,6 +486,9 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -507,6 +514,9 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -537,6 +547,9 @@ def test_one_hot_encoder_params(new_penguins_df): [{"index": 0, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -567,6 +580,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index e8eb1c85e8..5036cdadfc 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -29,5 +29,6 @@ def test_remote_linear_vertex_model_predict( predictions[["predicted_body_mass_g"]].sort_index(), expected, check_exact=False, + check_dtype=False, rtol=0.1, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ed78e73e5d..86b8cfbe66 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -19,7 +19,6 @@ from typing import Tuple import geopandas as gpd # type: ignore -import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -29,7 +28,11 @@ import bigframes._config.display_options as 
display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_df_construct_copy(scalars_dfs): @@ -273,19 +276,19 @@ def test_df_info(scalars_dfs): " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null object\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" " 2 date_col 7 non-null date32[day][pyarrow]\n" " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" " 4 geography_col 4 non-null geometry\n" " 5 int64_col 8 non-null Int64\n" " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null object\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" " 8 float64_col 7 non-null Float64\n" " 9 rowindex_2 9 non-null Int64\n" " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" "memory usage: 945 bytes\n" ) @@ -362,6 +365,7 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd.testing.assert_frame_equal(pd_result, bf_result) +@skip_legacy_pandas def test_drop_bigframes_multiindex(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() @@ -841,13 +845,11 @@ def test_df_fillna(scalars_dfs): def test_df_replace_scalar_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_df_replace_regex_scalar(scalars_dfs): @@ -863,12 +865,14 @@ def test_df_replace_regex_scalar(scalars_dfs): def test_df_replace_list_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + # pandas has narrower result types as they are determined dynamically pd.testing.assert_frame_equal( pd_result, bf_result, + check_dtype=False, ) @@ -1198,13 +1202,13 @@ def test_get_dtypes(scalars_df_default_index): pd.Series( { "bool_col": pd.BooleanDtype(), - "bytes_col": np.dtype("O"), + "bytes_col": pd.ArrowDtype(pa.binary()), "date_col": pd.ArrowDtype(pa.date32()), "datetime_col": pd.ArrowDtype(pa.timestamp("us")), "geography_col": gpd.array.GeometryDtype(), "int64_col": pd.Int64Dtype(), "int64_too": pd.Int64Dtype(), - "numeric_col": np.dtype("O"), + "numeric_col": 
pd.ArrowDtype(pa.decimal128(38, 9)),
                "float64_col": pd.Float64Dtype(),
                "rowindex": pd.Int64Dtype(),
                "rowindex_2": pd.Int64Dtype(),
@@ -1232,7 +1236,7 @@ def test_get_dtypes_array_struct(session):
         dtypes,
         pd.Series(
             {
-                "array_column": np.dtype("O"),
+                "array_column": pd.ArrowDtype(pa.list_(pa.int64())),
                 "struct_column": pd.ArrowDtype(
                     pa.struct(
                         [
@@ -2138,6 +2142,7 @@ def test_dataframe_agg_multi_string(scalars_dfs):
     ).all()
 
 
+@skip_legacy_pandas
 def test_df_describe(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     # pyarrows time columns fail in pandas
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 1f613e6509..fc8c2549cf 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -56,7 +56,9 @@ def test_to_pandas_array_struct_correct_result(session):
     result = df.to_pandas()
     expected = pd.DataFrame(
         {
-            "array_column": [[1, 3, 2]],
+            "array_column": pd.Series(
+                [[1, 3, 2]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
+            ),
             "struct_column": pd.Series(
                 [{"string_field": "a", "float_field": 1.2}],
                 dtype=pd.ArrowDtype(
@@ -91,7 +93,8 @@ def test_load_json(session):
     expected = pd.DataFrame(
         {
             "json_column": ['{"bar":true,"foo":10}'],
-        }
+        },
+        dtype=pd.StringDtype(storage="pyarrow"),
     )
     expected.index = expected.index.astype("Int64")
     pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
@@ -137,6 +140,8 @@ def test_to_csv_index(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         dtype=dtype,
@@ -148,7 +153,6 @@ def test_to_csv_index(
     scalars_pandas_df = scalars_pandas_df.copy()
     scalars_pandas_df.index = scalars_pandas_df.index.astype("int64")
 
-    # Ordering should be maintained for tables smaller than 1 GB.
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
 
@@ -174,6 +178,8 @@ def test_to_csv_tabs(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         sep="\t",
@@ -216,6 +222,8 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index):
     df_out = df_out.sort_values("rowindex_2").reset_index(drop=True)
 
     convert_pandas_dtypes(df_out, bytes_col=False)
+    # pd.read_gbq interprets bytes_col as object, reconvert to pyarrow binary
+    df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary()))
     expected = scalars_pandas_df.copy()
     expected.index.name = index_col
     pd.testing.assert_frame_equal(df_out, expected, check_index_type=False)
@@ -421,7 +429,9 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
     scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64")
 
     # Ordering should be maintained for tables smaller than 1 GB. 
- pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) + pd.testing.assert_frame_equal( + gcs_df.drop("bytes_col", axis=1), scalars_pandas_df.drop("bytes_col", axis=1) + ) def test_to_sql_query_unnamed_index_included( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1708735f4c..2d4e1f0204 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -394,14 +394,17 @@ def test_multi_index_dataframe_groupby_level_aggregate( def test_multi_index_dataframe_groupby_level_analytic( scalars_df_index, scalars_pandas_df_index, level, as_index ): + # Drop "numeric_col" as pandas doesn't support numerics for grouped window function bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]) + scalars_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) .to_pandas() ) pd_result = ( - scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + scalars_pandas_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 623da74aa4..6f919f740f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,7 +24,11 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_series_construct_copy(scalars_dfs): @@ -81,14 +85,14 @@ def test_series_construct_from_list_escaped_strings(): [ ("bool_col", pd.BooleanDtype()), # TODO(swast): Use a more efficient type. - ("bytes_col", numpy.dtype("object")), + ("bytes_col", pd.ArrowDtype(pa.binary())), ("date_col", pd.ArrowDtype(pa.date32())), ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), ("float64_col", pd.Float64Dtype()), ("geography_col", gpd.array.GeometryDtype()), ("int64_col", pd.Int64Dtype()), # TODO(swast): Use a more efficient type. 
- ("numeric_col", numpy.dtype("object")), + ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), ("int64_too", pd.Int64Dtype()), ("string_col", pd.StringDtype(storage="pyarrow")), ("time_col", pd.ArrowDtype(pa.time64("us"))), @@ -2519,8 +2523,12 @@ def test_mask_custom_value(scalars_dfs): ("int64_col", pd.Float64Dtype()), ("int64_col", "string[pyarrow]"), ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and # raises a deprecation warning to use tz_localize/tz_convert instead, # but BigQuery always stores values as UTC and doesn't have to deal @@ -2538,6 +2546,7 @@ def test_mask_custom_value(scalars_dfs): # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions ], ) +@skip_legacy_pandas def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e6eb40a5fa..8ce442376a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -30,6 +30,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model +from tests.system.utils import skip_legacy_pandas FIRST_FILE = "000000000000" @@ -385,6 +386,7 @@ def test_read_pandas_tokyo( pd.testing.assert_frame_equal(result, expected) +@skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: @@ -441,6 +443,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): pytest.param("\t", id="custom_sep"), ], ) +@skip_legacy_pandas def test_read_csv_local_default_engine(session, scalars_dfs, sep): scalars_df, scalars_pandas_df = scalars_dfs with tempfile.TemporaryDirectory() as dir: diff --git a/tests/system/utils.py b/tests/system/utils.py index f49b5ece31..a4647b4f51 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -14,11 +14,23 @@ import base64 import decimal +import functools import geopandas as gpd # type: ignore import numpy as np import pandas as pd import pyarrow as pa # type: ignore +import pytest + + +def skip_legacy_pandas(test): + @functools.wraps(test) + def wrapper(*args, **kwds): + if pd.__version__.startswith("1."): + pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") + return test(*args, **kwds) + + return wrapper def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): @@ -133,16 +145,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["geography_col"].replace({np.nan: None}) ) - # Convert bytes types column. - if bytes_col: + if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): df["bytes_col"] = df["bytes_col"].apply( lambda value: base64.b64decode(value) if not pd.isnull(value) else value ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["bytes_col"]), + schema=pa.schema([("bytes_col", pa.binary())]), + ) + df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] - # Convert numeric types column. 
- df["numeric_col"] = df["numeric_col"].apply( - lambda value: decimal.Decimal(str(value)) if value else None # type: ignore - ) + if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): + # Convert numeric types column. + df["numeric_col"] = df["numeric_col"].apply( + lambda value: decimal.Decimal(str(value)) if value else None # type: ignore + ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["numeric_col"]), + schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), + ) + df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "numeric_col" + ] def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 6ceaaf911b..e648fd28cc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -31,11 +31,11 @@ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types pytest.param( ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal256(76, 38)), id="bignumeric", ), pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), - pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"), pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), pytest.param( ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" @@ -49,10 +49,9 @@ pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), # TODO(tswast): custom dtype (or at least string dtype) for JSON objects - pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), pytest.param( ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal128(38, 9)), id="numeric", ), pytest.param( From 8ea4a663bc81a49bb4425d9354d0ca2699a53410 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 23:41:45 +0000 Subject: [PATCH 6/9] docs: code samples for `drop` and `fillna` (#284) --- .../bigframes_vendored/pandas/core/frame.py | 141 ++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 81 ++++++++++ 2 files changed, 222 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 00be9e5e9e..427e586c52 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -869,6 +869,97 @@ def drop( Remove columns by directly specifying column names. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + [3 rows x 4 columns] + + Drop columns: + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + Drop a row by index: + + >>> df.drop([0, 1]) + A B C D + 2 8 9 10 11 + + [1 rows x 4 columns] + + Drop columns and/or rows of MultiIndex DataFrame: + + >>> import pandas as pd + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = bpd.DataFrame(index=midx, columns=['big', 'small'], + ... 
data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+        ...                           [250, 150], [1.5, 0.8], [320, 250],
+        ...                           [1, 0.8], [0.3, 0.2]])
+        >>> df
+                         big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+                length    1.5    1.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+                length    1.5    0.8
+        falcon  speed   320.0  250.0
+                weight    1.0    0.8
+                length    0.3    0.2
+
+        [9 rows x 2 columns]
+
+        Drop a specific index and column combination from the MultiIndex
+        DataFrame, i.e., drop the index ``'cow'`` and column ``'small'``:
+
+        >>> df.drop(index='cow', columns='small')
+                          big
+        llama   speed    45.0
+                weight  200.0
+                length    1.5
+        falcon  speed   320.0
+                weight    1.0
+                length    0.3
+
+        [6 rows x 1 columns]
+
+        >>> df.drop(index='length', level=1)
+                         big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+        falcon  speed   320.0  250.0
+                weight    1.0    0.8
+
+        [6 rows x 2 columns]
+
         Args:
             labels:
                 Index or column labels to drop.
@@ -4343,6 +4434,56 @@ def fillna(self, value):
         """
         Fill NA/NaN values using the specified method.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
+        ...                     [3, 4, np.nan, 1],
+        ...                     [np.nan, np.nan, np.nan, np.nan],
+        ...                     [np.nan, 3, np.nan, 4]],
+        ...                    columns=list("ABCD")).astype("Float64")
+        >>> df
+              A     B     C     D
+        0  <NA>   2.0  <NA>   0.0
+        1   3.0   4.0  <NA>   1.0
+        2  <NA>  <NA>  <NA>  <NA>
+        3  <NA>   3.0  <NA>   4.0
+
+        [4 rows x 4 columns]
+
+        Replace all NA elements with 0s.
+
+        >>> df.fillna(0)
+             A    B    C    D
+        0  0.0  2.0  0.0  0.0
+        1  3.0  4.0  0.0  1.0
+        2  0.0  0.0  0.0  0.0
+        3  0.0  3.0  0.0  4.0
+
+        [4 rows x 4 columns]
+
+        You can use fill values from another DataFrame:
+
+        >>> df_fill = bpd.DataFrame(np.arange(12).reshape(3, 4),
+        ...                         columns=['A', 'B', 'C', 'D'])
+        >>> df_fill
+           A  B   C   D
+        0  0  1   2   3
+        1  4  5   6   7
+        2  8  9  10  11
+
+        [3 rows x 4 columns]
+
+        >>> df.fillna(df_fill)
+              A    B     C     D
+        0   0.0  2.0   2.0   0.0
+        1   3.0  4.0   6.0   1.0
+        2   8.0  9.0  10.0  11.0
+        3  <NA>  3.0  <NA>   4.0
+
+        [4 rows x 4 columns]
+
         Args:
             value (scalar, Series):
                 Value to use to fill holes (e.g. 0), alternately a
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 366f32c77e..01cc3a0500 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1062,6 +1062,55 @@ def drop(
         When using a multi-index, labels on different levels can be removed
         by specifying the level.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C'])
+        >>> s
+        A    0
+        B    1
+        C    2
+        dtype: Int64
+
+        Drop labels B and C:
+
+        >>> s.drop(labels=['B', 'C'])
+        A    0
+        dtype: Int64
+
+        Drop 2nd level label in MultiIndex Series:
+
+        >>> import pandas as pd
+        >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
+        ...                              ['speed', 'weight', 'length']],
+        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+
+        >>> s = bpd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
+        ...                index=midx)
+        >>> s
+        llama   speed      45.0
+                weight    200.0
+                length      1.2
+        cow     speed      30.0
+                weight    250.0
+                length      1.5
+        falcon  speed     320.0
+                weight      1.0
+                length      0.3
+        dtype: Float64
+
+        >>> s.drop(labels='weight', level=1)
+        llama   speed      45.0
+                length      1.2
+        cow     speed      30.0
+                length      1.5
+        falcon  speed     320.0
+                length      0.3
+        dtype: Float64
+
         Args:
             labels (single label or list-like):
                 Index labels to drop.
@@ -1193,6 +1242,38 @@ def fillna(
         """
         Fill NA/NaN values using the specified method.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([np.nan, 2, np.nan, -1])
+        >>> s
+        0    <NA>
+        1     2.0
+        2    <NA>
+        3    -1.0
+        dtype: Float64
+
+        Replace all NA elements with 0s.
+
+        >>> s.fillna(0)
+        0     0.0
+        1     2.0
+        2     0.0
+        3    -1.0
+        dtype: Float64
+
+        You can use fill values from another Series:
+
+        >>> s_fill = bpd.Series([11, 22, 33])
+        >>> s.fillna(s_fill)
+        0    11.0
+        1     2.0
+        2    33.0
+        3    -1.0
+        dtype: Float64
+
         Args:
             value (scalar, dict, Series, or DataFrame, default None):
                 Value to use to fill holes (e.g. 0).
From a3ff76a200a5599c569404ada74c85dad3de37fe Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Fri, 22 Dec 2023 02:45:36 +0000
Subject: [PATCH 7/9] docs: code samples for `reset_index` and `sort_values` (#282)

* docs: code samples for `reset_index` and `sort_values`

* fix alignment in dataframe api code samples
---
 .../bigframes_vendored/pandas/core/frame.py   | 161 ++++++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 110 ++++++++++++
 2 files changed, 271 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 427e586c52..fb34193710 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1138,6 +1138,93 @@ def reset_index(
         Reset the index of the DataFrame, and use the default one instead.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> import numpy as np
+        >>> df = bpd.DataFrame([('bird', 389.0),
+        ...                     ('bird', 24.0),
+        ...                     ('mammal', 80.5),
+        ...                     ('mammal', np.nan)],
+        ...                    index=['falcon', 'parrot', 'lion', 'monkey'],
+        ...                    columns=('class', 'max_speed'))
+        >>> df
+                 class  max_speed
+        falcon    bird      389.0
+        parrot    bird       24.0
+        lion    mammal       80.5
+        monkey  mammal       <NA>
+
+        [4 rows x 2 columns]
+
+        When we reset the index, the old index is added as a column, and a new sequential index is used:
+
+        >>> df.reset_index()
+            index   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal       <NA>
+
+        [4 rows x 3 columns]
+
+        We can use the ``drop`` parameter to avoid the old index being added as a column:
+
+        >>> df.reset_index(drop=True)
+            class  max_speed
+        0    bird      389.0
+        1    bird       24.0
+        2  mammal       80.5
+        3  mammal       <NA>
+
+        [4 rows x 2 columns]
+
+        You can also use ``reset_index`` with ``MultiIndex``.
+
+        >>> import pandas as pd
+        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+        ...                                    ('bird', 'parrot'),
+        ...                                    ('mammal', 'lion'),
+        ...                                    ('mammal', 'monkey')],
+        ...                                   names=['class', 'name'])
+        >>> columns = ['speed', 'max']
+        >>> df = bpd.DataFrame([(389.0, 'fly'),
+        ...                     (24.0, 'fly'),
+        ...                     (80.5, 'run'),
+        ...                     (np.nan, 'jump')],
+        ...                    index=index,
+        ...                    columns=columns)
+        >>> df
+                       speed   max
+        class  name
+        bird   falcon  389.0   fly
+               parrot   24.0   fly
+        mammal lion     80.5   run
+               monkey   <NA>  jump
+
+        [4 rows x 2 columns]
+
+        >>> df.reset_index()
+            class    name  speed   max
+        0    bird  falcon  389.0   fly
+        1    bird  parrot   24.0   fly
+        2  mammal    lion   80.5   run
+        3  mammal  monkey   <NA>  jump
+
+        [4 rows x 4 columns]
+
+        >>> df.reset_index(drop=True)
+           speed   max
+        0  389.0   fly
+        1   24.0   fly
+        2   80.5   run
+        3   <NA>  jump
+
+        [4 rows x 2 columns]
+
+
         Args:
             drop (bool, default False):
                 Do not try to insert index into dataframe columns. This resets
@@ -1347,6 +1434,80 @@ def sort_values(
     ) -> DataFrame:
         """Sort by the values along row axis.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({
+        ...     'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'],
+        ...     'col2': [2, 1, 9, 8, 7, 4],
+        ...     'col3': [0, 1, 9, 4, 2, 3],
+        ...     'col4': ['a', 'B', 'c', 'D', 'e', 'F']
+        ... })
+        >>> df
+           col1  col2  col3 col4
+        0     A     2     0    a
+        1     A     1     1    B
+        2     B     9     9    c
+        3  <NA>     8     4    D
+        4     D     7     2    e
+        5     C     4     3    F
+
+        [6 rows x 4 columns]
+
+        Sort by col1:
+
+        >>> df.sort_values(by=['col1'])
+           col1  col2  col3 col4
+        0     A     2     0    a
+        1     A     1     1    B
+        2     B     9     9    c
+        5     C     4     3    F
+        4     D     7     2    e
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Sort by multiple columns:
+
+        >>> df.sort_values(by=['col1', 'col2'])
+           col1  col2  col3 col4
+        1     A     1     1    B
+        0     A     2     0    a
+        2     B     9     9    c
+        5     C     4     3    F
+        4     D     7     2    e
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Sort Descending:
+
+        >>> df.sort_values(by='col1', ascending=False)
+           col1  col2  col3 col4
+        4     D     7     2    e
+        5     C     4     3    F
+        2     B     9     9    c
+        0     A     2     0    a
+        1     A     1     1    B
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Putting NAs first:
+
+        >>> df.sort_values(by='col1', ascending=False, na_position='first')
+           col1  col2  col3 col4
+        3  <NA>     8     4    D
+        4     D     7     2    e
+        5     C     4     3    F
+        2     B     9     9    c
+        0     A     2     0    a
+        1     A     1     1    B
+
+        [6 rows x 4 columns]
+
         Args:
             by (str or Sequence[str]):
                 Name or list of names to sort by.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 01cc3a0500..778ad68e0e 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -168,6 +168,53 @@ def reset_index(
         when the index is meaningless and needs to be reset to the default
         before another operation.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([1, 2, 3, 4], name='foo',
+        ...                index=['a', 'b', 'c', 'd'])
+        >>> s.index.name = "idx"
+        >>> s
+        idx
+        a    1
+        b    2
+        c    3
+        d    4
+        Name: foo, dtype: Int64
+
+        Generate a DataFrame with default index.
+
+        >>> s.reset_index()
+          idx  foo
+        0   a    1
+        1   b    2
+        2   c    3
+        3   d    4
+
+        [4 rows x 2 columns]
+
+        To specify the name of the new column, use the ``name`` param.
+
+        >>> s.reset_index(name="bar")
+          idx  bar
+        0   a    1
+        1   b    2
+        2   c    3
+        3   d    4
+
+        [4 rows x 2 columns]
+
+        To generate a new Series with the default index, set param ``drop=True``.
+
+        >>> s.reset_index(drop=True)
+        0    1
+        1    2
+        2    3
+        3    4
+        Name: foo, dtype: Int64
+
         Args:
             drop (bool, default False):
                 Just reset the index, without inserting it as a column in
@@ -699,6 +746,69 @@ def sort_values(
         Sort a Series in ascending or descending order by
         some criterion.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([np.nan, 1, 3, 10, 5])
+        >>> s
+        0    <NA>
+        1     1.0
+        2     3.0
+        3    10.0
+        4     5.0
+        dtype: Float64
+
+        Sort values ascending order (default behaviour):
+
+        >>> s.sort_values(ascending=True)
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        0    <NA>
+        dtype: Float64
+
+        Sort values descending order:
+
+        >>> s.sort_values(ascending=False)
+        3    10.0
+        4     5.0
+        2     3.0
+        1     1.0
+        0    <NA>
+        dtype: Float64
+
+        Sort values putting NAs first:
+
+        >>> s.sort_values(na_position='first')
+        0    <NA>
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        dtype: Float64
+
+        Sort a series of strings:
+
+        >>> s = bpd.Series(['z', 'b', 'd', 'a', 'c'])
+        >>> s
+        0    z
+        1    b
+        2    d
+        3    a
+        4    c
+        dtype: string
+
+        >>> s.sort_values()
+        3    a
+        1    b
+        4    c
+        2    d
+        0    z
+        dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
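
The two sample families in this patch compose naturally: `sort_values` permutes the rows (and their index labels), and `reset_index(drop=True)` then restores a clean sequential index. A minimal sketch of that round trip, assuming a configured BigQuery session and reusing the `col1`/`col2` data from the samples above:

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    df = bpd.DataFrame({
        "col1": ["A", "A", "B", bpd.NA, "D", "C"],
        "col2": [2, 1, 9, 8, 7, 4],
    })

    # NAs sort to the end by default; na_position="first" pulls them to the top.
    out = df.sort_values(by="col1", ascending=False, na_position="first")

    # Sorting shuffles the index; drop it so downstream code sees 0..n-1 again.
    out = out.reset_index(drop=True)
    print(out.to_pandas())
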
From 8dab439a2730d92dea4e4d6cafb7b543a2f8c02b Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 26 Dec 2023 18:29:00 +0000
Subject: [PATCH 8/9] docs: code samples for `isna`, `isnull`, `dropna`,
 `isin` (#289)

* docs: code samples for `isna`, `isnull`, `dropna`, `isin`

* fix header alignment in rendering
---
 .../bigframes_vendored/pandas/core/frame.py   | 81 +++++++++++++++++++
 .../bigframes_vendored/pandas/core/generic.py | 65 +++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 70 ++++++++++++++++
 3 files changed, 216 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index fb34193710..2de63b9103 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1289,9 +1289,57 @@ def duplicated(self, subset=None, keep="first"):
 
     def dropna(
         self,
+        *,
+        axis: int | str = 0,
+        how: str = "any",
+        ignore_index=False,
     ) -> DataFrame:
         """Remove missing values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+        ...                     "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+        ...                     "born": [bpd.NA, "1940-04-25", bpd.NA]})
+        >>> df
+               name        toy        born
+        0    Alfred       <NA>        <NA>
+        1    Batman  Batmobile  1940-04-25
+        2  Catwoman   Bullwhip        <NA>
+
+        [3 rows x 3 columns]
+
+        Drop the rows where at least one element is missing:
+
+        >>> df.dropna()
+             name        toy        born
+        1  Batman  Batmobile  1940-04-25
+
+        [1 rows x 3 columns]
+
+        Drop the columns where at least one element is missing.
+
+        >>> df.dropna(axis='columns')
+               name
+        0    Alfred
+        1    Batman
+        2  Catwoman
+
+        [3 rows x 1 columns]
+
+        Drop the rows where all elements are missing:
+
+        >>> df.dropna(how='all')
+               name        toy        born
+        0    Alfred       <NA>        <NA>
+        1    Batman  Batmobile  1940-04-25
+        2  Catwoman   Bullwhip        <NA>
+
+        [3 rows x 3 columns]
+
         Args:
             axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
                 Determine if rows or columns which contain missing values are
@@ -1318,6 +1366,39 @@ def isin(self, values):
         """
         Whether each element in the DataFrame is contained in values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+        ...                    index=['falcon', 'dog'])
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+
+        [2 rows x 2 columns]
+
+        When ``values`` is a list check whether every value in the DataFrame is
+        present in the list (which animals have 0 or 2 legs or wings).
+
+        >>> df.isin([0, 2])
+                num_legs  num_wings
+        falcon      True       True
+        dog        False       True
+
+        [2 rows x 2 columns]
+
+        When ``values`` is a dict, we can pass it to check for each column separately:
+
+        >>> df.isin({'num_wings': [0, 3]})
+                num_legs  num_wings
+        falcon     False      False
+        dog        False       True
+
+        [2 rows x 2 columns]
+
         Args:
             values (iterable, or dict):
                 The result will only be true at a location if all the
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index ca5c6344ce..2885162fd6 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -499,6 +499,71 @@ def isna(self) -> NDFrame:
         False values. Characters such as empty strings ``''``
         or :attr:`numpy.inf` are not considered NA values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> import numpy as np
+
+        >>> df = bpd.DataFrame(dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[bpd.NA, "1940-04-25", "1940-04-25"],
+        ...         name=['Alfred', 'Batman', ''],
+        ...         toy=[None, 'Batmobile', 'Joker'],
+        ... ))
+        >>> df
+            age        born    name        toy
+        0   5.0        <NA>  Alfred       <NA>
+        1   6.0  1940-04-25  Batman  Batmobile
+        2  <NA>  1940-04-25              Joker
+
+        [3 rows x 4 columns]
+
+        Show which entries in a DataFrame are NA:
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        [3 rows x 4 columns]
+
+        >>> df.isnull()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        [3 rows x 4 columns]
+
+        Show which entries in a Series are NA:
+
+        >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
+        >>> ser
+        0     5.0
+        1    <NA>
+        2     6.0
+        3    <NA>
+        4    <NA>
+        dtype: Float64
+
+        >>> ser.isna()
+        0    False
+        1     True
+        2    False
+        3     True
+        4     True
+        dtype: boolean
+
+        >>> ser.isnull()
+        0    False
+        1     True
+        2    False
+        3     True
+        4     True
+        dtype: boolean
+
         Returns:
             Mask of bool values for each element that
             indicates whether an element is an NA value.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 778ad68e0e..cbe0963051 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1460,6 +1460,42 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series:
         """
         Return a new Series with missing values removed.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        Drop NA values from a Series:
+
+        >>> ser = bpd.Series([1., 2., np.nan])
+        >>> ser
+        0     1.0
+        1     2.0
+        2    <NA>
+        dtype: Float64
+
+        >>> ser.dropna()
+        0    1.0
+        1    2.0
+        dtype: Float64
+
+        Empty strings are not considered NA values. ``None`` is considered an NA value.
+
+        >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object')
+        >>> ser
+        0         2
+        1      <NA>
+        2
+        3      <NA>
+        4    I stay
+        dtype: string
+
+        >>> ser.dropna()
+        0         2
+        2
+        4    I stay
+        dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
@@ -2531,6 +2567,40 @@ def isin(self, values):
         the same. That is, if any form of NaN is present in values, all forms
         of NaN in the series will be considered a match. (though pandas may not)
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama',
+        ...                 'hippo'], name='animal')
+        >>> s
+        0     llama
+        1       cow
+        2     llama
+        3    beetle
+        4     llama
+        5     hippo
+        Name: animal, dtype: string
+
+        >>> s.isin(['cow', 'llama'])
+        0     True
+        1     True
+        2     True
+        3    False
+        4     True
+        5    False
+        Name: animal, dtype: boolean
+
+        Strings and integers are distinct and are therefore not comparable:
+
+        >>> bpd.Series([1]).isin(['1'])
+        0    False
+        dtype: boolean
+        >>> bpd.Series([1.1]).isin(['1.1'])
+        0    False
+        dtype: boolean
+
         Args:
             values (list-like):
                 The sequence of values to test.
                Passing in a single string will raise a
From e9c53ee5fac198491177ee635df998b0e7d54c71 Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Wed, 27 Dec 2023 00:23:45 +0000
Subject: [PATCH 9/9] test update
---
 tests/system/small/test_dataframe_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index fc8c2549cf..6f1b31b48e 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -300,7 +300,7 @@ def test_to_gbq_w_None_column_names(
     scalars_df_index, scalars_pandas_df_index, dataset_id
 ):
     """Test the `to_gbq` API with None as a column name."""
-    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
 
     scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
     scalars_df_index.to_gbq(destination_table, if_exists="replace")
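
As the test above exercises, writing a frame that contains an unnamed column should now succeed end to end. A sketch of the user-facing flow, assuming an authenticated BigQuery session; the destination IDs below are placeholders rather than values from the patch, and the source is a public sample table:

    import bigframes.pandas as bpd

    # Placeholder project/dataset/table IDs; substitute your own.
    destination = "my-project.my_dataset.unnamed_column_demo"

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # Renaming a column to None leaves it unnamed; per this series, the export
    # is expected to write it under a generated default name instead of failing.
    df = df.rename(columns={"sex": None})
    df.to_gbq(destination, if_exists="replace")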