From 00f3ffd627125e513ffee74e7fb5bb10810bf939 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 21 Dec 2023 01:22:52 +0000 Subject: [PATCH 1/9] Fix: Update dataframe.to_gbq to convert non-string column names to string, and dedup column names. --- bigframes/core/utils.py | 12 ++++++- bigframes/dataframe.py | 15 +++++---- tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++ tests/unit/core/test_bf_utils.py | 22 +++++++++++++ 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index dc7c709011..526b7a9bb8 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import typing -from typing import Hashable, Iterable, List +from typing import Hashable, Iterable, List, Optional, Union +from ibis.backends.bigquery.compiler import _NAME_REGEX import pandas as pd import typing_extensions @@ -69,6 +70,15 @@ def split_index( return (None, index) +def gen_valid_names( + names: Iterable[Hashable], default_name: Optional[str] = None +) -> List[Union[str, None]]: + return [ + default_name if name is None else "_".join(_NAME_REGEX.findall(str(name))) + for name in names + ] + + def get_standardized_ids( col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () ) -> tuple[list[str], list[str]]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 98aa8f1185..f030315275 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2705,26 +2705,29 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr + + new_col_labels, new_idx_labels = utils.get_standardized_ids( + utils.gen_valid_names(self._block.column_labels), + utils.gen_valid_names(self.index.names), + ) + columns = list(self._block.value_columns) - column_labels = list(self._block.column_labels) + column_labels = new_col_labels # This code drops unnamed indexes to keep consistent with the behavior of # most pandas write APIs. The exception is `pandas.to_csv`, which keeps # unnamed indexes as `Unnamed: 0`. # TODO(chelsealin): check if works for multiple indexes. if index and self.index.name is not None: columns.extend(self._block.index_columns) - column_labels.extend(self.index.names) + column_labels.extend(new_idx_labels) else: array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. - # TODO(swast): Handle duplicate and NULL labels. 
id_overrides = {
-            col_id: col_label
-            for col_id, col_label in zip(columns, column_labels)
-            if col_label and isinstance(col_label, str)
+            col_id: col_label for col_id, col_label in zip(columns, column_labels)
         }
 
     if ordering_id is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index fb9fb7bb89..1f613e6509 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -265,6 +265,50 @@ def test_to_gbq_if_exists(
     )
 
 
+def test_to_gbq_w_duplicate_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API when dealing with duplicate column names."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+
+    # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
+    # becoming 'int64_col_1' after deduplication.
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["int64_col_1"],
+        check_names=False,
+    )
+
+
+def test_to_gbq_w_None_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API with None as a column name."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
+
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["bigframes_unnamed_column"],
+        check_names=False,
+    )
+
+
 def test_to_gbq_w_invalid_destination_table(scalars_df_index):
     with pytest.raises(ValueError):
         scalars_df_index.to_gbq("table_id")
diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py
index 10ce1fd09e..7c07706116 100644
--- a/tests/unit/core/test_bf_utils.py
+++ b/tests/unit/core/test_bf_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
+
 from bigframes.core import utils
 
 
@@ -31,6 +33,26 @@ def test_get_standardized_ids_columns():
     assert idx_ids == []
 
 
+@pytest.mark.parametrize(
+    "names, default_name, expected",
+    [
+        (
+            ["aaaa", "aa#$%^&", "HKJ3::>,."],
+            None,
+            ["aaaa", "aa#_%_&", "HKJ3::>"],
+        ),
+        (
+            [None, "aa#$%^&", "HKJ3::>,."],
+            "DefaultName",
+            ["DefaultName", "aa#_%_&", "HKJ3::>"],
+        ),
+        ([None, "aa#$%^&", "HKJ3::>,....."], None, [None, "aa#_%_&", "HKJ3::>"]),
+    ],
+)
+def test_gen_valid_names(names, default_name, expected):
+    assert utils.gen_valid_names(names, default_name) == expected
+
+
 def test_get_standardized_ids_indexes():
     col_labels = ["duplicate"]
     idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"]
From 085618a2b66143794c9fcc44ddb92b47f5b48a4a Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Tue, 26 Dec 2023 19:05:34 +0000
Subject: [PATCH 2/9] remove escaping.
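
The `_NAME_REGEX` escaping introduced in the previous commit is dropped:
`to_gbq` now routes every label through `get_standardized_ids`, which already
stringifies non-string labels, substitutes a placeholder for `None`, and
de-duplicates collisions. A minimal sketch of that normalization contract
(illustrative only -- the helper below is not the library code; the
`bigframes_unnamed_column` placeholder is taken from the tests above):

    from typing import Hashable, Iterable, List

    UNNAMED_COLUMN = "bigframes_unnamed_column"

    def standardize_labels(labels: Iterable[Hashable]) -> List[str]:
        # Stringify, default None, then suffix duplicates with a counter.
        seen: dict = {}
        out: List[str] = []
        for label in labels:
            name = UNNAMED_COLUMN if label is None else str(label)
            count = seen.get(name, 0)
            seen[name] = count + 1
            out.append(name if count == 0 else f"{name}_{count}")
        return out

    assert standardize_labels(["int64_col", "int64_col", None]) == [
        "int64_col", "int64_col_1", "bigframes_unnamed_column"
    ]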
--- bigframes/core/utils.py | 12 +----------- bigframes/dataframe.py | 3 +-- tests/unit/core/test_bf_utils.py | 20 -------------------- 3 files changed, 2 insertions(+), 33 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 526b7a9bb8..dc7c709011 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import typing -from typing import Hashable, Iterable, List, Optional, Union +from typing import Hashable, Iterable, List -from ibis.backends.bigquery.compiler import _NAME_REGEX import pandas as pd import typing_extensions @@ -70,15 +69,6 @@ def split_index( return (None, index) -def gen_valid_names( - names: Iterable[Hashable], default_name: Optional[str] = None -) -> List[Union[str, None]]: - return [ - default_name if name is None else "_".join(_NAME_REGEX.findall(str(name))) - for name in names - ] - - def get_standardized_ids( col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () ) -> tuple[list[str], list[str]]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f030315275..af673746c3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2707,8 +2707,7 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: array_value = self._block.expr new_col_labels, new_idx_labels = utils.get_standardized_ids( - utils.gen_valid_names(self._block.column_labels), - utils.gen_valid_names(self.index.names), + self._block.column_labels, self.index.names ) columns = list(self._block.value_columns) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 7c07706116..66f5b0eed3 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -33,26 +33,6 @@ def test_get_standardized_ids_columns(): assert idx_ids == [] -@pytest.mark.parametrize( - "names, default_name, expected", - [ - ( - ["aaaa", "aa#$%^&", "HKJ3::>,."], - None, - ["aaaa", "aa#_%_&", "HKJ3::>"], - ), - ( - [None, "aa#$%^&", "HKJ3::>,."], - "DefaultName", - ["DefaultName", "aa#_%_&", "HKJ3::>"], - ), - ([None, "aa#$%^&", "HKJ3::>,....."], None, [None, "aa#_%_&", "HKJ3::>"]), - ], -) -def test_gen_valid_names(names, default_name, expected): - assert utils.gen_valid_names(names, default_name) == expected - - def test_get_standardized_ids_indexes(): col_labels = ["duplicate"] idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] From 9aea94c020ffacdc2abe2c78358081ef6571060d Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 26 Dec 2023 19:07:17 +0000 Subject: [PATCH 3/9] remove unused import --- tests/unit/core/test_bf_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 66f5b0eed3..10ce1fd09e 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytest - from bigframes.core import utils From 65de270003e6cfa49abb758a6537007f73aebf54 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 00:33:37 +0000 Subject: [PATCH 4/9] fix: make `Series.str.replace` work for simple strings (#285) --- bigframes/operations/__init__.py | 2 +- tests/system/small/operations/test_strings.py | 2 ++ .../bigframes_vendored/pandas/core/series.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 753870a42d..678774978a 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -385,7 +385,7 @@ def _as_ibis(self, x: ibis_types.Value): ibis_types.StringValue, ibis_types.literal(self._pat) ) repl_str_value = typing.cast( - ibis_types.StringValue, ibis_types.literal(self._pat) + ibis_types.StringValue, ibis_types.literal(self._repl) ) return typing.cast(ibis_types.StringValue, x).replace( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 27a35134d4..79f92c94b4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -94,6 +94,8 @@ def test_str_extract(scalars_dfs, pat): (".*", "blah", True, 0, True), ("h.l", "blah", False, 0, True), (re.compile("(?i).e.."), "blah", None, 0, True), + ("H", "h", True, 0, False), + (", ", "__", True, 0, False), ], ) def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index d054684598..366f32c77e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2304,6 +2304,24 @@ def str(self): NAs stay NA unless handled otherwise by a particular method. Patterned after Python’s string methods, with some inspiration from R’s stringr package. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: string + + >>> s.str.lower() + 0 a_str_series + dtype: string + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: string + Returns: bigframes.operations.strings.StringMethods: An accessor containing string methods. 
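
A note on the one-line fix in `bigframes/operations/__init__.py` above: the
replacement literal was previously built from `self._pat` instead of
`self._repl`, so a simple (non-regex) `Series.str.replace` substituted the
pattern with itself, i.e. a silent no-op. A minimal sketch of the before and
after semantics in plain Python, standing in for the ibis expressions (not
the library code):

    def replace_buggy(value: str, pat: str, repl: str) -> str:
        return value.replace(pat, pat)  # wrong literal: pat replaces itself

    def replace_fixed(value: str, pat: str, repl: str) -> str:
        return value.replace(pat, repl)  # uses the actual replacement string

    assert replace_buggy("Hello, World!", "H", "h") == "Hello, World!"
    assert replace_fixed("Hello, World!", "H", "h") == "hello, World!"

The new test cases `("H", "h", True, 0, False)` and `(", ", "__", True, 0,
False)` exercise exactly this non-regex path.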
From 0b262c33f4d544041ad651d28fc48ada2fafddc0 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Dec 2023 18:35:18 -0800 Subject: [PATCH 5/9] feat: specific pyarrow mappings for decimal, bytes types (#283) * feat: new bytes, json, decimal type mappings * amend tests to reflect new types * add implicit type conversion for df.replace * more type casting tests * skip pandas 1.x for more tests --------- Co-authored-by: Tim Swast --- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 2 +- bigframes/core/compile/compiled.py | 14 +- bigframes/core/groupby/__init__.py | 7 +- bigframes/dataframe.py | 20 +-- bigframes/dtypes.py | 139 ++++++++++++++------ bigframes/series.py | 24 +++- tests/system/large/ml/test_compose.py | 4 +- tests/system/large/ml/test_core.py | 1 + tests/system/small/ml/test_core.py | 3 +- tests/system/small/ml/test_imported.py | 2 + tests/system/small/ml/test_llm.py | 10 +- tests/system/small/ml/test_preprocessing.py | 16 +++ tests/system/small/ml/test_remote.py | 1 + tests/system/small/test_dataframe.py | 37 +++--- tests/system/small/test_dataframe_io.py | 18 ++- tests/system/small/test_multiindex.py | 7 +- tests/system/small/test_series.py | 15 ++- tests/system/small/test_session.py | 3 + tests/system/utils.py | 36 ++++- tests/unit/test_dtypes.py | 7 +- 21 files changed, 267 insertions(+), 101 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 6654892287..c6867c1a33 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -131,7 +131,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' supports multi-index") xvalues = block.index_columns[0] - if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise ValueError("Can only interpolate on numeric index.") for column in original_columns: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3163aa5b09..779d11b371 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1063,7 +1063,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]: stats: list[agg_ops.AggregateOp] = [agg_ops.count_op] if dtype not in bigframes.dtypes.UNORDERED_DTYPES: stats += [agg_ops.min_op, agg_ops.max_op] - if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: # prod op tends to cause overflows # Also, var_op is redundant as can be derived from std diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d6183228d1..199c8db785 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast(ibis_types.Value, self._column_names[key]) + return typing.cast( + ibis_types.Value, + bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -1177,7 +1180,14 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. 
table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.dtypes.ibis_value_to_canonical_type( + column.resolve(self._table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ) + for column in columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 3ee46ef675..66ba901649 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -359,7 +359,8 @@ def _convert_index(self, dataframe: df.DataFrame): def _raise_on_non_numeric(self, op: str): if not all( - dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes + dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + for dtype in self._block.dtypes ): raise NotImplementedError( f"'{op}' does not support non-numeric columns. " @@ -371,7 +372,9 @@ def _raise_on_non_numeric(self, op: str): def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]: valid_agg_cols: list[str] = [] for col_id in self._selected_cols: - is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES + is_numeric = ( + self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) if is_numeric or not numeric_only: valid_agg_cols.append(col_id) return valid_agg_cols diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index af673746c3..88924135ff 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1800,7 +1800,7 @@ def agg( ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self.dtypes ): raise NotImplementedError( @@ -1867,7 +1867,7 @@ def melt( ) def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(keep_bool=False) + df_numeric = self._drop_non_numeric(permissive=False) if len(df_numeric.columns) == 0: raise NotImplementedError( f"df.describe() currently only supports numeric values. 
{constants.FEEDBACK_LINK}" @@ -2005,10 +2005,12 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) - def _drop_non_numeric(self, keep_bool=True) -> DataFrame: - types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) - if not keep_bool: - types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES) + def _drop_non_numeric(self, permissive=True) -> DataFrame: + types_to_keep = ( + set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) + if permissive + else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) + ) non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) @@ -2026,7 +2028,7 @@ def _drop_non_bool(self) -> DataFrame: def _raise_on_non_numeric(self, op: str): if not all( - dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self._block.dtypes ): raise NotImplementedError( @@ -2301,7 +2303,7 @@ def notna(self) -> DataFrame: def cumsum(self): is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): @@ -2313,7 +2315,7 @@ def cumsum(self): def cumprod(self) -> DataFrame: is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 891c372a10..b754acea2e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -15,6 +15,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" import datetime +import decimal import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -30,6 +31,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -40,9 +42,6 @@ pd.ArrowDtype, ] -# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) -NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] - # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -57,6 +56,9 @@ "timestamp[us][pyarrow]", "date32[day][pyarrow]", "time64[us][pyarrow]", + "decimal128(38, 9)[pyarrow]", + "decimal256(38, 9)[pyarrow]", + "binary[pyarrow]", ] # Type hints for Ibis data types supported by BigQuery DataFrame @@ -72,8 +74,17 @@ BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()] -# Several operations are restricted to these types. 
-NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] +# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) +# Pandas is inconsistent, so two definitions are provided, each used in different contexts +NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [ + pd.Float64Dtype(), + pd.Int64Dtype(), +] +NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + [ + pd.BooleanDtype(), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), +] # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame ReadOnlyIbisDtype = Union[ @@ -97,6 +108,15 @@ ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), ), + (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), + ( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + pd.ArrowDtype(pa.decimal128(38, 9)), + ), + ( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + pd.ArrowDtype(pa.decimal256(76, 38)), + ), ) BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { @@ -112,6 +132,9 @@ ibis_dtypes.time: pa.time64("us"), ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), + ibis_dtypes.binary: pa.binary(), + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), } ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} @@ -125,10 +148,6 @@ ) IBIS_TO_BIGFRAMES.update( { - ibis_dtypes.binary: np.dtype("O"), - ibis_dtypes.json: np.dtype("O"), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"), ibis_dtypes.GeoSpatial( geotype="geography", srid=4326, nullable=True ): gpd.array.GeometryDtype(), @@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype( # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) if isinstance(ibis_dtype, ibis_dtypes.Array): - return np.dtype("O") + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) @@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype( def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if isinstance(ibis_dtype, ibis_dtypes.Array): - return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + return pa.list_( + ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True)) + ) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pa.struct( @@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: This is useful in cases where multiple types correspond to the same BigFrames dtype. """ ibis_type = value.type() + name = value.get_name() + if ibis_type.is_json(): + value = vendored_ibis_ops.ToJsonString(value).to_expr() + return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(value.get_name()) - - -def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: - """Converts an Ibis table expression to canonical types. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. 
-    """
-    casted_columns = []
-    for column_name in table.columns:
-        column = typing.cast(ibis_types.Value, table[column_name])
-        casted_columns.append(ibis_value_to_canonical_type(column))
-    return table.select(*casted_columns)
+    return value.cast(nullable_type).name(name)
 
 
 def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
@@ -386,15 +399,35 @@ def cast_ibis_value(
             ibis_dtypes.bool,
             ibis_dtypes.float64,
             ibis_dtypes.string,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.float64: (
+            ibis_dtypes.string,
+            ibis_dtypes.int64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.string: (
+            ibis_dtypes.int64,
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+            ibis_dtypes.binary,
         ),
-        ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64),
-        ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64),
         ibis_dtypes.date: (ibis_dtypes.string,),
-        ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,),
-        ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,),
+        ibis_dtypes.Decimal(precision=38, scale=9): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.Decimal(precision=76, scale=38): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+        ),
         ibis_dtypes.time: (),
         ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
         ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
+        ibis_dtypes.binary: (ibis_dtypes.string,),
     }
 
     value = ibis_value_to_canonical_type(value)
@@ -458,30 +491,62 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
     return False
 
 
+# Note: pyarrow binary (BigQuery BYTES) scalars surface as Python `bytes`; see the pa.binary() branch below.
 def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
     """Determine whether a scalar's type matches a given pyarrow type."""
     if pa_type == pa.time64("us"):
         return isinstance(scalar, datetime.time)
-    if pa_type == pa.timestamp("us"):
+    elif pa_type == pa.timestamp("us"):
         if isinstance(scalar, datetime.datetime):
             return not scalar.tzinfo
         if isinstance(scalar, pd.Timestamp):
             return not scalar.tzinfo
-    if pa_type == pa.timestamp("us", tz="UTC"):
+    elif pa_type == pa.timestamp("us", tz="UTC"):
         if isinstance(scalar, datetime.datetime):
             return scalar.tzinfo == datetime.timezone.utc
         if isinstance(scalar, pd.Timestamp):
             return scalar.tzinfo == datetime.timezone.utc
-    if pa_type == pa.date32():
+    elif pa_type == pa.date32():
         return isinstance(scalar, datetime.date)
+    elif pa_type == pa.binary():
+        return isinstance(scalar, bytes)
+    elif pa_type == pa.decimal128(38, 9):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
+    elif pa_type == pa.decimal256(76, 38):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
     return False
 
 
-def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
-    """Whether scalar can be compare to items of dtype (though maybe requiring coercion)"""
+def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion). 
Returns the datatype that must be used for the comparison""" if is_dtype(scalar, dtype): - return True + return dtype elif pd.api.types.is_numeric_dtype(dtype): - return pd.api.types.is_number(scalar) - else: - return False + # Implicit conversion currently only supported for numeric types + if pd.api.types.is_bool(scalar): + return lcd_type(pd.BooleanDtype(), dtype) + if pd.api.types.is_float(scalar): + return lcd_type(pd.Float64Dtype(), dtype) + if pd.api.types.is_integer(scalar): + return lcd_type(pd.Int64Dtype(), dtype) + if isinstance(scalar, decimal.Decimal): + # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC + return lcd_type(pd.ArrowDtype(pa.decimal128(76, 38)), dtype) + return None + + +def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: + # Implicit conversion currently only supported for numeric types + hierarchy: list[Dtype] = [ + pd.BooleanDtype(), + pd.Int64Dtype(), + pd.Float64Dtype(), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), + ] + if (dtype1 not in hierarchy) or (dtype2 not in hierarchy): + return None + lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2)) + return hierarchy[lcd_index] diff --git a/bigframes/series.py b/bigframes/series.py index 6837c1c7f8..eefd2b755d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -16,6 +16,7 @@ from __future__ import annotations +import functools import itertools import numbers import textwrap @@ -455,7 +456,7 @@ def replace( else: # Scalar replace_list = [to_replace] replace_list = [ - i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype) + i for i in replace_list if bigframes.dtypes.is_compatible(i, self.dtype) ] return self._simple_replace(replace_list, value) if replace_list else self @@ -472,11 +473,15 @@ def _regex_replace(self, to_replace: str, value: str): return Series(block.select_column(result_col)) def _simple_replace(self, to_replace_list: typing.Sequence, value): - if not bigframes.dtypes.is_dtype(value, self.dtype): + result_type = bigframes.dtypes.is_compatible(value, self.dtype) + if not result_type: raise NotImplementedError( f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) + if result_type != self.dtype: + return self.astype(result_type)._simple_replace(to_replace_list, value) + block, cond = self._block.apply_unary_op( self._value_column, ops.IsInOp(to_replace_list) ) @@ -490,15 +495,26 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): tuples = [] + lcd_types: list[typing.Optional[bigframes.dtypes.Dtype]] = [] for key, value in mapping.items(): - if not bigframes.dtypes.is_comparable(key, self.dtype): + lcd_type = bigframes.dtypes.is_compatible(key, self.dtype) + if not lcd_type: continue if not bigframes.dtypes.is_dtype(value, self.dtype): raise NotImplementedError( f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) tuples.append((key, value)) + lcd_types.append(lcd_type) + result_dtype = functools.reduce( + lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) if (t1 and t2) else None, + lcd_types, + ) + if not result_dtype: + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible mapping {mapping} as mixed-type columns not supported. 
{constants.FEEDBACK_LINK}" + ) block, result = self._block.apply_unary_op( self._value_column, ops.MapOp(tuple(tuples)) ) @@ -782,7 +798,7 @@ def _central_moment(self, n: int) -> float: def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: if _is_list_like(func): - if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise NotImplementedError( f"Multiple aggregations only supported on numeric series. {constants.FEEDBACK_LINK}" ) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0c280e5d02..6ea4f72489 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -72,7 +72,7 @@ def test_columntransformer_standalone_fit_and_transform( expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): @@ -123,4 +123,4 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index 3b30d7eb1d..df387e6ee1 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -184,4 +184,5 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): expected, check_exact=False, rtol=0.1, + check_dtype=False, ) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index eece5ef21d..f39815aec2 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -292,11 +292,12 @@ def test_model_predict_with_unnamed_index( def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): - predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), ) + predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() pd.testing.assert_frame_equal( predictions[["predicted_body_mass_g"]].sort_index(), expected, diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 9008e85a0b..8ffd9924e9 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -51,6 +51,7 @@ def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) @@ -90,6 +91,7 @@ def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 267a2ed9c1..fd1b803eea 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np import pytest from bigframes.ml import llm @@ -202,8 +201,7 @@ def test_embedding_generator_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -215,8 +213,7 @@ def test_embedding_generator_multilingual_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -228,5 +225,4 @@ def test_embedding_generator_predict_series_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 45548acca3..c3bd7f3b87 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -15,6 +15,7 @@ import math import pandas as pd +import pyarrow as pa import bigframes.ml.preprocessing @@ -453,6 +454,9 @@ def test_one_hot_encoder_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -482,6 +486,9 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -507,6 +514,9 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -537,6 +547,9 @@ def test_one_hot_encoder_params(new_penguins_df): [{"index": 0, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -567,6 +580,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index e8eb1c85e8..5036cdadfc 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -29,5 +29,6 @@ def test_remote_linear_vertex_model_predict( predictions[["predicted_body_mass_g"]].sort_index(), expected, check_exact=False, + check_dtype=False, rtol=0.1, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ed78e73e5d..86b8cfbe66 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -19,7 +19,6 @@ from typing import Tuple import geopandas as gpd # type: ignore -import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -29,7 +28,11 @@ import bigframes._config.display_options as 
display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_df_construct_copy(scalars_dfs): @@ -273,19 +276,19 @@ def test_df_info(scalars_dfs): " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null object\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" " 2 date_col 7 non-null date32[day][pyarrow]\n" " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" " 4 geography_col 4 non-null geometry\n" " 5 int64_col 8 non-null Int64\n" " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null object\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" " 8 float64_col 7 non-null Float64\n" " 9 rowindex_2 9 non-null Int64\n" " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" "memory usage: 945 bytes\n" ) @@ -362,6 +365,7 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd.testing.assert_frame_equal(pd_result, bf_result) +@skip_legacy_pandas def test_drop_bigframes_multiindex(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() @@ -841,13 +845,11 @@ def test_df_fillna(scalars_dfs): def test_df_replace_scalar_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_df_replace_regex_scalar(scalars_dfs): @@ -863,12 +865,14 @@ def test_df_replace_regex_scalar(scalars_dfs): def test_df_replace_list_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + # pandas has narrower result types as they are determined dynamically pd.testing.assert_frame_equal( pd_result, bf_result, + check_dtype=False, ) @@ -1198,13 +1202,13 @@ def test_get_dtypes(scalars_df_default_index): pd.Series( { "bool_col": pd.BooleanDtype(), - "bytes_col": np.dtype("O"), + "bytes_col": pd.ArrowDtype(pa.binary()), "date_col": pd.ArrowDtype(pa.date32()), "datetime_col": pd.ArrowDtype(pa.timestamp("us")), "geography_col": gpd.array.GeometryDtype(), "int64_col": pd.Int64Dtype(), "int64_too": pd.Int64Dtype(), - "numeric_col": np.dtype("O"), + "numeric_col": 
pd.ArrowDtype(pa.decimal128(38, 9)),
                "float64_col": pd.Float64Dtype(),
                "rowindex": pd.Int64Dtype(),
                "rowindex_2": pd.Int64Dtype(),
@@ -1232,7 +1236,7 @@ def test_get_dtypes_array_struct(session):
         dtypes,
         pd.Series(
             {
-                "array_column": np.dtype("O"),
+                "array_column": pd.ArrowDtype(pa.list_(pa.int64())),
                 "struct_column": pd.ArrowDtype(
                     pa.struct(
                         [
@@ -2138,6 +2142,7 @@ def test_dataframe_agg_multi_string(scalars_dfs):
     ).all()
 
 
+@skip_legacy_pandas
 def test_df_describe(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     # pyarrows time columns fail in pandas
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 1f613e6509..fc8c2549cf 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -56,7 +56,9 @@ def test_to_pandas_array_struct_correct_result(session):
     result = df.to_pandas()
     expected = pd.DataFrame(
         {
-            "array_column": [[1, 3, 2]],
+            "array_column": pd.Series(
+                [[1, 3, 2]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
+            ),
             "struct_column": pd.Series(
                 [{"string_field": "a", "float_field": 1.2}],
                 dtype=pd.ArrowDtype(
@@ -91,7 +93,8 @@ def test_load_json(session):
     expected = pd.DataFrame(
         {
             "json_column": ['{"bar":true,"foo":10}'],
-        }
+        },
+        dtype=pd.StringDtype(storage="pyarrow"),
     )
     expected.index = expected.index.astype("Int64")
     pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
@@ -137,6 +140,8 @@ def test_to_csv_index(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         dtype=dtype,
@@ -148,7 +153,6 @@ def test_to_csv_index(
     scalars_pandas_df = scalars_pandas_df.copy()
     scalars_pandas_df.index = scalars_pandas_df.index.astype("int64")
 
-    # Ordering should be maintained for tables smaller than 1 GB.
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
 
@@ -174,6 +178,8 @@ def test_to_csv_tabs(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         sep="\t",
@@ -216,6 +222,8 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index):
     df_out = df_out.sort_values("rowindex_2").reset_index(drop=True)
 
     convert_pandas_dtypes(df_out, bytes_col=False)
+    # pd.read_gbq interprets bytes_col as object, reconvert to pyarrow binary
+    df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary()))
     expected = scalars_pandas_df.copy()
     expected.index.name = index_col
     pd.testing.assert_frame_equal(df_out, expected, check_index_type=False)
@@ -421,7 +429,9 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
     scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64")
 
     # Ordering should be maintained for tables smaller than 1 GB. 
- pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) + pd.testing.assert_frame_equal( + gcs_df.drop("bytes_col", axis=1), scalars_pandas_df.drop("bytes_col", axis=1) + ) def test_to_sql_query_unnamed_index_included( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1708735f4c..2d4e1f0204 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -394,14 +394,17 @@ def test_multi_index_dataframe_groupby_level_aggregate( def test_multi_index_dataframe_groupby_level_analytic( scalars_df_index, scalars_pandas_df_index, level, as_index ): + # Drop "numeric_col" as pandas doesn't support numerics for grouped window function bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]) + scalars_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) .to_pandas() ) pd_result = ( - scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + scalars_pandas_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 623da74aa4..6f919f740f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,7 +24,11 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_series_construct_copy(scalars_dfs): @@ -81,14 +85,14 @@ def test_series_construct_from_list_escaped_strings(): [ ("bool_col", pd.BooleanDtype()), # TODO(swast): Use a more efficient type. - ("bytes_col", numpy.dtype("object")), + ("bytes_col", pd.ArrowDtype(pa.binary())), ("date_col", pd.ArrowDtype(pa.date32())), ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), ("float64_col", pd.Float64Dtype()), ("geography_col", gpd.array.GeometryDtype()), ("int64_col", pd.Int64Dtype()), # TODO(swast): Use a more efficient type. 
- ("numeric_col", numpy.dtype("object")), + ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), ("int64_too", pd.Int64Dtype()), ("string_col", pd.StringDtype(storage="pyarrow")), ("time_col", pd.ArrowDtype(pa.time64("us"))), @@ -2519,8 +2523,12 @@ def test_mask_custom_value(scalars_dfs): ("int64_col", pd.Float64Dtype()), ("int64_col", "string[pyarrow]"), ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and # raises a deprecation warning to use tz_localize/tz_convert instead, # but BigQuery always stores values as UTC and doesn't have to deal @@ -2538,6 +2546,7 @@ def test_mask_custom_value(scalars_dfs): # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions ], ) +@skip_legacy_pandas def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e6eb40a5fa..8ce442376a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -30,6 +30,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model +from tests.system.utils import skip_legacy_pandas FIRST_FILE = "000000000000" @@ -385,6 +386,7 @@ def test_read_pandas_tokyo( pd.testing.assert_frame_equal(result, expected) +@skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: @@ -441,6 +443,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): pytest.param("\t", id="custom_sep"), ], ) +@skip_legacy_pandas def test_read_csv_local_default_engine(session, scalars_dfs, sep): scalars_df, scalars_pandas_df = scalars_dfs with tempfile.TemporaryDirectory() as dir: diff --git a/tests/system/utils.py b/tests/system/utils.py index f49b5ece31..a4647b4f51 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -14,11 +14,23 @@ import base64 import decimal +import functools import geopandas as gpd # type: ignore import numpy as np import pandas as pd import pyarrow as pa # type: ignore +import pytest + + +def skip_legacy_pandas(test): + @functools.wraps(test) + def wrapper(*args, **kwds): + if pd.__version__.startswith("1."): + pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") + return test(*args, **kwds) + + return wrapper def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): @@ -133,16 +145,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["geography_col"].replace({np.nan: None}) ) - # Convert bytes types column. - if bytes_col: + if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): df["bytes_col"] = df["bytes_col"].apply( lambda value: base64.b64decode(value) if not pd.isnull(value) else value ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["bytes_col"]), + schema=pa.schema([("bytes_col", pa.binary())]), + ) + df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] - # Convert numeric types column. 
- df["numeric_col"] = df["numeric_col"].apply( - lambda value: decimal.Decimal(str(value)) if value else None # type: ignore - ) + if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): + # Convert numeric types column. + df["numeric_col"] = df["numeric_col"].apply( + lambda value: decimal.Decimal(str(value)) if value else None # type: ignore + ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["numeric_col"]), + schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), + ) + df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "numeric_col" + ] def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 6ceaaf911b..e648fd28cc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -31,11 +31,11 @@ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types pytest.param( ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal256(76, 38)), id="bignumeric", ), pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), - pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"), pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), pytest.param( ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" @@ -49,10 +49,9 @@ pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), # TODO(tswast): custom dtype (or at least string dtype) for JSON objects - pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), pytest.param( ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal128(38, 9)), id="numeric", ), pytest.param( From 8ea4a663bc81a49bb4425d9354d0ca2699a53410 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 23:41:45 +0000 Subject: [PATCH 6/9] docs: code samples for `drop` and `fillna` (#284) --- .../bigframes_vendored/pandas/core/frame.py | 141 ++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 81 ++++++++++ 2 files changed, 222 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 00be9e5e9e..427e586c52 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -869,6 +869,97 @@ def drop( Remove columns by directly specifying column names. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + [3 rows x 4 columns] + + Drop columns: + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + Drop a row by index: + + >>> df.drop([0, 1]) + A B C D + 2 8 9 10 11 + + [1 rows x 4 columns] + + Drop columns and/or rows of MultiIndex DataFrame: + + >>> import pandas as pd + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = bpd.DataFrame(index=midx, columns=['big', 'small'], + ... 
data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+        ...                           [250, 150], [1.5, 0.8], [320, 250],
+        ...                           [1, 0.8], [0.3, 0.2]])
+        >>> df
+                         big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+                length    1.5    1.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+                length    1.5    0.8
+        falcon  speed   320.0  250.0
+                weight    1.0    0.8
+                length    0.3    0.2
+
+        [9 rows x 2 columns]
+
+        Drop a specific index and column combination from the MultiIndex
+        DataFrame, i.e., drop the index ``'cow'`` and column ``'small'``:
+
+        >>> df.drop(index='cow', columns='small')
+                          big
+        llama   speed    45.0
+                weight  200.0
+                length    1.5
+        falcon  speed   320.0
+                weight    1.0
+                length    0.3
+
+        [6 rows x 1 columns]
+
+        >>> df.drop(index='length', level=1)
+                         big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+        falcon  speed   320.0  250.0
+                weight    1.0    0.8
+
+        [6 rows x 2 columns]
+
         Args:
             labels:
                 Index or column labels to drop.
@@ -4343,6 +4434,56 @@ def fillna(self, value):
         """
         Fill NA/NaN values using the specified method.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
+        ...                     [3, 4, np.nan, 1],
+        ...                     [np.nan, np.nan, np.nan, np.nan],
+        ...                     [np.nan, 3, np.nan, 4]],
+        ...                    columns=list("ABCD")).astype("Float64")
+        >>> df
+              A     B     C     D
+        0  <NA>   2.0  <NA>   0.0
+        1   3.0   4.0  <NA>   1.0
+        2  <NA>  <NA>  <NA>  <NA>
+        3  <NA>   3.0  <NA>   4.0
+
+        [4 rows x 4 columns]
+
+        Replace all NA elements with 0s.
+
+        >>> df.fillna(0)
+             A    B    C    D
+        0  0.0  2.0  0.0  0.0
+        1  3.0  4.0  0.0  1.0
+        2  0.0  0.0  0.0  0.0
+        3  0.0  3.0  0.0  4.0
+
+        [4 rows x 4 columns]
+
+        You can use fill values from another DataFrame:
+
+        >>> df_fill = bpd.DataFrame(np.arange(12).reshape(3, 4),
+        ...                         columns=['A', 'B', 'C', 'D'])
+        >>> df_fill
+           A  B   C   D
+        0  0  1   2   3
+        1  4  5   6   7
+        2  8  9  10  11
+
+        [3 rows x 4 columns]
+
+        >>> df.fillna(df_fill)
+              A    B     C     D
+        0   0.0  2.0   2.0   0.0
+        1   3.0  4.0   6.0   1.0
+        2   8.0  9.0  10.0  11.0
+        3  <NA>  3.0  <NA>   4.0
+
+        [4 rows x 4 columns]
+
         Args:
             value (scalar, Series):
                 Value to use to fill holes (e.g. 0), alternately a
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 366f32c77e..01cc3a0500 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1062,6 +1062,55 @@ def drop(
         When using a multi-index, labels on different levels can be removed
         by specifying the level.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C'])
+        >>> s
+        A    0
+        B    1
+        C    2
+        dtype: Int64
+
+        Drop labels B and C:
+
+        >>> s.drop(labels=['B', 'C'])
+        A    0
+        dtype: Int64
+
+        Drop 2nd level label in MultiIndex Series:
+
+        >>> import pandas as pd
+        >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
+        ...                              ['speed', 'weight', 'length']],
+        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+
+        >>> s = bpd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
+        ...                index=midx)
+        >>> s
+        llama   speed      45.0
+                weight    200.0
+                length      1.2
+        cow     speed      30.0
+                weight    250.0
+                length      1.5
+        falcon  speed     320.0
+                weight      1.0
+                length      0.3
+        dtype: Float64
+
+        >>> s.drop(labels='weight', level=1)
+        llama   speed      45.0
+                length      1.2
+        cow     speed      30.0
+                length      1.5
+        falcon  speed     320.0
+                length      0.3
+        dtype: Float64
+
         Args:
             labels (single label or list-like):
                 Index labels to drop.
@@ -1193,6 +1242,38 @@ def fillna(
         """
         Fill NA/NaN values using the specified method.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([np.nan, 2, np.nan, -1])
+        >>> s
+        0    <NA>
+        1     2.0
+        2    <NA>
+        3    -1.0
+        dtype: Float64
+
+        Replace all NA elements with 0s.
+
+        >>> s.fillna(0)
+        0     0.0
+        1     2.0
+        2     0.0
+        3    -1.0
+        dtype: Float64
+
+        You can use fill values from another Series:
+
+        >>> s_fill = bpd.Series([11, 22, 33])
+        >>> s.fillna(s_fill)
+        0    11.0
+        1     2.0
+        2    33.0
+        3    -1.0
+        dtype: Float64
+
         Args:
             value (scalar, dict, Series, or DataFrame, default None):
                 Value to use to fill holes (e.g. 0).
From a3ff76a200a5599c569404ada74c85dad3de37fe Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Fri, 22 Dec 2023 02:45:36 +0000
Subject: [PATCH 7/9] docs: code samples for `reset_index` and `sort_values` (#282)

* docs: code samples for `reset_index` and `sort_values`

* fix alignment in dataframe api code samples
---
 .../bigframes_vendored/pandas/core/frame.py   | 161 ++++++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 110 ++++++++++++
 2 files changed, 271 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 427e586c52..fb34193710 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1138,6 +1138,93 @@ def reset_index(
         Reset the index of the DataFrame, and use the default one instead.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> import numpy as np
+        >>> df = bpd.DataFrame([('bird', 389.0),
+        ...                     ('bird', 24.0),
+        ...                     ('mammal', 80.5),
+        ...                     ('mammal', np.nan)],
+        ...                    index=['falcon', 'parrot', 'lion', 'monkey'],
+        ...                    columns=('class', 'max_speed'))
+        >>> df
+                 class  max_speed
+        falcon    bird      389.0
+        parrot    bird       24.0
+        lion    mammal       80.5
+        monkey  mammal       <NA>
+
+        [4 rows x 2 columns]
+
+        When we reset the index, the old index is added as a column, and a new sequential index is used:
+
+        >>> df.reset_index()
+            index   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal       <NA>
+
+        [4 rows x 3 columns]
+
+        We can use the ``drop`` parameter to avoid the old index being added as a column:
+
+        >>> df.reset_index(drop=True)
+            class  max_speed
+        0    bird      389.0
+        1    bird       24.0
+        2  mammal       80.5
+        3  mammal       <NA>
+
+        [4 rows x 2 columns]
+
+        You can also use ``reset_index`` with ``MultiIndex``.
+
+        >>> import pandas as pd
+        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+        ...                                    ('bird', 'parrot'),
+        ...                                    ('mammal', 'lion'),
+        ...                                    ('mammal', 'monkey')],
+        ...                                   names=['class', 'name'])
+        >>> columns = ['speed', 'max']
+        >>> df = bpd.DataFrame([(389.0, 'fly'),
+        ...                     (24.0, 'fly'),
+        ...                     (80.5, 'run'),
+        ...                     (np.nan, 'jump')],
+        ...                    index=index,
+        ...                    columns=columns)
+        >>> df
+                       speed   max
+        class  name
+        bird   falcon  389.0   fly
+               parrot   24.0   fly
+        mammal lion     80.5   run
+               monkey   <NA>  jump
+
+        [4 rows x 2 columns]
+
+        >>> df.reset_index()
+            class    name  speed   max
+        0    bird  falcon  389.0   fly
+        1    bird  parrot   24.0   fly
+        2  mammal    lion   80.5   run
+        3  mammal  monkey   <NA>  jump
+
+        [4 rows x 4 columns]
+
+        >>> df.reset_index(drop=True)
+           speed   max
+        0  389.0   fly
+        1   24.0   fly
+        2   80.5   run
+        3   <NA>  jump
+
+        [4 rows x 2 columns]
+
+
         Args:
             drop (bool, default False):
                 Do not try to insert index into dataframe columns. This resets
@@ -1347,6 +1434,80 @@ def sort_values(
     ) -> DataFrame:
         """Sort by the values along row axis.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({
+        ...     'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'],
+        ...     'col2': [2, 1, 9, 8, 7, 4],
+        ...     'col3': [0, 1, 9, 4, 2, 3],
+        ...     'col4': ['a', 'B', 'c', 'D', 'e', 'F']
+        ... })
+        >>> df
+           col1  col2  col3 col4
+        0     A     2     0    a
+        1     A     1     1    B
+        2     B     9     9    c
+        3  <NA>     8     4    D
+        4     D     7     2    e
+        5     C     4     3    F
+
+        [6 rows x 4 columns]
+
+        Sort by col1:
+
+        >>> df.sort_values(by=['col1'])
+           col1  col2  col3 col4
+        0     A     2     0    a
+        1     A     1     1    B
+        2     B     9     9    c
+        5     C     4     3    F
+        4     D     7     2    e
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Sort by multiple columns:
+
+        >>> df.sort_values(by=['col1', 'col2'])
+           col1  col2  col3 col4
+        1     A     1     1    B
+        0     A     2     0    a
+        2     B     9     9    c
+        5     C     4     3    F
+        4     D     7     2    e
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Sort Descending:
+
+        >>> df.sort_values(by='col1', ascending=False)
+           col1  col2  col3 col4
+        4     D     7     2    e
+        5     C     4     3    F
+        2     B     9     9    c
+        0     A     2     0    a
+        1     A     1     1    B
+        3  <NA>     8     4    D
+
+        [6 rows x 4 columns]
+
+        Putting NAs first:
+
+        >>> df.sort_values(by='col1', ascending=False, na_position='first')
+           col1  col2  col3 col4
+        3  <NA>     8     4    D
+        4     D     7     2    e
+        5     C     4     3    F
+        2     B     9     9    c
+        0     A     2     0    a
+        1     A     1     1    B
+
+        [6 rows x 4 columns]
+
         Args:
             by (str or Sequence[str]):
                 Name or list of names to sort by.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 01cc3a0500..778ad68e0e 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -168,6 +168,53 @@ def reset_index(
         when the index is meaningless and needs to be reset to the default
         before another operation.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([1, 2, 3, 4], name='foo',
+        ...                index=['a', 'b', 'c', 'd'])
+        >>> s.index.name = "idx"
+        >>> s
+        idx
+        a    1
+        b    2
+        c    3
+        d    4
+        Name: foo, dtype: Int64
+
+        Generate a DataFrame with default index.
+
+        >>> s.reset_index()
+          idx  foo
+        0   a    1
+        1   b    2
+        2   c    3
+        3   d    4
+
+        [4 rows x 2 columns]
+
+        To specify the name of the new column, use the ``name`` param.
+
+        >>> s.reset_index(name="bar")
+          idx  bar
+        0   a    1
+        1   b    2
+        2   c    3
+        3   d    4
+
+        [4 rows x 2 columns]
+
+        To generate a new Series with the default index, set param ``drop=True``.
+
+        >>> s.reset_index(drop=True)
+        0    1
+        1    2
+        2    3
+        3    4
+        Name: foo, dtype: Int64
+
         Args:
             drop (bool, default False):
                 Just reset the index, without inserting it as a column in
@@ -699,6 +746,69 @@ def sort_values(
         Sort a Series in ascending or descending order by
         some criterion.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([np.nan, 1, 3, 10, 5])
+        >>> s
+        0    <NA>
+        1     1.0
+        2     3.0
+        3    10.0
+        4     5.0
+        dtype: Float64
+
+        Sort values ascending order (default behaviour):
+
+        >>> s.sort_values(ascending=True)
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        0    <NA>
+        dtype: Float64
+
+        Sort values descending order:
+
+        >>> s.sort_values(ascending=False)
+        3    10.0
+        4     5.0
+        2     3.0
+        1     1.0
+        0    <NA>
+        dtype: Float64
+
+        Sort values putting NAs first:
+
+        >>> s.sort_values(na_position='first')
+        0    <NA>
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        dtype: Float64
+
+        Sort a series of strings:
+
+        >>> s = bpd.Series(['z', 'b', 'd', 'a', 'c'])
+        >>> s
+        0    z
+        1    b
+        2    d
+        3    a
+        4    c
+        dtype: string
+
+        >>> s.sort_values()
+        3    a
+        1    b
+        4    c
+        2    d
+        0    z
+        dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
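
The two sample families in this patch compose naturally: `sort_values` permutes the rows (and their index labels), and `reset_index(drop=True)` then restores a clean sequential index. A minimal sketch of that round trip, assuming a configured BigQuery session and reusing the `col1`/`col2` data from the samples above:

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    df = bpd.DataFrame({
        "col1": ["A", "A", "B", bpd.NA, "D", "C"],
        "col2": [2, 1, 9, 8, 7, 4],
    })

    # NAs sort to the end by default; na_position="first" pulls them to the top.
    out = df.sort_values(by="col1", ascending=False, na_position="first")

    # Sorting shuffles the index; drop it so downstream code sees 0..n-1 again.
    out = out.reset_index(drop=True)
    print(out.to_pandas())
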
From 8dab439a2730d92dea4e4d6cafb7b543a2f8c02b Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 26 Dec 2023 18:29:00 +0000
Subject: [PATCH 8/9] docs: code samples for `isna`, `isnull`, `dropna`,
 `isin` (#289)

* docs: code samples for `isna`, `isnull`, `dropna`, `isin`

* fix header alignment in rendering
---
 .../bigframes_vendored/pandas/core/frame.py   | 81 +++++++++++++++++++
 .../bigframes_vendored/pandas/core/generic.py | 65 +++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 70 ++++++++++++++++
 3 files changed, 216 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index fb34193710..2de63b9103 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1289,9 +1289,57 @@ def duplicated(self, subset=None, keep="first"):
 
     def dropna(
         self,
+        *,
+        axis: int | str = 0,
+        how: str = "any",
+        ignore_index=False,
     ) -> DataFrame:
         """Remove missing values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+        ...                     "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+        ...                     "born": [bpd.NA, "1940-04-25", bpd.NA]})
+        >>> df
+               name        toy        born
+        0    Alfred       <NA>        <NA>
+        1    Batman  Batmobile  1940-04-25
+        2  Catwoman   Bullwhip        <NA>
+
+        [3 rows x 3 columns]
+
+        Drop the rows where at least one element is missing:
+
+        >>> df.dropna()
+             name        toy        born
+        1  Batman  Batmobile  1940-04-25
+
+        [1 rows x 3 columns]
+
+        Drop the columns where at least one element is missing.
+
+        >>> df.dropna(axis='columns')
+               name
+        0    Alfred
+        1    Batman
+        2  Catwoman
+
+        [3 rows x 1 columns]
+
+        Drop the rows where all elements are missing:
+
+        >>> df.dropna(how='all')
+               name        toy        born
+        0    Alfred       <NA>        <NA>
+        1    Batman  Batmobile  1940-04-25
+        2  Catwoman   Bullwhip        <NA>
+
+        [3 rows x 3 columns]
+
         Args:
             axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
                 Determine if rows or columns which contain missing values are
@@ -1318,6 +1366,39 @@ def isin(self, values):
         """
         Whether each element in the DataFrame is contained in values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+        ...                    index=['falcon', 'dog'])
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+
+        [2 rows x 2 columns]
+
+        When ``values`` is a list check whether every value in the DataFrame is
+        present in the list (which animals have 0 or 2 legs or wings).
+
+        >>> df.isin([0, 2])
+                num_legs  num_wings
+        falcon      True       True
+        dog        False       True
+
+        [2 rows x 2 columns]
+
+        When ``values`` is a dict, we can pass it to check for each column separately:
+
+        >>> df.isin({'num_wings': [0, 3]})
+                num_legs  num_wings
+        falcon     False      False
+        dog        False       True
+
+        [2 rows x 2 columns]
+
         Args:
             values (iterable, or dict):
                 The result will only be true at a location if all the
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index ca5c6344ce..2885162fd6 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -499,6 +499,71 @@ def isna(self) -> NDFrame:
         False values. Characters such as empty strings ``''``
         or :attr:`numpy.inf` are not considered NA values.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> import numpy as np
+
+        >>> df = bpd.DataFrame(dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[bpd.NA, "1940-04-25", "1940-04-25"],
+        ...         name=['Alfred', 'Batman', ''],
+        ...         toy=[None, 'Batmobile', 'Joker'],
+        ... ))
+        >>> df
+            age        born    name        toy
+        0   5.0        <NA>  Alfred       <NA>
+        1   6.0  1940-04-25  Batman  Batmobile
+        2  <NA>  1940-04-25              Joker
+
+        [3 rows x 4 columns]
+
+        Show which entries in a DataFrame are NA:
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        [3 rows x 4 columns]
+
+        >>> df.isnull()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        [3 rows x 4 columns]
+
+        Show which entries in a Series are NA:
+
+        >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
+        >>> ser
+        0     5.0
+        1    <NA>
+        2     6.0
+        3    <NA>
+        4    <NA>
+        dtype: Float64
+
+        >>> ser.isna()
+        0    False
+        1     True
+        2    False
+        3     True
+        4     True
+        dtype: boolean
+
+        >>> ser.isnull()
+        0    False
+        1     True
+        2    False
+        3     True
+        4     True
+        dtype: boolean
+
         Returns:
             Mask of bool values for each element that
             indicates whether an element is an NA value.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 778ad68e0e..cbe0963051 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1460,6 +1460,42 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series:
         """
         Return a new Series with missing values removed.
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        Drop NA values from a Series:
+
+        >>> ser = bpd.Series([1., 2., np.nan])
+        >>> ser
+        0     1.0
+        1     2.0
+        2    <NA>
+        dtype: Float64
+
+        >>> ser.dropna()
+        0    1.0
+        1    2.0
+        dtype: Float64
+
+        Empty strings are not considered NA values. ``None`` is considered an NA value.
+
+        >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object')
+        >>> ser
+        0         2
+        1      <NA>
+        2
+        3      <NA>
+        4    I stay
+        dtype: string
+
+        >>> ser.dropna()
+        0         2
+        2
+        4    I stay
+        dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
@@ -2531,6 +2567,40 @@ def isin(self, values):
         the same. That is, if any form of NaN is present in values, all forms
         of NaN in the series will be considered a match. (though pandas may not)
 
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama',
+        ...                 'hippo'], name='animal')
+        >>> s
+        0     llama
+        1       cow
+        2     llama
+        3    beetle
+        4     llama
+        5     hippo
+        Name: animal, dtype: string
+
+        >>> s.isin(['cow', 'llama'])
+        0     True
+        1     True
+        2     True
+        3    False
+        4     True
+        5    False
+        Name: animal, dtype: boolean
+
+        Strings and integers are distinct and are therefore not comparable:
+
+        >>> bpd.Series([1]).isin(['1'])
+        0    False
+        dtype: boolean
+        >>> bpd.Series([1.1]).isin(['1.1'])
+        0    False
+        dtype: boolean
+
         Args:
             values (list-like):
                 The sequence of values to test.
                Passing in a single string will raise a
From e9c53ee5fac198491177ee635df998b0e7d54c71 Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Wed, 27 Dec 2023 00:23:45 +0000
Subject: [PATCH 9/9] test update
---
 tests/system/small/test_dataframe_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index fc8c2549cf..6f1b31b48e 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -300,7 +300,7 @@ def test_to_gbq_w_None_column_names(
     scalars_df_index, scalars_pandas_df_index, dataset_id
 ):
     """Test the `to_gbq` API with None as a column name."""
-    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
 
     scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
     scalars_df_index.to_gbq(destination_table, if_exists="replace")
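
As the test above exercises, writing a frame that contains an unnamed column should now succeed end to end. A sketch of the user-facing flow, assuming an authenticated BigQuery session; the destination IDs below are placeholders rather than values from the patch, and the source is a public sample table:

    import bigframes.pandas as bpd

    # Placeholder project/dataset/table IDs; substitute your own.
    destination = "my-project.my_dataset.unnamed_column_demo"

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # Renaming a column to None leaves it unnamed; per this series, the export
    # is expected to write it under a generated default name instead of failing.
    df = df.rename(columns={"sex": None})
    df.to_gbq(destination, if_exists="replace")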