Skip to content

Commit

Permalink
Make cudf.pandas proxy array picklable (#17929)
Browse files Browse the repository at this point in the history
Part of #17490.

We employ custom pickling logic for our cudf.pandas wrapped types. The logic lets us serialize and deserialize wrapped types by serializing and deserializing the underlying wrapped objects (i.e., the type of `_fsproxy_wrapped`). This pickling logic is defined in `_FinalProxy`, which is the base class of all of our "final" proxy types.

The failures in the integration tests occurred because this pickling logic wasn't used for the proxy numpy array type. This is because the "final" proxy array type inherits from an additional base class, `ProxyNDarrayBase` (which contains logic to inherit from `np.ndarray`). That base class comes before `_FinalProxy` in the class's MRO, so the custom pickling is not used.

Additionally, the custom pickling logic used for other proxy types is incompatible with our proxy array. So this PR defines a custom function for handling proxy array serialization.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #17929
  • Loading branch information
Matt711 authored Feb 13, 2025
1 parent a035ccc commit 9ead47b
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 19 deletions.
18 changes: 18 additions & 0 deletions python/cudf/cudf/pandas/_wrappers/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,23 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
return result


def ndarray__reduce__(self):
    """Return the pickle reduction for a proxy ndarray.

    The result is a ``(callable, args)`` pair: unpickling calls
    ``ndarray.__new__(ndarray, wrapped)``, where ``wrapped`` is this
    proxy's ``_fsproxy_wrapped`` object.
    """
    # The custom pickling logic shared by the other proxy types cannot be
    # reused for the proxy ndarray. The pickle constructor used to
    # deserialize those types calls object.__new__(type), which is not
    # allowed on subclasses of numpy arrays because the new array would
    # not be created with numpy's specific memory management logic.
    # Serialization for proxy arrays is therefore handled separately here.
    rebuild = ndarray.__new__
    rebuild_args = (ndarray, self._fsproxy_wrapped)
    return (rebuild, rebuild_args)


ndarray = make_final_proxy_type(
"ndarray",
cupy.ndarray,
Expand All @@ -140,6 +157,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
"__cuda_array_interface__": cuda_array_interface,
"__array_interface__": array_interface,
"__array_ufunc__": ndarray__array_ufunc__,
"__reduce__": ndarray__reduce__,
# ndarrays are unhashable
"__hash__": None,
# iter(cupy-array) produces an iterable of zero-dim device
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1979,3 +1979,18 @@ def test_numpy_data_access():
actual = xs.values.data

assert type(expected) is type(actual)


def test_pickle_round_trip_proxy_numpy_array(array):
    """Check that a proxy array pickles and unpickles like the plain array.

    ``array`` is unpacked into a ``(plain, proxy)`` pair; both members
    are pickled, restored, and compared with ``np.testing.assert_equal``.
    """
    plain, proxy = array

    # Serialize both objects before restoring either one.
    plain_payload = pickle.dumps(plain)
    proxy_payload = pickle.dumps(proxy)

    # The proxy payload is restored first, then the plain payload.
    restored_proxy = pickle.loads(proxy_payload)
    restored_plain = pickle.loads(plain_payload)

    np.testing.assert_equal(restored_proxy, restored_plain)
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import holoviews as hv
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -71,9 +71,6 @@ def test_holoviews_heatmap(df):
)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_holoviews_histogram(df):
    """Build an ``hv.Histogram`` from ``df.values``; return its plot info."""
    histogram = hv.Histogram(df.values)
    return get_plot_info(histogram)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -33,19 +33,13 @@ def assert_plots_equal(expect, got):
pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_line():
    """Draw a five-point line plot and return the current axes."""
    frame = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
    # Single-element unpacking: fails unless exactly one item is returned.
    [line] = plt.plot(
        frame["x"],
        frame["y"],
        marker="o",
        linestyle="-",
    )

    return plt.gca()


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_bar():
data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
ax = data.plot(kind="bar")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -37,9 +37,6 @@ def test_numpy_dot(df):
return np.dot(df, df.T)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_numpy_fft(sr):
    """Return the result of ``np.fft.fft`` applied to ``sr``."""
    return np.fft.fft(sr)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import pandas as pd
import pytest
import seaborn as sns
Expand Down Expand Up @@ -54,9 +54,6 @@ def test_scatter(df):
return ax


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_lineplot_with_sns_data():
df = sns.load_dataset("flights")
ax = sns.lineplot(data=df, x="month", y="passengers")
Expand Down

0 comments on commit 9ead47b

Please sign in to comment.