Backport PR pandas-dev#42049: BUG: hash_pandas_object ignores optional arguments when the input is a DataFrame. (pandas-dev#42108)

Co-authored-by: i-aki-y <[email protected]>
meeseeksmachine · Jun 18, 2021 · bc56bf5 · bc56bf5
1 parent bdd95b6
commit bc56bf5
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -1215,6 +1215,7 @@ Other
 - Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
 - Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`)
 - Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`)
+- Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
@@ -139,7 +139,10 @@ def hash_pandas_object(
         ser = Series(h, index=obj.index, dtype="uint64", copy=False)
 
     elif isinstance(obj, ABCDataFrame):
-        hashes = (hash_array(series._values) for _, series in obj.items())
+        hashes = (
+            hash_array(series._values, encoding, hash_key, categorize)
+            for _, series in obj.items()
+        )
         num_items = len(obj.columns)
         if index:
             index_hash_generator = (

diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
@@ -255,6 +255,32 @@ def test_hash_keys():
     assert (a != b).all()
 
 
+def test_df_hash_keys():
+    # DataFrame version of the test_hash_keys.
+    # https://github.com/pandas-dev/pandas/issues/41404
+    obj = DataFrame({"x": np.arange(3), "y": list("abc")})
+
+    a = hash_pandas_object(obj, hash_key="9876543210123456")
+    b = hash_pandas_object(obj, hash_key="9876543210123465")
+
+    assert (a != b).all()
+
+
+def test_df_encoding():
+    # Check that DataFrame recognizes optional encoding.
+    # https://github.com/pandas-dev/pandas/issues/41404
+    # https://github.com/pandas-dev/pandas/pull/42049
+    obj = DataFrame({"x": np.arange(3), "y": list("a+c")})
+
+    a = hash_pandas_object(obj, encoding="utf8")
+    b = hash_pandas_object(obj, encoding="utf7")
+
+    # Note that the "+" is encoded as "+-" in utf-7.
+    assert a[0] == b[0]
+    assert a[1] != b[1]
+    assert a[2] == b[2]
+
+
 def test_invalid_key():
     # This only matters for object dtypes.
     msg = "key should be a 16-byte string encoded"