diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py new file mode 100644 index 0000000000000..c024390297fec --- /dev/null +++ b/pandas/tests/frame/methods/test_equals.py @@ -0,0 +1,23 @@ +from pandas import DataFrame +import pandas._testing as tm + + +class TestEquals: + def test_dataframe_not_equal(self): + # see GH#28839 + df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False + + def test_equals_different_blocks(self): + # GH#9330 + df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) + df1 = df0.reset_index()[["A", "B", "C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype + + # do the real tests + tm.assert_frame_equal(df0, df1) + assert df0.equals(df1) + assert df1.equals(df0) diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py new file mode 100644 index 0000000000000..a824a615b5c29 --- /dev/null +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -0,0 +1,42 @@ +from datetime import datetime + +from pandas import DataFrame +import pandas._testing as tm + + +class TestInferObjects: + def test_infer_objects(self): + # GH#11221 + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + df = df.iloc[1:].infer_objects() + + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + # reconstruct frame to 
verify inference is same + result = df.reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py new file mode 100644 index 0000000000000..436e70fbb0e56 --- /dev/null +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -0,0 +1,52 @@ +import numpy as np + +from pandas import DataFrame +from pandas.core.arrays import PandasArray + + +class TestToDictOfBlocks: + def test_copy_blocks(self, float_frame): + # GH#9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the default copy=True, change a column + blocks = df._to_dict_of_blocks(copy=True) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did not change the original DataFrame + assert not _df[column].equals(df[column]) + + def test_no_copy_blocks(self, float_frame): + # GH#9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the copy=False, change a column + blocks = df._to_dict_of_blocks(copy=False) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did change the original DataFrame + assert _df[column].equals(df[column]) + + +def test_to_dict_of_blocks_item_cache(): + # Calling to_dict_of_blocks should not poison item_cache + df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object)) + mgr = df._mgr + assert len(mgr.blocks) == 3 # i.e. 
not consolidated + + ser = df["b"] # populates item_cache["b"] + + df._to_dict_of_blocks() + + # Check that the to_dict_of_blocks didn't break link between ser and df + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + + assert df["b"] is ser diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index ddfe60773aa8f..c2f084c0eb8bb 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -102,3 +102,62 @@ def test_interleave_with_tzaware(self, timezone_frame): dtype=object, ).T tm.assert_numpy_array_equal(result, expected) + + def test_values_interleave_non_unique_cols(self): + df = DataFrame( + [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) + + df_unique = df.copy() + df_unique.columns = ["x", "y"] + assert df_unique.values.shape == df.values.shape + tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) + tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) + + def test_values_numeric_cols(self, float_frame): + float_frame["foo"] = "bar" + + values = float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + def test_values_lcd(self, mixed_float_frame, mixed_int_frame): + + # mixed lcd + values = mixed_float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_float_frame[["A", "B", "C"]].values + assert values.dtype == np.float32 + + values = mixed_float_frame[["C"]].values + assert values.dtype == np.float16 + + # GH#10364 + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_int_frame[["A", "D"]].values + assert values.dtype == np.int64 + + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C"]].values + assert values.dtype == np.float64 + + # as B and C 
are both unsigned, no forcing to float is needed + values = mixed_int_frame[["B", "C"]].values + assert values.dtype == np.uint64 + + values = mixed_int_frame[["A", "C"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C", "D"]].values + assert values.dtype == np.int64 + + values = mixed_int_frame[["A"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C"]].values + assert values.dtype == np.uint8 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 788ac56829a2b..0ee74beea4858 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -14,6 +14,32 @@ from pandas.core.computation.expressions import _MIN_ELEMENTS, NUMEXPR_INSTALLED from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int + +class DummyElement: + def __init__(self, value, dtype): + self.value = value + self.dtype = np.dtype(dtype) + + def __array__(self): + return np.array(self.value, dtype=self.dtype) + + def __str__(self) -> str: + return f"DummyElement({self.value}, {self.dtype})" + + def __repr__(self) -> str: + return str(self) + + def astype(self, dtype, copy=False): + self.dtype = dtype + return self + + def view(self, dtype): + return type(self)(self.value.view(dtype), dtype) + + def any(self, axis=None): + return bool(self.value) + + # ------------------------------------------------------------------- # Comparisons @@ -782,6 +808,77 @@ def test_frame_with_frame_reindex(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "value, dtype", + [ + (1, "i8"), + (1.0, "f8"), + (2 ** 63, "f8"), + (1j, "complex128"), + (2 ** 63, "complex128"), + (True, "bool"), + (np.timedelta64(20, "ns"), " 1] = 2 tm.assert_almost_equal(expected, float_frame.values) - def test_values_numeric_cols(self, float_frame): - float_frame["foo"] = "bar" - - values = float_frame[["A", "B", "C", "D"]].values - assert values.dtype == np.float64 - - def 
test_values_lcd(self, mixed_float_frame, mixed_int_frame): - - # mixed lcd - values = mixed_float_frame[["A", "B", "C", "D"]].values - assert values.dtype == np.float64 - - values = mixed_float_frame[["A", "B", "C"]].values - assert values.dtype == np.float32 - - values = mixed_float_frame[["C"]].values - assert values.dtype == np.float16 - - # GH 10364 - # B uint64 forces float because there are other signed int types - values = mixed_int_frame[["A", "B", "C", "D"]].values - assert values.dtype == np.float64 - - values = mixed_int_frame[["A", "D"]].values - assert values.dtype == np.int64 - - # B uint64 forces float because there are other signed int types - values = mixed_int_frame[["A", "B", "C"]].values - assert values.dtype == np.float64 - - # as B and C are both unsigned, no forcing to float is needed - values = mixed_int_frame[["B", "C"]].values - assert values.dtype == np.uint64 - - values = mixed_int_frame[["A", "C"]].values - assert values.dtype == np.int32 - - values = mixed_int_frame[["C", "D"]].values - assert values.dtype == np.int64 - - values = mixed_int_frame[["A"]].values - assert values.dtype == np.int32 - - values = mixed_int_frame[["C"]].values - assert values.dtype == np.uint8 - def test_constructor_with_convert(self): # this is actually mostly a test of lib.maybe_convert_objects # #2845 @@ -300,47 +254,6 @@ def f(dtype): if not compat.is_platform_windows(): f("M8[ns]") - def test_equals_different_blocks(self): - # GH 9330 - df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) - df1 = df0.reset_index()[["A", "B", "C"]] - # this assert verifies that the above operations have - # induced a block rearrangement - assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype - - # do the real tests - tm.assert_frame_equal(df0, df1) - assert df0.equals(df1) - assert df1.equals(df0) - - def test_copy_blocks(self, float_frame): - # API/ENH 9607 - df = DataFrame(float_frame, copy=True) - column = df.columns[0] - - # use the default copy=True, 
change a column - blocks = df._to_dict_of_blocks(copy=True) - for dtype, _df in blocks.items(): - if column in _df: - _df.loc[:, column] = _df[column] + 1 - - # make sure we did not change the original DataFrame - assert not _df[column].equals(df[column]) - - def test_no_copy_blocks(self, float_frame): - # API/ENH 9607 - df = DataFrame(float_frame, copy=True) - column = df.columns[0] - - # use the copy=False, change a column - blocks = df._to_dict_of_blocks(copy=False) - for dtype, _df in blocks.items(): - if column in _df: - _df.loc[:, column] = _df[column] + 1 - - # make sure we did change the original DataFrame - assert _df[column].equals(df[column]) - def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() cop["E"] = cop["A"] @@ -469,88 +382,6 @@ def test_get_numeric_data_extension_dtype(self): expected = df.loc[:, ["A", "C"]] tm.assert_frame_equal(result, expected) - def test_convert_objects(self, float_string_frame): - - oops = float_string_frame.T.T - converted = oops._convert(datetime=True) - tm.assert_frame_equal(converted, float_string_frame) - assert converted["A"].dtype == np.float64 - - # force numeric conversion - float_string_frame["H"] = "1." - float_string_frame["I"] = "1" - - # add in some items that will be nan - length = len(float_string_frame) - float_string_frame["J"] = "1." 
- float_string_frame["K"] = "1" - float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled" - converted = float_string_frame._convert(datetime=True, numeric=True) - assert converted["H"].dtype == "float64" - assert converted["I"].dtype == "int64" - assert converted["J"].dtype == "float64" - assert converted["K"].dtype == "float64" - assert len(converted["J"].dropna()) == length - 5 - assert len(converted["K"].dropna()) == length - 5 - - # via astype - converted = float_string_frame.copy() - converted["H"] = converted["H"].astype("float64") - converted["I"] = converted["I"].astype("int64") - assert converted["H"].dtype == "float64" - assert converted["I"].dtype == "int64" - - # via astype, but errors - converted = float_string_frame.copy() - with pytest.raises(ValueError, match="invalid literal"): - converted["H"].astype("int32") - - # mixed in a single column - df = DataFrame(dict(s=Series([1, "na", 3, 4]))) - result = df._convert(datetime=True, numeric=True) - expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) - tm.assert_frame_equal(result, expected) - - def test_convert_objects_no_conversion(self): - mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) - mixed2 = mixed1._convert(datetime=True) - tm.assert_frame_equal(mixed1, mixed2) - - def test_infer_objects(self): - # GH 11221 - df = DataFrame( - { - "a": ["a", 1, 2, 3], - "b": ["b", 2.0, 3.0, 4.1], - "c": [ - "c", - datetime(2016, 1, 1), - datetime(2016, 1, 2), - datetime(2016, 1, 3), - ], - "d": [1, 2, 3, "d"], - }, - columns=["a", "b", "c", "d"], - ) - df = df.iloc[1:].infer_objects() - - assert df["a"].dtype == "int64" - assert df["b"].dtype == "float64" - assert df["c"].dtype == "M8[ns]" - assert df["d"].dtype == "object" - - expected = DataFrame( - { - "a": [1, 2, 3], - "b": [2.0, 3.0, 4.1], - "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], - "d": [2, 3, "d"], - }, - columns=["a", "b", "c", "d"], - ) - # reconstruct frame to verify 
inference is same - tm.assert_frame_equal(df.reset_index(drop=True), expected) - def test_stale_cached_series_bug_473(self): # this is chained, but ok @@ -628,24 +459,6 @@ def test_add_column_with_pandas_array(self): tm.assert_frame_equal(df, df2) -def test_to_dict_of_blocks_item_cache(): - # Calling to_dict_of_blocks should not poison item_cache - df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) - mgr = df._mgr - assert len(mgr.blocks) == 3 # i.e. not consolidated - - ser = df["b"] # populations item_cache["b"] - - df._to_dict_of_blocks() - - # Check that the to_dict_of_blocks didnt break link between ser and df - ser.values[0] = "foo" - assert df.loc[0, "b"] == "foo" - - assert df["b"] is ser - - def test_update_inplace_sets_valid_block_values(): # https://github.com/pandas-dev/pandas/issues/33457 df = DataFrame({"a": Series([1, 2, None], dtype="category")}) diff --git a/pandas/tests/frame/test_convert.py b/pandas/tests/frame/test_convert.py new file mode 100644 index 0000000000000..50add248f9614 --- /dev/null +++ b/pandas/tests/frame/test_convert.py @@ -0,0 +1,54 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestConvert: + def test_convert_objects(self, float_string_frame): + + oops = float_string_frame.T.T + converted = oops._convert(datetime=True) + tm.assert_frame_equal(converted, float_string_frame) + assert converted["A"].dtype == np.float64 + + # force numeric conversion + float_string_frame["H"] = "1." + float_string_frame["I"] = "1" + + # add in some items that will be nan + length = len(float_string_frame) + float_string_frame["J"] = "1." 
+ float_string_frame["K"] = "1" + float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled" + converted = float_string_frame._convert(datetime=True, numeric=True) + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + assert converted["J"].dtype == "float64" + assert converted["K"].dtype == "float64" + assert len(converted["J"].dropna()) == length - 5 + assert len(converted["K"].dropna()) == length - 5 + + # via astype + converted = float_string_frame.copy() + converted["H"] = converted["H"].astype("float64") + converted["I"] = converted["I"].astype("int64") + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + + # via astype, but errors + converted = float_string_frame.copy() + with pytest.raises(ValueError, match="invalid literal"): + converted["H"].astype("int32") + + # mixed in a single column + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) + result = df._convert(datetime=True, numeric=True) + expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) + tm.assert_frame_equal(result, expected) + + def test_convert_objects_no_conversion(self): + mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) + mixed2 = mixed1._convert(datetime=True) + tm.assert_frame_equal(mixed1, mixed2) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index df76f3c64947a..9d1a2bed5db12 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,6 +1,5 @@ from datetime import date, datetime import itertools -import operator import re import numpy as np @@ -1042,31 +1041,6 @@ def test_blockplacement_add_int_raises(self, val): BlockPlacement(val).add(-10) -class DummyElement: - def __init__(self, value, dtype): - self.value = value - self.dtype = np.dtype(dtype) - - def __array__(self): - return np.array(self.value, dtype=self.dtype) - - def __str__(self) -> str: - return 
f"DummyElement({self.value}, {self.dtype})" - - def __repr__(self) -> str: - return str(self) - - def astype(self, dtype, copy=False): - self.dtype = dtype - return self - - def view(self, dtype): - return type(self)(self.value.view(dtype), dtype) - - def any(self, axis=None): - return bool(self.value) - - class TestCanHoldElement: def test_datetime_block_can_hold_element(self): block = create_block("datetime", [0]) @@ -1095,77 +1069,6 @@ def test_datetime_block_can_hold_element(self): with pytest.raises(TypeError, match=msg): arr[0] = val - @pytest.mark.parametrize( - "value, dtype", - [ - (1, "i8"), - (1.0, "f8"), - (2 ** 63, "f8"), - (1j, "complex128"), - (2 ** 63, "complex128"), - (True, "bool"), - (np.timedelta64(20, "ns"), "