TST: Parametrize tests in tests/util/test_hashing.py #21883

Merged 1 commit on Jul 14, 2018
168 changes: 81 additions & 87 deletions pandas/tests/util/test_hashing.py
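The change replaces the DataFrame built in setup_method with a parametrized pytest fixture, so each dtype becomes its own collected test case. A minimal, self-contained sketch of that fixture pattern, assuming only public pandas APIs (the test name and the reduced parameter list are illustrative, not part of the PR):

import pandas as pd
import pytest
from pandas.util import hash_pandas_object


# Each entry in ``params`` is handed to every test that requests the
# ``series`` fixture, replacing a hand-rolled loop over DataFrame columns.
@pytest.fixture(params=[
    pd.Series([1, 2, 3] * 3, dtype='int32'),
    pd.Series(['a', 'b', 'c'] * 3, dtype='category'),
    pd.Series(pd.date_range('20130101', periods=9)),
])
def series(request):
    return request.param


def test_hash_is_deterministic(series):
    # Runs once per fixture param: hashing the same Series twice must agree.
    pd.testing.assert_series_equal(hash_pandas_object(series),
                                   hash_pandas_object(series))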
@@ -13,17 +13,17 @@

class TestHashing(object):

def setup_method(self, method):
self.df = DataFrame(
{'i32': np.array([1, 2, 3] * 3, dtype='int32'),
'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
'obj': Series(['d', 'e', 'f'] * 3),
'bool': np.array([True, False, True] * 3),
'dt': Series(pd.date_range('20130101', periods=9)),
'dt_tz': Series(pd.date_range('20130101', periods=9,
tz='US/Eastern')),
'td': Series(pd.timedelta_range('2000', periods=9))})
@pytest.fixture(params=[
Series([1, 2, 3] * 3, dtype='int32'),
Series([None, 2.5, 3.5] * 3, dtype='float32'),
Series(['a', 'b', 'c'] * 3, dtype='category'),
Series(['d', 'e', 'f'] * 3),
Series([True, False, True] * 3),
Series(pd.date_range('20130101', periods=9)),
Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
Series(pd.timedelta_range('2000', periods=9))])
def series(self, request):
return request.param

def test_consistency(self):
# check that our hash doesn't change because of a mistake
@@ -34,10 +34,9 @@ def test_consistency(self):
index=['foo', 'bar', 'baz'])
tm.assert_series_equal(result, expected)

def test_hash_array(self):
for name, s in self.df.iteritems():
a = s.values
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
def test_hash_array(self, series):
a = series.values
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

def test_hash_array_mixed(self):
result1 = hash_array(np.array([3, 4, 'All']))
@@ -46,10 +45,11 @@ def test_hash_array_mixed(self):
tm.assert_numpy_array_equal(result1, result2)
tm.assert_numpy_array_equal(result1, result3)

def test_hash_array_errors(self):

for val in [5, 'foo', pd.Timestamp('20130101')]:
pytest.raises(TypeError, hash_array, val)
@pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
def test_hash_array_errors(self, val):
msg = 'must pass a ndarray-like'
with tm.assert_raises_regex(TypeError, msg):
hash_array(val)

def check_equal(self, obj, **kwargs):
a = hash_pandas_object(obj, **kwargs)
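The rewritten error tests above assert on the exception message rather than only the exception type. A short sketch of the same check written with plain pytest, where pytest.raises(..., match=...) plays the role of the tm.assert_raises_regex helper used in the diff (the test name is illustrative; the message is the one asserted above):

import pytest
from pandas.util import hash_array


@pytest.mark.parametrize('val', [5, 'foo'])
def test_hash_array_rejects_scalars(val):
    # hash_array only accepts ndarray-like input, so a bare scalar raises.
    with pytest.raises(TypeError, match='must pass a ndarray-like'):
        hash_array(val)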
@@ -80,31 +80,33 @@ def test_hash_tuples(self):
result = hash_tuples(tups[0])
assert result == expected[0]

def test_hash_tuple(self):
@pytest.mark.parametrize('tup', [
(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
('A', pd.Timestamp("2012-01-01"))])
def test_hash_tuple(self, tup):
# test equivalence between hash_tuples and hash_tuple
for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
('A', pd.Timestamp("2012-01-01"))]:
result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected

def test_hash_scalar(self):
for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
datetime.datetime(2012, 1, 1),
pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
pd.Timedelta('1 days'), datetime.timedelta(1),
pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
np.nan, pd.NaT, None]:
result = _hash_scalar(val)
expected = hash_array(np.array([val], dtype=object),
categorize=True)
assert result[0] == expected[0]

def test_hash_tuples_err(self):

for val in [5, 'foo', pd.Timestamp('20130101')]:
pytest.raises(TypeError, hash_tuples, val)
result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected

@pytest.mark.parametrize('val', [
1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
datetime.datetime(2012, 1, 1),
pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
pd.Timedelta('1 days'), datetime.timedelta(1),
pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
np.nan, pd.NaT, None])
def test_hash_scalar(self, val):
result = _hash_scalar(val)
expected = hash_array(np.array([val], dtype=object), categorize=True)
assert result[0] == expected[0]

@pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
def test_hash_tuples_err(self, val):
msg = 'must be convertible to a list-of-tuples'
with tm.assert_raises_regex(TypeError, msg):
hash_tuples(val)

def test_multiindex_unique(self):
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
@@ -172,36 +174,35 @@ def test_hash_pandas_object(self, obj):
self.check_equal(obj)
self.check_not_equal_with_index(obj)

def test_hash_pandas_object2(self):
for name, s in self.df.iteritems():
self.check_equal(s)
self.check_not_equal_with_index(s)

def test_hash_pandas_empty_object(self):
for obj in [Series([], dtype='float64'),
Series([], dtype='object'),
Index([])]:
self.check_equal(obj)
def test_hash_pandas_object2(self, series):
self.check_equal(series)
self.check_not_equal_with_index(series)

# these are by-definition the same with
# or w/o the index as the data is empty
@pytest.mark.parametrize('obj', [
Series([], dtype='float64'), Series([], dtype='object'), Index([])])
def test_hash_pandas_empty_object(self, obj):
# these are by-definition the same with
# or w/o the index as the data is empty
self.check_equal(obj)

def test_categorical_consistency(self):
@pytest.mark.parametrize('s1', [
Series(['a', 'b', 'c', 'd']),
Series([1000, 2000, 3000, 4000]),
Series(pd.date_range(0, periods=4))])
@pytest.mark.parametrize('categorize', [True, False])
def test_categorical_consistency(self, s1, categorize):
# GH15143
# Check that categoricals hash consistent with their values, not codes
# This should work for categoricals of any dtype
for s1 in [Series(['a', 'b', 'c', 'd']),
Series([1000, 2000, 3000, 4000]),
Series(pd.date_range(0, periods=4))]:
s2 = s1.astype('category').cat.set_categories(s1)
s3 = s2.cat.set_categories(list(reversed(s1)))
for categorize in [True, False]:
# These should all hash identically
h1 = hash_pandas_object(s1, categorize=categorize)
h2 = hash_pandas_object(s2, categorize=categorize)
h3 = hash_pandas_object(s3, categorize=categorize)
tm.assert_series_equal(h1, h2)
tm.assert_series_equal(h1, h3)
s2 = s1.astype('category').cat.set_categories(s1)
s3 = s2.cat.set_categories(list(reversed(s1)))

# These should all hash identically
h1 = hash_pandas_object(s1, categorize=categorize)
h2 = hash_pandas_object(s2, categorize=categorize)
h3 = hash_pandas_object(s3, categorize=categorize)
tm.assert_series_equal(h1, h2)
tm.assert_series_equal(h1, h3)

def test_categorical_with_nan_consistency(self):
c = pd.Categorical.from_codes(
@@ -216,13 +217,12 @@ def test_categorical_with_nan_consistency(self):
assert result[1] in expected

def test_pandas_errors(self):

for obj in [pd.Timestamp('20130101')]:
with pytest.raises(TypeError):
hash_pandas_object(obj)
with pytest.raises(TypeError):
hash_pandas_object(pd.Timestamp('20130101'))

with catch_warnings(record=True):
obj = tm.makePanel()

with pytest.raises(TypeError):
hash_pandas_object(obj)

@@ -238,9 +238,9 @@ def test_hash_keys(self):

def test_invalid_key(self):
# this only matters for object dtypes
def f():
msg = 'key should be a 16-byte string encoded'
with tm.assert_raises_regex(ValueError, msg):
hash_pandas_object(Series(list('abc')), hash_key='foo')
pytest.raises(ValueError, f)

def test_alread_encoded(self):
# if already encoded then ok
@@ -253,19 +253,13 @@ def test_alternate_encoding(self):
obj = Series(list('abc'))
self.check_equal(obj, encoding='ascii')

def test_same_len_hash_collisions(self):

for l in range(8):
length = 2**(l + 8) + 1
s = tm.rands_array(length, 2)
result = hash_array(s, 'utf8')
assert not result[0] == result[1]

for l in range(8):
length = 2**(l + 8)
s = tm.rands_array(length, 2)
result = hash_array(s, 'utf8')
assert not result[0] == result[1]
@pytest.mark.parametrize('l_exp', range(8))
@pytest.mark.parametrize('l_add', [0, 1])
def test_same_len_hash_collisions(self, l_exp, l_add):
length = 2**(l_exp + 8) + l_add
s = tm.rands_array(length, 2)
result = hash_array(s, 'utf8')
assert not result[0] == result[1]

def test_hash_collisions(self):

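Several of the rewritten tests (test_categorical_consistency and test_same_len_hash_collisions) stack two parametrize decorators; pytest then runs the cross product of the parameter values, which reproduces the original nested loops. A self-contained sketch of that pattern, with an inline random-string helper standing in for tm.rands_array (the helper and test name are illustrative, not part of the PR):

import numpy as np
import pytest
from pandas.util import hash_array


@pytest.mark.parametrize('l_add', [0, 1])
@pytest.mark.parametrize('l_exp', range(8))
def test_equal_length_strings_do_not_collide(l_exp, l_add):
    # Stacked decorators give the cross product: 8 exponents x 2 offsets
    # = 16 cases, covering both of the original loops over lengths
    # 2**(l + 8) and 2**(l + 8) + 1.
    length = 2 ** (l_exp + 8) + l_add
    rng = np.random.default_rng(0)
    letters = np.array(list('abcdefghij'))
    strings = np.array([''.join(rng.choice(letters, size=length))
                        for _ in range(2)], dtype=object)
    result = hash_array(strings, 'utf8')
    assert result[0] != result[1]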