Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fixes GH9311 groupby on datetime64 #9345

Merged
merged 1 commit into from
Feb 14, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ Bug Fixes
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)
- Bug in ``Categorical.__getitem__/__setitem__`` with listlike input getting incorrect results from indexer coercion (:issue:`9469`)
- Bug in partial setting with a DatetimeIndex (:issue:`9478`)
- Bug in groupby for integer and datetime64 columns when applying an aggregator that caused the value to be
changed when the number was sufficiently large (:issue:`9311`, :issue:`6620`)
- Fixed bug in ``to_sql`` when mapping a ``Timestamp`` object column (datetime
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
Expand Down
60 changes: 36 additions & 24 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
notnull, _DATELIKE_DTYPES, is_numeric_dtype,
is_timedelta64_dtype, is_datetime64_dtype,
is_categorical_dtype, _values_from_object)
is_categorical_dtype, _values_from_object,
_is_datetime_or_timedelta_dtype, is_bool_dtype)
from pandas.core.config import option_context
import pandas.lib as lib
from pandas.lib import Timestamp
Expand Down Expand Up @@ -1444,7 +1445,9 @@ def get_func(fname):
f = getattr(_algos, "%s_%s" % (fname, dtype_str), None)
if f is not None:
return f
return getattr(_algos, fname, None)

if dtype_str == 'float64':
return getattr(_algos, fname, None)

ftype = self._cython_functions[how]

Expand All @@ -1471,7 +1474,6 @@ def wrapper(*args, **kwargs):
return func, dtype_str

def aggregate(self, values, how, axis=0):

arity = self._cython_arity.get(how, 1)

vdim = values.ndim
Expand All @@ -1487,27 +1489,44 @@ def aggregate(self, values, how, axis=0):
raise NotImplementedError
out_shape = (self.ngroups,) + values.shape[1:]

if is_numeric_dtype(values.dtype):
values = com.ensure_float(values)
is_numeric = True
out_dtype = 'f%d' % values.dtype.itemsize
is_numeric = is_numeric_dtype(values.dtype)

if _is_datetime_or_timedelta_dtype(values.dtype):
values = values.view('int64')
elif is_bool_dtype(values.dtype):
values = _algos.ensure_float64(values)
elif com.is_integer_dtype(values):
values = values.astype('int64', copy=False)
elif is_numeric:
values = _algos.ensure_float64(values)
else:
is_numeric = issubclass(values.dtype.type, (np.datetime64,
np.timedelta64))
values = values.astype(object)

try:
agg_func, dtype_str = self._get_aggregate_function(how, values)
except NotImplementedError:
if is_numeric:
out_dtype = 'float64'
values = values.view('int64')
values = _algos.ensure_float64(values)
agg_func, dtype_str = self._get_aggregate_function(how, values)
else:
out_dtype = 'object'
values = values.astype(object)
raise

if is_numeric:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would move this to line 1498 (e.g. combine with the is_numeric flag which doesn't seem to do anything else)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to set the `out_dtype` after the possible float conversion on line 1504, but I can yank the flag and just use it in the if statements.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought: since you have to pass `is_numeric` into `self._aggregate`, I think this needs to stay, given the comment above.

out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
else:
out_dtype = 'object'

# will be filled in Cython function
result = np.empty(out_shape, dtype=out_dtype)

result.fill(np.nan)
counts = np.zeros(self.ngroups, dtype=np.int64)

result = self._aggregate(result, counts, values, how, is_numeric)
result = self._aggregate(result, counts, values, agg_func, is_numeric)

if com.is_integer_dtype(result):
if len(result[result == tslib.iNaT]) > 0:
result = result.astype('float64')
result[result == tslib.iNaT] = np.nan

if self._filter_empty_groups and not counts.all():
if result.ndim == 2:
Expand Down Expand Up @@ -1535,9 +1554,7 @@ def aggregate(self, values, how, axis=0):

return result, names

def _aggregate(self, result, counts, values, how, is_numeric):
agg_func, dtype = self._get_aggregate_function(how, values)

def _aggregate(self, result, counts, values, agg_func, is_numeric):
comp_ids, _, ngroups = self.group_info
if values.ndim > 3:
# punting for now
Expand Down Expand Up @@ -1796,9 +1813,7 @@ def size(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

def _aggregate(self, result, counts, values, how, is_numeric=True):

agg_func, dtype = self._get_aggregate_function(how, values)
def _aggregate(self, result, counts, values, agg_func, is_numeric=True):

if values.ndim > 3:
# punting for now
Expand Down Expand Up @@ -2535,9 +2550,6 @@ def _cython_agg_blocks(self, how, numeric_only=True):

values = block._try_operate(block.values)

if block.is_numeric:
values = _algos.ensure_float64(values)

result, _ = self.grouper.aggregate(values, how, axis=agg_axis)

# see if we can cast the block back to the original dtype
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1811,10 +1811,7 @@ def _try_coerce_args(self, values, other):
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
if result.dtype == 'i8':
result = tslib.array_to_datetime(
result.astype(object).ravel()).reshape(result.shape)
elif result.dtype.kind in ['i', 'f', 'O']:
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('M8[ns]')
elif isinstance(result, (np.integer, np.datetime64)):
result = lib.Timestamp(result)
Expand Down
125 changes: 102 additions & 23 deletions pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
# don't introduce a pandas/pandas.compat import
# or we get a bootstrapping problem
from StringIO import StringIO
import numpy as np

_int64_max = np.iinfo(np.int64).max

header = """
cimport numpy as np
Expand Down Expand Up @@ -680,7 +683,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -726,7 +729,7 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -773,7 +776,7 @@ def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -819,7 +822,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -1278,7 +1281,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)

minx = np.empty_like(out)
minx.fill(np.inf)
minx.fill(%(inf_val)s)

if bins[len(bins) - 1] == len(values):
ngroups = len(bins)
Expand Down Expand Up @@ -1319,7 +1322,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = minx[i, j]
"""
Expand All @@ -1344,7 +1347,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)

maxx = np.empty_like(out)
maxx.fill(-np.inf)
maxx.fill(-%(inf_val)s)

N, K = (<object> values).shape

Expand Down Expand Up @@ -1381,7 +1384,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = maxx[i, j]
"""
Expand All @@ -1402,7 +1405,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,

nobs = np.zeros_like(out)
maxx = np.empty_like(out)
maxx.fill(-np.inf)
maxx.fill(-%(inf_val)s)

if bins[len(bins) - 1] == len(values):
ngroups = len(bins)
Expand Down Expand Up @@ -1443,7 +1446,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = maxx[i, j]
"""
Expand All @@ -1469,7 +1472,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)

minx = np.empty_like(out)
minx.fill(np.inf)
minx.fill(%(inf_val)s)

N, K = (<object> values).shape

Expand Down Expand Up @@ -1506,7 +1509,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = minx[i, j]
"""
Expand Down Expand Up @@ -2286,6 +2289,70 @@ def generate_put_template(template, use_ints=True, use_floats=True,
output.write(func)
return output.getvalue()

def generate_put_min_max_template(template, use_ints=True, use_floats=True,
                                  use_objects=False, use_datelikes=False):
    """Render *template* once per enabled dtype for the group min/max kernels.

    Each dtype row supplies the substitutions ``name``, ``dest_type2``,
    ``nan_val`` (missing-value sentinel written for empty groups) and
    ``inf_val`` (fill value for the running min/max accumulator).  Rows are
    emitted in a fixed order — floats, ints, objects, datelikes — and the
    concatenated rendered source is returned as one string.
    """
    int64_max = np.iinfo(np.int64).max
    # (flag, [(name, dest_type2, nan_val, inf_val), ...]) per dtype family;
    # datelikes reuse the int64 row since datetimes are stored as i8.
    groups = [
        (use_floats, [('float64', 'float64_t', 'nan', 'np.inf'),
                      ('float32', 'float32_t', 'nan', 'np.inf')]),
        (use_ints, [('int64', 'int64_t', 'iNaT', int64_max)]),
        (use_objects, [('object', 'object', 'nan', 'np.inf')]),
        (use_datelikes, [('int64', 'int64_t', 'iNaT', int64_max)]),
    ]

    buf = StringIO()
    for enabled, rows in groups:
        if not enabled:
            continue
        for name, dest_type, nan_val, inf_val in rows:
            buf.write(template % {'name': name,
                                  'dest_type2': dest_type,
                                  'nan_val': nan_val,
                                  'inf_val': inf_val})
    return buf.getvalue()

def generate_put_selection_template(template, use_ints=True, use_floats=True,
                                    use_objects=False, use_datelikes=False):
    """Render *template* once per enabled dtype for the group selection
    kernels (last/nth/count), substituting ``name``, ``c_type``,
    ``dest_type2`` and ``nan_val`` (missing-value sentinel).

    Emission order is fixed — floats, ints, objects, datelikes — and the
    concatenated rendered source is returned as one string.
    """
    # Build the (name, c_type, dest_type2, nan_val) rows in emission order;
    # datelikes reuse the int64 row since datetimes are stored as i8.
    selected = []
    if use_floats:
        selected += [('float64', 'float64_t', 'float64_t', 'nan'),
                     ('float32', 'float32_t', 'float32_t', 'nan')]
    if use_ints:
        selected.append(('int64', 'int64_t', 'int64_t', 'iNaT'))
    if use_objects:
        selected.append(('object', 'object', 'object', 'nan'))
    if use_datelikes:
        selected.append(('int64', 'int64_t', 'int64_t', 'iNaT'))

    pieces = [template % {'name': name,
                          'c_type': c_type,
                          'dest_type2': dest_type,
                          'nan_val': nan_val}
              for name, c_type, dest_type, nan_val in selected]
    return ''.join(pieces)

def generate_take_template(template, exclude=None):
# name, dest, ctypein, ctypeout, preval, postval, cancopy
function_list = [
Expand Down Expand Up @@ -2347,24 +2414,27 @@ def generate_from_template(template, exclude=None):
return output.getvalue()

put_2d = [diff_2d_template]
groupbys = [group_last_template,
group_last_bin_template,
group_nth_template,
group_nth_bin_template,
group_add_template,

groupbys = [group_add_template,
group_add_bin_template,
group_prod_template,
group_prod_bin_template,
group_var_template,
group_var_bin_template,
group_mean_template,
group_mean_bin_template,
group_min_template,
group_min_bin_template,
group_max_template,
group_max_bin_template,
group_ohlc_template]

groupby_selection = [group_last_template,
group_last_bin_template,
group_nth_template,
group_nth_bin_template]

groupby_min_max = [group_min_template,
group_min_bin_template,
group_max_template,
group_max_bin_template]

groupby_count = [group_count_template, group_count_bin_template]

templates_1d = [map_indices_template,
Expand Down Expand Up @@ -2407,9 +2477,18 @@ def generate_take_cython_file(path='generated.pyx'):
for template in groupbys:
print(generate_put_template(template, use_ints=False), file=f)

for template in groupby_selection:
print(generate_put_selection_template(template, use_ints=True),
file=f)

for template in groupby_min_max:
print(generate_put_min_max_template(template, use_ints=True),
file=f)

for template in groupby_count:
print(generate_put_template(template, use_ints=False,
use_datelikes=True, use_objects=True),
print(generate_put_selection_template(template, use_ints=True,
use_datelikes=True,
use_objects=True),
file=f)

# for template in templates_1d_datetime:
Expand Down
Loading