Skip to content

Commit

Permalink
Merge pull request #9345 from iwschris/groupby_nano_int
Browse files Browse the repository at this point in the history
BUG: Fixes GH9311 groupby on datetime64
  • Loading branch information
jreback committed Feb 14, 2015
2 parents f2882b8 + 5f6cbf8 commit 3f24b87
Show file tree
Hide file tree
Showing 6 changed files with 1,391 additions and 743 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ Bug Fixes
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)
- Bug in ``Categorical.__getitem__/__setitem__`` with listlike input getting incorrect results from indexer coercion (:issue:`9469`)
- Bug in partial setting with a DatetimeIndex (:issue:`9478`)
- Bug in groupby aggregation of integer and datetime64 columns where sufficiently
  large values were silently altered by an unnecessary conversion to float (:issue:`9311`, :issue:`6620`)
- Fixed bug in ``to_sql`` when mapping a ``Timestamp`` object column (datetime
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
Expand Down
60 changes: 36 additions & 24 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
notnull, _DATELIKE_DTYPES, is_numeric_dtype,
is_timedelta64_dtype, is_datetime64_dtype,
is_categorical_dtype, _values_from_object)
is_categorical_dtype, _values_from_object,
_is_datetime_or_timedelta_dtype, is_bool_dtype)
from pandas.core.config import option_context
import pandas.lib as lib
from pandas.lib import Timestamp
Expand Down Expand Up @@ -1444,7 +1445,9 @@ def get_func(fname):
f = getattr(_algos, "%s_%s" % (fname, dtype_str), None)
if f is not None:
return f
return getattr(_algos, fname, None)

if dtype_str == 'float64':
return getattr(_algos, fname, None)

ftype = self._cython_functions[how]

Expand All @@ -1471,7 +1474,6 @@ def wrapper(*args, **kwargs):
return func, dtype_str

def aggregate(self, values, how, axis=0):

arity = self._cython_arity.get(how, 1)

vdim = values.ndim
Expand All @@ -1487,27 +1489,44 @@ def aggregate(self, values, how, axis=0):
raise NotImplementedError
out_shape = (self.ngroups,) + values.shape[1:]

if is_numeric_dtype(values.dtype):
values = com.ensure_float(values)
is_numeric = True
out_dtype = 'f%d' % values.dtype.itemsize
is_numeric = is_numeric_dtype(values.dtype)

if _is_datetime_or_timedelta_dtype(values.dtype):
values = values.view('int64')
elif is_bool_dtype(values.dtype):
values = _algos.ensure_float64(values)
elif com.is_integer_dtype(values):
values = values.astype('int64', copy=False)
elif is_numeric:
values = _algos.ensure_float64(values)
else:
is_numeric = issubclass(values.dtype.type, (np.datetime64,
np.timedelta64))
values = values.astype(object)

try:
agg_func, dtype_str = self._get_aggregate_function(how, values)
except NotImplementedError:
if is_numeric:
out_dtype = 'float64'
values = values.view('int64')
values = _algos.ensure_float64(values)
agg_func, dtype_str = self._get_aggregate_function(how, values)
else:
out_dtype = 'object'
values = values.astype(object)
raise

if is_numeric:
out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
else:
out_dtype = 'object'

# will be filled in Cython function
result = np.empty(out_shape, dtype=out_dtype)

result.fill(np.nan)
counts = np.zeros(self.ngroups, dtype=np.int64)

result = self._aggregate(result, counts, values, how, is_numeric)
result = self._aggregate(result, counts, values, agg_func, is_numeric)

if com.is_integer_dtype(result):
if len(result[result == tslib.iNaT]) > 0:
result = result.astype('float64')
result[result == tslib.iNaT] = np.nan

if self._filter_empty_groups and not counts.all():
if result.ndim == 2:
Expand Down Expand Up @@ -1535,9 +1554,7 @@ def aggregate(self, values, how, axis=0):

return result, names

def _aggregate(self, result, counts, values, how, is_numeric):
agg_func, dtype = self._get_aggregate_function(how, values)

def _aggregate(self, result, counts, values, agg_func, is_numeric):
comp_ids, _, ngroups = self.group_info
if values.ndim > 3:
# punting for now
Expand Down Expand Up @@ -1796,9 +1813,7 @@ def size(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

def _aggregate(self, result, counts, values, how, is_numeric=True):

agg_func, dtype = self._get_aggregate_function(how, values)
def _aggregate(self, result, counts, values, agg_func, is_numeric=True):

if values.ndim > 3:
# punting for now
Expand Down Expand Up @@ -2535,9 +2550,6 @@ def _cython_agg_blocks(self, how, numeric_only=True):

values = block._try_operate(block.values)

if block.is_numeric:
values = _algos.ensure_float64(values)

result, _ = self.grouper.aggregate(values, how, axis=agg_axis)

# see if we can cast the block back to the original dtype
Expand Down
5 changes: 1 addition & 4 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1811,10 +1811,7 @@ def _try_coerce_args(self, values, other):
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
if result.dtype == 'i8':
result = tslib.array_to_datetime(
result.astype(object).ravel()).reshape(result.shape)
elif result.dtype.kind in ['i', 'f', 'O']:
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('M8[ns]')
elif isinstance(result, (np.integer, np.datetime64)):
result = lib.Timestamp(result)
Expand Down
125 changes: 102 additions & 23 deletions pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
# don't introduce a pandas/pandas.compat import
# or we get a bootstrapping problem
from StringIO import StringIO
import numpy as np

_int64_max = np.iinfo(np.int64).max

header = """
cimport numpy as np
Expand Down Expand Up @@ -680,7 +683,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -726,7 +729,7 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -773,7 +776,7 @@ def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -819,7 +822,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = resx[i, j]
"""
Expand Down Expand Up @@ -1278,7 +1281,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)
minx = np.empty_like(out)
minx.fill(np.inf)
minx.fill(%(inf_val)s)
if bins[len(bins) - 1] == len(values):
ngroups = len(bins)
Expand Down Expand Up @@ -1319,7 +1322,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = minx[i, j]
"""
Expand All @@ -1344,7 +1347,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)
maxx = np.empty_like(out)
maxx.fill(-np.inf)
maxx.fill(-%(inf_val)s)
N, K = (<object> values).shape
Expand Down Expand Up @@ -1381,7 +1384,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = maxx[i, j]
"""
Expand All @@ -1402,7 +1405,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)
maxx = np.empty_like(out)
maxx.fill(-np.inf)
maxx.fill(-%(inf_val)s)
if bins[len(bins) - 1] == len(values):
ngroups = len(bins)
Expand Down Expand Up @@ -1443,7 +1446,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = maxx[i, j]
"""
Expand All @@ -1469,7 +1472,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
nobs = np.zeros_like(out)
minx = np.empty_like(out)
minx.fill(np.inf)
minx.fill(%(inf_val)s)
N, K = (<object> values).shape
Expand Down Expand Up @@ -1506,7 +1509,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(len(counts)):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = nan
out[i, j] = %(nan_val)s
else:
out[i, j] = minx[i, j]
"""
Expand Down Expand Up @@ -2286,6 +2289,70 @@ def generate_put_template(template, use_ints=True, use_floats=True,
output.write(func)
return output.getvalue()

def generate_put_min_max_template(template, use_ints=True, use_floats=True,
                                  use_objects=False, use_datelikes=False):
    """Render *template* once per enabled dtype for the group min/max kernels.

    Each substitution row supplies the keys ``name``, ``dest_type2``,
    ``nan_val`` and ``inf_val``.  Integer and datelike rows use ``iNaT`` as
    the missing-value sentinel and the int64 maximum in place of +/-inf,
    since integer arrays cannot hold real NaN/inf values.
    """
    # Substitution rows grouped by dtype family; emitted in the fixed order
    # floats, ints, objects, datelikes.
    float_rows = [
        ('float64', 'float64_t', 'nan', 'np.inf'),
        ('float32', 'float32_t', 'nan', 'np.inf'),
    ]
    int_rows = [('int64', 'int64_t', 'iNaT', _int64_max)]
    object_rows = [('object', 'object', 'nan', 'np.inf')]
    # NOTE: identical to int_rows — enabling both flags emits the row twice.
    datelike_rows = [('int64', 'int64_t', 'iNaT', _int64_max)]

    selected = []
    for enabled, rows in ((use_floats, float_rows),
                          (use_ints, int_rows),
                          (use_objects, object_rows),
                          (use_datelikes, datelike_rows)):
        if enabled:
            selected.extend(rows)

    buf = StringIO()
    for name, dest_type, nan_val, inf_val in selected:
        buf.write(template % {'name': name,
                              'dest_type2': dest_type,
                              'nan_val': nan_val,
                              'inf_val': inf_val})
    return buf.getvalue()

def generate_put_selection_template(template, use_ints=True, use_floats=True,
                                    use_objects=False, use_datelikes=False):
    """Render *template* once per enabled dtype for the nth/last kernels.

    Each substitution row supplies the keys ``name``, ``c_type``,
    ``dest_type2`` and ``nan_val``.  Integer and datelike rows use ``iNaT``
    as the missing-value sentinel because integer arrays cannot hold NaN.
    """
    # (flag, rows) pairs in the fixed emission order: floats, ints,
    # objects, datelikes.  The ints and datelikes rows are identical, so
    # enabling both flags emits the int64 row twice.
    families = [
        (use_floats, [('float64', 'float64_t', 'float64_t', 'nan'),
                      ('float32', 'float32_t', 'float32_t', 'nan')]),
        (use_ints, [('int64', 'int64_t', 'int64_t', 'iNaT')]),
        (use_objects, [('object', 'object', 'object', 'nan')]),
        (use_datelikes, [('int64', 'int64_t', 'int64_t', 'iNaT')]),
    ]

    buf = StringIO()
    for enabled, rows in families:
        if not enabled:
            continue
        for name, c_type, dest_type, nan_val in rows:
            buf.write(template % {'name': name,
                                  'c_type': c_type,
                                  'dest_type2': dest_type,
                                  'nan_val': nan_val})
    return buf.getvalue()

def generate_take_template(template, exclude=None):
# name, dest, ctypein, ctypeout, preval, postval, cancopy
function_list = [
Expand Down Expand Up @@ -2347,24 +2414,27 @@ def generate_from_template(template, exclude=None):
return output.getvalue()

put_2d = [diff_2d_template]
groupbys = [group_last_template,
group_last_bin_template,
group_nth_template,
group_nth_bin_template,
group_add_template,

groupbys = [group_add_template,
group_add_bin_template,
group_prod_template,
group_prod_bin_template,
group_var_template,
group_var_bin_template,
group_mean_template,
group_mean_bin_template,
group_min_template,
group_min_bin_template,
group_max_template,
group_max_bin_template,
group_ohlc_template]

groupby_selection = [group_last_template,
group_last_bin_template,
group_nth_template,
group_nth_bin_template]

groupby_min_max = [group_min_template,
group_min_bin_template,
group_max_template,
group_max_bin_template]

groupby_count = [group_count_template, group_count_bin_template]

templates_1d = [map_indices_template,
Expand Down Expand Up @@ -2407,9 +2477,18 @@ def generate_take_cython_file(path='generated.pyx'):
for template in groupbys:
print(generate_put_template(template, use_ints=False), file=f)

for template in groupby_selection:
print(generate_put_selection_template(template, use_ints=True),
file=f)

for template in groupby_min_max:
print(generate_put_min_max_template(template, use_ints=True),
file=f)

for template in groupby_count:
print(generate_put_template(template, use_ints=False,
use_datelikes=True, use_objects=True),
print(generate_put_selection_template(template, use_ints=True,
use_datelikes=True,
use_objects=True),
file=f)

# for template in templates_1d_datetime:
Expand Down
Loading

0 comments on commit 3f24b87

Please sign in to comment.