From 5f6cbf88d47e7e777b8be855bb950c4ed722cd53 Mon Sep 17 00:00:00 2001 From: Chris Reynolds Date: Wed, 21 Jan 2015 22:26:31 -0600 Subject: [PATCH] BUG: Fixes GH9311 groupby on datetime64 datetime64 columns were changing at the nano-second scale when applying a groupby aggregator. --- doc/source/whatsnew/v0.16.0.txt | 2 + pandas/core/groupby.py | 60 +- pandas/core/internals.py | 5 +- pandas/src/generate_code.py | 125 ++- pandas/src/generated.pyx | 1871 +++++++++++++++++++------------ pandas/tests/test_groupby.py | 71 ++ 6 files changed, 1391 insertions(+), 743 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 9e1546f5e50a9..cd7cdbb645686 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -187,6 +187,8 @@ Bug Fixes - Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`) - Bug in ``Categorical.__getitem__/__setitem__`` with listlike input getting incorrect results from indexer coercion (:issue:`9469`) - Bug in partial setting with a DatetimeIndex (:issue:`9478`) +- Bug in groupby for integer and datetime64 columns when applying an aggregator that caused the value to be + changed when the number was sufficiently large (:issue:`9311`, :issue:`6620`) - Fixed bug in ``to_sql`` when mapping a ``Timestamp`` object column (datetime column with timezone info) to the according sqlalchemy type (:issue:`9085`). - Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 440c0966ac066..9d5fde5600be3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -24,7 +24,8 @@ from pandas.core.common import(_possibly_downcast_to_dtype, isnull, notnull, _DATELIKE_DTYPES, is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, - is_categorical_dtype, _values_from_object) + is_categorical_dtype, _values_from_object, + _is_datetime_or_timedelta_dtype, is_bool_dtype) from pandas.core.config import option_context import pandas.lib as lib from pandas.lib import Timestamp @@ -1444,7 +1445,9 @@ def get_func(fname): f = getattr(_algos, "%s_%s" % (fname, dtype_str), None) if f is not None: return f - return getattr(_algos, fname, None) + + if dtype_str == 'float64': + return getattr(_algos, fname, None) ftype = self._cython_functions[how] @@ -1471,7 +1474,6 @@ def wrapper(*args, **kwargs): return func, dtype_str def aggregate(self, values, how, axis=0): - arity = self._cython_arity.get(how, 1) vdim = values.ndim @@ -1487,27 +1489,44 @@ def aggregate(self, values, how, axis=0): raise NotImplementedError out_shape = (self.ngroups,) + values.shape[1:] - if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) - is_numeric = True - out_dtype = 'f%d' % values.dtype.itemsize + is_numeric = is_numeric_dtype(values.dtype) + + if _is_datetime_or_timedelta_dtype(values.dtype): + values = values.view('int64') + elif is_bool_dtype(values.dtype): + values = _algos.ensure_float64(values) + elif com.is_integer_dtype(values): + values = values.astype('int64', copy=False) + elif is_numeric: + values = _algos.ensure_float64(values) else: - is_numeric = issubclass(values.dtype.type, (np.datetime64, - np.timedelta64)) + values = values.astype(object) + + try: + agg_func, dtype_str = self._get_aggregate_function(how, values) + except NotImplementedError: if is_numeric: - out_dtype = 'float64' - values = values.view('int64') + values = _algos.ensure_float64(values) + agg_func, dtype_str = self._get_aggregate_function(how, values) else: - out_dtype = 'object' - values = values.astype(object) + raise + + if is_numeric: + out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + else: + out_dtype = 'object' # will be filled in Cython function result = np.empty(out_shape, dtype=out_dtype) - result.fill(np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) - result = self._aggregate(result, counts, values, how, is_numeric) + result = self._aggregate(result, counts, values, agg_func, is_numeric) + + if com.is_integer_dtype(result): + if len(result[result == tslib.iNaT]) > 0: + result = result.astype('float64') + result[result == tslib.iNaT] = np.nan if self._filter_empty_groups and not counts.all(): if result.ndim == 2: @@ -1535,9 +1554,7 @@ def aggregate(self, values, how, axis=0): return result, names - def _aggregate(self, result, counts, values, how, is_numeric): - agg_func, dtype = self._get_aggregate_function(how, values) - + def _aggregate(self, result, counts, values, agg_func, is_numeric): comp_ids, _, ngroups = self.group_info if values.ndim > 3: # punting for now @@ -1796,9 +1813,7 @@ def size(self): 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] } - def _aggregate(self, result, counts, values, how, is_numeric=True): - - agg_func, dtype = self._get_aggregate_function(how, values) + def _aggregate(self, result, counts, values, agg_func, is_numeric=True): if values.ndim > 3: # punting for now @@ -2535,9 +2550,6 @@ def _cython_agg_blocks(self, how, numeric_only=True): values = block._try_operate(block.values) - if block.is_numeric: - values = _algos.ensure_float64(values) - result, _ = self.grouper.aggregate(values, how, axis=agg_axis) # see if we can cast the block back to the original dtype diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f4abe05097cff..6cf7fa5888539 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1811,10 +1811,7 @@ def _try_coerce_args(self, values, other): def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype == 'i8': - result = tslib.array_to_datetime( - result.astype(object).ravel()).reshape(result.shape) - elif result.dtype.kind in ['i', 'f', 'O']: + if result.dtype.kind in ['i', 'f', 'O']: result = result.astype('M8[ns]') elif isinstance(result, (np.integer, np.datetime64)): result = lib.Timestamp(result) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index d04f55bb19fff..575fcf386f570 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -3,6 +3,9 @@ # don't introduce a pandas/pandas.compat import # or we get a bootstrapping problem from StringIO import StringIO +import numpy as np + +_int64_max = np.iinfo(np.int64).max header = """ cimport numpy as np @@ -680,7 +683,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = resx[i, j] """ @@ -726,7 +729,7 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = resx[i, j] """ @@ -773,7 +776,7 @@ def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = resx[i, j] """ @@ -819,7 +822,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = resx[i, j] """ @@ -1278,7 +1281,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill(np.inf) + minx.fill(%(inf_val)s) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -1319,7 +1322,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = minx[i, j] """ @@ -1344,7 +1347,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, nobs = np.zeros_like(out) maxx = np.empty_like(out) - maxx.fill(-np.inf) + maxx.fill(-%(inf_val)s) N, K = ( values).shape @@ -1381,7 +1384,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = maxx[i, j] """ @@ -1402,7 +1405,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, nobs = np.zeros_like(out) maxx = np.empty_like(out) - maxx.fill(-np.inf) + maxx.fill(-%(inf_val)s) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -1443,7 +1446,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = maxx[i, j] """ @@ -1469,7 +1472,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill(np.inf) + minx.fill(%(inf_val)s) N, K = ( values).shape @@ -1506,7 +1509,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = %(nan_val)s else: out[i, j] = minx[i, j] """ @@ -2286,6 +2289,70 @@ def generate_put_template(template, use_ints=True, use_floats=True, output.write(func) return output.getvalue() +def generate_put_min_max_template(template, use_ints=True, use_floats=True, + use_objects=False, use_datelikes=False): + floats_list = [ + ('float64', 'float64_t', 'nan', 'np.inf'), + ('float32', 'float32_t', 'nan', 'np.inf'), + ] + ints_list = [ + ('int64', 'int64_t', 'iNaT', _int64_max), + ] + date_like_list = [ + ('int64', 'int64_t', 'iNaT', _int64_max), + ] + object_list = [('object', 'object', 'nan', 'np.inf')] + function_list = [] + if use_floats: + function_list.extend(floats_list) + if use_ints: + function_list.extend(ints_list) + if use_objects: + function_list.extend(object_list) + if use_datelikes: + function_list.extend(date_like_list) + + output = StringIO() + for name, dest_type, nan_val, inf_val in function_list: + func = template % {'name': name, + 'dest_type2': dest_type, + 'nan_val': nan_val, + 'inf_val': inf_val} + output.write(func) + return output.getvalue() + +def generate_put_selection_template(template, use_ints=True, use_floats=True, + use_objects=False, use_datelikes=False): + floats_list = [ + ('float64', 'float64_t', 'float64_t', 'nan'), + ('float32', 'float32_t', 'float32_t', 'nan'), + ] + ints_list = [ + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ] + date_like_list = [ + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ] + object_list = [('object', 'object', 'object', 'nan')] + function_list = [] + if use_floats: + function_list.extend(floats_list) + if use_ints: + function_list.extend(ints_list) + if use_objects: + function_list.extend(object_list) + if use_datelikes: + function_list.extend(date_like_list) + + output = StringIO() + for name, c_type, dest_type, nan_val in function_list: + func = template % {'name': name, + 'c_type': c_type, + 'dest_type2': dest_type, + 'nan_val': nan_val} + output.write(func) + return output.getvalue() + def generate_take_template(template, exclude=None): # name, dest, ctypein, ctypeout, preval, postval, cancopy function_list = [ @@ -2347,11 +2414,8 @@ def generate_from_template(template, exclude=None): return output.getvalue() put_2d = [diff_2d_template] -groupbys = [group_last_template, - group_last_bin_template, - group_nth_template, - group_nth_bin_template, - group_add_template, + +groupbys = [group_add_template, group_add_bin_template, group_prod_template, group_prod_bin_template, @@ -2359,12 +2423,18 @@ def generate_from_template(template, exclude=None): group_var_bin_template, group_mean_template, group_mean_bin_template, - group_min_template, - group_min_bin_template, - group_max_template, - group_max_bin_template, group_ohlc_template] +groupby_selection = [group_last_template, + group_last_bin_template, + group_nth_template, + group_nth_bin_template] + +groupby_min_max = [group_min_template, + group_min_bin_template, + group_max_template, + group_max_bin_template] + groupby_count = [group_count_template, group_count_bin_template] templates_1d = [map_indices_template, @@ -2407,9 +2477,18 @@ def generate_take_cython_file(path='generated.pyx'): for template in groupbys: print(generate_put_template(template, use_ints=False), file=f) + for template in groupby_selection: + print(generate_put_selection_template(template, use_ints=True), + file=f) + + for template in groupby_min_max: + print(generate_put_min_max_template(template, use_ints=True), + file=f) + for template in groupby_count: - print(generate_put_template(template, use_ints=False, - use_datelikes=True, use_objects=True), + print(generate_put_selection_template(template, use_ints=True, + use_datelikes=True, + use_objects=True), file=f) # for template in templates_1d_datetime: diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 01c80518ca21a..cab3a84f6ffe8 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -4845,391 +4845,487 @@ def diff_2d_int64(ndarray[int64_t, ndim=2] arr, for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) @cython.wraparound(False) -@cython.wraparound(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +def group_add_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + ndarray[float64_t, ndim=2] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, 0] += 1 + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] -@cython.wraparound(False) + out[i, j] = sumx[i, j] +@cython.boundscheck(False) @cython.wraparound(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): +def group_add_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + ndarray[float32_t, ndim=2] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, 0] += 1 + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = sumx[i, j] +@cython.boundscheck(False) @cython.wraparound(False) -@cython.wraparound(False) -def group_last_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): +def group_add_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b + Py_ssize_t i, j, N, K, ngroups, b, nbins float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - resx = np.empty_like(out) + sumx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - resx[b, j] = val + nobs[b, 0] += 1 + sumx[b, 0] += val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] -@cython.wraparound(False) + out[i, j] = sumx[i, j] +@cython.boundscheck(False) @cython.wraparound(False) -def group_last_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): +def group_add_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b + Py_ssize_t i, j, N, K, ngroups, b, nbins float32_t val, count - ndarray[float32_t, ndim=2] resx, nobs + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - resx = np.empty_like(out) + sumx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - resx[b, j] = val + nobs[b, 0] += 1 + sumx[b, 0] += val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = sumx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): +def group_prod_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + ndarray[float64_t, ndim=2] prodx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + nobs = np.zeros_like(out) + prodx = np.ones_like(out) N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, 0] += 1 + prodx[lab, 0] *= val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = prodx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): +def group_prod_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs + ndarray[float32_t, ndim=2] prodx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + nobs = np.zeros_like(out) + prodx = np.ones_like(out) N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, 0] += 1 + prodx[lab, 0] *= val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = prodx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, +def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs + ndarray[float64_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - resx = np.empty_like(out) + prodx = np.ones_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + nobs[b, 0] += 1 + prodx[b, 0] *= val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = prodx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, +def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] resx, nobs + ndarray[float32_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - resx = np.empty_like(out) + prodx = np.ones_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + nobs[b, 0] += 1 + prodx[b, 0] *= val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = resx[i, j] + out[i, j] = prodx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, +@cython.boundscheck(False) +def group_var_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 + for j in range(K): val = values[i, j] @@ -5237,55 +5333,60 @@ def group_add_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[lab, j] += 1 sumx[lab, j] += val + sumxx[lab, j] += val * val else: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 sumx[lab, 0] += val + sumxx[lab, 0] += val * val + for i in range(len(counts)): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, +@cython.boundscheck(False) +def group_var_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 + for j in range(K): val = values[i, j] @@ -5293,48 +5394,53 @@ def group_add_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[lab, j] += 1 sumx[lab, j] += val + sumxx[lab, j] += val * val else: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 sumx[lab, 0] += val + sumxx[lab, 0] += val * val + for i in range(len(counts)): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = sumx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) -@cython.boundscheck(False) @cython.wraparound(False) -def group_add_bin_float64(ndarray[float64_t, ndim=2] out, +@cython.boundscheck(False) +def group_var_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + cdef: - Py_ssize_t i, j, N, K, ngroups, b, nbins - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 @@ -5344,6 +5450,7 @@ def group_add_bin_float64(ndarray[float64_t, ndim=2] out, b += 1 counts[b] += 1 + for j in range(K): val = values[i, j] @@ -5351,6 +5458,7 @@ def group_add_bin_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[b, j] += 1 sumx[b, j] += val + sumxx[b, j] += val * val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5363,34 +5471,37 @@ def group_add_bin_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[b, 0] += 1 sumx[b, 0] += val + sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) -def group_add_bin_float32(ndarray[float32_t, ndim=2] out, +@cython.boundscheck(False) +def group_var_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + cdef: - Py_ssize_t i, j, N, K, ngroups, b, nbins - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 @@ -5400,6 +5511,7 @@ def group_add_bin_float32(ndarray[float32_t, ndim=2] out, b += 1 counts[b] += 1 + for j in range(K): val = values[i, j] @@ -5407,6 +5519,7 @@ def group_add_bin_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[b, j] += 1 sumx[b, j] += val + sumxx[b, j] += val * val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5419,33 +5532,33 @@ def group_add_bin_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[b, 0] += 1 sumx[b, 0] += val + sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = sumx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) -@cython.boundscheck(False) @cython.wraparound(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, +@cython.boundscheck(False) +def group_mean_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs + ndarray[float64_t, ndim=2] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - prodx = np.ones_like(out) + sumx = np.zeros_like(out) N, K = ( values).shape @@ -5458,11 +5571,10 @@ def group_prod_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan if val == val: nobs[lab, j] += 1 - prodx[lab, j] *= val + sumx[lab, j] += val else: for i in range(N): lab = labels[i] @@ -5471,37 +5583,34 @@ def group_prod_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - prodx[lab, 0] *= val + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] -@cython.boundscheck(False) + out[i, j] = sumx[i, j] / count @cython.wraparound(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, +@cython.boundscheck(False) +def group_mean_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs + ndarray[float32_t, ndim=2] sumx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - prodx = np.ones_like(out) + sumx = np.zeros_like(out) N, K = ( values).shape @@ -5514,11 +5623,10 @@ def group_prod_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan if val == val: nobs[lab, j] += 1 - prodx[lab, j] *= val + sumx[lab, j] += val else: for i in range(N): lab = labels[i] @@ -5527,41 +5635,37 @@ def group_prod_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - prodx[lab, 0] *= val + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = sumx[i, j] / count -@cython.boundscheck(False) -@cython.wraparound(False) -def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + +def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - prodx = np.ones_like(out) + sumx = np.zeros_like(out) + N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 if K > 1: @@ -5576,7 +5680,7 @@ def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - prodx[b, j] *= val + sumx[b, j] += val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5588,36 +5692,33 @@ def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - prodx[b, 0] *= val + sumx[b, 0] += val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + count = nobs[i, j] + if count == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + out[i, j] = sumx[i, j] / count + +def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - prodx = np.ones_like(out) + sumx = np.zeros_like(out) + N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 if K > 1: @@ -5632,7 +5733,7 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - prodx[b, j] *= val + sumx[b, j] += val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5644,153 +5745,103 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - prodx[b, 0] *= val + sumx[b, 0] += val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + count = nobs[i, j] + if count == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = sumx[i, j] / count @cython.wraparound(False) @cython.boundscheck(False) -def group_var_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + float64_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) else: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - - - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) -@cython.wraparound(False) -@cython.boundscheck(False) -def group_var_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, ct - ndarray[float32_t, ndim=2] nobs, sumx, sumxx - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + ngroups = len(bins) + 1 N, K = ( values).shape - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') - for j in range(K): - val = values[i, j] + NA = np.nan - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val + b = 0 + if K > 1: + raise NotImplementedError else: for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 + counts[b] += 1 val = values[i, 0] + # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - - - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose @cython.wraparound(False) @cython.boundscheck(False) -def group_var_bin_float64(ndarray[float64_t, ndim=2] out, +def group_ohlc_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, + ndarray[float32_t, ndim=2] values, ndarray[int64_t] bins): - + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + float32_t val, count + float32_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -5799,59 +5850,515 @@ def group_var_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + b = 0 if K > 1: + raise NotImplementedError + else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - - for j in range(K): - val = values[i, j] + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + int64_t val, count + ndarray[int64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + int64_t val, count + ndarray[int64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + int64_t val, count + ndarray[int64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + ngroups = len(bins) + 1 - counts[b] += 1 - val = values[i, 0] + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val for i in range(ngroups): for j in range(K): - ct = nobs[i, j] - if ct < 2: + if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) -@cython.wraparound(False) + out[i, j] = resx[i, j] @cython.boundscheck(False) -def group_var_bin_float32(ndarray[float32_t, ndim=2] out, +@cython.wraparound(False) +def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float32_t val, ct - ndarray[float32_t, ndim=2] nobs, sumx, sumxx + float32_t val, count + ndarray[float32_t, ndim=2] resx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + resx = np.empty_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -5861,60 +6368,93 @@ def group_var_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 + counts[b] += 1 + for j in range(K): + val = values[i, j] - for j in range(K): - val = values[i, j] + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + int64_t val, count + ndarray[int64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + ngroups = len(bins) + 1 + + N, K = ( values).shape - counts[b] += 1 - val = values[i, 0] + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val for i in range(ngroups): for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan + if nobs[i, j] == 0: + out[i, j] = iNaT else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] = resx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + ndarray[float64_t, ndim=2] minx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) N, K = ( values).shape @@ -5927,10 +6467,12 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] + # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + if val < minx[lab, j]: + minx[lab, j] = val else: for i in range(N): lab = labels[i] @@ -5939,34 +6481,40 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 - sumx[lab, 0] += val + if val < minx[lab, 0]: + minx[lab, 0] = val for i in range(len(counts)): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count + out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + ndarray[float32_t, ndim=2] minx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) N, K = ( values).shape @@ -5979,10 +6527,12 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] + # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + if val < minx[lab, j]: + minx[lab, j] = val else: for i in range(N): lab = labels[i] @@ -5991,139 +6541,32 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 - sumx[lab, 0] += val + if val < minx[lab, 0]: + minx[lab, 0] = val for i in range(len(counts)): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count - - -def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if count == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count - -def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if count == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count - + out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, +def group_min_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, + ndarray[int64_t, ndim=2] values, ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs + int64_t val, count + ndarray[int64_t, ndim=2] minx, nobs if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -6131,7 +6574,7 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill(np.inf) + minx.fill(9223372036854775807) N, K = ( values).shape @@ -6168,83 +6611,84 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = iNaT else: out[i, j] = minx[i, j] + @cython.wraparound(False) @cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): +def group_min_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs nobs = np.zeros_like(out) minx = np.empty_like(out) minx.fill(np.inf) + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + b = 0 if K > 1: for i in range(N): - lab = labels[i] - if lab < 0: - continue + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[lab] += 1 + counts[b] += 1 for j in range(K): val = values[i, j] # not nan if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val else: for i in range(N): - lab = labels[i] - if lab < 0: - continue + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[lab] += 1 + counts[b] += 1 val = values[i, 0] # not nan if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - for i in range(len(counts)): + for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: out[i, j] = minx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) -def group_min_bin_float64(ndarray[float64_t, ndim=2] out, +def group_min_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, + ndarray[float32_t, ndim=2] values, ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs nobs = np.zeros_like(out) @@ -6295,22 +6739,22 @@ def group_min_bin_float64(ndarray[float64_t, ndim=2] out, out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_min_bin_float32(ndarray[float32_t, ndim=2] out, +def group_min_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, + ndarray[int64_t, ndim=2] values, ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs + int64_t val, count + ndarray[int64_t, ndim=2] minx, nobs nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill(np.inf) + minx.fill(9223372036854775807) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -6351,7 +6795,7 @@ def group_min_bin_float32(ndarray[float32_t, ndim=2] out, for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: - out[i, j] = nan + out[i, j] = iNaT else: out[i, j] = minx[i, j] @@ -6475,6 +6919,66 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, out[i, j] = nan else: out[i, j] = maxx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + int64_t val, count + ndarray[int64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-9223372036854775807) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -6596,21 +7100,23 @@ def group_max_bin_float32(ndarray[float32_t, ndim=2] out, out[i, j] = nan else: out[i, j] = maxx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, +def group_max_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, + ndarray[int64_t, ndim=2] values, ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - float64_t vopen, vhigh, vlow, vclose, NA - bint got_first = 0 + int64_t val, count + ndarray[int64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-9223372036854775807) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -6619,130 +7125,41 @@ def group_ohlc_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - NA = np.nan - b = 0 if K > 1: - raise NotImplementedError - else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose b += 1 - got_first = 0 counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: - vlow = val - if val > vhigh: - vhigh = val - vclose = val - - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - float32_t vopen, vhigh, vlow, vclose, NA - bint got_first = 0 - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - NA = np.nan + for j in range(K): + val = values[i, j] - b = 0 - if K > 1: - raise NotImplementedError + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose b += 1 - got_first = 0 counts[b] += 1 val = values[i, 0] # not nan if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: - vlow = val - if val > vhigh: - vhigh = val - vclose = val + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -6816,6 +7233,42 @@ def group_count_float32(ndarray[float32_t, ndim=2] out, out[i, j] = nobs[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + int64_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + @cython.boundscheck(False) @cython.wraparound(False) def group_count_object(ndarray[object, ndim=2] out, @@ -6854,7 +7307,7 @@ def group_count_object(ndarray[object, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_count_int64(ndarray[float64_t, ndim=2] out, +def group_count_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, ndarray[int64_t] labels): @@ -6957,6 +7410,40 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out, out[i, j] = nobs[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + int64_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + @cython.boundscheck(False) @cython.wraparound(False) def group_count_bin_object(ndarray[object, ndim=2] out, @@ -6993,7 +7480,7 @@ def group_count_bin_object(ndarray[object, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_count_bin_int64(ndarray[float64_t, ndim=2] out, +def group_count_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, ndarray[int64_t] bins): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d4fcaaec9eb6e..f2ea17db44211 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3483,6 +3483,77 @@ def test_groupby_categorical_no_compress(self): exp = np.array([1,2,4,np.nan]) self.assert_numpy_array_equivalent(result, exp) + def test_groupby_non_arithmetic_agg_types(self): + # GH9311, GH6620 + df = pd.DataFrame([{'a': 1, 'b': 1}, + {'a': 1, 'b': 2}, + {'a': 2, 'b': 3}, + {'a': 2, 'b': 4}]) + + dtypes = ['int8', 'int16', 'int32', 'int64', + 'float32', 'float64'] + + grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, + 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, + 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}, + 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}} + + for dtype in dtypes: + df_in = df.copy() + df_in['b'] = df_in.b.astype(dtype) + + for method, data in compat.iteritems(grp_exp): + if 'args' not in data: + data['args'] = [] + + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype + + exp = data['df'] + df_out = pd.DataFrame(exp) + + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) + + grpd = df_in.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) + + def test_groupby_non_arithmetic_agg_intlike_precision(self): + # GH9311, GH6620 + c = 24650000000000000 + + inputs = ((Timestamp('2011-01-15 12:50:28.502376'), + Timestamp('2011-01-20 12:50:28.593448')), + (1 + c, 2 + c)) + + for i in inputs: + df = pd.DataFrame([{'a': 1, + 'b': i[0]}, + {'a': 1, + 'b': i[1]}]) + + grp_exp = {'first': {'expected': i[0]}, + 'last': {'expected': i[1]}, + 'min': {'expected': i[0]}, + 'max': {'expected': i[1]}, + 'nth': {'expected': i[1], 'args': [1]}, + 'count': {'expected': 2}} + + for method, data in compat.iteritems(grp_exp): + if 'args' not in data: + data['args'] = [] + + grpd = df.groupby('a') + res = getattr(grpd, method)(*data['args']) + self.assertEqual(res.iloc[0].b, data['expected']) + def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) df[1] = df[1].view('M8[ns]')