From f4a08285ae7067cc4427dabb76cb0a8f19ab5c23 Mon Sep 17 00:00:00 2001
From: Michael-J-Ward
Date: Wed, 24 Jul 2024 12:37:06 -0500
Subject: [PATCH] migrate approx_percentile_cont, approx_distinct, and approx_median to UDAF

Replace the aggregate_function! bindings for approx_distinct,
approx_median, approx_percentile_cont and
approx_percentile_cont_with_weight with explicit #[pyfunction] wrappers
around functions_aggregate::expr_fn.

Python wrapper changes:
- approx_distinct takes `expression` and no longer passes distinct=True.
- approx_percentile_cont keeps its `expr` parameter name (the body and
  docstring reference it) and temporarily drops num_centroids; see the
  TODO in python/datafusion/functions.py.

Ref: approx_distinct https://github.com/apache/datafusion/pull/10851

Ref: approx_median https://github.com/apache/datafusion/pull/10840

Ref: approx_percentile_cont and _with_weight https://github.com/apache/datafusion/pull/10917
---
 python/datafusion/functions.py | 10 ++++--
 src/functions.rs               | 58 ++++++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 46d2a2f04..8a320c427 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -1233,9 +1233,9 @@ def flatten(array: Expr) -> Expr:
 
 
 # aggregate functions
-def approx_distinct(arg: Expr) -> Expr:
+def approx_distinct(expression: Expr) -> Expr:
     """Returns the approximate number of distinct values."""
-    return Expr(f.approx_distinct(arg.expr, distinct=True))
+    return Expr(f.approx_distinct(expression.expr))
 
 
 def approx_median(arg: Expr, distinct: bool = False) -> Expr:
@@ -1244,12 +1244,14 @@ def approx_median(arg: Expr, distinct: bool = False) -> Expr:
 
 
 def approx_percentile_cont(
     expr: Expr,
     percentile: Expr,
-    num_centroids: int | None = None,
+    # num_centroids: int | None = None,
     distinct: bool = False,
 ) -> Expr:
     """Returns the value that is approximately at a given percentile of ``expr``."""
+    # TODO: enable num_centroids
+    num_centroids = None
     if num_centroids is None:
         return Expr(
             f.approx_percentile_cont(expr.expr, percentile.expr, distinct=distinct)
@@ -1257,7 +1259,7 @@ def approx_percentile_cont(
 
     return Expr(
         f.approx_percentile_cont(
-            expr.expr, percentile.expr, num_centroids, distinct=distinct
+            expr.expr, percentile.expr, distinct=distinct
         )
     )
 
diff --git a/src/functions.rs b/src/functions.rs
index 1bff77053..baa9d5232 100644
--- 
a/src/functions.rs
+++ b/src/functions.rs
@@ -37,6 +37,57 @@ use datafusion_expr::{
     lit, Expr, WindowFunctionDefinition,
 };
 
+#[pyfunction]
+pub fn approx_distinct(expression: PyExpr) -> PyExpr {
+    functions_aggregate::expr_fn::approx_distinct::approx_distinct(expression.expr).into()
+}
+
+#[pyfunction]
+pub fn approx_median(expression: PyExpr, distinct: bool) -> PyResult<PyExpr> {
+    // TODO: better builder pattern
+    let expr = functions_aggregate::expr_fn::approx_median(expression.expr);
+    if distinct {
+        Ok(expr.distinct().build()?.into())
+    } else {
+        Ok(expr.into())
+    }
+}
+
+#[pyfunction]
+pub fn approx_percentile_cont(
+    expression: PyExpr,
+    percentile: PyExpr,
+    distinct: bool,
+) -> PyResult<PyExpr> {
+    // TODO: better builder pattern
+    let expr =
+        functions_aggregate::expr_fn::approx_percentile_cont(expression.expr, percentile.expr);
+    if distinct {
+        Ok(expr.distinct().build()?.into())
+    } else {
+        Ok(expr.into())
+    }
+}
+
+#[pyfunction]
+pub fn approx_percentile_cont_with_weight(
+    expression: PyExpr,
+    weight: PyExpr,
+    percentile: PyExpr,
+    distinct: bool,
+) -> PyResult<PyExpr> {
+    let expr = functions_aggregate::expr_fn::approx_percentile_cont_with_weight(
+        expression.expr,
+        weight.expr,
+        percentile.expr,
+    );
+    if distinct {
+        Ok(expr.distinct().build()?.into())
+    } else {
+        Ok(expr.into())
+    }
+}
+
 #[pyfunction]
 pub fn sum(args: PyExpr) -> PyExpr {
     functions_aggregate::expr_fn::sum(args.expr).into()
@@ -727,13 +778,6 @@ array_fn!(list_resize, array_resize, array size value);
 array_fn!(flatten, array);
 array_fn!(range, start stop step);
 
-aggregate_function!(approx_distinct, ApproxDistinct);
-aggregate_function!(approx_median, ApproxMedian);
-aggregate_function!(approx_percentile_cont, ApproxPercentileCont);
-aggregate_function!(
-    approx_percentile_cont_with_weight,
-    ApproxPercentileContWithWeight
-);
 aggregate_function!(array_agg, ArrayAgg);
 aggregate_function!(avg, Avg);
 aggregate_function!(corr, Correlation);