Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add covar, covar_pop and covar_samp aggregate functions #1551

Merged
merged 13 commits into from
Jan 13, 2022
Merged
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ This library currently supports many SQL constructs, including
- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)`
- Many mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`.
- `WHERE` to filter
- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, `VAR`, `STDDEV` (sample and population)
- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`, and `VAR`, `COVAR`, `STDDEV` (each in sample and population variants)
- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST`

## Supported Functions
Expand Down
6 changes: 4 additions & 2 deletions ballista/rust/core/proto/ballista.proto
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,10 @@ enum AggregateFunction {
ARRAY_AGG = 6;
VARIANCE=7;
VARIANCE_POP=8;
// Existing values keep their original numbers so previously serialized plans
// still decode to the same function; new functions take the next free numbers.
STDDEV=9;
STDDEV_POP=10;
COVARIANCE=11;
COVARIANCE_POP=12;
}

message AggregateExprNode {
Expand Down
8 changes: 8 additions & 0 deletions ballista/rust/core/src/serde/logical_plan/to_proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,12 @@ impl TryInto<protobuf::LogicalExprNode> for &Expr {
AggregateFunction::VariancePop => {
protobuf::AggregateFunction::VariancePop
}
AggregateFunction::Covariance => {
protobuf::AggregateFunction::Covariance
}
AggregateFunction::CovariancePop => {
protobuf::AggregateFunction::CovariancePop
}
AggregateFunction::Stddev => protobuf::AggregateFunction::Stddev,
AggregateFunction::StddevPop => {
protobuf::AggregateFunction::StddevPop
Expand Down Expand Up @@ -1266,6 +1272,8 @@ impl From<&AggregateFunction> for protobuf::AggregateFunction {
AggregateFunction::ArrayAgg => Self::ArrayAgg,
AggregateFunction::Variance => Self::Variance,
AggregateFunction::VariancePop => Self::VariancePop,
AggregateFunction::Covariance => Self::Covariance,
AggregateFunction::CovariancePop => Self::CovariancePop,
AggregateFunction::Stddev => Self::Stddev,
AggregateFunction::StddevPop => Self::StddevPop,
}
Expand Down
4 changes: 4 additions & 0 deletions ballista/rust/core/src/serde/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ impl From<protobuf::AggregateFunction> for AggregateFunction {
protobuf::AggregateFunction::ArrayAgg => AggregateFunction::ArrayAgg,
protobuf::AggregateFunction::Variance => AggregateFunction::Variance,
protobuf::AggregateFunction::VariancePop => AggregateFunction::VariancePop,
protobuf::AggregateFunction::Covariance => AggregateFunction::Covariance,
protobuf::AggregateFunction::CovariancePop => {
AggregateFunction::CovariancePop
}
protobuf::AggregateFunction::Stddev => AggregateFunction::Stddev,
protobuf::AggregateFunction::StddevPop => AggregateFunction::StddevPop,
}
Expand Down
41 changes: 40 additions & 1 deletion datafusion/src/physical_plan/aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ use crate::physical_plan::distinct_expressions;
use crate::physical_plan::expressions;
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use expressions::{
avg_return_type, stddev_return_type, sum_return_type, variance_return_type,
avg_return_type, covariance_return_type, stddev_return_type, sum_return_type,
variance_return_type,
};
use std::{fmt, str::FromStr, sync::Arc};

Expand Down Expand Up @@ -74,6 +75,10 @@ pub enum AggregateFunction {
Stddev,
/// Standard Deviation (Population)
StddevPop,
/// Covariance (Sample)
Covariance,
/// Covariance (Population)
CovariancePop,
}

impl fmt::Display for AggregateFunction {
Expand All @@ -100,6 +105,9 @@ impl FromStr for AggregateFunction {
"stddev" => AggregateFunction::Stddev,
"stddev_samp" => AggregateFunction::Stddev,
"stddev_pop" => AggregateFunction::StddevPop,
"covar" => AggregateFunction::Covariance,
"covar_samp" => AggregateFunction::Covariance,
"covar_pop" => AggregateFunction::CovariancePop,
_ => {
return Err(DataFusionError::Plan(format!(
"There is no built-in function named {}",
Expand Down Expand Up @@ -134,6 +142,10 @@ pub fn return_type(
AggregateFunction::Sum => sum_return_type(&coerced_data_types[0]),
AggregateFunction::Variance => variance_return_type(&coerced_data_types[0]),
AggregateFunction::VariancePop => variance_return_type(&coerced_data_types[0]),
AggregateFunction::Covariance => covariance_return_type(&coerced_data_types[0]),
AggregateFunction::CovariancePop => {
covariance_return_type(&coerced_data_types[0])
}
AggregateFunction::Stddev => stddev_return_type(&coerced_data_types[0]),
AggregateFunction::StddevPop => stddev_return_type(&coerced_data_types[0]),
AggregateFunction::Avg => avg_return_type(&coerced_data_types[0]),
Expand Down Expand Up @@ -259,6 +271,30 @@ pub fn create_aggregate_expr(
"VAR_POP(DISTINCT) aggregations are not available".to_string(),
));
}
(AggregateFunction::Covariance, false) => Arc::new(expressions::Covariance::new(
coerced_phy_exprs[0].clone(),
coerced_phy_exprs[1].clone(),
name,
return_type,
)),
(AggregateFunction::Covariance, true) => {
return Err(DataFusionError::NotImplemented(
"COVAR(DISTINCT) aggregations are not available".to_string(),
));
}
(AggregateFunction::CovariancePop, false) => {
Arc::new(expressions::CovariancePop::new(
coerced_phy_exprs[0].clone(),
coerced_phy_exprs[1].clone(),
name,
return_type,
))
}
(AggregateFunction::CovariancePop, true) => {
return Err(DataFusionError::NotImplemented(
"COVAR_POP(DISTINCT) aggregations are not available".to_string(),
));
}
(AggregateFunction::Stddev, false) => Arc::new(expressions::Stddev::new(
coerced_phy_exprs[0].clone(),
name,
Expand Down Expand Up @@ -331,6 +367,9 @@ pub fn signature(fun: &AggregateFunction) -> Signature {
| AggregateFunction::StddevPop => {
Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable)
}
AggregateFunction::Covariance | AggregateFunction::CovariancePop => {
Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable)
}
}
}

Expand Down
22 changes: 20 additions & 2 deletions datafusion/src/physical_plan/coercion_rule/aggregate_rule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ use crate::arrow::datatypes::Schema;
use crate::error::{DataFusionError, Result};
use crate::physical_plan::aggregates::AggregateFunction;
use crate::physical_plan::expressions::{
is_avg_support_arg_type, is_stddev_support_arg_type, is_sum_support_arg_type,
is_variance_support_arg_type, try_cast,
is_avg_support_arg_type, is_covariance_support_arg_type, is_stddev_support_arg_type,
is_sum_support_arg_type, is_variance_support_arg_type, try_cast,
};
use crate::physical_plan::functions::{Signature, TypeSignature};
use crate::physical_plan::PhysicalExpr;
Expand Down Expand Up @@ -105,6 +105,24 @@ pub(crate) fn coerce_types(
}
Ok(input_types.to_vec())
}
AggregateFunction::Covariance => {
if !is_covariance_support_arg_type(&input_types[0]) {
return Err(DataFusionError::Plan(format!(
"The function {:?} does not support inputs of type {:?}.",
agg_fun, input_types[0]
)));
}
Ok(input_types.to_vec())
}
AggregateFunction::CovariancePop => {
if !is_covariance_support_arg_type(&input_types[0]) {
return Err(DataFusionError::Plan(format!(
"The function {:?} does not support inputs of type {:?}.",
agg_fun, input_types[0]
)));
}
Ok(input_types.to_vec())
}
AggregateFunction::Stddev => {
if !is_stddev_support_arg_type(&input_types[0]) {
return Err(DataFusionError::Plan(format!(
Expand Down
Loading