chore: Upgrade to latest DataFusion revision #909

Merged: 19 commits, Sep 5, 2024
Changes from 13 commits
133 changes: 58 additions & 75 deletions native/Cargo.lock

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions native/Cargo.toml
@@ -39,14 +39,14 @@ arrow-buffer = { version = "52.2.0" }
arrow-data = { version = "52.2.0" }
arrow-schema = { version = "52.2.0" }
parquet = { version = "52.2.0", default-features = false, features = ["experimental"] }
datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "dff590b" }
datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "dff590b", features = ["unicode_expressions", "crypto_expressions"] }
datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", features = ["crypto_expressions"] }
datafusion-functions-nested = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", default-features = false }
datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", default-features = false }
datafusion-execution = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", default-features = false }
datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", default-features = false }
datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "dff590b", default-features = false }
datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5" }
datafusion = { default-features = false, git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", features = ["unicode_expressions", "crypto_expressions"] }
datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", features = ["crypto_expressions"] }
datafusion-functions-nested = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", default-features = false }
datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", default-features = false }
datafusion-execution = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", default-features = false }
datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", default-features = false }
datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "e5a6cd5", default-features = false }
datafusion-comet-spark-expr = { path = "spark-expr", version = "0.3.0" }
datafusion-comet-proto = { path = "proto", version = "0.3.0" }
chrono = { version = "0.4", default-features = false, features = ["clock"] }
45 changes: 28 additions & 17 deletions native/core/src/execution/datafusion/expressions/avg.rs
@@ -15,7 +15,6 @@
// specific language governing permissions and limitations
// under the License.

use crate::execution::datafusion::expressions::utils::down_cast_any_ref;
use arrow::compute::sum;
use arrow_array::{
builder::PrimitiveBuilder,
@@ -25,20 +24,24 @@ use arrow_array::{
};
use arrow_schema::{DataType, Field};
use datafusion::logical_expr::{
type_coercion::aggregates::avg_return_type, Accumulator, EmitTo, GroupsAccumulator,
type_coercion::aggregates::avg_return_type, Accumulator, EmitTo, GroupsAccumulator, Signature,
};
use datafusion_common::{not_impl_err, Result, ScalarValue};
use datafusion_physical_expr::{expressions::format_state_name, AggregateExpr, PhysicalExpr};
use datafusion_physical_expr::{expressions::format_state_name, PhysicalExpr};
use std::{any::Any, sync::Arc};

use arrow_array::ArrowNativeTypeOp;

use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::Volatility::Immutable;
use datafusion_expr::{AggregateUDFImpl, ReversedUDAF};
use DataType::*;

/// AVG aggregate expression
#[derive(Debug, Clone)]
pub struct Avg {
name: String,
signature: Signature,
expr: Arc<dyn PhysicalExpr>,
input_data_type: DataType,
result_data_type: DataType,
@@ -51,24 +54,21 @@ impl Avg {

Self {
name: name.into(),
signature: Signature::user_defined(Immutable),
expr,
input_data_type: data_type,
result_data_type,
}
}
}

impl AggregateExpr for Avg {
impl AggregateUDFImpl for Avg {
/// Return a reference to Any that can be used for downcasting
fn as_any(&self) -> &dyn Any {
self
}

fn field(&self) -> Result<Field> {
Ok(Field::new(&self.name, self.result_data_type.clone(), true))
}

fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
// instantiate specialized accumulator based for the type
match (&self.input_data_type, &self.result_data_type) {
(Float64, Float64) => Ok(Box::<AvgAccumulator>::default()),
@@ -80,7 +80,7 @@ impl AggregateExpr for Avg {
}
}

fn state_fields(&self) -> Result<Vec<Field>> {
fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
Ok(vec![
Field::new(
format_state_name(&self.name, "sum"),
@@ -95,19 +95,22 @@
])
}

fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
vec![Arc::clone(&self.expr)]
}

fn name(&self) -> &str {
&self.name
}

fn groups_accumulator_supported(&self) -> bool {
fn reverse_expr(&self) -> ReversedUDAF {
ReversedUDAF::Identical
}

fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
true
}

fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
fn create_groups_accumulator(
&self,
_args: AccumulatorArgs,
) -> Result<Box<dyn GroupsAccumulator>> {
// instantiate specialized accumulator based for the type
match (&self.input_data_type, &self.result_data_type) {
(Float64, Float64) => Ok(Box::new(AvgGroupsAccumulator::<Float64Type, _>::new(
@@ -126,6 +129,14 @@
fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
Ok(ScalarValue::Float64(None))
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
avg_return_type(self.name(), &arg_types[0])
}
}

impl PartialEq<dyn Any> for Avg {
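Note on the change above (and the files that follow): Comet's aggregates move from the removed `AggregateExpr` physical trait to DataFusion's `AggregateUDFImpl`, dropping `field`/`expressions` and picking up `signature`/`return_type`. The sketch below is not code from this PR; it only illustrates the target trait surface, with `MyAgg`, its `"my_agg"` name, and the `Float64` return type chosen for the example.

```rust
use std::any::Any;
use std::sync::Arc;

use arrow_schema::DataType;
use datafusion_common::{not_impl_err, Result};
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature, Volatility};

/// Illustrative UDAF skeleton; Comet's Avg/AvgDecimal/Correlation follow this shape.
#[derive(Debug)]
struct MyAgg {
    signature: Signature,
}

impl AggregateUDFImpl for MyAgg {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "my_agg"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Float64)
    }

    fn accumulator(&self, _args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
        // A real implementation returns a concrete Accumulator here, as avg.rs does.
        not_impl_err!("sketch only")
    }
}

fn main() {
    // Wrapping the impl yields an AggregateUDF that the physical planner can consume.
    let udf = Arc::new(AggregateUDF::new_from_impl(MyAgg {
        signature: Signature::user_defined(Volatility::Immutable),
    }));
    println!("registered UDAF: {}", udf.name());
}
```

Methods such as `state_fields`, `groups_accumulator_supported`, and `reverse_expr` have default implementations on `AggregateUDFImpl`, which is why the diffs here only override the ones Comet needs.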
46 changes: 27 additions & 19 deletions native/core/src/execution/datafusion/expressions/avg_decimal.rs
@@ -15,7 +15,6 @@
// specific language governing permissions and limitations
// under the License.

use crate::execution::datafusion::expressions::utils::down_cast_any_ref;
use arrow::{array::BooleanBufferBuilder, buffer::NullBuffer, compute::sum};
use arrow_array::{
builder::PrimitiveBuilder,
@@ -24,23 +23,28 @@ use arrow_array::{
Array, ArrayRef, Decimal128Array, Int64Array, PrimitiveArray,
};
use arrow_schema::{DataType, Field};
use datafusion::logical_expr::{Accumulator, EmitTo, GroupsAccumulator};
use datafusion::logical_expr::{Accumulator, EmitTo, GroupsAccumulator, Signature};
use datafusion_common::{not_impl_err, Result, ScalarValue};
use datafusion_physical_expr::{expressions::format_state_name, AggregateExpr, PhysicalExpr};
use datafusion_physical_expr::{expressions::format_state_name, PhysicalExpr};
use std::{any::Any, sync::Arc};

use arrow_array::ArrowNativeTypeOp;
use arrow_data::decimal::{
validate_decimal_precision, MAX_DECIMAL_FOR_EACH_PRECISION, MIN_DECIMAL_FOR_EACH_PRECISION,
};

use datafusion::logical_expr::Volatility::Immutable;
use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::avg_return_type;
use datafusion_expr::{AggregateUDFImpl, ReversedUDAF};
use num::{integer::div_ceil, Integer};
use DataType::*;

/// AVG aggregate expression
#[derive(Debug, Clone)]
pub struct AvgDecimal {
name: String,
signature: Signature,
expr: Arc<dyn PhysicalExpr>,
sum_data_type: DataType,
result_data_type: DataType,
@@ -56,24 +60,21 @@ impl AvgDecimal {
) -> Self {
Self {
name: name.into(),
signature: Signature::user_defined(Immutable),
expr,
result_data_type: result_type,
sum_data_type: sum_type,
}
}
}

impl AggregateExpr for AvgDecimal {
impl AggregateUDFImpl for AvgDecimal {
/// Return a reference to Any that can be used for downcasting
fn as_any(&self) -> &dyn Any {
self
}

fn field(&self) -> Result<Field> {
Ok(Field::new(&self.name, self.result_data_type.clone(), true))
}

fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
match (&self.sum_data_type, &self.result_data_type) {
(Decimal128(sum_precision, sum_scale), Decimal128(target_precision, target_scale)) => {
Ok(Box::new(AvgDecimalAccumulator::new(
@@ -91,7 +92,7 @@ impl AggregateExpr for AvgDecimal {
}
}

fn state_fields(&self) -> Result<Vec<Field>> {
fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
Ok(vec![
Field::new(
format_state_name(&self.name, "sum"),
@@ -106,23 +107,22 @@
])
}

fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
vec![Arc::clone(&self.expr)]
}

fn name(&self) -> &str {
&self.name
}

fn reverse_expr(&self) -> Option<Arc<dyn AggregateExpr>> {
None
fn reverse_expr(&self) -> ReversedUDAF {
ReversedUDAF::Identical
}

fn groups_accumulator_supported(&self) -> bool {
fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
true
}

fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
fn create_groups_accumulator(
&self,
_args: AccumulatorArgs,
) -> Result<Box<dyn GroupsAccumulator>> {
// instantiate specialized accumulator based for the type
match (&self.sum_data_type, &self.result_data_type) {
(Decimal128(sum_precision, sum_scale), Decimal128(target_precision, target_scale)) => {
@@ -154,6 +154,14 @@
),
}
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
avg_return_type(self.name(), &arg_types[0])
}
}

impl PartialEq<dyn Any> for AvgDecimal {
(changes to an additional file; filename not rendered)
@@ -26,12 +26,11 @@ use arrow::{
datatypes::{DataType, Schema},
record_batch::RecordBatch,
};
use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion::{error::DataFusionError, logical_expr::ColumnarValue};
use datafusion_common::{Result, ScalarValue};
use datafusion_physical_expr::PhysicalExpr;

use crate::execution::datafusion::expressions::utils::down_cast_any_ref;

macro_rules! compute_op {
($OPERAND:expr, $DT:ident) => {{
let operand = $OPERAND
(changes to an additional file; filename not rendered)
@@ -21,9 +21,10 @@ use crate::{
use arrow::record_batch::RecordBatch;
use arrow_array::cast::as_primitive_array;
use arrow_schema::{DataType, Schema};
use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion::physical_plan::ColumnarValue;
use datafusion_common::{internal_err, Result, ScalarValue};
use datafusion_physical_expr::{aggregate::utils::down_cast_any_ref, PhysicalExpr};
use datafusion_physical_expr::PhysicalExpr;
use std::{
any::Any,
fmt::Display,
(changes to an additional file; filename not rendered)
@@ -29,11 +29,10 @@ use arrow::{
};
use arrow_schema::{DataType, Schema};
use datafusion::logical_expr::ColumnarValue;
use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion_common::{DataFusionError, ScalarValue};
use datafusion_physical_expr::PhysicalExpr;

use crate::execution::datafusion::expressions::utils::down_cast_any_ref;

/// This is from Spark `CheckOverflow` expression. Spark `CheckOverflow` expression rounds decimals
/// to given scale and check if the decimals can fit in given precision. As `cast` kernel rounds
/// decimals already, Comet `CheckOverflow` expression only checks if the decimals can fit in the
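The doc comment above (truncated by the collapsed diff) describes `CheckOverflow`: `cast` has already rounded the decimal, so the expression only has to confirm the value fits the target precision. The `validate_decimal_precision` helper imported in avg_decimal.rs earlier in this diff performs that kind of check; here is a small standalone illustration with made-up values, not code from the PR.

```rust
use arrow_data::decimal::validate_decimal_precision;

fn main() {
    // A Decimal128 with precision 4 can hold at most 9_999 (unscaled), so
    // 12_345 overflows at precision 4 but fits at precision 5.
    assert!(validate_decimal_precision(12_345_i128, 4).is_err());
    assert!(validate_decimal_precision(12_345_i128, 5).is_ok());
    println!("precision checks behaved as expected");
}
```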
42 changes: 23 additions & 19 deletions native/core/src/execution/datafusion/expressions/correlation.rs
@@ -21,15 +21,18 @@ use std::{any::Any, sync::Arc};

use crate::execution::datafusion::expressions::{
covariance::CovarianceAccumulator, stats::StatsType, stddev::StddevAccumulator,
utils::down_cast_any_ref,
};
use arrow::{
array::ArrayRef,
datatypes::{DataType, Field},
};
use datafusion::logical_expr::Accumulator;
use datafusion::physical_expr_common::physical_expr::down_cast_any_ref;
use datafusion_common::{Result, ScalarValue};
use datafusion_physical_expr::{expressions::format_state_name, AggregateExpr, PhysicalExpr};
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::NUMERICS;
use datafusion_expr::{AggregateUDFImpl, Signature, Volatility};
use datafusion_physical_expr::{expressions::format_state_name, PhysicalExpr};

/// CORR aggregate expression
/// The implementation mostly is the same as the DataFusion's implementation. The reason
@@ -39,6 +42,7 @@ use datafusion_physical_expr::{expressions::format_state_name, AggregateExpr, Ph
#[derive(Debug)]
pub struct Correlation {
name: String,
signature: Signature,
expr1: Arc<dyn PhysicalExpr>,
expr2: Arc<dyn PhysicalExpr>,
null_on_divide_by_zero: bool,
@@ -56,30 +60,42 @@ impl Correlation {
assert!(matches!(data_type, DataType::Float64));
Self {
name: name.into(),
signature: Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable),
expr1,
expr2,
null_on_divide_by_zero,
}
}
}

impl AggregateExpr for Correlation {
impl AggregateUDFImpl for Correlation {
/// Return a reference to Any that can be used for downcasting
fn as_any(&self) -> &dyn Any {
self
}

fn field(&self) -> Result<Field> {
Ok(Field::new(&self.name, DataType::Float64, true))
fn name(&self) -> &str {
&self.name
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
Ok(DataType::Float64)
}
fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
Ok(ScalarValue::Float64(None))
}

fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
Ok(Box::new(CorrelationAccumulator::try_new(
self.null_on_divide_by_zero,
)?))
}

fn state_fields(&self) -> Result<Vec<Field>> {
fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
Ok(vec![
Field::new(
format_state_name(&self.name, "count"),
@@ -113,18 +129,6 @@
),
])
}

fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
vec![Arc::clone(&self.expr1), Arc::clone(&self.expr2)]
}

fn name(&self) -> &str {
&self.name
}

fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
Ok(ScalarValue::Float64(None))
}
}

impl PartialEq<dyn Any> for Correlation {
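For context on the `CorrelationAccumulator` used above: Pearson correlation is the covariance divided by the product of the two standard deviations, and `null_on_divide_by_zero` governs the zero-denominator case. The helper below is illustrative only; treating the flag as "return SQL NULL instead of NaN, matching Spark" is an assumption, not something stated in this diff.

```rust
/// Illustrative only: combine covariance and standard deviations into a correlation,
/// returning None (SQL NULL) on a zero denominator when the flag is set.
fn correlation(cov: f64, stddev1: f64, stddev2: f64, null_on_divide_by_zero: bool) -> Option<f64> {
    let denom = stddev1 * stddev2;
    if denom == 0.0 {
        // Assumption: mirroring Spark, a zero denominator yields NULL rather than NaN
        // when null_on_divide_by_zero is true.
        return if null_on_divide_by_zero { None } else { Some(f64::NAN) };
    }
    Some(cov / denom)
}

fn main() {
    assert_eq!(correlation(2.0, 1.0, 4.0, true), Some(0.5));
    assert_eq!(correlation(2.0, 0.0, 4.0, true), None);
    println!("correlation helper behaved as expected");
}
```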