apache · ozankabak · Dec 25, 2023 · Dec 25, 2023 · Dec 25, 2023 · Dec 25, 2023
diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs
@@ -45,6 +45,7 @@ impl EliminateCrossJoin {
 /// 'select ... from a, b where (a.x = b.y and b.xx = 100) or (a.x = b.y and b.xx = 200);'
 /// 'select ... from a, b, c where (a.x = b.y and b.xx = 100 and a.z = c.z)
 /// or (a.x = b.y and b.xx = 200 and a.z=c.z);'
+/// 'select ... from a, b where a.x > b.y'
 /// For above queries, the join predicate is available in filters and they are moved to
 /// join nodes appropriately
 /// This fix helps to improve the performance of TPCH Q19. issue#78

diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
@@ -965,11 +965,10 @@ impl PushDownFilter {
     }
 }
 
-/// Convert cross join to join by pushing down filter predicate to the join condition
+/// Converts a cross join to an inner join with an empty equality predicate and an empty filter condition.
 fn convert_cross_join_to_inner_join(cross_join: CrossJoin) -> Result<Join> {
     let CrossJoin { left, right, .. } = cross_join;
     let join_schema = build_join_schema(left.schema(), right.schema(), &JoinType::Inner)?;
-    // predicate is given
     Ok(Join {
         left,
         right,
@@ -991,6 +990,13 @@ fn convert_to_cross_join_if_beneficial(plan: LogicalPlan) -> Result<LogicalPlan>
                 .cross_join(join.right.as_ref().clone())?
                 .build();
         }
+    } else if let LogicalPlan::Filter(filter) = &plan {
+        let new_input =
+            convert_to_cross_join_if_beneficial(filter.input.as_ref().clone())?;
+        return Ok(LogicalPlan::Filter(Filter::try_new(
+            filter.predicate.clone(),
+            Arc::new(new_input),
+        )?));
     }
     Ok(plan)
 }

diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow::array::ArrayRef;
 use async_trait::async_trait;
 use datafusion::execution::context::SessionState;
-use datafusion::logical_expr::Expr;
+use datafusion::logical_expr::{create_udf, Expr, ScalarUDF, Volatility};
+use datafusion::physical_expr::functions::make_scalar_function;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::SessionConfig;
 use datafusion::{
@@ -33,6 +35,7 @@ use datafusion::{
     datasource::{MemTable, TableProvider, TableType},
     prelude::{CsvReadOptions, SessionContext},
 };
+use datafusion_common::cast::as_float64_array;
 use datafusion_common::DataFusionError;
 use log::info;
 use std::collections::HashMap;
@@ -102,6 +105,8 @@ impl TestContext {
             }
             "joins.slt" => {
                 info!("Registering partition table tables");
+                let example = create_example_udf();
+                test_ctx.ctx.register_udf(example);
                 register_partition_table(&mut test_ctx).await;
             }
             "metadata.slt" => {
@@ -348,3 +353,58 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
 
     ctx.register_batch("table_with_metadata", batch).unwrap();
 }
+
+/// Creates a scalar UDF named "example" that adds its two Float64 arguments element-wise.
+fn create_example_udf() -> ScalarUDF {
+    // First, declare the actual implementation of the calculation
+    let adder = |args: &[ArrayRef]| {
+        // in DataFusion, all `args` and output are dynamically-typed arrays, which means that we need to:
+        // 1. cast the values to the type we want
+        // 2. perform the computation for every element in the array (using a loop or SIMD) and construct the result
+
+        // this is guaranteed by DataFusion based on the function's signature.
+        assert_eq!(args.len(), 2);
+
+        // 1. cast both arguments to f64. These casts MUST be aligned with the signature or this function panics!
+        let lhs = as_float64_array(&args[0]).expect("cast failed");
+        let rhs = as_float64_array(&args[1]).expect("cast failed");
+
+        // this is guaranteed by DataFusion. We place it just to make it obvious.
+        assert_eq!(lhs.len(), rhs.len());
+
+        // 2. perform the computation
+        let array = lhs
+            .iter()
+            .zip(rhs.iter())
+            .map(|(lhs, rhs)| {
+                match (lhs, rhs) {
+                    // in arrow, any value can be null.
+                    // Here we decide to make our UDF return null when either operand is null.
+                    (Some(lhs), Some(rhs)) => Some(lhs + rhs),
+                    _ => None,
+                }
+            })
+            .collect::<Float64Array>();
+
+        // `Ok` because no error occurred during the calculation
+        // `Arc` because arrays are immutable, thread-safe, trait objects.
+        Ok(Arc::new(array) as ArrayRef)
+    };
+    // the function above expects an `ArrayRef`, but DataFusion may pass a scalar to a UDF.
+    // thus, we use `make_scalar_function` to decorate the closure so that it can handle both Arrays and Scalar values.
+    let adder = make_scalar_function(adder);
+
+    // Next:
+    // * give it a name so that it shows nicely when the plan is printed
+    // * declare what input it expects
+    // * declare its return type
+    create_udf(
+        "example",
+        // expects two f64
+        vec![DataType::Float64, DataType::Float64],
+        // returns f64
+        Arc::new(DataType::Float64),
+        Volatility::Immutable,
+        adder,
+    )
+}
diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt
@@ -3483,6 +3483,28 @@ NestedLoopJoinExec: join_type=Inner, filter=a@0 > a@1
 ----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true
 --CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true
 
+# Currently DataFusion cannot push down filter conditions with scalar UDFs into
+# a cross join.
+query TT
+EXPLAIN SELECT *
+FROM annotated_data as t1, annotated_data as t2
+WHERE EXAMPLE(t1.a, t2.a) > 3
+----
+logical_plan
+Filter: example(CAST(t1.a AS Float64), CAST(t2.a AS Float64)) > Float64(3)
+--CrossJoin:
+----SubqueryAlias: t1
+------TableScan: annotated_data projection=[a0, a, b, c, d]
+----SubqueryAlias: t2
+------TableScan: annotated_data projection=[a0, a, b, c, d]
+physical_plan
+CoalesceBatchesExec: target_batch_size=2
+--FilterExec: example(CAST(a@1 AS Float64), CAST(a@6 AS Float64)) > 3
+----CrossJoinExec
+------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true
+------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true
+
 ####
 # Config teardown
 ####