From e9368a2a6583420e2b0cdcfd64255e5aa20a208e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Feb 2022 16:28:34 -0500 Subject: [PATCH 1/4] API to get Expr type and nullability without a `DFSchema` --- datafusion/src/logical_plan/expr.rs | 52 +++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index a1e51e07422e..70505b1b54f2 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -390,20 +390,54 @@ impl PartialOrd for Expr { } } +/// Provides schema information needed by [Expr] methods such as +/// [Expr::nullable] and [Expr::data_type] +/// +/// Note that this trait is implemented for &[DFSchema] which is +/// widely used in the DataFusion codebase. +pub trait ExprSchema { + /// Is this column reference nullable? + fn nullable(&self, col: &Column) -> Result; + + /// What is the datatype of this column? + fn data_type(&self, col: &Column) -> Result<&DataType>; +} + +// Implement for Arc +impl> ExprSchema for P { + fn nullable(&self, col: &Column) -> Result { + self.as_ref().nullable(col) + } + + fn data_type(&self, col: &Column) -> Result<&DataType> { + self.as_ref().data_type(col) + } +} + +impl ExprSchema for DFSchema { + fn nullable(&self, col: &Column) -> Result { + Ok(self.field_from_column(col)?.is_nullable()) + } + + fn data_type(&self, col: &Column) -> Result<&DataType> { + Ok(self.field_from_column(col)?.data_type()) + } +} + impl Expr { - /// Returns the [arrow::datatypes::DataType] of the expression based on [arrow::datatypes::Schema]. + /// Returns the [arrow::datatypes::DataType] of the expression based on [DFSchema]. /// /// # Errors /// /// This function errors when it is not possible to compute its [arrow::datatypes::DataType]. /// This happens when e.g. the expression refers to a column that does not exist in the schema, or when /// the expression is incorrectly typed (e.g. `[utf8] + [bool]`). - pub fn get_type(&self, schema: &DFSchema) -> Result { + pub fn get_type(&self, schema: &S) -> Result { match self { Expr::Alias(expr, _) | Expr::Sort { expr, .. } | Expr::Negative(expr) => { expr.get_type(schema) } - Expr::Column(c) => Ok(schema.field_from_column(c)?.data_type().clone()), + Expr::Column(c) => Ok(schema.data_type(c)?.clone()), Expr::ScalarVariable(_) => Ok(DataType::Utf8), Expr::Literal(l) => Ok(l.get_datatype()), Expr::Case { when_then_expr, .. } => when_then_expr[0].1.get_type(schema), @@ -470,13 +504,13 @@ impl Expr { } } - /// Returns the nullability of the expression based on [arrow::datatypes::Schema]. + /// Returns the nullability of the expression based on [DFSchema]. /// /// # Errors /// /// This function errors when it is not possible to compute its nullability. /// This happens when the expression refers to a column that does not exist in the schema. - pub fn nullable(&self, input_schema: &DFSchema) -> Result { + pub fn nullable(&self, input_schema: &S) -> Result { match self { Expr::Alias(expr, _) | Expr::Not(expr) @@ -484,7 +518,7 @@ impl Expr { | Expr::Sort { expr, .. } | Expr::Between { expr, .. } | Expr::InList { expr, .. } => expr.nullable(input_schema), - Expr::Column(c) => Ok(input_schema.field_from_column(c)?.is_nullable()), + Expr::Column(c) => input_schema.nullable(c), Expr::Literal(value) => Ok(value.is_null()), Expr::Case { when_then_expr, @@ -559,7 +593,11 @@ impl Expr { /// /// This function errors when it is impossible to cast the /// expression to the target [arrow::datatypes::DataType]. - pub fn cast_to(self, cast_to_type: &DataType, schema: &DFSchema) -> Result { + pub fn cast_to( + self, + cast_to_type: &DataType, + schema: &S, + ) -> Result { // TODO(kszucs): most of the operations do not validate the type correctness // like all of the binary expressions below. Perhaps Expr should track the // type of the expression? From 50e28119f246c6b842ad62427662c171fbcfbcd6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Feb 2022 16:40:09 -0500 Subject: [PATCH 2/4] Add test --- datafusion/src/logical_plan/expr.rs | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 70505b1b54f2..bfc8eb2b051d 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -2527,4 +2527,57 @@ mod tests { combine_filters(&[filter1.clone(), filter2.clone(), filter3.clone()]); assert_eq!(result, Some(and(and(filter1, filter2), filter3))); } + + #[test] + fn expr_schema_nullability() { + let expr = col("foo").eq(lit(1)); + assert!(!expr.nullable(&MockExprSchema::new()).unwrap()); + assert!(expr + .nullable(&MockExprSchema::new().with_nullable(true)) + .unwrap()); + } + + #[test] + fn expr_schema_data_type() { + let expr = col("foo"); + assert_eq!( + DataType::Utf8, + expr.get_type(&MockExprSchema::new().with_data_type(DataType::Utf8)) + .unwrap() + ); + } + + struct MockExprSchema { + nullable: bool, + data_type: DataType, + } + + impl MockExprSchema { + fn new() -> Self { + Self { + nullable: false, + data_type: DataType::Null, + } + } + + fn with_nullable(mut self, nullable: bool) -> Self { + self.nullable = nullable; + self + } + + fn with_data_type(mut self, data_type: DataType) -> Self { + self.data_type = data_type; + self + } + } + + impl ExprSchema for MockExprSchema { + fn nullable(&self, _col: &Column) -> Result { + Ok(self.nullable) + } + + fn data_type(&self, _col: &Column) -> Result<&DataType> { + Ok(&self.data_type) + } + } } From b6b1f9377ff23e6c3e9a67c87e7c728bd626217f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 2 Feb 2022 13:49:40 -0500 Subject: [PATCH 3/4] publically export --- datafusion/src/logical_plan/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 25714514d78a..22521a1bd1fb 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -46,8 +46,8 @@ pub use expr::{ rewrite_sort_cols_by_aggs, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, unalias, unnormalize_col, unnormalize_cols, upper, when, - Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion, - SimplifyInfo, + Column, Expr, ExprRewriter, ExprSchema, ExpressionVisitor, Literal, Recursion, + RewriteRecursion, SimplifyInfo, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; From cd91159de66c336354a5d3b77bd80813b1ddfa8d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Feb 2022 11:26:51 -0500 Subject: [PATCH 4/4] Improve docs --- datafusion/src/logical_plan/expr.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index de31aac99a41..300c75137d8a 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -393,7 +393,7 @@ impl PartialOrd for Expr { } /// Provides schema information needed by [Expr] methods such as -/// [Expr::nullable] and [Expr::data_type] +/// [Expr::nullable] and [Expr::data_type]. /// /// Note that this trait is implemented for &[DFSchema] which is /// widely used in the DataFusion codebase. @@ -405,7 +405,7 @@ pub trait ExprSchema { fn data_type(&self, col: &Column) -> Result<&DataType>; } -// Implement for Arc +// Implement `ExprSchema` for `Arc` impl> ExprSchema for P { fn nullable(&self, col: &Column) -> Result { self.as_ref().nullable(col) @@ -427,13 +427,18 @@ impl ExprSchema for DFSchema { } impl Expr { - /// Returns the [arrow::datatypes::DataType] of the expression based on [DFSchema]. + /// Returns the [arrow::datatypes::DataType] of the expression + /// based on [ExprSchema] + /// + /// Note: [DFSchema] implements [ExprSchema]. /// /// # Errors /// - /// This function errors when it is not possible to compute its [arrow::datatypes::DataType]. - /// This happens when e.g. the expression refers to a column that does not exist in the schema, or when - /// the expression is incorrectly typed (e.g. `[utf8] + [bool]`). + /// This function errors when it is not possible to compute its + /// [arrow::datatypes::DataType]. This happens when e.g. the + /// expression refers to a column that does not exist in the + /// schema, or when the expression is incorrectly typed + /// (e.g. `[utf8] + [bool]`). pub fn get_type(&self, schema: &S) -> Result { match self { Expr::Alias(expr, _) | Expr::Sort { expr, .. } | Expr::Negative(expr) => { @@ -506,12 +511,15 @@ impl Expr { } } - /// Returns the nullability of the expression based on [DFSchema]. + /// Returns the nullability of the expression based on [ExprSchema]. + /// + /// Note: [DFSchema] implements [ExprSchema]. /// /// # Errors /// - /// This function errors when it is not possible to compute its nullability. - /// This happens when the expression refers to a column that does not exist in the schema. + /// This function errors when it is not possible to compute its + /// nullability. This happens when the expression refers to a + /// column that does not exist in the schema. pub fn nullable(&self, input_schema: &S) -> Result { match self { Expr::Alias(expr, _)