diff --git a/Cargo.lock b/Cargo.lock index 4e110789bfda..cbb2f381d648 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -361,9 +361,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9b3aaba47ed4b6146563c8b79ad0f7aa283f794cde0c057c656291b81196746" +checksum = "cf7806ee3d229ee866013e83446e937ab3c8a9e6a664b259d41dd960b309c5d0" dependencies = [ "arrow-arith", "arrow-array", @@ -674,9 +674,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.58.0" +version = "1.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ff718c9ee45cc1ebd4774a0e086bb80a6ab752b4902edf1c9f56b86ee1f770" +checksum = "00a35fc7e74f5be45839eb753568535c074a592185dd0a2d406685018d581c43" dependencies = [ "aws-credential-types", "aws-runtime", @@ -696,9 +696,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.59.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5183e088715cc135d8d396fdd3bc02f018f0da4c511f53cb8d795b6a31c55809" +checksum = "f8fa655b4f313124ce272cbc38c5fef13793c832279cec750103e5e6b71a54b8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -718,9 +718,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.59.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9f944ef032717596639cea4a2118a3a457268ef51bbb5fde9637e54c465da00" +checksum = "dc1cfe5e16b90421ea031f4c6348b534ef442e76f6bf4a1b2b592c12cc2c6af9" dependencies = [ "aws-credential-types", "aws-runtime", @@ -741,9 +741,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -1035,15 +1035,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "1230237285e3e10cde447185e8975408ae24deaa67205ce684805c25bc0c7937" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", + "memmap2", ] [[package]] @@ -1077,7 +1078,7 @@ dependencies = [ "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -1248,9 +1249,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.13" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -1345,9 +1346,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.29" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" +checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d" dependencies = [ "clap_builder", "clap_derive", @@ -1355,9 +1356,9 @@ dependencies = [ [[package]] name = "clap_builder" 
-version = "4.5.29" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" +checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c" dependencies = [ "anstream", "anstyle", @@ -1549,7 +1550,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.29", + "clap 4.5.30", "criterion-plot", "futures", "is-terminal", @@ -1642,9 +1643,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -1850,7 +1851,7 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", - "clap 4.5.29", + "clap 4.5.30", "ctor", "datafusion", "dirs", @@ -1917,6 +1918,8 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", "flate2", "futures", @@ -2229,6 +2232,7 @@ version = "45.0.0" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2343,7 +2347,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.29", + "clap 4.5.30", "datafusion", "env_logger", "futures", @@ -2562,9 +2566,9 @@ dependencies = [ [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" @@ -2923,9 +2927,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", @@ -3133,7 +3137,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "httparse", @@ -3186,7 +3190,7 @@ dependencies = [ "http 1.2.0", "hyper 1.6.0", "hyper-util", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", @@ -3676,7 +3680,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.29", + "clap 4.5.30", "escape8259", ] @@ -3750,6 +3754,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.9.1" @@ -3786,9 +3799,9 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = 
"b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] @@ -4043,9 +4056,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4484,9 +4497,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -4601,7 +4614,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.22", + "rustls 0.23.23", "socket2", "thiserror 2.0.11", "tokio", @@ -4619,7 +4632,7 @@ dependencies = [ "rand 0.8.5", "ring", "rustc-hash", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-pki-types", "slab", "thiserror 2.0.11", @@ -4630,9 +4643,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -4685,8 +4698,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha 0.9.0", - "rand_core 0.9.0", - "zerocopy 0.8.17", + "rand_core 0.9.1", + "zerocopy 0.8.18", ] [[package]] @@ -4706,7 +4719,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.0", + "rand_core 0.9.1", ] [[package]] @@ -4720,12 +4733,12 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff" +checksum = "a88e0da7a2c97baa202165137c158d0a2e824ac465d13d81046727b34cb247d3" dependencies = [ "getrandom 0.3.1", - "zerocopy 0.8.17", + "zerocopy 0.8.18", ] [[package]] @@ -4886,7 +4899,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -4901,7 +4914,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -4924,15 +4937,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -5078,9 +5090,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" 
+checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "once_cell", "ring", @@ -5496,9 +5508,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "snafu" @@ -5555,17 +5567,11 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sqllogictest" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07a06aea5e52b0a63b9d8328b46ea2740cdab4cac13def8ef4f2e5288610f9ed" +checksum = "6f1c93848602f92e5925690d4805ccbc1ccdb61bee7d4ae79ad6862b542a539c" dependencies = [ "async-trait", "educe", @@ -5850,9 +5856,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.17.0" +version = "3.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a40f762a77d2afa88c2d919489e390a12bdd261ed568e60cfa7e48d4e20f0d33" +checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" dependencies = [ "cfg-if", "fastrand", @@ -6123,7 +6129,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.22", + "rustls 0.23.23", "tokio", ] @@ -6194,7 +6200,7 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -6350,9 +6356,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typify" @@ -6502,9 +6508,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" dependencies = [ "getrandom 0.3.1", "js-sys", @@ -7084,11 +7090,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.17" +version = "0.8.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa91407dacce3a68c56de03abe2760159582b846c6a4acd2f456618087f12713" +checksum = "79386d31a42a4996e3336b0919ddb90f81112af416270cff95b5f5af22b839c2" dependencies = [ - "zerocopy-derive 0.8.17", + "zerocopy-derive 0.8.18", ] [[package]] @@ -7104,9 +7110,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.17" +version = "0.8.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06718a168365cad3d5ff0bb133aad346959a2074bd4a85c121255a11304a8626" +checksum = "76331675d372f91bf8d17e13afbd5fe639200b73d01f0fc748bb059f9cca2db7" dependencies = [ "proc-macro2", "quote", diff --git a/datafusion-cli/src/functions.rs 
b/datafusion-cli/src/functions.rs index b5bcb8243ea9..13d2d5fd3547 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -28,10 +28,10 @@ use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion::catalog::{Session, TableFunctionImpl}; use datafusion::common::{plan_err, Column}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::logical_expr::Expr; -use datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::ExecutionPlan; use datafusion::scalar::ScalarValue; diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/parquet_exec_visitor.rs index 20809a1121c1..6c9f1a354430 100644 --- a/datafusion-examples/examples/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/parquet_exec_visitor.rs @@ -20,10 +20,10 @@ use std::sync::Arc; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, PartitionedFile}; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::error::DataFusionError; use datafusion::execution::context::SessionContext; use datafusion::physical_plan::metrics::MetricValue; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::{ execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, }; diff --git a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/remote_catalog.rs index f84c6a0302ce..70c0963545e0 100644 --- a/datafusion-examples/examples/remote_catalog.rs +++ b/datafusion-examples/examples/remote_catalog.rs @@ -36,9 +36,9 @@ use datafusion::catalog::TableProvider; use datafusion::catalog::{AsyncSchemaProvider, Session}; use datafusion::common::Result; use datafusion::common::{assert_batches_eq, internal_datafusion_err, plan_err}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::SendableRecordBatchStream; use datafusion::logical_expr::{Expr, TableType}; -use datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{DataFrame, SessionContext}; diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/simple_udtf.rs index afba4c390f71..d2b2d1bf9655 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/simple_udtf.rs @@ -23,13 +23,13 @@ use datafusion::arrow::record_batch::RecordBatch; use datafusion::catalog::Session; use datafusion::catalog::TableFunctionImpl; use datafusion::common::{plan_err, ScalarValue}; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{Expr, TableType}; use datafusion::optimizer::simplify_expressions::ExprSimplifier; -use datafusion::physical_plan::memory::MemorySourceConfig; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; use std::fs::File; diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 784b2a89aae9..0c3796a95300 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ 
-222,3 +222,7 @@ required-features = ["nested_expressions"] [[bench]] harness = false name = "dataframe" + +[[bench]] +harness = false +name = "spm" diff --git a/datafusion/core/benches/physical_plan.rs b/datafusion/core/benches/physical_plan.rs index 53c245ecc2b5..aae1457ab9e6 100644 --- a/datafusion/core/benches/physical_plan.rs +++ b/datafusion/core/benches/physical_plan.rs @@ -33,9 +33,9 @@ use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMerge use datafusion::physical_plan::{ collect, expressions::{col, PhysicalSortExpr}, - memory::MemorySourceConfig, }; use datafusion::prelude::SessionContext; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_physical_expr_common::sort_expr::LexOrdering; // Initialize the operator using the provided record batches and the sort key diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs index 4d71d4c56a6d..8f0b3753f67c 100644 --- a/datafusion/core/benches/sort.rs +++ b/datafusion/core/benches/sort.rs @@ -79,12 +79,13 @@ use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::{ execution::context::TaskContext, physical_plan::{ - coalesce_partitions::CoalescePartitionsExec, memory::MemorySourceConfig, + coalesce_partitions::CoalescePartitionsExec, sorts::sort_preserving_merge::SortPreservingMergeExec, ExecutionPlan, ExecutionPlanProperties, }, prelude::SessionContext, }; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_physical_expr::{expressions::col, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; diff --git a/datafusion/physical-plan/benches/spm.rs b/datafusion/core/benches/spm.rs similarity index 98% rename from datafusion/physical-plan/benches/spm.rs rename to datafusion/core/benches/spm.rs index 3a2ecb57394b..63b06f20cd86 100644 --- a/datafusion/physical-plan/benches/spm.rs +++ b/datafusion/core/benches/spm.rs @@ -27,7 +27,7 @@ use datafusion_physical_plan::{collect, ExecutionPlan}; use criterion::async_executor::FuturesExecutor; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_physical_plan::memory::MemorySourceConfig; +use datafusion_datasource::memory::MemorySourceConfig; fn generate_spm_for_round_robin_tie_breaker( has_same_value: bool, diff --git a/datafusion/core/src/datasource/empty.rs b/datafusion/core/src/datasource/empty.rs index abda7fa9ec4b..77686c5eb7c2 100644 --- a/datafusion/core/src/datasource/empty.rs +++ b/datafusion/core/src/datasource/empty.rs @@ -28,7 +28,8 @@ use datafusion_common::project_schema; use crate::datasource::{TableProvider, TableType}; use crate::error::Result; use crate::logical_expr::Expr; -use crate::physical_plan::{empty::EmptyExec, ExecutionPlan}; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::ExecutionPlan; /// An empty plan that is useful for testing and generating plans /// without mapping them to actual data. 
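The `EmptyTable` provider above now names `EmptyExec` via `datafusion_physical_plan` directly rather than the `crate::physical_plan` re-export. A minimal sketch of building the same leaf plan with that import; the one-column schema and the `empty_plan` helper are illustrative, not part of this change:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_physical_plan::empty::EmptyExec;
use datafusion_physical_plan::ExecutionPlan;

/// Illustrative helper: a leaf plan that yields no batches for this schema,
/// as used for planning/testing without real data.
fn empty_plan() -> Arc<dyn ExecutionPlan> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    Arc::new(EmptyExec::new(schema))
}
```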
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3be8af59ea2a..819da155a1a2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -37,7 +37,8 @@ use datafusion_common::{config_err, DataFusionError, Result}; use datafusion_expr::dml::InsertOp; use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; -use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::{ExecutionPlan, Statistics}; use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; use datafusion_common::{ diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 94c6e45804e8..b8bec410070c 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -38,11 +38,11 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_catalog::Session; use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; +pub use datafusion_datasource::memory::MemorySourceConfig; +pub use datafusion_datasource::source::DataSourceExec; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; use datafusion_expr::SortExpr; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use async_trait::async_trait; use futures::StreamExt; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 12dd9d7cab38..96687913fb42 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -36,6 +36,8 @@ mod statistics; pub mod stream; pub mod view; +pub use datafusion_datasource::source; + // backwards compatibility pub use self::default_table_source::{ provider_as_source, source_as_provider, DefaultTableSource, diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 4a7cdc192cd3..c6e05893a979 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -32,12 +32,12 @@ use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index b0a1d8c8c9e2..1a88dc31a64d 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -29,12 +29,12 @@ use crate::error::Result; use arrow::datatypes::SchemaRef; use datafusion_common::{Constraints, Statistics}; +use 
datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index c0952229b5e0..412c90726af0 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -37,13 +37,13 @@ use arrow::csv; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use datafusion_physical_plan::projection::ProjectionExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, PlanProperties}; use futures::{StreamExt, TryStreamExt}; @@ -409,7 +409,7 @@ impl ExecutionPlan for CsvExec { /// # }; /// # use datafusion::datasource::physical_plan::CsvSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # let object_store_url = ObjectStoreUrl::local_filesystem(); /// # let file_schema = Arc::new(Schema::empty()); diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 123ecc2f9582..4996b6d97b58 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -36,13 +36,13 @@ use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, Partitioning} use crate::datasource::data_source::FileSource; pub use datafusion_datasource::file_scan_config::*; +use datafusion_datasource::source::{DataSource, DataSourceExec}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_plan::display::{display_orderings, ProjectSchemaDisplay}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::{ all_alias_free_columns, new_projections_for_columns, ProjectionExec, }; -use datafusion_physical_plan::source::{DataSource, DataSourceExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; /// Convert type to a type suitable for use as a [`ListingTable`] diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 590b1cb88dcd..249f50efa544 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -35,12 +35,12 @@ use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use 
arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; use datafusion_common::{Constraints, Statistics}; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, PlanProperties}; use futures::{StreamExt, TryStreamExt}; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 4bd43cd1aaca..2a2d6d7fefdf 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -50,10 +50,10 @@ pub use access_plan::{ParquetAccessPlan, RowGroupAccess}; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::Constraints; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; use datafusion_physical_optimizer::pruning::PruningPredicate; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; -use datafusion_physical_plan::source::DataSourceExec; pub use metrics::ParquetFileMetrics; pub use page_filter::PagePruningAccessPlanFilter; pub use reader::{DefaultParquetFileReaderFactory, ParquetFileReaderFactory}; @@ -579,10 +579,10 @@ mod tests { use arrow::record_batch::RecordBatch; use bytes::{BufMut, BytesMut}; use datafusion_common::{assert_contains, ScalarValue}; + use datafusion_datasource::source::DataSourceExec; use datafusion_expr::{col, lit, when, Expr}; use datafusion_physical_expr::planner::logical2physical; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; - use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use crate::datasource::physical_plan::parquet::source::ParquetSource; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/source.rs b/datafusion/core/src/datasource/physical_plan/parquet/source.rs index 21881112075d..810a16de41af 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/source.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/source.rs @@ -81,7 +81,7 @@ use object_store::ObjectStore; /// # use datafusion::datasource::listing::PartitionedFile; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_physical_expr::expressions::lit; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// # use datafusion_common::config::TableParquetOptions; /// /// # let file_schema = Arc::new(Schema::empty()); @@ -160,7 +160,7 @@ use object_store::ObjectStore; /// # use arrow::datatypes::Schema; /// # use datafusion::datasource::physical_plan::FileScanConfig; /// # use datafusion::datasource::listing::PartitionedFile; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # fn parquet_exec() -> DataSourceExec { unimplemented!() } /// // Split a single DataSourceExec into multiple DataSourceExecs, one for each file @@ -202,7 +202,7 
@@ use object_store::ObjectStore; /// # use datafusion::datasource::physical_plan::FileScanConfig; /// # use datafusion::datasource::physical_plan::parquet::source::ParquetSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; -/// # use datafusion_physical_plan::source::DataSourceExec; +/// # use datafusion::datasource::source::DataSourceExec; /// /// # fn schema() -> SchemaRef { /// # Arc::new(Schema::empty()) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index bce1aab16e5e..f1d29d8c0776 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -38,7 +38,6 @@ use crate::logical_expr::{ use crate::physical_expr::{create_physical_expr, create_physical_exprs}; use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use crate::physical_plan::analyze::AnalyzeExec; -use crate::physical_plan::empty::EmptyExec; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::filter::FilterExec; @@ -48,7 +47,6 @@ use crate::physical_plan::joins::{ }; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; -use crate::physical_plan::recursive_query::RecursiveQueryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::union::UnionExec; @@ -58,6 +56,8 @@ use crate::physical_plan::{ displayable, windows, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, Partitioning, PhysicalExpr, WindowExpr, }; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::recursive_query::RecursiveQueryExec; use arrow::array::{builder::StringBuilder, RecordBatch}; use arrow::compute::SortOptions; @@ -68,6 +68,7 @@ use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, }; +use datafusion_datasource::memory::MemorySourceConfig; use datafusion_expr::dml::{CopyTo, InsertOp}; use datafusion_expr::expr::{ physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, @@ -84,7 +85,6 @@ use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::LexOrdering; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::execution_plan::InvariantLevel; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::unnest::ListUnnest; diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index ba85f9afb6da..c569113a27bd 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -42,7 +42,7 @@ use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::DataFusionError; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion_datasource::source::DataSourceExec; #[cfg(feature = "compression")] use bzip2::write::BzEncoder; diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 0e0090ef028e..fc98b43051f8 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,7 +37,7 @@ use crate::physical_plan::metrics::MetricsSet; use 
crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 5e1f263b4c76..1025a49ea1e3 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -30,6 +30,8 @@ use arrow::datatypes::{ }; use arrow::util::pretty::pretty_format_batches; use datafusion::common::Result; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::MemTable; use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan::aggregates::{ @@ -43,8 +45,6 @@ use datafusion_functions_aggregate::sum::sum_udaf; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::InputOrderMode; use test_utils::{add_empty_batches, StringBatchGenerator}; diff --git a/datafusion/core/tests/fuzz_cases/join_fuzz.rs b/datafusion/core/tests/fuzz_cases/join_fuzz.rs index 5dd29f90ef83..da93dd5edf29 100644 --- a/datafusion/core/tests/fuzz_cases/join_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/join_fuzz.rs @@ -26,6 +26,8 @@ use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion::common::JoinSide; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::logical_expr::{JoinType, Operator}; use datafusion::physical_expr::expressions::BinaryExpr; use datafusion::physical_plan::collect; @@ -38,8 +40,6 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::ScalarValue; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::PhysicalExprRef; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use itertools::Itertools; use rand::Rng; diff --git a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs index 35fca789ddcb..92f375525066 100644 --- a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs @@ -24,10 +24,10 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::physical_plan::{ collect, expressions::{col, PhysicalSortExpr}, - memory::MemorySourceConfig, sorts::sort_preserving_merge::SortPreservingMergeExec, }; use datafusion::prelude::{SessionConfig, SessionContext}; diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 51a5bc87efd9..0b0f0aa2f105 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -24,6 +24,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use 
datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::sorts::sort::SortExec; @@ -33,7 +34,6 @@ use datafusion_common::cast::as_int32_array; use datafusion_execution::memory_pool::GreedyMemoryPool; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; use rand::Rng; use test_utils::{batches_to_vec, partitions_to_sorted_vec}; diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index d23408743f9f..06b93d41af36 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -44,9 +44,9 @@ mod sp_repartition_fuzz_tests { }; use test_utils::add_empty_batches; + use datafusion::datasource::memory::MemorySourceConfig; + use datafusion::datasource::source::DataSourceExec; use datafusion_physical_expr_common::sort_expr::LexOrdering; - use datafusion_physical_plan::memory::MemorySourceConfig; - use datafusion_physical_plan::source::DataSourceExec; use itertools::izip; use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 4a484221a88a..a7f9e38c9ae3 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -22,6 +22,8 @@ use arrow::compute::{concat_batches, SortOptions}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::functions_window::row_number::row_number_udwf; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::windows::{ @@ -48,8 +50,6 @@ use datafusion_functions_window::rank::{dense_rank_udwf, rank_udwf}; use datafusion_physical_expr::expressions::{cast, col, lit}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use rand::distributions::Alphanumeric; use rand::rngs::StdRng; diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 669294d38af1..a1985a1aa447 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -27,6 +27,8 @@ use arrow::array::{ArrayRef, DictionaryArray, RecordBatch}; use arrow::compute::SortOptions; use arrow::datatypes::{Int32Type, SchemaRef}; use datafusion::assert_batches_eq; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::{MemTable, TableProvider}; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; @@ -46,8 +48,6 @@ use datafusion_expr::{Expr, TableType}; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_optimizer::join_selection::JoinSelection; use datafusion_physical_optimizer::PhysicalOptimizerRule; -use datafusion_physical_plan::memory::MemorySourceConfig; -use datafusion_physical_plan::source::DataSourceExec; use 
datafusion_physical_plan::spill::get_record_batch_memory_size; use test_utils::AccessLogGenerator; diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 82024a731ed3..ad75cf2607c4 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -22,6 +22,7 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; +use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; @@ -34,7 +35,6 @@ use datafusion_execution::cache::cache_unit::{ use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_expr::{col, lit, Expr}; -use datafusion_physical_plan::source::DataSourceExec; use datafusion::datasource::physical_plan::FileScanConfig; use tempfile::tempdir; diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index ea86bf3685bb..fe96a2eb5e71 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -25,6 +25,7 @@ use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::execution::context::SessionState; use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; @@ -33,7 +34,6 @@ use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{col, lit, Expr}; use datafusion_physical_expr::create_physical_expr; -use datafusion_physical_plan::source::DataSourceExec; use futures::StreamExt; use object_store::path::Path; diff --git a/datafusion/core/tests/parquet/utils.rs b/datafusion/core/tests/parquet/utils.rs index dd5541461ff6..8cb50b22cf63 100644 --- a/datafusion/core/tests/parquet/utils.rs +++ b/datafusion/core/tests/parquet/utils.rs @@ -18,8 +18,8 @@ //! 
Utilities for parquet tests use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_physical_plan::metrics::MetricsSet; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::{accept, ExecutionPlan, ExecutionPlanVisitor}; /// Find the metrics from the first DataSourceExec encountered in the plan diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs index 1757c7150bfe..a79d743cb253 100644 --- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs @@ -22,6 +22,8 @@ use crate::physical_optimizer::test_utils::TestAggregate; use arrow::array::Int32Array; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::cast::as_int64_array; use datafusion_common::config::ConfigOptions; use datafusion_common::Result; @@ -36,9 +38,7 @@ use datafusion_physical_plan::aggregates::PhysicalGroupBy; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::common; use datafusion_physical_plan::filter::FilterExec; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::projection::ProjectionExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::ExecutionPlan; /// Mock data using a MemorySourceConfig which has an exact count statistic diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 50c67f09c704..66d1380e09c3 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -31,6 +31,7 @@ use datafusion::datasource::file_format::file_compression_type::FileCompressionT use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; @@ -57,7 +58,6 @@ use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::projection::ProjectionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_physical_plan::PlanProperties; diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 89bd97881e3a..dfba57a584ea 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -21,7 +21,9 @@ use std::sync::Arc; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::datasource::listing::PartitionedFile; +use 
datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::Result; use datafusion_common::{JoinSide, JoinType, ScalarValue}; @@ -46,12 +48,10 @@ use datafusion_physical_plan::joins::{ HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::projection::{update_expr, ProjectionExec}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::streaming::PartitionStream; use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs index d9b569dfa611..58eb866c590c 100644 --- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs @@ -32,13 +32,13 @@ use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::collect; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; -use datafusion_physical_plan::memory::MemorySourceConfig; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::{ displayable, get_plan_string, ExecutionPlan, Partitioning, }; -use datafusion_physical_plan::source::DataSourceExec; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::tree_node::{TransformedResult, TreeNode}; use datafusion_common::Result; use datafusion_expr::{JoinType, Operator}; diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 162f93facc90..e4d72c112c38 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -26,7 +26,9 @@ use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; @@ -52,11 +54,9 @@ use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::utils::{JoinFilter, JoinOn}; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec}; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use datafusion_physical_plan::memory::MemorySourceConfig; use datafusion_physical_plan::repartition::RepartitionExec; use 
datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::source::DataSourceExec; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::tree_node::PlanContext; use datafusion_physical_plan::union::UnionExec; diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs index 6345f5e4352f..1a19bfe9e86f 100644 --- a/datafusion/core/tests/sql/path_partition.rs +++ b/datafusion/core/tests/sql/path_partition.rs @@ -26,6 +26,7 @@ use std::sync::Arc; use arrow::datatypes::DataType; use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; use datafusion::{ assert_batches_sorted_eq, datasource::{ @@ -43,7 +44,6 @@ use datafusion_common::ScalarValue; use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit, Expr, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; -use datafusion_physical_plan::source::DataSourceExec; use async_trait::async_trait; use bytes::Bytes; diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index 0ec9a5fd7620..618f0590ab3d 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -26,6 +26,7 @@ use arrow::csv::ReaderBuilder; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::TaskContext; @@ -35,7 +36,6 @@ use datafusion_catalog::Session; use datafusion_catalog::TableFunctionImpl; use datafusion_common::{assert_batches_eq, DFSchema, ScalarValue}; use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, Projection, TableType}; -use datafusion_physical_plan::memory::MemorySourceConfig; use async_trait::async_trait; diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index caf1c60a785d..521c3f59e525 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -49,6 +49,8 @@ datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-physical-expr = { workspace = true } +datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } flate2 = { version = "1.0.24", optional = true } futures = { workspace = true } diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs new file mode 100644 index 000000000000..efb178ad078e --- /dev/null +++ b/datafusion/datasource/src/memory.rs @@ -0,0 +1,926 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Execution plan for reading in-memory batches of data
+
+use std::any::Any;
+use std::fmt;
+use std::sync::Arc;
+
+use crate::source::{DataSource, DataSourceExec};
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion_physical_plan::memory::MemoryStream;
+use datafusion_physical_plan::projection::{
+    all_alias_free_columns, new_projections_for_columns, ProjectionExec,
+};
+use datafusion_physical_plan::{
+    common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
+    PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics,
+};
+
+use arrow::array::{RecordBatch, RecordBatchOptions};
+use arrow::datatypes::{Schema, SchemaRef};
+use datafusion_common::{
+    internal_err, plan_err, project_schema, Constraints, Result, ScalarValue,
+};
+use datafusion_execution::TaskContext;
+use datafusion_physical_expr::equivalence::ProjectionMapping;
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr::utils::collect_columns;
+use datafusion_physical_expr::{EquivalenceProperties, LexOrdering};
+
+/// Execution plan for reading in-memory batches of data
+#[derive(Clone)]
+#[deprecated(
+    since = "46.0.0",
+    note = "use MemorySourceConfig and DataSourceExec instead"
+)]
+pub struct MemoryExec {
+    inner: DataSourceExec,
+    /// The partitions to query
+    partitions: Vec<Vec<RecordBatch>>,
+    /// Optional projection
+    projection: Option<Vec<usize>>,
+    // Sort information: one or more equivalent orderings
+    sort_information: Vec<LexOrdering>,
+    /// If partition sizes should be displayed
+    show_sizes: bool,
+}
+
+#[allow(unused, deprecated)]
+impl fmt::Debug for MemoryExec {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.inner.fmt_as(DisplayFormatType::Default, f)
+    }
+}
+
+#[allow(unused, deprecated)]
+impl DisplayAs for MemoryExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        self.inner.fmt_as(t, f)
+    }
+}
+
+#[allow(unused, deprecated)]
+impl ExecutionPlan for MemoryExec {
+    fn name(&self) -> &'static str {
+        "MemoryExec"
+    }
+
+    /// Return a reference to Any that can be used for downcasting
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &PlanProperties {
+        self.inner.properties()
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        // This is a leaf node and has no children
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // MemoryExec has no children
+        if children.is_empty() {
+            Ok(self)
+        } else {
+            internal_err!("Children cannot be replaced in {self:?}")
+        }
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        self.inner.execute(partition, context)
+    }
+
+    /// We recompute the statistics dynamically from the arrow metadata as it is pretty cheap to do so
+    fn statistics(&self) -> Result<Statistics> {
+        self.inner.statistics()
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        self.inner.try_swapping_with_projection(projection)
+    }
+}
+
+#[allow(unused, deprecated)]
+impl MemoryExec {
+    /// Create a new execution plan for reading in-memory record batches.
+    /// The provided `schema` should not have the projection applied.
+    pub fn try_new(
+        partitions: &[Vec<RecordBatch>],
+        schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        let source = MemorySourceConfig::try_new(partitions, schema, projection.clone())?;
+        let data_source = DataSourceExec::new(Arc::new(source));
+        Ok(Self {
+            inner: data_source,
+            partitions: partitions.to_vec(),
+            projection,
+            sort_information: vec![],
+            show_sizes: true,
+        })
+    }
+
+    /// Create a new execution plan from a list of constant values (`ValuesExec`)
+    pub fn try_new_as_values(
+        schema: SchemaRef,
+        data: Vec<Vec<Arc<dyn PhysicalExpr>>>,
+    ) -> Result<Self> {
+        if data.is_empty() {
+            return plan_err!("Values list cannot be empty");
+        }
+
+        let n_row = data.len();
+        let n_col = schema.fields().len();
+
+        // We have this single row batch as a placeholder to satisfy evaluation argument
+        // and generate a single output row
+        let placeholder_schema = Arc::new(Schema::empty());
+        let placeholder_batch = RecordBatch::try_new_with_options(
+            Arc::clone(&placeholder_schema),
+            vec![],
+            &RecordBatchOptions::new().with_row_count(Some(1)),
+        )?;
+
+        // Evaluate each column
+        let arrays = (0..n_col)
+            .map(|j| {
+                (0..n_row)
+                    .map(|i| {
+                        let expr = &data[i][j];
+                        let result = expr.evaluate(&placeholder_batch)?;
+
+                        match result {
+                            ColumnarValue::Scalar(scalar) => Ok(scalar),
+                            ColumnarValue::Array(array) if array.len() == 1 => {
+                                ScalarValue::try_from_array(&array, 0)
+                            }
+                            ColumnarValue::Array(_) => {
+                                plan_err!("Cannot have array values in a values list")
+                            }
+                        }
+                    })
+                    .collect::<Result<Vec<_>>>()
+                    .and_then(ScalarValue::iter_to_array)
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let batch = RecordBatch::try_new_with_options(
+            Arc::clone(&schema),
+            arrays,
+            &RecordBatchOptions::new().with_row_count(Some(n_row)),
+        )?;
+
+        let partitions = vec![batch];
+        Self::try_new_from_batches(Arc::clone(&schema), partitions)
+    }
+
+    /// Create a new plan using the provided schema and batches.
+    ///
+    /// Errors if any of the batches don't match the provided schema, or if no
+    /// batches are provided.
+    pub fn try_new_from_batches(
+        schema: SchemaRef,
+        batches: Vec<RecordBatch>,
+    ) -> Result<Self> {
+        if batches.is_empty() {
+            return plan_err!("Values list cannot be empty");
+        }
+
+        for batch in &batches {
+            let batch_schema = batch.schema();
+            if batch_schema != schema {
+                return plan_err!(
+                    "Batch has invalid schema. Expected: {}, got: {}",
+                    schema,
+                    batch_schema
+                );
+            }
+        }
+
+        let partitions = vec![batches];
+        let source = MemorySourceConfig {
+            partitions: partitions.clone(),
+            schema: Arc::clone(&schema),
+            projected_schema: Arc::clone(&schema),
+            projection: None,
+            sort_information: vec![],
+            show_sizes: true,
+            fetch: None,
+        };
+        let data_source = DataSourceExec::new(Arc::new(source));
+        Ok(Self {
+            inner: data_source,
+            partitions,
+            projection: None,
+            sort_information: vec![],
+            show_sizes: true,
+        })
+    }
+
+    fn memory_source_config(&self) -> MemorySourceConfig {
+        self.inner
+            .source()
+            .as_any()
+            .downcast_ref::<MemorySourceConfig>()
+            .unwrap()
+            .clone()
+    }
+
+    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+        self.inner = self.inner.with_constraints(constraints);
+        self
+    }
+
+    /// Set `show_sizes` to determine whether to display partition sizes
+    pub fn with_show_sizes(mut self, show_sizes: bool) -> Self {
+        let mut memory_source = self.memory_source_config();
+        memory_source.show_sizes = show_sizes;
+        self.show_sizes = show_sizes;
+        self.inner = DataSourceExec::new(Arc::new(memory_source));
+        self
+    }
+
+    /// Ref to constraints
+    pub fn constraints(&self) -> &Constraints {
+        self.properties().equivalence_properties().constraints()
+    }
+
+    /// Ref to partitions
+    pub fn partitions(&self) -> &[Vec<RecordBatch>] {
+        &self.partitions
+    }
+
+    /// Ref to projection
+    pub fn projection(&self) -> &Option<Vec<usize>> {
+        &self.projection
+    }
+
+    /// Show sizes
+    pub fn show_sizes(&self) -> bool {
+        self.show_sizes
+    }
+
+    /// Ref to sort information
+    pub fn sort_information(&self) -> &[LexOrdering] {
+        &self.sort_information
+    }
+
+    /// A memory table can be ordered by multiple expressions simultaneously.
+    /// [`EquivalenceProperties`] keeps track of expressions that describe the
+    /// global ordering of the schema. These columns are not necessarily the same; e.g.
+    /// ```text
+    /// ┌---┬---┐
+    /// | a | b |
+    /// |---|---|
+    /// | 1 | 9 |
+    /// | 2 | 8 |
+    /// | 3 | 7 |
+    /// | 5 | 5 |
+    /// └---┴---┘
+    /// ```
+    /// where both `a ASC` and `b DESC` can describe the table ordering. With
+    /// [`EquivalenceProperties`], we can keep track of these equivalences
+    /// and treat `a ASC` and `b DESC` as the same ordering requirement.
+    ///
+    /// Note that if there is an internal projection, that projection will be
+    /// also applied to the given `sort_information`.
+    pub fn try_with_sort_information(
+        mut self,
+        sort_information: Vec<LexOrdering>,
+    ) -> Result<Self> {
+        self.sort_information = sort_information.clone();
+        let mut memory_source = self.memory_source_config();
+        memory_source = memory_source.try_with_sort_information(sort_information)?;
+        self.inner = DataSourceExec::new(Arc::new(memory_source));
+        Ok(self)
+    }
+
+    /// Arc clone of ref to original schema
+    pub fn original_schema(&self) -> SchemaRef {
+        Arc::clone(&self.inner.schema())
+    }
+
+    /// This function creates the cache object that stores the plan properties
+    /// such as schema, equivalence properties, ordering, partitioning, etc.
+    fn compute_properties(
+        schema: SchemaRef,
+        orderings: &[LexOrdering],
+        constraints: Constraints,
+        partitions: &[Vec<RecordBatch>],
+    ) -> PlanProperties {
+        PlanProperties::new(
+            EquivalenceProperties::new_with_orderings(schema, orderings)
+                .with_constraints(constraints),
+            Partitioning::UnknownPartitioning(partitions.len()),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        )
+    }
+}
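
The wrapper above exists only for backward compatibility: every `ExecutionPlan` method forwards to the inner `DataSourceExec`. As a minimal migration sketch, assuming only the `try_new` and `DataSourceExec::new` signatures shown in this diff (`partitions: Vec<Vec<RecordBatch>>` and `schema: SchemaRef` stand in for the caller's own data):

    // Before (deprecated since 46.0.0):
    //   let plan = MemoryExec::try_new(&partitions, schema, None)?;
    // After: build the source config, then wrap it in the generic exec node.
    let source = MemorySourceConfig::try_new(&partitions, Arc::clone(&schema), None)?;
    let plan: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
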
+
+/// Data source configuration for reading in-memory batches of data
+#[derive(Clone)]
+pub struct MemorySourceConfig {
+    /// The partitions to query
+    partitions: Vec<Vec<RecordBatch>>,
+    /// Schema representing the data before projection
+    schema: SchemaRef,
+    /// Schema representing the data after the optional projection is applied
+    projected_schema: SchemaRef,
+    /// Optional projection
+    projection: Option<Vec<usize>>,
+    /// Sort information: one or more equivalent orderings
+    sort_information: Vec<LexOrdering>,
+    /// If partition sizes should be displayed
+    show_sizes: bool,
+    /// The maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
+    fetch: Option<usize>,
+}
+
+impl DataSource for MemorySourceConfig {
+    fn open(
+        &self,
+        partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        Ok(Box::pin(
+            MemoryStream::try_new(
+                self.partitions[partition].clone(),
+                Arc::clone(&self.projected_schema),
+                self.projection.clone(),
+            )?
+            .with_fetch(self.fetch),
+        ))
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                let partition_sizes: Vec<_> =
+                    self.partitions.iter().map(|b| b.len()).collect();
+
+                let output_ordering = self
+                    .sort_information
+                    .first()
+                    .map(|output_ordering| {
+                        format!(", output_ordering={}", output_ordering)
+                    })
+                    .unwrap_or_default();
+
+                let eq_properties = self.eq_properties();
+                let constraints = eq_properties.constraints();
+                let constraints = if constraints.is_empty() {
+                    String::new()
+                } else {
+                    format!(", {}", constraints)
+                };
+
+                let limit = self
+                    .fetch
+                    .map_or(String::new(), |limit| format!(", fetch={}", limit));
+                if self.show_sizes {
+                    write!(
+                        f,
+                        "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
+                } else {
+                    write!(
+                        f,
+                        "partitions={}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
+                }
+            }
+        }
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        Partitioning::UnknownPartitioning(self.partitions.len())
+    }
+
+    fn eq_properties(&self) -> EquivalenceProperties {
+        EquivalenceProperties::new_with_orderings(
+            Arc::clone(&self.projected_schema),
+            self.sort_information.as_slice(),
+        )
+    }
+
+    fn statistics(&self) -> Result<Statistics> {
+        Ok(common::compute_record_batch_statistics(
+            &self.partitions,
+            &self.schema,
+            self.projection.clone(),
+        ))
+    }
+
+    fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
+        let source = self.clone();
+        Some(Arc::new(source.with_limit(limit)))
+    }
+
+    fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        // If there is any non-column or alias-carrying expression, the
+        // projection should not be removed. This process could be moved into
+        // MemoryExec, but that would duplicate the two types' responsibilities.
+        all_alias_free_columns(projection.expr())
+            .then(|| {
+                let all_projections = (0..self.schema.fields().len()).collect();
+                let new_projections = new_projections_for_columns(
+                    projection,
+                    self.projection().as_ref().unwrap_or(&all_projections),
+                );
+
+                MemorySourceConfig::try_new_exec(
+                    self.partitions(),
+                    self.original_schema(),
+                    Some(new_projections),
+                )
+                .map(|e| e as _)
+            })
+            .transpose()
+    }
+}
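
The fetch plumbing above is how a `LIMIT` gets pushed into the scan itself: `DataSource::with_fetch` clones the source and routes through the `with_limit` builder defined just below. A rough sketch under those assumptions (`partitions` and `schema` are the caller's data; the `DataSource` trait must be in scope for `fetch()`):

    // Build the source, then cap it at 10 rows; `fetch()` reports the limit
    // back to the optimizer.
    let source = MemorySourceConfig::try_new(&partitions, Arc::clone(&schema), None)?
        .with_limit(Some(10));
    assert_eq!(source.fetch(), Some(10));
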
+
+impl MemorySourceConfig {
+    /// Create a new `MemorySourceConfig` for reading in-memory record batches.
+    /// The provided `schema` should not have the projection applied.
+    pub fn try_new(
+        partitions: &[Vec<RecordBatch>],
+        schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        let projected_schema = project_schema(&schema, projection.as_ref())?;
+        Ok(Self {
+            partitions: partitions.to_vec(),
+            schema,
+            projected_schema,
+            projection,
+            sort_information: vec![],
+            show_sizes: true,
+            fetch: None,
+        })
+    }
+
+    /// Create a new `DataSourceExec` plan for reading in-memory record batches.
+    /// The provided `schema` should not have the projection applied.
+    pub fn try_new_exec(
+        partitions: &[Vec<RecordBatch>],
+        schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Arc<DataSourceExec>> {
+        let source = Self::try_new(partitions, schema, projection)?;
+        Ok(Arc::new(DataSourceExec::new(Arc::new(source))))
+    }
+
+    /// Create a new execution plan from a list of constant values (`ValuesExec`)
+    pub fn try_new_as_values(
+        schema: SchemaRef,
+        data: Vec<Vec<Arc<dyn PhysicalExpr>>>,
+    ) -> Result<Arc<DataSourceExec>> {
+        if data.is_empty() {
+            return plan_err!("Values list cannot be empty");
+        }
+
+        let n_row = data.len();
+        let n_col = schema.fields().len();
+
+        // We have this single row batch as a placeholder to satisfy evaluation argument
+        // and generate a single output row
+        let placeholder_schema = Arc::new(Schema::empty());
+        let placeholder_batch = RecordBatch::try_new_with_options(
+            Arc::clone(&placeholder_schema),
+            vec![],
+            &RecordBatchOptions::new().with_row_count(Some(1)),
+        )?;
+
+        // Evaluate each column
+        let arrays = (0..n_col)
+            .map(|j| {
+                (0..n_row)
+                    .map(|i| {
+                        let expr = &data[i][j];
+                        let result = expr.evaluate(&placeholder_batch)?;
+
+                        match result {
+                            ColumnarValue::Scalar(scalar) => Ok(scalar),
+                            ColumnarValue::Array(array) if array.len() == 1 => {
+                                ScalarValue::try_from_array(&array, 0)
+                            }
+                            ColumnarValue::Array(_) => {
+                                plan_err!("Cannot have array values in a values list")
+                            }
+                        }
+                    })
+                    .collect::<Result<Vec<_>>>()
+                    .and_then(ScalarValue::iter_to_array)
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let batch = RecordBatch::try_new_with_options(
+            Arc::clone(&schema),
+            arrays,
+            &RecordBatchOptions::new().with_row_count(Some(n_row)),
+        )?;
+
+        let partitions = vec![batch];
+        Self::try_new_from_batches(Arc::clone(&schema), partitions)
+    }
+
+    /// Create a new plan using the provided schema and batches.
+    ///
+    /// Errors if any of the batches don't match the provided schema, or if no
+    /// batches are provided.
+    pub fn try_new_from_batches(
+        schema: SchemaRef,
+        batches: Vec<RecordBatch>,
+    ) -> Result<Arc<DataSourceExec>> {
+        if batches.is_empty() {
+            return plan_err!("Values list cannot be empty");
+        }
+
+        for batch in &batches {
+            let batch_schema = batch.schema();
+            if batch_schema != schema {
+                return plan_err!(
+                    "Batch has invalid schema. Expected: {}, got: {}",
+                    schema,
+                    batch_schema
+                );
+            }
+        }
+
+        let partitions = vec![batches];
+        let source = Self {
+            partitions,
+            schema: Arc::clone(&schema),
+            projected_schema: Arc::clone(&schema),
+            projection: None,
+            sort_information: vec![],
+            show_sizes: true,
+            fetch: None,
+        };
+        Ok(Arc::new(DataSourceExec::new(Arc::new(source))))
+    }
+
+    /// Set the maximum number of rows to read (`None` reads all rows)
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.fetch = limit;
+        self
+    }
+
+    /// Set `show_sizes` to determine whether to display partition sizes
+    pub fn with_show_sizes(mut self, show_sizes: bool) -> Self {
+        self.show_sizes = show_sizes;
+        self
+    }
+
+    /// Ref to partitions
+    pub fn partitions(&self) -> &[Vec<RecordBatch>] {
+        &self.partitions
+    }
+
+    /// Ref to projection
+    pub fn projection(&self) -> &Option<Vec<usize>> {
+        &self.projection
+    }
+
+    /// Show sizes
+    pub fn show_sizes(&self) -> bool {
+        self.show_sizes
+    }
+
+    /// Ref to sort information
+    pub fn sort_information(&self) -> &[LexOrdering] {
+        &self.sort_information
+    }
+
+    /// A memory table can be ordered by multiple expressions simultaneously.
+    /// [`EquivalenceProperties`] keeps track of expressions that describe the
+    /// global ordering of the schema. These columns are not necessarily the same; e.g.
+    /// ```text
+    /// ┌---┬---┐
+    /// | a | b |
+    /// |---|---|
+    /// | 1 | 9 |
+    /// | 2 | 8 |
+    /// | 3 | 7 |
+    /// | 5 | 5 |
+    /// └---┴---┘
+    /// ```
+    /// where both `a ASC` and `b DESC` can describe the table ordering. With
+    /// [`EquivalenceProperties`], we can keep track of these equivalences
+    /// and treat `a ASC` and `b DESC` as the same ordering requirement.
+    ///
+    /// Note that if there is an internal projection, that projection will be
+    /// also applied to the given `sort_information`.
+    pub fn try_with_sort_information(
+        mut self,
+        mut sort_information: Vec<LexOrdering>,
+    ) -> Result<Self> {
+        // All sort expressions must refer to the original schema
+        let fields = self.schema.fields();
+        let ambiguous_column = sort_information
+            .iter()
+            .flat_map(|ordering| ordering.clone())
+            .flat_map(|expr| collect_columns(&expr.expr))
+            .find(|col| {
+                fields
+                    .get(col.index())
+                    .map(|field| field.name() != col.name())
+                    .unwrap_or(true)
+            });
+        if let Some(col) = ambiguous_column {
+            return internal_err!(
+                "Column {:?} is not found in the original schema of the MemorySourceConfig",
+                col
+            );
+        }
+
+        // If there is a projection on the source, we also need to project orderings
+        if let Some(projection) = &self.projection {
+            let base_eqp = EquivalenceProperties::new_with_orderings(
+                self.original_schema(),
+                &sort_information,
+            );
+            let proj_exprs = projection
+                .iter()
+                .map(|idx| {
+                    let base_schema = self.original_schema();
+                    let name = base_schema.field(*idx).name();
+                    (Arc::new(Column::new(name, *idx)) as _, name.to_string())
+                })
+                .collect::<Vec<_>>();
+            let projection_mapping =
+                ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?;
+            sort_information = base_eqp
+                .project(&projection_mapping, Arc::clone(&self.projected_schema))
+                .into_oeq_class()
+                .into_inner();
+        }
+
+        self.sort_information = sort_information;
+        Ok(self)
+    }
+
+    /// Arc clone of ref to original schema
+    pub fn original_schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+#[cfg(test)]
+mod memory_source_tests {
+    use std::sync::Arc;
+
+    use crate::memory::MemorySourceConfig;
+    use crate::source::DataSourceExec;
+    use datafusion_physical_plan::ExecutionPlan;
+
+    use arrow::compute::SortOptions;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_physical_expr::expressions::col;
+    use datafusion_physical_expr::PhysicalSortExpr;
+    use datafusion_physical_expr_common::sort_expr::LexOrdering;
+
+    #[test]
+    fn test_memory_order_eq() -> datafusion_common::Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+        let sort1 = LexOrdering::new(vec![
+            PhysicalSortExpr {
+                expr: col("a", &schema)?,
+                options: SortOptions::default(),
+            },
+            PhysicalSortExpr {
+                expr: col("b", &schema)?,
+                options: SortOptions::default(),
+            },
+        ]);
+        let sort2 = LexOrdering::new(vec![PhysicalSortExpr {
+            expr: col("c", &schema)?,
+            options: SortOptions::default(),
+        }]);
+        let mut expected_output_order = LexOrdering::default();
+        expected_output_order.extend(sort1.clone());
+        expected_output_order.extend(sort2.clone());
+
+        let sort_information = vec![sort1.clone(), sort2.clone()];
+        let mem_exec = Arc::new(DataSourceExec::new(Arc::new(
+            MemorySourceConfig::try_new(&[vec![]], schema, None)?
+                .try_with_sort_information(sort_information)?,
+        )));
+
+        assert_eq!(
+            mem_exec.properties().output_ordering().unwrap(),
+            &expected_output_order
+        );
+        let eq_properties = mem_exec.properties().equivalence_properties();
+        assert!(eq_properties.oeq_class().contains(&sort1));
+        assert!(eq_properties.oeq_class().contains(&sort2));
+        Ok(())
+    }
+}
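
The test above drives the full equivalence machinery end to end; the core pattern is small. A minimal sketch of declaring a known ordering on an in-memory source, reusing this test's imports (`schema` is assumed to contain an Int64 column "a"):

    // Declare that the in-memory data is already sorted by "a" ascending, so
    // downstream optimizers can elide redundant sorts.
    let ordering = LexOrdering::new(vec![PhysicalSortExpr {
        expr: col("a", &schema)?,
        options: SortOptions::default(),
    }]);
    let source = MemorySourceConfig::try_new(&[vec![]], schema, None)?
        .try_with_sort_information(vec![ordering])?;
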
Field::new("c9", DataType::UInt32, false), + Field::new("c10", DataType::UInt64, false), + Field::new("c11", DataType::Float32, false), + Field::new("c12", DataType::Float64, false), + Field::new("c13", DataType::Utf8, false), + ]); + + Arc::new(schema) + } + + #[tokio::test] + async fn values_empty_case() -> Result<()> { + let schema = aggr_test_schema(); + let empty = MemorySourceConfig::try_new_as_values(schema, vec![]); + assert!(empty.is_err()); + Ok(()) + } + + #[test] + fn new_exec_with_batches() { + let batch = make_partition(7); + let schema = batch.schema(); + let batches = vec![batch.clone(), batch]; + let _exec = MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); + } + + #[test] + fn new_exec_with_batches_empty() { + let batch = make_partition(7); + let schema = batch.schema(); + let _ = MemorySourceConfig::try_new_from_batches(schema, Vec::new()).unwrap_err(); + } + + #[test] + fn new_exec_with_batches_invalid_schema() { + let batch = make_partition(7); + let batches = vec![batch.clone(), batch]; + + let invalid_schema = Arc::new(Schema::new(vec![ + Field::new("col0", DataType::UInt32, false), + Field::new("col1", DataType::Utf8, false), + ])); + let _ = MemorySourceConfig::try_new_from_batches(invalid_schema, batches) + .unwrap_err(); + } + + // Test issue: https://github.com/apache/datafusion/issues/8763 + #[test] + fn new_exec_with_non_nullable_schema() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::UInt32, + false, + )])); + let _ = MemorySourceConfig::try_new_as_values( + Arc::clone(&schema), + vec![vec![lit(1u32)]], + ) + .unwrap(); + // Test that a null value is rejected + let _ = MemorySourceConfig::try_new_as_values( + schema, + vec![vec![lit(ScalarValue::UInt32(None))]], + ) + .unwrap_err(); + } + + #[test] + fn values_stats_with_nulls_only() -> Result<()> { + let data = vec![ + vec![lit(ScalarValue::Null)], + vec![lit(ScalarValue::Null)], + vec![lit(ScalarValue::Null)], + ]; + let rows = data.len(); + let values = MemorySourceConfig::try_new_as_values( + Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])), + data, + )?; + + assert_eq!( + values.statistics()?, + Statistics { + num_rows: Precision::Exact(rows), + total_byte_size: Precision::Exact(8), // not important + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(rows), // there are only nulls + distinct_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + },], + } + ); + + Ok(()) + } +} diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index c735c3108b3d..7b3aa8aa78a8 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -24,6 +24,8 @@ pub mod file_meta; pub mod file_scan_config; pub mod file_sink_config; pub mod file_stream; +pub mod memory; +pub mod source; pub mod url; pub mod write; use chrono::TimeZone; diff --git a/datafusion/physical-plan/src/source.rs b/datafusion/datasource/src/source.rs similarity index 95% rename from datafusion/physical-plan/src/source.rs rename to datafusion/datasource/src/source.rs index 0c1dfddd2678..1b0d76902972 100644 --- a/datafusion/physical-plan/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -20,10 +20,12 @@ use std::fmt; use std::fmt::{Debug, Formatter}; use std::sync::Arc; -use crate::execution_plan::{Boundedness, EmissionType}; -use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use crate::projection::ProjectionExec; -use 
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
index c735c3108b3d..7b3aa8aa78a8 100644
--- a/datafusion/datasource/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -24,6 +24,8 @@ pub mod file_meta;
 pub mod file_scan_config;
 pub mod file_sink_config;
 pub mod file_stream;
+pub mod memory;
+pub mod source;
 pub mod url;
 pub mod write;
 use chrono::TimeZone;
diff --git a/datafusion/physical-plan/src/source.rs b/datafusion/datasource/src/source.rs
similarity index 95%
rename from datafusion/physical-plan/src/source.rs
rename to datafusion/datasource/src/source.rs
index 0c1dfddd2678..1b0d76902972 100644
--- a/datafusion/physical-plan/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -20,10 +20,12 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
-use crate::execution_plan::{Boundedness, EmissionType};
-use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
-use crate::projection::ProjectionExec;
-use crate::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+};
 
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::{Constraints, Statistics};
diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml
index c9c86e9c8d5c..ee9cdf052093 100644
--- a/datafusion/physical-optimizer/Cargo.toml
+++ b/datafusion/physical-optimizer/Cargo.toml
@@ -37,6 +37,7 @@ recursive_protection = ["dep:recursive"]
 [dependencies]
 arrow = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
+datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true, default-features = true }
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index f0afdaa2de3d..4ff46545b8ce 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -75,10 +75,6 @@ tokio = { workspace = true, features = [
     "parking_lot",
 ] }
 
-[[bench]]
-harness = false
-name = "spm"
-
 [[bench]]
 harness = false
 name = "partial_ordering"
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 85b41da85742..0947a2ff5539 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -1347,10 +1347,10 @@ mod tests {
     use crate::common::collect;
     use crate::execution_plan::Boundedness;
     use crate::expressions::col;
-    use crate::memory::MemorySourceConfig;
     use crate::metrics::MetricValue;
     use crate::test::assert_is_pending;
     use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
+    use crate::test::TestMemoryExec;
     use crate::RecordBatchStream;
 
     use arrow::array::{
@@ -2207,7 +2207,7 @@ mod tests {
             vec![test_last_value_agg_expr(&schema, sort_options)?]
}; - let memory_exec = MemorySourceConfig::try_new_exec( + let memory_exec = TestMemoryExec::try_new_exec( &[ vec![partition1], vec![partition2], @@ -2442,11 +2442,8 @@ mod tests { }) .collect(); - let input = MemorySourceConfig::try_new_exec( - &[input_batches], - Arc::clone(&schema), - None, - )?; + let input = + TestMemoryExec::try_new_exec(&[input_batches], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Single, @@ -2557,7 +2554,7 @@ mod tests { .build() .map(Arc::new)?]; - let input = MemorySourceConfig::try_new_exec( + let input = TestMemoryExec::try_new_exec( &[vec![batch.clone()]], Arc::::clone(&batch.schema()), None, @@ -2627,7 +2624,7 @@ mod tests { ]; let input = - MemorySourceConfig::try_new_exec(&[input_data], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[input_data], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Partial, group_by, @@ -2714,7 +2711,7 @@ mod tests { ]; let input = - MemorySourceConfig::try_new_exec(&[input_data], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[input_data], Arc::clone(&schema), None)?; let aggregate_exec = Arc::new(AggregateExec::try_new( AggregateMode::Partial, group_by, @@ -2829,7 +2826,7 @@ mod tests { create_record_batch(&schema, (vec![2, 3, 4, 4], vec![1.0, 2.0, 3.0, 4.0]))?, ]; let plan: Arc = - MemorySourceConfig::try_new_exec(&[batches], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; let grouping_set = PhysicalGroupBy::new( vec![(col("a", &schema)?, "a".to_string())], diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 5168c3cc101f..c4e738cb3ad1 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -20,10 +20,10 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::{ execution_plan::{Boundedness, EmissionType}, - memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning, }; diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 5866f0938e41..a66873bc6576 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -624,7 +624,6 @@ mod tests { use crate::expressions::*; use crate::test; use crate::test::exec::StatisticsExec; - use arrow::datatypes::{Field, Schema, UnionFields, UnionMode}; use datafusion_common::ScalarValue; diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 2983478ada74..23ffd2a28d3c 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -1638,7 +1638,7 @@ impl EmbeddedProjection for HashJoinExec { #[cfg(test)] mod tests { use super::*; - use crate::memory::MemorySourceConfig; + use crate::test::TestMemoryExec; use crate::{ common, expressions::Column, repartition::RepartitionExec, test::build_table_i32, test::exec::MockExec, @@ -1680,7 +1680,7 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn join( @@ -2083,7 +2083,7 @@ mod tests { 
build_table_i32(("a1", &vec![2]), ("b2", &vec![2]), ("c1", &vec![9])); let schema = batch1.schema(); let left = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let right = build_table( @@ -2155,7 +2155,7 @@ mod tests { let schema = batch1.schema(); let left = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let right = build_table( ("a2", &vec![20, 30, 10]), @@ -2209,7 +2209,7 @@ mod tests { build_table_i32(("a2", &vec![30]), ("b1", &vec![5]), ("c2", &vec![90])); let schema = batch1.schema(); let right = - MemorySourceConfig::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) + TestMemoryExec::try_new_exec(&[vec![batch1], vec![batch2]], schema, None) .unwrap(); let on = vec![( @@ -2288,8 +2288,7 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch.clone(), batch]], schema, None) - .unwrap() + TestMemoryExec::try_new_exec(&[vec![batch.clone(), batch]], schema, None).unwrap() } #[apply(batch_sizes)] @@ -2394,8 +2393,7 @@ mod tests { Arc::new(Column::new_with_schema("b1", &right.schema()).unwrap()) as _, )]; let schema = right.schema(); - let right = - MemorySourceConfig::try_new_exec(&[vec![right]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap(); let join = join(left, right, on, &JoinType::Left, false).unwrap(); let columns = columns(&join.schema()); @@ -2432,8 +2430,7 @@ mod tests { Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _, )]; let schema = right.schema(); - let right = - MemorySourceConfig::try_new_exec(&[vec![right]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap(); let join = join(left, right, on, &JoinType::Full, false).unwrap(); let columns = columns(&join.schema()); @@ -3738,13 +3735,12 @@ mod tests { let n: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); let batch = RecordBatch::try_new(Arc::clone(&schema), vec![dates, n])?; let left = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) .unwrap(); let dates: ArrayRef = Arc::new(Date32Array::from(vec![19108, 19108, 19109])); let n: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); let batch = RecordBatch::try_new(Arc::clone(&schema), vec![dates, n])?; - let right = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let right = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let on = vec![( Arc::new(Column::new_with_schema("date", &left.schema()).unwrap()) as _, Arc::new(Column::new_with_schema("date", &right.schema()).unwrap()) as _, @@ -4034,7 +4030,7 @@ mod tests { ("b1", &vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]), ("c1", &vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]), ); - let left = MemorySourceConfig::try_new_exec( + let left = TestMemoryExec::try_new_exec( &[vec![left_batch.clone()], vec![left_batch.clone()]], left_batch.schema(), None, @@ -4045,7 +4041,7 @@ mod tests { ("b2", &vec![12, 13]), ("c2", &vec![14, 15]), ); - let right = MemorySourceConfig::try_new_exec( + let right = TestMemoryExec::try_new_exec( &[vec![right_batch.clone()], vec![right_batch.clone()]], right_batch.schema(), None, @@ -4130,7 +4126,7 @@ mod 
tests { ) .unwrap(); let schema_ref = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema_ref, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema_ref, None).unwrap() } #[tokio::test] diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 6de6b3b4dff4..64dfc8219b64 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -1030,8 +1030,7 @@ impl EmbeddedProjection for NestedLoopJoinExec { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; + use crate::test::TestMemoryExec; use crate::{ common, expressions::Column, repartition::RepartitionExec, test::build_table_i32, }; @@ -1072,7 +1071,7 @@ pub(crate) mod tests { }; let mut source = - MemorySourceConfig::try_new(&[batches], Arc::clone(&schema), None).unwrap(); + TestMemoryExec::try_new(&[batches], Arc::clone(&schema), None).unwrap(); if !sorted_column_names.is_empty() { let mut sort_info = LexOrdering::default(); for name in sorted_column_names { @@ -1089,7 +1088,7 @@ pub(crate) mod tests { source = source.try_with_sort_information(vec![sort_info]).unwrap(); } - Arc::new(DataSourceExec::new(Arc::new(source))) + Arc::new(TestMemoryExec::update_cache(Arc::new(source))) } fn build_left_table() -> Arc { diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index a3e835c64131..6c933ca21807 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -2547,7 +2547,7 @@ mod tests { use crate::joins::sort_merge_join::{get_corrected_filter_mask, JoinedRecordBatches}; use crate::joins::utils::{ColumnIndex, JoinFilter, JoinOn}; use crate::joins::SortMergeJoinExec; - use crate::memory::MemorySourceConfig; + use crate::test::TestMemoryExec; use crate::test::{build_table_i32, build_table_i32_two_cols}; use crate::{common, ExecutionPlan}; @@ -2558,12 +2558,12 @@ mod tests { ) -> Arc { let batch = build_table_i32(a, b, c); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn build_table_from_batches(batches: Vec) -> Arc { let schema = batches.first().unwrap().schema(); - MemorySourceConfig::try_new_exec(&[batches], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap() } fn build_date_table( @@ -2588,7 +2588,7 @@ mod tests { .unwrap(); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn build_date64_table( @@ -2613,7 +2613,7 @@ mod tests { .unwrap(); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } /// returns a table with 3 columns of i32 in memory @@ -2636,7 +2636,7 @@ mod tests { ], ) .unwrap(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } pub fn build_table_two_cols( @@ -2645,7 +2645,7 @@ mod tests { ) -> Arc { let batch = build_table_i32_two_cols(a, b); let schema = batch.schema(); - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, 
None).unwrap() + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap() } fn join( diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs index 9932c647be0a..e70007aa651f 100644 --- a/datafusion/physical-plan/src/joins/test_utils.rs +++ b/datafusion/physical-plan/src/joins/test_utils.rs @@ -23,9 +23,8 @@ use crate::joins::utils::{JoinFilter, JoinOn}; use crate::joins::{ HashJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use crate::memory::MemorySourceConfig; use crate::repartition::RepartitionExec; -use crate::source::DataSourceExec; +use crate::test::TestMemoryExec; use crate::{common, ExecutionPlan, ExecutionPlanProperties, Partitioning}; use arrow::array::{ @@ -530,14 +529,14 @@ pub fn create_memory_table( right_sorted: Vec, ) -> Result<(Arc, Arc)> { let left_schema = left_partition[0].schema(); - let left = MemorySourceConfig::try_new(&[left_partition], left_schema, None)? + let left = TestMemoryExec::try_new(&[left_partition], left_schema, None)? .try_with_sort_information(left_sorted)?; let right_schema = right_partition[0].schema(); - let right = MemorySourceConfig::try_new(&[right_partition], right_schema, None)? + let right = TestMemoryExec::try_new(&[right_partition], right_schema, None)? .try_with_sort_information(right_sorted)?; Ok(( - Arc::new(DataSourceExec::new(Arc::new(left))), - Arc::new(DataSourceExec::new(Arc::new(right))), + Arc::new(TestMemoryExec::update_cache(Arc::new(left))), + Arc::new(TestMemoryExec::update_cache(Arc::new(right))), )) } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 9210e3b0273c..a73cf78ab7f4 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -79,13 +79,11 @@ pub mod unnest; pub mod values; pub mod windows; pub mod work_table; - pub mod udaf { pub use datafusion_expr::StatisticsArgs; pub use datafusion_physical_expr::aggregate::AggregateFunctionExpr; } pub mod coalesce; -pub mod source; #[cfg(test)] pub mod test; diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 0077804bdfc9..fd338cc91353 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -22,696 +22,22 @@ use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; -use super::{ - common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, - Statistics, -}; use crate::execution_plan::{Boundedness, EmissionType}; -use crate::projection::{ - all_alias_free_columns, new_projections_for_columns, ProjectionExec, +use crate::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use crate::source::{DataSource, DataSourceExec}; -use arrow::array::{RecordBatch, RecordBatchOptions}; -use arrow::datatypes::{Schema, SchemaRef}; -use datafusion_common::{ - internal_err, plan_err, project_schema, Constraints, Result, ScalarValue, -}; +use arrow::array::RecordBatch; +use arrow::datatypes::SchemaRef; +use datafusion_common::{internal_err, Result}; use datafusion_execution::memory_pool::MemoryReservation; use datafusion_execution::TaskContext; -use datafusion_physical_expr::equivalence::ProjectionMapping; -use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::utils::collect_columns; -use 
datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; +use datafusion_physical_expr::EquivalenceProperties; use futures::Stream; use parking_lot::RwLock; -/// Execution plan for reading in-memory batches of data -#[derive(Clone)] -#[deprecated( - since = "46.0.0", - note = "use MemorySourceConfig and DataSourceExec instead" -)] -pub struct MemoryExec { - inner: DataSourceExec, - /// The partitions to query - partitions: Vec>, - /// Optional projection - projection: Option>, - // Sort information: one or more equivalent orderings - sort_information: Vec, - /// if partition sizes should be displayed - show_sizes: bool, -} - -#[allow(unused, deprecated)] -impl fmt::Debug for MemoryExec { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt_as(DisplayFormatType::Default, f) - } -} - -#[allow(unused, deprecated)] -impl DisplayAs for MemoryExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt_as(t, f) - } -} - -#[allow(unused, deprecated)] -impl ExecutionPlan for MemoryExec { - fn name(&self) -> &'static str { - "MemoryExec" - } - - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &PlanProperties { - self.inner.properties() - } - - fn children(&self) -> Vec<&Arc> { - // This is a leaf node and has no children - vec![] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - // MemoryExec has no children - if children.is_empty() { - Ok(self) - } else { - internal_err!("Children cannot be replaced in {self:?}") - } - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> Result { - self.inner.execute(partition, context) - } - - /// We recompute the statistics dynamically from the arrow metadata as it is pretty cheap to do so - fn statistics(&self) -> Result { - self.inner.statistics() - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - self.inner.try_swapping_with_projection(projection) - } -} - -#[allow(unused, deprecated)] -impl MemoryExec { - /// Create a new execution plan for reading in-memory record batches - /// The provided `schema` should not have the projection applied. 
- pub fn try_new( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result { - let source = MemorySourceConfig::try_new(partitions, schema, projection.clone())?; - let data_source = DataSourceExec::new(Arc::new(source)); - Ok(Self { - inner: data_source, - partitions: partitions.to_vec(), - projection, - sort_information: vec![], - show_sizes: true, - }) - } - - /// Create a new execution plan from a list of constant values (`ValuesExec`) - pub fn try_new_as_values( - schema: SchemaRef, - data: Vec>>, - ) -> Result { - if data.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - let n_row = data.len(); - let n_col = schema.fields().len(); - - // We have this single row batch as a placeholder to satisfy evaluation argument - // and generate a single output row - let placeholder_schema = Arc::new(Schema::empty()); - let placeholder_batch = RecordBatch::try_new_with_options( - Arc::clone(&placeholder_schema), - vec![], - &RecordBatchOptions::new().with_row_count(Some(1)), - )?; - - // Evaluate each column - let arrays = (0..n_col) - .map(|j| { - (0..n_row) - .map(|i| { - let expr = &data[i][j]; - let result = expr.evaluate(&placeholder_batch)?; - - match result { - ColumnarValue::Scalar(scalar) => Ok(scalar), - ColumnarValue::Array(array) if array.len() == 1 => { - ScalarValue::try_from_array(&array, 0) - } - ColumnarValue::Array(_) => { - plan_err!("Cannot have array values in a values list") - } - } - }) - .collect::>>() - .and_then(ScalarValue::iter_to_array) - }) - .collect::>>()?; - - let batch = RecordBatch::try_new_with_options( - Arc::clone(&schema), - arrays, - &RecordBatchOptions::new().with_row_count(Some(n_row)), - )?; - - let partitions = vec![batch]; - Self::try_new_from_batches(Arc::clone(&schema), partitions) - } - - /// Create a new plan using the provided schema and batches. - /// - /// Errors if any of the batches don't match the provided schema, or if no - /// batches are provided. - pub fn try_new_from_batches( - schema: SchemaRef, - batches: Vec, - ) -> Result { - if batches.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - for batch in &batches { - let batch_schema = batch.schema(); - if batch_schema != schema { - return plan_err!( - "Batch has invalid schema. 
Expected: {}, got: {}", - schema, - batch_schema - ); - } - } - - let partitions = vec![batches]; - let source = MemorySourceConfig { - partitions: partitions.clone(), - schema: Arc::clone(&schema), - projected_schema: Arc::clone(&schema), - projection: None, - sort_information: vec![], - show_sizes: true, - fetch: None, - }; - let data_source = DataSourceExec::new(Arc::new(source)); - Ok(Self { - inner: data_source, - partitions, - projection: None, - sort_information: vec![], - show_sizes: true, - }) - } - - fn memory_source_config(&self) -> MemorySourceConfig { - self.inner - .source() - .as_any() - .downcast_ref::() - .unwrap() - .clone() - } - - pub fn with_constraints(mut self, constraints: Constraints) -> Self { - self.inner = self.inner.with_constraints(constraints); - self - } - - /// Set `show_sizes` to determine whether to display partition sizes - pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { - let mut memory_source = self.memory_source_config(); - memory_source.show_sizes = show_sizes; - self.show_sizes = show_sizes; - self.inner = DataSourceExec::new(Arc::new(memory_source)); - self - } - - /// Ref to constraints - pub fn constraints(&self) -> &Constraints { - self.properties().equivalence_properties().constraints() - } - - /// Ref to partitions - pub fn partitions(&self) -> &[Vec] { - &self.partitions - } - - /// Ref to projection - pub fn projection(&self) -> &Option> { - &self.projection - } - - /// Show sizes - pub fn show_sizes(&self) -> bool { - self.show_sizes - } - - /// Ref to sort information - pub fn sort_information(&self) -> &[LexOrdering] { - &self.sort_information - } - - /// A memory table can be ordered by multiple expressions simultaneously. - /// [`EquivalenceProperties`] keeps track of expressions that describe the - /// global ordering of the schema. These columns are not necessarily same; e.g. - /// ```text - /// ┌-------┐ - /// | a | b | - /// |---|---| - /// | 1 | 9 | - /// | 2 | 8 | - /// | 3 | 7 | - /// | 5 | 5 | - /// └---┴---┘ - /// ``` - /// where both `a ASC` and `b DESC` can describe the table ordering. With - /// [`EquivalenceProperties`], we can keep track of these equivalences - /// and treat `a ASC` and `b DESC` as the same ordering requirement. - /// - /// Note that if there is an internal projection, that projection will be - /// also applied to the given `sort_information`. - pub fn try_with_sort_information( - mut self, - sort_information: Vec, - ) -> Result { - self.sort_information = sort_information.clone(); - let mut memory_source = self.memory_source_config(); - memory_source = memory_source.try_with_sort_information(sort_information)?; - self.inner = DataSourceExec::new(Arc::new(memory_source)); - Ok(self) - } - - /// Arc clone of ref to original schema - pub fn original_schema(&self) -> SchemaRef { - Arc::clone(&self.inner.schema()) - } - - /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn compute_properties( - schema: SchemaRef, - orderings: &[LexOrdering], - constraints: Constraints, - partitions: &[Vec], - ) -> PlanProperties { - PlanProperties::new( - EquivalenceProperties::new_with_orderings(schema, orderings) - .with_constraints(constraints), - Partitioning::UnknownPartitioning(partitions.len()), - EmissionType::Incremental, - Boundedness::Bounded, - ) - } -} - -/// Data source configuration for reading in-memory batches of data -#[derive(Clone)] -pub struct MemorySourceConfig { - /// The partitions to query - partitions: Vec>, - /// Schema representing the data before projection - schema: SchemaRef, - /// Schema representing the data after the optional projection is applied - projected_schema: SchemaRef, - /// Optional projection - projection: Option>, - /// Sort information: one or more equivalent orderings - sort_information: Vec, - /// if partition sizes should be displayed - show_sizes: bool, - /// The maximum number of records to read from this plan. If `None`, - /// all records after filtering are returned. - fetch: Option, -} - -impl DataSource for MemorySourceConfig { - fn open( - &self, - partition: usize, - _context: Arc, - ) -> Result { - Ok(Box::pin( - MemoryStream::try_new( - self.partitions[partition].clone(), - Arc::clone(&self.projected_schema), - self.projection.clone(), - )? - .with_fetch(self.fetch), - )) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { - let partition_sizes: Vec<_> = - self.partitions.iter().map(|b| b.len()).collect(); - - let output_ordering = self - .sort_information - .first() - .map(|output_ordering| { - format!(", output_ordering={}", output_ordering) - }) - .unwrap_or_default(); - - let eq_properties = self.eq_properties(); - let constraints = eq_properties.constraints(); - let constraints = if constraints.is_empty() { - String::new() - } else { - format!(", {}", constraints) - }; - - let limit = self - .fetch - .map_or(String::new(), |limit| format!(", fetch={}", limit)); - if self.show_sizes { - write!( - f, - "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}", - partition_sizes.len(), - ) - } else { - write!( - f, - "partitions={}{limit}{output_ordering}{constraints}", - partition_sizes.len(), - ) - } - } - } - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.partitions.len()) - } - - fn eq_properties(&self) -> EquivalenceProperties { - EquivalenceProperties::new_with_orderings( - Arc::clone(&self.projected_schema), - self.sort_information.as_slice(), - ) - } - - fn statistics(&self) -> Result { - Ok(common::compute_record_batch_statistics( - &self.partitions, - &self.schema, - self.projection.clone(), - )) - } - - fn with_fetch(&self, limit: Option) -> Option> { - let source = self.clone(); - Some(Arc::new(source.with_limit(limit))) - } - - fn fetch(&self) -> Option { - self.fetch - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - // If there is any non-column or alias-carrier expression, Projection should not be removed. - // This process can be moved into MemoryExec, but it would be an overlap of their responsibility. 
- all_alias_free_columns(projection.expr()) - .then(|| { - let all_projections = (0..self.schema.fields().len()).collect(); - let new_projections = new_projections_for_columns( - projection, - self.projection().as_ref().unwrap_or(&all_projections), - ); - - MemorySourceConfig::try_new_exec( - self.partitions(), - self.original_schema(), - Some(new_projections), - ) - .map(|e| e as _) - }) - .transpose() - } -} - -impl MemorySourceConfig { - /// Create a new `MemorySourceConfig` for reading in-memory record batches - /// The provided `schema` should not have the projection applied. - pub fn try_new( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result { - let projected_schema = project_schema(&schema, projection.as_ref())?; - Ok(Self { - partitions: partitions.to_vec(), - schema, - projected_schema, - projection, - sort_information: vec![], - show_sizes: true, - fetch: None, - }) - } - - /// Create a new `DataSourceExec` plan for reading in-memory record batches - /// The provided `schema` should not have the projection applied. - pub fn try_new_exec( - partitions: &[Vec], - schema: SchemaRef, - projection: Option>, - ) -> Result> { - let source = Self::try_new(partitions, schema, projection)?; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) - } - - /// Create a new execution plan from a list of constant values (`ValuesExec`) - pub fn try_new_as_values( - schema: SchemaRef, - data: Vec>>, - ) -> Result> { - if data.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - let n_row = data.len(); - let n_col = schema.fields().len(); - - // We have this single row batch as a placeholder to satisfy evaluation argument - // and generate a single output row - let placeholder_schema = Arc::new(Schema::empty()); - let placeholder_batch = RecordBatch::try_new_with_options( - Arc::clone(&placeholder_schema), - vec![], - &RecordBatchOptions::new().with_row_count(Some(1)), - )?; - - // Evaluate each column - let arrays = (0..n_col) - .map(|j| { - (0..n_row) - .map(|i| { - let expr = &data[i][j]; - let result = expr.evaluate(&placeholder_batch)?; - - match result { - ColumnarValue::Scalar(scalar) => Ok(scalar), - ColumnarValue::Array(array) if array.len() == 1 => { - ScalarValue::try_from_array(&array, 0) - } - ColumnarValue::Array(_) => { - plan_err!("Cannot have array values in a values list") - } - } - }) - .collect::>>() - .and_then(ScalarValue::iter_to_array) - }) - .collect::>>()?; - - let batch = RecordBatch::try_new_with_options( - Arc::clone(&schema), - arrays, - &RecordBatchOptions::new().with_row_count(Some(n_row)), - )?; - - let partitions = vec![batch]; - Self::try_new_from_batches(Arc::clone(&schema), partitions) - } - - /// Create a new plan using the provided schema and batches. - /// - /// Errors if any of the batches don't match the provided schema, or if no - /// batches are provided. - pub fn try_new_from_batches( - schema: SchemaRef, - batches: Vec, - ) -> Result> { - if batches.is_empty() { - return plan_err!("Values list cannot be empty"); - } - - for batch in &batches { - let batch_schema = batch.schema(); - if batch_schema != schema { - return plan_err!( - "Batch has invalid schema. 
Expected: {}, got: {}", - schema, - batch_schema - ); - } - } - - let partitions = vec![batches]; - let source = Self { - partitions, - schema: Arc::clone(&schema), - projected_schema: Arc::clone(&schema), - projection: None, - sort_information: vec![], - show_sizes: true, - fetch: None, - }; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) - } - - /// Set the limit of the files - pub fn with_limit(mut self, limit: Option) -> Self { - self.fetch = limit; - self - } - - /// Set `show_sizes` to determine whether to display partition sizes - pub fn with_show_sizes(mut self, show_sizes: bool) -> Self { - self.show_sizes = show_sizes; - self - } - - /// Ref to partitions - pub fn partitions(&self) -> &[Vec] { - &self.partitions - } - - /// Ref to projection - pub fn projection(&self) -> &Option> { - &self.projection - } - - /// Show sizes - pub fn show_sizes(&self) -> bool { - self.show_sizes - } - - /// Ref to sort information - pub fn sort_information(&self) -> &[LexOrdering] { - &self.sort_information - } - - /// A memory table can be ordered by multiple expressions simultaneously. - /// [`EquivalenceProperties`] keeps track of expressions that describe the - /// global ordering of the schema. These columns are not necessarily same; e.g. - /// ```text - /// ┌-------┐ - /// | a | b | - /// |---|---| - /// | 1 | 9 | - /// | 2 | 8 | - /// | 3 | 7 | - /// | 5 | 5 | - /// └---┴---┘ - /// ``` - /// where both `a ASC` and `b DESC` can describe the table ordering. With - /// [`EquivalenceProperties`], we can keep track of these equivalences - /// and treat `a ASC` and `b DESC` as the same ordering requirement. - /// - /// Note that if there is an internal projection, that projection will be - /// also applied to the given `sort_information`. - pub fn try_with_sort_information( - mut self, - mut sort_information: Vec, - ) -> Result { - // All sort expressions must refer to the original schema - let fields = self.schema.fields(); - let ambiguous_column = sort_information - .iter() - .flat_map(|ordering| ordering.clone()) - .flat_map(|expr| collect_columns(&expr.expr)) - .find(|col| { - fields - .get(col.index()) - .map(|field| field.name() != col.name()) - .unwrap_or(true) - }); - if let Some(col) = ambiguous_column { - return internal_err!( - "Column {:?} is not found in the original schema of the MemorySourceConfig", - col - ); - } - - // If there is a projection on the source, we also need to project orderings - if let Some(projection) = &self.projection { - let base_eqp = EquivalenceProperties::new_with_orderings( - self.original_schema(), - &sort_information, - ); - let proj_exprs = projection - .iter() - .map(|idx| { - let base_schema = self.original_schema(); - let name = base_schema.field(*idx).name(); - (Arc::new(Column::new(name, *idx)) as _, name.to_string()) - }) - .collect::>(); - let projection_mapping = - ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?; - sort_information = base_eqp - .project(&projection_mapping, Arc::clone(&self.projected_schema)) - .into_oeq_class() - .into_inner(); - } - - self.sort_information = sort_information; - Ok(self) - } - - /// Arc clone of ref to original schema - pub fn original_schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } -} - /// Iterator over batches pub struct MemoryStream { /// Vector of record batches @@ -746,13 +72,13 @@ impl MemoryStream { } /// Set the memory reservation for the data - pub(super) fn with_reservation(mut self, reservation: MemoryReservation) -> Self { + pub fn with_reservation(mut self, 
reservation: MemoryReservation) -> Self { self.reservation = Some(reservation); self } /// Set the number of rows to produce - pub(super) fn with_fetch(mut self, fetch: Option) -> Self { + pub fn with_fetch(mut self, fetch: Option) -> Self { self.fetch = fetch; self } @@ -962,62 +288,6 @@ impl RecordBatchStream for LazyMemoryStream { } } -#[cfg(test)] -mod memory_exec_tests { - use std::sync::Arc; - - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; - use crate::ExecutionPlan; - - use arrow::compute::SortOptions; - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_physical_expr::expressions::col; - use datafusion_physical_expr::PhysicalSortExpr; - use datafusion_physical_expr_common::sort_expr::LexOrdering; - - #[test] - fn test_memory_order_eq() -> datafusion_common::Result<()> { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, false), - Field::new("b", DataType::Int64, false), - Field::new("c", DataType::Int64, false), - ])); - let sort1 = LexOrdering::new(vec![ - PhysicalSortExpr { - expr: col("a", &schema)?, - options: SortOptions::default(), - }, - PhysicalSortExpr { - expr: col("b", &schema)?, - options: SortOptions::default(), - }, - ]); - let sort2 = LexOrdering::new(vec![PhysicalSortExpr { - expr: col("c", &schema)?, - options: SortOptions::default(), - }]); - let mut expected_output_order = LexOrdering::default(); - expected_output_order.extend(sort1.clone()); - expected_output_order.extend(sort2.clone()); - - let sort_information = vec![sort1.clone(), sort2.clone()]; - let mem_exec = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[vec![]], schema, None)? - .try_with_sort_information(sort_information)?, - ))); - - assert_eq!( - mem_exec.properties().output_ordering().unwrap(), - &expected_output_order - ); - let eq_properties = mem_exec.properties().equivalence_properties(); - assert!(eq_properties.oeq_class().contains(&sort1)); - assert!(eq_properties.oeq_class().contains(&sort2)); - Ok(()) - } -} - #[cfg(test)] mod lazy_memory_tests { use super::*; @@ -1137,129 +407,3 @@ mod lazy_memory_tests { Ok(()) } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::expressions::lit; - use crate::test::{self, make_partition}; - - use arrow::datatypes::{DataType, Field}; - use datafusion_common::assert_batches_eq; - use datafusion_common::stats::{ColumnStatistics, Precision}; - use futures::StreamExt; - - #[tokio::test] - async fn exec_with_limit() -> Result<()> { - let task_ctx = Arc::new(TaskContext::default()); - let batch = make_partition(7); - let schema = batch.schema(); - let batches = vec![batch.clone(), batch]; - - let exec = MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); - assert_eq!(exec.fetch(), None); - - let exec = exec.with_fetch(Some(4)).unwrap(); - assert_eq!(exec.fetch(), Some(4)); - - let mut it = exec.execute(0, task_ctx)?; - let mut results = vec![]; - while let Some(batch) = it.next().await { - results.push(batch?); - } - - let expected = [ - "+---+", "| i |", "+---+", "| 0 |", "| 1 |", "| 2 |", "| 3 |", "+---+", - ]; - assert_batches_eq!(expected, &results); - Ok(()) - } - - #[tokio::test] - async fn values_empty_case() -> Result<()> { - let schema = test::aggr_test_schema(); - let empty = MemorySourceConfig::try_new_as_values(schema, vec![]); - assert!(empty.is_err()); - Ok(()) - } - - #[test] - fn new_exec_with_batches() { - let batch = make_partition(7); - let schema = batch.schema(); - let batches = vec![batch.clone(), batch]; - let _exec = 
MemorySourceConfig::try_new_from_batches(schema, batches).unwrap(); - } - - #[test] - fn new_exec_with_batches_empty() { - let batch = make_partition(7); - let schema = batch.schema(); - let _ = MemorySourceConfig::try_new_from_batches(schema, Vec::new()).unwrap_err(); - } - - #[test] - fn new_exec_with_batches_invalid_schema() { - let batch = make_partition(7); - let batches = vec![batch.clone(), batch]; - - let invalid_schema = Arc::new(Schema::new(vec![ - Field::new("col0", DataType::UInt32, false), - Field::new("col1", DataType::Utf8, false), - ])); - let _ = MemorySourceConfig::try_new_from_batches(invalid_schema, batches) - .unwrap_err(); - } - - // Test issue: https://github.com/apache/datafusion/issues/8763 - #[test] - fn new_exec_with_non_nullable_schema() { - let schema = Arc::new(Schema::new(vec![Field::new( - "col0", - DataType::UInt32, - false, - )])); - let _ = MemorySourceConfig::try_new_as_values( - Arc::clone(&schema), - vec![vec![lit(1u32)]], - ) - .unwrap(); - // Test that a null value is rejected - let _ = MemorySourceConfig::try_new_as_values( - schema, - vec![vec![lit(ScalarValue::UInt32(None))]], - ) - .unwrap_err(); - } - - #[test] - fn values_stats_with_nulls_only() -> Result<()> { - let data = vec![ - vec![lit(ScalarValue::Null)], - vec![lit(ScalarValue::Null)], - vec![lit(ScalarValue::Null)], - ]; - let rows = data.len(); - let values = MemorySourceConfig::try_new_as_values( - Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])), - data, - )?; - - assert_eq!( - values.statistics()?, - Statistics { - num_rows: Precision::Exact(rows), - total_byte_size: Precision::Exact(8), // not important - column_statistics: vec![ColumnStatistics { - null_count: Precision::Exact(rows), // there are only nulls - distinct_count: Precision::Absent, - max_value: Precision::Absent, - min_value: Precision::Absent, - sum_value: Precision::Absent, - },], - } - ); - - Ok(()) - } -} diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 6a8f247ec0e6..6e31f601e152 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -20,10 +20,10 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::execution_plan::{Boundedness, EmissionType}; -use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning}; - +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; +use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::array::{ArrayRef, NullArray}; use arrow::array::{RecordBatch, RecordBatchOptions}; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; @@ -178,7 +178,8 @@ impl ExecutionPlan for PlaceholderRowExec { #[cfg(test)] mod tests { use super::*; - use crate::{test, with_new_children_if_necessary}; + use crate::test; + use crate::with_new_children_if_necessary; #[test] fn with_new_children() -> Result<()> { diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index bf7d2c7f275c..05b78e4e1da4 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -21,12 +21,12 @@ use std::any::Any; use std::sync::Arc; use std::task::{Context, Poll}; -use super::{ +use super::work_table::{ReservedBatches, WorkTable, WorkTableExec}; +use 
crate::execution_plan::{Boundedness, EmissionType}; +use crate::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, - work_table::{ReservedBatches, WorkTable, WorkTableExec}, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use crate::execution_plan::{Boundedness, EmissionType}; use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; use arrow::datatypes::SchemaRef; @@ -156,10 +156,10 @@ impl ExecutionPlan for RecursiveQueryExec { vec![false, false] } - fn required_input_distribution(&self) -> Vec { + fn required_input_distribution(&self) -> Vec { vec![ - datafusion_physical_expr::Distribution::SinglePartition, - datafusion_physical_expr::Distribution::SinglePartition, + crate::Distribution::SinglePartition, + crate::Distribution::SinglePartition, ] } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 25668fa67d5b..40e68cfcae83 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1051,6 +1051,7 @@ mod tests { use std::collections::HashSet; use super::*; + use crate::test::TestMemoryExec; use crate::{ test::{ assert_is_pending, @@ -1059,7 +1060,7 @@ mod tests { ErrorExec, MockExec, }, }, - {collect, expressions::col, memory::MemorySourceConfig}, + {collect, expressions::col}, }; use arrow::array::{ArrayRef, StringArray, UInt32Array}; @@ -1164,11 +1165,8 @@ mod tests { ) -> Result>> { let task_ctx = Arc::new(TaskContext::default()); // create physical plan - let exec = MemorySourceConfig::try_new_exec( - &input_partitions, - Arc::clone(schema), - None, - )?; + let exec = + TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(schema), None)?; let exec = RepartitionExec::try_new(exec, partitioning)?; // execute and collect results @@ -1559,11 +1557,8 @@ mod tests { let task_ctx = Arc::new(task_ctx); // create physical plan - let exec = MemorySourceConfig::try_new_exec( - &input_partitions, - Arc::clone(&schema), - None, - )?; + let exec = + TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?; let exec = RepartitionExec::try_new(exec, partitioning)?; // pull partitions @@ -1604,8 +1599,7 @@ mod test { use arrow::datatypes::{DataType, Field, Schema}; use super::*; - use crate::memory::MemorySourceConfig; - use crate::source::DataSourceExec; + use crate::test::TestMemoryExec; use crate::union::UnionExec; use datafusion_physical_expr::expressions::col; @@ -1711,15 +1705,15 @@ mod test { } fn memory_exec(schema: &SchemaRef) -> Arc { - MemorySourceConfig::try_new_exec(&[vec![]], Arc::clone(schema), None).unwrap() + TestMemoryExec::try_new_exec(&[vec![]], Arc::clone(schema), None).unwrap() } fn sorted_memory_exec( schema: &SchemaRef, sort_exprs: LexOrdering, ) -> Arc { - Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[vec![]], Arc::clone(schema), None) + Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[vec![]], Arc::clone(schema), None) .unwrap() .try_with_sort_information(vec![sort_exprs]) .unwrap(), diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index eeef73c45fc4..dc03c012d9be 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -466,11 +466,11 @@ mod tests { use crate::collect; use crate::expressions::col; use crate::expressions::PhysicalSortExpr; - use crate::memory::MemorySourceConfig; use 
crate::sorts::sort::SortExec; use crate::test; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use super::*; @@ -696,7 +696,7 @@ mod tests { ); let schema = batch1.schema(); - MemorySourceConfig::try_new_exec( + TestMemoryExec::try_new_exec( &[vec![batch1, batch2, batch3, batch4]], Arc::clone(&schema), None, @@ -881,7 +881,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data])?; let input = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?; + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?; let partial_sort_exec = Arc::new(PartialSortExec::new( LexOrdering::new(vec![PhysicalSortExpr { @@ -987,7 +987,7 @@ mod tests { options: option_desc, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?, 2, )); diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 649468260e56..30b5abcf8897 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1142,10 +1142,10 @@ mod tests { use crate::collect; use crate::execution_plan::Boundedness; use crate::expressions::col; - use crate::memory::MemorySourceConfig; use crate::test; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use arrow::array::*; use arrow::compute::SortOptions; @@ -1531,7 +1531,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data]).unwrap(); let input = - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None) .unwrap(); let sort_exec = Arc::new(SortExec::new( @@ -1602,7 +1602,7 @@ mod tests { }, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?, )); assert_eq!(DataType::Int32, *sort_exec.schema().field(0).data_type()); @@ -1688,7 +1688,7 @@ mod tests { }, }, ]), - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?, + TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?, )); assert_eq!(DataType::Float32, *sort_exec.schema().field(0).data_type()); diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 2cc55d60292a..454a06855175 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -384,12 +384,12 @@ mod tests { use crate::coalesce_partitions::CoalescePartitionsExec; use crate::execution_plan::{Boundedness, EmissionType}; use crate::expressions::col; - use crate::memory::MemorySourceConfig; use crate::metrics::{MetricValue, Timestamp}; use crate::repartition::RepartitionExec; use crate::sorts::sort::SortExec; use crate::stream::RecordBatchReceiverStream; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::TestMemoryExec; use crate::test::{self, assert_is_pending, make_partition}; use crate::{collect, common}; @@ -451,7 +451,7 @@ mod tests { ]); let repartition_exec = RepartitionExec::try_new( - MemorySourceConfig::try_new_exec(&[rbs], schema, None).unwrap(), + TestMemoryExec::try_new_exec(&[rbs], schema, 
None).unwrap(), Partitioning::RoundRobinBatch(2), )?; let coalesce_batches_exec = @@ -543,7 +543,7 @@ mod tests { let schema = batch.schema(); let sort = LexOrdering::default(); // no sort expressions - let exec = MemorySourceConfig::try_new_exec( + let exec = TestMemoryExec::try_new_exec( &[vec![batch.clone()], vec![batch]], schema, None, @@ -736,7 +736,7 @@ mod tests { options: Default::default(), }, ]); - let exec = MemorySourceConfig::try_new_exec(partitions, schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(partitions, schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, context).await.unwrap(); @@ -844,7 +844,7 @@ mod tests { let sorted = basic_sort(csv, sort, context).await; let split: Vec<_> = sizes.iter().map(|x| split_batch(&sorted, *x)).collect(); - Ok(MemorySourceConfig::try_new_exec(&split, sorted.schema(), None).unwrap()) + Ok(TestMemoryExec::try_new_exec(&split, sorted.schema(), None).unwrap()) } #[tokio::test] @@ -972,8 +972,8 @@ mod tests { }, }, ]); - let exec = MemorySourceConfig::try_new_exec(&[vec![b1], vec![b2]], schema, None) - .unwrap(); + let exec = + TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); @@ -1015,8 +1015,7 @@ mod tests { nulls_first: true, }, }]); - let exec = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec).with_fetch(Some(2))); @@ -1051,8 +1050,7 @@ mod tests { nulls_first: true, }, }]); - let exec = - MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); @@ -1161,8 +1159,8 @@ mod tests { expr: col("b", &schema).unwrap(), options: Default::default(), }]); - let exec = MemorySourceConfig::try_new_exec(&[vec![b1], vec![b2]], schema, None) - .unwrap(); + let exec = + TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(Arc::clone(&merge) as Arc, task_ctx) @@ -1273,7 +1271,7 @@ mod tests { }, }]); - let exec = MemorySourceConfig::try_new_exec(&partitions, schema, None).unwrap(); + let exec = TestMemoryExec::try_new_exec(&partitions, schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, exec)); let collected = collect(merge, task_ctx).await.unwrap(); diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs index ad0e43503b2b..7d0e3778452f 100644 --- a/datafusion/physical-plan/src/test.rs +++ b/datafusion/physical-plan/src/test.rs @@ -17,27 +17,337 @@ //! 
Utilities for testing datafusion-physical-plan
 
+use std::any::Any;
 use std::collections::HashMap;
+use std::fmt;
+use std::fmt::{Debug, Formatter};
 use std::pin::Pin;
 use std::sync::Arc;
+use std::task::Context;
+
+use crate::common;
+use crate::execution_plan::{Boundedness, EmissionType};
+use crate::memory::MemoryStream;
+use crate::metrics::MetricsSet;
+use crate::stream::RecordBatchStreamAdapter;
+use crate::streaming::PartitionStream;
+use crate::ExecutionPlan;
+use crate::{DisplayAs, DisplayFormatType, PlanProperties};
 
 use arrow::array::{Array, ArrayRef, Int32Array, RecordBatch};
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use datafusion_common::{
+    config::ConfigOptions, internal_err, project_schema, Result, Statistics,
+};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
-use futures::{Future, FutureExt};
+use datafusion_physical_expr::{
+    equivalence::ProjectionMapping, expressions::Column, utils::collect_columns,
+    EquivalenceProperties, LexOrdering, Partitioning,
+};
 
-use crate::memory::MemorySourceConfig;
-use crate::source::DataSourceExec;
-use crate::stream::RecordBatchStreamAdapter;
-use crate::streaming::PartitionStream;
-use crate::ExecutionPlan;
+use futures::{Future, FutureExt};
 
 pub mod exec;
 
+/// `TestMemoryExec` is a mock equivalent of [`MemorySourceConfig`] that implements
+/// [`ExecutionPlan`] for testing, i.e. it has some, but not all, of the
+/// functionality of [`MemorySourceConfig`].
+/// It implements an in-memory data source as an [`ExecutionPlan`] directly, rather
+/// than implementing the `DataSource` trait, so that the relevant unit tests can
+/// stay in this crate while avoiding a circular dependency between
+/// `datafusion-physical-plan` and `datafusion-datasource`.
+///
+/// [`MemorySourceConfig`]: https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs
+#[derive(Clone, Debug)]
+pub struct TestMemoryExec {
+    /// The partitions to query
+    partitions: Vec<Vec<RecordBatch>>,
+    /// Schema representing the data before projection
+    schema: SchemaRef,
+    /// Schema representing the data after the optional projection is applied
+    projected_schema: SchemaRef,
+    /// Optional projection
+    projection: Option<Vec<usize>>,
+    /// Sort information: one or more equivalent orderings
+    sort_information: Vec<LexOrdering>,
+    /// Whether partition sizes should be displayed
+    show_sizes: bool,
+    /// The maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
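+    /// For example, with two 7-row batches and `fetch: Some(4)`, a scan
+    /// produces only the first four rows (as exercised by the
+    /// `exec_with_limit` test for `MemorySourceConfig`).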
+    fetch: Option<usize>,
+    cache: PlanProperties,
+}
+
+impl DisplayAs for TestMemoryExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
+        write!(f, "DataSourceExec: ")?;
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                let partition_sizes: Vec<_> =
+                    self.partitions.iter().map(|b| b.len()).collect();
+
+                let output_ordering = self
+                    .sort_information
+                    .first()
+                    .map(|output_ordering| {
+                        format!(", output_ordering={}", output_ordering)
+                    })
+                    .unwrap_or_default();
+
+                let eq_properties = self.eq_properties();
+                let constraints = eq_properties.constraints();
+                let constraints = if constraints.is_empty() {
+                    String::new()
+                } else {
+                    format!(", {}", constraints)
+                };
+
+                let limit = self
+                    .fetch
+                    .map_or(String::new(), |limit| format!(", fetch={}", limit));
+                if self.show_sizes {
+                    write!(
+                        f,
+                        "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
+                } else {
+                    write!(
+                        f,
+                        "partitions={}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
+                }
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for TestMemoryExec {
+    fn name(&self) -> &'static str {
+        "DataSourceExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        unimplemented!()
+    }
+
+    fn properties(&self) -> &PlanProperties {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        Vec::new()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        unimplemented!()
+    }
+
+    fn repartitioned(
+        &self,
+        _target_partitions: usize,
+        _config: &ConfigOptions,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        unimplemented!()
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        self.open(partition, context)
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        unimplemented!()
+    }
+
+    fn statistics(&self) -> Result<Statistics> {
+        self.statistics()
+    }
+
+    fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
+}
+
+impl TestMemoryExec {
+    fn open(
+        &self,
+        partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        Ok(Box::pin(
+            MemoryStream::try_new(
+                self.partitions[partition].clone(),
+                Arc::clone(&self.projected_schema),
+                self.projection.clone(),
+            )?
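+            // Limit the stream to at most `fetch` rows, if a limit was set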
+            .with_fetch(self.fetch),
+        ))
+    }
+
+    fn compute_properties(&self) -> PlanProperties {
+        PlanProperties::new(
+            self.eq_properties(),
+            self.output_partitioning(),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        )
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        Partitioning::UnknownPartitioning(self.partitions.len())
+    }
+
+    fn eq_properties(&self) -> EquivalenceProperties {
+        EquivalenceProperties::new_with_orderings(
+            Arc::clone(&self.projected_schema),
+            self.sort_information.as_slice(),
+        )
+    }
+
+    fn statistics(&self) -> Result<Statistics> {
+        Ok(common::compute_record_batch_statistics(
+            &self.partitions,
+            &self.schema,
+            self.projection.clone(),
+        ))
+    }
+
+    pub fn try_new(
+        partitions: &[Vec<RecordBatch>],
+        schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        let projected_schema = project_schema(&schema, projection.as_ref())?;
+        Ok(Self {
+            partitions: partitions.to_vec(),
+            schema,
+            cache: PlanProperties::new(
+                EquivalenceProperties::new_with_orderings(
+                    Arc::clone(&projected_schema),
+                    vec![].as_slice(),
+                ),
+                Partitioning::UnknownPartitioning(partitions.len()),
+                EmissionType::Incremental,
+                Boundedness::Bounded,
+            ),
+            projected_schema,
+            projection,
+            sort_information: vec![],
+            show_sizes: true,
+            fetch: None,
+        })
+    }
+
+    /// Create a new `DataSourceExec`-equivalent plan for reading in-memory record batches.
+    /// The provided `schema` should not have the projection applied.
+    pub fn try_new_exec(
+        partitions: &[Vec<RecordBatch>],
+        schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Arc<Self>> {
+        let mut source = Self::try_new(partitions, schema, projection)?;
+        let cache = source.compute_properties();
+        source.cache = cache;
+        Ok(Arc::new(source))
+    }
+
+    /// Equivalent of `DataSourceExec::new`
+    pub fn update_cache(source: Arc<TestMemoryExec>) -> TestMemoryExec {
+        let cache = source.compute_properties();
+        let source = &*source;
+        let mut source = source.clone();
+        source.cache = cache;
+        source
+    }
+
+    /// Set the maximum number of rows to read
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.fetch = limit;
+        self
+    }
+
+    /// Ref to partitions
+    pub fn partitions(&self) -> &[Vec<RecordBatch>] {
+        &self.partitions
+    }
+
+    /// Ref to projection
+    pub fn projection(&self) -> &Option<Vec<usize>> {
+        &self.projection
+    }
+
+    /// Ref to sort information
+    pub fn sort_information(&self) -> &[LexOrdering] {
+        &self.sort_information
+    }
+
+    /// Refer to `try_with_sort_information` in `MemorySourceConfig` (linked below) for more information.
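+    ///
+    /// A minimal usage sketch (mirroring this crate's unit tests; `schema` and
+    /// `ordering` are assumed to be a `SchemaRef` and a `LexOrdering` in scope):
+    /// ```ignore
+    /// let source = TestMemoryExec::try_new(&[vec![]], schema, None)?
+    ///     .try_with_sort_information(vec![ordering])?;
+    /// let exec = Arc::new(TestMemoryExec::update_cache(Arc::new(source)));
+    /// ```
+    /// `MemorySourceConfig` lives at: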
+    /// https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs
+    pub fn try_with_sort_information(
+        mut self,
+        mut sort_information: Vec<LexOrdering>,
+    ) -> Result<Self> {
+        // All sort expressions must refer to the original schema
+        let fields = self.schema.fields();
+        let ambiguous_column = sort_information
+            .iter()
+            .flat_map(|ordering| ordering.clone())
+            .flat_map(|expr| collect_columns(&expr.expr))
+            .find(|col| {
+                fields
+                    .get(col.index())
+                    .map(|field| field.name() != col.name())
+                    .unwrap_or(true)
+            });
+        if let Some(col) = ambiguous_column {
+            return internal_err!(
+                "Column {:?} is not found in the original schema of the TestMemoryExec",
+                col
+            );
+        }
+
+        // If there is a projection on the source, we also need to project orderings
+        if let Some(projection) = &self.projection {
+            let base_eqp = EquivalenceProperties::new_with_orderings(
+                self.original_schema(),
+                &sort_information,
+            );
+            let proj_exprs = projection
+                .iter()
+                .map(|idx| {
+                    let base_schema = self.original_schema();
+                    let name = base_schema.field(*idx).name();
+                    (Arc::new(Column::new(name, *idx)) as _, name.to_string())
+                })
+                .collect::<Vec<_>>();
+            let projection_mapping =
+                ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?;
+            sort_information = base_eqp
+                .project(&projection_mapping, Arc::clone(&self.projected_schema))
+                .into_oeq_class()
+                .into_inner();
+        }
+
+        self.sort_information = sort_information;
+        Ok(self)
+    }
+
+    /// Returns an `Arc` clone of the original (pre-projection) schema
+    pub fn original_schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
 /// Asserts that given future is pending.
 pub fn assert_is_pending<'a, T>(fut: &mut Pin<Box<dyn Future<Output = T> + Send + 'a>>) {
     let waker = futures::task::noop_waker();
-    let mut cx = futures::task::Context::from_waker(&waker);
+    let mut cx = Context::from_waker(&waker);
     let poll = fut.poll_unpin(&mut cx);
 
     assert!(poll.is_pending());
@@ -117,7 +427,7 @@ pub fn build_table_scan_i32(
 ) -> Arc<dyn ExecutionPlan> {
     let batch = build_table_i32(a, b, c);
     let schema = batch.schema();
-    MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).unwrap()
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
 }
 
 /// Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i"
@@ -157,26 +467,24 @@ pub fn scan_partitioned_utf8(partitions: usize) -> Arc<dyn ExecutionPlan> {
 }
 
-/// Returns a `DataSourceExec` that scans `partitions` of 100 batches each
-pub fn mem_exec(partitions: usize) -> DataSourceExec {
+/// Returns a `TestMemoryExec` that scans `partitions` partitions, each holding one 100-row batch
+pub fn mem_exec(partitions: usize) -> TestMemoryExec {
     let data: Vec<Vec<RecordBatch>> = (0..partitions).map(|_| vec![make_partition(100)]).collect();
     let schema = data[0][0].schema();
     let projection = None;
-    DataSourceExec::new(Arc::new(
-        MemorySourceConfig::try_new(&data, schema, projection).unwrap(),
-    ))
+
+    TestMemoryExec::try_new(&data, schema, projection).unwrap()
 }
 
-pub fn mem_exec_utf8(partitions: usize) -> DataSourceExec {
+pub fn mem_exec_utf8(partitions: usize) -> TestMemoryExec {
     let data: Vec<Vec<RecordBatch>> = (0..partitions)
         .map(|_| vec![make_partition_utf8(100)])
         .collect();
     let schema = data[0][0].schema();
     let projection = None;
-    DataSourceExec::new(Arc::new(
-        MemorySourceConfig::try_new(&data, schema, projection).unwrap(),
-    ))
+
+    TestMemoryExec::try_new(&data, schema, projection).unwrap()
 }
 
 // Construct a stream partition for test purposes
diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs
index e1972d267b97..68d1803b7133 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -640,10 +640,9 @@
fn stats_union(mut left: Statistics, right: Statistics) -> Statistics { mod tests { use super::*; use crate::collect; - use crate::memory::MemorySourceConfig; use crate::test; + use crate::test::TestMemoryExec; - use crate::source::DataSourceExec; use arrow::compute::SortOptions; use arrow::datatypes::DataType; use datafusion_common::ScalarValue; @@ -865,12 +864,12 @@ mod tests { .iter() .map(|ordering| convert_to_sort_exprs(ordering)) .collect::>(); - let child1 = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[], Arc::clone(&schema), None)? + let child1 = Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[], Arc::clone(&schema), None)? .try_with_sort_information(first_orderings)?, ))); - let child2 = Arc::new(DataSourceExec::new(Arc::new( - MemorySourceConfig::try_new(&[], Arc::clone(&schema), None)? + let child2 = Arc::new(TestMemoryExec::update_cache(Arc::new( + TestMemoryExec::try_new(&[], Arc::clone(&schema), None)? .try_with_sort_information(second_orderings)?, ))); diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index 6ab5cc84a21f..b90c50510cb0 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -20,13 +20,12 @@ use std::any::Any; use std::sync::Arc; -use super::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::execution_plan::{Boundedness, EmissionType}; +use crate::memory::MemoryStream; +use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::{ - memory::MemoryStream, ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, - PhysicalExpr, + ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, }; - use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 1e21d0757c41..c78c870ff383 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1190,9 +1190,9 @@ mod tests { use crate::common::collect; use crate::expressions::PhysicalSortExpr; - use crate::memory::MemorySourceConfig; use crate::projection::ProjectionExec; use crate::streaming::{PartitionStream, StreamingTableExec}; + use crate::test::TestMemoryExec; use crate::windows::{ create_udwf_window_expr, create_window_expr, BoundedWindowAggExec, InputOrderMode, }; @@ -1551,7 +1551,7 @@ mod tests { vec![Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3]))], )?; - let memory_exec = MemorySourceConfig::try_new_exec( + let memory_exec = TestMemoryExec::try_new_exec( &[vec![batch.clone(), batch.clone(), batch.clone()]], Arc::clone(&schema), None, diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index a6ced527cbb2..d3d29bfad7ce 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -20,12 +20,12 @@ use std::any::Any; use std::sync::{Arc, Mutex}; -use super::{ +use crate::execution_plan::{Boundedness, EmissionType}; +use crate::memory::MemoryStream; +use crate::{ metrics::{ExecutionPlanMetricsSet, MetricsSet}, SendableRecordBatchStream, Statistics, }; -use crate::execution_plan::{Boundedness, EmissionType}; -use 
crate::memory::MemoryStream; use crate::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; use arrow::datatypes::SchemaRef; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index a575a42d0b6c..2c596255587b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -32,6 +32,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::physical_plan::{AvroSource, CsvSource, FileScanConfig}; +use datafusion::datasource::source::DataSourceExec; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; use datafusion::physical_expr::aggregate::AggregateFunctionExpr; @@ -57,7 +58,6 @@ use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::union::{InterleaveExec, UnionExec}; use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; use datafusion::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 3fc94a33442b..e8c15731228c 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -23,8 +23,8 @@ use crate::variation_const::{ }; use datafusion::arrow::datatypes::DataType; +use datafusion::datasource::source::DataSourceExec; use datafusion::error::{DataFusionError, Result}; -use datafusion::physical_plan::source::DataSourceExec; use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource};