apache · Weijun-H · Dec 14, 2024 · Dec 14, 2024 · Dec 15, 2024 · Dec 24, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -36,6 +36,7 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
 DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
 CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
 PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
+PREFER_ROUND_ROBIN=${PREFER_ROUND_ROBIN:-true}
 VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}
 
 usage() {
@@ -93,6 +94,7 @@ CARGO_COMMAND       command that runs the benchmark binary
 DATAFUSION_DIR      directory to use (default $DATAFUSION_DIR)
 RESULTS_NAME        folder where the benchmark files are stored
 PREFER_HASH_JOIN    Prefer hash join algorithm (default true)
+PREFER_ROUND_ROBIN  Prefer round robin partitioning (default true)
 VENV_PATH           Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
 "
     exit 1
@@ -163,6 +165,9 @@ main() {
                 tpch10)
                     data_tpch "10"
                     ;;
+                tpch50)
+                    data_tpch "50"
+                    ;;
                 tpch_mem10)
                     # same data as for tpch10
                     data_tpch "10"
@@ -220,6 +225,7 @@ main() {
             echo "RESULTS_DIR: ${RESULTS_DIR}"
             echo "CARGO_COMMAND: ${CARGO_COMMAND}"
             echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
+            echo "PREFER_ROUND_ROBIN: ${PREFER_ROUND_ROBIN}"
             echo "***************************"
 
             # navigate to the appropriate directory
@@ -252,6 +258,9 @@ main() {
                 tpch10)
                     run_tpch "10"
                     ;;
+                tpch50)
+                    run_tpch "50"
+                    ;;
                 tpch_mem10)
                     run_tpch_mem "10"
                     ;;
@@ -378,7 +387,7 @@ run_tpch() {
     RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch benchmark..."
-    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --format parquet -o "${RESULTS_FILE}"
 }
 
 # Runs the tpch in memory
@@ -394,7 +403,7 @@ run_tpch_mem() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch_mem benchmark..."
     # -m means in memory
-    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}"
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --prefer_round_robin "${PREFER_ROUND_ROBIN}" --format parquet -o "${RESULTS_FILE}"
 }
 
 # Runs the parquet filter benchmark
@@ -472,7 +481,7 @@ run_clickbench_partitioned() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark..."
-    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
+    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
 }
 
 # Runs the clickbench "extended" benchmark with a single large parquet file

diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
@@ -27,6 +27,9 @@ use datafusion_common::exec_datafusion_err;
 use datafusion_common::instant::Instant;
 use structopt::StructOpt;
 
+// hack to avoid `default_value is meaningless for bool` errors
+type BoolDefaultTrue = bool;
+
 /// Run the clickbench benchmark
 ///
 /// The ClickBench[1] benchmarks are widely cited in the industry and
@@ -68,6 +71,11 @@ pub struct RunOpt {
     /// If present, write results json here
     #[structopt(parse(from_os_str), short = "o", long = "output")]
     output_path: Option<PathBuf>,
+
+    /// If true then round robin repartitioning is used, if false then on demand repartitioning
+    /// True by default.
+    #[structopt(short = "r", long = "prefer_round_robin", default_value = "true")]
+    prefer_round_robin: BoolDefaultTrue,
 }
 
 struct AllQueries {
@@ -124,6 +132,10 @@ impl RunOpt {
             parquet_options.binary_as_string = true;
         }
 
+        config
+            .options_mut()
+            .optimizer
+            .prefer_round_robin_repartition = self.prefer_round_robin;
         let rt_builder = self.common.runtime_env_builder()?;
         let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
         self.register_hits(&ctx).await?;