* configure_me_codegen retroactively reserved on our `bind_host` parame… (apache#520)
* configure_me_codegen retroactively reserved on our `bind_host` parameter name
* Add label and pray
* Add more labels why not
* Prepare 0.10.0 Release (apache#522)
* bump version
* CHANGELOG
* Ballista gets a docker image!!! (apache#521)
* Ballista gets a docker image!!!
* Enable flight sql
* Allow executing startup script
* Allow executing executables
* Clippy
* Remove capture group (apache#527)
* fix python build in CI (apache#528)
* fix python build in CI
* save progress
* use same min rust version in all crates
* fix
* use image from pyo3
* use newer image from pyo3
* do not require protoc
* wheels now generated
* rat - exclude generated file
* Update docs for simplified instructions (apache#532)
* Update docs for simplified instructions
* Fix whoopsie
* Update docs/source/user-guide/flightsql.md
Co-authored-by: Andy Grove <[email protected]>
Co-authored-by: Andy Grove <[email protected]>
* remove --locked (apache#533)
* Bump actions/labeler from 4.0.2 to 4.1.0 (apache#525)
* Provide a memory StateBackendClient (apache#523)
* Rename StateBackend::Standalone to StateBackend:Sled
* Copy utility files from sled crate since they cannot be used directly
* Provide a memory StateBackendClient
* Fix dashmap deadlock issue
* Fix for the comments
Co-authored-by: yangzhong <[email protected]>
* only build docker images on rc tags (apache#535)
* docs: fix style in the Helm readme (apache#551)
* Fix Helm chart's image format (apache#550)
* Update datafusion requirement from 14.0.0 to 15.0.0 (apache#552)
* Update datafusion requirement from 14.0.0 to 15.0.0
* Fix UT
* Fix python
* Fix python
* Fix Python
Co-authored-by: yangzhong <[email protected]>
* Make it concurrently to launch tasks to executors (apache#557)
* Make it concurrently to launch tasks to executors
* Refine for comments
Co-authored-by: yangzhong <[email protected]>
* fix(ui): fix last seen (apache#562)
* Support Alibaba Cloud OSS with ObjectStore (apache#567)
* Fix cargo clippy (apache#571)
Co-authored-by: yangzhong <[email protected]>
* Super minor spelling error (apache#573)
* Update env_logger requirement from 0.9 to 0.10 (apache#539)
Updates the requirements on [env_logger](https://github.com/rust-cli/env_logger) to permit the latest version.
- [Release notes](https://github.com/rust-cli/env_logger/releases)
- [Changelog](https://github.com/rust-cli/env_logger/blob/main/CHANGELOG.md)
- [Commits](rust-cli/env_logger@v0.9.0...v0.10.0)
---
updated-dependencies:
- dependency-name: env_logger
dependency-type: direct:production
...
Signed-off-by: dependabot[bot] <[email protected]>
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Update graphviz-rust requirement from 0.4.0 to 0.5.0 (apache#574)
Updates the requirements on [graphviz-rust](https://github.com/besok/graphviz-rust) to permit the latest version.
- [Release notes](https://github.com/besok/graphviz-rust/releases)
- [Changelog](https://github.com/besok/graphviz-rust/blob/master/CHANGELOG.md)
- [Commits](https://github.com/besok/graphviz-rust/commits)
---
updated-dependencies:
- dependency-name: graphviz-rust
dependency-type: direct:production
...
Signed-off-by: dependabot[bot] <[email protected]>
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* updated readme to contain correct versions of dependencies. (apache#580)
* Fix benchmark image link (apache#596)
* Add support for Azure (apache#599)
* Remove outdated script and use evergreen version of rust (apache#597)
* Remove outdated script and use evergreen version of rust
* Use debian protobuf
* feat: update script such that ballista-cli image is built as well (apache#601)
* Fix Cargo.toml format issue (apache#616)
* Refactor executor main (apache#614)
* Refactor executor main
* copy all configs
* toml fmt
* Refactor scheduler main (apache#615)
* refactor scheduler main
* toml fmt
* Python: add method to get explain output as a string (apache#593)
* Update contributor guide (apache#617)
* Cluster state refactor part 1 (apache#560)
* Customize session builder
* Add setter for executor slots policy
* Construct Executor with functions
* Add queued and completed timestamps to successful job status
* Add public methods to SchedulerServer
* Public method for getting execution graph
* Public method for stage metrics
* Use node-level local limit (#20)
* Use node-level local limit
* serialize limit in shuffle writer
* Revert "Merge pull request #19 from coralogix/sc-5792"
This reverts commit 08140ef, reversing
changes made to a7f1384.
* add log
* make sure we don't forget limit for shuffle writer
* update accum correctly and try to break early
* Check local limit accumulator before polling for more data
* fix build
Co-authored-by: Martins Purins <[email protected]>
* configure_me_codegen retroactively reserved on our `bind_host` parame… (apache#520)
* configure_me_codegen retroactively reserved on our `bind_host` parameter name
* Add label and pray
* Add more labels why not
* Add ClusterState trait
* Refactor slightly for clarity
* Revert "Use node-level local limit (#20)"
This reverts commit ff96bcd.
* Revert "Public method for stage metrics"
This reverts commit a802315.
* Revert "Public method for getting execution graph"
This reverts commit 490bda5.
* Revert "Add public methods to SchedulerServer"
This reverts commit 5ad27c0.
* Revert "Add queued and completed timestamps to successful job status"
This reverts commit c615fce.
* Revert "Construct Executor with functions"
This reverts commit 24d4830.
* Always forget the apache header
Co-authored-by: Martins Purins <[email protected]>
Co-authored-by: Brent Gardner <[email protected]>
* replace master with main (apache#621)
* implement new release process (apache#623)
* add docs on who can release (apache#632)
* Upgrade to DataFusion 16 (again) (apache#636)
* Update datafusion dependency to the latest version (apache#612)
* Update datafusion dependency to the latest version
* Fix python
* Skip ut of test_window_lead due to apache/datafusion-python#135
* Fix clippy
---------
Co-authored-by: yangzhong <[email protected]>
* Upgrade to DataFusion 17 (apache#639)
* Upgrade to DF 17
* Restore original error handling functionality
* Customize session builder
* Construct Executor with functions
* Add queued and completed timestamps to successful job status
* Add public methods to SchedulerServer
* Public method for getting execution graph
* Public method for stage metrics
* Use node-level local limit (#20)
* Use node-level local limit
* serialize limit in shuffle writer
* Revert "Merge pull request #19 from coralogix/sc-5792"
This reverts commit 08140ef, reversing
changes made to a7f1384.
* add log
* make sure we don't forget limit for shuffle writer
* update accum correctly and try to break early
* Check local limit accumulator before polling for more data
* fix build
Co-authored-by: Martins Purins <[email protected]>
* Add ClusterState trait
* Expose active job count
* Remove println
* Resubmit jobs when no resources available for scheduling
* Make parse_physical_expr public
* Reduce log spam
* Fix job submitted metric by ignoring resubmissions
* Record when job is queued in scheduler metrics (#28)
* Record when job is queueud in scheduler metrics
* add additional buckets for exec times
* Upstream rebase (#29)
* configure_me_codegen retroactively reserved on our `bind_host` parame… (apache#520)
* configure_me_codegen retroactively reserved on our `bind_host` parameter name
* Add label and pray
* Add more labels why not
* Prepare 0.10.0 Release (apache#522)
* bump version
* CHANGELOG
* Ballista gets a docker image!!! (apache#521)
* Ballista gets a docker image!!!
* Enable flight sql
* Allow executing startup script
* Allow executing executables
* Clippy
* Remove capture group (apache#527)
* fix python build in CI (apache#528)
* fix python build in CI
* save progress
* use same min rust version in all crates
* fix
* use image from pyo3
* use newer image from pyo3
* do not require protoc
* wheels now generated
* rat - exclude generated file
* Update docs for simplified instructions (apache#532)
* Update docs for simplified instructions
* Fix whoopsie
* Update docs/source/user-guide/flightsql.md
Co-authored-by: Andy Grove <[email protected]>
Co-authored-by: Andy Grove <[email protected]>
* remove --locked (apache#533)
* Bump actions/labeler from 4.0.2 to 4.1.0 (apache#525)
* Provide a memory StateBackendClient (apache#523)
* Rename StateBackend::Standalone to StateBackend:Sled
* Copy utility files from sled crate since they cannot be used directly
* Provide a memory StateBackendClient
* Fix dashmap deadlock issue
* Fix for the comments
Co-authored-by: yangzhong <[email protected]>
* only build docker images on rc tags (apache#535)
* docs: fix style in the Helm readme (apache#551)
* Fix Helm chart's image format (apache#550)
* Update datafusion requirement from 14.0.0 to 15.0.0 (apache#552)
* Update datafusion requirement from 14.0.0 to 15.0.0
* Fix UT
* Fix python
* Fix python
* Fix Python
Co-authored-by: yangzhong <[email protected]>
* Make it concurrently to launch tasks to executors (apache#557)
* Make it concurrently to launch tasks to executors
* Refine for comments
Co-authored-by: yangzhong <[email protected]>
* fix(ui): fix last seen (apache#562)
* Support Alibaba Cloud OSS with ObjectStore (apache#567)
* Fix cargo clippy (apache#571)
Co-authored-by: yangzhong <[email protected]>
* Super minor spelling error (apache#573)
* Update env_logger requirement from 0.9 to 0.10 (apache#539)
Updates the requirements on [env_logger](https://github.com/rust-cli/env_logger) to permit the latest version.
- [Release notes](https://github.com/rust-cli/env_logger/releases)
- [Changelog](https://github.com/rust-cli/env_logger/blob/main/CHANGELOG.md)
- [Commits](rust-cli/env_logger@v0.9.0...v0.10.0)
---
updated-dependencies:
- dependency-name: env_logger
dependency-type: direct:production
...
Signed-off-by: dependabot[bot] <[email protected]>
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Update graphviz-rust requirement from 0.4.0 to 0.5.0 (apache#574)
Updates the requirements on [graphviz-rust](https://github.com/besok/graphviz-rust) to permit the latest version.
- [Release notes](https://github.com/besok/graphviz-rust/releases)
- [Changelog](https://github.com/besok/graphviz-rust/blob/master/CHANGELOG.md)
- [Commits](https://github.com/besok/graphviz-rust/commits)
---
updated-dependencies:
- dependency-name: graphviz-rust
dependency-type: direct:production
...
Signed-off-by: dependabot[bot] <[email protected]>
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* updated readme to contain correct versions of dependencies. (apache#580)
* Fix benchmark image link (apache#596)
* Add support for Azure (apache#599)
* Remove outdated script and use evergreen version of rust (apache#597)
* Remove outdated script and use evergreen version of rust
* Use debian protobuf
* Customize session builder
* Add setter for executor slots policy
* Construct Executor with functions
* Add queued and completed timestamps to successful job status
* Add public methods to SchedulerServer
* Public method for getting execution graph
* Public method for stage metrics
* Use node-level local limit (#20)
* Use node-level local limit
* serialize limit in shuffle writer
* Revert "Merge pull request #19 from coralogix/sc-5792"
This reverts commit 08140ef, reversing
changes made to a7f1384.
* add log
* make sure we don't forget limit for shuffle writer
* update accum correctly and try to break early
* Check local limit accumulator before polling for more data
* fix build
Co-authored-by: Martins Purins <[email protected]>
* Add ClusterState trait
* Expose active job count
* Remove println
* Resubmit jobs when no resources available for scheduling
* Make parse_physical_expr public
* Reduce log spam
* Fix job submitted metric by ignoring resubmissions
* Record when job is queued in scheduler metrics (#28)
* Record when job is queueud in scheduler metrics
* add additional buckets for exec times
* fmt
* clippy
* tomlfmt
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Brent Gardner <[email protected]>
Co-authored-by: Andy Grove <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: yahoNanJing <[email protected]>
Co-authored-by: yangzhong <[email protected]>
Co-authored-by: Xin Hao <[email protected]>
Co-authored-by: Duyet Le <[email protected]>
Co-authored-by: r.4ntix <[email protected]>
Co-authored-by: Jeremy Dyer <[email protected]>
Co-authored-by: Sai Krishna Reddy Lakkam <[email protected]>
Co-authored-by: Aidan Kovacic <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
Co-authored-by: Martins Purins <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
* Post merge update
* update message formatting
* post merge update
* another post-merge updates
* update github actions
* clippy
* update script
* fmt
---------
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Brent Gardner <[email protected]>
Co-authored-by: Andy Grove <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: yahoNanJing <[email protected]>
Co-authored-by: yangzhong <[email protected]>
Co-authored-by: Xin Hao <[email protected]>
Co-authored-by: Duyet Le <[email protected]>
Co-authored-by: r.4ntix <[email protected]>
Co-authored-by: Jeremy Dyer <[email protected]>
Co-authored-by: Sai Krishna Reddy Lakkam <[email protected]>
Co-authored-by: Aidan Kovacic <[email protected]>
Co-authored-by: Tim Van Wassenhove <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
Co-authored-by: Martins Purins <[email protected]>
Co-authored-by: Brent Gardner <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
Co-authored-by: Dan Harris <[email protected]>
CONTRIBUTING.md (+35 −177)
@@ -25,22 +25,28 @@ We welcome and encourage contributions of all kinds, such as:
 2. Documentation improvements
 3. Code (PR or PR Review)

-In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.
+In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs.
+Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.

@@ ... @@
 and tries to follow [Testing Organization](https://doc.rust-lang.org/book/ch11-03-test-organization.html) in The Book.
-
-This section highlights the most important test modules that exist
-
-### Unit tests
-
-Tests for the code in an individual module are defined in the same source file with a `test` module, following Rust convention
-
-### Rust Integration Tests
-
-There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/tests) directory.
-
-You can run these tests individually using a command such as
-
-```shell
-cargo test -p datafusion --tests sql_integration
-```
-
-One very important test is the [sql_integration](https://github.com/apache/arrow-datafusion/blob/master/datafusion/tests/sql_integration.rs) test which validates DataFusion's ability to run a large assortment of SQL queries against an assortment of data sets.
-
-### SQL / Postgres Integration Tests
-
-The [integration-tests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/integration-tests) directory contains a harness that runs certain queries against both postgres and datafusion and compares results
+The scheduler and executor processes can be configured using toml files, environment variables and command-line
+arguments. The specification for config options can be found here:

-[Criterion](https://docs.rs/criterion/latest/criterion/index.html) is a statistics-driven micro-benchmarking framework used by DataFusion for evaluating the performance of specific code-paths. In particular, the criterion benchmarks help to both guide optimisation efforts, and prevent performance regressions within DataFusion.
-Criterion integrates with Cargo's built-in [benchmark support](https://doc.rust-lang.org/cargo/commands/cargo-bench.html) and a given benchmark can be run with
+Those files fully define Ballista's configuration. If there is a discrepancy between this documentation and the
+files, assume those files are correct.

-```
-cargo bench --bench BENCHMARK_NAME
-```
-
-A full list of benchmarks can be found [here](./datafusion/benches).
-
-_[cargo-criterion](https://github.com/bheisler/cargo-criterion) may also be used for more advanced reporting._
-
-#### Parquet SQL Benchmarks
-
-The parquet SQL benchmarks can be run with
-
-```
-cargo bench --bench parquet_query_sql
-```
-
-These randomly generate a parquet file, and then benchmark queries sourced from [parquet_query_sql.sql](./datafusion/core/benches/parquet_query_sql.sql) against it. This can therefore be a quick way to add coverage of particular query and/or data paths.
-
-If the environment variable `PARQUET_FILE` is set, the benchmark will run queries against this file instead of a randomly generated one. This can be useful for performing multiple runs, potentially with different code, against the same source data, or for testing against a custom dataset.
-
-The benchmark will automatically remove any generated parquet file on exit, however, if interrupted (e.g. by CTRL+C) it will not. This can be useful for analysing the particular file after the fact, or preserving it to use with `PARQUET_FILE` in subsequent runs.
+To get a list of command-line arguments, run the binary with `--help`

-### Upstream Benchmark Suites
+There is an example config file at [ballista/executor/examples/example_executor_config.toml](ballista/executor/examples/example_executor_config.toml)

-Instructions and tooling for running upstream benchmark suites against DataFusion and/or Ballista can be found in [benchmarks](./benchmarks).
+The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments.

-These are valuable for comparative evaluation against alternative Arrow implementations and query engines.
+The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml` To
+specify a config file use the `--config-file` argument.

-## How to add a new scalar function
+Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler
+respectively. Hyphens in command line arguments become underscores. For example, the `--scheduler-host` argument
+for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST`

-Below is a checklist of what you need to do to add a new scalar function to DataFusion:
+### Python Environment

-- Add the actual implementation of the function:
-  - [here](datafusion/physical-expr/src/string_expressions.rs) for string functions
-  - [here](datafusion/physical-expr/src/math_expressions.rs) for math functions
-  - [here](datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions
-  - create a new module [here](datafusion/physical-expr/src) for other functions
-- In [core/src/physical_plan](datafusion/core/src/physical_plan/functions.rs), add:
-  - a new variant to `BuiltinScalarFunction`
-  - a new entry to `FromStr` with the name of the function as called by SQL
-  - a new line in `return_type` with the expected return type of the function, given an incoming type
-  - a new line in `signature` with the signature of the function (number and types of its arguments)
-  - a new line in `create_physical_expr`/`create_physical_fun` mapping the built-in to the implementation
-  - tests to the function.
-- In [core/tests/sql](datafusion/core/tests/sql), add a new test where the function is called through SQL against well known data and returns the expected result.
-- In [core/src/logical_plan/expr](datafusion/core/src/logical_plan/expr.rs), add:
-  - a new entry of the `unary_scalar_expr!` macro for the new function.
-- In [core/src/logical_plan/mod](datafusion/core/src/logical_plan/mod.rs), add:
-  - a new entry in the `pub use expr::{}` set.
+Refer to the instructions in the Python Bindings [README](./python/README.md)

-## How to add a new aggregate function
+### Javascript Environment

-Below is a checklist of what you need to do to add a new aggregate function to DataFusion:
+Refer to the instructions in the Scheduler Web UI [README](./ballista/scheduler/ui/README.md)

-- Add the actual implementation of an `Accumulator` and `AggregateExpr`:
-  - [here](datafusion/src/physical_plan/string_expressions.rs) for string functions
-  - [here](datafusion/src/physical_plan/math_expressions.rs) for math functions
-  - [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions
-  - create a new module [here](datafusion/src/physical_plan) for other functions
-- In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add:
-  - a new variant to `BuiltinAggregateFunction`
-  - a new entry to `FromStr` with the name of the function as called by SQL
-  - a new line in `return_type` with the expected return type of the function, given an incoming type
-  - a new line in `signature` with the signature of the function (number and types of its arguments)
-  - a new line in `create_aggregate_expr` mapping the built-in to the implementation
-  - tests to the function.
-- In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result.
+## Integration Tests

-## How to display plans graphically
-
-The query plans represented by `LogicalPlan` nodes can be graphically
-rendered using [Graphviz](http://www.graphviz.org/).
-
-To do so, save the output of the `display_graphviz` function to a file:
-
-```rust
-// Create plan somehow...
-let mut output = File::create("/tmp/plan.dot")?;
-write!(output, "{}", plan.display_graphviz());
-```
-
-Then, use the `dot` command line tool to render it into a file that
-can be displayed. For example, the following command creates a
-`/tmp/plan.pdf` file:
+The integration tests can be executed by running the following command from the root of the repository.

 ```bash
-dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf
+./dev/integration-tests.sh
 ```

-## Specification
-
-We formalize DataFusion semantics and behaviors through specification
-documents. These specifications are useful to be used as references to help
-resolve ambiguities during development or code reviews.
-
-You are also welcome to propose changes to existing specifications or create
-new specifications as you see fit.
-
-Here is the list of current active specifications:
-
-- [Output field name semantic](https://arrow.apache.org/datafusion/specification/output-field-name-semantic.html)
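The diff above states a precedence order for configuration sources: default config file < environment variables < specified config file < command-line arguments. That last-writer-wins behavior can be sketched in shell; the option name and all values below are hypothetical, chosen only to illustrate the ordering:

```shell
# Hypothetical values for one option, listed from lowest to highest precedence.
port_from_default_file=50051
port_from_env=50052
port_from_config_file=""   # not set in this run
port_from_cli=50053

# Start from the lowest-precedence source, then let each higher-precedence
# source override it, but only when that source actually provides a value.
port="$port_from_default_file"
if [ -n "$port_from_env" ]; then port="$port_from_env"; fi
if [ -n "$port_from_config_file" ]; then port="$port_from_config_file"; fi
if [ -n "$port_from_cli" ]; then port="$port_from_cli"; fi
echo "$port"   # 50053
```

Note that the config file is skipped here because it is empty, so the command-line value wins over the environment variable, matching the stated order.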
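The diff also describes how CLI flags map to environment variables: a `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` prefix, with hyphens becoming underscores. A minimal shell sketch of that naming rule, using the `--scheduler-host` example from the text (the derivation itself is illustrative, not part of Ballista):

```shell
# Derive the environment-variable name for an executor CLI flag, per the
# documented rule: strip the leading "--", uppercase, replace hyphens with
# underscores, and add the BALLISTA_EXECUTOR prefix.
flag="--scheduler-host"
name="BALLISTA_EXECUTOR_$(printf '%s' "${flag#--}" | tr 'a-z-' 'A-Z_')"
echo "$name"   # BALLISTA_EXECUTOR_SCHEDULER_HOST
```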