apache · yjshen · Mar 16, 2022 · Feb 23, 2022 · Mar 5, 2022 · Mar 6, 2022
diff --git a/benchmarks/db-benchmark/README.md b/benchmarks/db-benchmark/README.md
@@ -0,0 +1,29 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Run db-benchmark
+
+## Directions
+
+Run the following from root `arrow-datafusion` directory
+
+```bash
+$ docker buildx build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile .
+$ docker run --privileged db-benchmark
+```
diff --git a/benchmarks/db-benchmark/db-benchmark.dockerfile b/benchmarks/db-benchmark/db-benchmark.dockerfile
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+FROM ubuntu
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+RUN apt-get update && \
+    apt-get install -y git build-essential
+
+# Install R, curl, and python deps
+RUN apt-get -y install --no-install-recommends --no-install-suggests \
+    ca-certificates software-properties-common gnupg2 gnupg1 \
+    && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+    && add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' \
+    && apt-get -y install r-base \
+    && apt-get -y install curl \
+    && apt-get -y install python3.8 \
+    && apt-get -y install python3-pip
+
+# Install R libraries
+RUN R -e "install.packages('data.table',dependencies=TRUE, repos='http://cran.rstudio.com/')" \
+    && R -e "install.packages('dplyr',dependencies=TRUE, repos='http://cran.rstudio.com/')"
+
+# Install Rust
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Clone db-benchmark and download data
+RUN git clone https://github.com/h2oai/db-benchmark \
+    && cd db-benchmark \
+    && Rscript _data/groupby-datagen.R 1e7 1e2 0 0 \
+    && Rscript _data/join-datagen.R 1e7 0 0 0 \
+    && mkdir data \
+    && mv G1_1e7_1e2_0_0.csv data \
+    && mv J1_1e7_1e1_0_0.csv data \
+    && mv J1_1e7_1e4_0_0.csv data \
+    && mv J1_1e7_1e7_0_0.csv data \
+    && mv J1_1e7_NA_0_0.csv data \
+    && cd ..
+
+# Clone datafusion-python and build python library
+# Not sure if the wheel will be the same on all computers
+RUN git clone https://github.com/datafusion-contrib/datafusion-python \
+    && cd datafusion-python && git reset --hard 368b50ed9662d5e93c70b539f94cceace685265e \
+    && python3 -m pip install pip \
+    && python3 -m pip install pandas \
+    && python3 -m pip install -r requirements.txt \
+    && cd ..
+
+# Copy local arrow-datafusion
+COPY . arrow-datafusion
+
+# 1. datafusion-python that builds from datafusion version referenced datafusion-python
+RUN cd datafusion-python \
+    && maturin build --release \
+    && case "${TARGETPLATFORM}" in \
+    */amd64) CPUARCH=x86_64 ;; \
+    */arm64) CPUARCH=aarch64 ;; \
+    *) exit 1 ;; \
+    esac \
+    # Version will need to be updated in conjunction with datafusion-python version
+    && python3 -m pip install target/wheels/datafusion-0.4.0-cp36-abi3-linux_${CPUARCH}.whl \
+    && cd ..
+
+# 2. datafusion-python that builds from local datafusion.  use this when making local changes to datafusion.
+# Currently, as of March 5th 2022, this done not build (i think) because datafusion is being split into multiple crates
+# and datafusion-python has not yet been updated to reflect this.
+# RUN cd datafusion-python \
+# && sed -i '/datafusion =/c\datafusion = { path = "../arrow-datafusion/datafusion", features = ["pyarrow"] }' Cargo.toml \
+# && sed -i '/fuzz-utils/d' ../arrow-datafusion/datafusion/Cargo.toml \
+# && maturin build --release \
+# && case "${TARGETPLATFORM}" in \
+#     */amd64) CPUARCH=x86_64 ;; \
+#     */amd64) CPUARCH=aarch64 ;; \
+#     *) exit 1 ;; \
+# esac \
+# && python3 -m pip install target/wheels/datafusion-0.4.0-cp36-abi3-linux_${CPUARCH}.whl \
+# && cd ..
+
+# Make datafusion directory in db-benchmark
+RUN mkdir db-benchmark/datafusion \
+    && cp ../arrow-datafusion/benchmarks/db-benchmark/groupby-datafusion.py db-benchmark/datafusion \
+    && cp ../arrow-datafusion/benchmarks/db-benchmark/join-datafusion.py db-benchmark/datafusion \
+    && cp ../arrow-datafusion/benchmarks/db-benchmark/run-bench.sh db-benchmark/ \
+    && chmod +x db-benchmark/run-bench.sh
+
+WORKDIR /db-benchmark
+
+RUN ls && ls -al data/
+
+ENTRYPOINT ./run-bench.sh
diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py
@@ -0,0 +1,242 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#!/usr/bin/env python
+
+print("# groupby-datafusion.py", flush=True)
+
+import os
+import gc
+import timeit
+import datafusion as df
+from datafusion import functions as f
+from datafusion import col
+from pyarrow import csv as pacsv
+
+# exec(open("./_helpers/helpers.py").read())
+
+def ans_shape(batches):
+    rows, cols = 0, 0
+    for batch in batches:
+        rows += batch.num_rows
+        if cols == 0:
+            cols = batch.num_columns
+        else:
+            assert(cols == batch.num_columns)
+
+    return rows, cols
+
+# ver = df.__version__
+ver = "7.0.0"
+git = ""
+task = "groupby"
+solution = "datafusion"
+fun = ".groupby"
+cache = "TRUE"
+on_disk = "FALSE"
+
+data_name = os.environ["SRC_DATANAME"]
+src_grp = os.path.join("data", data_name + ".csv")
+print("loading dataset %s" % src_grp, flush=True)
+
+data = pacsv.read_csv(src_grp, convert_options=pacsv.ConvertOptions(auto_dict_encode=True))
+print("dataset loaded")
+
+ctx = df.ExecutionContext()
+ctx.register_record_batches("x", [data.to_batches()])
+print("registered record batches")
+# cols = ctx.sql("SHOW columns from x")
+# ans.show()
+
+in_rows = data.num_rows
+# print(in_rows, flush=True)
+
+task_init = timeit.default_timer()
+
+question = "sum v1 by id1" # q1
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q1: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v1"))]).collect()[0].column(0)[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "sum v1 by id1:id2" # q2
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id1, id2, SUM(v1) AS v1 FROM x GROUP BY id1, id2").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q2: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v1"))]).collect()[0].column(0)[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "sum v1 mean v3 by id3" # q3
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q3: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v3"))]).collect()[0].to_pandas().to_numpy()[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "mean v1:v3 by id4" # q4
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id4, AVG(v1) AS v1, AVG(v2) AS v2, AVG(v3) AS v3 FROM x GROUP BY id4").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q4: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2")), f.sum(col("v3"))]).collect()[0].to_pandas().to_numpy()[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "sum v1:v3 by id6" # q5
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id6, SUM(v1) AS v1, SUM(v2) AS v2, SUM(v3) AS v3 FROM x GROUP BY id6").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q5: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2")), f.sum(col("v3"))]).collect()[0].to_pandas().to_numpy()[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "median v3 sd v3 by id4 id5" # q6
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id4, id5, approx_percentile_cont(v3, .5) AS median_v3, stddev(v3) AS stddev_v3 FROM x GROUP BY id4, id5").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q6: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("median_v3")), f.sum(col("stddev_v3"))]).collect()[0].to_pandas().to_numpy()[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "max v1 - min v2 by id3" # q7
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id3, MAX(v1) - MIN(v2) AS range_v1_v2 FROM x GROUP BY id3").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q7: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("range_v1_v2"))]).collect()[0].column(0)[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "largest two v3 by id6" # q8
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id6, v3 from (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS row FROM x) t WHERE row <= 2").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q8: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v3"))]).collect()[0].column(0)[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "regression v1 v2 by id2 id4" # q9
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT corr(v1, v2) as corr FROM x GROUP BY id2, id4").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q9: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("corr"))]).collect()[0].column(0)[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+question = "sum v3 count by id1:id6" # q10
+gc.collect()
+t_start = timeit.default_timer()
+ans = ctx.sql("SELECT id1, id2, id3, id4, id5, id6, SUM(v3) as v3, COUNT(*) AS cnt FROM x GROUP BY id1, id2, id3, id4, id5, id6").collect()
+shape = ans_shape(ans)
+# print(shape, flush=True)
+t = timeit.default_timer() - t_start
+print(f"q10: {t}")
+# m = memory_usage()
+t_start = timeit.default_timer()
+df = ctx.create_dataframe([ans])
+chk = df.aggregate([], [f.sum(col("v3")), f.sum(col("cnt"))]).collect()[0].to_pandas().to_numpy()[0]
+chkt = timeit.default_timer() - t_start
+# write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=shape[0], out_cols=shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk([chk]), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+
+print("grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True)
+
+exit(0)