Skip to content

Commit

Permalink
Making the data_imdb and clickbench_1 functions atomic.
Browse files Browse the repository at this point in the history
  • Loading branch information
Spaarsh committed Dec 29, 2024
1 parent ab69bb0 commit f558cd9
Showing 1 changed file with 38 additions and 7 deletions.
45 changes: 38 additions & 7 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,10 @@ run_sort() {
#
# Creates data in $DATA_DIR/hits.parquet
data_clickbench_1() {
local output_file="${DATA_DIR}/hits.parquet"

trap 'cleanup_download "${output_file}"' INT TERM

pushd "${DATA_DIR}" > /dev/null

# Avoid downloading if it already exists and is the right size
Expand All @@ -401,9 +405,14 @@ data_clickbench_1() {
else
URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
echo -n "... downloading ${URL} (14GB) ... "
wget --continue ${URL}
fi
echo " Done"
if ! wget --continue ${URL}; then
cleanup_download "${output_file}"
fi

trap - INT TERM

echo " Done"
fi
popd > /dev/null
}

Expand Down Expand Up @@ -455,13 +464,27 @@ run_clickbench_extended() {
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
}

# Add cleanup function at the start of script
cleanup_download() {
local file_to_clean="$1"
echo -e "\nCleaning up downloaded files..."
rm -f "${file_to_clean}"
exit 1
}

# Add trap before download starts
trap cleanup_download INT TERM

# Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
# https://event.cwi.nl/da/job/imdb.tgz
data_imdb() {
local imdb_dir="${DATA_DIR}/imdb"
local imdb_temp_gz="${imdb_dir}/imdb.tgz"
local imdb_url="https://event.cwi.nl/da/job/imdb.tgz"

# Set trap with parameter
trap 'cleanup_download "${imdb_temp_gz}"' INT TERM

# imdb has 21 files, we just separate them into 3 groups for better readability
local first_required_files=(
"aka_name.parquet"
Expand Down Expand Up @@ -513,11 +536,19 @@ data_imdb() {
echo "Downloading IMDB dataset..."

# Download the dataset
curl -o "${imdb_temp_gz}" "${imdb_url}"
if ! curl -o "${imdb_temp_gz}" "${imdb_url}"; then
echo "Error downloading IMDB dataset. Deleting partial file."
cleanup_download "${imdb_temp_gz}"
fi

# Extract the dataset
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
if ! tar xf "${imdb_temp_gz}" -C "${imdb_dir}"; then
echo "Extraction failed"
cleanup_download "${imdb_temp_gz}"
fi

#Remove trap after successful operations
trap - INT TERM

else
echo "IMDB.tgz already exists."

Expand Down

0 comments on commit f558cd9

Please sign in to comment.