diff --git a/.env b/.env index 0b1a988875f79..859672e7330cf 100644 --- a/.env +++ b/.env @@ -57,7 +57,7 @@ CLANG_TOOLS=14 CUDA=11.2.2 DASK=latest DOTNET=8.0 -GCC_VERSION="" +GCC= HDFS=3.2.1 JDK=11 KARTOTHEK=latest diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index d4c681d1601cb..0607c537d1b7f 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -54,7 +54,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.1.0 + uses: actions/setup-dotnet@v4.2.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python @@ -86,7 +86,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.1.0 + uses: actions/setup-dotnet@v4.2.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -113,7 +113,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.1.0 + uses: actions/setup-dotnet@v4.2.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 52eae68c4f498..d59da447612a6 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -117,7 +117,7 @@ jobs: with: ruby-version: ruby - name: Install .NET - uses: actions/setup-dotnet@3e891b0cb619bf60e2c25674b222b8940e2c1c25 # v4.1.0 + uses: actions/setup-dotnet@87b7050bc53ea08284295505d98d2aa94301e852 # v4.2.0 with: dotnet-version: '8.0.x' - name: Install Dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 83d19b7d247f9..2c840e95c8db6 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index c64822461f3e0..bc7db519b64f7 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -177,7 +177,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -237,7 +237,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -299,7 +299,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6bde1cb2964e0..27823cae5fa28 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -182,4 +182,5 @@ repos: ( ?^ci/scripts/c_glib_build\.sh$| ?^ci/scripts/c_glib_test\.sh$| + ?^c_glib/test/run-test\.sh$| ) diff --git a/c_glib/test/run-test.sh b/c_glib/test/run-test.sh index c7bc6edca5f0d..8b1868942073c 100755 --- a/c_glib/test/run-test.sh +++ b/c_glib/test/run-test.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -test_dir="$(cd $(dirname $0); pwd)" +test_dir="$(cd "$(dirname "$0")" && pwd)" build_dir="$(cd .; pwd)" modules=( @@ -47,7 +47,7 @@ if [ "${BUILD}" != "no" ]; then fi for module in "${modules[@]}"; do - MODULE_TYPELIB_DIR_VAR_NAME="$(echo ${module} | tr a-z- A-Z_)_TYPELIB_DIR" + MODULE_TYPELIB_DIR_VAR_NAME="$(echo "${module}" | tr a-z- A-Z_)_TYPELIB_DIR" module_typelib_dir=$(eval "echo \${${MODULE_TYPELIB_DIR_VAR_NAME}}") if [ -z "${module_typelib_dir}" ]; then module_typelib_dir="${build_dir}/${module}" @@ -74,4 +74,4 @@ case "${DEBUGGER}" in DEBUGGER_ARGS+=(--) ;; esac -${DEBUGGER} "${DEBUGGER_ARGS[@]}" "${RUBY}" ${test_dir}/run-test.rb "$@" +${DEBUGGER} "${DEBUGGER_ARGS[@]}" "${RUBY}" "${test_dir}"/run-test.rb "$@" diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 084117f38778a..b1237fc9958c1 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -34,7 +34,6 @@ IF "%ARROW_DEBUG_MEMORY_POOL%"=="" ( set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% - call activate arrow @rem The "main" C++ build script for Windows CI @@ -113,12 +112,12 @@ ctest --output-on-failure || exit /B popd +pushd python + @rem @rem Build and install pyarrow @rem -pushd python - set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% set PYARROW_PARALLEL=2 @@ -137,6 +136,12 @@ set ARROW_HOME=%CONDA_PREFIX%\Library @rem ARROW-3075; pkgconfig is broken for Parquet for now set PARQUET_HOME=%CONDA_PREFIX%\Library +pip install --no-deps --no-build-isolation -vv --editable . + +@rem +@rem Run pyarrow tests +@rem + @rem Download IANA Timezone Database to a non-standard location to @rem test the configurability of the timezone database path curl https://data.iana.org/time-zones/releases/tzdata2024b.tar.gz --output tzdata.tar.gz || exit /B @@ -150,12 +155,9 @@ rmdir /s /q %USERPROFILE%\Downloads\tzdata @rem (only needed for testing purposes) set PYARROW_TZDATA_PATH=%USERPROFILE%\Downloads\test\tzdata -python setup.py develop -q || exit /B - +set AWS_EC2_METADATA_DISABLED=true set PYTHONDEVMODE=1 -py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B +python -m pytest -r sxX --durations=15 pyarrow/tests || exit /B -@rem -@rem Wheels are built and tested separately (see ARROW-5142). 
-@rem +popd diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index f9463e5074225..912b130acff45 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -17,7 +17,13 @@ @echo on -set "PATH=C:\Miniconda38-x64;C:\Miniconda38-x64\Scripts;C:\Miniconda38-x64\Library\bin;%PATH%" +@rem +@rem The miniconda install on AppVeyor is very outdated, use Mambaforge instead +@rem + +appveyor DownloadFile https://github.com/conda-forge/miniforge/releases/download/24.9.2-0/Mambaforge-Windows-x86_64.exe || exit /B +start /wait "" Mambaforge-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /D=C:\Mambaforge +set "PATH=C:\Mambaforge\scripts;C:\Mambaforge\condabin;%PATH%" @rem @rem Avoid picking up AppVeyor-installed OpenSSL (linker errors with gRPC) @@ -33,26 +39,15 @@ rd /s /q C:\OpenSSL-v30-Win32 rd /s /q C:\OpenSSL-v30-Win64 @rem -@rem Configure miniconda +@rem Configure conda @rem conda config --set auto_update_conda false -conda config --set show_channel_urls True +conda config --set show_channel_urls true +conda config --set always_yes true @rem Help with SSL timeouts to S3 conda config --set remote_connect_timeout_secs 12 -@rem Workaround for ARROW-13636 -conda config --append disallowed_packages pypy3 -conda info -a - -@rem -@rem Install Python to the base environment -@rem -conda install -q -y -c conda-forge python=%PYTHON% || exit /B -@rem Can't use conda-libmamba-solver 2.0.0 -conda config --set solver classic - -@rem Update for newer CA certificates -conda update -q -y -c conda-forge --all || exit /B +conda info -a || exit /B @rem @rem Create conda environment @@ -66,11 +61,8 @@ if "%ARROW_BUILD_GANDIVA%" == "ON" ( ) @rem Install pre-built "toolchain" packages for faster builds set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_cpp.txt -@rem Force conda to use conda-forge -conda config --add channels conda-forge -conda config --remove channels defaults @rem Arrow conda environment -conda create -n arrow -y -c conda-forge ^ +conda create -n arrow ^ --file=ci\conda_env_python.txt ^ %CONDA_PACKAGES% ^ "ccache" ^ @@ -97,7 +89,6 @@ if "%ARROW_S3%" == "ON" ( appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z -FileName C:\Windows\Minio.exe || exit /B ) - @rem @rem Download IANA Timezone Database for unit tests @rem diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index f0084894e19dc..6d4be52baec05 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -48,7 +48,7 @@ ENV PIPX_BASE_PYTHON=/opt/conda/bin/python3 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts RUN /arrow/ci/scripts/install_gcs_testbench.sh default -# Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to +# Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to # be on the path for the tests to run. 
ENV PATH=/opt/conda/envs/arrow/bin:$PATH @@ -68,6 +68,7 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=$CONDA_PREFIX \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/conda-python-emscripten.dockerfile b/ci/docker/conda-python-emscripten.dockerfile index 8ad705c920ba8..47ff550cd59ca 100644 --- a/ci/docker/conda-python-emscripten.dockerfile +++ b/ci/docker/conda-python-emscripten.dockerfile @@ -27,14 +27,14 @@ ARG required_python_min="(3,12)" # fail if python version < 3.12 RUN echo "check PYTHON>=${required_python_min}" && python -c "import sys;sys.exit(0 if sys.version_info>=${required_python_min} else 1)" -# install selenium and pyodide-build and recent python +# install selenium and recent pyodide-build and recent python # needs to be a login shell so ~/.profile is read SHELL ["/bin/bash", "--login", "-c", "-o", "pipefail"] RUN python -m pip install --no-cache-dir selenium==${selenium_version} && \ - python -m pip install --no-cache-dir --upgrade pyodide-build==${pyodide_version} - + python -m pip install --no-cache-dir --upgrade pyodide-build>=${pyodide_version} + # install pyodide dist directory to /pyodide RUN pyodide_dist_url="https://github.com/pyodide/pyodide/releases/download/${pyodide_version}/pyodide-${pyodide_version}.tar.bz2" && \ wget -q "${pyodide_dist_url}" -O- | tar -xj -C / diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 354e7829cc41f..f486d07ff8894 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -124,6 +124,7 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/debian-experimental-cpp.dockerfile b/ci/docker/debian-experimental-cpp.dockerfile new file mode 100644 index 0000000000000..2721b1d5f2058 --- /dev/null +++ b/ci/docker/debian-experimental-cpp.dockerfile @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG arch=amd64 +FROM ${arch}/debian:experimental +ARG arch + +ENV DEBIAN_FRONTEND noninteractive + +ARG gcc +ARG llvm +RUN if [ -n "${gcc}" ]; then \ + gcc_package_suffix="-${gcc}"; \ + else \ + gcc_package_suffix=""; \ + fi && \ + if [ -n "${llvm}" ]; then \ + llvm_package_suffix="-${llvm}"; \ + else \ + llvm_package_suffix=""; \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + autoconf \ + ccache \ + cmake \ + curl \ + g++ \ + gcc \ + gdb \ + git \ + libbenchmark-dev \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgflags-dev \ + libgmock-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + libidn2-dev \ + libkrb5-dev \ + libldap-dev \ + liblz4-dev \ + libnghttp2-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libpsl-dev \ + libre2-dev \ + librtmp-dev \ + libsnappy-dev \ + libsqlite3-dev \ + libssh-dev \ + libssh2-1-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libxml2-dev \ + libxsimd-dev \ + libzstd-dev \ + make \ + ninja-build \ + nlohmann-json3-dev \ + npm \ + opentelemetry-cpp-dev \ + pkg-config \ + protobuf-compiler-grpc \ + python3-dev \ + python3-pip \ + python3-venv \ + rapidjson-dev \ + rsync \ + tzdata \ + zlib1g-dev && \ + apt-get install -y -q --no-install-recommends -t experimental \ + clang${llvm_package_suffix} \ + g++${gcc_package_suffix} \ + gcc${gcc_package_suffix} \ + llvm${llvm_package_suffix}-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_azurite.sh + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +# Prioritize system packages and local installation. 
+ENV ARROW_ACERO=ON \ + ARROW_AZURE=ON \ + ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=ON \ + ARROW_FLIGHT_SQL=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HOME=/usr/local \ + ARROW_JEMALLOC=ON \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ + CC=gcc${gcc:+-${gcc}} \ + CXX=g++${gcc:+-${gcc}} \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ + ORC_SOURCE=BUNDLED \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index 52e879aba4ebf..6c5edd444e253 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -87,6 +87,7 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 4be5adf246b88..48c7154ef0eb0 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -65,10 +65,10 @@ RUN apt-get update -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -ARG gcc_version="" -RUN if [ "${gcc_version}" != "" ]; then \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc_version} 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc_version} 100 && \ +ARG gcc="" +RUN if [ "${gcc}" != "" ]; then \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc} 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc} 100 && \ update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 && \ update-alternatives --set cc /usr/bin/gcc && \ update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 && \ diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index c6fa3cc0dce97..0b5645285b6e1 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -107,6 +107,7 @@ RUN --mount=type=secret,id=github_repository_owner \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ + --x-feature=orc \ --x-feature=parquet \ --x-feature=s3 && \ rm -rf ~/.config/NuGet/ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index ec8c9840cf0a7..8dc778d544a6d 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -161,6 +161,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 78a44b0119e6c..28cef2946385c 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -137,32 +137,27 @@ RUN cd ~ && git clone https://github.com/emscripten-core/emsdk.git && \ echo "Installed emsdk to:" ~/emsdk -ARG gcc_version="" -RUN if [ "${gcc_version}" = "" ]; then \ +ARG gcc="" +RUN if [ "${gcc}" = "" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "12" ]; then \ - apt-get update -y -q && \ - 
apt-get install -y -q --no-install-recommends software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ - fi; \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ - g++-${gcc_version} \ - gcc-${gcc_version} && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc_version} 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc_version} 100 && \ + g++-${gcc} \ + gcc-${gcc} && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc} 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc} 100 && \ update-alternatives --install \ /usr/bin/$(uname --machine)-linux-gnu-gcc \ $(uname --machine)-linux-gnu-gcc \ - /usr/bin/$(uname --machine)-linux-gnu-gcc-${gcc_version} 100 && \ + /usr/bin/$(uname --machine)-linux-gnu-gcc-${gcc} 100 && \ update-alternatives --install \ /usr/bin/$(uname --machine)-linux-gnu-g++ \ $(uname --machine)-linux-gnu-g++ \ - /usr/bin/$(uname --machine)-linux-gnu-g++-${gcc_version} 100 && \ + /usr/bin/$(uname --machine)-linux-gnu-g++-${gcc} 100 && \ update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ update-alternatives --set cc /usr/bin/gcc && \ update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 && \ @@ -205,6 +200,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 8cb7f9d5f614e..3f486b09f95ff 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -128,32 +128,27 @@ RUN apt-get update -y -q && \ apt-get clean && \ rm -rf /var/lib/apt/lists* -ARG gcc_version="" -RUN if [ "${gcc_version}" = "" ]; then \ +ARG gcc="" +RUN if [ "${gcc}" = "" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "14" ]; then \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ - fi; \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ - g++-${gcc_version} \ - gcc-${gcc_version} && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc_version} 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc_version} 100 && \ + g++-${gcc} \ + gcc-${gcc} && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc} 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc} 100 && \ update-alternatives --install \ /usr/bin/$(uname --machine)-linux-gnu-gcc \ $(uname --machine)-linux-gnu-gcc \ - /usr/bin/$(uname --machine)-linux-gnu-gcc-${gcc_version} 100 && \ + /usr/bin/$(uname --machine)-linux-gnu-gcc-${gcc} 100 && \ update-alternatives --install \ /usr/bin/$(uname --machine)-linux-gnu-g++ \ $(uname --machine)-linux-gnu-g++ \ - /usr/bin/$(uname --machine)-linux-gnu-g++-${gcc_version} 100 && \ + /usr/bin/$(uname --machine)-linux-gnu-g++-${gcc} 100 && \ update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ update-alternatives --set cc /usr/bin/gcc && \ update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 && \ @@ -190,6 +185,7 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_JEMALLOC=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git 
a/ci/docker/ubuntu-24.04-verify-rc.dockerfile b/ci/docker/ubuntu-24.04-verify-rc.dockerfile new file mode 100644 index 0000000000000..42d71afcb0999 --- /dev/null +++ b/ci/docker/ubuntu-24.04-verify-rc.dockerfile @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG arch=amd64 +FROM ${arch}/ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive +COPY dev/release/setup-ubuntu.sh / +RUN /setup-ubuntu.sh && \ + rm /setup-ubuntu.sh && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e70f5da85ae2e..c1e7adf6a05e0 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -171,10 +171,10 @@ else -DARROW_GCS=${ARROW_GCS:-OFF} \ -DARROW_HDFS=${ARROW_HDFS:-ON} \ -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC:-OFF} \ -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ - -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_ORC=${ARROW_ORC:-OFF} \ -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 91925e7abe8b0..1eaecd6bea07d 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -144,7 +144,6 @@ cmake \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 6365fcfacfc38..b9f4406a2d452 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -125,7 +125,6 @@ cmake \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 97cbb74d1ffda..103e0f08445d9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,7 +92,7 @@ endif() string(TOLOWER ${CMAKE_BUILD_TYPE} LOWERCASE_BUILD_TYPE) string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") +list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") # this must be included before the 
project() command, because of the way # vcpkg (ab)uses CMAKE_TOOLCHAIN_FILE to inject its logic into CMake diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index fd26dc7dd9c79..fdb28b540e2d2 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -163,21 +163,6 @@ if(WIN32) # insecure, like std::getenv add_definitions(-D_CRT_SECURE_NO_WARNINGS) - # Disable static assertion in Microsoft C++ standard library. - # - # """[...]\include\type_traits(1271): error C2338: - # You've instantiated std::aligned_storage with an extended - # alignment (in other words, Align > alignof(max_align_t)). - # Before VS 2017 15.8, the member type would non-conformingly have an - # alignment of only alignof(max_align_t). VS 2017 15.8 was fixed to handle - # this correctly, but the fix inherently changes layout and breaks binary - # compatibility (*only* for uses of aligned_storage with extended alignments). - # Please define either (1) _ENABLE_EXTENDED_ALIGNED_STORAGE to acknowledge - # that you understand this message and that you actually want a type with - # an extended alignment, or (2) _DISABLE_EXTENDED_ALIGNED_STORAGE to silence - # this message and get the old non-conformant behavior.""" - add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) - if(MSVC) # ARROW-1931 See https://github.com/google/googletest/issues/1318 # diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index f0df5a59948b7..1fc3654ec8d07 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1777,13 +1777,28 @@ macro(build_thrift) set(THRIFT_DEPENDENCIES ${THRIFT_DEPENDENCIES} boost_ep) endif() + set(THRIFT_PATCH_COMMAND) + if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.0) + # Thrift 0.21.0 doesn't support GCC 15. + # https://github.com/apache/arrow/issues/45096 + # https://github.com/apache/thrift/pull/3078 + find_program(PATCH patch REQUIRED) + list(APPEND + THRIFT_PATCH_COMMAND + ${PATCH} + -p1 + -i + ${CMAKE_CURRENT_LIST_DIR}/thrift-cstdint.patch) + endif() + externalproject_add(thrift_ep ${EP_COMMON_OPTIONS} URL ${THRIFT_SOURCE_URL} URL_HASH "SHA256=${ARROW_THRIFT_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS "${THRIFT_LIB}" CMAKE_ARGS ${THRIFT_CMAKE_ARGS} - DEPENDS ${THRIFT_DEPENDENCIES}) + DEPENDS ${THRIFT_DEPENDENCIES} + PATCH_COMMAND ${THRIFT_PATCH_COMMAND}) add_library(thrift::thrift STATIC IMPORTED) # The include directory must exist before it is referenced by a target. diff --git a/cpp/cmake_modules/thrift-cstdint.patch b/cpp/cmake_modules/thrift-cstdint.patch new file mode 100644 index 0000000000000..b670ba695e217 --- /dev/null +++ b/cpp/cmake_modules/thrift-cstdint.patch @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# https://github.com/apache/thrift/pull/3078 + +From 1920f04398ca32e320f6cf942534ba9d8b3231fd Mon Sep 17 00:00:00 2001 +From: Sutou Kouhei +Date: Mon, 23 Dec 2024 12:33:22 +0900 +Subject: [PATCH] THRIFT-5842: Add missing cstdint include for int64_t in + Mutex.h + +Client: cpp + +GCC 15 (not released yet) requires `#include ` for `int64_t` +but `lib/cpp/src/thrift/concurrency/Mutex.h` doesn't have it. So we +can't build Thrift with GCC 15: + + [80/359] Building CXX object lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o + FAILED: lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o + /bin/g++-15 -DBOOST_ALL_DYN_LINK -DBOOST_TEST_DYN_LINK -DTHRIFT_STATIC_DEFINE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/kou/work/cpp/thrift.kou.build/lib/cpp -I/home/kou/work/cpp/thrift.kou/lib/cpp -I/home/kou/work/cpp/thrift.kou.build -I/home/kou/work/cpp/thrift.kou/lib/cpp/src -g -std=c++11 -MD -MT lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o -MF lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o.d -o lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o -c /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.cpp + In file included from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TServerSocket.h:25, + from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.h:23, + from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.cpp:21: + /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:47:26: error: 'int64_t' has not been declared + 47 | virtual bool timedlock(int64_t milliseconds) const; + | ^~~~~~~ + /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:25:1: note: 'int64_t' is defined in header ''; this is probably fixable by adding '#include ' + 24 | #include + +++ |+#include + 25 | + /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:60:29: error: 'int64_t' has not been declared + 60 | Guard(const Mutex& value, int64_t timeout = 0) : mutex_(&value) { + | ^~~~~~~ + /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:60:29: note: 'int64_t' is defined in header ''; this is probably fixable by adding '#include ' + +See also: https://github.com/apache/arrow/issues/45096 +--- + lib/cpp/src/thrift/concurrency/Mutex.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/cpp/src/thrift/concurrency/Mutex.h b/lib/cpp/src/thrift/concurrency/Mutex.h +index 1e5c3fba3..12f1729d6 100644 +--- a/lib/cpp/src/thrift/concurrency/Mutex.h ++++ b/lib/cpp/src/thrift/concurrency/Mutex.h +@@ -20,6 +20,7 @@ + #ifndef _THRIFT_CONCURRENCY_MUTEX_H_ + #define _THRIFT_CONCURRENCY_MUTEX_H_ 1 + ++#include + #include + #include + +-- +2.45.2 diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index a07c10fda5af8..7a2fe6f070a56 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -26,7 +26,7 @@ arrow::Status ReadFullFile(std::string path_to_file) { // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" + // #include "parquet/arrow/reader.h" arrow::MemoryPool* pool = arrow::default_memory_pool(); std::shared_ptr input; @@ -44,7 +44,7 @@ arrow::Status ReadFullFile(std::string path_to_file) { arrow::Status ReadInBatches(std::string path_to_file) 
{ // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" + // #include "parquet/arrow/reader.h" arrow::MemoryPool* pool = arrow::default_memory_pool(); diff --git a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in index 124cbcbf3d42e..66aa2b4078c7f 100644 --- a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in +++ b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in @@ -28,7 +28,6 @@ include(CMakeFindDependencyMacro) find_dependency(Arrow) -find_dependency(Parquet) include("${CMAKE_CURRENT_LIST_DIR}/ArrowAceroTargets.cmake") diff --git a/cpp/src/arrow/acero/doc/key_map.md b/cpp/src/arrow/acero/doc/key_map.md index fdedc88c4d43f..a676343bbeb26 100644 --- a/cpp/src/arrow/acero/doc/key_map.md +++ b/cpp/src/arrow/acero/doc/key_map.md @@ -65,7 +65,7 @@ Every slot can either be **empty** or store data related to a single inserted ke Status byte, as the name suggests, stores 8 bits. The highest bit indicates if the slot is empty (the highest bit is set) or corresponds to one of inserted keys (the highest bit is zero). The remaining 7 bits contain 7 bits of key hash that we call a **stamp**. The stamp is used to eliminate some false positives when searching for a matching key for a given input. Slot also stores **key id**, which is a non-negative integer smaller than the number of inserted keys, that is used as a reference to the actual inserted key. The last piece of information related to an inserted key is its **hash** value. We store hashes for all keys, so that they never need to be re-computed. That greatly simplifies some operations, like resizing of a hash table, that may not even need to look at the keys at all. For an empty slot, the status byte is 0x80, key id is zero and the hash is not used and can be set to any number. -A single block contains 8 slots and can be viewed as a micro-stack of up to 8 inserted keys. When the first key is inserted into an empty block, it will occupy a slot with local id 0. The second inserted key will go into slot number 1 and so on. We use N highest bits of hash to get an index of a **start block**, when searching for a match or an empty slot to insert a previously not seen key when that is the case. If the start block contains any empty slots, then the search for either a match or place to insert a key will end at that block. We will call such a block an **open block**. A block that is not open is a full block. In the case of full block, the input key related search may continue in the next block module the number of blocks. If the key is not inserted into its start block, we will refer to it as an **overflow** entry, other entries being **non-overflow**. Overflow entries are slower to process, since they require visiting more than one block, so we want to keep their percentage low. This is done by choosing the right **load factor** (percentage of occupied slots in the hash table) at which the hash table gets resized and the number of blocks gets doubled. By tuning this value we can control the probability of encountering an overflow entry. +A single block contains 8 slots and can be viewed as a micro-stack of up to 8 inserted keys. When the first key is inserted into an empty block, it will occupy a slot with local id 0. The second inserted key will go into slot number 1 and so on. We use N highest bits of hash to get an index of a **start block**, when searching for a match or an empty slot to insert a previously not seen key when that is the case. 
If the start block contains any empty slots, then the search for either a match or place to insert a key will end at that block. We will call such a block an **open block**. A block that is not open is a full block. In the case of full block, the input key related search may continue in the next block modulo the number of blocks. If the key is not inserted into its start block, we will refer to it as an **overflow** entry, other entries being **non-overflow**. Overflow entries are slower to process, since they require visiting more than one block, so we want to keep their percentage low. This is done by choosing the right **load factor** (percentage of occupied slots in the hash table) at which the hash table gets resized and the number of blocks gets doubled. By tuning this value we can control the probability of encountering an overflow entry. The most interesting part of each block is the set of status bytes of its slots, which is simply a single 64-bit word. The implementation of efficient searches across these bytes during lookups require using either leading zero count or trailing zero count intrinsic. Since there are cases when only the first one is available, in order to take advantage of it, we order the bytes in the 64-bit status word so that the first slot within a block uses the highest byte and the last one uses the lowest byte (slots are in reversed bytes order). The diagram below shows how the information about slots is stored within a 64-bit status word: diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index e3e37e249e6a3..0a56194f2a3c8 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -20,6 +20,7 @@ #include "arrow/acero/hash_join.h" #include "arrow/acero/hash_join_node.h" #include "arrow/acero/options.h" +#include "arrow/acero/swiss_join_internal.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" @@ -365,6 +366,21 @@ static void BM_HashJoinBasic_ComplexResidualFilter(benchmark::State& st, HashJoinBasicBenchmarkImpl(st, settings); } + +static void BM_HashJoinBasic_HeavyBuildPayload(benchmark::State& st) { + BenchmarkSettings settings; + settings.build_payload_types = {boolean(), fixed_size_binary(64), utf8(), + boolean(), fixed_size_binary(64), utf8()}; + settings.probe_payload_types = {int32()}; + settings.null_percentage = 0.5; + settings.cardinality = 1.0 / 16.0; + settings.num_build_batches = static_cast(st.range(0)); + settings.num_probe_batches = settings.num_build_batches; + settings.var_length_min = 64; + settings.var_length_max = 128; + + HashJoinBasicBenchmarkImpl(st, settings); +} #endif std::vector hashtable_krows = benchmark::CreateRange(1, 4096, 8); @@ -622,6 +638,10 @@ BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Full Outer", JoinType::FULL_OUTER) ->ArgNames(complex_residual_filter_argnames) ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK(BM_HashJoinBasic_HeavyBuildPayload) + ->ArgNames({"HashTable krows"}) + ->ArgsProduct({benchmark::CreateRange(1, 512, 8)}); #else BENCHMARK_CAPTURE(BM_HashJoinBasic_KeyTypes, "{int32}", {int32()}) @@ -640,5 +660,106 @@ BENCHMARK(BM_HashJoinBasic_ProbeParallelism) #endif // ARROW_BUILD_DETAILED_BENCHMARKS +void RowArrayDecodeBenchmark(benchmark::State& st, const std::shared_ptr& schema, + int column_to_decode) { + auto batches = MakeRandomBatches(schema, 1, std::numeric_limits::max()); + const auto& batch = batches.batches[0]; + RowArray rows; + 
std::vector row_ids_encode(batch.length); + std::iota(row_ids_encode.begin(), row_ids_encode.end(), 0); + std::vector temp_column_arrays; + DCHECK_OK(rows.AppendBatchSelection( + default_memory_pool(), internal::CpuInfo::GetInstance()->hardware_flags(), batch, 0, + static_cast(batch.length), static_cast(batch.length), + row_ids_encode.data(), temp_column_arrays)); + std::vector row_ids_decode(batch.length); + // Create a random access pattern to simulate hash join. + std::default_random_engine gen(42); + std::uniform_int_distribution dist(0, + static_cast(batch.length - 1)); + std::transform(row_ids_decode.begin(), row_ids_decode.end(), row_ids_decode.begin(), + [&](uint32_t) { return dist(gen); }); + + for (auto _ : st) { + ResizableArrayData column; + // Allocate at least 8 rows for the convenience of SIMD decoding. + int log_num_rows_min = std::max(3, bit_util::Log2(batch.length)); + DCHECK_OK(column.Init(batch[column_to_decode].type(), default_memory_pool(), + log_num_rows_min)); + DCHECK_OK(rows.DecodeSelected(&column, column_to_decode, + static_cast(batch.length), row_ids_decode.data(), + default_memory_pool())); + } + st.SetItemsProcessed(st.iterations() * batch.length); +} + +static void BM_RowArray_Decode(benchmark::State& st, + const std::shared_ptr& type) { + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", type))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK_CAPTURE(BM_RowArray_Decode, "boolean", boolean()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int8", int8()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int16", int16()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int32", int32()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int64", int64()); + +static void BM_RowArray_DecodeFixedSizeBinary(benchmark::State& st) { + int fixed_size = static_cast(st.range(0)); + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", fixed_size_binary(fixed_size)))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK(BM_RowArray_DecodeFixedSizeBinary) + ->ArgNames({"fixed_size"}) + ->ArgsProduct({{3, 5, 6, 7, 9, 16, 42}}); + +static void BM_RowArray_DecodeBinary(benchmark::State& st) { + int max_length = static_cast(st.range(0)); + std::unordered_map metadata; + metadata["max_length"] = std::to_string(max_length); + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", utf8(), key_value_metadata(metadata)))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK(BM_RowArray_DecodeBinary) + ->ArgNames({"max_length"}) + ->ArgsProduct({{32, 64, 128}}); + +static void BM_RowArray_DecodeOneOfColumns(benchmark::State& st, + std::vector> types) { + SchemaBuilder schema_builder; + for (const auto& type : types) { + DCHECK_OK(schema_builder.AddField(field("", type))); + } + auto schema = *schema_builder.Finish(); + int column_to_decode = static_cast(st.range(0)); + RowArrayDecodeBenchmark(st, schema, column_to_decode); +} + +const std::vector> fixed_length_row_column_types{ + boolean(), int32(), fixed_size_binary(64)}; +BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns, + "fixed_length_row:{boolean,int32,fixed_size_binary(64)}", + fixed_length_row_column_types) + ->ArgNames({"column"}) + ->ArgsProduct( + {benchmark::CreateDenseRange(0, fixed_length_row_column_types.size() - 1, 1)}); + +const std::vector> var_length_row_column_types{ + boolean(), int32(), utf8(), utf8()}; 
+BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns, + "var_length_row:{boolean,int32,utf8,utf8}", var_length_row_column_types) + ->ArgNames({"column"}) + ->ArgsProduct({benchmark::CreateDenseRange(0, var_length_row_column_types.size() - 1, + 1)}); + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6c783110af571..c88279fd54b09 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -57,150 +57,12 @@ int RowArrayAccessor::VarbinaryColumnId(const RowTableMetadata& row_metadata, return varbinary_column_id; } -int RowArrayAccessor::NumRowsToSkip(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, int num_tail_bytes_to_skip) { - uint32_t num_bytes_skipped = 0; - int num_rows_left = num_rows; - - bool is_fixed_length_column = - rows.metadata().column_metadatas[column_id].is_fixed_length; - - if (!is_fixed_length_column) { - // Varying length column - // - int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - - while (num_rows_left > 0 && - num_bytes_skipped < static_cast(num_tail_bytes_to_skip)) { - // Find the pointer to the last requested row - // - uint32_t last_row_id = row_ids[num_rows_left - 1]; - const uint8_t* row_ptr = rows.data(2) + rows.offsets()[last_row_id]; - - // Find the length of the requested varying length field in that row - // - uint32_t field_offset_within_row, field_length; - if (varbinary_column_id == 0) { - rows.metadata().first_varbinary_offset_and_length( - row_ptr, &field_offset_within_row, &field_length); - } else { - rows.metadata().nth_varbinary_offset_and_length( - row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); - } - - num_bytes_skipped += field_length; - --num_rows_left; - } - } else { - // Fixed length column - // - uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; - uint32_t num_bytes_skipped = 0; - while (num_rows_left > 0 && - num_bytes_skipped < static_cast(num_tail_bytes_to_skip)) { - num_bytes_skipped += field_length; - --num_rows_left; - } - } - - return num_rows - num_rows_left; -} - -template -void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { - bool is_fixed_length_column = - rows.metadata().column_metadatas[column_id].is_fixed_length; - - // There are 4 cases, each requiring different steps: - // 1. Varying length column that is the first varying length column in a row - // 2. Varying length column that is not the first varying length column in a - // row - // 3. Fixed length column in a fixed length row - // 4. 
Fixed length column in a varying length row - - if (!is_fixed_length_column) { - int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); - const RowTableImpl::offset_type* row_offsets = rows.offsets(); - uint32_t field_offset_within_row, field_length; - - if (varbinary_column_id == 0) { - // Case 1: This is the first varbinary column - // - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - rows.metadata().first_varbinary_offset_and_length( - row_ptr, &field_offset_within_row, &field_length); - process_value_fn(i, row_ptr + field_offset_within_row, field_length); - } - } else { - // Case 2: This is second or later varbinary column - // - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - rows.metadata().nth_varbinary_offset_and_length( - row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); - process_value_fn(i, row_ptr + field_offset_within_row, field_length); - } - } - } - - if (is_fixed_length_column) { - uint32_t field_offset_within_row = rows.metadata().encoded_field_offset( - rows.metadata().pos_after_encoding(column_id)); - uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; - // Bit column is encoded as a single byte - // - if (field_length == 0) { - field_length = 1; - } - uint32_t row_length = rows.metadata().fixed_length; - - bool is_fixed_length_row = rows.metadata().is_fixed_length; - if (is_fixed_length_row) { - // Case 3: This is a fixed length column in a fixed length row - // - const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_length * row_id; - process_value_fn(i, row_ptr, field_length); - } - } else { - // Case 4: This is a fixed length column in a varying length row - // - const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; - const RowTableImpl::offset_type* row_offsets = rows.offsets(); - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - process_value_fn(i, row_ptr, field_length); - } - } - } -} - -template -void RowArrayAccessor::VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, - PROCESS_VALUE_FN process_value_fn) { - const uint8_t* null_masks = rows.null_masks(); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; - uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; - process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 
0xff : 0); - } -} - -Status RowArray::InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_metadata) { +Status RowArray::InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const RowTableMetadata& row_metadata) { if (is_initialized_) { return Status::OK(); } + hardware_flags_ = hardware_flags; encoder_.Init(row_metadata.column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); RETURN_NOT_OK(rows_temp_.Init(pool, row_metadata)); RETURN_NOT_OK(rows_.Init(pool, row_metadata)); @@ -208,7 +70,8 @@ Status RowArray::InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_meta return Status::OK(); } -Status RowArray::InitIfNeeded(MemoryPool* pool, const ExecBatch& batch) { +Status RowArray::InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch) { if (is_initialized_) { return Status::OK(); } @@ -218,14 +81,15 @@ Status RowArray::InitIfNeeded(MemoryPool* pool, const ExecBatch& batch) { row_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); - return InitIfNeeded(pool, row_metadata); + return InitIfNeeded(pool, hardware_flags, row_metadata); } -Status RowArray::AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, - int begin_row_id, int end_row_id, int num_row_ids, +Status RowArray::AppendBatchSelection(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch, int begin_row_id, + int end_row_id, int num_row_ids, const uint16_t* row_ids, std::vector& temp_column_arrays) { - RETURN_NOT_OK(InitIfNeeded(pool, batch)); + RETURN_NOT_OK(InitIfNeeded(pool, hardware_flags, batch)); RETURN_NOT_OK(ColumnArraysFromExecBatch(batch, begin_row_id, end_row_id - begin_row_id, &temp_column_arrays)); encoder_.PrepareEncodeSelected( @@ -238,7 +102,7 @@ Status RowArray::AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, void RowArray::Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, int num_selected, const uint16_t* batch_selection_maybe_null, const uint32_t* array_row_ids, uint32_t* out_num_not_equal, - uint16_t* out_not_equal_selection, int64_t hardware_flags, + uint16_t* out_not_equal_selection, arrow::util::TempVectorStack* temp_stack, std::vector& temp_column_arrays, uint8_t* out_match_bitvector_maybe_null) { @@ -247,7 +111,7 @@ void RowArray::Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, ARROW_DCHECK(status.ok()); LightContext ctx; - ctx.hardware_flags = hardware_flags; + ctx.hardware_flags = hardware_flags_; ctx.stack = temp_stack; KeyCompare::CompareColumnsToRows( num_selected, batch_selection_maybe_null, array_row_ids, &ctx, out_num_not_equal, @@ -259,6 +123,25 @@ Status RowArray::DecodeSelected(ResizableArrayData* output, int column_id, int num_rows_to_append, const uint32_t* row_ids, MemoryPool* pool) const { int num_rows_before = output->num_rows(); +#ifdef ARROW_HAVE_RUNTIME_AVX2 + // Preprocess some rows if necessary to assure that AVX2 version sees 8-row aligned + // output address. + if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && (num_rows_before % 8 != 0) && + (num_rows_to_append >= 8)) { + int num_rows_to_preprocess = 8 - num_rows_before % 8; + // The output must have allocated enough rows to store this few number of preprocessed + // rows without costly resizing the internal buffers. 
+ DCHECK_GE(output->num_rows_allocated(), num_rows_before + num_rows_to_preprocess); + RETURN_NOT_OK( + DecodeSelected(output, column_id, num_rows_to_preprocess, row_ids, pool)); + return DecodeSelected(output, column_id, num_rows_to_append - num_rows_to_preprocess, + row_ids + num_rows_to_preprocess, pool); + } + + bool use_avx2 = + (hardware_flags_ & arrow::internal::CpuInfo::AVX2) && (num_rows_before % 8 == 0); +#endif + RETURN_NOT_OK(output->ResizeFixedLengthBuffers(num_rows_before + num_rows_to_append)); // Both input (KeyRowArray) and output (ResizableArrayData) have buffers with @@ -267,98 +150,59 @@ Status RowArray::DecodeSelected(ResizableArrayData* output, int column_id, // ARROW_ASSIGN_OR_RAISE(KeyColumnMetadata column_metadata, output->column_metadata()); + int num_rows_processed = 0; if (column_metadata.is_fixed_length) { uint32_t fixed_length = column_metadata.fixed_length; - switch (fixed_length) { - case 0: - RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - bit_util::SetBitTo(output->mutable_data(1), - num_rows_before + i, *ptr != 0); - }); - break; - case 1: - RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - output->mutable_data(1)[num_rows_before + i] = *ptr; - }); - break; - case 2: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast(ptr); - }); - break; - case 4: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast(ptr); - }); - break; - case 8: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast(ptr); - }); - break; - default: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - uint64_t* dst = reinterpret_cast( - output->mutable_data(1) + num_bytes * (num_rows_before + i)); - const uint64_t* src = reinterpret_cast(ptr); - for (uint32_t word_id = 0; - word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { - arrow::util::SafeStore(dst + word_id, - arrow::util::SafeLoad(src + word_id)); - } - }); - break; + + // Process fixed length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeFixedLength_avx2( + output, num_rows_before, column_id, fixed_length, num_rows_to_append, row_ids); } +#endif + DecodeFixedLength(output, num_rows_before + num_rows_processed, column_id, + fixed_length, num_rows_to_append - num_rows_processed, + row_ids + num_rows_processed); } else { - uint32_t* offsets = - reinterpret_cast(output->mutable_data(1)) + num_rows_before; - uint32_t sum = num_rows_before == 0 ? 
0 : offsets[0]; - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; }); - for (int i = 0; i < num_rows_to_append; ++i) { - uint32_t length = offsets[i]; - offsets[i] = sum; - sum += length; - } - offsets[num_rows_to_append] = sum; + // Process offsets for varying length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeOffsets_avx2(output, num_rows_before, column_id, + num_rows_to_append, row_ids); + } +#endif + DecodeOffsets(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, row_ids + num_rows_processed); + RETURN_NOT_OK(output->ResizeVaryingLengthBuffer()); - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - uint64_t* dst = reinterpret_cast( - output->mutable_data(2) + - reinterpret_cast( - output->mutable_data(1))[num_rows_before + i]); - const uint64_t* src = reinterpret_cast(ptr); - for (uint32_t word_id = 0; - word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { - arrow::util::SafeStore(dst + word_id, - arrow::util::SafeLoad(src + word_id)); - } - }); + + // Process data for varying length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeVarLength_avx2(output, num_rows_before, column_id, + num_rows_to_append, row_ids); + } +#endif + DecodeVarLength(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, + row_ids + num_rows_processed); } // Process nulls // - RowArrayAccessor::VisitNulls( - rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint8_t value) { - bit_util::SetBitTo(output->mutable_data(0), num_rows_before + i, value == 0); - }); +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = + DecodeNulls_avx2(output, num_rows_before, column_id, num_rows_to_append, row_ids); + } +#endif + DecodeNulls(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, row_ids + num_rows_processed); return Status::OK(); } @@ -437,16 +281,125 @@ void RowArray::DebugPrintToFile(const char* filename, bool print_sorted) const { } } +void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, const uint32_t* row_ids) const { + switch (fixed_length) { + case 0: + RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + bit_util::SetBitTo(output->mutable_data(1), + output_start_row + i, *ptr != 0); + }); + break; + case 1: + RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data(1)[output_start_row + i] = *ptr; + }); + break; + case 2: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data_as(1)[output_start_row + i] = + *reinterpret_cast(ptr); + }); + break; + case 4: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data_as(1)[output_start_row + i] = + *reinterpret_cast(ptr); + }); + break; + case 8: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, 
uint32_t num_bytes) { + output->mutable_data_as(1)[output_start_row + i] = + *reinterpret_cast(ptr); + }); + break; + default: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + uint64_t* dst = reinterpret_cast( + output->mutable_data(1) + num_bytes * (output_start_row + i)); + const uint64_t* src = reinterpret_cast(ptr); + // Note that both `output` and `ptr` have been allocated with enough padding + // to accommodate the memory overshoot. See the allocations for + // `ResizableArrayData` in `JoinResultMaterialize` and `JoinResidualFilter` + // for `output`, and `RowTableImpl::kPaddingForVectors` for `ptr`. + for (uint32_t word_id = 0; + word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { + arrow::util::SafeStore(dst + word_id, + arrow::util::SafeLoad(src + word_id)); + } + }); + break; + } +} + +void RowArray::DecodeOffsets(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + uint32_t* offsets = output->mutable_data_as(1) + output_start_row; + uint32_t sum = (output_start_row == 0) ? 0 : offsets[0]; + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; }); + for (int i = 0; i < num_rows_to_append; ++i) { + uint32_t length = offsets[i]; + offsets[i] = sum; + sum += length; + } + offsets[num_rows_to_append] = sum; +} + +void RowArray::DecodeVarLength(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + uint64_t* dst = reinterpret_cast( + output->mutable_data(2) + + output->mutable_data_as(1)[output_start_row + i]); + const uint64_t* src = reinterpret_cast(ptr); + // Note that both `output` and `ptr` have been allocated with enough padding to + // accommodate the memory overshoot. See the allocations for `ResizableArrayData` + // in `JoinResultMaterialize` and `JoinResidualFilter` for `output`, and + // `RowTableImpl::kPaddingForVectors` for `ptr`. + for (uint32_t word_id = 0; + word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { + arrow::util::SafeStore(dst + word_id, + arrow::util::SafeLoad(src + word_id)); + } + }); +} + +void RowArray::DecodeNulls(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::VisitNulls( + rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint8_t value) { + bit_util::SetBitTo(output->mutable_data(0), output_start_row + i, value == 0); + }); +} + Status RowArrayMerge::PrepareForMerge(RowArray* target, const std::vector& sources, std::vector* first_target_row_id, - MemoryPool* pool) { + MemoryPool* pool, int64_t hardware_flags) { ARROW_DCHECK(!sources.empty()); ARROW_DCHECK(sources[0]->is_initialized_); const RowTableMetadata& metadata = sources[0]->rows_.metadata(); ARROW_DCHECK(!target->is_initialized_); - RETURN_NOT_OK(target->InitIfNeeded(pool, metadata)); + RETURN_NOT_OK(target->InitIfNeeded(pool, hardware_flags, metadata)); // Sum the number of rows from all input sources and calculate their total // size. 
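DecodeSelected above now splits each decode step into an AVX2 bulk pass and a scalar tail: the *_avx2 helper consumes as many complete groups of 8 rows as it can (and only when the output position is 8-row aligned, so bit-level outputs stay byte-aligned), and the scalar helper finishes whatever remains. Below is a minimal standalone sketch of that dispatch shape; DecodeBulk8, DecodeTail and DecodeSelectedSketch are hypothetical stand-ins for the patch's helpers, not Arrow APIs.

#include <cstdint>

// Assumed SIMD-style helper: processes the largest multiple of 8 rows and
// reports how many rows it consumed.
int DecodeBulk8(uint64_t* out, const uint64_t* values, const uint32_t* row_ids,
                int num_rows) {
  int bulk = num_rows - (num_rows % 8);
  for (int i = 0; i < bulk; ++i) {
    out[i] = values[row_ids[i]];
  }
  return bulk;
}

// Scalar helper: finishes the remaining tail (fewer than 8 rows).
void DecodeTail(uint64_t* out, const uint64_t* values, const uint32_t* row_ids,
                int num_rows) {
  for (int i = 0; i < num_rows; ++i) {
    out[i] = values[row_ids[i]];
  }
}

void DecodeSelectedSketch(uint64_t* out, const uint64_t* values,
                          const uint32_t* row_ids, int num_rows, bool use_simd) {
  int processed = 0;
  if (use_simd) {
    processed = DecodeBulk8(out, values, row_ids, num_rows);
  }
  // The scalar tail always runs on the leftover rows, so the result is
  // identical whether or not the SIMD pass was taken.
  DecodeTail(out + processed, values, row_ids + processed, num_rows - processed);
}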
@@ -895,8 +848,8 @@ void SwissTableWithKeys::EqualCallback(int num_keys, const uint16_t* selection_m uint8_t* match_bitvector = match_bitvector_buf.mutable_data(); keys_.Compare(*in->batch, batch_start_to_use, batch_end_to_use, num_keys, - selection_to_use, group_ids_to_use, nullptr, nullptr, hardware_flags, - in->temp_stack, *in->temp_column_arrays, match_bitvector); + selection_to_use, group_ids_to_use, nullptr, nullptr, in->temp_stack, + *in->temp_column_arrays, match_bitvector); if (selection_maybe_null) { int num_keys_mismatch = 0; @@ -918,8 +871,7 @@ void SwissTableWithKeys::EqualCallback(int num_keys, const uint16_t* selection_m group_ids_to_use = group_ids; keys_.Compare(*in->batch, batch_start_to_use, batch_end_to_use, num_keys, selection_to_use, group_ids_to_use, out_num_keys_mismatch, - out_selection_mismatch, hardware_flags, in->temp_stack, - *in->temp_column_arrays); + out_selection_mismatch, in->temp_stack, *in->temp_column_arrays); } } @@ -944,16 +896,18 @@ Status SwissTableWithKeys::AppendCallback(int num_keys, const uint16_t* selectio batch_end_to_use = static_cast(in->batch->length); selection_to_use = selection_to_use_buf.mutable_data(); - return keys_.AppendBatchSelection(swiss_table_.pool(), *in->batch, batch_start_to_use, - batch_end_to_use, num_keys, selection_to_use, + return keys_.AppendBatchSelection(swiss_table_.pool(), swiss_table_.hardware_flags(), + *in->batch, batch_start_to_use, batch_end_to_use, + num_keys, selection_to_use, *in->temp_column_arrays); } else { batch_start_to_use = in->batch_start_row; batch_end_to_use = in->batch_end_row; selection_to_use = selection; - return keys_.AppendBatchSelection(swiss_table_.pool(), *in->batch, batch_start_to_use, - batch_end_to_use, num_keys, selection_to_use, + return keys_.AppendBatchSelection(swiss_table_.pool(), swiss_table_.hardware_flags(), + *in->batch, batch_start_to_use, batch_end_to_use, + num_keys, selection_to_use, *in->temp_column_arrays); } } @@ -1177,8 +1131,10 @@ Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t for (int i = 0; i < num_prtns_; ++i) { PartitionState& prtn_state = prtn_states_[i]; RETURN_NOT_OK(prtn_state.keys.Init(hardware_flags_, pool_)); - RETURN_NOT_OK(prtn_state.keys.keys()->InitIfNeeded(pool, key_row_metadata)); - RETURN_NOT_OK(prtn_state.payloads.InitIfNeeded(pool, payload_row_metadata)); + RETURN_NOT_OK( + prtn_state.keys.keys()->InitIfNeeded(pool, hardware_flags, key_row_metadata)); + RETURN_NOT_OK( + prtn_state.payloads.InitIfNeeded(pool, hardware_flags, payload_row_metadata)); } target_->dop_ = dop_; @@ -1294,7 +1250,7 @@ Status SwissTableForJoinBuild::ProcessPartition(int64_t thread_id, if (!no_payload_) { ARROW_DCHECK(payload_batch_maybe_null); RETURN_NOT_OK(prtn_state.payloads.AppendBatchSelection( - pool_, *payload_batch_maybe_null, 0, + pool_, hardware_flags_, *payload_batch_maybe_null, 0, static_cast(payload_batch_maybe_null->length), num_rows_new, row_ids, locals.temp_column_arrays)); } @@ -1324,7 +1280,8 @@ Status SwissTableForJoinBuild::PreparePrtnMerge() { partition_keys[i] = prtn_states_[i].keys.keys(); } RETURN_NOT_OK(RowArrayMerge::PrepareForMerge(target_->map_.keys(), partition_keys, - &partition_keys_first_row_id_, pool_)); + &partition_keys_first_row_id_, pool_, + hardware_flags_)); // 2. 
SwissTable: // @@ -1346,8 +1303,8 @@ Status SwissTableForJoinBuild::PreparePrtnMerge() { partition_payloads[i] = &prtn_states_[i].payloads; } RETURN_NOT_OK(RowArrayMerge::PrepareForMerge(&target_->payloads_, partition_payloads, - &partition_payloads_first_row_id_, - pool_)); + &partition_payloads_first_row_id_, pool_, + hardware_flags_)); } // Check if we have duplicate keys @@ -1499,7 +1456,7 @@ void SwissTableForJoinBuild::FinishPrtnMerge(arrow::util::TempVectorStack* temp_ LightContext ctx; ctx.hardware_flags = hardware_flags_; ctx.stack = temp_stack; - std::ignore = target_->map_.keys()->rows_.has_any_nulls(&ctx); + target_->map_.keys()->EnsureHasAnyNullsComputed(ctx); } void JoinResultMaterialize::Init(MemoryPool* pool, @@ -1667,7 +1624,9 @@ Result> JoinResultMaterialize::FlushBuildColumn( const std::shared_ptr& data_type, const RowArray* row_array, int column_id, uint32_t* row_ids) { ResizableArrayData output; - RETURN_NOT_OK(output.Init(data_type, pool_, bit_util::Log2(num_rows_))); + // Allocate at least 8 rows for the convenience of SIMD decoding. + int log_num_rows_min = std::max(3, bit_util::Log2(num_rows_)); + RETURN_NOT_OK(output.Init(data_type, pool_, log_num_rows_min)); for (size_t i = 0; i <= null_ranges_.size(); ++i) { int row_id_begin = @@ -2247,9 +2206,11 @@ Result JoinResidualFilter::MaterializeFilterInput( build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); for (int i = 0; i < num_build_cols; ++i) { ResizableArrayData column_data; + // Allocate at least 8 rows for the convenience of SIMD decoding. + int log_num_rows_min = std::max(3, bit_util::Log2(num_batch_rows)); RETURN_NOT_OK( column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), - pool_, bit_util::Log2(num_batch_rows))); + pool_, log_num_rows_min)); if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) { RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows, key_ids_maybe_null, pool_)); @@ -2501,7 +2462,6 @@ class SwissJoin : public HashJoinImpl { output_batch_callback_ = std::move(output_batch_callback); finished_callback_ = std::move(finished_callback); - hash_table_ready_.store(false); cancelled_.store(false); { std::lock_guard lock(state_mutex_); @@ -2513,7 +2473,6 @@ class SwissJoin : public HashJoinImpl { local_states_.resize(num_threads_); for (int i = 0; i < num_threads_; ++i) { RETURN_NOT_OK(local_states_[i].stack.Init(pool_, kTempStackUsage)); - local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); } @@ -2559,11 +2518,6 @@ class SwissJoin : public HashJoinImpl { return status(); } - if (!local_states_[thread_index].hash_table_ready) { - local_states_[thread_index].hash_table_ready = hash_table_ready_.load(); - } - ARROW_DCHECK(local_states_[thread_index].hash_table_ready); - ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); arrow::util::TempVectorStack* temp_stack = &local_states_[thread_index].stack; @@ -2728,7 +2682,6 @@ class SwissJoin : public HashJoinImpl { hash_table_.payloads(), hash_table_.key_to_payload() == nullptr); } - hash_table_ready_.store(true); residual_filter_.OnBuildFinished(); @@ -2949,7 +2902,6 @@ class SwissJoin : public HashJoinImpl { JoinResultMaterialize materialize; std::vector temp_column_arrays; int64_t num_output_batches; - bool hash_table_ready; }; std::vector local_states_; @@ -2966,7 +2918,6 @@ class SwissJoin : public 
HashJoinImpl { // The other flags that follow them, protected by mutex, will be queried or // updated only a fixed number of times during entire join processing. // - std::atomic hash_table_ready_; std::atomic cancelled_; // Mutex protecting state flags. diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 1076073523448..1d6b7eda6e6a0 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -32,7 +32,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // Number of rows processed together in a single iteration of the loop (single // call to the provided processing lambda). // - constexpr int unroll = 8; + constexpr int kUnroll = 8; bool is_fixed_length_column = rows.metadata().column_metadatas[column_id].is_fixed_length; @@ -48,6 +48,8 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); + auto row_offsets_i64 = + reinterpret_cast(row_offsets); static_assert( sizeof(RowTableImpl::offset_type) == sizeof(int64_t), "RowArrayAccessor::Visit_avx2 only supports 64-bit RowTableImpl::offset_type"); @@ -58,17 +60,17 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_set1_epi32(rows.metadata().fixed_length); __m256i varbinary_end_array_offset = _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. __m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 // 64-bit row offsets. @@ -81,7 +83,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // The final 8 32-bit field lengths, subtracting the field offset within row. __m256i field_length = _mm256_sub_epi32( _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, + process_8_values_fn(i * kUnroll, row_ptr_base, _mm256_add_epi64(row_offset_lo, field_offset_within_row), _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); @@ -94,19 +96,17 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu sizeof(uint32_t) * (varbinary_column_id - 1)); auto row_ptr_base_i64 = reinterpret_cast(row_ptr_base); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. 
__m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); - // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 - // 64-bit row offsets. __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Prepare the lower/higher 4 64-bit end array offsets based on the lower/higher 4 // 64-bit row offsets. @@ -127,8 +127,8 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_blend_epi32( field_offset_within_row_A, field_offset_within_row_B, 0xf0); - __m256i alignment_padding = - _mm256_andnot_si256(field_offset_within_row, _mm256_set1_epi8(0xff)); + __m256i alignment_padding = _mm256_andnot_si256( + field_offset_within_row, _mm256_set1_epi8(static_cast(0xff))); alignment_padding = _mm256_add_epi32(alignment_padding, _mm256_set1_epi32(1)); alignment_padding = _mm256_and_si256( alignment_padding, _mm256_set1_epi32(rows.metadata().string_alignment - 1)); @@ -147,7 +147,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu field_offset_within_row_B = _mm256_add_epi32(field_offset_within_row_B, alignment_padding); - process_8_values_fn(i * unroll, row_ptr_base, + process_8_values_fn(i * kUnroll, row_ptr_base, _mm256_add_epi64(row_offset_lo, field_offset_within_row_A), _mm256_add_epi64(row_offset_hi, field_offset_within_row_B), field_length); @@ -159,15 +159,21 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_set1_epi64x(rows.metadata().encoded_field_offset( rows.metadata().pos_after_encoding(column_id))); - __m256i field_length = - _mm256_set1_epi32(rows.metadata().column_metadatas[column_id].fixed_length); + uint32_t actual_field_length = + rows.metadata().column_metadatas[column_id].fixed_length; + // Bit column is encoded as a single byte + if (actual_field_length == 0) { + actual_field_length = 1; + } + __m256i field_length = _mm256_set1_epi32(actual_field_length); + __m256i row_length = _mm256_set1_epi64x(rows.metadata().fixed_length); bool is_fixed_length_row = rows.metadata().is_fixed_length; if (is_fixed_length_row) { // Case 3: This is a fixed length column in fixed length row // const uint8_t* row_ptr_base = rows.data(1); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); @@ -177,15 +183,15 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 // 64-bit row ids and the fixed field length. - __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); - __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, row_length); + __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, row_length); // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 // 64-bit row offsets and field offset within row. 
__m256i field_offset_lo = _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = _mm256_add_epi64(row_offset_hi, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + process_8_values_fn(i * kUnroll, row_ptr_base, field_offset_lo, field_offset_hi, field_length); } } else { @@ -193,17 +199,19 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); - for (int i = 0; i < num_rows / unroll; ++i) { + auto row_offsets_i64 = + reinterpret_cast(row_offsets); + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. __m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 // 64-bit row offsets and field offset within row. @@ -211,13 +219,13 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = _mm256_add_epi64(row_offset_hi, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + process_8_values_fn(i * kUnroll, row_ptr_base, field_offset_lo, field_offset_hi, field_length); } } } - return num_rows - (num_rows % unroll); + return num_rows - (num_rows % kUnroll); } template @@ -227,31 +235,296 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // Number of rows processed together in a single iteration of the loop (single // call to the provided processing lambda). // - constexpr int unroll = 8; + constexpr int kUnroll = 8; const uint8_t* null_masks = rows.null_masks(); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); - for (int i = 0; i < num_rows / unroll; ++i) { + __m256i pos_after_encoding = + _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); + for (int i = 0; i < num_rows / kUnroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id)); + bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast(null_masks), _mm256_srli_epi32(bit_id, 3), 1); __m256i bit_in_word = _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); + // `result` will contain one 32-bit word per tested null bit, either 0xffffffff if the + // null bit was set or 0 if it was unset. __m256i result = _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - uint64_t null_bytes = static_cast( + // NB: Be careful about sign-extension when casting the return value of + // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. 
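// Illustrative aside, not part of the patch: a worked example of the
// sign-extension pitfall the NB above warns about. _mm256_movemask_epi8
// returns a signed 32-bit int, so casting it straight to uint64_t
// sign-extends whenever bit 31 of the mask is set:
//
//   const int32_t mask = static_cast<int32_t>(0x80000000);  // top mask bit set
//   uint64_t bad  = static_cast<uint64_t>(mask);             // 0xFFFFFFFF80000000
//   uint64_t good = static_cast<uint32_t>(mask);             // 0x0000000080000000
//
// Going through uint32_t first (as the code below does for the low half)
// zero-extends instead, so OR-ing in the shifted high half stays correct.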
+ uint32_t null_bytes_lo = static_cast( _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - null_bytes |= static_cast(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)))) - << 32; + uint64_t null_bytes_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); + uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); + + process_8_values_fn(i * kUnroll, null_bytes); + } + + return num_rows - (num_rows % kUnroll); +} + +namespace { + +inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 1 bit interesting) values based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_hi, 1); + // Extend to 64-bit. + __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo); + __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi); + // Keep the first 8 bits in each 64-bit value, as the other bits belong to other + // columns. + row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF)); + row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF)); + // If a 64-bit value is zero, then we get 64 set bits. + __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256()); + __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256()); + // 64 set bits per value to 8 set bits (one byte) per value. + int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64); + int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64); + // 8 set bits to 1 set bit. + uint8_t is_zero = static_cast( + _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8))); + *output = static_cast(~is_zero); +} + +inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) values based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 8 bits of each 32-bit values to the lower 32 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL; + const __m256i shuffle_const = + _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Get the lower 32-bits (4 8-bit values) from each 128-bit lane. + // NB: Be careful about sign-extension when casting the return value of + // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. + uint32_t compact_row_lo = static_cast(_mm256_extract_epi32(row, 0)); + uint64_t compact_row_hi = static_cast(_mm256_extract_epi32(row, 4)) << 32; + *reinterpret_cast(output) = compact_row_lo | compact_row_hi; +} + +inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) values based on the + // lower/higher 4 64-bit row offsets. 
+ __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 16 bits of each 32-bit values to the lower 64 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL; + const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1, + kByteSequence_0_1_4_5_8_9_12_13, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Swap the second and the third 64-bit lane, so that all 16-bit values end up in the + // lower half of `row`. + // (0xd8 = 0b 11 01 10 00) + row = _mm256_permute4x64_epi64(row, 0xd8); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row)); +} + +inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit values based on the lower/higher 4 64-bit row + // offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row); +} + +inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + auto row_ptr_base_i64 = + reinterpret_cast(row_ptr_base); + // Gather the lower/higher 4 64-bit values based on the lower/higher 4 64-bit row + // offsets. + __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1); + __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi); +} - process_8_values_fn(i * unroll, null_bytes); +inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) { + // Copy 32 bytes at a time. + // Note that both `output` and `row_ptr` have been allocated with enough padding to + // accommodate the memory overshoot. See the allocations for `ResizableArrayData` in + // `JoinResultMaterialize` and `JoinResidualFilter` for `output`, and + // `RowTableImpl::kPaddingForVectors` for `row_ptr`. + __m256i* output_i256 = reinterpret_cast<__m256i*>(output); + const __m256i* row_ptr_i256 = reinterpret_cast(row_ptr); + for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) { + _mm256_storeu_si256(output_i256 + istripe, + _mm256_loadu_si256(row_ptr_i256 + istripe)); } +} + +inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length, + __m256i num_bytes) { + uint32_t num_bytes_last = static_cast(_mm256_extract_epi32(num_bytes, 7)); + // Init every offset with the current length. + __m256i offsets = _mm256_set1_epi32(current_length); + // We keep left-shifting the length and accumulate the offset by adding the length. + __m256i length = + _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + // `length` is now a sequence of 32-bit words such as: + // - length[0] = 0 + // - length[1] = num_bytes[0] + // ... 
+ // - length[7] = num_bytes[6] + // (note that num_bytes[7] is kept in `num_bytes_last`) + for (int i = 0; i < 7; ++i) { + offsets = _mm256_add_epi32(offsets, length); + length = + _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets); + return _mm256_extract_epi32(offsets, 7) + num_bytes_last; +} + +inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) { + uint8_t null_bits = + static_cast(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes))); + *output = ~null_bits; +} + +} // namespace + +int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, + const uint32_t* row_ids) const { + DCHECK_EQ(output_start_row % 8, 0); + + int num_rows_processed = 0; + switch (fixed_length) { + case 0: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + DCHECK_EQ(i % 8, 0); + Decode8FixedLength0_avx2(output->mutable_data(1) + (output_start_row + i) / 8, + row_ptr_base, offset_lo, offset_hi); + }); + break; + case 1: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength1_avx2(output->mutable_data(1) + output_start_row + i, + row_ptr_base, offset_lo, offset_hi); + }); + break; + case 2: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength2_avx2( + output->mutable_data_as(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + case 4: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength4_avx2( + output->mutable_data_as(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + case 8: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength8_avx2( + output->mutable_data_as(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + default: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) { + Decode1_avx2(output->mutable_data(1) + num_bytes * (output_start_row + i), + row_ptr, num_bytes); + }); + num_rows_processed = num_rows_to_append; + break; + } + + return num_rows_processed; +} + +int RowArray::DecodeOffsets_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + uint32_t* offsets = output->mutable_data_as(1) + output_start_row; + uint32_t current_length = (output_start_row == 0) ? 
0 : offsets[0]; + int num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + current_length = Decode8Offset_avx2(offsets + i, current_length, num_bytes); + }); + offsets[num_rows_processed] = current_length; + return num_rows_processed; +} + +int RowArray::DecodeVarLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) { + uint8_t* dst = output->mutable_data(2) + + output->mutable_data_as(1)[output_start_row + i]; + Decode1_avx2(dst, row_ptr, num_bytes); + }); + return num_rows_to_append; +} + +int RowArray::DecodeNulls_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + DCHECK_EQ(output_start_row % 8, 0); - return num_rows - (num_rows % unroll); + return RowArrayAccessor::VisitNulls_avx2( + rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint64_t null_bytes) { + DCHECK_EQ(i % 8, 0); + Decode8Null_avx2(output->mutable_data(0) + (output_start_row + i) / 8, + null_bytes); + }); } } // namespace acero diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 4d749c1c529ae..f2f3ac5b1bf93 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -32,6 +32,7 @@ namespace arrow { using compute::ExecBatchBuilder; using compute::KeyColumnArray; using compute::KeyColumnMetadata; +using compute::LightContext; using compute::ResizableArrayData; using compute::RowTableEncoder; using compute::RowTableImpl; @@ -47,16 +48,6 @@ class RowArrayAccessor { // static int VarbinaryColumnId(const RowTableMetadata& row_metadata, int column_id); - // Calculate how many rows to skip from the tail of the - // sequence of selected rows, such that the total size of skipped rows is at - // least equal to the size specified by the caller. Skipping of the tail rows - // is used to allow for faster processing by the caller of remaining rows - // without checking buffer bounds (useful with SIMD or fixed size memory loads - // and stores). - // - static int NumRowsToSkip(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, int num_tail_bytes_to_skip); - // The supplied lambda will be called for each row in the given list of rows. // The arguments given to it will be: // - index of a row (within the set of selected rows), @@ -68,7 +59,80 @@ class RowArrayAccessor { // template static void Visit(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); + const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { + bool is_fixed_length_column = + rows.metadata().column_metadatas[column_id].is_fixed_length; + + // There are 4 cases, each requiring different steps: + // 1. Varying length column that is the first varying length column in a row + // 2. Varying length column that is not the first varying length column in a + // row + // 3. Fixed length column in a fixed length row + // 4. 
Fixed length column in a varying length row + + if (!is_fixed_length_column) { + int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); + const uint8_t* row_ptr_base = rows.data(2); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + uint32_t field_offset_within_row, field_length; + + if (varbinary_column_id == 0) { + // Case 1: This is the first varbinary column + // + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + rows.metadata().first_varbinary_offset_and_length( + row_ptr, &field_offset_within_row, &field_length); + process_value_fn(i, row_ptr + field_offset_within_row, field_length); + } + } else { + // Case 2: This is second or later varbinary column + // + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + rows.metadata().nth_varbinary_offset_and_length( + row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); + process_value_fn(i, row_ptr + field_offset_within_row, field_length); + } + } + } + + if (is_fixed_length_column) { + uint32_t field_offset_within_row = rows.metadata().encoded_field_offset( + rows.metadata().pos_after_encoding(column_id)); + uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; + // Bit column is encoded as a single byte + // + if (field_length == 0) { + field_length = 1; + } + uint32_t row_length = rows.metadata().fixed_length; + + bool is_fixed_length_row = rows.metadata().is_fixed_length; + if (is_fixed_length_row) { + // Case 3: This is a fixed length column in a fixed length row + // + const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_length * row_id; + process_value_fn(i, row_ptr, field_length); + } + } else { + // Case 4: This is a fixed length column in a varying length row + // + const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + process_value_fn(i, row_ptr, field_length); + } + } + } + } // The supplied lambda will be called for each row in the given list of rows. // The arguments given to it will be: @@ -77,9 +141,17 @@ class RowArrayAccessor { // template static void VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); + const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; + uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; + process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 0xff : 0); + } + } - private: #if defined(ARROW_HAVE_RUNTIME_AVX2) // This is equivalent to Visit method, but processing 8 rows at a time in a // loop. @@ -108,13 +180,15 @@ class RowArrayAccessor { // can be called by multiple threads concurrently. 
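The now-inlined Visit above defines the callback contract that the decode helpers rely on: the lambda receives the index of the row within the selection, a pointer to the encoded field, and the field length in bytes (bit columns are reported as one byte). A hypothetical usage sketch follows, with includes and namespaces assumed; it is illustrative only and not part of the patch.

#include <cstdint>
#include <cstring>
#include <vector>

#include "arrow/acero/swiss_join_internal.h"  // internal header, assumed reachable

// Copy a fixed-length 8-byte column out of a row table for a selection of rows.
std::vector<uint64_t> GatherUint64Column(const arrow::compute::RowTableImpl& rows,
                                         int column_id,
                                         const std::vector<uint32_t>& row_ids) {
  std::vector<uint64_t> out(row_ids.size());
  arrow::acero::RowArrayAccessor::Visit(
      rows, column_id, static_cast<int>(row_ids.size()), row_ids.data(),
      [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
        // For a fixed-length 8-byte column num_bytes is 8; memcpy avoids any
        // alignment assumptions about the encoded row data.
        std::memcpy(&out[i], ptr, sizeof(uint64_t));
      });
  return out;
}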
// struct RowArray { - RowArray() : is_initialized_(false) {} + RowArray() : is_initialized_(false), hardware_flags_(0) {} - Status InitIfNeeded(MemoryPool* pool, const ExecBatch& batch); - Status InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_metadata); + Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, const ExecBatch& batch); + Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const RowTableMetadata& row_metadata); - Status AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, int begin_row_id, - int end_row_id, int num_row_ids, const uint16_t* row_ids, + Status AppendBatchSelection(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch, int begin_row_id, int end_row_id, + int num_row_ids, const uint16_t* row_ids, std::vector& temp_column_arrays); // This can only be called for a minibatch. @@ -122,12 +196,10 @@ struct RowArray { void Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, int num_selected, const uint16_t* batch_selection_maybe_null, const uint32_t* array_row_ids, uint32_t* out_num_not_equal, uint16_t* out_not_equal_selection, - int64_t hardware_flags, arrow::util::TempVectorStack* temp_stack, + arrow::util::TempVectorStack* temp_stack, std::vector& temp_column_arrays, uint8_t* out_match_bitvector_maybe_null = NULLPTR); - // TODO: add AVX2 version - // Status DecodeSelected(ResizableArrayData* target, int column_id, int num_rows_to_append, const uint32_t* row_ids, MemoryPool* pool) const; @@ -135,10 +207,43 @@ struct RowArray { int64_t num_rows() const { return is_initialized_ ? rows_.length() : 0; } + void EnsureHasAnyNullsComputed(const LightContext& ctx) { + std::ignore = rows_.has_any_nulls(&ctx); + } + + private: bool is_initialized_; + + int64_t hardware_flags_; RowTableEncoder encoder_; RowTableImpl rows_; RowTableImpl rows_temp_; + + private: + void DecodeFixedLength(ResizableArrayData* output, int output_start_row, int column_id, + uint32_t fixed_length, int num_rows_to_append, + const uint32_t* row_ids) const; + void DecodeOffsets(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + void DecodeVarLength(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + void DecodeNulls(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + +#if defined(ARROW_HAVE_RUNTIME_AVX2) + int DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, int num_rows_to_append, + const uint32_t* row_ids) const; + int DecodeOffsets_avx2(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + int DecodeVarLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const; + int DecodeNulls_avx2(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; +#endif + + friend class RowArrayMerge; }; // Implements concatenating multiple row arrays into a single one, using @@ -161,7 +266,7 @@ class RowArrayMerge { // static Status PrepareForMerge(RowArray* target, const std::vector& sources, std::vector* first_target_row_id, - MemoryPool* pool); + MemoryPool* pool, int64_t hardware_flags); // Copy rows from source array to target array. 
// Both arrays must have the same row metadata. diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index d16b6cfd2e97d..51cca497485ce 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -145,7 +145,10 @@ class OrcStripeReader : public RecordBatchReader { Status ReadNext(std::shared_ptr* out) override { std::unique_ptr batch; - ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_)); + std::unique_ptr builder; + + ORC_BEGIN_CATCH_NOT_OK + batch = row_reader_->createRowBatch(batch_size_); const liborc::Type& type = row_reader_->getSelectedType(); if (!row_reader_->next(*batch)) { @@ -153,10 +156,8 @@ class OrcStripeReader : public RecordBatchReader { return Status::OK(); } - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema_, pool_, batch->numElements)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -164,9 +165,9 @@ class OrcStripeReader : public RecordBatchReader { RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0, batch->numElements, builder->GetField(i))); } + ORC_END_CATCH_NOT_OK - ARROW_ASSIGN_OR_RAISE(*out, builder->Flush()); - return Status::OK(); + return builder->Flush().Value(out); } private: @@ -470,15 +471,13 @@ class ORCFileReader::Impl { int64_t nrows) { std::unique_ptr row_reader; std::unique_ptr batch; + std::unique_ptr builder; ORC_BEGIN_CATCH_NOT_OK row_reader = reader_->createRowReader(opts); batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch)); - ORC_END_CATCH_NOT_OK - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -489,6 +488,7 @@ class ORCFileReader::Impl { batch->numElements, builder->GetField(i))); } } + ORC_END_CATCH_NOT_OK return builder->Flush(); } diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 3d18d5f967b72..226f5fc4649af 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1186,7 +1186,8 @@ TEST_F(TestMapArray, BuildingStringToInt) { std::vector offsets = {0, 2, 2, 3, 3}; auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])"); auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]"); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, internal::BytesToBits({1, 0, 1, 1})); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, + internal::BytesToBits(std::vector({1, 0, 1, 1}))); MapArray expected(type, 4, Buffer::Wrap(offsets), expected_keys, expected_values, expected_null_bitmap, 1); diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 99b853ab0fe73..6ccd2f4766e67 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -22,7 +22,7 @@ #include #include -#include "arrow/type_fwd.h" +#include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { @@ -34,22 +34,38 @@ namespace arrow { /// as Apache Parquet may have statistics. Statistics associated with /// data source can be read unified API via this class. struct ARROW_EXPORT ArrayStatistics { + /// \brief The type for maximum and minimum values. If the target + /// value exists, one of them is used. `std::nullopt` is used + /// otherwise. 
using ValueType = std::variant; static const std::shared_ptr& ValueToArrowType( - const std::optional& value) { + const std::optional& value, + const std::shared_ptr& array_type) { if (!value.has_value()) { return null(); } struct Visitor { + const std::shared_ptr& array_type; + const std::shared_ptr& operator()(const bool&) { return boolean(); } const std::shared_ptr& operator()(const int64_t&) { return int64(); } const std::shared_ptr& operator()(const uint64_t&) { return uint64(); } const std::shared_ptr& operator()(const double&) { return float64(); } - // GH-44579: How to support binary data? - const std::shared_ptr& operator()(const std::string&) { return utf8(); } - } visitor; + const std::shared_ptr& operator()(const std::string&) { + switch (array_type->id()) { + case Type::STRING: + case Type::BINARY: + case Type::FIXED_SIZE_BINARY: + case Type::LARGE_STRING: + case Type::LARGE_BINARY: + return array_type; + default: + return utf8(); + } + } + } visitor{array_type}; return std::visit(visitor, value.value()); } @@ -62,7 +78,24 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The minimum value, may not be set std::optional min = std::nullopt; - const std::shared_ptr& MinArrowType() { return ValueToArrowType(min); } + /// \brief Compute Arrow type of the minimum value. + /// + /// If \ref ValueType is `std::string`, `array_type` may be + /// used. If `array_type` is a binary-like type such as \ref + /// arrow::binary and \ref arrow::large_utf8, `array_type` is + /// returned. \ref arrow::utf8 is returned otherwise. + /// + /// If \ref ValueType isn't `std::string`, `array_type` isn't used. + /// + /// \param array_type The Arrow type of the associated array. + /// + /// \return \ref arrow::null if the minimum value is `std::nullopt`, + /// Arrow type based on \ref ValueType of the \ref min + /// otherwise. + const std::shared_ptr& MinArrowType( + const std::shared_ptr& array_type) { + return ValueToArrowType(min, array_type); + } /// \brief Whether the minimum value is exact or not bool is_min_exact = false; @@ -70,7 +103,24 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The maximum value, may not be set std::optional max = std::nullopt; - const std::shared_ptr& MaxArrowType() { return ValueToArrowType(max); } + /// \brief Compute Arrow type of the maximum value. + /// + /// If \ref ValueType is `std::string`, `array_type` may be + /// used. If `array_type` is a binary-like type such as \ref + /// arrow::binary and \ref arrow::large_utf8, `array_type` is + /// returned. \ref arrow::utf8 is returned otherwise. + /// + /// If \ref ValueType isn't `std::string`, `array_type` isn't used. + /// + /// \param array_type The Arrow type of the associated array. + /// + /// \return \ref arrow::null if the maximum value is `std::nullopt`, + /// Arrow type based on \ref ValueType of the \ref max + /// otherwise. 
+ const std::shared_ptr& MaxArrowType( + const std::shared_ptr& array_type) { + return ValueToArrowType(max, array_type); + } /// \brief Whether the maximum value is exact or not bool is_max_exact = false; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 1fe26b316362d..b000efd1e028b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -479,6 +479,43 @@ struct DecimalConversions { static Decimal256 ConvertOutput(Decimal256&& val) { return val; } }; +template +struct DecimalConversions { + static Decimal32 ConvertInput(InDecimal&& val) { return Decimal32(val.low_bits()); } + static Decimal32 ConvertOutput(Decimal32&& val) { return val; } +}; + +template <> +struct DecimalConversions { + // Convert then scale + static Decimal64 ConvertInput(Decimal32&& val) { return Decimal64(val); } + static Decimal64 ConvertOutput(Decimal64&& val) { return val; } +}; + +template <> +struct DecimalConversions { + static Decimal64 ConvertInput(Decimal64&& val) { return val; } + static Decimal64 ConvertOutput(Decimal64&& val) { return val; } +}; + +template <> +struct DecimalConversions { + // Scale then truncate + static Decimal128 ConvertInput(Decimal128&& val) { return val; } + static Decimal64 ConvertOutput(Decimal128&& val) { + return Decimal64(static_cast(val.low_bits())); + } +}; + +template <> +struct DecimalConversions { + // Scale then truncate + static Decimal256 ConvertInput(Decimal256&& val) { return val; } + static Decimal64 ConvertOutput(Decimal256&& val) { + return Decimal64(static_cast(val.low_bits())); + } +}; + template <> struct DecimalConversions { // Scale then truncate @@ -495,6 +532,20 @@ struct DecimalConversions { static Decimal128 ConvertOutput(Decimal128&& val) { return val; } }; +template <> +struct DecimalConversions { + // convert then scale + static Decimal128 ConvertInput(Decimal64&& val) { return Decimal128(val.value()); } + static Decimal128 ConvertOutput(Decimal128&& val) { return val; } +}; + +template <> +struct DecimalConversions { + // convert then scale + static Decimal128 ConvertInput(Decimal32&& val) { return Decimal128(val.value()); } + static Decimal128 ConvertOutput(Decimal128&& val) { return val; } +}; + struct UnsafeUpscaleDecimal { template OutValue Call(KernelContext*, Arg0Value val, Status*) const { @@ -659,6 +710,18 @@ struct DecimalCastFunctor { } }; +template +struct CastFunctor< + Decimal32Type, I, + enable_if_t::value || is_binary_view_like_type::value>> + : public DecimalCastFunctor {}; + +template +struct CastFunctor< + Decimal64Type, I, + enable_if_t::value || is_binary_view_like_type::value>> + : public DecimalCastFunctor {}; + template struct CastFunctor< Decimal128Type, I, @@ -744,6 +807,10 @@ std::shared_ptr GetCastToInteger(std::string name) { // From decimal to integer DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL32, {InputType(Type::DECIMAL32)}, out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, out_ty, + CastFunctor::Exec)); DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, CastFunctor::Exec)); return func; @@ -772,6 +839,10 @@ std::shared_ptr GetCastToFloating(std::string name) { AddCommonNumberCasts(out_ty, func.get()); // From decimal to floating point + DCHECK_OK(func->AddKernel(Type::DECIMAL32, 
{InputType(Type::DECIMAL32)}, out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, out_ty, + CastFunctor::Exec)); DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, @@ -780,6 +851,94 @@ std::shared_ptr GetCastToFloating(std::string name) { return func; } +std::shared_ptr GetCastToDecimal32() { + OutputType sig_out_ty(ResolveOutputFromOptions); + + auto func = std::make_shared("cast_decimal32", Type::DECIMAL32); + AddCommonCasts(Type::DECIMAL32, sig_out_ty, func.get()); + + // Cast from floating point + DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, + CastFunctor::Exec)); + + // Cast from integer + for (const std::shared_ptr& in_ty : IntTypes()) { + auto exec = GenerateInteger(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + + // Cast from other strings + for (const std::shared_ptr& in_ty : BaseBinaryTypes()) { + auto exec = GenerateVarBinaryBase(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + for (const std::shared_ptr& in_ty : BinaryViewTypes()) { + auto exec = GenerateVarBinaryViewBase(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + + // Cast from other decimal + auto exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL32, {InputType(Type::DECIMAL32)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); + return func; +} + +std::shared_ptr GetCastToDecimal64() { + OutputType sig_out_ty(ResolveOutputFromOptions); + + auto func = std::make_shared("cast_decimal64", Type::DECIMAL64); + AddCommonCasts(Type::DECIMAL64, sig_out_ty, func.get()); + + // Cast from floating point + DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty, + CastFunctor::Exec)); + DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, + CastFunctor::Exec)); + + // Cast from integer + for (const std::shared_ptr& in_ty : IntTypes()) { + auto exec = GenerateInteger(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + + // Cast from other strings + for (const std::shared_ptr& in_ty : BaseBinaryTypes()) { + auto exec = GenerateVarBinaryBase(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + for (const std::shared_ptr& in_ty : BinaryViewTypes()) { + auto exec = GenerateVarBinaryViewBase(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + + // Cast from other decimal + auto exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL32, {InputType(Type::DECIMAL32)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); + exec = 
CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec)); + return func; +} + std::shared_ptr GetCastToDecimal128() { OutputType sig_out_ty(ResolveOutputFromOptions); @@ -809,8 +968,14 @@ std::shared_ptr GetCastToDecimal128() { } // Cast from other decimal - auto exec = CastFunctor::Exec; + auto exec = CastFunctor::Exec; // We resolve the output type of this kernel from the CastOptions + DCHECK_OK( + func->AddKernel(Type::DECIMAL32, {InputType(Type::DECIMAL32)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; DCHECK_OK( func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); exec = CastFunctor::Exec; @@ -848,7 +1013,13 @@ std::shared_ptr GetCastToDecimal256() { } // Cast from other decimal - auto exec = CastFunctor::Exec; + auto exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL32, {InputType(Type::DECIMAL32)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; + DCHECK_OK( + func->AddKernel(Type::DECIMAL64, {InputType(Type::DECIMAL64)}, sig_out_ty, exec)); + exec = CastFunctor::Exec; DCHECK_OK( func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); exec = CastFunctor::Exec; @@ -950,6 +1121,8 @@ std::vector> GetNumericCasts() { auto cast_double = GetCastToFloating("cast_double"); functions.push_back(cast_double); + functions.push_back(GetCastToDecimal32()); + functions.push_back(GetCastToDecimal64()); functions.push_back(GetCastToDecimal128()); functions.push_back(GetCastToDecimal256()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4edf00225d317..7186612d25a76 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -683,7 +683,8 @@ void AddNumberToStringCasts(CastFunction* func) { template void AddDecimalToStringCasts(CastFunction* func) { auto out_ty = TypeTraits::type_singleton(); - for (const auto& in_tid : std::vector{Type::DECIMAL128, Type::DECIMAL256}) { + for (const auto& in_tid : std::vector{Type::DECIMAL32, Type::DECIMAL64, + Type::DECIMAL128, Type::DECIMAL256}) { DCHECK_OK( func->AddKernel(in_tid, {in_tid}, out_ty, GenerateDecimal(in_tid), diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 33a01425508e0..80d5b3c46cae1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -447,6 +447,159 @@ TEST(Cast, IntToFloating) { CastOptions::Safe(float64())); } +TEST(Cast, Decimal32ToInt) { + auto options = CastOptions::Safe(int32()); + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_overflow_no_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "02.00000", + "-11.00000", + "22.00000", + "-121.00000", + null])"); + CheckCast(no_overflow_no_truncation, + ArrayFromJSON(int32(), "[2, -11, 22, -121, null]"), options); + } + } + + for (bool allow_int_overflow : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + auto truncation_but_no_overflow = ArrayFromJSON(decimal32(9, 5), R"([ + "02.10000", + "-11.00450", + "22.00045", + "-121.12100", + null])"); + + 
options.allow_decimal_truncate = true; + CheckCast(truncation_but_no_overflow, + ArrayFromJSON(int32(), "[2, -11, 22, -121, null]"), options); + + options.allow_decimal_truncate = false; + CheckCastFails(truncation_but_no_overflow, options); + } + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_and_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "1234.00453", + "9999.00344", + null])"); + + if (options.allow_decimal_truncate) { + CheckCast(overflow_and_truncation, ArrayFromJSON(int32(), "[1234, 9999, null]"), + options); + } else { + CheckCastFails(overflow_and_truncation, options); + } + } + } + + Decimal32Builder builder(decimal32(9, -3)); + for (auto d : {Decimal32("12345000."), Decimal32("-12000000.")}) { + ASSERT_OK_AND_ASSIGN(d, d.Rescale(0, -3)); + ASSERT_OK(builder.Append(d)); + } + ASSERT_OK_AND_ASSIGN(auto negative_scale, builder.Finish()); + options.allow_int_overflow = true; + options.allow_decimal_truncate = true; + CheckCast(negative_scale, ArrayFromJSON(int32(), "[12345000, -12000000]"), options); +} + +TEST(Cast, Decimal64ToInt) { + auto options = CastOptions::Safe(int64()); + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_overflow_no_truncation = ArrayFromJSON(decimal64(18, 10), R"([ + "02.0000000000", + "-11.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + CheckCast(no_overflow_no_truncation, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + } + } + + for (bool allow_int_overflow : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + auto truncation_but_no_overflow = ArrayFromJSON(decimal64(18, 10), R"([ + "02.1000000000", + "-11.0000004500", + "22.0000004500", + "-121.1210000000", + null])"); + + options.allow_decimal_truncate = true; + CheckCast(truncation_but_no_overflow, + ArrayFromJSON(int32(), "[2, -11, 22, -121, null]"), options); + + options.allow_decimal_truncate = false; + CheckCastFails(truncation_but_no_overflow, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_no_truncation = ArrayFromJSON(decimal64(18, 5), R"([ + "1234567890123.00000", + "9999999999999.00000", + null])"); + + options.allow_int_overflow = true; + CheckCast(overflow_no_truncation, + ArrayFromJSON(int64(), "[1234567890123, 9999999999999, null]"), options); + + options.to_type = int32(); + options.allow_int_overflow = false; + CheckCastFails(overflow_no_truncation, options); + } + + for (bool allow_int_overflow : {false, true}) { + for (bool allow_decimal_truncate : {false, true}) { + options.allow_int_overflow = allow_int_overflow; + options.allow_decimal_truncate = allow_decimal_truncate; + options.to_type = int32(); + + auto overflow_and_truncation = ArrayFromJSON(decimal64(18, 5), R"([ + "1234567890123.45345", + "9999999999999.00344", + null])"); + + if (options.allow_int_overflow && options.allow_decimal_truncate) { + CheckCast(overflow_and_truncation, + ArrayFromJSON(int32(), + // 1234567890123 % 2**32, 9999999999999 % 2**32 + "[1912276171, 1316134911, null]"), + options); + } else { + CheckCastFails(overflow_and_truncation, options); + } + } + } + + Decimal64Builder 
builder(decimal64(18, -4)); + for (auto d : {Decimal64("1234567890000."), Decimal64("-120000.")}) { + ASSERT_OK_AND_ASSIGN(d, d.Rescale(0, -4)); + ASSERT_OK(builder.Append(d)); + } + ASSERT_OK_AND_ASSIGN(auto negative_scale, builder.Finish()); + options.allow_int_overflow = true; + options.allow_decimal_truncate = true; + CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); +} + TEST(Cast, Decimal128ToInt) { auto options = CastOptions::Safe(int64()); @@ -629,11 +782,14 @@ TEST(Cast, Decimal256ToInt) { } TEST(Cast, IntegerToDecimal) { - for (auto decimal_type : {decimal128(22, 2), decimal256(22, 2)}) { + for (auto decimal_type : + {decimal32(9, 2), decimal64(18, 2), decimal128(22, 2), decimal256(22, 2)}) { for (auto integer_type : kIntegerTypes) { - CheckCast( - ArrayFromJSON(integer_type, "[0, 7, null, 100, 99]"), - ArrayFromJSON(decimal_type, R"(["0.00", "7.00", null, "100.00", "99.00"])")); + if (decimal_type->bit_width() > integer_type->bit_width()) { + CheckCast( + ArrayFromJSON(integer_type, "[0, 7, null, 100, 99]"), + ArrayFromJSON(decimal_type, R"(["0.00", "7.00", null, "100.00", "99.00"])")); + } } } @@ -652,6 +808,12 @@ TEST(Cast, IntegerToDecimal) { { CastOptions options; + options.to_type = decimal32(9, 3); + CheckCastFails(ArrayFromJSON(int32(), "[0]"), options); + + options.to_type = decimal64(18, 3); + CheckCastFails(ArrayFromJSON(int64(), "[0]"), options); + options.to_type = decimal128(5, 3); CheckCastFails(ArrayFromJSON(int8(), "[0]"), options); @@ -660,6 +822,166 @@ TEST(Cast, IntegerToDecimal) { } } +TEST(Cast, Decimal32ToDecimal32) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal32(9, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + CheckCast(expected, no_truncation, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal32(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal32(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_4_2, d_5_2, options); + } + + auto d_9_5 = ArrayFromJSON(decimal32(9, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d_6_0 = ArrayFromJSON(decimal32(6, 0), R"([ + "-02.", + "30.", + null])"); + + auto d_9_5_roundtripped = ArrayFromJSON(decimal32(9, 5), R"([ + "-02.00000", + "30.00000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d_9_5, d_6_0, options); + CheckCast(d_6_0, d_9_5_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d_6_0->type(); + CheckCastFails(d_9_5, options); + CheckCast(d_6_0, d_9_5_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d_4_2 = ArrayFromJSON(decimal32(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal32(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal32(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal32(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, 
invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d_4_2, options); + } +} + +TEST(Cast, Decimal64ToDecimal64) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal64(18, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal64(9, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + CheckCast(expected, no_truncation, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal64(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal64(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_4_2, d_5_2, options); + } + + auto d_18_10 = ArrayFromJSON(decimal64(18, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d_12_0 = ArrayFromJSON(decimal64(12, 0), R"([ + "-02.", + "30.", + null])"); + + auto d_18_10_roundtripped = ArrayFromJSON(decimal64(18, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d_18_10, d_12_0, options); + CheckCast(d_12_0, d_18_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d_12_0->type(); + CheckCastFails(d_18_10, options); + CheckCast(d_12_0, d_18_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d_4_2 = ArrayFromJSON(decimal64(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal64(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal64(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal64(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d_4_2, options); + } +} + TEST(Cast, Decimal128ToDecimal128) { CastOptions options; @@ -820,19 +1142,19 @@ TEST(Cast, Decimal256ToDecimal256) { } } -TEST(Cast, Decimal128ToDecimal256) { +TEST(Cast, Decimal32ToDecimal64) { CastOptions options; for (bool allow_decimal_truncate : {false, true}) { options.allow_decimal_truncate = allow_decimal_truncate; - auto no_truncation = ArrayFromJSON(decimal128(38, 10), R"([ - "02.0000000000", - "30.0000000000", - "22.0000000000", - "-121.0000000000", + auto no_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", null])"); - auto expected = ArrayFromJSON(decimal256(48, 0), R"([ + auto expected = ArrayFromJSON(decimal64(16, 0), R"([ "02.", "30.", "22.", @@ -846,47 +1168,731 @@ TEST(Cast, Decimal128ToDecimal256) { options.allow_decimal_truncate = allow_decimal_truncate; // Same scale, different precision - auto d_5_2 = ArrayFromJSON(decimal128(5, 2), R"([ + auto d_5_2 = ArrayFromJSON(decimal32(5, 2), R"([ "12.34", "0.56"])"); - auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + auto d_4_2 = ArrayFromJSON(decimal64(4, 2), R"([ "12.34", "0.56"])"); - auto d_40_2 = ArrayFromJSON(decimal256(40, 2), R"([ + auto d_16_2 = 
ArrayFromJSON(decimal64(16, 2), R"([ "12.34", "0.56"])"); CheckCast(d_5_2, d_4_2, options); - CheckCast(d_5_2, d_40_2, options); + CheckCast(d_5_2, d_16_2, options); } - auto d128_38_10 = ArrayFromJSON(decimal128(38, 10), R"([ - "-02.1234567890", - "30.1234567890", + auto d32_7_5 = ArrayFromJSON(decimal32(7, 5), R"([ + "-02.12345", + "30.12345", null])"); - auto d128_28_0 = ArrayFromJSON(decimal128(28, 0), R"([ + auto d32_9_0 = ArrayFromJSON(decimal32(9, 0), R"([ "-02.", "30.", null])"); - auto d256_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + auto d64_14_0 = ArrayFromJSON(decimal64(14, 0), R"([ "-02.", "30.", null])"); - auto d256_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + auto d64_18_10_roundtripped = ArrayFromJSON(decimal64(18, 10), R"([ "-02.0000000000", "30.0000000000", null])"); // Rescale which leads to truncation options.allow_decimal_truncate = true; - CheckCast(d128_38_10, d256_28_0, options); - CheckCast(d128_28_0, d256_38_10_roundtripped, options); + CheckCast(d32_7_5, d64_14_0, options); + CheckCast(d32_9_0, d64_18_10_roundtripped, options); options.allow_decimal_truncate = false; - options.to_type = d256_28_0->type(); + options.to_type = d64_14_0->type(); + CheckCastFails(d32_7_5, options); + CheckCast(d32_9_0, d64_18_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d32_4_2 = ArrayFromJSON(decimal32(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal64(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal64(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal64(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d32_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d32_4_2, options); + } +} + +TEST(Cast, Decimal32ToDecimal128) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal128(16, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal32(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal128(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_16_2 = ArrayFromJSON(decimal128(16, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_16_2, options); + } + + auto d32_7_5 = ArrayFromJSON(decimal32(7, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d32_9_0 = ArrayFromJSON(decimal32(9, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_14_0 = ArrayFromJSON(decimal128(14, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_38_10_roundtripped = ArrayFromJSON(decimal128(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d32_7_5, d128_14_0, options); + CheckCast(d32_9_0, d128_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d128_14_0->type(); + CheckCastFails(d32_7_5, options); + 
CheckCast(d32_9_0, d128_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d32_4_2 = ArrayFromJSON(decimal32(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal128(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal128(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal128(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d32_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d32_4_2, options); + } +} + +TEST(Cast, Decimal32ToDecimal256) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal32(9, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal256(16, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal32(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_16_2 = ArrayFromJSON(decimal256(16, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_16_2, options); + } + + auto d32_7_5 = ArrayFromJSON(decimal32(7, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d32_9_0 = ArrayFromJSON(decimal32(9, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_14_0 = ArrayFromJSON(decimal256(14, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_76_10_roundtripped = ArrayFromJSON(decimal256(76, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d32_7_5, d256_14_0, options); + CheckCast(d32_9_0, d256_76_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d256_14_0->type(); + CheckCastFails(d32_7_5, options); + CheckCast(d32_9_0, d256_76_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d32_4_2 = ArrayFromJSON(decimal32(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d32_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d32_4_2, options); + } +} + +TEST(Cast, Decimal64ToDecimal32) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal64(18, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal32(9, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; 
+ + // Same scale, different precision + auto d_12_2 = ArrayFromJSON(decimal64(12, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal32(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_12_2, d_4_2, options); + } + + auto d64_15_10 = ArrayFromJSON(decimal64(15, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d64_12_0 = ArrayFromJSON(decimal64(12, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_6_0 = ArrayFromJSON(decimal32(6, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_9_5_roundtripped = ArrayFromJSON(decimal32(9, 5), R"([ + "-02.00000", + "30.00000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d64_15_10, d32_6_0, options); + CheckCast(d64_12_0, d32_9_5_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d32_6_0->type(); + CheckCastFails(d64_15_10, options); + CheckCast(d64_12_0, d32_9_5_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d64_4_2 = ArrayFromJSON(decimal64(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal32(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal32(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal32(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d64_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d64_4_2, options); + } +} + +TEST(Cast, Decimal64ToDecimal128) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal64(18, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal128(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal64(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal128(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_16_2 = ArrayFromJSON(decimal128(16, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_16_2, options); + } + + auto d64_16_10 = ArrayFromJSON(decimal64(16, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d64_18_0 = ArrayFromJSON(decimal64(18, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_14_0 = ArrayFromJSON(decimal128(14, 0), R"([ + "-02.", + "30.", + null])"); + + auto d128_38_10_roundtripped = ArrayFromJSON(decimal128(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d64_16_10, d128_14_0, options); + CheckCast(d64_18_0, d128_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d128_14_0->type(); + CheckCastFails(d64_16_10, options); + CheckCast(d64_18_0, d128_38_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d64_4_2 = ArrayFromJSON(decimal64(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal128(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal128(4, 3), 
R"(["12.340"])"), + ArrayFromJSON(decimal128(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d64_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d64_4_2, options); + } +} + +TEST(Cast, Decimal64ToDecimal256) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal64(18, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(16, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal64(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_16_2 = ArrayFromJSON(decimal256(16, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_16_2, options); + } + + auto d64_16_10 = ArrayFromJSON(decimal64(16, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d64_18_0 = ArrayFromJSON(decimal64(18, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_14_0 = ArrayFromJSON(decimal256(14, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_76_10_roundtripped = ArrayFromJSON(decimal256(76, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d64_16_10, d256_14_0, options); + CheckCast(d64_18_0, d256_76_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d256_14_0->type(); + CheckCastFails(d64_16_10, options); + CheckCast(d64_18_0, d256_76_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d64_4_2 = ArrayFromJSON(decimal64(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal256(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal256(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal256(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d64_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d64_4_2, options); + } +} + +TEST(Cast, Decimal128ToDecimal32) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal128(26, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal32(9, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_28_2 = ArrayFromJSON(decimal128(28, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal32(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_28_2, d_4_2, options); + } + + 
auto d128_28_5 = ArrayFromJSON(decimal128(28, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d128_22_0 = ArrayFromJSON(decimal128(22, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_7_0 = ArrayFromJSON(decimal32(7, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_9_5_roundtripped = ArrayFromJSON(decimal32(9, 5), R"([ + "-02.00000", + "30.00000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d128_28_5, d32_7_0, options); + CheckCast(d128_22_0, d32_9_5_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d32_7_0->type(); + CheckCastFails(d128_28_5, options); + CheckCast(d128_22_0, d32_9_5_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d128_4_2 = ArrayFromJSON(decimal128(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal32(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal32(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal32(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d128_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d128_4_2, options); + } +} + +TEST(Cast, Decimal128ToDecimal64) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal128(26, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal64(15, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_28_2 = ArrayFromJSON(decimal128(28, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal64(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_28_2, d_4_2, options); + } + + auto d128_28_10 = ArrayFromJSON(decimal128(28, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d128_22_0 = ArrayFromJSON(decimal128(22, 0), R"([ + "-02.", + "30.", + null])"); + + auto d64_12_0 = ArrayFromJSON(decimal64(12, 0), R"([ + "-02.", + "30.", + null])"); + + auto d64_18_10_roundtripped = ArrayFromJSON(decimal64(18, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d128_28_10, d64_12_0, options); + CheckCast(d128_22_0, d64_18_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d64_12_0->type(); + CheckCastFails(d128_28_10, options); + CheckCast(d128_22_0, d64_18_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d128_4_2 = ArrayFromJSON(decimal128(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal64(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal64(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal64(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d128_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + 
CheckCastFails(d128_4_2, options); + } +} + +TEST(Cast, Decimal128ToDecimal256) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal128(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal256(48, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal128(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal256(4, 2), R"([ + "12.34", + "0.56"])"); + auto d_40_2 = ArrayFromJSON(decimal256(40, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_5_2, d_40_2, options); + } + + auto d128_38_10 = ArrayFromJSON(decimal128(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d128_28_0 = ArrayFromJSON(decimal128(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_28_0 = ArrayFromJSON(decimal256(28, 0), R"([ + "-02.", + "30.", + null])"); + + auto d256_38_10_roundtripped = ArrayFromJSON(decimal256(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d128_38_10, d256_28_0, options); + CheckCast(d128_28_0, d256_38_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d256_28_0->type(); CheckCastFails(d128_38_10, options); CheckCast(d128_28_0, d256_38_10_roundtripped, options); @@ -907,6 +1913,172 @@ TEST(Cast, Decimal128ToDecimal256) { } } +TEST(Cast, Decimal256ToDecimal32) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(42, 5), R"([ + "02.00000", + "30.00000", + "22.00000", + "-121.00000", + null])"); + auto expected = ArrayFromJSON(decimal32(9, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_28_2 = ArrayFromJSON(decimal256(42, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal32(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_28_2, d_4_2, options); + } + + auto d256_52_5 = ArrayFromJSON(decimal256(52, 5), R"([ + "-02.12345", + "30.12345", + null])"); + + auto d256_42_0 = ArrayFromJSON(decimal256(42, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_7_0 = ArrayFromJSON(decimal32(7, 0), R"([ + "-02.", + "30.", + null])"); + + auto d32_9_5_roundtripped = ArrayFromJSON(decimal32(9, 5), R"([ + "-02.00000", + "30.00000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d256_52_5, d32_7_0, options); + CheckCast(d256_42_0, d32_9_5_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d32_7_0->type(); + CheckCastFails(d256_52_5, options); + CheckCast(d256_42_0, d32_9_5_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d256_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + 
ArrayFromJSON(decimal32(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal32(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal32(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d256_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d256_4_2, options); + } +} + +TEST(Cast, Decimal256ToDecimal64) { + CastOptions options; + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + auto no_truncation = ArrayFromJSON(decimal256(42, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal64(15, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + } + + for (bool allow_decimal_truncate : {false, true}) { + options.allow_decimal_truncate = allow_decimal_truncate; + + // Same scale, different precision + auto d_42_2 = ArrayFromJSON(decimal256(42, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal64(4, 2), R"([ + "12.34", + "0.56"])"); + + CheckCast(d_42_2, d_4_2, options); + } + + auto d256_52_10 = ArrayFromJSON(decimal256(52, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); + + auto d256_42_0 = ArrayFromJSON(decimal256(42, 0), R"([ + "-02.", + "30.", + null])"); + + auto d64_12_0 = ArrayFromJSON(decimal64(12, 0), R"([ + "-02.", + "30.", + null])"); + + auto d64_18_10_roundtripped = ArrayFromJSON(decimal64(18, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); + + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d256_52_10, d64_12_0, options); + CheckCast(d256_42_0, d64_18_10_roundtripped, options); + + options.allow_decimal_truncate = false; + options.to_type = d64_12_0->type(); + CheckCastFails(d256_52_10, options); + CheckCast(d256_42_0, d64_18_10_roundtripped, options); + + // Precision loss without rescale leads to truncation + auto d256_4_2 = ArrayFromJSON(decimal256(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal64(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal64(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal64(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + ASSERT_OK_AND_ASSIGN(auto invalid, Cast(d256_4_2, expected->type(), options)); + ASSERT_RAISES(Invalid, invalid.make_array()->ValidateFull()); + + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d256_4_2, options); + } +} + TEST(Cast, Decimal256ToDecimal128) { CastOptions options; @@ -992,7 +2164,8 @@ TEST(Cast, Decimal256ToDecimal128) { TEST(Cast, FloatingToDecimal) { for (auto float_type : {float32(), float64()}) { - for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { + for (auto decimal_type : + {decimal32(5, 2), decimal64(5, 2), decimal128(5, 2), decimal256(5, 2)}) { CheckCast( ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "123.46", "999.99"])")); @@ -1036,7 +2209,8 @@ TEST(Cast, FloatingToDecimal) { TEST(Cast, DecimalToFloating) { for (auto float_type : {float32(), float64()}) { - for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { + for (auto decimal_type : + {decimal32(5, 2), decimal64(5, 2), decimal128(5, 2), decimal256(5, 2)}) { 
CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); } @@ -1048,7 +2222,8 @@ TEST(Cast, DecimalToFloating) { TEST(Cast, DecimalToString) { for (auto string_type : {utf8(), utf8_view(), large_utf8()}) { - for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { + for (auto decimal_type : + {decimal32(5, 2), decimal64(5, 2), decimal128(5, 2), decimal256(5, 2)}) { CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])")); } @@ -1960,7 +3135,8 @@ TEST(Cast, StringToFloating) { TEST(Cast, StringToDecimal) { for (auto string_type : {utf8(), large_utf8()}) { - for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { + for (auto decimal_type : + {decimal32(5, 2), decimal64(5, 2), decimal128(5, 2), decimal256(5, 2)}) { auto strings = ArrayFromJSON(string_type, R"(["0.01", null, "127.32", "200.43", "0.54"])"); auto decimals = diff --git a/cpp/src/arrow/compute/key_map_internal.cc b/cpp/src/arrow/compute/key_map_internal.cc index 9e6d60ab5032b..f134c9145535b 100644 --- a/cpp/src/arrow/compute/key_map_internal.cc +++ b/cpp/src/arrow/compute/key_map_internal.cc @@ -281,13 +281,18 @@ void SwissTable::early_filter_imp(const int num_keys, const uint32_t* hashes, // When we reach this limit, we need to break processing of any further rows and resize. // uint64_t SwissTable::num_groups_for_resize() const { - // Resize small hash tables when 50% full (up to 12KB). - // Resize large hash tables when 75% full. + // Consider N = 9 (aka 2 ^ 9 = 512 blocks) as small. + // When N = 9, a slot id takes N + 3 = 12 bits, rounded up to 16 bits. This is also the + // number of bits needed for a key id. Since each slot stores a status byte and a key + // id, then a slot takes 1 byte + 16 bits = 3 bytes. Therefore a block of 8 slots takes + // 24 bytes. The threshold of a small hash table ends up being 24 bytes * 512 = 12 KB. constexpr int log_blocks_small_ = 9; uint64_t num_slots = 1ULL << (log_blocks_ + 3); if (log_blocks_ <= log_blocks_small_) { + // Resize small hash tables when 50% full. return num_slots / 2; } else { + // Resize large hash tables when 75% full. 
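 // For instance, at log_blocks_ = 10 there are 1ULL << 13 = 8192 slots, so the
 // resize threshold is 8192 * 3 / 4 = 6144 groups; at log_blocks_ = 9 (still
 // counted as "small") it would instead be 4096 / 2 = 2048 groups.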
return num_slots * 3 / 4; } } diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 5adb06e540009..60f1a6a21e264 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -319,6 +319,9 @@ class ARROW_EXPORT ResizableArrayData { /// \brief The current length (in rows) of the array int num_rows() const { return num_rows_; } + /// \brief The current allocated length (in rows) of the array + int num_rows_allocated() const { return num_rows_allocated_; } + /// \brief A non-owning view into this array KeyColumnArray column_array() const; @@ -347,6 +350,11 @@ class ARROW_EXPORT ResizableArrayData { /// length binary data uint8_t* mutable_data(int i) { return buffers_[i]->mutable_data(); } + template + T* mutable_data_as(int i) { + return reinterpret_cast(mutable_data(i)); + } + private: static constexpr int64_t kNumPaddingBytes = 64; int log_num_rows_min_; diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index 4b5252076af53..5513007aff627 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -22,7 +22,6 @@ #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/result.h" -#include "arrow/result_internal.h" #include "arrow/stl_allocator.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" @@ -129,15 +128,15 @@ class ColumnPopulator { // threading overhead would not be justified. ctx.set_use_threads(false); if (data.type() && is_large_binary_like(data.type()->id())) { - ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), - compute::CastOptions(), &ctx)); + ARROW_ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); } else { auto casted = compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx); if (casted.ok()) { array_ = std::move(casted).ValueOrDie(); } else if (casted.status().IsCapacityError()) { - ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), - compute::CastOptions(), &ctx)); + ARROW_ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); } else { return casted.status(); } @@ -501,8 +500,8 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { return Status::Invalid("Null string cannot contain quotes."); } - ASSIGN_OR_RAISE(std::shared_ptr null_string, - arrow::AllocateBuffer(options.null_string.length())); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr null_string, + arrow::AllocateBuffer(options.null_string.length())); memcpy(null_string->mutable_data(), options.null_string.data(), options.null_string.length()); @@ -511,7 +510,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { for (int col = 0; col < schema->num_fields(); col++) { const std::string& end_chars = col < schema->num_fields() - 1 ? 
delimiter : options.eol; - ASSIGN_OR_RAISE( + ARROW_ASSIGN_OR_RAISE( populators[col], MakePopulator(*schema->field(col), end_chars, options.delimiter, null_string, options.quoting_style, options.io_context.pool())); @@ -528,7 +527,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { Status WriteRecordBatch(const RecordBatch& batch) override { RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options_.batch_size); for (auto maybe_slice : iterator) { - ASSIGN_OR_RAISE(std::shared_ptr slice, maybe_slice); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr slice, maybe_slice); RETURN_NOT_OK(TranslateMinimalBatch(*slice)); RETURN_NOT_OK(sink_->Write(data_buffer_)); stats_.num_record_batches++; @@ -570,10 +569,11 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { Status PrepareForContentsWrite() { // Only called once, as part of initialization if (data_buffer_ == nullptr) { - ASSIGN_OR_RAISE(data_buffer_, - AllocateResizableBuffer( - options_.batch_size * schema_->num_fields() * kColumnSizeGuess, - options_.io_context.pool())); + ARROW_ASSIGN_OR_RAISE( + data_buffer_, + AllocateResizableBuffer( + options_.batch_size * schema_->num_fields() * kColumnSizeGuess, + options_.io_context.pool())); } return Status::OK(); } @@ -665,24 +665,24 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { Status WriteCSV(const Table& table, const WriteOptions& options, arrow::io::OutputStream* output) { - ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, table.schema(), options)); + ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, table.schema(), options)); RETURN_NOT_OK(writer->WriteTable(table)); return writer->Close(); } Status WriteCSV(const RecordBatch& batch, const WriteOptions& options, arrow::io::OutputStream* output) { - ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, batch.schema(), options)); + ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, batch.schema(), options)); RETURN_NOT_OK(writer->WriteRecordBatch(batch)); return writer->Close(); } Status WriteCSV(const std::shared_ptr& reader, const WriteOptions& options, arrow::io::OutputStream* output) { - ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, reader->schema(), options)); + ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(output, reader->schema(), options)); std::shared_ptr batch; while (true) { - ASSIGN_OR_RAISE(batch, reader->Next()); + ARROW_ASSIGN_OR_RAISE(batch, reader->Next()); if (batch == nullptr) break; RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); } diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 703179da94093..4fccf4ddbbb48 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -27,7 +27,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" -#include "arrow/result_internal.h" +#include "arrow/result.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/type.h" @@ -287,19 +287,19 @@ class TestWriteCSV : public ::testing::TestWithParam { template Result ToCsvString(const Data& data, const WriteOptions& options) { std::shared_ptr out; - ASSIGN_OR_RAISE(out, io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(out, io::BufferOutputStream::Create()); RETURN_NOT_OK(WriteCSV(data, options, out.get())); - ASSIGN_OR_RAISE(std::shared_ptr buffer, out->Finish()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr buffer, out->Finish()); return std::string(reinterpret_cast(buffer->data()), buffer->size()); } Result ToCsvStringUsingWriter(const Table& data, 
const WriteOptions& options) { std::shared_ptr out; - ASSIGN_OR_RAISE(out, io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(out, io::BufferOutputStream::Create()); // Write row-by-row - ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(out, data.schema(), options)); + ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(out, data.schema(), options)); TableBatchReader reader(data); reader.set_chunksize(1); std::shared_ptr batch; @@ -310,7 +310,7 @@ class TestWriteCSV : public ::testing::TestWithParam { } RETURN_NOT_OK(writer->Close()); EXPECT_EQ(data.num_rows(), writer->stats().num_record_batches); - ASSIGN_OR_RAISE(std::shared_ptr buffer, out->Finish()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr buffer, out->Finish()); return std::string(reinterpret_cast(buffer->data()), buffer->size()); } }; diff --git a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in index 66b0302cbca80..4573ac3718557 100644 --- a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in +++ b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in @@ -26,10 +26,12 @@ @PACKAGE_INIT@ +set(ARROW_DATASET_REQUIRED_DEPENDENCIES "@ARROW_DATASET_REQUIRED_DEPENDENCIES@") + include(CMakeFindDependencyMacro) -find_dependency(Arrow) -find_dependency(ArrowAcero) -find_dependency(Parquet) +foreach(dependency ${ARROW_DATASET_REQUIRED_DEPENDENCIES}) + find_dependency(${dependency}) +endforeach() include("${CMAKE_CURRENT_LIST_DIR}/ArrowDatasetTargets.cmake") diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt index e48bcfaf65bcb..bdb89ee8914f8 100644 --- a/cpp/src/arrow/dataset/CMakeLists.txt +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -32,8 +32,10 @@ set(ARROW_DATASET_SRCS scan_node.cc) set(ARROW_DATASET_PKG_CONFIG_REQUIRES "arrow-acero") +set(ARROW_DATASET_REQUIRED_DEPENDENCIES Arrow ArrowAcero) if(ARROW_PARQUET) string(APPEND ARROW_DATASET_PKG_CONFIG_REQUIRES " parquet") + list(APPEND ARROW_DATASET_REQUIRED_DEPENDENCIES Parquet) endif() set(ARROW_DATASET_STATIC_LINK_LIBS) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 78f4ad1edd9a9..4638bb12c783c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { std::string tenant_id; std::string client_id; std::string client_secret; + + // These query parameters are the union of the following docs: + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas + // (excluding parameters for table storage only) + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas + static const std::set sas_token_query_parameters = { + "sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr", + "skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid", + "scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct", + }; + ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items()); for (const auto& kv : options_items) { if (kv.first == "blob_storage_authority") { @@ -147,6 +159,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { } else if (kv.first == "background_writes") { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); + } else if (sas_token_query_parameters.find(kv.first) != 
+ sas_token_query_parameters.end()) { + credential_kind = CredentialKind::kSASToken; } else { return Status::Invalid( "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); @@ -180,6 +195,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { case CredentialKind::kEnvironment: RETURN_NOT_OK(ConfigureEnvironmentCredential()); break; + case CredentialKind::kSASToken: + // Reconstructing the SAS token without the other URI query parameters is awkward + // because some parts are URI escaped and some parts are not. Instead we just + // pass through the entire query string and Azure ignores the extra query + // parameters. + RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string())); + break; default: // Default credential break; @@ -225,7 +247,6 @@ Result AzureOptions::FromUri(const std::string& uri_string, } bool AzureOptions::Equals(const AzureOptions& other) const { - // TODO(GH-38598): update here when more auth methods are added. const bool equals = blob_storage_authority == other.blob_storage_authority && dfs_storage_authority == other.dfs_storage_authority && blob_storage_scheme == other.blob_storage_scheme && @@ -243,6 +264,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const { case CredentialKind::kStorageSharedKey: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; + case CredentialKind::kSASToken: + return sas_token_ == other.sas_token_; case CredentialKind::kClientSecret: case CredentialKind::kCLI: case CredentialKind::kManagedIdentity: @@ -311,6 +334,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke return Status::OK(); } +Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) { + credential_kind_ = CredentialKind::kSASToken; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } + sas_token_ = sas_token; + return Status::OK(); +} + Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { @@ -372,6 +404,9 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kStorageSharedKey: return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique(AccountBlobUrl(account_name) + + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -404,29 +439,13 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kStorageSharedKey: return std::make_unique( AccountDfsUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique( + AccountBlobUrl(account_name) + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } -Result AzureOptions::GenerateSASToken( - Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const { - using SasProtocol = Storage::Sas::SasProtocol; - builder->Protocol = - blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - if (storage_shared_key_credential_) { - return builder->GenerateSasToken(*storage_shared_key_credential_); - } else { - // GH-39344: This part isn't tested. This may not work. 
- try { - auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); - return builder->GenerateSasToken(delegation_key_response.Value, account_name); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", - client->GetUrl(), "'."); - } - } -} - namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a @@ -3161,19 +3180,7 @@ class AzureFileSystem::Impl { if (src == dest) { return Status::OK(); } - std::string sas_token; - { - Storage::Sas::BlobSasBuilder builder; - std::chrono::seconds available_period(60); - builder.ExpiresOn = std::chrono::system_clock::now() + available_period; - builder.BlobContainerName = src.container; - builder.BlobName = src.path; - builder.Resource = Storage::Sas::BlobSasResource::Blob; - builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); - ARROW_ASSIGN_OR_RAISE( - sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); - } - auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token; + auto src_url = GetBlobClient(src.container, src.path).GetUrl(); auto dest_blob_client = GetBlobClient(dest.container, dest.path); if (!dest.path.empty()) { auto dest_parent = dest.parent(); @@ -3186,9 +3193,21 @@ class AzureFileSystem::Impl { } } try { - dest_blob_client.CopyFromUri(src_url); + // We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger + // than 256 MiB and it doesn't require generating a SAS token to authenticate + // reading a source blob in the same storage account. + auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); + // For large blobs, the copy operation may be slow so we need to poll until it + // completes. We use a polling interval of 1 second. + copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + // StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed. + return ExceptionToStatus( + exception, "Failed to start blob copy or poll status of ongoing copy. (", + src_url, " -> ", dest_blob_client.GetUrl(), ")"); + } catch (const Azure::Core::RequestFailedException& exception) { + // A GetProperties call inside PollUntilDone returned a failed CopyStatus. + return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ", dest_blob_client.GetUrl(), ")"); } return Status::OK(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index c5e5091256959..ee0956afdd7a9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs { class BlobServiceClient; } -namespace Azure::Storage::Sas { -struct BlobSasBuilder; -} - namespace Azure::Storage::Files::DataLake { class DataLakeFileSystemClient; class DataLakeServiceClient; @@ -120,6 +116,7 @@ struct ARROW_EXPORT AzureOptions { kDefault, kAnonymous, kStorageSharedKey, + kSASToken, kClientSecret, kManagedIdentity, kCLI, @@ -129,6 +126,7 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr storage_shared_key_credential_; + std::string sas_token_; mutable std::shared_ptr token_credential_; public: @@ -180,6 +178,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. 
AzureOptions::ConfigureClientSecretCredential() is called. + /// * A SAS token is made up of several query parameters. Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSASCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri @@ -189,6 +190,7 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(); Status ConfigureAnonymousCredential(); Status ConfigureAccountKeyCredential(const std::string& account_key); + Status ConfigureSASCredential(const std::string& sas_token); Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); @@ -207,10 +209,6 @@ struct ARROW_EXPORT AzureOptions { Result> MakeDataLakeServiceClient() const; - - Result GenerateSASToken( - Azure::Storage::Sas::BlobSasBuilder* builder, - Azure::Storage::Blobs::BlobServiceClient* client) const; }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index a04977bdee076..7c1d450051901 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -387,6 +387,30 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest { // builddir/main/../../threads.c:580:10 #2 0x7fa914b1cd1e in xmlGetGlobalState // builddir/main/../../threads.c:666:31 bool have_false_positive_memory_leak_with_generator() const override { return true; } + // This false positive leak is similar to the one pinpointed in the + // have_false_positive_memory_leak_with_generator() comments above, + // though the stack trace is different. It happens when a block list + // is committed from a background thread. 
+ // + // clang-format off + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 calloc + // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) + // #2 __xmlDefaultBufferSize + // #3 xmlBufferCreate + // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() + // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList + // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList + // #7 arrow::fs::(anonymous namespace)::CommitBlockList + // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' + // clang-format on + // + // TODO perhaps remove this skip once we can rely on + // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 + // + // Also note that ClickHouse has a workaround for a similar issue: + // https://github.com/ClickHouse/ClickHouse/pull/45796 + bool have_false_positive_memory_leak_with_async_close() const override { return true; } BaseAzureEnv* env_; std::shared_ptr azure_fs_; @@ -690,6 +714,36 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment); } + void TestFromUriCredentialSASToken() { + const std::string sas_token = + "?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://file_system@account.dfs.core.windows.net/" + sas_token, nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, sas_token); + } + + void TestFromUriCredentialSASTokenWithOtherParameters() { + const std::string uri_query_string = + "?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%" + "2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://account@127.0.0.1:10000/container/dir/blob" + uri_query_string, + nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, uri_query_string); + ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.blob_storage_scheme, "http"); + ASSERT_EQ(options.dfs_storage_scheme, "http"); + } + void TestFromUriCredentialInvalid() { ASSERT_RAISES(Invalid, AzureOptions::FromUri( "abfs://file_system@account.dfs.core.windows.net/dir/file?" 
@@ -777,6 +831,10 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) { TEST_F(TestAzureOptions, FromUriCredentialEnvironment) { TestFromUriCredentialEnvironment(); } +TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); } +TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) { + TestFromUriCredentialSASTokenWithOtherParameters(); +} TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); } TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { TestFromUriBlobStorageAuthority(); @@ -912,6 +970,20 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } + Result GetContainerSASToken( + const std::string& container_name, + Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) { + std::string sas_token; + Azure::Storage::Sas::BlobSasBuilder builder; + std::chrono::seconds available_period(60); + builder.ExpiresOn = std::chrono::system_clock::now() + available_period; + builder.BlobContainerName = container_name; + builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer; + builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All); + builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp; + return builder.GenerateSasToken(storage_shared_key_credential); + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -1536,29 +1608,7 @@ class TestAzureFileSystem : public ::testing::Test { void TestOpenOutputStreamCloseAsync() { #if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) - // This false positive leak is similar to the one pinpointed in the - // have_false_positive_memory_leak_with_generator() comments above, - // though the stack trace is different. It happens when a block list - // is committed from a background thread. - // - // clang-format off - // Direct leak of 968 byte(s) in 1 object(s) allocated from: - // #0 calloc - // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) - // #2 __xmlDefaultBufferSize - // #3 xmlBufferCreate - // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() - // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList - // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList - // #7 arrow::fs::(anonymous namespace)::CommitBlockList - // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' - // clang-format on - // - // TODO perhaps remove this skip once we can rely on - // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 - // - // Also note that ClickHouse has a workaround for a similar issue: - // https://github.com/ClickHouse/ClickHouse/pull/45796 + // See comment about have_false_positive_memory_leak_with_generator above. if (options_.background_writes) { GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; } @@ -1617,6 +1667,31 @@ class TestAzureFileSystem : public ::testing::Test { AssertObjectContents(fs.get(), path, payload); } + void TestSASCredential() { + auto data = SetUpPreexistingData(); + + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); + ASSERT_OK_AND_ASSIGN( + auto sas_token, + GetContainerSASToken(data.container_name, + Azure::Storage::StorageSharedKeyCredential( + env->account_name(), env->account_key()))); + // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so + // make sure these don't cause problems. 
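+  // For illustration (the values are made up): with a URI like
+  //   abfs://account@127.0.0.1:10000/container?enable_tls=false&sig=...&sv=2024-08-04
+  // the resulting options.sas_token_ is the whole query string, including the
+  // consumed enable_tls parameter, so the token passed to ConfigureSASCredential()
+  // below deliberately mixes dummy parameters into a real SAS token.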
+ ARROW_EXPECT_OK(options.ConfigureSASCredential( + "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) + + "&credential_kind=dummy-value1")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); + + // Test CopyFile because the most obvious implementation requires generating a SAS + // token at runtime which doesn't work when the original auth is SAS token. + ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); + AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher<std::string>>; @@ -2328,6 +2403,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) { TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) { + this->TestSASCredential(); +} + // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) { @@ -2634,6 +2713,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } +TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) { + auto data = SetUpPreexistingData(); + auto data2 = SetUpPreexistingData(); + const auto destination_path = data2.ContainerPath("copy-destination"); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath())); diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index b5765010ec7e9..37619df90fc34 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -630,9 +630,12 @@ Status CopyFiles(const std::vector<FileLocator>& sources, destinations.size(), " paths."); } - auto copy_one_file = [&](int i) { - if (sources[i].filesystem->Equals(destinations[i].filesystem)) { - return sources[i].filesystem->CopyFile(sources[i].path, destinations[i].path); + auto copy_one_file = [&](size_t i, + const FileLocator& source_file_locator) -> Result<Future<>> { + if (source_file_locator.filesystem->Equals(destinations[i].filesystem)) { + RETURN_NOT_OK(source_file_locator.filesystem->CopyFile(source_file_locator.path, + destinations[i].path)); + return Future<>::MakeFinished(); } ARROW_ASSIGN_OR_RAISE(auto source, @@ -642,12 +645,31 @@ Status CopyFiles(const std::vector<FileLocator>& sources, ARROW_ASSIGN_OR_RAISE(auto destination, destinations[i].filesystem->OpenOutputStream( destinations[i].path, metadata)); RETURN_NOT_OK(internal::CopyStream(source, destination, chunk_size, io_context)); - return destination->Close(); + // Using the blocking Close() here can cause reduced performance and deadlocks because + // FileSystem implementations that implement background_writes need to queue and wait + // for other IO thread(s). There is a risk that most or all of the threads in the IO + // thread pool are blocked on a call to Close(), leaving no IO threads left to actually + // fulfil the background writes.
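+  // For example (an illustrative scenario, not part of this change): with an IO
+  // thread pool of two threads and three files being copied, both IO threads could
+  // block inside Close() waiting for background writes that no free thread can run.
+  // Returning the CloseAsync() future instead lets this task finish, frees the IO
+  // thread to drain the background writes, and the returned futures are collected
+  // and waited on after all copies have been spawned (see below).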
+ return destination->CloseAsync(); }; - return ::arrow::internal::OptionalParallelFor( - use_threads, static_cast<int>(sources.size()), std::move(copy_one_file), - io_context.executor()); + // Spawn copy_one_file less urgently than the default, so that background_writes are done + // with higher priority. Otherwise copy_one_file will keep buffering more data in memory + // without giving the background_writes any chance to upload the data and drop it from + // memory. Without this, large copies would cause OOMs. + TaskHints hints{10}; + auto future = ::arrow::internal::OptionalParallelForAsync( + use_threads, sources, std::move(copy_one_file), io_context.executor(), hints); + + // Wait for all the copy_one_file instances to complete. + ARROW_ASSIGN_OR_RAISE(auto copy_close_async_future, future.result()); + + // Wait for all the futures returned by copy_one_file to complete. When the destination + // filesystem uses background_writes this is when most of the upload happens. + for (const auto& result : copy_close_async_future) { + result.Wait(); + } + return Status::OK(); } Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs, diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index a6c897636000e..efe7cff4958ab 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -578,6 +578,67 @@ void GenericFileSystemTest::TestCopyFile(FileSystem* fs) { AssertAllFiles(fs, {"AB/abc", "EF/ghi", "def"}); } +void GenericFileSystemTest::TestCopyFiles(FileSystem* fs) { +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) + if (have_false_positive_memory_leak_with_async_close()) { + GTEST_SKIP() << "Filesystem has a false positive memory leak with async close"; + } +#endif + auto io_thread_pool = + static_cast<::arrow::internal::ThreadPool*>(fs->io_context().executor()); + auto original_threads = io_thread_pool->GetCapacity(); + // Needs to be smaller than the number of files we test with to catch GH-15233 + ASSERT_OK(io_thread_pool->SetCapacity(2)); + // Ensure the thread pool capacity is set back to the original value after the test + auto reset_thread_pool = [io_thread_pool, original_threads](void*) { + ASSERT_OK(io_thread_pool->SetCapacity(original_threads)); + }; + std::unique_ptr reset_thread_pool_guard( + nullptr, reset_thread_pool); + + auto mock_fs = std::make_shared<::arrow::fs::internal::MockFileSystem>( + std::chrono::system_clock::now()); + std::vector<std::string> dirs0{"0", "0/AB", "0/AB/CD"}; + std::map<std::string, std::string> files0{ + {"0/123", "123 data"}, {"0/AB/abc", "abc data"}, {"0/AB/CD/def", "def data"}}; + + std::vector<std::string> dirs0and1{"0", "0/AB", "0/AB/CD", "1", "1/AB", "1/AB/CD"}; + std::map<std::string, std::string> files0and1{ + {"0/123", "123 data"}, {"0/AB/abc", "abc data"}, {"0/AB/CD/def", "def data"}, + {"1/123", "123 data"}, {"1/AB/abc", "abc data"}, {"1/AB/CD/def", "def data"}}; + + ASSERT_OK(mock_fs->CreateDir("0/AB/CD")); + for (const auto& kv : files0) { + CreateFile(mock_fs.get(), kv.first, kv.second); + } + + auto selector0 = arrow::fs::FileSelector{}; + selector0.base_dir = "0"; + selector0.recursive = true; + + ASSERT_OK(CopyFiles(mock_fs, selector0, fs->shared_from_this(), "0")); + AssertAllDirs(fs, dirs0); + for (const auto& kv : files0) { + AssertFileContents(fs, kv.first, kv.second); + } + + ASSERT_OK(CopyFiles(fs->shared_from_this(), selector0, fs->shared_from_this(), "1")); + AssertAllDirs(fs, dirs0and1); + for (const auto& kv : files0and1) { + AssertFileContents(fs, kv.first, kv.second); + } + + auto selector1 = arrow::fs::FileSelector{}; + selector1.base_dir = "1"; + selector1.recursive = true; + +
ASSERT_OK(CopyFiles(fs->shared_from_this(), selector1, mock_fs, "1")); + AssertAllDirs(mock_fs.get(), dirs0and1); + for (const auto& kv : files0and1) { + AssertFileContents(mock_fs.get(), kv.first, kv.second); + } +} + void GenericFileSystemTest::TestGetFileInfo(FileSystem* fs) { ASSERT_OK(fs->CreateDir("AB/CD/EF")); CreateFile(fs, "AB/CD/ghi", "some data"); @@ -1212,6 +1273,7 @@ GENERIC_FS_TEST_DEFINE(TestDeleteFiles) GENERIC_FS_TEST_DEFINE(TestMoveFile) GENERIC_FS_TEST_DEFINE(TestMoveDir) GENERIC_FS_TEST_DEFINE(TestCopyFile) +GENERIC_FS_TEST_DEFINE(TestCopyFiles) GENERIC_FS_TEST_DEFINE(TestGetFileInfo) GENERIC_FS_TEST_DEFINE(TestGetFileInfoVector) GENERIC_FS_TEST_DEFINE(TestGetFileInfoSelector) diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 04000c14e9c2a..3a643b7e9f08b 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -140,6 +140,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { void TestMoveFile(); void TestMoveDir(); void TestCopyFile(); + void TestCopyFiles(); void TestGetFileInfo(); void TestGetFileInfoVector(); void TestGetFileInfoSelector(); @@ -189,6 +190,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool have_file_metadata() const { return false; } // - Whether the filesystem has a false positive memory leak with generator virtual bool have_false_positive_memory_leak_with_generator() const { return false; } + // - Whether the filesystem has a false positive memory leak in async close + virtual bool have_false_positive_memory_leak_with_async_close() const { return false; } void TestEmpty(FileSystem* fs); void TestNormalizePath(FileSystem* fs); @@ -201,6 +204,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { void TestMoveFile(FileSystem* fs); void TestMoveDir(FileSystem* fs); void TestCopyFile(FileSystem* fs); + void TestCopyFiles(FileSystem* fs); void TestGetFileInfo(FileSystem* fs); void TestGetFileInfoVector(FileSystem* fs); void TestGetFileInfoSelector(FileSystem* fs); @@ -233,6 +237,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFiles) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \ diff --git a/cpp/src/arrow/flight/transport/ucx/ucx.h b/cpp/src/arrow/flight/transport/ucx/ucx.h index dda2c83035c6d..7a12987c31e00 100644 --- a/cpp/src/arrow/flight/transport/ucx/ucx.h +++ b/cpp/src/arrow/flight/transport/ucx/ucx.h @@ -26,6 +26,8 @@ namespace flight { namespace transport { namespace ucx { +/// \deprecated Deprecated in 19.0.0. Flight UCX is deprecated. +ARROW_DEPRECATED(" Deprecated in 19.0.0. 
Flight UCX is deprecated.") ARROW_FLIGHT_EXPORT void InitializeFlightUcx(); diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 1d35549cc4345..f6be60509c45e 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -68,8 +68,8 @@ Status SetIOThreadPoolCapacity(int threads) { FileInterface::~FileInterface() = default; Future<> FileInterface::CloseAsync() { - return DeferNotOk( - default_io_context().executor()->Submit([this]() { return Close(); })); + return DeferNotOk(default_io_context().executor()->Submit( + [self = shared_from_this()]() { return self->Close(); })); } Status FileInterface::Abort() { return Close(); } diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 7a45f0906639a..31312f1ac6948 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -857,7 +857,8 @@ TEST(TestMap, StringToInteger) { ASSERT_OK_AND_ASSIGN(auto expected_keys, ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])")); ASSERT_OK_AND_ASSIGN(auto expected_values, ArrayFromJSON(int32(), "[0, null, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BytesToBits({1, 0, 1, 1})); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, + BytesToBits(std::vector({1, 0, 1, 1}))); auto expected = std::make_shared(type, 4, Buffer::Wrap(offsets), expected_keys, expected_values, expected_null_bitmap, 1); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 88aa3f3f8a47a..8cb0f5625760f 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -41,7 +41,7 @@ #include "arrow/ipc/metadata_internal.h" #include "arrow/ipc/util.h" #include "arrow/record_batch.h" -#include "arrow/result_internal.h" +#include "arrow/result.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/table.h" @@ -840,8 +840,8 @@ Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, Status WriteRecordBatchStream(const std::vector>& batches, const IpcWriteOptions& options, io::OutputStream* dst) { - ASSIGN_OR_RAISE(std::shared_ptr writer, - MakeStreamWriter(dst, batches[0]->schema(), options)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr writer, + MakeStreamWriter(dst, batches[0]->schema(), options)); for (const auto& batch : batches) { DCHECK(batch->schema()->Equals(*batches[0]->schema())) << "Schemas unequal"; RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 3f8237188dc75..5ce33a3731e7e 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -493,8 +493,10 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat RETURN_NOT_OK(on_statistics(statistics)); statistics.start_new_column = false; - const auto num_fields = record_batch.schema()->num_fields(); + const auto& schema = record_batch.schema(); + const auto num_fields = schema->num_fields(); for (int nth_column = 0; nth_column < num_fields; ++nth_column) { + const auto& field = schema->field(nth_column); auto column_statistics = record_batch.column(nth_column)->statistics(); if (!column_statistics) { continue; @@ -527,7 +529,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } else { statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE; } - statistics.type = column_statistics->MinArrowType(); + statistics.type = column_statistics->MinArrowType(field->type()); statistics.value = column_statistics->min.value(); 
RETURN_NOT_OK(on_statistics(statistics)); statistics.start_new_column = false; @@ -540,7 +542,7 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } else { statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE; } - statistics.type = column_statistics->MaxArrowType(); + statistics.type = column_statistics->MaxArrowType(field->type()); statistics.value = column_statistics->max.value(); RETURN_NOT_OK(on_statistics(statistics)); statistics.start_new_column = false; diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 21202c6acb05a..21d51ae5068b6 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -1116,7 +1116,7 @@ Result> MakeStatisticsArray( } keys_indices.push_back(key_index); - auto values_type = ArrayStatistics::ValueToArrowType(value); + auto values_type = ArrayStatistics::ValueToArrowType(value, arrow::null()); int8_t values_type_code = 0; for (; values_type_code < static_cast(values_types.size()); ++values_type_code) { diff --git a/cpp/src/arrow/result_internal.h b/cpp/src/arrow/result_internal.h deleted file mode 100644 index 134902e1b75ad..0000000000000 --- a/cpp/src/arrow/result_internal.h +++ /dev/null @@ -1,22 +0,0 @@ -// -// Copyright 2017 Asylo authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#pragma once - -#include "arrow/result.h" - -#ifndef ASSIGN_OR_RAISE -# define ASSIGN_OR_RAISE(lhs, rhs) ARROW_ASSIGN_OR_RAISE(lhs, rhs) -#endif diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 8cbc6842c4bc3..55ce3fb78d257 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -141,6 +141,16 @@ std::string Status::ToStringWithoutContextLines() const { return message; } +const std::string& Status::message() const { + static const std::string no_message = ""; + return ok() ? no_message : state_->msg; +} + +const std::shared_ptr& Status::detail() const { + static std::shared_ptr no_detail = NULLPTR; + return state_ ? state_->detail : no_detail; +} + void Status::Abort() const { Abort(std::string()); } void Status::Abort(const std::string& message) const { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 853fc284ee317..42e8929ce0b4c 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -332,16 +332,10 @@ class ARROW_EXPORT [[nodiscard]] Status : public util::EqualityComparablecode; } /// \brief Return the specific error message attached to this status. - const std::string& message() const { - static const std::string no_message = ""; - return ok() ? no_message : state_->msg; - } + const std::string& message() const; /// \brief Return the status detail attached to this message. - const std::shared_ptr& detail() const { - static std::shared_ptr no_detail = NULLPTR; - return state_ ? state_->detail : no_detail; - } + const std::shared_ptr& detail() const; /// \brief Return a new Status copying the existing status, but /// updating with the existing detail. 
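The message() and detail() accessors keep their existing behavior; they are only defined out-of-line in status.cc now, presumably so the function-local static fallback objects are emitted in a single translation unit. A minimal usage sketch (the status value is invented for illustration):

#include <cassert>
#include "arrow/status.h"

void StatusAccessorSketch() {
  arrow::Status st = arrow::Status::Invalid("bad argument");
  assert(st.message() == "bad argument");  // raw message, without the "Invalid:" prefix
  assert(st.detail() == nullptr);          // no StatusDetail attached to this error
  assert(arrow::Status::OK().message().empty());
}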
diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index b542ee5c34868..ae5462c661a8c 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -187,6 +187,35 @@ struct ConversionTraits> } }; +template +struct ConversionTraits> + : public CTypeTraits> { + static arrow::Status AppendRow(FixedSizeListBuilder& builder, + const std::array& values) { + auto vb = + ::arrow::internal::checked_cast::BuilderType*>( + builder.value_builder()); + ARROW_RETURN_NOT_OK(builder.Append()); + return vb->AppendValues(values.data(), N); + } + + static std::array GetEntry(const ::arrow::FixedSizeListArray& array, + size_t j) { + using ElementArrayType = typename TypeTraits< + typename stl::ConversionTraits::ArrowType>::ArrayType; + + const ElementArrayType& value_array = + ::arrow::internal::checked_cast(*array.values()); + + std::array arr; + for (size_t i = 0; i < N; i++) { + arr[i] = stl::ConversionTraits::GetEntry(value_array, + array.value_offset(j) + i); + } + return arr; + } +}; + template struct ConversionTraits> : public CTypeTraits())>::type> { diff --git a/cpp/src/arrow/stl_test.cc b/cpp/src/arrow/stl_test.cc index 48e6f8014c923..ce5adf0c0e268 100644 --- a/cpp/src/arrow/stl_test.cc +++ b/cpp/src/arrow/stl_test.cc @@ -245,6 +245,26 @@ TEST(TestTableFromTupleVector, ListType) { ASSERT_TRUE(expected_table->Equals(*table)); } +TEST(TestTableFromTupleVector, FixedSizeListType) { + using tuple_type = std::tuple>; + + auto expected_schema = std::make_shared( + FieldVector{field("column1", fixed_size_list(int64(), 4), false)}); + std::shared_ptr expected_array = + ArrayFromJSON(fixed_size_list(int64(), 4), "[[1, 1, 2, 34], [2, -4, 1, 1]]"); + std::shared_ptr expected_table = Table::Make(expected_schema, {expected_array}); + + std::vector rows{tuple_type(std::array{1, 1, 2, 34}), + tuple_type(std::array{2, -4, 1, 1})}; + std::vector names{"column1"}; + + std::shared_ptr
table; + ASSERT_OK(TableFromTupleRange(default_memory_pool(), rows, names, &table)); + ASSERT_OK(table->ValidateFull()); + + AssertTablesEqual(*expected_table, *table); +} + TEST(TestTableFromTupleVector, ReferenceTuple) { std::vector names{"column1", "column2", "column3", "column4", "column5", "column6", "column7", "column8", "column9", "column10"}; @@ -468,6 +488,26 @@ TEST(TestTupleVectorFromTable, ListType) { ASSERT_EQ(rows, expected_rows); } +TEST(TestTupleVectorFromTable, FixedSizeListType) { + using tuple_type = std::tuple>; + + compute::ExecContext ctx; + compute::CastOptions cast_options; + auto expected_schema = std::make_shared( + FieldVector{field("column1", fixed_size_list(int64(), 4), false)}); + std::shared_ptr expected_array = + ArrayFromJSON(fixed_size_list(int64(), 4), "[[1, 1, 2, 34], [2, -4, 1, 1]]"); + std::shared_ptr
table = Table::Make(expected_schema, {expected_array}); + ASSERT_OK(table->ValidateFull()); + + std::vector expected_rows{tuple_type(std::array{1, 1, 2, 34}), + tuple_type(std::array{2, -4, 1, 1})}; + + std::vector rows(2); + ASSERT_OK(TupleRangeFromTable(*table, cast_options, &ctx, &rows)); + ASSERT_EQ(rows, expected_rows); +} + TEST(TestTupleVectorFromTable, CastingNeeded) { using tuple_type = std::tuple>; diff --git a/cpp/src/arrow/testing/process.cc b/cpp/src/arrow/testing/process.cc index 133768ff015e6..57df0196c117f 100644 --- a/cpp/src/arrow/testing/process.cc +++ b/cpp/src/arrow/testing/process.cc @@ -85,9 +85,14 @@ # include # ifdef BOOST_PROCESS_USE_V2 -namespace asio = BOOST_PROCESS_V2_ASIO_NAMESPACE; namespace process = BOOST_PROCESS_V2_NAMESPACE; namespace filesystem = process::filesystem; +// For Boost < 1.87.0 +# ifdef BOOST_PROCESS_V2_ASIO_NAMESPACE +namespace asio = BOOST_PROCESS_V2_ASIO_NAMESPACE; +# else +namespace asio = process::net; +# endif # elif defined(BOOST_PROCESS_HAVE_V1) namespace process = boost::process::v1; namespace filesystem = boost::process::v1::filesystem; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 6da05bd8f1435..92009c8560c4e 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -540,6 +540,16 @@ struct CTypeTraits> : public TypeTraits { } }; +/// \addtogroup c-type-traits +template +struct CTypeTraits> : public TypeTraits { + using ArrowType = FixedSizeListType; + + static auto type_singleton() { + return fixed_size_list(CTypeTraits::type_singleton(), N); + } +}; + /// \addtogroup type-traits /// @{ template <> diff --git a/cpp/src/arrow/util/aligned_storage.h b/cpp/src/arrow/util/aligned_storage.h index 01e3ced2d1f61..588806507039c 100644 --- a/cpp/src/arrow/util/aligned_storage.h +++ b/cpp/src/arrow/util/aligned_storage.h @@ -119,26 +119,7 @@ class AlignedStorage { } private: -#if !defined(__clang__) && defined(__GNUC__) && defined(__i386__) - // Workaround for GCC bug on i386: - // alignof(int64 | float64) can give different results depending on the - // compilation context, leading to internal ABI mismatch manifesting - // in incorrect propagation of Result between - // compilation units. 
- // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88115) - static constexpr size_t alignment() { - if (std::is_integral_v && sizeof(T) == 8) { - return 4; - } else if (std::is_floating_point_v && sizeof(T) == 8) { - return 4; - } - return alignof(T); - } - - typename std::aligned_storage::type data_; -#else - typename std::aligned_storage::type data_; -#endif + alignas(T) std::byte data_[sizeof(T)]; }; } // namespace internal diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index 9c1f2e479c712..b5404bb7bc6d5 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -739,6 +739,16 @@ class ARROW_EXPORT BasicDecimal256 : public GenericBasicDecimal(value.high_bits()), SignExtend(value.high_bits()), SignExtend(value.high_bits())})) {} + explicit BasicDecimal256(const BasicDecimal64& value) noexcept + : BasicDecimal256(bit_util::little_endian::ToNative( + {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()), + SignExtend(value.value())})) {} + + explicit BasicDecimal256(const BasicDecimal32& value) noexcept + : BasicDecimal256(bit_util::little_endian::ToNative( + {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()), + SignExtend(value.value())})) {} + /// \brief Negate the current value (in-place) BasicDecimal256& Negate(); diff --git a/cpp/src/arrow/util/bitmap_builders.cc b/cpp/src/arrow/util/bitmap_builders.cc index c5cf3d2bc72b5..000dda718d0da 100644 --- a/cpp/src/arrow/util/bitmap_builders.cc +++ b/cpp/src/arrow/util/bitmap_builders.cc @@ -33,7 +33,7 @@ namespace internal { namespace { -void FillBitsFromBytes(const std::vector& bytes, uint8_t* bits) { +void FillBitsFromBytes(util::span bytes, uint8_t* bits) { for (size_t i = 0; i < bytes.size(); ++i) { if (bytes[i] > 0) { bit_util::SetBit(bits, i); @@ -43,7 +43,7 @@ void FillBitsFromBytes(const std::vector& bytes, uint8_t* bits) { } // namespace -Result> BytesToBits(const std::vector& bytes, +Result> BytesToBits(util::span bytes, MemoryPool* pool) { int64_t bit_length = bit_util::BytesForBits(bytes.size()); diff --git a/cpp/src/arrow/util/bitmap_builders.h b/cpp/src/arrow/util/bitmap_builders.h index 5bd2ad4414083..4bf2edfdcbd69 100644 --- a/cpp/src/arrow/util/bitmap_builders.h +++ b/cpp/src/arrow/util/bitmap_builders.h @@ -23,6 +23,7 @@ #include "arrow/result.h" #include "arrow/type_fwd.h" +#include "arrow/util/span.h" #include "arrow/util/visibility.h" namespace arrow { @@ -36,7 +37,7 @@ Result> BitmapAllButOne(MemoryPool* pool, int64_t length /// \brief Convert vector of bytes to bitmap buffer ARROW_EXPORT -Result> BytesToBits(const std::vector&, +Result> BytesToBits(util::span bytes, MemoryPool* pool = default_memory_pool()); } // namespace internal diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 4ead1a7283d81..52525a83aa2ea 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -843,6 +843,14 @@ class BinaryMemoTable : public MemoTable { } } + // Visit the stored value at a specific index in insertion order. + // The visitor function should have the signature `void(std::string_view)` + // or `void(const std::string_view&)`. 
+ template <typename VisitFunc> + void VisitValue(int32_t idx, VisitFunc&& visit) const { + visit(binary_builder_.GetView(idx)); + } + protected: struct Payload { int32_t memo_index; diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h index 80f60fbdb3676..ae48a606e366f 100644 --- a/cpp/src/arrow/util/parallel.h +++ b/cpp/src/arrow/util/parallel.h @@ -48,12 +48,13 @@ Status ParallelFor(int num_tasks, FUNCTION&& func, template ::ValueType> -Future> ParallelForAsync( - std::vector inputs, FUNCTION&& func, - Executor* executor = internal::GetCpuThreadPool()) { +Future> ParallelForAsync(std::vector inputs, FUNCTION&& func, + Executor* executor = internal::GetCpuThreadPool(), + TaskHints hints = TaskHints{}) { std::vector> futures(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i]))); + ARROW_ASSIGN_OR_RAISE(futures[i], + executor->Submit(hints, func, i, std::move(inputs[i]))); } return All(std::move(futures)) .Then([](const std::vector>& results) -> Result> { @@ -86,9 +87,10 @@ template ::ValueType> Future> OptionalParallelForAsync( bool use_threads, std::vector inputs, FUNCTION&& func, - Executor* executor = internal::GetCpuThreadPool()) { + Executor* executor = internal::GetCpuThreadPool(), TaskHints hints = TaskHints{}) { if (use_threads) { - return ParallelForAsync(std::move(inputs), std::forward(func), executor); + return ParallelForAsync(std::move(inputs), std::forward(func), executor, + hints); } else { std::vector result(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { diff --git a/cpp/src/arrow/util/thread_pool.cc b/cpp/src/arrow/util/thread_pool.cc index 8aa6d548893de..faef51307e5d2 100644 --- a/cpp/src/arrow/util/thread_pool.cc +++ b/cpp/src/arrow/util/thread_pool.cc @@ -52,10 +52,28 @@ struct Task { Executor::StopCallback stop_callback; }; +struct QueuedTask { + Task task; + int32_t priority; + uint64_t spawn_index; + + // Implement comparison so that std::priority_queue pops tasks with lower priority + // values (more urgent) first. + bool operator<(const QueuedTask& other) const { + if (priority == other.priority) { + // Maintain spawn order for tasks with the same priority. It's preferable to keep + // the execution order of tasks deterministic.
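+      // For example (illustrative): of two tasks spawned with equal priority the one
+      // spawned first pops first (FIFO), while a task spawned with a higher priority
+      // value (e.g. TaskHints{10} vs the default 0) pops after more urgent tasks.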
+ return spawn_index > other.spawn_index; + } + return priority > other.priority; + } +}; + } // namespace struct SerialExecutor::State { - std::deque task_queue; + std::priority_queue task_queue; + uint64_t spawned_tasks_count_ = 0; std::mutex mutex; std::condition_variable wait_for_tasks; std::thread::id current_thread; @@ -153,8 +171,9 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, "Attempt to schedule a task on a serial executor that has already finished or " "been abandoned"); } - state->task_queue.push_back( - Task{std::move(task), std::move(stop_token), std::move(stop_callback)}); + state->task_queue.push(QueuedTask{std::move(task), std::move(stop_token), + std::move(stop_callback), hints.priority, + state_->spawned_tasks_count_++}); } state->wait_for_tasks.notify_one(); return Status::OK(); @@ -189,8 +208,9 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, "been abandoned"); } - state_->task_queue.push_back( - Task{std::move(task), std::move(stop_token), std::move(stop_callback)}); + state_->task_queue.push(QueuedTask{std::move(task), std::move(stop_token), + std::move(stop_callback), hints.priority, + state_->spawned_tasks_count_++}); return Status::OK(); } @@ -245,8 +265,8 @@ void SerialExecutor::RunLoop() { // because sometimes we will pause even with work leftover when processing // an async generator while (!state_->paused && !state_->task_queue.empty()) { - Task task = std::move(state_->task_queue.front()); - state_->task_queue.pop_front(); + Task task = std::move(const_cast(state_->task_queue.top().task)); + state_->task_queue.pop(); lk.unlock(); if (!task.stop_token.IsStopRequested()) { std::move(task.callable)(); @@ -309,8 +329,8 @@ bool SerialExecutor::RunTasksOnAllExecutors() { if (exe->state_->paused == false && exe->state_->task_queue.empty() == false) { SerialExecutor* old_exe = globalState->current_executor; globalState->current_executor = exe; - Task task = std::move(exe->state_->task_queue.front()); - exe->state_->task_queue.pop_front(); + Task task = std::move(const_cast(exe->state_->task_queue.top().task)); + exe->state_->task_queue.pop(); run_task = true; exe->state_->tasks_running += 1; if (!task.stop_token.IsStopRequested()) { @@ -344,8 +364,8 @@ void SerialExecutor::RunLoop() { // we can't run any more until something else drops off the queue if (state_->tasks_running <= state_->max_tasks_running) { while (!state_->paused && !state_->task_queue.empty()) { - Task task = std::move(state_->task_queue.front()); - state_->task_queue.pop_front(); + Task task = std::move(const_cast(state_->task_queue.top().task)); + state_->task_queue.pop(); auto last_executor = globalState->current_executor; globalState->current_executor = this; state_->tasks_running += 1; @@ -386,7 +406,8 @@ struct ThreadPool::State { std::list workers_; // Trashcan for finished threads std::vector finished_workers_; - std::deque pending_tasks_; + std::priority_queue pending_tasks_; + uint64_t spawned_tasks_count_ = 0; // Desired number of threads int desired_capacity_ = 0; @@ -449,8 +470,8 @@ static void WorkerLoop(std::shared_ptr state, DCHECK_GE(state->tasks_queued_or_running_, 0); { - Task task = std::move(state->pending_tasks_.front()); - state->pending_tasks_.pop_front(); + Task task = std::move(const_cast(state->pending_tasks_.top().task)); + state->pending_tasks_.pop(); StopToken* stop_token = &task.stop_token; lock.unlock(); if (!stop_token->IsStopRequested()) { @@ -592,7 +613,8 @@ Status ThreadPool::Shutdown(bool wait) { if (!state_->quick_shutdown_) { 
DCHECK_EQ(state_->pending_tasks_.size(), 0); } else { - state_->pending_tasks_.clear(); + std::priority_queue empty; + std::swap(state_->pending_tasks_, empty); } CollectFinishedWorkersUnlocked(); return Status::OK(); @@ -653,8 +675,10 @@ Status ThreadPool::SpawnReal(TaskHints hints, FnOnce task, StopToken sto // We can still spin up more workers so spin up a new worker LaunchWorkersUnlocked(/*threads=*/1); } - state_->pending_tasks_.push_back( - {std::move(task), std::move(stop_token), std::move(stop_callback)}); + state_->pending_tasks_.push( + QueuedTask{{std::move(task), std::move(stop_token), std::move(stop_callback)}, + hints.priority, + state_->spawned_tasks_count_++}); } state_->cv_.notify_one(); return Status::OK(); @@ -737,7 +761,8 @@ Status ThreadPool::Shutdown(bool wait) { } else { // clear any pending tasks so that we behave // the same as threadpool on fast shutdown - state_->task_queue.clear(); + std::priority_queue empty; + std::swap(state_->task_queue, empty); } return Status::OK(); } @@ -777,7 +802,8 @@ Result> ThreadPool::MakeEternal(int threads) { ThreadPool::~ThreadPool() { // clear threadpool, otherwise ~SerialExecutor will // run any tasks left (which isn't threadpool behaviour) - state_->task_queue.clear(); + std::priority_queue empty; + std::swap(state_->task_queue, empty); } #endif // ARROW_ENABLE_THREADING diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index 7cf8826e8a173..2c83146030243 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -21,6 +21,7 @@ #endif #include +#include #include #include #include @@ -578,6 +579,62 @@ TEST_F(TestThreadPool, Spawn) { SpawnAdds(pool.get(), 7, task_add); } +TEST_F(TestThreadPool, TasksRunInPriorityOrder) { + auto pool = this->MakeThreadPool(1); + constexpr int kNumTasks = 10; + auto recorded_times = std::vector(kNumTasks); + auto futures = std::vector>(kNumTasks); + std::mutex mutex; + + auto wait_task = [&mutex] { std::unique_lock lock(mutex); }; + { + std::unique_lock lock(mutex); + // Spawn wait_task to block the pool while we add the other tasks. This + // ensures all the tasks are queued before any of them start running, so that + // their running order is fully determined by their priority. + ASSERT_OK(pool->Spawn(wait_task)); + + for (int i = 0; i < kNumTasks; ++i) { + auto record_time = [&recorded_times, i]() { + recorded_times[i] = std::chrono::steady_clock::now(); + return i; + }; + // Spawn tasks in opposite order to urgency. 
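+      // With kNumTasks == 10, task 0 is submitted with priority 10 (least urgent) and
+      // task 9 with priority 1 (most urgent), so the expected completion order is
+      // task 9 first and task 0 last, which the recorded_times assertions below check.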
+ ASSERT_OK_AND_ASSIGN(futures[i], + pool->Submit(TaskHints{kNumTasks - i}, record_time)); + } + } + + ASSERT_OK(pool->Shutdown()); + + for (size_t i = 1; i < kNumTasks; ++i) { + ASSERT_GE(recorded_times[i - 1], recorded_times[i]); + ASSERT_LT(futures[i - 1].result().ValueOrDie(), futures[i].result().ValueOrDie()); + } +} + +TEST_F(TestThreadPool, TasksOfEqualPriorityRunInSpawnOrder) { + auto pool = this->MakeThreadPool(1); + constexpr int kNumTasks = 10; + auto recorded_times = std::vector(kNumTasks); + auto futures = std::vector>(kNumTasks); + + for (int i = 0; i < kNumTasks; ++i) { + auto record_time = [&recorded_times, i]() { + recorded_times[i] = std::chrono::steady_clock::now(); + return i; + }; + ASSERT_OK_AND_ASSIGN(futures[i], pool->Submit(record_time)); + } + + ASSERT_OK(pool->Shutdown()); + + for (size_t i = 1; i < kNumTasks; ++i) { + ASSERT_LE(recorded_times[i - 1], recorded_times[i]); + ASSERT_LT(futures[i - 1].result().ValueOrDie(), futures[i].result().ValueOrDie()); + } +} + TEST_F(TestThreadPool, StressSpawn) { auto pool = this->MakeThreadPool(30); SpawnAdds(pool.get(), 1000, task_add); diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9c28b749e4319..0a9f92cebbbc4 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -181,6 +181,7 @@ set(PARQUET_SRCS printer.cc properties.cc schema.cc + size_statistics.cc statistics.cc stream_reader.cc stream_writer.cc @@ -373,6 +374,7 @@ add_parquet_test(internals-test metadata_test.cc page_index_test.cc public_api_test.cc + size_statistics_test.cc types_test.cc) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 0ee595508fec4..c19e2b9e48bb3 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -25,7 +25,7 @@ #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" -#include "arrow/result_internal.h" +#include "arrow/result.h" #include "arrow/type.h" #include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" @@ -484,8 +484,8 @@ bool IsDictionaryReadSupported(const ArrowType& type) { ::arrow::Result> GetTypeForNode( int column_index, const schema::PrimitiveNode& primitive_node, SchemaTreeContext* ctx) { - ASSIGN_OR_RAISE(std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr storage_type, + GetArrowType(primitive_node, ctx->properties)); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); @@ -723,8 +723,8 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // yields list ?nullable const auto& primitive_node = static_cast(list_node); int column_index = ctx->schema->GetColumnIndex(primitive_node); - ASSIGN_OR_RAISE(std::shared_ptr type, - GetTypeForNode(column_index, primitive_node, ctx)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false, FieldIdMetadata(list_node.field_id())); RETURN_NOT_OK( @@ -799,8 +799,8 @@ Status NodeToSchemaField(const Node& node, LevelInfo current_levels, // repeated $TYPE $FIELD_NAME const auto& primitive_node = static_cast(node); int column_index = ctx->schema->GetColumnIndex(primitive_node); - ASSIGN_OR_RAISE(std::shared_ptr type, - 
GetTypeForNode(column_index, primitive_node, ctx)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); if (node.is_repeated()) { // One-level list encoding, e.g. // a: repeated int32; diff --git a/cpp/src/parquet/column_page.h b/cpp/src/parquet/column_page.h index b389ffd98e6c7..111265a842ee7 100644 --- a/cpp/src/parquet/column_page.h +++ b/cpp/src/parquet/column_page.h @@ -26,6 +26,7 @@ #include #include +#include "parquet/size_statistics.h" #include "parquet/statistics.h" #include "parquet/types.h" @@ -69,20 +70,22 @@ class DataPage : public Page { /// Currently it is only present from data pages created by ColumnWriter in order /// to collect page index. std::optional first_row_index() const { return first_row_index_; } + const SizeStatistics& size_statistics() const { return size_statistics_; } virtual ~DataPage() = default; protected: DataPage(PageType::type type, const std::shared_ptr& buffer, int32_t num_values, Encoding::type encoding, int64_t uncompressed_size, - EncodedStatistics statistics = EncodedStatistics(), - std::optional first_row_index = std::nullopt) + EncodedStatistics statistics, std::optional first_row_index, + SizeStatistics size_statistics) : Page(buffer, type), num_values_(num_values), encoding_(encoding), uncompressed_size_(uncompressed_size), statistics_(std::move(statistics)), - first_row_index_(std::move(first_row_index)) {} + first_row_index_(std::move(first_row_index)), + size_statistics_(std::move(size_statistics)) {} int32_t num_values_; Encoding::type encoding_; @@ -90,6 +93,7 @@ class DataPage : public Page { EncodedStatistics statistics_; /// Row ordinal within the row group to the first row in the data page. std::optional first_row_index_; + SizeStatistics size_statistics_; }; class DataPageV1 : public DataPage { @@ -98,9 +102,11 @@ class DataPageV1 : public DataPage { Encoding::type encoding, Encoding::type definition_level_encoding, Encoding::type repetition_level_encoding, int64_t uncompressed_size, EncodedStatistics statistics = EncodedStatistics(), - std::optional first_row_index = std::nullopt) + std::optional first_row_index = std::nullopt, + SizeStatistics size_statistics = SizeStatistics()) : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, - std::move(statistics), std::move(first_row_index)), + std::move(statistics), std::move(first_row_index), + std::move(size_statistics)), definition_level_encoding_(definition_level_encoding), repetition_level_encoding_(repetition_level_encoding) {} @@ -120,9 +126,11 @@ class DataPageV2 : public DataPage { int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, int64_t uncompressed_size, bool is_compressed = false, EncodedStatistics statistics = EncodedStatistics(), - std::optional first_row_index = std::nullopt) + std::optional first_row_index = std::nullopt, + SizeStatistics size_statistics = SizeStatistics()) : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, - std::move(statistics), std::move(first_row_index)), + std::move(statistics), std::move(first_row_index), + std::move(size_statistics)), num_nulls_(num_nulls), num_rows_(num_rows), definition_levels_byte_length_(definition_levels_byte_length), diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3ffc6f720061f..2a3bbf76d1c6e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -291,7 +291,6 @@ class SerializedPageReader : public PageReader { 
std::shared_ptr stream_; format::PageHeader current_page_header_; - std::shared_ptr current_page_; // Compression codec to use. std::unique_ptr<::arrow::util::Codec> decompressor_; diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index d3e0fdfa811c0..12cbcf20affa4 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -55,6 +55,7 @@ #include "parquet/platform.h" #include "parquet/properties.h" #include "parquet/schema.h" +#include "parquet/size_statistics.h" #include "parquet/statistics.h" #include "parquet/thrift_internal.h" #include "parquet/types.h" @@ -437,7 +438,7 @@ class SerializedPageWriter : public PageWriter { /// Collect page index if (column_index_builder_ != nullptr) { - column_index_builder_->AddPage(page.statistics()); + column_index_builder_->AddPage(page.statistics(), page.size_statistics()); } if (offset_index_builder_ != nullptr) { const int64_t compressed_size = output_data_len + header_size; @@ -451,8 +452,9 @@ class SerializedPageWriter : public PageWriter { /// start_pos is a relative offset in the buffered mode. It should be /// adjusted via OffsetIndexBuilder::Finish() after BufferedPageWriter /// has flushed all data pages. - offset_index_builder_->AddPage(start_pos, static_cast(compressed_size), - *page.first_row_index()); + offset_index_builder_->AddPage( + start_pos, static_cast(compressed_size), *page.first_row_index(), + page.size_statistics().unencoded_byte_array_data_bytes); } total_uncompressed_size_ += uncompressed_size + header_size; @@ -774,11 +776,17 @@ class ColumnWriterImpl { // Serializes Dictionary Page if enabled virtual void WriteDictionaryPage() = 0; + // A convenience struct to combine the encoded statistics and size statistics + struct StatisticsPair { + EncodedStatistics encoded_stats; + SizeStatistics size_stats; + }; + // Plain-encoded statistics of the current page - virtual EncodedStatistics GetPageStatistics() = 0; + virtual StatisticsPair GetPageStatistics() = 0; // Plain-encoded statistics of the whole chunk - virtual EncodedStatistics GetChunkStatistics() = 0; + virtual StatisticsPair GetChunkStatistics() = 0; // Merges page statistics into chunk statistics, then resets the values virtual void ResetPageStatistics() = 0; @@ -981,8 +989,7 @@ void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size, PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false)); ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values, uncompressed_data_->mutable_data()); - - EncodedStatistics page_stats = GetPageStatistics(); + auto [page_stats, page_size_stats] = GetPageStatistics(); page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path())); page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order()); ResetPageStatistics(); @@ -1006,13 +1013,15 @@ void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size, compressed_data->CopySlice(0, compressed_data->size(), allocator_)); std::unique_ptr page_ptr = std::make_unique( compressed_data_copy, num_values, encoding_, Encoding::RLE, Encoding::RLE, - uncompressed_size, std::move(page_stats), first_row_index); + uncompressed_size, std::move(page_stats), first_row_index, + std::move(page_size_stats)); total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader); data_pages_.push_back(std::move(page_ptr)); } else { // Eagerly write pages DataPageV1 page(compressed_data, num_values, encoding_, Encoding::RLE, Encoding::RLE, - 
uncompressed_size, std::move(page_stats), first_row_index); + uncompressed_size, std::move(page_stats), first_row_index, + std::move(page_size_stats)); WriteDataPage(page); } } @@ -1039,7 +1048,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size, ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, compressed_values, combined->mutable_data()); - EncodedStatistics page_stats = GetPageStatistics(); + auto [page_stats, page_size_stats] = GetPageStatistics(); page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path())); page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order()); ResetPageStatistics(); @@ -1062,14 +1071,15 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size, combined->CopySlice(0, combined->size(), allocator_)); std::unique_ptr page_ptr = std::make_unique( combined, num_values, null_count, num_rows, encoding_, def_levels_byte_length, - rep_levels_byte_length, uncompressed_size, pager_->has_compressor(), page_stats, - first_row_index); + rep_levels_byte_length, uncompressed_size, pager_->has_compressor(), + std::move(page_stats), first_row_index, std::move(page_size_stats)); total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader); data_pages_.push_back(std::move(page_ptr)); } else { DataPageV2 page(combined, num_values, null_count, num_rows, encoding_, def_levels_byte_length, rep_levels_byte_length, uncompressed_size, - pager_->has_compressor(), page_stats, first_row_index); + pager_->has_compressor(), std::move(page_stats), first_row_index, + std::move(page_size_stats)); WriteDataPage(page); } } @@ -1083,7 +1093,7 @@ int64_t ColumnWriterImpl::Close() { FlushBufferedDataPages(); - EncodedStatistics chunk_statistics = GetChunkStatistics(); + auto [chunk_statistics, chunk_size_statistics] = GetChunkStatistics(); chunk_statistics.ApplyStatSizeLimits( properties_->max_statistics_size(descr_->path())); chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order()); @@ -1092,6 +1102,9 @@ int64_t ColumnWriterImpl::Close() { if (rows_written_ > 0 && chunk_statistics.is_set()) { metadata_->SetStatistics(chunk_statistics); } + if (rows_written_ > 0 && chunk_size_statistics.is_set()) { + metadata_->SetSizeStatistics(chunk_size_statistics); + } metadata_->SetKeyValueMetadata(key_value_metadata_); pager_->Close(has_dictionary_, fallback_); } @@ -1217,6 +1230,11 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< page_statistics_ = MakeStatistics(descr_, allocator_); chunk_statistics_ = MakeStatistics(descr_, allocator_); } + if (properties->size_statistics_level() == SizeStatisticsLevel::ColumnChunk || + properties->size_statistics_level() == SizeStatisticsLevel::PageAndColumnChunk) { + page_size_statistics_ = SizeStatistics::Make(descr_); + chunk_size_statistics_ = SizeStatistics::Make(descr_); + } pages_change_on_record_boundaries_ = properties->data_page_version() == ParquetDataPageVersion::V2 || properties->page_index_enabled(descr_->path()); @@ -1355,15 +1373,26 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< total_bytes_written_ += pager_->WriteDictionaryPage(page); } - EncodedStatistics GetPageStatistics() override { - EncodedStatistics result; - if (page_statistics_) result = page_statistics_->Encode(); + StatisticsPair GetPageStatistics() override { + StatisticsPair result; + if (page_statistics_) { + result.encoded_stats = page_statistics_->Encode(); + } + if 
(properties_->size_statistics_level() == SizeStatisticsLevel::PageAndColumnChunk) { + ARROW_DCHECK(page_size_statistics_ != nullptr); + result.size_stats = *page_size_statistics_; + } return result; } - EncodedStatistics GetChunkStatistics() override { - EncodedStatistics result; - if (chunk_statistics_) result = chunk_statistics_->Encode(); + StatisticsPair GetChunkStatistics() override { + StatisticsPair result; + if (chunk_statistics_) { + result.encoded_stats = chunk_statistics_->Encode(); + } + if (chunk_size_statistics_) { + result.size_stats = *chunk_size_statistics_; + } return result; } @@ -1372,6 +1401,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< chunk_statistics_->Merge(*page_statistics_); page_statistics_->Reset(); } + if (page_size_statistics_ != nullptr) { + chunk_size_statistics_->Merge(*page_size_statistics_); + page_size_statistics_->Reset(); + } } Type::type type() const override { return descr_->physical_type(); } @@ -1425,6 +1458,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< DictEncoder* current_dict_encoder_; std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; + std::unique_ptr page_size_statistics_; + std::shared_ptr chunk_size_statistics_; bool pages_change_on_record_boundaries_; // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the @@ -1467,6 +1502,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< rows_written_ += num_values; num_buffered_rows_ += num_values; } + + UpdateLevelHistogram(num_values, def_levels, rep_levels); return values_to_write; } @@ -1558,6 +1595,47 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< rows_written_ += num_levels; num_buffered_rows_ += num_levels; } + + UpdateLevelHistogram(num_levels, def_levels, rep_levels); + } + + void UpdateLevelHistogram(int64_t num_levels, const int16_t* def_levels, + const int16_t* rep_levels) const { + if (page_size_statistics_ == nullptr) { + return; + } + + auto add_levels = [](std::vector& level_histogram, + ::arrow::util::span levels) { + for (int16_t level : levels) { + ARROW_DCHECK_LT(level, static_cast(level_histogram.size())); + ++level_histogram[level]; + } + }; + + if (descr_->max_definition_level() > 0) { + add_levels(page_size_statistics_->definition_level_histogram, + {def_levels, static_cast(num_levels)}); + } else { + page_size_statistics_->definition_level_histogram[0] += num_levels; + } + + if (descr_->max_repetition_level() > 0) { + add_levels(page_size_statistics_->repetition_level_histogram, + {rep_levels, static_cast(num_levels)}); + } else { + page_size_statistics_->repetition_level_histogram[0] += num_levels; + } + } + + // Update the unencoded data bytes for ByteArray only per the specification. + void UpdateUnencodedDataBytes() const { + if constexpr (std::is_same_v) { + if (page_size_statistics_ != nullptr) { + page_size_statistics_->IncrementUnencodedByteArrayDataBytes( + current_encoder_->ReportUnencodedDataBytes()); + } + } } void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values, @@ -1611,6 +1689,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (page_statistics_ != nullptr) { page_statistics_->Update(values, num_values, num_nulls); } + UpdateUnencodedDataBytes(); } /// \brief Write values with spaces and update page statistics accordingly. 
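To make the level-histogram bookkeeping in UpdateLevelHistogram() concrete, here is a small sketch (the level values are invented for the example) of what the definition-level histogram ends up holding:

#include <cstdint>
#include <vector>

// Sketch for an optional leaf column with max_definition_level == 1, so the
// histogram has max_definition_level + 1 buckets.
std::vector<int64_t> DefinitionLevelHistogramSketch() {
  std::vector<int16_t> def_levels = {1, 0, 1, 1};
  std::vector<int64_t> histogram(/*max_definition_level=*/1 + 1, 0);
  for (int16_t level : def_levels) {
    ++histogram[level];  // count occurrences of each level
  }
  // histogram == {1, 3}: one null (level 0) and three defined values (level 1).
  // Repetition levels are accumulated the same way when max_repetition_level > 0.
  return histogram;
}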
@@ -1639,6 +1718,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_spaced_values, num_values, num_nulls); } + UpdateUnencodedDataBytes(); } }; @@ -1739,6 +1819,8 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( writeable_indices, MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool)); dict_encoder->PutIndices(*writeable_indices); + // Update unencoded byte array data size to size statistics + UpdateUnencodedDataBytes(); CommitWriteAndCheckPageLimit(batch_size, batch_num_values, null_count, check_page); value_offset += batch_num_spaced_values; }; @@ -2219,6 +2301,7 @@ Status TypedColumnWriterImpl::WriteArrowDense( page_statistics_->IncrementNullCount(batch_size - non_null); page_statistics_->IncrementNumValues(non_null); } + UpdateUnencodedDataBytes(); CommitWriteAndCheckPageLimit(batch_size, batch_num_values, batch_size - non_null, check_page); CheckDictionarySizeLimit(); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index d2b3aa0dff003..25446aefd6814 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1001,8 +1001,8 @@ TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) { auto values_data = reinterpret_cast(values_buffer->data()); std::shared_ptr valid_bits; - ASSERT_OK_AND_ASSIGN(valid_bits, ::arrow::internal::BytesToBits( - {1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1})); + std::vector bitmap_bytes = {1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1}; + ASSERT_OK_AND_ASSIGN(valid_bits, ::arrow::internal::BytesToBits(bitmap_bytes)); // valgrind will warn about out of bounds access into def_levels_data typed_writer->WriteBatchSpaced(14, def_levels.data(), rep_levels.data(), diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 89d5d44c5219c..f41eb9a19123c 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -79,6 +79,15 @@ class EncoderImpl : virtual public Encoder { MemoryPool* memory_pool() const override { return pool_; } + int64_t ReportUnencodedDataBytes() override { + if (descr_->physical_type() != Type::BYTE_ARRAY) { + throw ParquetException("ReportUnencodedDataBytes is only supported for BYTE_ARRAY"); + } + int64_t bytes = unencoded_byte_array_data_bytes_; + unencoded_byte_array_data_bytes_ = 0; + return bytes; + } + protected: // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY const ColumnDescriptor* descr_; @@ -87,6 +96,8 @@ class EncoderImpl : virtual public Encoder { /// Type length from descr const int type_length_; + /// Number of unencoded bytes written to the encoder. Used for ByteArray type only. 
+ int64_t unencoded_byte_array_data_bytes_ = 0; }; // ---------------------------------------------------------------------- @@ -132,6 +143,7 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL"; sink_.UnsafeAppend(&length, sizeof(uint32_t)); sink_.UnsafeAppend(data, static_cast(length)); + unencoded_byte_array_data_bytes_ += length; } void Put(const ByteArray& val) { @@ -513,6 +525,18 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { static_cast(values[i + position]); } }); + + // Track unencoded bytes based on dictionary value type + if constexpr (std::is_same_v) { + // For ByteArray, need to look up actual lengths from dictionary + for (size_t idx = + buffer_position - static_cast(data.length() - data.null_count()); + idx < buffer_position; ++idx) { + memo_table_.VisitValue(buffered_indices_[idx], [&](std::string_view value) { + unencoded_byte_array_data_bytes_ += value.length(); + }); + } + } } void PutIndices(const ::arrow::Array& data) override { @@ -656,6 +680,7 @@ inline void DictEncoderImpl::PutByteArray(const void* ptr, PARQUET_THROW_NOT_OK( memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index)); buffered_indices_.push_back(memo_index); + unencoded_byte_array_data_bytes_ += length; } template <> @@ -1268,6 +1293,7 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, } length_encoder_.Put({static_cast(view.length())}, 1); PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); + unencoded_byte_array_data_bytes_ += view.size(); return Status::OK(); }, []() { return Status::OK(); })); @@ -1313,6 +1339,7 @@ void DeltaLengthByteArrayEncoder::Put(const T* src, int num_values) { for (int idx = 0; idx < num_values; idx++) { sink_.UnsafeAppend(src[idx].ptr, src[idx].len); } + unencoded_byte_array_data_bytes_ += total_increment_size; } void DeltaLengthByteArrayEncoder::PutSpaced(const T* src, int num_values, @@ -1444,6 +1471,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(); type_length_ = descr_->type_length(); + unencoded_byte_array_data_bytes_ = 0; allocator_ = default_memory_pool(); } @@ -197,6 +198,8 @@ class TestEncodingBase : public ::testing::Test { draws_[nvalues * j + i] = draws_[i]; } } + + InitUnencodedByteArrayDataBytes(); } virtual void CheckRoundtrip() = 0; @@ -222,6 +225,16 @@ class TestEncodingBase : public ::testing::Test { } } + void InitUnencodedByteArrayDataBytes() { + // Calculate expected unencoded bytes based on type + if constexpr (std::is_same_v) { + unencoded_byte_array_data_bytes_ = 0; + for (int i = 0; i < num_values_; i++) { + unencoded_byte_array_data_bytes_ += draws_[i].len; + } + } + } + protected: MemoryPool* allocator_; @@ -235,6 +248,7 @@ class TestEncodingBase : public ::testing::Test { std::shared_ptr encode_buffer_; std::shared_ptr descr_; + int64_t unencoded_byte_array_data_bytes_; // unencoded data size for dense values }; // Member variables are not visible to templated subclasses. Possibly figure @@ -261,6 +275,10 @@ class TestPlainEncoding : public TestEncodingBase { auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); encoder->Put(draws_, num_values_); encode_buffer_ = encoder->FlushValues(); + if constexpr (std::is_same_v) { + ASSERT_EQ(encoder->ReportUnencodedDataBytes(), + this->unencoded_byte_array_data_bytes_); + } decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); @@ -346,6 +364,10 @@ class TestDictionaryEncoding : public TestEncodingBase { AllocateBuffer(default_memory_pool(), dict_traits->dict_encoded_size()); dict_traits->WriteDict(dict_buffer_->mutable_data()); std::shared_ptr indices = encoder->FlushValues(); + if constexpr (std::is_same_v) { + ASSERT_EQ(encoder->ReportUnencodedDataBytes(), + this->unencoded_byte_array_data_bytes_); + } auto base_spaced_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); @@ -1992,6 +2014,10 @@ class TestDeltaLengthByteArrayEncoding : public TestEncodingBase { encoder->Put(draws_, num_values_); encode_buffer_ = encoder->FlushValues(); + if constexpr (std::is_same_v) { + ASSERT_EQ(encoder->ReportUnencodedDataBytes(), + this->unencoded_byte_array_data_bytes_); + } decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); @@ -2296,6 +2322,8 @@ class TestDeltaByteArrayEncoding : public TestDeltaLengthByteArrayEncoding draws_[nvalues * j + i] = draws_[i]; } } + + TestEncodingBase::InitUnencodedByteArrayDataBytes(); } Encoding::type GetEncoding() override { return Encoding::DELTA_BYTE_ARRAY; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3cc42ae370217..1c9b2323de500 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -83,8 +83,6 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { } } // namespace -// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file -static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; // For PARQUET-816 @@ -482,7 +480,8 @@ class SerializedFile : public ParquetFileReader::Contents { "Parquet file size is ", source_size_, " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)"); } - return std::min(source_size_, kDefaultFooterReadSize); + + return std::min(static_cast(source_size_), properties_.footer_read_size()); } // Validate the magic bytes and get the length of the full footer. 
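For context on the footer-read change above: the 64 KB speculative read size is no longer a file_reader.cc constant but a ReaderProperties knob (kDefaultFooterReadSize in properties.h further down), and GetFooterReadSize() clamps it to the file size. A hedged usage sketch, assuming the ParquetFileReader::OpenFile overload that accepts ReaderProperties:

    #include <memory>
    #include <string>

    #include "parquet/file_reader.h"
    #include "parquet/properties.h"

    // Sketch: speculatively fetch 1 MiB from the end of the file, which can avoid
    // an extra round trip when the footer is larger than the 64 KB default.
    std::unique_ptr<parquet::ParquetFileReader> OpenWithLargeFooter(
        const std::string& path) {
      parquet::ReaderProperties props;
      props.set_footer_read_size(1024 * 1024);
      return parquet::ParquetFileReader::OpenFile(path, /*memory_map=*/false, props);
    }
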
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8f577be45b96d..f47c61421936c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -37,6 +37,7 @@ #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" +#include "parquet/size_statistics.h" #include "parquet/thrift_internal.h" namespace parquet { @@ -265,6 +266,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { LoadEnumSafe(&encoding_stats.encoding), encoding_stats.count}); } + if (column_metadata_->__isset.size_statistics) { + size_statistics_ = + std::make_shared(FromThrift(column_metadata_->size_statistics)); + size_statistics_->Validate(descr_); + } possible_stats_ = nullptr; InitKeyValueMetadata(); } @@ -308,6 +314,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return is_stats_set() ? possible_stats_ : nullptr; } + inline std::shared_ptr size_statistics() const { + return size_statistics_; + } + inline Compression::type compression() const { return LoadEnumSafe(&column_metadata_->codec); } @@ -396,6 +406,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ReaderProperties properties_; const ApplicationVersion* writer_version_; std::shared_ptr key_value_metadata_; + std::shared_ptr size_statistics_; }; std::unique_ptr ColumnChunkMetaData::Make( @@ -439,6 +450,10 @@ std::shared_ptr ColumnChunkMetaData::statistics() const { bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); } +std::shared_ptr ColumnChunkMetaData::size_statistics() const { + return impl_->size_statistics(); +} + std::optional ColumnChunkMetaData::bloom_filter_offset() const { return impl_->bloom_filter_offset(); } @@ -1543,6 +1558,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_statistics(ToThrift(val)); } + void SetSizeStatistics(const SizeStatistics& size_stats) { + column_chunk_->meta_data.__set_size_statistics(ToThrift(size_stats)); + } + void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, @@ -1752,6 +1771,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +void ColumnChunkMetaDataBuilder::SetSizeStatistics(const SizeStatistics& size_stats) { + impl_->SetSizeStatistics(size_stats); +} + void ColumnChunkMetaDataBuilder::SetKeyValueMetadata( std::shared_ptr key_value_metadata) { impl_->SetKeyValueMetadata(std::move(key_value_metadata)); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index dc97d816daa74..9a3964f7d6574 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -28,23 +28,9 @@ #include "parquet/encryption/type_fwd.h" #include "parquet/platform.h" #include "parquet/properties.h" -#include "parquet/schema.h" -#include "parquet/types.h" namespace parquet { -class ColumnDescriptor; -class EncodedStatistics; -class FileCryptoMetaData; -class Statistics; -class SchemaDescriptor; - -namespace schema { - -class ColumnPath; - -} // namespace schema - using KeyValueMetadata = ::arrow::KeyValueMetadata; class PARQUET_EXPORT ApplicationVersion { @@ -156,6 +142,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { std::shared_ptr path_in_schema() const; bool is_stats_set() const; std::shared_ptr statistics() const; + std::shared_ptr size_statistics() const; Compression::type compression() const; // Indicate if the ColumnChunk 
compression is supported by the current @@ -451,6 +438,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // column metadata void SetStatistics(const EncodedStatistics& stats); + void SetSizeStatistics(const SizeStatistics& size_stats); void SetKeyValueMetadata(std::shared_ptr key_value_metadata); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index afda4c6064b36..8cc819f10cacd 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -159,6 +159,22 @@ class TypedColumnIndexImpl : public TypedColumnIndex { const std::vector& max_values() const override { return max_values_; } + bool has_definition_level_histograms() const override { + return column_index_.__isset.definition_level_histograms; + } + + bool has_repetition_level_histograms() const override { + return column_index_.__isset.repetition_level_histograms; + } + + const std::vector& definition_level_histograms() const override { + return column_index_.definition_level_histograms; + } + + const std::vector& repetition_level_histograms() const override { + return column_index_.repetition_level_histograms; + } + private: /// Wrapped thrift column index. const format::ColumnIndex column_index_; @@ -178,14 +194,22 @@ class OffsetIndexImpl : public OffsetIndex { page_location.compressed_page_size, page_location.first_row_index}); } + if (offset_index.__isset.unencoded_byte_array_data_bytes) { + unencoded_byte_array_data_bytes_ = offset_index.unencoded_byte_array_data_bytes; + } } const std::vector& page_locations() const override { return page_locations_; } + const std::vector& unencoded_byte_array_data_bytes() const override { + return unencoded_byte_array_data_bytes_; + } + private: std::vector page_locations_; + std::vector unencoded_byte_array_data_bytes_; }; class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader { @@ -460,7 +484,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.boundary_order = format::BoundaryOrder::UNORDERED; } - void AddPage(const EncodedStatistics& stats) override { + void AddPage(const EncodedStatistics& stats, + const SizeStatistics& size_stats) override { if (state_ == BuilderState::kFinished) { throw ParquetException("Cannot add page to finished ColumnIndexBuilder."); } else if (state_ == BuilderState::kDiscarded) { @@ -493,6 +518,17 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.__isset.null_counts = false; column_index_.null_counts.clear(); } + + if (size_stats.is_set()) { + const auto& page_def_level_hist = size_stats.definition_level_histogram; + const auto& page_ref_level_hist = size_stats.repetition_level_histogram; + column_index_.definition_level_histograms.insert( + column_index_.definition_level_histograms.end(), page_def_level_hist.cbegin(), + page_def_level_hist.cend()); + column_index_.repetition_level_histograms.insert( + column_index_.repetition_level_histograms.end(), page_ref_level_hist.cbegin(), + page_ref_level_hist.cend()); + } } void Finish() override { @@ -533,6 +569,29 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { /// Decide the boundary order from decoded min/max values. auto boundary_order = DetermineBoundaryOrder(min_values, max_values); column_index_.__set_boundary_order(ToThrift(boundary_order)); + + // Finalize level histogram. 
+ const int64_t num_pages = column_index_.null_pages.size(); + const int64_t def_level_hist_size = column_index_.definition_level_histograms.size(); + const int64_t rep_level_hist_size = column_index_.repetition_level_histograms.size(); + if (def_level_hist_size != 0 && + def_level_hist_size != (descr_->max_definition_level() + 1) * num_pages) { + std::stringstream ss; + ss << "Invalid definition level histogram size: " << def_level_hist_size + << ", expected: " << (descr_->max_definition_level() + 1) * num_pages; + throw ParquetException(ss.str()); + } + if (rep_level_hist_size != 0 && + rep_level_hist_size != (descr_->max_repetition_level() + 1) * num_pages) { + std::stringstream ss; + ss << "Invalid repetition level histogram size: " << rep_level_hist_size + << ", expected: " << (descr_->max_repetition_level() + 1) * num_pages; + throw ParquetException(ss.str()); + } + column_index_.__isset.definition_level_histograms = + !column_index_.definition_level_histograms.empty(); + column_index_.__isset.repetition_level_histograms = + !column_index_.repetition_level_histograms.empty(); } void WriteTo(::arrow::io::OutputStream* sink, Encryptor* encryptor) const override { @@ -604,8 +663,8 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder { public: OffsetIndexBuilderImpl() = default; - void AddPage(int64_t offset, int32_t compressed_page_size, - int64_t first_row_index) override { + void AddPage(int64_t offset, int32_t compressed_page_size, int64_t first_row_index, + std::optional unencoded_byte_array_length) override { if (state_ == BuilderState::kFinished) { throw ParquetException("Cannot add page to finished OffsetIndexBuilder."); } else if (state_ == BuilderState::kDiscarded) { @@ -620,6 +679,10 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder { page_location.__set_compressed_page_size(compressed_page_size); page_location.__set_first_row_index(first_row_index); offset_index_.page_locations.emplace_back(std::move(page_location)); + if (unencoded_byte_array_length.has_value()) { + offset_index_.unencoded_byte_array_data_bytes.emplace_back( + unencoded_byte_array_length.value()); + } } void Finish(int64_t final_position) override { @@ -636,6 +699,19 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder { page_location.__set_offset(page_location.offset + final_position); } } + + // Finalize unencoded_byte_array_data_bytes and make sure page sizes match. + if (offset_index_.page_locations.size() == + offset_index_.unencoded_byte_array_data_bytes.size()) { + offset_index_.__isset.unencoded_byte_array_data_bytes = true; + } else if (!offset_index_.unencoded_byte_array_data_bytes.empty()) { + std::stringstream ss; + ss << "Invalid count of unencoded BYTE_ARRAY data bytes: " + << offset_index_.unencoded_byte_array_data_bytes.size() + << ", expected page count: " << offset_index_.page_locations.size(); + throw ParquetException(ss.str()); + } + state_ = BuilderState::kFinished; break; } @@ -813,6 +889,14 @@ class PageIndexBuilderImpl final : public PageIndexBuilder { } // namespace +void OffsetIndexBuilder::AddPage(const PageLocation& page_location, + const SizeStatistics& size_stats) { + this->AddPage( + page_location.offset, page_location.compressed_page_size, + page_location.first_row_index, + size_stats.is_set() ? 
size_stats.unencoded_byte_array_data_bytes : std::nullopt); +} + RowGroupIndexReadRange PageIndexReader::DeterminePageIndexRangesInRowGroup( const RowGroupMetaData& row_group_metadata, const std::vector& columns) { int64_t ci_start = std::numeric_limits::max(); diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index d45c59cab223f..3083159783ba7 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -19,6 +19,7 @@ #include "arrow/io/interfaces.h" #include "parquet/encryption/type_fwd.h" +#include "parquet/type_fwd.h" #include "parquet/types.h" #include @@ -26,9 +27,6 @@ namespace parquet { -class EncodedStatistics; -struct PageIndexLocation; - /// \brief ColumnIndex is a proxy around format::ColumnIndex. class PARQUET_EXPORT ColumnIndex { public: @@ -76,6 +74,18 @@ class PARQUET_EXPORT ColumnIndex { /// \brief A vector of page indices for non-null pages. virtual const std::vector& non_null_page_indices() const = 0; + + /// \brief Whether definition level histogram is available. + virtual bool has_definition_level_histograms() const = 0; + + /// \brief Whether repetition level histogram is available. + virtual bool has_repetition_level_histograms() const = 0; + + /// \brief List of definition level histograms for each page concatenated together. + virtual const std::vector& definition_level_histograms() const = 0; + + /// \brief List of repetition level histograms for each page concatenated together. + virtual const std::vector& repetition_level_histograms() const = 0; }; /// \brief Typed implementation of ColumnIndex. @@ -129,6 +139,10 @@ class PARQUET_EXPORT OffsetIndex { /// \brief A vector of locations for each data page in this column. virtual const std::vector& page_locations() const = 0; + + /// \brief A vector of unencoded/uncompressed size of each page for BYTE_ARRAY types, + /// or empty for other types. + virtual const std::vector& unencoded_byte_array_data_bytes() const = 0; }; /// \brief Interface for reading the page index for a Parquet row group. @@ -266,7 +280,9 @@ class PARQUET_EXPORT ColumnIndexBuilder { /// not update statistics anymore. /// /// \param stats Page statistics in the encoded form. - virtual void AddPage(const EncodedStatistics& stats) = 0; + /// \param size_stats Size statistics of the page if available. + virtual void AddPage(const EncodedStatistics& stats, + const SizeStatistics& size_stats) = 0; /// \brief Complete the column index. /// @@ -299,15 +315,13 @@ class PARQUET_EXPORT OffsetIndexBuilder { virtual ~OffsetIndexBuilder() = default; - /// \brief Add page location of a data page. + /// \brief Add page location and size stats of a data page. virtual void AddPage(int64_t offset, int32_t compressed_page_size, - int64_t first_row_index) = 0; + int64_t first_row_index, + std::optional unencoded_byte_array_length = {}) = 0; - /// \brief Add page location of a data page. - void AddPage(const PageLocation& page_location) { - AddPage(page_location.offset, page_location.compressed_page_size, - page_location.first_row_index); - } + /// \brief Add page location and size stats of a data page. + void AddPage(const PageLocation& page_location, const SizeStatistics& size_stats); /// \brief Complete the offset index. 
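The ColumnIndex accessors added above expose the level histograms of all pages concatenated together, (max_level + 1) entries per page, which is the layout validated in ColumnIndexBuilderImpl::Finish(). A small sketch of slicing out one page's histogram; the helper name is illustrative, only the concatenated layout comes from the patch:

    #include <cstdint>
    #include <vector>

    // Given the concatenated histogram from ColumnIndex::definition_level_histograms()
    // (or repetition_level_histograms()), return the slice for `page_index`.
    // Each page owns max_level + 1 consecutive buckets.
    std::vector<int64_t> PageHistogram(const std::vector<int64_t>& concatenated,
                                       int16_t max_level, size_t page_index) {
      const size_t width = static_cast<size_t>(max_level) + 1;
      const size_t offset = page_index * width;  // caller must ensure it is in range
      return {concatenated.begin() + offset, concatenated.begin() + offset + width};
    }
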
/// diff --git a/cpp/src/parquet/page_index_benchmark.cc b/cpp/src/parquet/page_index_benchmark.cc index 5631034105056..e94fa0365d189 100644 --- a/cpp/src/parquet/page_index_benchmark.cc +++ b/cpp/src/parquet/page_index_benchmark.cc @@ -82,7 +82,7 @@ void BM_ReadColumnIndex(::benchmark::State& state) { GenerateBenchmarkData(values_per_page, /*seed=*/0, values.data(), &heap, kDataStringLength); stats->Update(values.data(), values_per_page, /*null_count=*/0); - builder->AddPage(stats->Encode()); + builder->AddPage(stats->Encode(), /*size_stats=*/{}); } builder->Finish(); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 4db49b4267415..916e28f8cea8e 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -419,15 +419,20 @@ TEST(PageIndex, DeterminePageIndexRangesInRowGroupWithMissingPageIndex) { -1); } -TEST(PageIndex, WriteOffsetIndex) { +void TestWriteOffsetIndex(bool write_size_stats) { /// Create offset index via the OffsetIndexBuilder interface. auto builder = OffsetIndexBuilder::Make(); const size_t num_pages = 5; const std::vector offsets = {100, 200, 300, 400, 500}; const std::vector page_sizes = {1024, 2048, 3072, 4096, 8192}; const std::vector first_row_indices = {0, 10000, 20000, 30000, 40000}; + const std::vector unencoded_byte_array_lengths = {1111, 2222, 0, 3333, 4444}; for (size_t i = 0; i < num_pages; ++i) { - builder->AddPage(offsets[i], page_sizes[i], first_row_indices[i]); + auto unencoded_byte_array_length = + write_size_stats ? std::make_optional(unencoded_byte_array_lengths[i]) + : std::nullopt; + builder->AddPage(offsets[i], page_sizes[i], first_row_indices[i], + unencoded_byte_array_length); } const int64_t final_position = 4096; builder->Finish(final_position); @@ -446,23 +451,73 @@ TEST(PageIndex, WriteOffsetIndex) { /// Verify the data of the offset index. 
for (const auto& offset_index : offset_indexes) { ASSERT_EQ(num_pages, offset_index->page_locations().size()); + if (write_size_stats) { + ASSERT_EQ(num_pages, offset_index->unencoded_byte_array_data_bytes().size()); + } else { + ASSERT_TRUE(offset_index->unencoded_byte_array_data_bytes().empty()); + } for (size_t i = 0; i < num_pages; ++i) { const auto& page_location = offset_index->page_locations().at(i); ASSERT_EQ(offsets[i] + final_position, page_location.offset); ASSERT_EQ(page_sizes[i], page_location.compressed_page_size); ASSERT_EQ(first_row_indices[i], page_location.first_row_index); + if (write_size_stats) { + ASSERT_EQ(unencoded_byte_array_lengths[i], + offset_index->unencoded_byte_array_data_bytes()[i]); + } } } } +TEST(PageIndex, WriteOffsetIndexWithoutSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/false); +} + +TEST(PageIndex, WriteOffsetIndexWithSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/true); +} + +struct PageLevelHistogram { + std::vector def_levels; + std::vector rep_levels; +}; + +std::unique_ptr ConstructFakeSizeStatistics( + const ColumnDescriptor* descr, const PageLevelHistogram& page_level_histogram) { + auto stats = SizeStatistics::Make(descr); + stats->definition_level_histogram = page_level_histogram.def_levels; + stats->repetition_level_histogram = page_level_histogram.rep_levels; + return stats; +} + +void VerifyPageLevelHistogram(size_t page_id, + const std::vector& expected_page_levels, + const std::vector& all_page_levels) { + const size_t max_level = expected_page_levels.size() - 1; + const size_t offset = page_id * (max_level + 1); + for (size_t level = 0; level <= max_level; ++level) { + ASSERT_EQ(expected_page_levels[level], all_page_levels[offset + level]); + } +} + void TestWriteTypedColumnIndex(schema::NodePtr node, const std::vector& page_stats, - BoundaryOrder::type boundary_order, bool has_null_counts) { - auto descr = std::make_unique(node, /*max_definition_level=*/1, 0); - + BoundaryOrder::type boundary_order, bool has_null_counts, + int16_t max_definition_level = 1, + int16_t max_repetition_level = 0, + const std::vector& page_levels = {}) { + const bool build_size_stats = !page_levels.empty(); + if (build_size_stats) { + ASSERT_EQ(page_levels.size(), page_stats.size()); + } + auto descr = std::make_unique(node, max_definition_level, + max_repetition_level); auto builder = ColumnIndexBuilder::Make(descr.get()); - for (const auto& stats : page_stats) { - builder->AddPage(stats); + for (size_t i = 0; i < page_stats.size(); ++i) { + auto size_stats = build_size_stats + ? 
ConstructFakeSizeStatistics(descr.get(), page_levels[i]) + : std::make_unique(); + builder->AddPage(page_stats[i], *size_stats); } ASSERT_NO_THROW(builder->Finish()); @@ -482,6 +537,13 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, ASSERT_EQ(boundary_order, column_index->boundary_order()); ASSERT_EQ(has_null_counts, column_index->has_null_counts()); const size_t num_pages = column_index->null_pages().size(); + if (build_size_stats) { + ASSERT_EQ(num_pages * (max_repetition_level + 1), + column_index->repetition_level_histograms().size()); + ASSERT_EQ(num_pages * (max_definition_level + 1), + column_index->definition_level_histograms().size()); + } + for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]); @@ -489,6 +551,12 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); } + if (build_size_stats) { + ASSERT_NO_FATAL_FAILURE(VerifyPageLevelHistogram( + i, page_levels[i].def_levels, column_index->definition_level_histograms())); + ASSERT_NO_FATAL_FAILURE(VerifyPageLevelHistogram( + i, page_levels[i].rep_levels, column_index->repetition_level_histograms())); + } } } } @@ -640,7 +708,7 @@ TEST(PageIndex, WriteColumnIndexWithCorruptedStats) { ColumnDescriptor descr(schema::Int32("c1"), /*max_definition_level=*/1, 0); auto builder = ColumnIndexBuilder::Make(&descr); for (const auto& stats : page_stats) { - builder->AddPage(stats); + builder->AddPage(stats, SizeStatistics()); } ASSERT_NO_THROW(builder->Finish()); ASSERT_EQ(nullptr, builder->Build()); @@ -651,6 +719,31 @@ TEST(PageIndex, WriteColumnIndexWithCorruptedStats) { EXPECT_EQ(0, buffer->size()); } +TEST(PageIndex, WriteInt64ColumnIndexWithSizeStats) { + auto encode = [=](int64_t value) { + return std::string(reinterpret_cast(&value), sizeof(int64_t)); + }; + + // Integer values in the descending order. + std::vector page_stats(3); + page_stats.at(0).set_null_count(4).set_min(encode(-1)).set_max(encode(-2)); + page_stats.at(1).set_null_count(0).set_min(encode(-2)).set_max(encode(-3)); + page_stats.at(2).set_null_count(4).set_min(encode(-3)).set_max(encode(-4)); + + // Page level histograms. 
+ std::vector page_levels; + page_levels.push_back( + PageLevelHistogram{/*def_levels=*/{2, 4, 6, 8}, /*rep_levels=*/{10, 5, 5}}); + page_levels.push_back( + PageLevelHistogram{/*def_levels=*/{1, 3, 5, 7}, /*rep_levels=*/{4, 8, 4}}); + page_levels.push_back( + PageLevelHistogram{/*def_levels=*/{0, 2, 4, 6}, /*rep_levels=*/{3, 4, 5}}); + + TestWriteTypedColumnIndex(schema::Int64("c1"), page_stats, BoundaryOrder::Descending, + /*has_null_counts=*/true, /*max_definition_level=*/3, + /*max_repetition_level=*/2, page_levels); +} + TEST(PageIndex, TestPageIndexBuilderWithZeroRowGroup) { schema::NodeVector fields = {schema::Int32("c1"), schema::ByteArray("c2")}; schema::NodePtr root = schema::GroupNode::Make("schema", Repetition::REPEATED, fields); @@ -689,14 +782,15 @@ class PageIndexBuilderTest : public ::testing::Test { for (int column = 0; column < num_columns; ++column) { if (static_cast(column) < page_stats[row_group].size()) { auto column_index_builder = builder->GetColumnIndexBuilder(column); - ASSERT_NO_THROW(column_index_builder->AddPage(page_stats[row_group][column])); + ASSERT_NO_THROW( + column_index_builder->AddPage(page_stats[row_group][column], {})); ASSERT_NO_THROW(column_index_builder->Finish()); } if (static_cast(column) < page_locations[row_group].size()) { auto offset_index_builder = builder->GetOffsetIndexBuilder(column); - ASSERT_NO_THROW( - offset_index_builder->AddPage(page_locations[row_group][column])); + ASSERT_NO_THROW(offset_index_builder->AddPage(page_locations[row_group][column], + /*size_stats=*/{})); ASSERT_NO_THROW(offset_index_builder->Finish(final_position)); } } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 7f2e371df66d7..c942010396826 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -47,6 +47,16 @@ namespace parquet { /// DataPageV2 at all. enum class ParquetDataPageVersion { V1, V2 }; +/// Controls the level of size statistics that are written to the file. +enum class SizeStatisticsLevel : uint8_t { + // No size statistics are written. + None = 0, + // Only column chunk size statistics are written. + ColumnChunk, + // Both size statistics in the column chunk and page index are written. + PageAndColumnChunk +}; + /// Align the default buffer size to a small multiple of a page size. constexpr int64_t kDefaultBufferSize = 4096 * 4; @@ -56,6 +66,9 @@ constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000; // kDefaultStringSizeLimit. constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000; +// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file +constexpr int64_t kDefaultFooterReadSize = 64 * 1024; + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -120,6 +133,12 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } + // Set the default read size to read the footer from a file. For high latency + // file systems and files with large metadata (>64KB) this can increase performance + // by reducing the number of round-trips to retrieve the entire file metadata. + void set_footer_read_size(size_t size) { footer_read_size_ = size; } + size_t footer_read_size() const { return footer_read_size_; } + private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; @@ -129,6 +148,7 @@ class PARQUET_EXPORT ReaderProperties { bool page_checksum_verification_ = false; // Used with a RecordReader. 
bool read_dense_for_nullable_ = false; + size_t footer_read_size_ = kDefaultFooterReadSize; std::shared_ptr file_decryption_properties_; }; @@ -237,7 +257,8 @@ class PARQUET_EXPORT WriterProperties { data_page_version_(ParquetDataPageVersion::V1), created_by_(DEFAULT_CREATED_BY), store_decimal_as_integer_(false), - page_checksum_enabled_(false) {} + page_checksum_enabled_(false), + size_statistics_level_(SizeStatisticsLevel::None) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -639,6 +660,16 @@ class PARQUET_EXPORT WriterProperties { return this->disable_write_page_index(path->ToDotString()); } + /// \brief Set the level to write size statistics for all columns. Default is None. + /// + /// \param level The level to write size statistics. Note that if page index is not + /// enabled, page level size statistics will not be written even if the level + /// is set to PageAndColumnChunk. + Builder* set_size_statistics_level(SizeStatisticsLevel level) { + size_statistics_level_ = level; + return this; + } + /// \brief Build the WriterProperties with the builder parameters. /// \return The WriterProperties defined by the builder. std::shared_ptr build() { @@ -665,9 +696,9 @@ class PARQUET_EXPORT WriterProperties { return std::shared_ptr(new WriterProperties( pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, pagesize_, version_, created_by_, page_checksum_enabled_, - std::move(file_encryption_properties_), default_column_properties_, - column_properties, data_page_version_, store_decimal_as_integer_, - std::move(sorting_columns_))); + size_statistics_level_, std::move(file_encryption_properties_), + default_column_properties_, column_properties, data_page_version_, + store_decimal_as_integer_, std::move(sorting_columns_))); } private: @@ -681,6 +712,7 @@ class PARQUET_EXPORT WriterProperties { std::string created_by_; bool store_decimal_as_integer_; bool page_checksum_enabled_; + SizeStatisticsLevel size_statistics_level_; std::shared_ptr file_encryption_properties_; @@ -719,6 +751,10 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } + inline SizeStatisticsLevel size_statistics_level() const { + return size_statistics_level_; + } + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -812,6 +848,7 @@ class PARQUET_EXPORT WriterProperties { MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, bool page_write_checksum_enabled, + SizeStatisticsLevel size_statistics_level, std::shared_ptr file_encryption_properties, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties, @@ -827,6 +864,7 @@ class PARQUET_EXPORT WriterProperties { parquet_created_by_(created_by), store_decimal_as_integer_(store_short_decimal_as_integer), page_checksum_enabled_(page_write_checksum_enabled), + size_statistics_level_(size_statistics_level), file_encryption_properties_(file_encryption_properties), sorting_columns_(std::move(sorting_columns)), default_column_properties_(default_column_properties), @@ -842,6 +880,7 @@ class PARQUET_EXPORT WriterProperties { std::string parquet_created_by_; bool store_decimal_as_integer_; bool page_checksum_enabled_; + SizeStatisticsLevel size_statistics_level_; std::shared_ptr 
file_encryption_properties_; diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index b2c574413abf7..35fc11565914e 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -35,6 +35,7 @@ TEST(TestReaderProperties, Basics) { ReaderProperties props; ASSERT_EQ(props.buffer_size(), kDefaultBufferSize); + ASSERT_EQ(props.footer_read_size(), kDefaultFooterReadSize); ASSERT_FALSE(props.is_buffered_stream_enabled()); ASSERT_FALSE(props.page_checksum_verification()); } diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc new file mode 100644 index 0000000000000..a02cef7aba46f --- /dev/null +++ b/cpp/src/parquet/size_statistics.cc @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliancec +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/size_statistics.h" + +#include + +#include "arrow/util/logging.h" +#include "parquet/exception.h" +#include "parquet/schema.h" + +namespace parquet { + +void SizeStatistics::Merge(const SizeStatistics& other) { + if (repetition_level_histogram.size() != other.repetition_level_histogram.size()) { + throw ParquetException("Repetition level histogram size mismatch"); + } + if (definition_level_histogram.size() != other.definition_level_histogram.size()) { + throw ParquetException("Definition level histogram size mismatch"); + } + if (unencoded_byte_array_data_bytes.has_value() != + other.unencoded_byte_array_data_bytes.has_value()) { + throw ParquetException("Unencoded byte array data bytes are not consistent"); + } + std::transform(repetition_level_histogram.begin(), repetition_level_histogram.end(), + other.repetition_level_histogram.begin(), + repetition_level_histogram.begin(), std::plus<>()); + std::transform(definition_level_histogram.begin(), definition_level_histogram.end(), + other.definition_level_histogram.begin(), + definition_level_histogram.begin(), std::plus<>()); + if (unencoded_byte_array_data_bytes.has_value()) { + unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes.value() + + other.unencoded_byte_array_data_bytes.value(); + } +} + +void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { + ARROW_CHECK(unencoded_byte_array_data_bytes.has_value()); + unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes.value() + value; +} + +void SizeStatistics::Validate(const ColumnDescriptor* descr) const { + if (repetition_level_histogram.size() != + static_cast(descr->max_repetition_level() + 1)) { + throw ParquetException("Repetition level histogram size mismatch"); + } + if (definition_level_histogram.size() != + static_cast(descr->max_definition_level() + 1)) { + throw ParquetException("Definition level histogram size mismatch"); + } + if (unencoded_byte_array_data_bytes.has_value() && + 
descr->physical_type() != Type::BYTE_ARRAY) { + throw ParquetException("Unencoded byte array data bytes does not support " + + TypeToString(descr->physical_type())); + } + if (!unencoded_byte_array_data_bytes.has_value() && + descr->physical_type() == Type::BYTE_ARRAY) { + throw ParquetException("Missing unencoded byte array data bytes"); + } +} + +void SizeStatistics::Reset() { + repetition_level_histogram.assign(repetition_level_histogram.size(), 0); + definition_level_histogram.assign(definition_level_histogram.size(), 0); + if (unencoded_byte_array_data_bytes.has_value()) { + unencoded_byte_array_data_bytes = 0; + } +} + +std::unique_ptr SizeStatistics::Make(const ColumnDescriptor* descr) { + auto size_stats = std::make_unique(); + size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); + size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + if (descr->physical_type() == Type::BYTE_ARRAY) { + size_stats->unencoded_byte_array_data_bytes = 0; + } + return size_stats; +} + +} // namespace parquet diff --git a/cpp/src/parquet/size_statistics.h b/cpp/src/parquet/size_statistics.h new file mode 100644 index 0000000000000..c25e70ee36d8a --- /dev/null +++ b/cpp/src/parquet/size_statistics.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "parquet/platform.h" +#include "parquet/type_fwd.h" + +namespace parquet { + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine-grained filter push down on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +struct PARQUET_EXPORT SizeStatistics { + /// When present, there is expected to be one element corresponding to each + /// definition (i.e. size=max definition+1) where each element + /// represents the number of times the definition level was observed in the + /// data. + /// + /// This field may be omitted (a.k.a. zero-length vector) if max_definition_level + /// is 0 without loss of information. + std::vector definition_level_histogram; + + /// Same as definition_level_histogram except for repetition levels. + /// + /// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level + /// is 0 without loss of information. + std::vector repetition_level_histogram; + + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. 
In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + std::optional unencoded_byte_array_data_bytes; + + /// \brief Check if the SizeStatistics is set. + bool is_set() const { + return !repetition_level_histogram.empty() || !definition_level_histogram.empty() || + unencoded_byte_array_data_bytes.has_value(); + } + + /// \brief Increment the unencoded byte array data bytes. + void IncrementUnencodedByteArrayDataBytes(int64_t value); + + /// \brief Merge two SizeStatistics. + /// \throws ParquetException if SizeStatistics to merge is not compatible. + void Merge(const SizeStatistics& other); + + /// \brief Validate the SizeStatistics + /// \throws ParquetException if the histograms don't have the right length, + /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column. + void Validate(const ColumnDescriptor* descr) const; + + /// \brief Reset the SizeStatistics to be empty. + void Reset(); + + /// \brief Make an empty SizeStatistics object for specific type. + static std::unique_ptr Make(const ColumnDescriptor* descr); +}; + +} // namespace parquet diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc new file mode 100644 index 0000000000000..cefd31dce285d --- /dev/null +++ b/cpp/src/parquet/size_statistics_test.cc @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
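Before the round-trip tests that follow, a minimal usage sketch of the SizeStatistics API declared in size_statistics.h above; the BYTE_ARRAY column descriptor and the counts fed into it are illustrative only:

    #include <memory>

    #include "parquet/schema.h"
    #include "parquet/size_statistics.h"

    // Sketch: accumulate page-level size statistics and fold them into the
    // chunk-level object, mirroring what the column writer does per data page.
    void AccumulateExample() {
      auto node = parquet::schema::PrimitiveNode::Make(
          "c", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY);
      parquet::ColumnDescriptor descr(node, /*max_definition_level=*/1,
                                      /*max_repetition_level=*/0);

      auto page_stats = parquet::SizeStatistics::Make(&descr);
      page_stats->definition_level_histogram[1] += 10;        // ten non-null values
      page_stats->IncrementUnencodedByteArrayDataBytes(123);  // raw string bytes

      auto chunk_stats = parquet::SizeStatistics::Make(&descr);
      chunk_stats->Merge(*page_stats);  // throws ParquetException on shape mismatch
      page_stats->Reset();              // reuse for the next page
    }
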
+ +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/table.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/span.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/reader_internal.h" +#include "parquet/arrow/schema.h" +#include "parquet/arrow/writer.h" +#include "parquet/column_writer.h" +#include "parquet/file_writer.h" +#include "parquet/page_index.h" +#include "parquet/schema.h" +#include "parquet/size_statistics.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" +#include "parquet/types.h" + +namespace parquet { + +TEST(SizeStatistics, ThriftSerDe) { + const std::vector kDefLevels = {128, 64, 32, 16}; + const std::vector kRepLevels = {100, 80, 60, 40, 20}; + constexpr int64_t kUnencodedByteArrayDataBytes = 1234; + + for (const auto& descr : + {std::make_unique(schema::Int32("a"), /*max_def_level=*/3, + /*max_rep_level=*/4), + std::make_unique(schema::ByteArray("a"), /*max_def_level=*/3, + /*max_rep_level=*/4)}) { + auto size_statistics = SizeStatistics::Make(descr.get()); + size_statistics->repetition_level_histogram = kRepLevels; + size_statistics->definition_level_histogram = kDefLevels; + if (descr->physical_type() == Type::BYTE_ARRAY) { + size_statistics->IncrementUnencodedByteArrayDataBytes(kUnencodedByteArrayDataBytes); + } + auto thrift_statistics = ToThrift(*size_statistics); + auto restored_statistics = FromThrift(thrift_statistics); + EXPECT_EQ(restored_statistics.definition_level_histogram, kDefLevels); + EXPECT_EQ(restored_statistics.repetition_level_histogram, kRepLevels); + if (descr->physical_type() == Type::BYTE_ARRAY) { + EXPECT_TRUE(restored_statistics.unencoded_byte_array_data_bytes.has_value()); + EXPECT_EQ(restored_statistics.unencoded_byte_array_data_bytes.value(), + kUnencodedByteArrayDataBytes); + } else { + EXPECT_FALSE(restored_statistics.unencoded_byte_array_data_bytes.has_value()); + } + } +} + +bool operator==(const SizeStatistics& lhs, const SizeStatistics& rhs) { + return lhs.repetition_level_histogram == rhs.repetition_level_histogram && + lhs.definition_level_histogram == rhs.definition_level_histogram && + lhs.unencoded_byte_array_data_bytes == rhs.unencoded_byte_array_data_bytes; +} + +struct PageSizeStatistics { + std::vector def_levels; + std::vector rep_levels; + std::vector byte_array_bytes; + bool operator==(const PageSizeStatistics& other) const { + return def_levels == other.def_levels && rep_levels == other.rep_levels && + byte_array_bytes == other.byte_array_bytes; + } +}; + +class SizeStatisticsRoundTripTest : public ::testing::Test { + public: + void WriteFile(SizeStatisticsLevel level, + const std::shared_ptr<::arrow::Table>& table) { + auto writer_properties = WriterProperties::Builder() + .max_row_group_length(2) /* every row group has 2 rows */ + ->data_pagesize(1) /* every page has 1 row */ + ->enable_write_page_index() + ->enable_statistics() + ->set_size_statistics_level(level) + ->build(); + + // Get schema from table. + auto schema = table->schema(); + std::shared_ptr parquet_schema; + auto arrow_writer_properties = default_arrow_writer_properties(); + ASSERT_OK_NO_THROW(arrow::ToParquetSchema(schema.get(), *writer_properties, + *arrow_writer_properties, &parquet_schema)); + auto schema_node = + std::static_pointer_cast(parquet_schema->schema_root()); + + // Write table to buffer. 
+ auto sink = CreateOutputStream(); + auto pool = ::arrow::default_memory_pool(); + auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties); + std::unique_ptr arrow_writer; + ASSERT_OK(arrow::FileWriter::Make(pool, std::move(writer), schema, + arrow_writer_properties, &arrow_writer)); + ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table)); + ASSERT_OK_NO_THROW(arrow_writer->Close()); + ASSERT_OK_AND_ASSIGN(buffer_, sink->Finish()); + } + + void ReadSizeStatistics() { + auto read_properties = default_arrow_reader_properties(); + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); + + // Read row group size statistics in order. + auto metadata = reader->metadata(); + for (int i = 0; i < metadata->num_row_groups(); ++i) { + auto row_group_metadata = metadata->RowGroup(i); + for (int j = 0; j < metadata->num_columns(); ++j) { + auto column_metadata = row_group_metadata->ColumnChunk(j); + auto size_stats = column_metadata->size_statistics(); + row_group_stats_.push_back(size_stats ? *size_stats : SizeStatistics{}); + } + } + + // Read page size statistics in order. + auto page_index_reader = reader->GetPageIndexReader(); + ASSERT_NE(page_index_reader, nullptr); + + for (int i = 0; i < metadata->num_row_groups(); ++i) { + auto row_group_index_reader = page_index_reader->RowGroup(i); + ASSERT_NE(row_group_index_reader, nullptr); + + for (int j = 0; j < metadata->num_columns(); ++j) { + PageSizeStatistics page_stats; + + auto column_index = row_group_index_reader->GetColumnIndex(j); + if (column_index != nullptr) { + if (column_index->has_definition_level_histograms()) { + page_stats.def_levels = column_index->definition_level_histograms(); + } + if (column_index->has_repetition_level_histograms()) { + page_stats.rep_levels = column_index->repetition_level_histograms(); + } + } + + auto offset_index = row_group_index_reader->GetOffsetIndex(j); + if (offset_index != nullptr) { + page_stats.byte_array_bytes = offset_index->unencoded_byte_array_data_bytes(); + } + + page_stats_.emplace_back(std::move(page_stats)); + } + } + } + + void Reset() { + buffer_.reset(); + row_group_stats_.clear(); + page_stats_.clear(); + } + + protected: + std::shared_ptr buffer_; + std::vector row_group_stats_; + std::vector page_stats_; + inline static const SizeStatistics kEmptyRowGroupStats{}; + inline static const PageSizeStatistics kEmptyPageStats{}; +}; + +TEST_F(SizeStatisticsRoundTripTest, EnableSizeStats) { + auto schema = ::arrow::schema({ + ::arrow::field("a", ::arrow::list(::arrow::list(::arrow::int32()))), + ::arrow::field("b", ::arrow::list(::arrow::list(::arrow::utf8()))), + }); + // First two rows are in one row group, and the other two rows are in another row group. 
+ auto table = ::arrow::TableFromJSON(schema, {R"([ + [ [[1],[1,1],[1,1,1]], [["a"],["a","a"],["a","a","a"]] ], + [ [[0,1,null]], [["foo","bar",null]] ], + [ [], [] ], + [ [[],[null],null], [[],[null],null] ] + ])"}); + + for (auto size_stats_level : + {SizeStatisticsLevel::None, SizeStatisticsLevel::ColumnChunk, + SizeStatisticsLevel::PageAndColumnChunk}) { + WriteFile(size_stats_level, table); + ReadSizeStatistics(); + + if (size_stats_level == SizeStatisticsLevel::None) { + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(kEmptyRowGroupStats, kEmptyRowGroupStats, + kEmptyRowGroupStats, kEmptyRowGroupStats)); + } else { + EXPECT_THAT(row_group_stats_, ::testing::ElementsAre( + SizeStatistics{/*def_levels=*/{0, 0, 0, 0, 1, 8}, + /*rep_levels=*/{2, 2, 5}, + /*byte_array_bytes=*/std::nullopt}, + SizeStatistics{/*def_levels=*/{0, 0, 0, 0, 1, 8}, + /*rep_levels=*/{2, 2, 5}, + /*byte_array_bytes=*/12}, + SizeStatistics{/*def_levels=*/{0, 1, 1, 1, 1, 0}, + /*rep_levels=*/{2, 2, 0}, + /*byte_array_bytes=*/std::nullopt}, + SizeStatistics{/*def_levels=*/{0, 1, 1, 1, 1, 0}, + /*rep_levels=*/{2, 2, 0}, + /*byte_array_bytes=*/0})); + } + + if (size_stats_level == SizeStatisticsLevel::PageAndColumnChunk) { + EXPECT_THAT( + page_stats_, + ::testing::ElementsAre( + PageSizeStatistics{/*def_levels=*/{0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 1, 2}, + /*rep_levels=*/{1, 2, 3, 1, 0, 2}, + /*byte_array_bytes=*/{}}, + PageSizeStatistics{/*def_levels=*/{0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 1, 2}, + /*rep_levels=*/{1, 2, 3, 1, 0, 2}, + /*byte_array_bytes=*/{6, 6}}, + PageSizeStatistics{/*def_levels=*/{0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0}, + /*rep_levels=*/{1, 0, 0, 1, 2, 0}, + /*byte_array_bytes=*/{}}, + PageSizeStatistics{/*def_levels=*/{0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0}, + /*rep_levels=*/{1, 0, 0, 1, 2, 0}, + /*byte_array_bytes=*/{0, 0}})); + } else { + EXPECT_THAT(page_stats_, ::testing::ElementsAre(kEmptyPageStats, kEmptyPageStats, + kEmptyPageStats, kEmptyPageStats)); + } + + Reset(); + } +} + +TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) { + auto schema = ::arrow::schema( + {::arrow::field("a", ::arrow::dictionary(::arrow::int16(), ::arrow::utf8()))}); + WriteFile( + SizeStatisticsLevel::PageAndColumnChunk, + ::arrow::TableFromJSON(schema, {R"([["aa"],["aaa"],[null],["a"],["aaa"],["a"]])"})); + + ReadSizeStatistics(); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/5}, + SizeStatistics{/*def_levels=*/{1, 1}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/1}, + SizeStatistics{/*def_levels=*/{0, 2}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/4})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/{5}}, + PageSizeStatistics{/*def_levels=*/{1, 1}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/{1}}, + PageSizeStatistics{/*def_levels=*/{0, 2}, + /*rep_levels=*/{2}, + /*byte_array_bytes=*/{4}})); +} + +} // namespace parquet diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index e7bfd434c81a8..744af743118e2 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -43,6 +43,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/size_statistics.h" #include "parquet/statistics.h" #include "parquet/types.h" @@ -254,6 +255,14 @@ static inline SortingColumn FromThrift(format::SortingColumn thrift_sorting_colu 
return sorting_column; } +static inline SizeStatistics FromThrift(const format::SizeStatistics& size_stats) { + return SizeStatistics{ + size_stats.definition_level_histogram, size_stats.repetition_level_histogram, + size_stats.__isset.unencoded_byte_array_data_bytes + ? std::make_optional(size_stats.unencoded_byte_array_data_bytes) + : std::nullopt}; +} + // ---------------------------------------------------------------------- // Convert Thrift enums from Parquet enums @@ -383,6 +392,17 @@ static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryptio return encryption_algorithm; } +static inline format::SizeStatistics ToThrift(const SizeStatistics& size_stats) { + format::SizeStatistics size_statistics; + size_statistics.__set_definition_level_histogram(size_stats.definition_level_histogram); + size_statistics.__set_repetition_level_histogram(size_stats.repetition_level_histogram); + if (size_stats.unencoded_byte_array_data_bytes.has_value()) { + size_statistics.__set_unencoded_byte_array_data_bytes( + size_stats.unencoded_byte_array_data_bytes.value()); + } + return size_statistics; +} + // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index da0d0f7bdee96..cda0dc5a77e1f 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -68,7 +68,10 @@ struct ParquetVersion { }; }; +struct PageIndexLocation; + class FileMetaData; +class FileCryptoMetaData; class RowGroupMetaData; class ColumnDescriptor; @@ -82,10 +85,22 @@ class WriterPropertiesBuilder; class ArrowWriterProperties; class ArrowWriterPropertiesBuilder; +class EncodedStatistics; +class Statistics; +struct SizeStatistics; + +class ColumnIndex; +class OffsetIndex; + namespace arrow { class FileWriter; class FileReader; } // namespace arrow + +namespace schema { +class ColumnPath; +} // namespace schema + } // namespace parquet diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7b50ed48d06b0..bee75c335afd5 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include #include #include @@ -95,31 +96,46 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { const char* bytes = val.data(); switch (parquet_type) { - case Type::BOOLEAN: - result << reinterpret_cast(bytes)[0]; + case Type::BOOLEAN: { + bool value{}; + std::memcpy(&value, bytes, sizeof(bool)); + result << value; break; - case Type::INT32: - result << reinterpret_cast(bytes)[0]; + } + case Type::INT32: { + int32_t value{}; + std::memcpy(&value, bytes, sizeof(int32_t)); + result << value; break; - case Type::INT64: - result << reinterpret_cast(bytes)[0]; + } + case Type::INT64: { + int64_t value{}; + std::memcpy(&value, bytes, sizeof(int64_t)); + result << value; break; - case Type::DOUBLE: - result << reinterpret_cast(bytes)[0]; + } + case Type::DOUBLE: { + double value{}; + std::memcpy(&value, bytes, sizeof(double)); + result << value; break; - case Type::FLOAT: - result << reinterpret_cast(bytes)[0]; + } + case Type::FLOAT: { + float value{}; + std::memcpy(&value, bytes, sizeof(float)); + result << value; break; + } case Type::INT96: { - auto const i32_val = reinterpret_cast(bytes); - result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2]; + std::array values{}; + std::memcpy(values.data(), bytes, 3 * sizeof(int32_t)); + result << values[0] << " " << values[1] << " " << values[2]; break; } - case Type::BYTE_ARRAY: { - return std::string(val); - } + case Type::BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: { - return std::string(val); + result << val; + break; } case Type::UNDEFINED: default: diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 6a5666d8f06b2..e8c387a1f3946 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 20f659176882d..58b65a3a7130a 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index cccf8394ad942..6c7f311bf821d 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index a574effdcee26..c7ef3e3afd50b 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index 3e63d8c895e0a..08215ef323d8c 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj 
b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index fa7c845517bd0..dd5c595e0d515 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 9c51f979aeadd..3c48b4c71ae6c 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,15 +15,24 @@ net8.0 - - - + all runtime; build; native; contentfiles; analyzers + + + + all + runtime; build; native; contentfiles; analyzers + + + + + + - + diff --git a/dev/archery/archery/release/core.py b/dev/archery/archery/release/core.py index d6eab45e1804c..bbaba2f648f29 100644 --- a/dev/archery/archery/release/core.py +++ b/dev/archery/archery/release/core.py @@ -25,7 +25,6 @@ from git import Repo from github import Github -from jira import JIRA from semver import VersionInfo as SemVer from ..utils.source import ArrowSources @@ -50,14 +49,6 @@ def __init__(self, released=False, release_date=None, **kwargs): def parse(cls, version, **kwargs): return cls(**SemVer.parse(version).to_dict(), **kwargs) - @classmethod - def from_jira(cls, jira_version): - return cls.parse( - jira_version.name, - released=jira_version.released, - release_date=getattr(jira_version, 'releaseDate', None) - ) - @classmethod def from_milestone(cls, milestone): return cls.parse( @@ -76,14 +67,6 @@ def __init__(self, key, type, summary, github_issue=None): self.github_issue_id = getattr(github_issue, "number", None) self._github_issue = github_issue - @classmethod - def from_jira(cls, jira_issue): - return cls( - key=jira_issue.key, - type=jira_issue.fields.issuetype.name, - summary=jira_issue.fields.summary - ) - @classmethod def from_github(cls, github_issue): return cls( @@ -117,15 +100,6 @@ def is_pr(self): return bool(self._github_issue and self._github_issue.pull_request) -class Jira(JIRA): - - def __init__(self, url='https://issues.apache.org/jira'): - super().__init__(url) - - def issue(self, key): - return Issue.from_jira(super().issue(key)) - - class IssueTracker: def __init__(self, github_token=None): @@ -401,10 +375,6 @@ def commits(self): commit_range = f"{lower}..{upper}" return list(map(Commit, self.repo.iter_commits(commit_range))) - @cached_property - def jira_instance(self): - return Jira() - @cached_property def default_branch(self): default_branch_name = os.getenv("ARCHERY_DEFAULT_BRANCH") @@ -459,20 +429,12 @@ def curate(self, minimal=False): else: outside.append( (self.issue_tracker.issue(int(c.issue_id)), c)) - elif c.project == 'ARROW': - if c.issue in release_issues: - within.append((release_issues[c.issue], c)) - else: - outside.append((self.jira_instance.issue(c.issue), c)) - elif c.project == 'PARQUET': - parquet.append((self.jira_instance.issue(c.issue), c)) else: warnings.warn( - f'Issue {c.issue} does not pertain to GH' + - ', ARROW or PARQUET') + f'Issue {c.issue} does not pertain to GH') outside.append((c.issue, c)) - # remaining jira tickets + # remaining tickets within_keys = {i.key for i, c in within} # Take into account that some issues milestoned are prs nopatch = [issue for key, issue in release_issues.items() @@ -488,12 +450,10 @@ def changelog(self): # get organized report for the release curation = self.curate() - # jira tickets having patches in the release + # issues having patches in 
the release issue_commit_pairs.extend(curation.within) - # parquet patches in the release - issue_commit_pairs.extend(curation.parquet) - # jira tickets without patches + # issues without patches for issue in curation.nopatch: issue_commit_pairs.append((issue, None)) @@ -576,7 +536,7 @@ def cherry_pick_commits(self, recreate_branch=True): logger.info(f"Checking out branch {self.branch}") self.repo.git.checkout(self.branch) - # cherry pick the commits based on the jira tickets + # cherry pick the commits based on the GH issue for commit in self.commits_to_pick(): logger.info(f"Cherry-picking commit {commit.hexsha}") self.repo.git.cherry_pick(commit.hexsha) diff --git a/dev/archery/archery/release/tests/test_release.py b/dev/archery/archery/release/tests/test_release.py index 22b43c7cb3bc4..fae2bdcea04a0 100644 --- a/dev/archery/archery/release/tests/test_release.py +++ b/dev/archery/archery/release/tests/test_release.py @@ -21,7 +21,6 @@ Release, MajorRelease, MinorRelease, PatchRelease, IssueTracker, Version, Issue, CommitTitle, Commit ) -from archery.testing import DotDict # subset of issues per revision @@ -141,22 +140,6 @@ def test_issue(fake_issue_tracker): assert i.project == "PARQUET" assert i.number == 1111 - fake_jira_issue = DotDict({ - 'key': 'ARROW-2222', - 'fields': { - 'issuetype': { - 'name': 'Feature' - }, - 'summary': 'Issue title' - } - }) - i = Issue.from_jira(fake_jira_issue) - assert i.key == "ARROW-2222" - assert i.type == "Feature" - assert i.summary == "Issue title" - assert i.project == "ARROW" - assert i.number == 2222 - def test_commit_title(): t = CommitTitle.parse( diff --git a/dev/archery/archery/templates/release_changelog.md.j2 b/dev/archery/archery/templates/release_changelog.md.j2 index 0eedb217a8b84..9fa9a1476af6f 100644 --- a/dev/archery/archery/templates/release_changelog.md.j2 +++ b/dev/archery/archery/templates/release_changelog.md.j2 @@ -23,11 +23,7 @@ ## {{ category }} {% for issue, commit in issue_commit_pairs -%} -{% if issue.project in ('ARROW', 'PARQUET') -%} -* [{{ issue.key }}](https://issues.apache.org/jira/browse/{{ issue.key }}) - {{ commit.title.to_string(with_issue=False) if commit else issue.summary | md }} -{% else -%} * [GH-{{ issue.key }}](https://github.com/apache/arrow/issues/{{ issue.key }}) - {{ commit.title.to_string(with_issue=False) if commit else issue.summary | md }} -{% endif -%} {% endfor %} {% endfor %} diff --git a/dev/archery/archery/testing.py b/dev/archery/archery/testing.py index 471a54d4c72cf..3b1061ac85fa4 100644 --- a/dev/archery/archery/testing.py +++ b/dev/archery/archery/testing.py @@ -21,19 +21,6 @@ import re -class DotDict(dict): - - def __getattr__(self, key): - try: - item = self[key] - except KeyError: - raise AttributeError(key) - if isinstance(item, dict): - return DotDict(item) - else: - return item - - class PartialEnv(dict): def __eq__(self, other): diff --git a/dev/archery/setup.py b/dev/archery/setup.py index f1e0df6231436..6587e61546b5a 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -39,9 +39,9 @@ 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], 'numpydoc': ['numpydoc==1.1.0'], - 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], + 'release': ['pygithub', jinja_req, 'semver', 'gitpython'], } -extras['bot'] = extras['crossbow'] + ['pygithub', 'jira'] +extras['bot'] = extras['crossbow'] + ['pygithub'] extras['all'] = list(set(functools.reduce(operator.add, extras.values()))) setup( diff --git 
a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py index 788d1df0ab3eb..473f95ae37e2e 100755 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -121,17 +121,29 @@ def _download_url(self, url, dest_path, *, extra_args=None): dest_path, url, ] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = proc.communicate() - if proc.returncode != 0: - try: - # Don't leave possibly partial file around - os.remove(dest_path) - except IOError: - pass - raise Exception(f"Downloading {url} failed\n" - f"stdout: {stdout}\nstderr: {stderr}") + # Retry subprocess in case it fails with OpenSSL Connection errors + # https://issues.apache.org/jira/browse/INFRA-25274 + for attempt in range(5): + if attempt > 0: + delay = attempt * 3 + print(f"Waiting {delay} seconds before retrying {url}") + time.sleep(delay) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + if proc.returncode != 0: + try: + # Don't leave possibly partial file around + os.remove(dest_path) + except IOError: + pass + if "OpenSSL" not in stderr: + # We assume curl has already retried on other errors. + break + else: + return + raise Exception(f"Downloading {url} failed\n" + f"stdout: {stdout}\nstderr: {stderr}") def _curl_version(self): cmd = ["curl", "--version"] diff --git a/dev/release/setup-ubuntu.sh b/dev/release/setup-ubuntu.sh index b877c1225ab6a..686507d6257a3 100755 --- a/dev/release/setup-ubuntu.sh +++ b/dev/release/setup-ubuntu.sh @@ -22,27 +22,20 @@ set -exu -codename=$(. /etc/os-release && echo ${UBUNTU_CODENAME}) +version=$(. /etc/os-release && echo ${VERSION_ID}) -case ${codename} in - *) - nlohmann_json=3 - python=3 - apt-get update -y -q - apt-get install -y -q --no-install-recommends \ - llvm-dev - ;; -esac +apt-get update -y -q -case ${codename} in - focal) - ;; - *) - apt-get update -y -q - apt-get install -y -q --no-install-recommends \ - libxsimd-dev - ;; -esac +if [ ${version} \> "20.04" ]; then + apt-get install -y -q --no-install-recommends \ + libxsimd-dev +fi + +if [ ${version} \> "22.04" ]; then + # Some tests rely on legacy timezone aliases such as "US/Pacific" + apt-get install -y -q --no-install-recommends \ + tzdata-legacy +fi apt-get install -y -q --no-install-recommends \ build-essential \ @@ -58,10 +51,10 @@ apt-get install -y -q --no-install-recommends \ libsqlite3-dev \ libssl-dev \ ninja-build \ - nlohmann-json${nlohmann_json}-dev \ + nlohmann-json3-dev \ pkg-config \ - python${python}-dev \ - python${python}-venv \ + python3-dev \ + python3-venv \ python3-pip \ ruby-dev \ tzdata \ diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index 8c54fe5c11cf1..7cef30357d141 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -124,8 +124,13 @@ if [ "${TYPE}" = "local" ]; then if [ -f "${keys}" ]; then gpg \ --no-default-keyring \ - --keyring /usr/share/keyrings/apache-arrow-apt-source.gpg \ + --keyring /tmp/apache-arrow-apt-source.kbx \ --import "${keys}" + gpg \ + --no-default-keyring \ + --keyring /tmp/apache-arrow-apt-source.kbx \ + --armor \ + --export > /usr/share/keyrings/apache-arrow-apt-source.asc fi else case "${TYPE}" in diff --git a/dev/tasks/fuzz-tests/github.oss-fuzz.yml b/dev/tasks/fuzz-tests/github.oss-fuzz.yml index d7cf516266831..e591499b0ef0a 100644 --- a/dev/tasks/fuzz-tests/github.oss-fuzz.yml +++ b/dev/tasks/fuzz-tests/github.oss-fuzz.yml @@ -33,6 +33,12 @@ jobs: run: | git clone 
--depth=50 https://github.com/google/oss-fuzz.git + - uses: actions/setup-python@v5 + # Use a Python version that's compatible with the pinned requirements + # for dependencies below. + with: + python-version: '3.11' + - name: Install dependencies working-directory: oss-fuzz run: | diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules index bf7a85c8c8bcc..382611f6ec927 100755 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules @@ -12,8 +12,13 @@ export DH_OPTIONS override_dh_auto_build: gpg \ --no-default-keyring \ - --keyring ./apache-arrow-apt-source.gpg \ + --keyring ./apache-arrow-apt-source.kbx \ --import KEYS + gpg \ + --no-default-keyring \ + --keyring ./apache-arrow-apt-source.kbx \ + --armor \ + --export > apache-arrow-apt-source.asc ( \ distribution=$$(lsb_release --id --short | tr 'A-Z' 'a-z'); \ @@ -22,12 +27,12 @@ override_dh_auto_build: echo "URIs: https://apache.jfrog.io/artifactory/arrow/$${distribution}/"; \ echo "Suites: $${code_name}"; \ echo "Components: main"; \ - echo "Signed-By: /usr/share/keyrings/apache-arrow-apt-source.gpg"; \ + echo "Signed-By: /usr/share/keyrings/apache-arrow-apt-source.asc"; \ ) > apache-arrow.sources override_dh_install: install -d debian/tmp/usr/share/keyrings/ - install -m 0644 apache-arrow-apt-source.gpg \ + install -m 0644 apache-arrow-apt-source.asc \ debian/tmp/usr/share/keyrings/ install -d debian/tmp/etc/apt/sources.list.d/ diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 221d2a48b87df..dded9492f0c4d 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -158,14 +158,12 @@ env: - name: Install gemfury client on ARM self-hosted if: runner.arch != 'X64' run: | - # GH-36692: Pin gemfury due to wrong faraday dependency declaration. - gem install --user-install gemfury -v 0.12.0 + gem install --user-install gemfury ruby -r rubygems -e 'puts("#{Gem.user_dir}/bin")' >> $GITHUB_PATH - name: Install gemfury client if: runner.arch == 'X64' run: | - # GH-36692: Pin gemfury due to wrong faraday dependency declaration. 
- gem install gemfury -v 0.12.0 + gem install gemfury - name: Upload package to Gemfury shell: bash run: | diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index f083b7c0c8f61..ec5b9b31da8e1 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -50,6 +50,15 @@ jobs: {{ macros.github_install_archery()|indent }} {{ macros.github_login_dockerhub()|indent }} + - name: Prepare + run: | + if [ "${PYTHON_ABI_TAG}" = "cp313t" ]; then + test_image_prefix=python-free-threaded + else + test_image_prefix=python + fi + echo "TEST_IMAGE_PREFIX=${test_image_prefix}" >> ${GITHUB_ENV} + - name: Build wheel shell: bash env: @@ -72,23 +81,11 @@ jobs: # TODO(kszucs): auditwheel show - name: Test wheel - if: | - '{{ python_abi_tag }}' != 'cp313t' shell: bash run: | source arrow/ci/scripts/util_enable_core_dumps.sh - archery docker run python-wheel-manylinux-test-imports - archery docker run python-wheel-manylinux-test-unittests - - # Free-threaded wheels need to be tested using a different Docker Compose service - - name: Test free-threaded wheel - if: | - '{{ python_abi_tag }}' == 'cp313t' - shell: bash - run: | - source arrow/ci/scripts/util_enable_core_dumps.sh - archery docker run python-free-threaded-wheel-manylinux-test-imports - archery docker run python-free-threaded-wheel-manylinux-test-unittests + archery docker run ${TEST_IMAGE_PREFIX}-wheel-manylinux-test-imports + archery docker run ${TEST_IMAGE_PREFIX}-wheel-manylinux-test-unittests - name: Test wheel on AlmaLinux 8 shell: bash @@ -136,14 +133,29 @@ jobs: -e TEST_WHEELS=1 \ ubuntu-verify-rc + - name: Test wheel on Ubuntu 24.04 + shell: bash + if: | + '{{ python_version }}' == '3.12' + env: + UBUNTU: "24.04" + run: | + archery docker run \ + -e TEST_DEFAULT=0 \ + -e TEST_PYARROW_VERSION={{ arrow.no_rc_version }} \ + -e TEST_PYTHON_VERSIONS={{ python_version }} \ + -e TEST_WHEEL_PLATFORM_TAGS={{ wheel_platform_tag }} \ + -e TEST_WHEELS=1 \ + ubuntu-verify-rc + {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} {{ macros.github_upload_gemfury("arrow/python/repaired_wheels/*.whl")|indent }} {{ macros.github_upload_wheel_scientific_python("arrow/python/repaired_wheels/*.whl")|indent }} {% if arrow.is_default_branch() %} - - name: Push Docker Image + - name: Push Docker images shell: bash run: | archery docker push python-wheel-manylinux-{{ manylinux_version }} - archery docker push python-wheel-manylinux-test-unittests + archery docker push ${TEST_IMAGE_PREFIX}-wheel-manylinux-test-unittests {% endif %} diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 1799bd6ad6b6f..031bad94227e8 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -89,6 +89,7 @@ jobs: --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ + --x-feature=orc \ --x-feature=parquet \ --x-feature=s3 diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 839e3d5341070..181e978569104 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -40,7 +40,7 @@ jobs: - uses: r-lib/actions/setup-r@v2 with: - install-r: false + install-r: true - name: Build R source package shell: bash @@ -447,7 +447,7 @@ jobs: - name: Install R uses: r-lib/actions/setup-r@v2 with: - install-r: false + install-r: true - name: Rename artifacts shell: Rscript {0} run: | diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 
ff1de34a3db32..c43df2b6f2502 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -74,9 +74,13 @@ groups: test: - test-* + # Can be removed after we improved C++20 support + - ~test-debian-experimental-cpp-gcc-15 cpp: - test-*cpp* + # Can be removed after we improved C++20 support + - ~test-debian-experimental-cpp-gcc-15 - example-*cpp* c-glib: @@ -157,6 +161,8 @@ groups: nightly-tests: - test-* + # Can be removed after we improved C++20 support + - ~test-debian-experimental-cpp-gcc-15 - example-* nightly-packaging: @@ -935,7 +941,7 @@ tasks: params: env: UBUNTU: 24.04 - GCC_VERSION: 13 + GCC: 13 image: ubuntu-cpp-bundled test-ubuntu-24.04-cpp: @@ -954,7 +960,7 @@ tasks: params: env: CLANG_TOOLS: 15 - GCC_VERSION: 14 + GCC: 14 LLVM: 15 UBUNTU: 24.04 # rapidjson 1.1.0 has an error caught by gcc 14. @@ -997,6 +1003,17 @@ tasks: image: debian-cpp {% endfor %} + test-debian-experimental-cpp-gcc-15: + ci: github + template: docker-tests/github.linux.yml + params: + env: + ARCH: "amd64" + DEBIAN: "experimental" + GCC: "15" + flags: "-e CMAKE_CXX_STANDARD=20" + image: debian-cpp + test-fedora-39-cpp: ci: github template: docker-tests/github.linux.yml @@ -1301,7 +1318,7 @@ tasks: params: env: UBUNTU: 22.04 - GCC_VERSION: 11 + GCC: 11 image: ubuntu-r-only-r # This also has -flto=auto @@ -1311,7 +1328,7 @@ tasks: params: env: UBUNTU: 22.04 - GCC_VERSION: 12 + GCC: 12 image: ubuntu-r-only-r test-r-minimal-build: diff --git a/docker-compose.yml b/docker-compose.yml index 6e699f9ef385c..4b42a8145f770 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -188,8 +188,6 @@ volumes: name: ${ARCH}-conda-ccache debian-ccache: name: ${ARCH}-debian-${DEBIAN}-ccache - debian-rust: - name: ${ARCH}-debian-${DEBIAN}-rust fedora-ccache: name: ${ARCH}-fedora-${FEDORA}-ccache maven-cache: @@ -346,7 +344,7 @@ services: # docker compose run --rm debian-cpp # Parameters: # ARCH: amd64, arm64v8, ... - # DEBIAN: 12 + # DEBIAN: 12, experimental image: ${REPO}:${ARCH}-debian-${DEBIAN}-cpp build: context: . 
@@ -355,6 +353,7 @@ services: - ${REPO}:${ARCH}-debian-${DEBIAN}-cpp args: arch: ${ARCH} + gcc: ${GCC} llvm: ${LLVM} shm_size: *shm-size ulimits: *ulimits @@ -389,8 +388,8 @@ services: arch: ${ARCH} base: "${ARCH}/ubuntu:${UBUNTU}" clang_tools: ${CLANG_TOOLS} + gcc: ${GCC} llvm: ${LLVM} - gcc_version: ${GCC_VERSION} shm_size: *shm-size cap_add: - SYS_ADMIN @@ -425,8 +424,8 @@ services: arch: ${ARCH} base: "${ARCH}/ubuntu:${UBUNTU}" clang_tools: ${CLANG_TOOLS} + gcc: ${GCC} llvm: ${LLVM} - gcc_version: ${GCC_VERSION} shm_size: *shm-size cap_add: - SYS_ADMIN @@ -639,6 +638,7 @@ services: ARROW_FLIGHT_SQL: "OFF" ARROW_FUZZING: "ON" # Check fuzz regressions ARROW_JEMALLOC: "OFF" + ARROW_MIMALLOC: "OFF" ARROW_ORC: "OFF" ARROW_S3: "OFF" ARROW_USE_ASAN: "ON" @@ -677,6 +677,7 @@ services: ARROW_FLIGHT: "OFF" ARROW_FLIGHT_SQL: "OFF" ARROW_JEMALLOC: "OFF" + ARROW_MIMALLOC: "OFF" ARROW_ORC: "OFF" ARROW_USE_TSAN: "ON" command: *cpp-command @@ -1516,12 +1517,12 @@ services: - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-r-${R} args: arch: ${ARCH} - r: ${R} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - gcc_version: ${GCC_VERSION} - tz: ${TZ} - r_prune_deps: ${R_PRUNE_DEPS} + gcc: ${GCC} + r: ${R} r_duckdb_dev: ${R_DUCKDB_DEV:-} + r_prune_deps: ${R_PRUNE_DEPS} + tz: ${TZ} shm_size: *shm-size environment: <<: [*common, *ccache, *sccache] @@ -1807,7 +1808,6 @@ services: ARROW_SUBSTRAIT: "ON" BUILD_DOCS_C_GLIB: "ON" BUILD_DOCS_CPP: "ON" - BUILD_DOCS_JAVA: "ON" BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" @@ -1820,8 +1820,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/c_glib_build.sh /arrow /build && /arrow/ci/scripts/r_build.sh /arrow /build && - /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/java_build.sh /arrow /build" + /arrow/ci/scripts/js_build.sh /arrow /build" ################################# Tools ##################################### diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst index a1e9420bfd34e..c076e0b8c1a67 100644 --- a/docs/source/cpp/flight.rst +++ b/docs/source/cpp/flight.rst @@ -362,38 +362,4 @@ Closing unresponsive connections .. _ARROW-16697: https://issues.apache.org/jira/browse/ARROW-16697 .. _ARROW-6062: https://issues.apache.org/jira/browse/ARROW-6062 - -Alternative Transports -====================== - -The standard transport for Arrow Flight is gRPC_. The C++ -implementation also experimentally supports a transport based on -UCX_. To use it, use the protocol scheme ``ucx:`` when starting a -server or creating a client. - -UCX Transport -------------- - -Not all features of the gRPC transport are supported. See -:ref:`status-flight-rpc` for details. Also note these specific -caveats: - -- The server creates an independent UCP worker for each client. This - consumes more resources but provides better throughput. -- The client creates an independent UCP worker for each RPC - call. Again, this trades off resource consumption for - performance. This also means that unlike with gRPC, it is - essentially equivalent to make all calls with a single client or - with multiple clients. -- The UCX transport attempts to avoid copies where possible. In some - cases, it can directly reuse UCX-allocated buffers to back - :class:`arrow::Buffer` objects, however, this will also extend the - lifetime of associated UCX resources beyond the lifetime of the - Flight client or server object. 
-- Depending on the transport that UCX itself selects, you may find - that increasing ``UCX_MM_SEG_SIZE`` from the default (around 8KB) to - around 60KB improves performance (UCX will copy more data in a - single call). - .. _gRPC: https://grpc.io/ -.. _UCX: https://openucx.org/ diff --git a/docs/source/developers/cpp/emscripten.rst b/docs/source/developers/cpp/emscripten.rst index b4c563aae1a3b..bfa0c5bc35021 100644 --- a/docs/source/developers/cpp/emscripten.rst +++ b/docs/source/developers/cpp/emscripten.rst @@ -33,7 +33,9 @@ activate it using the commands below (see https://emscripten.org/docs/getting_st git clone https://github.com/emscripten-core/emsdk.git cd emsdk # replace with the desired EMSDK version. - # e.g. for Pyodide 0.24, you need EMSDK version 3.1.45 + # e.g. for Pyodide 0.26, you need EMSDK version 3.1.58 + # the versions can be found in the Makefile.envs file in the Pyodide repo: + # https://github.com/pyodide/pyodide/blob/10b484cfe427e076c929a55dc35cfff01ea8d3bc/Makefile.envs ./emsdk install ./emsdk activate source ./emsdk_env.sh @@ -46,8 +48,8 @@ versions of emsdk tools. .. code:: shell # install Pyodide build tools. - # e.g. for version 0.24 of Pyodide: - pip install pyodide-build==0.24 + # e.g., for version 0.26 of Pyodide, pyodide-build 0.26 and later work + pip install "pyodide-build>=0.26" Then build with the ``ninja-release-emscripten`` CMake preset, like below: @@ -69,8 +71,7 @@ go to ``arrow/python`` and run pyodide build It should make a wheel targeting the currently enabled version of -Pyodide (i.e. the version corresponding to the currently installed -``pyodide-build``) in the ``dist`` subdirectory. +Pyodide in the ``dist`` subdirectory. Manual Build @@ -85,9 +86,8 @@ you will need to override. In particular you will need: #. ``CMAKE_TOOLCHAIN_FILE`` set by using ``emcmake cmake`` instead of just ``cmake``. -#. You will quite likely need to set ``ARROW_ENABLE_THREADING`` to ``OFF`` - for builds targeting single threaded Emscripten environments such as - Pyodide. +#. You will need to set ``ARROW_ENABLE_THREADING`` to ``OFF`` for builds + targeting single-threaded Emscripten environments such as Pyodide. #. ``ARROW_FLIGHT`` and anything else that uses network probably won't work. diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 33c937ea34820..9ef6a933528f8 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1619,6 +1619,7 @@ example as above, an alternate encoding could be: :: 0 EOS +.. _format_metadata: Custom Application Metadata --------------------------- diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index 2c5487d857ea4..2a34db0f1ba50 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -333,9 +333,14 @@ schemes for the given transports: +----------------------------+--------------------------------+ | (reuse connection) | arrow-flight-reuse-connection: | +----------------------------+--------------------------------+ -| UCX_ (plaintext) | ucx: | +| UCX_ (plaintext) (1) | ucx: | +----------------------------+--------------------------------+ +Notes: + +* \(1) Flight UCX transport has been deprecated on the 19.0.0 release. + The :ref:`dissociated-ipc` section proposes an alternative solution. + .. 
_UCX: https://openucx.org/ Connection Reuse diff --git a/docs/source/format/Intro.rst b/docs/source/format/Intro.rst index c230be724dbcc..8635d6bc219ea 100644 --- a/docs/source/format/Intro.rst +++ b/docs/source/format/Intro.rst @@ -71,7 +71,7 @@ In a columnar format, the data is organized column-by-column instead. This organization makes analytical operations like filtering, grouping, aggregations and others, more efficient thanks to memory locality. When processing the data, the memory locations accessed by the CPU tend -be near one another. By keeping the data contiguous in memory, it also +to be near one another. By keeping the data contiguous in memory, it also enables vectorization of the computations. Most modern CPUs have `SIMD instructions`_ (a single instruction that operates on multiple values at once) enabling parallel processing and execution of operations on vector data @@ -439,8 +439,9 @@ of the same length. An ordered collection of fields that communicates all the data types of an object like a RecordBatch or Table. Schemas can contain optional key/value metadata. -A Field includes a field name, a data type, a nullability flag and optional key-value metadata -for a specific column in a RecordBatch. +**Field** +A Field includes a field name, a data type, a nullability flag and +optional key-value metadata for a specific column in a RecordBatch. **Table** A discontiguous, two-dimensional chunk of data consisting of an ordered collection of Chunked diff --git a/docs/source/format/StatisticsSchema.rst b/docs/source/format/StatisticsSchema.rst new file mode 100644 index 0000000000000..01cc0da7c4eb5 --- /dev/null +++ b/docs/source/format/StatisticsSchema.rst @@ -0,0 +1,873 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _statistics-schema: + +================= +Statistics schema +================= + +.. warning:: This specification should be considered experimental. + +Rationale +========= + +Statistics are useful for fast query processing. Many query engines +use statistics to optimize their query plan. + +Apache Arrow format doesn't have statistics but other formats that can +be read as Apache Arrow data may have statistics. For example, the +Apache Parquet C++ implementation can read an Apache Parquet file as +Apache Arrow data and the Apache Parquet file may have statistics. + +We standardize the representation of statistics as an Apache Arrow +array for ease of exchange. + +Use case +-------- + +One of :ref:`c-stream-interface` use cases is the following: + +1. Module A reads Apache Parquet file as Apache Arrow data. +2. Module A passes the read Apache Arrow data to module B through the + Arrow C stream interface. +3. Module B processes the passed Apache Arrow data. 
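For illustration only (not part of this specification), module A could gather such statistics from the Parquet file metadata itself before handing the data to module B. A minimal pyarrow sketch, where the file name and the final print are assumptions for the example::

    import pyarrow.parquet as pq

    # Module A: read the file and collect the per-column statistics that the
    # Parquet metadata already carries.
    parquet_file = pq.ParquetFile("data.parquet")
    table = parquet_file.read()

    metadata = parquet_file.metadata
    for row_group in range(metadata.num_row_groups):
        for column in range(metadata.num_columns):
            stats = metadata.row_group(row_group).column(column).statistics
            if stats is not None and stats.has_min_max:
                print(column, stats.min, stats.max, stats.null_count)

    # Module B would receive ``table`` (for example through the Arrow C
    # stream interface) and could consult these statistics when planning
    # its work.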
+ +If module A can pass the statistics associated with the Apache Parquet +file to module B, module B can use the statistics to optimize its +query plan. + +For example, DuckDB uses this approach but DuckDB couldn't use +statistics because there wasn't a standardized way to represent +statistics for the Apache Arrow data. + +.. seealso:: + + `duckdb::ArrowTableFunction::ArrowScanBind() in DuckDB 1.1.3 + `_ + +Goals +----- + +* Establish a standard way to represent statistics as an Apache Arrow + array. + +Non-goals +--------- + +* Establish a standard way to pass an Apache Arrow array that + represents statistics. +* Establish a standard way to embed statistics into an Apache Arrow + array itself. + +Schema +====== + +This specification provides only the schema for statistics. This is +the canonical schema to represent statistics about an Apache Arrow +dataset as Apache Arrow data. + +Here is the outline of the schema for statistics:: + + struct< + column: int32, + statistics: map< + key: dictionary, + items: dense_union<...all needed types...> + > + > + +Here is the details of top-level ``struct``: + +.. list-table:: + :header-rows: 1 + + * - Name + - Data type + - Nullable + - Notes + * - ``column`` + - ``int32`` + - ``true`` + - The zero-based column index, or null if the statistics + describe the whole table or record batch. + + The column index is computed as the same rule used by + :ref:`ipc-recordbatch-message`. + * - ``statistics`` + - ``map`` + - ``false`` + - Statistics for the target column, table or record batch. See + the separate table below for details. + +Here is the details of the ``map`` of the ``statistics``: + +.. list-table:: + :header-rows: 1 + + * - Key or items + - Data type + - Nullable + - Notes + * - key + - ``dictionary`` + - ``false`` + - The string key is the name of the + statistic. Dictionary-encoding is used for efficiency as the + same statistic may be repeated for different columns. + Different keys are assigned for exact and approximate statistic + values. Each statistic has their own description below. + * - items + - ``dense_union`` + - ``false`` + - Statistics value is dense union. It has at least all needed + types based on statistics kinds in the keys. For example, you + need at least ``int64`` and ``float64`` types when you have a + ``int64`` distinct count statistic and a ``float64`` average + byte width statistic. See the description of each statistic below. + + Dense union arrays have names for each field but we don't standardize + field names for these because we can access the proper + field by type code instead. So we can use any valid name for + the fields. + +.. _statistics-schema-name: + +Standard statistics +------------------- + +Each statistic kind has a name that appears as a key in the statistics +map for each column or entire table. ``dictionary`` is used to encode the name for space-efficiency. + +We assign different names for variations of the same statistic instead +of using flags. For example, we assign different statistic names for +exact and approximate values of the "distinct count" statistic. + +The colon symbol ``:`` is to be used as a namespace separator like +:ref:`format_metadata`. It can be used multiple times in a name. + +The ``ARROW`` prefix is a reserved namespace for pre-defined statistic +names in current and future versions of this specification. +User-defined statistics must not use it. For example, you can use your +product name as namespace such as ``MY_PRODUCT:my_statistics:exact``. 
+ +Here are pre-defined statistics names: + +.. list-table:: + :header-rows: 1 + + * - Name + - Data type + - Notes + * - ``ARROW:average_byte_width:exact`` + - ``float64`` + - The average size in bytes of a row in the target + column. (exact) + * - ``ARROW:average_byte_width:approximate`` + - ``float64`` + - The average size in bytes of a row in the target + column. (approximate) + * - ``ARROW:distinct_count:exact`` + - ``int64`` + - The number of distinct values in the target column. (exact) + * - ``ARROW:distinct_count:approximate`` + - ``float64`` + - The number of distinct values in the target + column. (approximate) + * - ``ARROW:max_byte_width:exact`` + - ``int64`` + - The maximum size in bytes of a row in the target + column. (exact) + * - ``ARROW:max_byte_width:approximate`` + - ``float64`` + - The maximum size in bytes of a row in the target + column. (approximate) + * - ``ARROW:max_value:exact`` + - Target dependent + - The maximum value in the target column. (exact) + * - ``ARROW:max_value:approximate`` + - Target dependent + - The maximum value in the target column. (approximate) + * - ``ARROW:min_value:exact`` + - Target dependent + - The minimum value in the target column. (exact) + * - ``ARROW:min_value:approximate`` + - Target dependent + - The minimum value in the target column. (approximate) + * - ``ARROW:null_count:exact`` + - ``int64`` + - The number of nulls in the target column. (exact) + * - ``ARROW:null_count:approximate`` + - ``float64`` + - The number of nulls in the target column. (approximate) + * - ``ARROW:row_count:exact`` + - ``int64`` + - The number of rows in the target table, record batch or + array. (exact) + * - ``ARROW:row_count:approximate`` + - ``float64`` + - The number of rows in the target table, record batch or + array. (approximate) + +If you find a statistic that might be useful to multiple systems, +please propose it on the `Apache Arrow development mailing-list +`__. + +Interoperability improves when producers and consumers of statistics +follow a previously agreed upon statistic specification. + +.. _statistics-schema-examples: + +Examples +======== + +Here are some examples to help you understand. + +Simple record batch +------------------- + +Schema:: + + vendor_id: int32 + passenger_count: int64 + +Data:: + + vendor_id: [5, 1, 5, 1, 5] + passenger_count: [1, 1, 2, 0, null] + +Statistics: + +.. list-table:: + :header-rows: 1 + + * - Target + - Name + - Value + * - Record batch + - The number of rows + - ``5`` + * - ``vendor_id`` + - The number of nulls + - ``0`` + * - ``vendor_id`` + - The number of distinct values + - ``2`` + * - ``vendor_id`` + - The max value + - ``5`` + * - ``vendor_id`` + - The min value + - ``1`` + * - ``passenger_count`` + - The number of nulls + - ``1`` + * - ``passenger_count`` + - The number of distinct values + - ``3`` + * - ``passenger_count`` + - The max value + - ``2`` + * - ``passenger_count`` + - The min value + - ``0`` + +Column indexes: + +.. 
list-table:: + :header-rows: 1 + + * - Index + - Target + * - ``0`` + - ``vendor_id`` + * - ``1`` + - ``passenger_count`` + +Statistics schema:: + + struct< + column: int32, + statistics: map< + key: dictionary, + items: dense_union<0: int64> + > + > + +Statistics array:: + + column: [ + null, # record batch + 0, # vendor_id + 0, # vendor_id + 0, # vendor_id + 0, # vendor_id + 1, # passenger_count + 1, # passenger_count + 1, # passenger_count + 1, # passenger_count + ] + statistics: + key: + values: [ + "ARROW:row_count:exact", + "ARROW:null_count:exact", + "ARROW:distinct_count:exact", + "ARROW:max_value:exact", + "ARROW:min_value:exact", + ], + indices: [ + 0, # "ARROW:row_count:exact" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + 3, # "ARROW:max_value:exact" + 4, # "ARROW:min_value:exact" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + 3, # "ARROW:max_value:exact" + 4, # "ARROW:min_value:exact" + ] + items: + children: + 0: [ # int64 + 5, # record batch: "ARROW:row_count:exact" + 0, # vendor_id: "ARROW:null_count:exact" + 2, # vendor_id: "ARROW:distinct_count:exact" + 5, # vendor_id: "ARROW:max_value:exact" + 1, # vendor_id: "ARROW:min_value:exact" + 1, # passenger_count: "ARROW:null_count:exact" + 3, # passenger_count: "ARROW:distinct_count:exact" + 2, # passenger_count: "ARROW:max_value:exact" + 0, # passenger_count: "ARROW:min_value:exact" + ] + types: [ # all values are int64 + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] + offsets: [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + ] + +Complex record batch +-------------------- + +This uses nested types. + +Schema:: + + col1: struct, c: float64> + col2: utf8 + +Data:: + + col1: [ + {a: 1, b: [20, 30, 40], c: 2.9}, + {a: 2, b: null, c: -2.9}, + {a: 3, b: [99], c: null}, + ] + col2: ["x", null, "z"] + +Statistics: + +.. list-table:: + :header-rows: 1 + + * - Target + - Name + - Value + * - Record batch + - The number of rows + - ``3`` + * - ``col1`` + - The number of nulls + - ``0`` + * - ``col1.a`` + - The number of nulls + - ``0`` + * - ``col1.a`` + - The number of distinct values + - ``3`` + * - ``col1.a`` + - The approximate max value + - ``5`` + * - ``col1.a`` + - The approximate min value + - ``0`` + * - ``col1.b`` + - The number of nulls + - ``1`` + * - ``col1.b.item`` + - The max value + - ``99`` + * - ``col1.b.item`` + - The min value + - ``20`` + * - ``col1.c`` + - The number of nulls + - ``1`` + * - ``col1.c`` + - The approximate max value + - ``3.0`` + * - ``col1.c`` + - The approximate min value + - ``-3.0`` + * - ``col2`` + - The number of nulls + - ``1`` + * - ``col2`` + - The number of distinct values + - ``2`` + +Column indexes: + +.. list-table:: + :header-rows: 1 + + * - Index + - Target + * - ``0`` + - ``col1`` + * - ``1`` + - ``col1.a`` + * - ``2`` + - ``col1.b`` + * - ``3`` + - ``col1.b.item`` + * - ``4`` + - ``col1.c`` + * - ``5`` + - ``col2`` + +See also :ref:`ipc-recordbatch-message` how to compute column indexes. + +Statistics schema:: + + struct< + column: int32, + statistics: map< + key: dictionary, + items: dense_union< + # For the number of rows, the number of nulls and so on. + 0: int64, + # For the max/min values of col1.c. 
+ 1: float64 + > + > + > + +Statistics array:: + + column: [ + null, # record batch + 0, # col1 + 1, # col1.a + 1, # col1.a + 1, # col1.a + 1, # col1.a + 2, # col1.b + 3, # col1.b.item + 3, # col1.b.item + 4, # col1.c + 4, # col1.c + 4, # col1.c + 5, # col2 + 5, # col2 + ] + statistics: + key: + values: [ + "ARROW:row_count:exact", + "ARROW:null_count:exact", + "ARROW:distinct_count:exact", + "ARROW:max_value:approximate", + "ARROW:min_value:approximate", + "ARROW:max_value:exact", + "ARROW:min_value:exact", + ] + indices: [ + 0, # "ARROW:row_count:exact" + 1, # "ARROW:null_count:exact" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + 3, # "ARROW:max_value:approximate" + 4, # "ARROW:min_value:approximate" + 1, # "ARROW:null_count:exact" + 5, # "ARROW:max_value:exact" + 6, # "ARROW:min_value:exact" + 1, # "ARROW:null_count:exact" + 3, # "ARROW:max_value:approximate" + 4, # "ARROW:min_value:approximate" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + ] + items: + children: + 0: [ # int64 + 3, # record batch: "ARROW:row_count:exact" + 0, # col1: "ARROW:null_count:exact" + 0, # col1.a: "ARROW:null_count:exact" + 3, # col1.a: "ARROW:distinct_count:exact" + 5, # col1.a: "ARROW:max_value:approximate" + 0, # col1.a: "ARROW:min_value:approximate" + 1, # col1.b: "ARROW:null_count:exact" + 99, # col1.b.item: "ARROW:max_value:exact" + 20, # col1.b.item: "ARROW:min_value:exact" + 1, # col1.c: "ARROW:null_count:exact" + 1, # col2: "ARROW:null_count:exact" + 2, # col2: "ARROW:distinct_count:exact" + ] + 1: [ # float64 + 3.0, # col1.c: "ARROW:max_value:approximate" + -3.0, # col1.c: "ARROW:min_value:approximate" + ] + types: [ + 0, # int64: record batch: "ARROW:row_count:exact" + 0, # int64: col1: "ARROW:null_count:exact" + 0, # int64: col1.a: "ARROW:null_count:exact" + 0, # int64: col1.a: "ARROW:distinct_count:exact" + 0, # int64: col1.a: "ARROW:max_value:approximate" + 0, # int64: col1.a: "ARROW:min_value:approximate" + 0, # int64: col1.b: "ARROW:null_count:exact" + 0, # int64: col1.b.item: "ARROW:max_value:exact" + 0, # int64: col1.b.item: "ARROW:min_value:exact" + 0, # int64: col1.c: "ARROW:null_count:exact" + 1, # float64: col1.c: "ARROW:max_value:approximate" + 1, # float64: col1.c: "ARROW:min_value:approximate" + 0, # int64: col2: "ARROW:null_count:exact" + 0, # int64: col2: "ARROW:distinct_count:exact" + ] + offsets: [ + 0, # int64: record batch: "ARROW:row_count:exact" + 1, # int64: col1: "ARROW:null_count:exact" + 2, # int64: col1.a: "ARROW:null_count:exact" + 3, # int64: col1.a: "ARROW:distinct_count:exact" + 4, # int64: col1.a: "ARROW:max_value:approximate" + 5, # int64: col1.a: "ARROW:min_value:approximate" + 6, # int64: col1.b: "ARROW:null_count:exact" + 7, # int64: col1.b.item: "ARROW:max_value:exact" + 8, # int64: col1.b.item: "ARROW:min_value:exact" + 9, # int64: col1.c: "ARROW:null_count:exact" + 0, # float64: col1.c: "ARROW:max_value:approximate" + 1, # float64: col1.c: "ARROW:min_value:approximate" + 10, # int64: col2: "ARROW:null_count:exact" + 11, # int64: col2: "ARROW:distinct_count:exact" + ] + +Simple array +------------ + +Schema:: + + int64 + +Data:: + + [1, 1, 2, 0, null] + +Statistics: + +.. list-table:: + :header-rows: 1 + + * - Target + - Name + - Value + * - Array + - The number of rows + - ``5`` + * - Array + - The number of nulls + - ``1`` + * - Array + - The number of distinct values + - ``3`` + * - Array + - The max value + - ``2`` + * - Array + - The min value + - ``0`` + +Column indexes: + +.. 
list-table:: + :header-rows: 1 + + * - Index + - Target + * - ``0`` + - Array + +Statistics schema:: + + struct< + column: int32, + statistics: map< + key: dictionary, + items: dense_union<0: int64> + > + > + +Statistics array:: + + column: [ + 0, # array + 0, # array + 0, # array + 0, # array + 0, # array + ] + statistics: + key: + values: [ + "ARROW:row_count:exact", + "ARROW:null_count:exact", + "ARROW:distinct_count:exact", + "ARROW:max_value:exact", + "ARROW:min_value:exact", + ] + indices: [ + 0, # "ARROW:row_count:exact" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + 3, # "ARROW:max_value:exact" + 4, # "ARROW:min_value:exact" + ] + items: + children: + 0: [ # int64 + 5, # array: "ARROW:row_count:exact" + 1, # array: "ARROW:null_count:exact" + 3, # array: "ARROW:distinct_count:exact" + 2, # array: "ARROW:max_value:exact" + 0, # array: "ARROW:min_value:exact" + ] + types: [ # all values are int64 + 0, + 0, + 0, + 0, + 0, + ] + offsets: [ + 0, + 1, + 2, + 3, + 4, + ] + +Complex array +------------- + +This uses nested types. + +Schema:: + + struct, c: float64> + +Data:: + + [ + {a: 1, b: [20, 30, 40], c: 2.9}, + {a: 2, b: null, c: -2.9}, + {a: 3, b: [99], c: null}, + ] + +Statistics: + +.. list-table:: + :header-rows: 1 + + * - Target + - Name + - Value + * - Array + - The number of rows + - ``3`` + * - Array + - The number of nulls + - ``0`` + * - ``a`` + - The number of nulls + - ``0`` + * - ``a`` + - The number of distinct values + - ``3`` + * - ``a`` + - The approximate max value + - ``5`` + * - ``a`` + - The approximate min value + - ``0`` + * - ``b`` + - The number of nulls + - ``1`` + * - ``b.item`` + - The max value + - ``99`` + * - ``b.item`` + - The min value + - ``20`` + * - ``c`` + - The number of nulls + - ``1`` + * - ``c`` + - The approximate max value + - ``3.0`` + * - ``c`` + - The approximate min value + - ``-3.0`` + +Column indexes: + +.. list-table:: + :header-rows: 1 + + * - Index + - Target + * - ``0`` + - Array + * - ``1`` + - ``a`` + * - ``2`` + - ``b`` + * - ``3`` + - ``b.item`` + * - ``4`` + - ``c`` + +See also :ref:`ipc-recordbatch-message` how to compute column indexes. + +Statistics schema:: + + struct< + column: int32, + statistics: map< + key: dictionary, + items: dense_union< + # For the number of rows, the number of nulls and so on. + 0: int64, + # For the max/min values of c. 
+ 1: float64 + > + > + > + +Statistics array:: + + column: [ + 0, # array + 0, # array + 1, # a + 1, # a + 1, # a + 1, # a + 2, # b + 3, # b.item + 3, # b.item + 4, # c + 4, # c + 4, # c + ] + statistics: + key: + values: [ + "ARROW:row_count:exact", + "ARROW:null_count:exact", + "ARROW:distinct_count:exact", + "ARROW:max_value:approximate", + "ARROW:min_value:approximate", + "ARROW:max_value:exact", + "ARROW:min_value:exact", + ] + indices: [ + 0, # "ARROW:row_count:exact" + 1, # "ARROW:null_count:exact" + 1, # "ARROW:null_count:exact" + 2, # "ARROW:distinct_count:exact" + 3, # "ARROW:max_value:approximate" + 4, # "ARROW:min_value:approximate" + 1, # "ARROW:null_count:exact" + 5, # "ARROW:max_value:exact" + 6, # "ARROW:min_value:exact" + 1, # "ARROW:null_count:exact" + 3, # "ARROW:max_value:approximate" + 4, # "ARROW:min_value:approximate" + ] + items: + children: + 0: [ # int64 + 3, # array: "ARROW:row_count:exact" + 0, # array: "ARROW:null_count:exact" + 0, # a: "ARROW:null_count:exact" + 3, # a: "ARROW:distinct_count:exact" + 5, # a: "ARROW:max_value:approximate" + 0, # a: "ARROW:min_value:approximate" + 1, # b: "ARROW:null_count:exact" + 99, # b.item: "ARROW:max_value:exact" + 20, # b.item: "ARROW:min_value:exact" + 1, # c: "ARROW:null_count:exact" + ] + 1: [ # float64 + 3.0, # c: "ARROW:max_value:approximate" + -3.0, # c: "ARROW:min_value:approximate" + ] + types: [ + 0, # int64: array: "ARROW:row_count:exact" + 0, # int64: array: "ARROW:null_count:exact" + 0, # int64: a: "ARROW:null_count:exact" + 0, # int64: a: "ARROW:distinct_count:exact" + 0, # int64: a: "ARROW:max_value:approximate" + 0, # int64: a: "ARROW:min_value:approximate" + 0, # int64: b: "ARROW:null_count:exact" + 0, # int64: b.item: "ARROW:max_value:exact" + 0, # int64: b.item: "ARROW:min_value:exact" + 0, # int64: c: "ARROW:null_count:exact" + 1, # float64: c: "ARROW:max_value:approximate" + 1, # float64: c: "ARROW:min_value:approximate" + ] + offsets: [ + 0, # int64: array: "ARROW:row_count:exact" + 1, # int64: array: "ARROW:null_count:exact" + 2, # int64: a: "ARROW:null_count:exact" + 3, # int64: a: "ARROW:distinct_count:exact" + 4, # int64: a: "ARROW:max_value:approximate" + 5, # int64: a: "ARROW:min_value:approximate" + 6, # int64: b: "ARROW:null_count:exact" + 7, # int64: b.item: "ARROW:max_value:exact" + 8, # int64: b.item: "ARROW:min_value:exact" + 9, # int64: c: "ARROW:null_count:exact" + 0, # float64: c: "ARROW:max_value:approximate" + 1, # float64: c: "ARROW:min_value:approximate" + ] diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index ce31a15a1f36a..91912a5325d52 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -32,6 +32,7 @@ Specifications CDataInterface CStreamInterface CDeviceDataInterface + StatisticsSchema DissociatedIPC Flight FlightSql diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst index 0224cc3235652..290625ba71483 100644 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ b/docs/source/java/flight_sql_jdbc_driver.rst @@ -48,7 +48,7 @@ To add a dependency via Maven, use a ``pom.xml`` like the following: demo 1.0-SNAPSHOT - 10.0.0 + 18.1.0 diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst index 8014a27444ac9..28ff01fb9447f 100644 --- a/docs/source/java/memory.rst +++ b/docs/source/java/memory.rst @@ -107,7 +107,7 @@ Child allocators can also be named, which makes it easier to tell where an Arrow Reference counting ------------------ -Because direct memory is 
expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers +Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers deterministically, we use manual reference counting instead of the garbage collector. This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 5219902362375..dc24be8bd06d8 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -72,6 +72,8 @@ may expose data type-specific methods or properties. TimestampArray DurationArray MonthDayNanoIntervalArray + Decimal32Array + Decimal64Array Decimal128Array Decimal256Array DictionaryArray diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 65f6da56a553c..5e151a1f93af5 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -116,6 +116,8 @@ functions above. Time64Type DurationType FixedSizeBinaryType + Decimal32Type + Decimal64Type Decimal128Type Decimal256Type Field diff --git a/docs/source/status.rst b/docs/source/status.rst index 83bee8975bbf7..c5883afa8f345 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -202,7 +202,7 @@ Flight RPC +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | gRPC + TLS transport (grpc+tls:) | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| UCX_ transport (ucx:) | ✓ | | | | | | | | +| UCX_ transport (ucx:) (1) | ✓ | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ Supported features in the gRPC transport: @@ -212,13 +212,13 @@ Supported features in the gRPC transport: +============================================+=======+=======+=======+====+=======+=======+=======+=======+ | All RPC methods | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (1) | ✓ | | | +| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (2) | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call timeouts | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call cancellation | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Concurrent client calls (2) | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +| Concurrent client calls (3) | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Custom middleware | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -230,7 +230,7 @@ Supported features in the UCX transport: +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Flight RPC Feature | C++ | Java | Go | JS | C# | Rust | Julia | Swift | 
+============================================+=======+=======+=======+====+=======+=======+=======+=======+ -| All RPC methods | ✓ (3) | | | | | | | | +| All RPC methods | ✓ (4) | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Authentication handlers | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -238,7 +238,7 @@ Supported features in the UCX transport: +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call cancellation | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Concurrent client calls | ✓ (4) | | | | | | | | +| Concurrent client calls | ✓ (5) | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Custom middleware | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -247,10 +247,11 @@ Supported features in the UCX transport: Notes: -* \(1) Support using AspNetCore authentication handlers. -* \(2) Whether a single client can support multiple concurrent calls. -* \(3) Only support for DoExchange, DoGet, DoPut, and GetFlightInfo. -* \(4) Each concurrent call is a separate connection to the server +* \(1) Flight UCX transport has been deprecated on the 19.0.0 release. +* \(2) Support using AspNetCore authentication handlers. +* \(3) Whether a single client can support multiple concurrent calls. +* \(4) Only support for DoExchange, DoGet, DoPut, and GetFlightInfo. +* \(5) Each concurrent call is a separate connection to the server (unlike gRPC where concurrent calls are multiplexed over a single connection). This will generally provide better throughput but consumes more resources both on the server and the client. diff --git a/format/Schema.fbs b/format/Schema.fbs index e8e14b112a771..f902b6bc1e56d 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -61,8 +61,8 @@ enum MetadataVersion:short { /// forward compatibility guarantees). /// 2. A means of negotiating between a client and server /// what features a stream is allowed to use. The enums -/// values here are intented to represent higher level -/// features, additional details maybe negotiated +/// values here are intended to represent higher level +/// features, additional details may be negotiated /// with key-value pairs specific to the protocol. /// /// Enums added to this list should be assigned power-of-two values @@ -421,7 +421,7 @@ table Interval { // An absolute length of time unrelated to any calendar artifacts. // // For the purposes of Arrow Implementations, adding this value to a Timestamp -// ("t1") naively (i.e. simply summing the two number) is acceptable even +// ("t1") naively (i.e. simply summing the two numbers) is acceptable even // though in some cases the resulting Timestamp (t2) would not account for // leap-seconds during the elapsed time between "t1" and "t2". Similarly, // representing the difference between two Unix timestamp is acceptable, but @@ -510,7 +510,7 @@ table DictionaryEncoding { /// nested type. table Field { - /// Name is not required, in i.e. a List + /// Name is not required (e.g., in a List) name: string; /// Whether or not this field can contain nulls. Should be true in general. 
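The Duration comment touched above describes naive summation with Timestamps; as a small pyarrow illustration of that behaviour (the values are arbitrary, and this is not part of the format change itself)::

    import pyarrow as pa
    import pyarrow.compute as pc

    # Adding a Duration to a Timestamp simply sums the two numbers;
    # leap seconds elapsed between t1 and t2 are not accounted for.
    t1 = pa.array([0, 3_600], type=pa.timestamp("s", tz="UTC"))
    dt = pa.array([86_400, 86_400], type=pa.duration("s"))
    t2 = pc.add(t1, dt)
    print(t2)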
diff --git a/js/package.json b/js/package.json index 01c85bad29f99..17bb95b7a5f51 100644 --- a/js/package.json +++ b/js/package.json @@ -65,7 +65,7 @@ "devDependencies": { "@openpgp/web-stream-tools": "0.0.13", "@rollup/plugin-alias": "5.1.1", - "@rollup/plugin-node-resolve": "15.3.0", + "@rollup/plugin-node-resolve": "16.0.0", "@rollup/stream": "3.0.1", "@swc/core": "1.6.6", "@types/benchmark": "2.1.5", @@ -77,9 +77,9 @@ "async-done": "2.0.0", "benny": "3.7.1", "cross-env": "7.0.3", - "del": "7.1.0", + "del": "8.0.0", "del-cli": "5.1.0", - "esbuild": "0.23.0", + "esbuild": "0.24.2", "esbuild-plugin-alias": "0.2.1", "eslint": "8.57.0", "eslint-plugin-jest": "28.9.0", @@ -106,7 +106,7 @@ "rxjs": "7.8.1", "ts-jest": "29.1.4", "ts-node": "10.9.2", - "typedoc": "0.26.3", + "typedoc": "0.27.6", "typescript": "5.4.5", "vinyl-buffer": "1.0.1", "vinyl-named": "1.1.0", diff --git a/js/yarn.lock b/js/yarn.lock index aae663174aa6a..4be93037c98f5 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -354,235 +354,240 @@ resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz#c7184a326533fcdf1b8ee0733e21c713b975575f" integrity sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ== -"@esbuild/aix-ppc64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.23.0.tgz#145b74d5e4a5223489cabdc238d8dad902df5259" - integrity sha512-3sG8Zwa5fMcA9bgqB8AfWPQ+HFke6uD3h1s3RIwUNK8EG7a4buxvuFTs3j1IMs2NXAk9F30C/FF4vxRgQCcmoQ== +"@esbuild/aix-ppc64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.24.2.tgz#38848d3e25afe842a7943643cbcd387cc6e13461" + integrity sha512-thpVCb/rhxE/BnMLQ7GReQLLN8q9qbHmI55F4489/ByVg2aQaQ6kbcLb6FHkocZzQhxc4gx0sCk0tJkKBFzDhA== "@esbuild/android-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz#09d9b4357780da9ea3a7dfb833a1f1ff439b4052" integrity sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A== -"@esbuild/android-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.23.0.tgz#453bbe079fc8d364d4c5545069e8260228559832" - integrity sha512-EuHFUYkAVfU4qBdyivULuu03FhJO4IJN9PGuABGrFy4vUuzk91P2d+npxHcFdpUnfYKy0PuV+n6bKIpHOB3prQ== +"@esbuild/android-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.24.2.tgz#f592957ae8b5643129fa889c79e69cd8669bb894" + integrity sha512-cNLgeqCqV8WxfcTIOeL4OAtSmL8JjcN6m09XIgro1Wi7cF4t/THaWEa7eL5CMoMBdjoHOTh/vwTO/o2TRXIyzg== "@esbuild/android-arm@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.21.5.tgz#9b04384fb771926dfa6d7ad04324ecb2ab9b2e28" integrity sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg== -"@esbuild/android-arm@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.23.0.tgz#26c806853aa4a4f7e683e519cd9d68e201ebcf99" - integrity sha512-+KuOHTKKyIKgEEqKbGTK8W7mPp+hKinbMBeEnNzjJGyFcWsfrXjSTNluJHCY1RqhxFurdD8uNXQDei7qDlR6+g== +"@esbuild/android-arm@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.24.2.tgz#72d8a2063aa630308af486a7e5cbcd1e134335b3" + integrity sha512-tmwl4hJkCfNHwFB3nBa8z1Uy3ypZpxqxfTQOcHX+xRByyYgunVbZ9MzUUfb0RxaHIMnbHagwAxuTL+tnNM+1/Q== 
"@esbuild/android-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.21.5.tgz#29918ec2db754cedcb6c1b04de8cd6547af6461e" integrity sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA== -"@esbuild/android-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.23.0.tgz#1e51af9a6ac1f7143769f7ee58df5b274ed202e6" - integrity sha512-WRrmKidLoKDl56LsbBMhzTTBxrsVwTKdNbKDalbEZr0tcsBgCLbEtoNthOW6PX942YiYq8HzEnb4yWQMLQuipQ== +"@esbuild/android-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.24.2.tgz#9a7713504d5f04792f33be9c197a882b2d88febb" + integrity sha512-B6Q0YQDqMx9D7rvIcsXfmJfvUYLoP722bgfBlO5cGvNVb5V/+Y7nhBE3mHV9OpxBf4eAS2S68KZztiPaWq4XYw== "@esbuild/darwin-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz#e495b539660e51690f3928af50a76fb0a6ccff2a" integrity sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ== -"@esbuild/darwin-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.23.0.tgz#d996187a606c9534173ebd78c58098a44dd7ef9e" - integrity sha512-YLntie/IdS31H54Ogdn+v50NuoWF5BDkEUFpiOChVa9UnKpftgwzZRrI4J132ETIi+D8n6xh9IviFV3eXdxfow== +"@esbuild/darwin-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.24.2.tgz#02ae04ad8ebffd6e2ea096181b3366816b2b5936" + integrity sha512-kj3AnYWc+CekmZnS5IPu9D+HWtUI49hbnyqk0FLEJDbzCIQt7hg7ucF1SQAilhtYpIujfaHr6O0UHlzzSPdOeA== "@esbuild/darwin-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz#c13838fa57372839abdddc91d71542ceea2e1e22" integrity sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw== -"@esbuild/darwin-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.23.0.tgz#30c8f28a7ef4e32fe46501434ebe6b0912e9e86c" - integrity sha512-IMQ6eme4AfznElesHUPDZ+teuGwoRmVuuixu7sv92ZkdQcPbsNHzutd+rAfaBKo8YK3IrBEi9SLLKWJdEvJniQ== +"@esbuild/darwin-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.24.2.tgz#9ec312bc29c60e1b6cecadc82bd504d8adaa19e9" + integrity sha512-WeSrmwwHaPkNR5H3yYfowhZcbriGqooyu3zI/3GGpF8AyUdsrrP0X6KumITGA9WOyiJavnGZUwPGvxvwfWPHIA== "@esbuild/freebsd-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz#646b989aa20bf89fd071dd5dbfad69a3542e550e" integrity sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g== -"@esbuild/freebsd-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.23.0.tgz#30f4fcec8167c08a6e8af9fc14b66152232e7fb4" - integrity sha512-0muYWCng5vqaxobq6LB3YNtevDFSAZGlgtLoAc81PjUfiFz36n4KMpwhtAd4he8ToSI3TGyuhyx5xmiWNYZFyw== +"@esbuild/freebsd-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.24.2.tgz#5e82f44cb4906d6aebf24497d6a068cfc152fa00" + integrity sha512-UN8HXjtJ0k/Mj6a9+5u6+2eZ2ERD7Edt1Q9IZiB5UZAIdPnVKDoG7mdTVGhHJIeEml60JteamR3qhsr1r8gXvg== "@esbuild/freebsd-x64@0.21.5": version "0.21.5" resolved 
"https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz#aa615cfc80af954d3458906e38ca22c18cf5c261" integrity sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ== -"@esbuild/freebsd-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.23.0.tgz#1003a6668fe1f5d4439e6813e5b09a92981bc79d" - integrity sha512-XKDVu8IsD0/q3foBzsXGt/KjD/yTKBCIwOHE1XwiXmrRwrX6Hbnd5Eqn/WvDekddK21tfszBSrE/WMaZh+1buQ== +"@esbuild/freebsd-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.24.2.tgz#3fb1ce92f276168b75074b4e51aa0d8141ecce7f" + integrity sha512-TvW7wE/89PYW+IevEJXZ5sF6gJRDY/14hyIGFXdIucxCsbRmLUcjseQu1SyTko+2idmCw94TgyaEZi9HUSOe3Q== "@esbuild/linux-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz#70ac6fa14f5cb7e1f7f887bcffb680ad09922b5b" integrity sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q== -"@esbuild/linux-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.23.0.tgz#3b9a56abfb1410bb6c9138790f062587df3e6e3a" - integrity sha512-j1t5iG8jE7BhonbsEg5d9qOYcVZv/Rv6tghaXM/Ug9xahM0nX/H2gfu6X6z11QRTMT6+aywOMA8TDkhPo8aCGw== +"@esbuild/linux-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.24.2.tgz#856b632d79eb80aec0864381efd29de8fd0b1f43" + integrity sha512-7HnAD6074BW43YvvUmE/35Id9/NB7BeX5EoNkK9obndmZBUk8xmJJeU7DwmUeN7tkysslb2eSl6CTrYz6oEMQg== "@esbuild/linux-arm@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz#fc6fd11a8aca56c1f6f3894f2bea0479f8f626b9" integrity sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA== -"@esbuild/linux-arm@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.23.0.tgz#237a8548e3da2c48cd79ae339a588f03d1889aad" - integrity sha512-SEELSTEtOFu5LPykzA395Mc+54RMg1EUgXP+iw2SJ72+ooMwVsgfuwXo5Fn0wXNgWZsTVHwY2cg4Vi/bOD88qw== +"@esbuild/linux-arm@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.24.2.tgz#c846b4694dc5a75d1444f52257ccc5659021b736" + integrity sha512-n0WRM/gWIdU29J57hJyUdIsk0WarGd6To0s+Y+LwvlC55wt+GT/OgkwoXCXvIue1i1sSNWblHEig00GBWiJgfA== "@esbuild/linux-ia32@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz#3271f53b3f93e3d093d518d1649d6d68d346ede2" integrity sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg== -"@esbuild/linux-ia32@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.23.0.tgz#4269cd19cb2de5de03a7ccfc8855dde3d284a238" - integrity sha512-P7O5Tkh2NbgIm2R6x1zGJJsnacDzTFcRWZyTTMgFdVit6E98LTxO+v8LCCLWRvPrjdzXHx9FEOA8oAZPyApWUA== +"@esbuild/linux-ia32@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.24.2.tgz#f8a16615a78826ccbb6566fab9a9606cfd4a37d5" + integrity sha512-sfv0tGPQhcZOgTKO3oBE9xpHuUqguHvSo4jl+wjnKwFpapx+vUDcawbwPNuBIAYdRAvIDBfZVvXprIj3HA+Ugw== "@esbuild/linux-loong64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz#ed62e04238c57026aea831c5a130b73c0f9f26df" integrity 
sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg== -"@esbuild/linux-loong64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.23.0.tgz#82b568f5658a52580827cc891cb69d2cb4f86280" - integrity sha512-InQwepswq6urikQiIC/kkx412fqUZudBO4SYKu0N+tGhXRWUqAx+Q+341tFV6QdBifpjYgUndV1hhMq3WeJi7A== +"@esbuild/linux-loong64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.24.2.tgz#1c451538c765bf14913512c76ed8a351e18b09fc" + integrity sha512-CN9AZr8kEndGooS35ntToZLTQLHEjtVB5n7dl8ZcTZMonJ7CCfStrYhrzF97eAecqVbVJ7APOEe18RPI4KLhwQ== "@esbuild/linux-mips64el@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz#e79b8eb48bf3b106fadec1ac8240fb97b4e64cbe" integrity sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg== -"@esbuild/linux-mips64el@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.23.0.tgz#9a57386c926262ae9861c929a6023ed9d43f73e5" - integrity sha512-J9rflLtqdYrxHv2FqXE2i1ELgNjT+JFURt/uDMoPQLcjWQA5wDKgQA4t/dTqGa88ZVECKaD0TctwsUfHbVoi4w== +"@esbuild/linux-mips64el@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.24.2.tgz#0846edeefbc3d8d50645c51869cc64401d9239cb" + integrity sha512-iMkk7qr/wl3exJATwkISxI7kTcmHKE+BlymIAbHO8xanq/TjHaaVThFF6ipWzPHryoFsesNQJPE/3wFJw4+huw== "@esbuild/linux-ppc64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz#5f2203860a143b9919d383ef7573521fb154c3e4" integrity sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w== -"@esbuild/linux-ppc64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.23.0.tgz#f3a79fd636ba0c82285d227eb20ed8e31b4444f6" - integrity sha512-cShCXtEOVc5GxU0fM+dsFD10qZ5UpcQ8AM22bYj0u/yaAykWnqXJDpd77ublcX6vdDsWLuweeuSNZk4yUxZwtw== +"@esbuild/linux-ppc64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.24.2.tgz#8e3fc54505671d193337a36dfd4c1a23b8a41412" + integrity sha512-shsVrgCZ57Vr2L8mm39kO5PPIb+843FStGt7sGGoqiiWYconSxwTiuswC1VJZLCjNiMLAMh34jg4VSEQb+iEbw== "@esbuild/linux-riscv64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz#07bcafd99322d5af62f618cb9e6a9b7f4bb825dc" integrity sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA== -"@esbuild/linux-riscv64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.23.0.tgz#f9d2ef8356ce6ce140f76029680558126b74c780" - integrity sha512-HEtaN7Y5UB4tZPeQmgz/UhzoEyYftbMXrBCUjINGjh3uil+rB/QzzpMshz3cNUxqXN7Vr93zzVtpIDL99t9aRw== +"@esbuild/linux-riscv64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.24.2.tgz#6a1e92096d5e68f7bb10a0d64bb5b6d1daf9a694" + integrity sha512-4eSFWnU9Hhd68fW16GD0TINewo1L6dRrB+oLNNbYyMUAeOD2yCK5KXGK1GH4qD/kT+bTEXjsyTCiJGHPZ3eM9Q== "@esbuild/linux-s390x@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz#b7ccf686751d6a3e44b8627ababc8be3ef62d8de" integrity 
sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A== -"@esbuild/linux-s390x@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.23.0.tgz#45390f12e802201f38a0229e216a6aed4351dfe8" - integrity sha512-WDi3+NVAuyjg/Wxi+o5KPqRbZY0QhI9TjrEEm+8dmpY9Xir8+HE/HNx2JoLckhKbFopW0RdO2D72w8trZOV+Wg== +"@esbuild/linux-s390x@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.24.2.tgz#ab18e56e66f7a3c49cb97d337cd0a6fea28a8577" + integrity sha512-S0Bh0A53b0YHL2XEXC20bHLuGMOhFDO6GN4b3YjRLK//Ep3ql3erpNcPlEFed93hsQAjAQDNsvcK+hV90FubSw== "@esbuild/linux-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz#6d8f0c768e070e64309af8004bb94e68ab2bb3b0" integrity sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ== -"@esbuild/linux-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.23.0.tgz#c8409761996e3f6db29abcf9b05bee8d7d80e910" - integrity sha512-a3pMQhUEJkITgAw6e0bWA+F+vFtCciMjW/LPtoj99MhVt+Mfb6bbL9hu2wmTZgNd994qTAEw+U/r6k3qHWWaOQ== +"@esbuild/linux-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.24.2.tgz#8140c9b40da634d380b0b29c837a0b4267aff38f" + integrity sha512-8Qi4nQcCTbLnK9WoMjdC9NiTG6/E38RNICU6sUNqK0QFxCYgoARqVqxdFmWkdonVsvGqWhmm7MO0jyTqLqwj0Q== + +"@esbuild/netbsd-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-arm64/-/netbsd-arm64-0.24.2.tgz#65f19161432bafb3981f5f20a7ff45abb2e708e6" + integrity sha512-wuLK/VztRRpMt9zyHSazyCVdCXlpHkKm34WUyinD2lzK07FAHTq0KQvZZlXikNWkDGoT6x3TD51jKQ7gMVpopw== "@esbuild/netbsd-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz#bbe430f60d378ecb88decb219c602667387a6047" integrity sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg== -"@esbuild/netbsd-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.23.0.tgz#ba70db0114380d5f6cfb9003f1d378ce989cd65c" - integrity sha512-cRK+YDem7lFTs2Q5nEv/HHc4LnrfBCbH5+JHu6wm2eP+d8OZNoSMYgPZJq78vqQ9g+9+nMuIsAO7skzphRXHyw== +"@esbuild/netbsd-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.24.2.tgz#7a3a97d77abfd11765a72f1c6f9b18f5396bcc40" + integrity sha512-VefFaQUc4FMmJuAxmIHgUmfNiLXY438XrL4GDNV1Y1H/RW3qow68xTwjZKfj/+Plp9NANmzbH5R40Meudu8mmw== -"@esbuild/openbsd-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.23.0.tgz#72fc55f0b189f7a882e3cf23f332370d69dfd5db" - integrity sha512-suXjq53gERueVWu0OKxzWqk7NxiUWSUlrxoZK7usiF50C6ipColGR5qie2496iKGYNLhDZkPxBI3erbnYkU0rQ== +"@esbuild/openbsd-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-arm64/-/openbsd-arm64-0.24.2.tgz#58b00238dd8f123bfff68d3acc53a6ee369af89f" + integrity sha512-YQbi46SBct6iKnszhSvdluqDmxCJA+Pu280Av9WICNwQmMxV7nLRHZfjQzwbPs3jeWnuAhE9Jy0NrnJ12Oz+0A== "@esbuild/openbsd-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz#99d1cf2937279560d2104821f5ccce220cb2af70" integrity sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow== -"@esbuild/openbsd-x64@0.23.0": - 
version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.23.0.tgz#b6ae7a0911c18fe30da3db1d6d17a497a550e5d8" - integrity sha512-6p3nHpby0DM/v15IFKMjAaayFhqnXV52aEmv1whZHX56pdkK+MEaLoQWj+H42ssFarP1PcomVhbsR4pkz09qBg== +"@esbuild/openbsd-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.24.2.tgz#0ac843fda0feb85a93e288842936c21a00a8a205" + integrity sha512-+iDS6zpNM6EnJyWv0bMGLWSWeXGN/HTaF/LXHXHwejGsVi+ooqDfMCCTerNFxEkM3wYVcExkeGXNqshc9iMaOA== "@esbuild/sunos-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz#08741512c10d529566baba837b4fe052c8f3487b" integrity sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg== -"@esbuild/sunos-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.23.0.tgz#58f0d5e55b9b21a086bfafaa29f62a3eb3470ad8" - integrity sha512-BFelBGfrBwk6LVrmFzCq1u1dZbG4zy/Kp93w2+y83Q5UGYF1d8sCzeLI9NXjKyujjBBniQa8R8PzLFAUrSM9OA== +"@esbuild/sunos-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.24.2.tgz#8b7aa895e07828d36c422a4404cc2ecf27fb15c6" + integrity sha512-hTdsW27jcktEvpwNHJU4ZwWFGkz2zRJUz8pvddmXPtXDzVKTTINmlmga3ZzwcuMpUvLw7JkLy9QLKyGpD2Yxig== "@esbuild/win32-arm64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz#675b7385398411240735016144ab2e99a60fc75d" integrity sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A== -"@esbuild/win32-arm64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.23.0.tgz#b858b2432edfad62e945d5c7c9e5ddd0f528ca6d" - integrity sha512-lY6AC8p4Cnb7xYHuIxQ6iYPe6MfO2CC43XXKo9nBXDb35krYt7KGhQnOkRGar5psxYkircpCqfbNDB4uJbS2jQ== +"@esbuild/win32-arm64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.24.2.tgz#c023afb647cabf0c3ed13f0eddfc4f1d61c66a85" + integrity sha512-LihEQ2BBKVFLOC9ZItT9iFprsE9tqjDjnbulhHoFxYQtQfai7qfluVODIYxt1PgdoyQkz23+01rzwNwYfutxUQ== "@esbuild/win32-ia32@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz#1bfc3ce98aa6ca9a0969e4d2af72144c59c1193b" integrity sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA== -"@esbuild/win32-ia32@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.23.0.tgz#167ef6ca22a476c6c0c014a58b4f43ae4b80dec7" - integrity sha512-7L1bHlOTcO4ByvI7OXVI5pNN6HSu6pUQq9yodga8izeuB1KcT2UkHaH6118QJwopExPn0rMHIseCTx1CRo/uNA== +"@esbuild/win32-ia32@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.24.2.tgz#96c356132d2dda990098c8b8b951209c3cd743c2" + integrity sha512-q+iGUwfs8tncmFC9pcnD5IvRHAzmbwQ3GPS5/ceCyHdjXubwQWI12MKWSNSMYLJMq23/IUCvJMS76PDqXe1fxA== "@esbuild/win32-x64@0.21.5": version "0.21.5" resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz#acad351d582d157bb145535db2a6ff53dd514b5c" integrity sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw== -"@esbuild/win32-x64@0.23.0": - version "0.23.0" - resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.23.0.tgz#db44a6a08520b5f25bbe409f34a59f2d4bcc7ced" - integrity 
sha512-Arm+WgUFLUATuoxCJcahGuk6Yj9Pzxd6l11Zb/2aAuv5kWWvvfhLFo2fni4uSK5vzlUdCGZ/BdV5tH8klj8p8g== +"@esbuild/win32-x64@0.24.2": + version "0.24.2" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.24.2.tgz#34aa0b52d0fbb1a654b596acfa595f0c7b77a77b" + integrity sha512-7VTgWzgMGvup6aSqDPLiW5zHaxYJGTO4OokMjIlrCtf+VpEL+cXKtCvg723iguPYI5oaUNdS+/V7OU2gvXVWEg== "@eslint-community/eslint-utils@^4.2.0", "@eslint-community/eslint-utils@^4.4.0": version "4.4.0" @@ -616,6 +621,15 @@ resolved "https://registry.yarnpkg.com/@eslint/js/-/js-8.57.0.tgz#a5417ae8427873f1dd08b70b3574b453e67b5f7f" integrity sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g== +"@gerrit0/mini-shiki@^1.24.0": + version "1.24.4" + resolved "https://registry.yarnpkg.com/@gerrit0/mini-shiki/-/mini-shiki-1.24.4.tgz#e5328ca0dccc094460c03701d62158137a5a2eda" + integrity sha512-YEHW1QeAg6UmxEmswiQbOVEg1CW22b1XUD/lNTliOsu0LD0wqoyleFMnmbTp697QE0pcadQiR5cVtbbAPncvpw== + dependencies: + "@shikijs/engine-oniguruma" "^1.24.2" + "@shikijs/types" "^1.24.2" + "@shikijs/vscode-textmate" "^9.3.1" + "@gulp-sourcemaps/identity-map@^2.0.1": version "2.0.1" resolved "https://registry.yarnpkg.com/@gulp-sourcemaps/identity-map/-/identity-map-2.0.1.tgz#a6e8b1abec8f790ec6be2b8c500e6e68037c0019" @@ -994,10 +1008,10 @@ resolved "https://registry.yarnpkg.com/@rollup/plugin-alias/-/plugin-alias-5.1.1.tgz#53601d88cda8b1577aa130b4a6e452283605bf26" integrity sha512-PR9zDb+rOzkRb2VD+EuKB7UC41vU5DIwZ5qqCpk0KJudcWAyi8rvYOhS7+L5aZCspw1stTViLgN5v6FF1p5cgQ== -"@rollup/plugin-node-resolve@15.3.0": - version "15.3.0" - resolved "https://registry.yarnpkg.com/@rollup/plugin-node-resolve/-/plugin-node-resolve-15.3.0.tgz#efbb35515c9672e541c08d59caba2eff492a55d5" - integrity sha512-9eO5McEICxMzJpDW9OnMYSv4Sta3hmt7VtBFz5zR9273suNOydOyq/FrGeGy+KsTRFm8w0SLVhzig2ILFT63Ag== +"@rollup/plugin-node-resolve@16.0.0": + version "16.0.0" + resolved "https://registry.yarnpkg.com/@rollup/plugin-node-resolve/-/plugin-node-resolve-16.0.0.tgz#b1a0594661f40d7b061d82136e847354ff85f211" + integrity sha512-0FPvAeVUT/zdWoO0jnb/V5BlBsUSNfkIOtFHzMO4H9MOklrmQFY6FduVHKucNb/aTFxvnGhj4MNj/T1oNdDfNg== dependencies: "@rollup/pluginutils" "^5.0.1" "@types/resolve" "1.20.2" @@ -1109,16 +1123,37 @@ resolved "https://registry.yarnpkg.com/@rollup/stream/-/stream-3.0.1.tgz#485452d6f1016ac1b0513060f90ff02aaca3e1c0" integrity sha512-wdzoakLc9UiPOFa1k17ukfEtvQ0p7JuNFvOZT1DhO5Z5CrTf71An01U9+v+aebYcaLCwy3tLwpCSUF7K7xVN0A== -"@shikijs/core@1.10.0": - version "1.10.0" - resolved "https://registry.yarnpkg.com/@shikijs/core/-/core-1.10.0.tgz#ec3356ace7cb8b41f6baee0116f036fca85054cc" - integrity sha512-BZcr6FCmPfP6TXaekvujZcnkFmJHZ/Yglu97r/9VjzVndQA56/F4WjUKtJRQUnK59Wi7p/UTAOekMfCJv7jnYg== +"@shikijs/engine-oniguruma@^1.24.2": + version "1.24.4" + resolved "https://registry.yarnpkg.com/@shikijs/engine-oniguruma/-/engine-oniguruma-1.24.4.tgz#6adc430ddf247eeed155d8a41883e36160f302cf" + integrity sha512-Do2ry6flp2HWdvpj2XOwwa0ljZBRy15HKZITzPcNIBOGSeprnA8gOooA/bLsSPuy8aJBa+Q/r34dMmC3KNL/zw== + dependencies: + "@shikijs/types" "1.24.4" + "@shikijs/vscode-textmate" "^9.3.1" + +"@shikijs/types@1.24.4", "@shikijs/types@^1.24.2": + version "1.24.4" + resolved "https://registry.yarnpkg.com/@shikijs/types/-/types-1.24.4.tgz#06ec8975732b68508f8423b01a5649eef8d9cea3" + integrity sha512-0r0XU7Eaow0PuDxuWC1bVqmWCgm3XqizIaT7SM42K03vc69LGooT0U8ccSR44xP/hGlNx4FKhtYpV+BU6aaKAA== + dependencies: + "@shikijs/vscode-textmate" "^9.3.1" + "@types/hast" "^3.0.4" + 
+"@shikijs/vscode-textmate@^9.3.1": + version "9.3.1" + resolved "https://registry.yarnpkg.com/@shikijs/vscode-textmate/-/vscode-textmate-9.3.1.tgz#afda31f8f42cab70a26f3603f52eae3f1c35d2f7" + integrity sha512-79QfK1393x9Ho60QFyLti+QfdJzRQCVLFb97kOIV7Eo9vQU/roINgk7m24uv0a7AUvN//RDH36FLjjK48v0s9g== "@sinclair/typebox@^0.27.8": version "0.27.8" resolved "https://registry.yarnpkg.com/@sinclair/typebox/-/typebox-0.27.8.tgz#6667fac16c436b5434a387a34dedb013198f6e6e" integrity sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA== +"@sindresorhus/merge-streams@^2.1.0": + version "2.3.0" + resolved "https://registry.yarnpkg.com/@sindresorhus/merge-streams/-/merge-streams-2.3.0.tgz#719df7fb41766bc143369eaa0dd56d8dc87c9958" + integrity sha512-LtoMMhxAlorcGhmFYI+LhPgbPZCkgP6ra1YL604EeF6U98pLlQ3iWIGMdWSC+vWmPBWBNgmDBAhnAobLROJmwg== + "@sinonjs/commons@^3.0.0": version "3.0.1" resolved "https://registry.yarnpkg.com/@sinonjs/commons/-/commons-3.0.1.tgz#1029357e44ca901a615585f6d27738dbc89084cd" @@ -1330,6 +1365,13 @@ dependencies: "@types/node" "*" +"@types/hast@^3.0.4": + version "3.0.4" + resolved "https://registry.yarnpkg.com/@types/hast/-/hast-3.0.4.tgz#1d6b39993b82cea6ad783945b0508c25903e15aa" + integrity sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ== + dependencies: + "@types/unist" "*" + "@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0", "@types/istanbul-lib-coverage@^2.0.1": version "2.0.6" resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.6.tgz#7739c232a1fee9b4d3ce8985f314c0c6d33549d7" @@ -1401,6 +1443,11 @@ resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.3.tgz#6209321eb2c1712a7e7466422b8cb1fc0d9dd5d8" integrity sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw== +"@types/unist@*": + version "3.0.3" + resolved "https://registry.yarnpkg.com/@types/unist/-/unist-3.0.3.tgz#acaab0f919ce69cce629c2d4ed2eb4adc1b6c20c" + integrity sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q== + "@types/vinyl@^2.0.4": version "2.0.11" resolved "https://registry.yarnpkg.com/@types/vinyl/-/vinyl-2.0.11.tgz#b95a5bb007e7a0a61dad5a8971dc9922abbc2629" @@ -2782,7 +2829,19 @@ del-cli@5.1.0: del "^7.1.0" meow "^10.1.3" -del@7.1.0, del@^7.1.0: +del@8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/del/-/del-8.0.0.tgz#f333a5673cfeb72e46084031714a7c30515e80aa" + integrity sha512-R6ep6JJ+eOBZsBr9esiNN1gxFbZE4Q2cULkUSFumGYecAiS6qodDvcPx/sFuWHMNul7DWmrtoEOpYSm7o6tbSA== + dependencies: + globby "^14.0.2" + is-glob "^4.0.3" + is-path-cwd "^3.0.0" + is-path-inside "^4.0.0" + p-map "^7.0.2" + slash "^5.1.0" + +del@^7.1.0: version "7.1.0" resolved "https://registry.yarnpkg.com/del/-/del-7.1.0.tgz#0de0044d556b649ff05387f1fa7c885e155fd1b6" integrity sha512-v2KyNk7efxhlyHpjEvfyxaAihKKK0nWCuf6ZtqZcFFpQRG0bJ12Qsr0RpvsICMjAAZ8DOVCxrlqpxISlMHC4Kg== @@ -2976,35 +3035,36 @@ esbuild-plugin-alias@0.2.1: resolved "https://registry.yarnpkg.com/esbuild-plugin-alias/-/esbuild-plugin-alias-0.2.1.tgz#45a86cb941e20e7c2bc68a2bea53562172494fcb" integrity sha512-jyfL/pwPqaFXyKnj8lP8iLk6Z0m099uXR45aSN8Av1XD4vhvQutxxPzgA2bTcAwQpa1zCXDcWOlhFgyP3GKqhQ== -esbuild@0.23.0: - version "0.23.0" - resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.23.0.tgz#de06002d48424d9fdb7eb52dbe8e95927f852599" - integrity 
sha512-1lvV17H2bMYda/WaFb2jLPeHU3zml2k4/yagNMG8Q/YtfMjCwEUZa2eXXMgZTVSL5q1n4H7sQ0X6CdJDqqeCFA== +esbuild@0.24.2: + version "0.24.2" + resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.24.2.tgz#b5b55bee7de017bff5fb8a4e3e44f2ebe2c3567d" + integrity sha512-+9egpBW8I3CD5XPe0n6BfT5fxLzxrlDzqydF3aviG+9ni1lDC/OvMHcxqEFV0+LANZG5R1bFMWfUrjVsdwxJvA== optionalDependencies: - "@esbuild/aix-ppc64" "0.23.0" - "@esbuild/android-arm" "0.23.0" - "@esbuild/android-arm64" "0.23.0" - "@esbuild/android-x64" "0.23.0" - "@esbuild/darwin-arm64" "0.23.0" - "@esbuild/darwin-x64" "0.23.0" - "@esbuild/freebsd-arm64" "0.23.0" - "@esbuild/freebsd-x64" "0.23.0" - "@esbuild/linux-arm" "0.23.0" - "@esbuild/linux-arm64" "0.23.0" - "@esbuild/linux-ia32" "0.23.0" - "@esbuild/linux-loong64" "0.23.0" - "@esbuild/linux-mips64el" "0.23.0" - "@esbuild/linux-ppc64" "0.23.0" - "@esbuild/linux-riscv64" "0.23.0" - "@esbuild/linux-s390x" "0.23.0" - "@esbuild/linux-x64" "0.23.0" - "@esbuild/netbsd-x64" "0.23.0" - "@esbuild/openbsd-arm64" "0.23.0" - "@esbuild/openbsd-x64" "0.23.0" - "@esbuild/sunos-x64" "0.23.0" - "@esbuild/win32-arm64" "0.23.0" - "@esbuild/win32-ia32" "0.23.0" - "@esbuild/win32-x64" "0.23.0" + "@esbuild/aix-ppc64" "0.24.2" + "@esbuild/android-arm" "0.24.2" + "@esbuild/android-arm64" "0.24.2" + "@esbuild/android-x64" "0.24.2" + "@esbuild/darwin-arm64" "0.24.2" + "@esbuild/darwin-x64" "0.24.2" + "@esbuild/freebsd-arm64" "0.24.2" + "@esbuild/freebsd-x64" "0.24.2" + "@esbuild/linux-arm" "0.24.2" + "@esbuild/linux-arm64" "0.24.2" + "@esbuild/linux-ia32" "0.24.2" + "@esbuild/linux-loong64" "0.24.2" + "@esbuild/linux-mips64el" "0.24.2" + "@esbuild/linux-ppc64" "0.24.2" + "@esbuild/linux-riscv64" "0.24.2" + "@esbuild/linux-s390x" "0.24.2" + "@esbuild/linux-x64" "0.24.2" + "@esbuild/netbsd-arm64" "0.24.2" + "@esbuild/netbsd-x64" "0.24.2" + "@esbuild/openbsd-arm64" "0.24.2" + "@esbuild/openbsd-x64" "0.24.2" + "@esbuild/sunos-x64" "0.24.2" + "@esbuild/win32-arm64" "0.24.2" + "@esbuild/win32-ia32" "0.24.2" + "@esbuild/win32-x64" "0.24.2" esbuild@^0.21.5: version "0.21.5" @@ -3767,6 +3827,18 @@ globby@^13.1.2: merge2 "^1.4.1" slash "^4.0.0" +globby@^14.0.2: + version "14.0.2" + resolved "https://registry.yarnpkg.com/globby/-/globby-14.0.2.tgz#06554a54ccfe9264e5a9ff8eded46aa1e306482f" + integrity sha512-s3Fq41ZVh7vbbe2PN3nrW7yC7U7MFVc5c98/iTl9c2GawNMKx/J648KQRW6WKkuU8GIbbh2IXfIRQjOZnXcTnw== + dependencies: + "@sindresorhus/merge-streams" "^2.1.0" + fast-glob "^3.3.2" + ignore "^5.2.4" + path-type "^5.0.0" + slash "^5.1.0" + unicorn-magic "^0.1.0" + glogg@^1.0.0: version "1.0.2" resolved "https://registry.yarnpkg.com/glogg/-/glogg-1.0.2.tgz#2d7dd702beda22eb3bffadf880696da6d846313f" @@ -5658,6 +5730,11 @@ p-map@^5.5.0: dependencies: aggregate-error "^4.0.0" +p-map@^7.0.2: + version "7.0.3" + resolved "https://registry.yarnpkg.com/p-map/-/p-map-7.0.3.tgz#7ac210a2d36f81ec28b736134810f7ba4418cdb6" + integrity sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA== + p-try@^2.0.0: version "2.2.0" resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6" @@ -5777,6 +5854,11 @@ path-type@^4.0.0: resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== +path-type@^5.0.0: + version "5.0.0" + resolved 
"https://registry.yarnpkg.com/path-type/-/path-type-5.0.0.tgz#14b01ed7aea7ddf9c7c3f46181d4d04f9c785bb8" + integrity sha512-5HviZNaZcfqP95rwpv+1HDgUamezbqdSYTyzjTvwtJSnIH+3vnbmWsItli8OFEndS984VT55M3jduxZbX351gg== + picocolors@^0.2.1: version "0.2.1" resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-0.2.1.tgz#570670f793646851d1ba135996962abad587859f" @@ -6374,13 +6456,6 @@ shebang-regex@^3.0.0: resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172" integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A== -shiki@^1.9.1: - version "1.10.0" - resolved "https://registry.yarnpkg.com/shiki/-/shiki-1.10.0.tgz#304ab080a12458abc78eb0cb83eb0f7ace546215" - integrity sha512-YD2sXQ+TMD/F9BimV9Jn0wj35pqOvywvOG/3PB6hGHyGKlM7TJ9tyJ02jOb2kF8F0HfJwKNYrh3sW7jEcuRlXA== - dependencies: - "@shikijs/core" "1.10.0" - signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7: version "3.0.7" resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9" @@ -6415,6 +6490,11 @@ slash@^4.0.0: resolved "https://registry.yarnpkg.com/slash/-/slash-4.0.0.tgz#2422372176c4c6c5addb5e2ada885af984b396a7" integrity sha512-3dOsAHXXUkQTpOYcoAxLIorMTp4gIQr5IW3iVb7A7lFIp0VHhnynm9izx6TssdrIcVIESAlVjtnO2K8bg+Coew== +slash@^5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/slash/-/slash-5.1.0.tgz#be3adddcdf09ac38eebe8dcdc7b1a57a75b095ce" + integrity sha512-ZA6oR3T/pEyuqwMgAKT0/hAv8oAXckzbkmR0UkUosQ+Mc4RxGoJkRmwHgHufaenlyAgE1Mxgpdcrf75y6XcnDg== + slice-ansi@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-4.0.0.tgz#500e8dd0fd55b05815086255b3195adf2a45fe6b" @@ -7029,16 +7109,16 @@ typedarray@^0.0.6: resolved "https://registry.yarnpkg.com/typedarray/-/typedarray-0.0.6.tgz#867ac74e3864187b1d3d47d996a78ec5c8830777" integrity sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA== -typedoc@0.26.3: - version "0.26.3" - resolved "https://registry.yarnpkg.com/typedoc/-/typedoc-0.26.3.tgz#723b2c4ca5dd1d9baf43d6a5a1f4d640ba4207a8" - integrity sha512-6d2Sw9disvvpdk4K7VNjKr5/3hzijtfQVHRthhDqJgnhMHy1wQz4yPMJVKXElvnZhFr0nkzo+GzjXDTRV5yLpg== +typedoc@0.27.6: + version "0.27.6" + resolved "https://registry.yarnpkg.com/typedoc/-/typedoc-0.27.6.tgz#7e8d067bd5386b7908afcb12c9054a83e8bb326b" + integrity sha512-oBFRoh2Px6jFx366db0lLlihcalq/JzyCVp7Vaq1yphL/tbgx2e+bkpkCgJPunaPvPwoTOXSwasfklWHm7GfAw== dependencies: + "@gerrit0/mini-shiki" "^1.24.0" lunr "^2.3.9" markdown-it "^14.1.0" minimatch "^9.0.5" - shiki "^1.9.1" - yaml "^2.4.5" + yaml "^2.6.1" typescript@5.4.5: version "5.4.5" @@ -7086,6 +7166,11 @@ undici-types@~5.26.4: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617" integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA== +unicorn-magic@^0.1.0: + version "0.1.0" + resolved "https://registry.yarnpkg.com/unicorn-magic/-/unicorn-magic-0.1.0.tgz#1bb9a51c823aaf9d73a8bfcd3d1a23dde94b0ce4" + integrity sha512-lRfVq8fE8gz6QMBuDM6a+LO3IAzTi05H6gCVaUpir2E1Rwpo4ZUog45KpNXKC/Mn3Yb9UDuHumeFTo9iV/D9FQ== + union-value@^1.0.0: version "1.0.1" resolved "https://registry.yarnpkg.com/union-value/-/union-value-1.0.1.tgz#0b6fe7b835aecda61c6ea4d4f02c14221e109847" @@ -7485,10 +7570,10 @@ yallist@^4.0.0: resolved 
"https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== -yaml@^2.4.5: - version "2.4.5" - resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.4.5.tgz#60630b206dd6d84df97003d33fc1ddf6296cca5e" - integrity sha512-aBx2bnqDzVOyNKfsysjA2ms5ZlnjSAW2eG3/L5G/CSujfjLJTJsEw1bGw8kCf04KodQWk1pxlGnZ56CRxiawmg== +yaml@^2.6.1: + version "2.7.0" + resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.7.0.tgz#aef9bb617a64c937a9a748803786ad8d3ffe1e98" + integrity sha512-+hSoy/QHluxmC9kCIJyL/uyFmLmc+e5CFR5Wa+bpIhIj85LVb9ZH2nVnqrHoSvKogwODv0ClqZkmiSSaIH5LTA== yargs-parser@>=5.0.0-security.0, yargs-parser@^21.0.1, yargs-parser@^21.1.1: version "21.1.1" diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index e5a5df6f4bcb6..425e089d9f2f9 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -249,5 +249,7 @@ static const char* IPC_RECORD_BATCH_READER_OPEN_FAILED = "arrow:io:ipc:FailedToOpenRecordBatchReader"; static const char* IPC_RECORD_BATCH_READ_INVALID_INDEX = "arrow:io:ipc:InvalidIndex"; static const char* IPC_RECORD_BATCH_READ_FAILED = "arrow:io:ipc:ReadFailed"; +static const char* IPC_TABLE_READ_FAILED = "arrow:io:ipc:TableReadFailed"; +static const char* IPC_END_OF_STREAM = "arrow:io:ipc:EndOfStream"; } // namespace arrow::matlab::error diff --git a/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.cc b/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.cc new file mode 100644 index 0000000000000..f3c833484d38e --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.cc @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/matlab/io/ipc/proxy/record_batch_stream_reader.h" +#include "arrow/io/file.h" +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/tabular/proxy/table.h" +#include "arrow/util/utf8.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::io::ipc::proxy { + +RecordBatchStreamReader::RecordBatchStreamReader( + const std::shared_ptr reader) + : reader{std::move(reader)} { + REGISTER_METHOD(RecordBatchStreamReader, getSchema); + REGISTER_METHOD(RecordBatchStreamReader, readRecordBatch); + REGISTER_METHOD(RecordBatchStreamReader, hasNextRecordBatch); + REGISTER_METHOD(RecordBatchStreamReader, readTable); +} + +libmexclass::proxy::MakeResult RecordBatchStreamReader::make( + const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using RecordBatchStreamReaderProxy = + arrow::matlab::io::ipc::proxy::RecordBatchStreamReader; + + const mda::StructArray opts = constructor_arguments[0]; + + const mda::StringArray filename_mda = opts[0]["Filename"]; + const auto filename_utf16 = std::u16string(filename_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, + arrow::util::UTF16StringToUTF8(filename_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + MATLAB_ASSIGN_OR_ERROR(auto input_stream, arrow::io::ReadableFile::Open(filename_utf8), + error::FAILED_TO_OPEN_FILE_FOR_READ); + + MATLAB_ASSIGN_OR_ERROR(auto reader, + arrow::ipc::RecordBatchStreamReader::Open(input_stream), + error::IPC_RECORD_BATCH_READER_OPEN_FAILED); + + return std::make_shared(std::move(reader)); +} + +void RecordBatchStreamReader::getSchema(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using SchemaProxy = arrow::matlab::tabular::proxy::Schema; + + auto schema = reader->schema(); + + auto schema_proxy = std::make_shared(std::move(schema)); + const auto schema_proxy_id = + libmexclass::proxy::ProxyManager::manageProxy(schema_proxy); + + mda::ArrayFactory factory; + const auto schema_proxy_id_mda = factory.createScalar(schema_proxy_id); + context.outputs[0] = schema_proxy_id_mda; +} + +void RecordBatchStreamReader::readTable(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using TableProxy = arrow::matlab::tabular::proxy::Table; + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto table, reader->ToTable(), context, + error::IPC_TABLE_READ_FAILED); + auto table_proxy = std::make_shared(table); + const auto table_proxy_id = libmexclass::proxy::ProxyManager::manageProxy(table_proxy); + + mda::ArrayFactory factory; + const auto table_proxy_id_mda = factory.createScalar(table_proxy_id); + context.outputs[0] = table_proxy_id_mda; +} + +void RecordBatchStreamReader::readRecordBatch( + libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch; + using namespace libmexclass::error; + // If we don't have a "pre-cached" record batch to return, then try reading another + // record batch from the IPC Stream. If there are no more record batches in the stream, + // then error. + if (!nextRecordBatch) { + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(nextRecordBatch, reader->Next(), context, + error::IPC_RECORD_BATCH_READ_FAILED); + } + // Even if the read was "successful", the resulting record batch may be empty, + // signaling the end of the stream. 
+ if (!nextRecordBatch) { + context.error = + Error{error::IPC_END_OF_STREAM, + "Reached end of Arrow IPC Stream. No more record batches to read."}; + return; + } + auto record_batch_proxy = std::make_shared<RecordBatchProxy>(nextRecordBatch); + const auto record_batch_proxy_id = + libmexclass::proxy::ProxyManager::manageProxy(record_batch_proxy); + // Once we have "consumed" the next RecordBatch, set nextRecordBatch to nullptr + // so that the next call to hasNextRecordBatch correctly checks whether there are more + // record batches remaining in the IPC Stream. + nextRecordBatch = nullptr; + mda::ArrayFactory factory; + const auto record_batch_proxy_id_mda = factory.createScalar(record_batch_proxy_id); + context.outputs[0] = record_batch_proxy_id_mda; +} + +void RecordBatchStreamReader::hasNextRecordBatch( + libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + bool has_next_record_batch = true; + if (!nextRecordBatch) { + // Try to read another RecordBatch from the + // IPC Stream. + auto maybe_record_batch = reader->Next(); + if (!maybe_record_batch.ok()) { + has_next_record_batch = false; + } else { + // If we read a RecordBatch successfully, + // then "cache" the RecordBatch + // so that we can return it on the next + // call to readRecordBatch. + nextRecordBatch = *maybe_record_batch; + + // Even if the read was "successful", the resulting + // record batch may be empty, signaling that + // the end of the IPC stream has been reached. + if (!nextRecordBatch) { + has_next_record_batch = false; + } + } + } + + mda::ArrayFactory factory; + context.outputs[0] = factory.createScalar(has_next_record_batch); +} + +} // namespace arrow::matlab::io::ipc::proxy diff --git a/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.h b/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.h new file mode 100644 index 0000000000000..56fb293987825 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#pragma once + +#include "arrow/ipc/reader.h" +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::ipc::proxy { + +class RecordBatchStreamReader : public libmexclass::proxy::Proxy { + public: + RecordBatchStreamReader(std::shared_ptr<arrow::ipc::RecordBatchStreamReader> reader); + + ~RecordBatchStreamReader() = default; + + static libmexclass::proxy::MakeResult make( + const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + std::shared_ptr<arrow::ipc::RecordBatchStreamReader> reader; + std::shared_ptr<arrow::RecordBatch> nextRecordBatch; + + void getSchema(libmexclass::proxy::method::Context& context); + void readRecordBatch(libmexclass::proxy::method::Context& context); + void hasNextRecordBatch(libmexclass::proxy::method::Context& context); + void readTable(libmexclass::proxy::method::Context& context); +}; + +} // namespace arrow::matlab::io::ipc::proxy diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index a08a7495c00c9..902546fd052f8 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -36,6 +36,7 @@ #include "arrow/matlab/io/feather/proxy/writer.h" #include "arrow/matlab/io/ipc/proxy/record_batch_file_reader.h" #include "arrow/matlab/io/ipc/proxy/record_batch_file_writer.h" +#include "arrow/matlab/io/ipc/proxy/record_batch_stream_reader.h" #include "arrow/matlab/io/ipc/proxy/record_batch_stream_writer.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/tabular/proxy/schema.h" @@ -113,6 +114,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy( REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchFileReader , arrow::matlab::io::ipc::proxy::RecordBatchFileReader); REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchFileWriter , arrow::matlab::io::ipc::proxy::RecordBatchFileWriter); REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchStreamWriter , arrow::matlab::io::ipc::proxy::RecordBatchStreamWriter); + REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchStreamReader , arrow::matlab::io::ipc::proxy::RecordBatchStreamReader); // clang-format on diff --git a/matlab/src/matlab/+arrow/+io/+ipc/RecordBatchStreamReader.m b/matlab/src/matlab/+arrow/+io/+ipc/RecordBatchStreamReader.m new file mode 100644 index 0000000000000..60ca38eba9ad5 --- /dev/null +++ b/matlab/src/matlab/+arrow/+io/+ipc/RecordBatchStreamReader.m @@ -0,0 +1,83 @@ +%RECORDBATCHSTREAMREADER Class for reading Arrow record batches from the +% Arrow IPC Stream format. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License.
+ +classdef RecordBatchStreamReader < matlab.mixin.Scalar + + properties(SetAccess=private, GetAccess=public, Hidden) + Proxy + end + + properties (Dependent, SetAccess=private, GetAccess=public) + Schema + end + + methods + function obj = RecordBatchStreamReader(filename) + arguments + filename(1, 1) string {mustBeNonzeroLengthText} + end + args = struct(Filename=filename); + proxyName = "arrow.io.ipc.proxy.RecordBatchStreamReader"; + obj.Proxy = arrow.internal.proxy.create(proxyName, args); + end + + function schema = get.Schema(obj) + proxyID = obj.Proxy.getSchema(); + proxyName = "arrow.tabular.proxy.Schema"; + proxy = libmexclass.proxy.Proxy(ID=proxyID, Name=proxyName); + schema = arrow.tabular.Schema(proxy); + end + + function tf = hasnext(obj) + tf = obj.Proxy.hasNextRecordBatch(); + end + + function tf = done(obj) + tf = ~obj.Proxy.hasNextRecordBatch(); + end + + function arrowRecordBatch = read(obj) + % NOTE: This function is a "convenience alias" for the readRecordBatch + % method, which has a longer name. This is the exact same implementation + % as readRecordBatch. Since this method might be called in a tight loop, + % it should be slightly more efficient to call the C++ code directly, + % rather than invoking obj.readRecordBatch indirectly. We are intentionally + % trading off code duplication for performance here. + proxyID = obj.Proxy.readRecordBatch(); + proxyName = "arrow.tabular.proxy.RecordBatch"; + proxy = libmexclass.proxy.Proxy(ID=proxyID, Name=proxyName); + arrowRecordBatch = arrow.tabular.RecordBatch(proxy); + end + + function arrowRecordBatch = readRecordBatch(obj) + proxyID = obj.Proxy.readRecordBatch(); + proxyName = "arrow.tabular.proxy.RecordBatch"; + proxy = libmexclass.proxy.Proxy(ID=proxyID, Name=proxyName); + arrowRecordBatch = arrow.tabular.RecordBatch(proxy); + end + + function arrowTable = readTable(obj) + proxyID = obj.Proxy.readTable(); + proxyName = "arrow.tabular.proxy.Table"; + proxy = libmexclass.proxy.Proxy(ID=proxyID, Name=proxyName); + arrowTable = arrow.tabular.Table(proxy); + end + + end + +end diff --git a/matlab/test/arrow/io/ipc/tRecordBatchStreamReader.m b/matlab/test/arrow/io/ipc/tRecordBatchStreamReader.m new file mode 100644 index 0000000000000..6ca67197739ae --- /dev/null +++ b/matlab/test/arrow/io/ipc/tRecordBatchStreamReader.m @@ -0,0 +1,336 @@ +%TRECORDBATCHSTREAMREADER Unit tests for arrow.io.ipc.RecordBatchStreamReader. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef tRecordBatchStreamReader < matlab.unittest.TestCase + + properties + DataFolder + ZeroBatchStreamFile + OneBatchStreamFile + MultipleBatchStreamFile + RandomAccessFile + end + + properties (TestParameter) + RecordBatchReadFcn = {@read, @readRecordBatch} + end + + methods(TestClassSetup) + + function setupDataFolder(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + fixture = testCase.applyFixture(TemporaryFolderFixture); + testCase.DataFolder = string(fixture.Folder); + end + + function setupRandomAccessFile(testCase) + fieldA = arrow.field("A", arrow.string()); + fieldB = arrow.field("B", arrow.float32()); + schema = arrow.schema([fieldA, fieldB]); + fname = fullfile(testCase.DataFolder, "RandomAccessFile.arrow"); + writer = arrow.io.ipc.RecordBatchFileWriter(fname, schema); + writer.close(); + testCase.RandomAccessFile = fname; + end + + function setupZeroBatchStreamFile(testCase) + fieldA = arrow.field("A", arrow.string()); + fieldB = arrow.field("B", arrow.float32()); + schema = arrow.schema([fieldA, fieldB]); + fname = fullfile(testCase.DataFolder, "ZeroBatchStreamFile.arrows"); + writer = arrow.io.ipc.RecordBatchStreamWriter(fname, schema); + writer.close(); + testCase.ZeroBatchStreamFile = fname; + end + + function setupOneBatchStreamFile(testCase) + t = table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]); + recordBatch = arrow.recordBatch(t); + fname = fullfile(testCase.DataFolder, "OneBatchFile.arrows"); + writer = arrow.io.ipc.RecordBatchStreamWriter(fname, recordBatch.Schema); + writer.writeRecordBatch(recordBatch); + writer.close(); + testCase.OneBatchStreamFile = fname; + end + + function setupMultipleBatchStreamFile(testCase) + t1 = table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]); + t2 = table(["Row3"; "Row4"], single([3; 4]), VariableNames=["A", "B"]); + recordBatch1 = arrow.recordBatch(t1); + recordBatch2 = arrow.recordBatch(t2); + fname = fullfile(testCase.DataFolder, "MultipleBatchStreamFile.arrows"); + writer = arrow.io.ipc.RecordBatchStreamWriter(fname, recordBatch1.Schema); + writer.writeRecordBatch(recordBatch1); + writer.writeRecordBatch(recordBatch2); + writer.close(); + testCase.MultipleBatchStreamFile = fname; + end + end + + methods (Test) + + function ZeroLengthFilenameError(testCase) + % Verify RecordBatchStreamReader throws an exception with the + % identifier MATLAB:validators:mustBeNonzeroLengthText if the + % filename input argument given is a zero length string. + fcn = @() arrow.io.ipc.RecordBatchStreamReader(""); + testCase.verifyError(fcn, "MATLAB:validators:mustBeNonzeroLengthText"); + end + + function MissingStringFilenameError(testCase) + % Verify RecordBatchStreamReader throws an exception with the + % identifier MATLAB:validators:mustBeNonzeroLengthText if the + % filename input argument given is a missing string. + fcn = @() arrow.io.ipc.RecordBatchStreamReader(string(missing)); + testCase.verifyError(fcn, "MATLAB:validators:mustBeNonzeroLengthText"); + end + + function FilenameInvalidTypeError(testCase) + % Verify RecordBatchStreamReader throws an exception with the + % identifier MATLAB:validators:UnableToConvert if the filename + % input argument is neither a scalar string nor a char vector. + fcn = @() arrow.io.ipc.RecordBatchStreamReader(table); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + end + + function Schema(testCase) + % Verify the getter method for Schema returns the + % expected value. 
+ fieldA = arrow.field("A", arrow.string()); + fieldB = arrow.field("B", arrow.float32()); + expectedSchema = arrow.schema([fieldA fieldB]); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + testCase.verifyEqual(reader.Schema, expectedSchema); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.OneBatchStreamFile); + testCase.verifyEqual(reader.Schema, expectedSchema); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + testCase.verifyEqual(reader.Schema, expectedSchema); + end + + function SchemaNoSetter(testCase) + % Verify the Schema property is not settable. + fieldC = arrow.field("C", arrow.date32()); + schema = arrow.schema(fieldC); + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + testCase.verifyError(@() setfield(reader, "Schema", schema), "MATLAB:class:SetProhibited"); + end + + function ReadErrorIfEndOfStream(testCase, RecordBatchReadFcn) + % Verify read throws an execption with the identifier arrow:io:ipc:EndOfStream + % on an Arrow IPC Stream file containing zero batches. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + fcn = @() RecordBatchReadFcn(reader); + testCase.verifyError(fcn, "arrow:io:ipc:EndOfStream"); + end + + function ReadOneBatchStreamFile(testCase, RecordBatchReadFcn) + % Verify read can successfully read an Arrow IPC Stream file + % containing one batch. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.OneBatchStreamFile); + + expectedMatlabTable = table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]); + expected = arrow.recordBatch(expectedMatlabTable); + actual = RecordBatchReadFcn(reader); + testCase.verifyEqual(actual, expected); + + fcn = @() RecordBatchReadFcn(reader); + testCase.verifyError(fcn, "arrow:io:ipc:EndOfStream"); + end + + function ReadMultipleBatchStreamFile(testCase, RecordBatchReadFcn) + % Verify read can successfully read an Arrow IPC Stream file + % containing mulitple batches. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + + expectedMatlabTable1 = table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]); + expected1 = arrow.recordBatch(expectedMatlabTable1); + actual1 = RecordBatchReadFcn(reader); + testCase.verifyEqual(actual1, expected1); + + expectedMatlabTable2 = table(["Row3"; "Row4"], single([3; 4]), VariableNames=["A", "B"]); + expected2 = arrow.recordBatch(expectedMatlabTable2); + actual2 = RecordBatchReadFcn(reader); + testCase.verifyEqual(actual2, expected2); + + fcn = @() RecordBatchReadFcn(reader); + testCase.verifyError(fcn, "arrow:io:ipc:EndOfStream"); + end + + function HasNext(testCase, RecordBatchReadFcn) + % Verify that the hasnext method returns true the correct + % number of times depending on the number of record + % batches in an Arrow IPC Stream format. + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + % hasnext should return true 0 times for a 0 batch file. + iterations = 0; + while reader.hasnext() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 0); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.OneBatchStreamFile); + % hasnext should return true 1 time for a 1 batch file. 
+ iterations = 0; + while reader.hasnext() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 1); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + % hasnext should return true 2 times for a 2 batch file. + iterations = 0; + while reader.hasnext() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 2); + end + + function Done(testCase, RecordBatchReadFcn) + % Verify that the done method returns false the correct + % number of times depending on the number of record + % batches in an Arrow IPC Stream format. + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + % done should return false 0 times for a 0 batch file. + iterations = 0; + while ~reader.done() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 0); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.OneBatchStreamFile); + % done should return false 1 time for a 1 batch file. + iterations = 0; + while ~reader.done() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 1); + + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + % done should return false 2 times for a 2 batch file. + iterations = 0; + while ~reader.done() + RecordBatchReadFcn(reader); + iterations = iterations + 1; + end + testCase.verifyEqual(iterations, 2); + end + + function ReadTableZeroBatchStreamFile(testCase) + % Verify read can successfully read an Arrow IPC Stream file + % containing zero batches as an arrow.tabular.Table. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.ZeroBatchStreamFile); + matlabTable = table('Size', [0, 2], 'VariableTypes', ["string", "single"], 'VariableNames', ["A", "B"]); + expected = arrow.table(matlabTable); + actual = reader.readTable(); + testCase.verifyEqual(actual, expected); + end + + function ReadTableOneBatchStreamFile(testCase) + % Verify read can successfully read an Arrow IPC Stream file + % containing one batch as an arrow.tabular.Table. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.OneBatchStreamFile); + matlabTable = table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]); + expected = arrow.table(matlabTable); + actual = reader.readTable(); + testCase.verifyEqual(actual, expected); + end + + function ReadTableMultipleBatchStreamFile(testCase) + % Verify read can successfully read an Arrow IPC Stream file + % containing multiple batches as an arrow.tabular.Table. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + matlabTable = table(["Row1"; "Row2"; "Row3"; "Row4"], single([1; 2; 3; 4]), VariableNames=["A", "B"]); + expected = arrow.table(matlabTable); + actual = reader.readTable(); + testCase.verifyEqual(actual, expected); + end + + function ReadTableAfterReadRecordBatch(testCase, RecordBatchReadFcn) + % Verify readTable returns only the remaining record batches + % in an Arrow IPC Stream file after calling readRecordBatch first. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + + testCase.verifyTrue(reader.hasnext()); + testCase.verifyFalse(reader.done()); + + expectedRecordBatch = arrow.recordBatch(... + table(["Row1"; "Row2"], single([1; 2]), VariableNames=["A", "B"]) ... 
+ ); + actualRecordBatch = RecordBatchReadFcn(reader); + testCase.verifyEqual(actualRecordBatch, expectedRecordBatch); + + expectedTable = arrow.table(... + table(["Row3"; "Row4"], single([3; 4]), VariableNames=["A", "B"]) ... + ); + actualTable = reader.readTable(); + testCase.verifyEqual(actualTable, expectedTable); + + testCase.verifyFalse(reader.hasnext()); + testCase.verifyTrue(reader.done()); + end + + function ReadTableMultipleCalls(testCase) + % Verify readTable returns an empty table if it is called + % multiple times in a row. + reader = arrow.io.ipc.RecordBatchStreamReader(testCase.MultipleBatchStreamFile); + + expected = arrow.table(... + table(["Row1"; "Row2"; "Row3"; "Row4"], single([1; 2; 3; 4]), VariableNames=["A", "B"]) ... + ); + actual = reader.readTable(); + testCase.verifyEqual(actual, expected); + + testCase.verifyFalse(reader.hasnext()); + testCase.verifyTrue(reader.done()); + + expectedEmpty = arrow.table(... + table('Size', [0, 2], 'VariableTypes', ["string", "single"], 'VariableNames', ["A", "B"]) ... + ); + + actualEmpty = reader.readTable(); + testCase.verifyEqual(actualEmpty, expectedEmpty); + + testCase.verifyFalse(reader.hasnext()); + testCase.verifyTrue(reader.done()); + + actualEmpty = reader.readTable(); + testCase.verifyEqual(actualEmpty, expectedEmpty); + + testCase.verifyFalse(reader.hasnext()); + testCase.verifyTrue(reader.done()); + end + + function ErrorIfNotIpcStreamFile(testCase) + % Verify RecordBatchStreamReader throws an exception with the + % identifier arrow:io:ipc:FailedToOpenRecordBatchReader if + % the provided file is not an Arrow IPC Stream file. + fcn = @() arrow.io.ipc.RecordBatchStreamReader(testCase.RandomAccessFile); + testCase.verifyError(fcn, "arrow:io:ipc:FailedToOpenRecordBatchReader"); + end + + end + +end diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 29a737a6ecf25..27af19676b73b 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -83,6 +83,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_writer.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_writer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_stream_writer.cc") diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c39a1129ac17a..80d1cd31ac231 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -166,8 +166,17 @@ if($ENV{PYODIDE}) # modules (at least under Pyodide it does). 
set(Python3_INCLUDE_DIR $ENV{PYTHONINCLUDE}) set(Python3_LIBRARY $ENV{CPYTHONLIB}) - set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/core/include) set(Python3_EXECUTABLE) + execute_process(COMMAND ${Python3_EXECUTABLE} -c + "import numpy; print(numpy.__version__)" + OUTPUT_VARIABLE PYODIDE_NUMPY_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX MATCH "^([0-9]+)" PYODIDE_NUMPY_MAJOR_VERSION ${PYODIDE_NUMPY_VERSION}) + if(PYODIDE_NUMPY_MAJOR_VERSION GREATER_EQUAL 2) + set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/_core/include) + else() + set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/core/include) + endif() set(ENV{_PYTHON_SYSCONFIGDATA_NAME} $ENV{SYSCONFIG_NAME}) # we set the c and cxx compiler manually to bypass pywasmcross # which is pyodide's way of messing with C++ build parameters. diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8c8c09265d0bf..d00a731324c92 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,7 @@ def print_entry(label, value): float16, float32, float64, binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, - decimal128, decimal256, + decimal32, decimal64, decimal128, decimal256, list_, large_list, list_view, large_list_view, map_, struct, union, sparse_union, dense_union, @@ -180,7 +180,8 @@ def print_entry(label, value): ListViewType, LargeListViewType, MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, - FixedSizeBinaryType, Decimal128Type, Decimal256Type, + FixedSizeBinaryType, + Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, JsonType, OpaqueType, UuidType, @@ -216,7 +217,8 @@ def print_entry(label, value): Date32Array, Date64Array, TimestampArray, Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, - Decimal128Array, Decimal256Array, StructArray, ExtensionArray, + Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, + StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, JsonArray, OpaqueArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, @@ -224,7 +226,7 @@ def print_entry(label, value): Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar, FloatScalar, DoubleScalar, - Decimal128Scalar, Decimal256Scalar, + Decimal32Scalar, Decimal64Scalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c92bd4b61be41..2ef42051d9ad2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2329,6 +2329,15 @@ cdef class FixedSizeBinaryArray(Array): Concrete class for Arrow arrays of a fixed-size binary data type. """ +cdef class Decimal32Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal32 data type. + """ + +cdef class Decimal64Array(FixedSizeBinaryArray): + """ + Concrete class for Arrow arrays of decimal64 data type. + """ cdef class Decimal128Array(FixedSizeBinaryArray): """ @@ -4045,7 +4054,7 @@ cdef class StructArray(Array): memory_pool : MemoryPool (optional) For memory allocations, if required, otherwise uses default pool. type : pyarrow.StructType (optional) - Struct type for name and type of each child. + Struct type for name and type of each child.
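With the exports added to python/pyarrow/__init__.py and the new Decimal32Array/Decimal64Array classes above, the narrower decimal widths become usable from Python alongside decimal128. A minimal usage sketch, assuming a pyarrow build that includes these changes; the _array_classes mapping added a little further below is what routes the DECIMAL32/DECIMAL64 type ids to these classes.

    import decimal
    import pyarrow as pa

    # decimal32 holds up to 9 significant digits in 4 bytes,
    # decimal64 holds up to 18 significant digits in 8 bytes.
    arr32 = pa.array([decimal.Decimal("123.45")], type=pa.decimal32(5, 2))
    arr64 = pa.array([decimal.Decimal("1234567890.123")], type=pa.decimal64(13, 3))

    assert isinstance(arr32, pa.Decimal32Array)
    assert isinstance(arr64, pa.Decimal64Array)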
Returns ------- @@ -4707,6 +4716,8 @@ cdef dict _array_classes = { _Type_STRING_VIEW: StringViewArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, + _Type_DECIMAL32: Decimal32Array, + _Type_DECIMAL64: Decimal64Array, _Type_DECIMAL128: Decimal128Array, _Type_DECIMAL256: Decimal256Array, _Type_STRUCT: StructArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8bf61b73cc211..b2edeb0b4192f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -45,6 +45,16 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: c_bool Contains(const c_string& key) const +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass CDecimal32" arrow::Decimal32": + c_string ToString(int32_t scale) const + + +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass CDecimal64" arrow::Decimal64": + c_string ToString(int32_t scale) const + + cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: cdef cppclass CDecimal128" arrow::Decimal128": c_string ToString(int32_t scale) const @@ -110,6 +120,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_FLOAT" arrow::Type::FLOAT" _Type_DOUBLE" arrow::Type::DOUBLE" + _Type_DECIMAL32" arrow::Type::DECIMAL32" + _Type_DECIMAL64" arrow::Type::DECIMAL64" _Type_DECIMAL128" arrow::Type::DECIMAL128" _Type_DECIMAL256" arrow::Type::DECIMAL256" @@ -453,6 +465,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int byte_width() int bit_width() + cdef cppclass CDecimal32Type \ + " arrow::Decimal32Type"(CFixedSizeBinaryType): + CDecimal32Type(int precision, int scale) + int precision() + int scale() + + cdef cppclass CDecimal64Type \ + " arrow::Decimal64Type"(CFixedSizeBinaryType): + CDecimal64Type(int precision, int scale) + int precision() + int scale() + cdef cppclass CDecimal128Type \ " arrow::Decimal128Type"(CFixedSizeBinaryType): CDecimal128Type(int precision, int scale) @@ -680,6 +704,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) + cdef cppclass CDecimal32Array" arrow::Decimal32Array"( + CFixedSizeBinaryArray + ): + c_string FormatValue(int i) + + cdef cppclass CDecimal64Array" arrow::Decimal64Array"( + CFixedSizeBinaryArray + ): + c_string FormatValue(int i) + cdef cppclass CDecimal128Array" arrow::Decimal128Array"( CFixedSizeBinaryArray ): @@ -1263,6 +1297,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDoubleScalar" arrow::DoubleScalar"(CScalar): double value + cdef cppclass CDecimal32Scalar" arrow::Decimal32Scalar"(CScalar): + CDecimal32 value + + cdef cppclass CDecimal64Scalar" arrow::Decimal64Scalar"(CScalar): + CDecimal64 value + cdef cppclass CDecimal128Scalar" arrow::Decimal128Scalar"(CScalar): CDecimal128 value diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index f3d4e1eec0899..bc9811b92b007 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -185,6 +185,16 @@ cdef class FixedSizeBinaryType(DataType): const CFixedSizeBinaryType* fixed_size_binary_type +cdef class Decimal32Type(FixedSizeBinaryType): + cdef: + const CDecimal32Type* decimal32_type + + +cdef class Decimal64Type(FixedSizeBinaryType): + cdef: + const CDecimal64Type* decimal64_type + + cdef class Decimal128Type(FixedSizeBinaryType): cdef: const CDecimal128Type* decimal128_type @@ -430,6 +440,14 @@ cdef class 
FixedSizeBinaryArray(Array): pass +cdef class Decimal32Array(FixedSizeBinaryArray): + pass + + +cdef class Decimal64Array(FixedSizeBinaryArray): + pass + + cdef class Decimal128Array(FixedSizeBinaryArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6b82eb6566896..2c92ecbfa7344 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -87,9 +87,9 @@ def set_cpu_count(int count): def is_threading_enabled() -> bool: """ - Returns True if threading is enabled in libarrow. + Returns True if threading is enabled in libarrow. - If it isn't enabled, then python shouldn't create any + If it isn't enabled, then python shouldn't create any threads either, because we're probably on a system where threading doesn't work (e.g. Emscripten). """ @@ -109,6 +109,8 @@ Type_INT64 = _Type_INT64 Type_HALF_FLOAT = _Type_HALF_FLOAT Type_FLOAT = _Type_FLOAT Type_DOUBLE = _Type_DOUBLE +Type_DECIMAL32 = _Type_DECIMAL32 +Type_DECIMAL64 = _Type_DECIMAL64 Type_DECIMAL128 = _Type_DECIMAL128 Type_DECIMAL256 = _Type_DECIMAL256 Type_DATE32 = _Type_DATE32 diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 913e25e308254..d1fa1192debc3 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -111,6 +111,10 @@ cdef api object pyarrow_wrap_data_type( out = DurationType.__new__(DurationType) elif type.get().id() == _Type_FIXED_SIZE_BINARY: out = FixedSizeBinaryType.__new__(FixedSizeBinaryType) + elif type.get().id() == _Type_DECIMAL32: + out = Decimal32Type.__new__(Decimal32Type) + elif type.get().id() == _Type_DECIMAL64: + out = Decimal64Type.__new__(Decimal64Type) elif type.get().id() == _Type_DECIMAL128: out = Decimal128Type.__new__(Decimal128Type) elif type.get().id() == _Type_DECIMAL256: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2bfdcddf30736..2235cd0b981a6 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -336,6 +336,46 @@ cdef class DoubleScalar(Scalar): return sp.value if sp.is_valid else None +cdef class Decimal32Scalar(Scalar): + """ + Concrete class for decimal32 scalars. + """ + + def as_py(self): + """ + Return this value as a Python Decimal. + """ + cdef: + CDecimal32Scalar* sp = <CDecimal32Scalar*> self.wrapped.get() + CDecimal32Type* dtype = <CDecimal32Type*> sp.type.get() + if sp.is_valid: + return _pydecimal.Decimal( + frombytes(sp.value.ToString(dtype.scale())) + ) + else: + return None + + +cdef class Decimal64Scalar(Scalar): + """ + Concrete class for decimal64 scalars. + """ + + def as_py(self): + """ + Return this value as a Python Decimal. + """ + cdef: + CDecimal64Scalar* sp = <CDecimal64Scalar*> self.wrapped.get() + CDecimal64Type* dtype = <CDecimal64Type*> sp.type.get() + if sp.is_valid: + return _pydecimal.Decimal( + frombytes(sp.value.ToString(dtype.scale())) + ) + else: + return None + + cdef class Decimal128Scalar(Scalar): """ Concrete class for decimal128 scalars.
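The Decimal32Scalar and Decimal64Scalar classes above follow the existing Decimal128Scalar pattern: as_py() formats the stored value using the type's scale and hands the string to decimal.Decimal. A hedged usage sketch, assuming a build that includes this change; the _scalar_classes registration in the next hunk is what maps the DECIMAL32/DECIMAL64 type ids to these classes.

    import decimal
    import pyarrow as pa

    arr = pa.array([decimal.Decimal("123.45")], type=pa.decimal32(5, 2))
    scalar = arr[0]  # a Decimal32Scalar wrapping the stored value
    assert scalar.as_py() == decimal.Decimal("123.45")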
@@ -1132,6 +1172,8 @@ cdef dict _scalar_classes = { _Type_HALF_FLOAT: HalfFloatScalar, _Type_FLOAT: FloatScalar, _Type_DOUBLE: DoubleScalar, + _Type_DECIMAL32: Decimal32Scalar, + _Type_DECIMAL64: Decimal64Scalar, _Type_DECIMAL128: Decimal128Scalar, _Type_DECIMAL256: Decimal256Scalar, _Type_DATE32: Date32Scalar, diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 4e0a103e31899..a0f1d5bbbed8b 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1317,15 +1317,8 @@ struct ObjectWriterVisitor { out_values); } - Status Visit(const Decimal32Type& type) { - return Status::NotImplemented("Decimal32 type not yet implemented"); - } - - Status Visit(const Decimal64Type& type) { - return Status::NotImplemented("Decimal64 type not yet implemented"); - } - - Status Visit(const Decimal128Type& type) { + template + Status VisitDecimal(const DecimalT& type) { OwnedRef decimal; OwnedRef Decimal; RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); @@ -1333,7 +1326,7 @@ struct ObjectWriterVisitor { PyObject* decimal_constructor = Decimal.obj(); for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); + const auto& arr = checked_cast(*data.chunk(c)); for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsNull(i)) { @@ -1350,29 +1343,20 @@ struct ObjectWriterVisitor { return Status::OK(); } - Status Visit(const Decimal256Type& type) { - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); - PyObject* decimal_constructor = Decimal.obj(); + Status Visit(const Decimal32Type& type) { + return VisitDecimal(type); + } - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); + Status Visit(const Decimal64Type& type) { + return VisitDecimal(type); + } - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = - internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); - RETURN_IF_PYERROR(); - } - } - } + Status Visit(const Decimal128Type& type) { + return VisitDecimal(type); + } - return Status::OK(); + Status Visit(const Decimal256Type& type) { + return VisitDecimal(type); } template diff --git a/python/pyarrow/src/arrow/python/decimal.cc b/python/pyarrow/src/arrow/python/decimal.cc index 0c00fcfaa8e59..e6caff2201ddc 100644 --- a/python/pyarrow/src/arrow/python/decimal.cc +++ b/python/pyarrow/src/arrow/python/decimal.cc @@ -164,6 +164,24 @@ Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, } // namespace +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal32* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal64* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + Status DecimalFromPythonDecimal(PyObject* 
python_decimal, const DecimalType& arrow_type, Decimal128* out) { return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); diff --git a/python/pyarrow/src/arrow/python/decimal.h b/python/pyarrow/src/arrow/python/decimal.h index 1187037aed29e..83ded0b82b922 100644 --- a/python/pyarrow/src/arrow/python/decimal.h +++ b/python/pyarrow/src/arrow/python/decimal.h @@ -56,6 +56,40 @@ ARROW_PYTHON_EXPORT PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); +// \brief Convert a Python decimal to an Arrow Decimal32 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal32 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal32* out); + +// \brief Convert a Python object to an Arrow Decimal32 object +// \param[in] python_decimal A Python int or decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal32 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out); + +// \brief Convert a Python decimal to an Arrow Decimal64 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal64 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal64* out); + +// \brief Convert a Python object to an Arrow Decimal64 object +// \param[in] python_decimal A Python int or decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal64 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out); + // \brief Convert a Python decimal to an Arrow Decimal128 object // \param[in] python_decimal A Python decimal.Decimal instance // \param[in] arrow_type An instance of arrow::DecimalType diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e7195e99072b0..709338b4e7756 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -260,6 +260,18 @@ class PyValue { return value; } + static Result Convert(const Decimal32Type* type, const O&, I obj) { + Decimal32 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + + static Result Convert(const Decimal64Type* type, const O&, I obj) { + Decimal64 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 7a1b31a4d9d77..450cce74f1d43 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -92,6 +92,16 @@ pa.float32(), pa.float64() ]) +decimal32_type = st.builds( + pa.decimal32, + precision=st.integers(min_value=1, max_value=9), + scale=st.integers(min_value=1, max_value=9) +)
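The decimal32 hypothesis strategy above (and the decimal64 strategy that continues just below) caps precision at 9 and 18 digits respectively, matching the bounds the new factory functions enforce. A small sketch of those limits, assuming a build that includes this change:

    import pyarrow as pa

    pa.decimal32(9, 2)    # the largest precision a 32-bit decimal can hold
    pa.decimal64(18, 4)   # the largest precision a 64-bit decimal can hold

    try:
        pa.decimal32(10, 0)
    except ValueError as exc:
        print(exc)        # precision should be between 1 and 9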
+decimal64_type = st.builds( + pa.decimal64, + precision=st.integers(min_value=1, max_value=18), + scale=st.integers(min_value=1, max_value=18) +) decimal128_type = st.builds( pa.decimal128, precision=st.integers(min_value=1, max_value=38), diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 95a4853ed3b1a..6f28205a18e13 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1900,7 +1900,9 @@ def test_fsl_to_fsl_cast(value_type): FloatToDecimalCase = namedtuple('FloatToDecimalCase', ('precision', 'scale', 'float_val')) -decimal_type_traits = [DecimalTypeTraits('decimal128', pa.decimal128, 38), +decimal_type_traits = [DecimalTypeTraits('decimal32', pa.decimal32, 9), + DecimalTypeTraits('decimal64', pa.decimal64, 18), + DecimalTypeTraits('decimal128', pa.decimal128, 38), DecimalTypeTraits('decimal256', pa.decimal256, 76)] @@ -1991,7 +1993,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, # very high precisions as rounding errors can accumulate in # the iterative algorithm (GH-35576). diff_digits = abs(actual - expected) * 10**decimal_ty.scale - limit = 2 if decimal_ty.precision < max_precision - 1 else 4 + limit = 2 if decimal_ty.precision < max_precision - 2 else 4 assert diff_digits <= limit, ( f"float_val = {float_val!r}, precision={decimal_ty.precision}, " f"expected = {expected!r}, actual = {actual!r}, " @@ -2041,6 +2043,11 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): mantissa_digits = math.floor(math.log10(2**mantissa_bits)) max_precision = decimal_traits.max_precision + # For example, decimal32 <-> float64 + if max_precision < mantissa_digits: + mantissa_bits = math.floor(math.log2(10**max_precision)) + mantissa_digits = math.floor(math.log10(2**mantissa_bits)) + with decimal.localcontext() as ctx: precision = mantissa_digits ctx.prec = precision diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c3589877e6423..07286125c4cf6 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1592,7 +1592,7 @@ def test_sequence_mixed_types_with_specified_type_fails(): def test_sequence_decimal(): data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=7, scale=3)) assert arr.to_pylist() == data @@ -1601,28 +1601,28 @@ def test_sequence_decimal_different_precisions(): data = [ decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234') ] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=13, scale=3)) assert arr.to_pylist() == data def test_sequence_decimal_no_scale(): data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=10)) assert arr.to_pylist() == data def test_sequence_decimal_negative(): data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=10, scale=6)) assert arr.to_pylist() == data def test_sequence_decimal_no_whole_part(): data = 
[decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=7, scale=7)) assert arr.to_pylist() == data diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 3bb4440e89750..978c92307a69e 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -256,7 +256,9 @@ def test_explicit_schema_decimal(self): expected = { 'a': [Decimal("1"), Decimal("1.45"), Decimal("-23.456"), None], } - for type_factory in (pa.decimal128, pa.decimal256): + + decimal_types = (pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256) + for type_factory in decimal_types: schema = pa.schema([('a', type_factory(9, 4))]) opts = ParseOptions(explicit_schema=schema) table = self.read_bytes(rows, parse_options=opts) diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index b1eef176665af..6ed999db42cee 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -17,7 +17,6 @@ import contextlib import os -import platform import signal import subprocess import sys @@ -30,15 +29,19 @@ pytestmark = pytest.mark.processes possible_backends = ["system", "jemalloc", "mimalloc"] +# Backends which are expected to be present in all builds of PyArrow, +# except if the user manually recompiled Arrow C++. +mandatory_backends = ["system", "mimalloc"] -should_have_jemalloc = (sys.platform == "linux" and platform.machine() == 'x86_64') -should_have_mimalloc = sys.platform == "win32" + +def backend_factory(backend_name): + return getattr(pa, f"{backend_name}_memory_pool") def supported_factories(): yield pa.default_memory_pool - for backend in pa.supported_memory_backends(): - yield getattr(pa, f"{backend}_memory_pool") + for backend_name in pa.supported_memory_backends(): + yield backend_factory(backend_name) @contextlib.contextmanager @@ -149,17 +152,12 @@ def check_env_var(name, expected, *, expect_warning=False): def test_env_var(): - check_env_var("system", ["system"]) - if should_have_jemalloc: - check_env_var("jemalloc", ["jemalloc"]) - if should_have_mimalloc: - check_env_var("mimalloc", ["mimalloc"]) + for backend_name in mandatory_backends: + check_env_var(backend_name, [backend_name]) check_env_var("nonexistent", possible_backends, expect_warning=True) -def test_specific_memory_pools(): - specific_pools = set() - +def test_memory_pool_factories(): def check(factory, name, *, can_fail=False): if can_fail: try: @@ -169,23 +167,16 @@ def check(factory, name, *, can_fail=False): else: pool = factory() assert pool.backend_name == name - specific_pools.add(pool) - check(pa.system_memory_pool, "system") - check(pa.jemalloc_memory_pool, "jemalloc", - can_fail=not should_have_jemalloc) - check(pa.mimalloc_memory_pool, "mimalloc", - can_fail=not should_have_mimalloc) + for backend_name in possible_backends: + check(backend_factory(backend_name), backend_name, + can_fail=backend_name not in mandatory_backends) def test_supported_memory_backends(): backends = pa.supported_memory_backends() - - assert "system" in backends - if should_have_jemalloc: - assert "jemalloc" in backends - if should_have_mimalloc: - assert "mimalloc" in backends + assert set(backends) >= set(mandatory_backends) + assert set(backends) <= set(possible_backends) def run_debug_memory_pool(pool_factory, env_value): @@ -246,6 +237,9 @@ def test_debug_memory_pool_warn(pool_factory): 
def check_debug_memory_pool_disabled(pool_factory, env_value, msg): + if sys.maxsize < 2**32: + # GH-45011: mimalloc may print warnings in this test on 32-bit Linux, ignore. + pytest.skip("Test may fail on 32-bit platforms") res = run_debug_memory_pool(pool_factory.__name__, env_value) # The subprocess either returned successfully or was killed by a signal # (due to writing out of bounds), depending on the underlying allocator. diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 0b2055018f695..dbba7852190f4 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -165,6 +165,8 @@ def test_set_timezone_db_path_non_windows(): pa.Time32Type, pa.Time64Type, pa.TimestampType, + pa.Decimal32Type, + pa.Decimal64Type, pa.Decimal128Type, pa.Decimal256Type, pa.DictionaryType, diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 1b467d523304c..b0f9e813b103d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. -import pytest import decimal import datetime +from pathlib import Path +import shutil +import subprocess +import sys + +import pytest import pyarrow as pa from pyarrow import fs @@ -140,6 +145,57 @@ def test_example_using_json(filename, datadir): check_example_file(path, table, need_fix=True) +def test_timezone_database_absent(datadir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone database is not found. + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + os.environ['TZDIR'] = '/tmp/non_existent' + + from pyarrow import orc + try: + orc_file = orc.ORCFile({str(path)!r}) + orc_file.read() + except Exception as e: + assert "time zone database" in str(e).lower(), e + else: + assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + +def test_timezone_absent(datadir, tmpdir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone database is present but the timezone + # is not found (GH-40633). 
+ source_tzdir = Path('/usr/share/zoneinfo') + if not source_tzdir.exists(): + pytest.skip(f"Test needs timezone database in {source_tzdir}") + tzdir = Path(tmpdir / 'zoneinfo') + try: + shutil.copytree(source_tzdir, tzdir, symlinks=True) + except OSError as e: + pytest.skip(f"Failed to copy timezone database: {e}") + (tzdir / 'US' / 'Pacific').unlink(missing_ok=True) + + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + os.environ['TZDIR'] = {str(tzdir)!r} + + from pyarrow import orc + orc_file = orc.ORCFile({str(path)!r}) + try: + orc_file.read() + except Exception as e: + assert "zoneinfo/US/Pacific" in str(e), e + else: + assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + def test_orcfile_empty(datadir): from pyarrow import orc diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index bdcb6c2b42d78..b6d36787fbd37 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -615,6 +615,8 @@ def test_type_schema_pickling(pickle_module): pa.date64(), pa.timestamp('ms'), pa.timestamp('ns'), + pa.decimal32(9, 3), + pa.decimal64(11, 4), pa.decimal128(12, 2), pa.decimal256(76, 38), pa.field('a', 'string', metadata={b'foo': b'bar'}), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index de439b6bb8cd7..926de46318036 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -57,6 +57,8 @@ def get_many_types(): pa.float16(), pa.float32(), pa.float64(), + pa.decimal32(9, 4), + pa.decimal64(18, 4), pa.decimal128(19, 4), pa.decimal256(76, 38), pa.string(), @@ -139,18 +141,38 @@ def test_null_field_may_not_be_non_nullable(): def test_is_decimal(): + decimal32 = pa.decimal32(9, 4) + decimal64 = pa.decimal64(18, 4) decimal128 = pa.decimal128(19, 4) decimal256 = pa.decimal256(76, 38) int32 = pa.int32() + assert types.is_decimal(decimal32) + assert types.is_decimal(decimal64) assert types.is_decimal(decimal128) assert types.is_decimal(decimal256) assert not types.is_decimal(int32) + assert types.is_decimal32(decimal32) + assert not types.is_decimal32(decimal64) + assert not types.is_decimal32(decimal128) + assert not types.is_decimal32(decimal256) + assert not types.is_decimal32(int32) + + assert not types.is_decimal64(decimal32) + assert types.is_decimal64(decimal64) + assert not types.is_decimal64(decimal128) + assert not types.is_decimal64(decimal256) + assert not types.is_decimal64(int32) + + assert not types.is_decimal128(decimal32) + assert not types.is_decimal128(decimal64) assert types.is_decimal128(decimal128) assert not types.is_decimal128(decimal256) assert not types.is_decimal128(int32) + assert not types.is_decimal256(decimal32) + assert not types.is_decimal256(decimal64) assert not types.is_decimal256(decimal128) assert types.is_decimal256(decimal256) assert not types.is_decimal256(int32) @@ -970,6 +992,8 @@ def test_bit_and_byte_width(): (pa.float16(), 16, 2), (pa.timestamp('s'), 64, 8), (pa.date32(), 32, 4), + (pa.decimal32(9, 4), 32, 4), + (pa.decimal64(18, 4), 64, 8), (pa.decimal128(19, 4), 128, 16), (pa.decimal256(76, 38), 256, 32), (pa.binary(42), 42 * 8, 42), @@ -1002,6 +1026,14 @@ def test_fixed_size_binary_byte_width(): def test_decimal_properties(): + ty = pa.decimal32(9, 4) + assert ty.byte_width == 4 + assert ty.precision == 9 + assert ty.scale == 4 + ty = pa.decimal64(18, 4) + assert ty.byte_width == 8 + assert ty.precision == 18 + assert ty.scale 
== 4 ty = pa.decimal128(19, 4) assert ty.byte_width == 16 assert ty.precision == 19 @@ -1013,6 +1045,18 @@ def test_decimal_properties(): def test_decimal_overflow(): + pa.decimal32(1, 0) + pa.decimal32(9, 0) + for i in (0, -1, 10): + with pytest.raises(ValueError): + pa.decimal32(i, 0) + + pa.decimal64(1, 0) + pa.decimal64(18, 0) + for i in (0, -1, 19): + with pytest.raises(ValueError): + pa.decimal64(i, 0) + pa.decimal128(1, 0) pa.decimal128(38, 0) for i in (0, -1, 39): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0d6787cf2a049..3caf068a4c9b1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -73,7 +73,10 @@ def _get_pandas_type_map(): _Type_STRING: np.object_, _Type_LIST: np.object_, _Type_MAP: np.object_, + _Type_DECIMAL32: np.object_, + _Type_DECIMAL64: np.object_, _Type_DECIMAL128: np.object_, + _Type_DECIMAL256: np.object_, }) return _pandas_type_map @@ -1417,6 +1420,104 @@ cdef class FixedSizeBinaryType(DataType): return binary, (self.byte_width,) +cdef class Decimal32Type(FixedSizeBinaryType): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + FixedSizeBinaryType.init(self, type) + self.decimal32_type = type.get() + + def __reduce__(self): + return decimal32, (self.precision, self.scale) + + @property + def precision(self): + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ + return self.decimal32_type.precision() + + @property + def scale(self): + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ + return self.decimal32_type.scale() + + +cdef class Decimal64Type(FixedSizeBinaryType): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + FixedSizeBinaryType.init(self, type) + self.decimal64_type = type.get() + + def __reduce__(self): + return decimal64, (self.precision, self.scale) + + @property + def precision(self): + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ + return self.decimal64_type.precision() + + @property + def scale(self): + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ + return self.decimal64_type.scale() + + cdef class Decimal128Type(FixedSizeBinaryType): """ Concrete class for decimal128 data types. @@ -2549,7 +2650,11 @@ cdef class Field(_Weakrefable): @property def metadata(self): """ - The field metadata. + The field metadata (if any is set). + + Returns + ------- + metadata : dict or None Examples -------- @@ -2982,11 +3087,11 @@ cdef class Schema(_Weakrefable): @property def metadata(self): """ - The schema's metadata. + The schema's metadata (if any is set). 
Returns ------- - metadata: dict + metadata: dict or None Examples -------- @@ -4496,6 +4601,116 @@ def float64(): return primitive_type(_Type_DOUBLE) +cpdef DataType decimal32(int precision, int scale=0): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal('123.45') + >>> pa.array([a], pa.decimal32(5, 2)) + + [ + 123.45 + ] + """ + cdef shared_ptr[CDataType] decimal_type + if precision < 1 or precision > 9: + raise ValueError("precision should be between 1 and 9") + decimal_type.reset(new CDecimal32Type(precision, scale)) + return pyarrow_wrap_data_type(decimal_type) + + +cpdef DataType decimal64(int precision, int scale=0): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal('123.45') + >>> pa.array([a], pa.decimal64(5, 2)) + + [ + 123.45 + ] + """ + cdef shared_ptr[CDataType] decimal_type + if precision < 1 or precision > 18: + raise ValueError("precision should be between 1 and 18") + decimal_type.reset(new CDecimal64Type(precision, scale)) + return pyarrow_wrap_data_type(decimal_type) + + cpdef DataType decimal128(int precision, int scale=0): """ Create decimal type with precision and scale and 128-bit width. 
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 66b1ec33953a9..2bb5cfcf8b739 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -32,7 +32,8 @@ lib.Type_UINT64} _INTEGER_TYPES = _SIGNED_INTEGER_TYPES | _UNSIGNED_INTEGER_TYPES _FLOATING_TYPES = {lib.Type_HALF_FLOAT, lib.Type_FLOAT, lib.Type_DOUBLE} -_DECIMAL_TYPES = {lib.Type_DECIMAL128, lib.Type_DECIMAL256} +_DECIMAL_TYPES = {lib.Type_DECIMAL32, lib.Type_DECIMAL64, lib.Type_DECIMAL128, + lib.Type_DECIMAL256} _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64} _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} _INTERVAL_TYPES = {lib.Type_INTERVAL_MONTH_DAY_NANO} @@ -289,6 +290,16 @@ def is_decimal(t): return t.id in _DECIMAL_TYPES +@doc(is_null, datatype="decimal32") +def is_decimal32(t): + return t.id == lib.Type_DECIMAL32 + + +@doc(is_null, datatype="decimal64") +def is_decimal64(t): + return t.id == lib.Type_DECIMAL64 + + @doc(is_null, datatype="decimal128") def is_decimal128(t): return t.id == lib.Type_DECIMAL128 diff --git a/ruby/red-arrow/lib/arrow/column.rb b/ruby/red-arrow/lib/arrow/column.rb index ba575381eefdc..a1de0d61282c2 100644 --- a/ruby/red-arrow/lib/arrow/column.rb +++ b/ruby/red-arrow/lib/arrow/column.rb @@ -58,11 +58,11 @@ def reverse_each(&block) @data.reverse_each(&block) end - def n_rows - @data.n_rows + def size + @data.size end - alias_method :size, :n_rows - alias_method :length, :n_rows + alias_method :length, :size + alias_method :n_rows, :size def n_nulls @data.n_nulls diff --git a/ruby/red-arrow/test/test-column.rb b/ruby/red-arrow/test/test-column.rb index f78377e363ba4..c3a6ee0f4d34f 100644 --- a/ruby/red-arrow/test/test-column.rb +++ b/ruby/red-arrow/test/test-column.rb @@ -49,6 +49,14 @@ def setup assert_equal([false, nil, true], @column.reverse_each.to_a) end + test("#size") do + assert_equal(3, @column.size) + end + + test("#length") do + assert_equal(3, @column.length) + end + test("#n_rows") do assert_equal(3, @column.n_rows) end diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb index e94c26f2e329b..fec640343c60a 100644 --- a/ruby/red-arrow/test/test-record-batch.rb +++ b/ruby/red-arrow/test/test-record-batch.rb @@ -178,5 +178,11 @@ def setup @record_batch[[:c, "a", -1, 3..4]]) end end + + sub_test_case("#column") do + test("#size") do + assert_equal(@counts.size, @record_batch[:count].size) + end + end end end
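Rounding out the Python-side changes, here is a short sketch of how the new pyarrow.types predicates and the fixed byte widths behave, mirroring the assertions added in test_types.py above (again assuming a build that includes this patch):

    import pyarrow as pa
    import pyarrow.types as types

    t32 = pa.decimal32(9, 4)
    t64 = pa.decimal64(18, 4)

    # Both widths count as decimals, but each width-specific predicate
    # matches only its own type id.
    assert types.is_decimal(t32) and types.is_decimal(t64)
    assert types.is_decimal32(t32) and not types.is_decimal32(t64)
    assert types.is_decimal64(t64) and not types.is_decimal64(t32)

    # decimal32 is stored in 4 bytes, decimal64 in 8 bytes.
    assert t32.byte_width == 4 and t64.byte_width == 8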