diff --git a/.env b/.env
index 298c100c094b0..f379ca14cd205 100644
--- a/.env
+++ b/.env
@@ -98,9 +98,10 @@ VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release
 # ci/docker/python-wheel-windows-vs2019.dockerfile.
 # This is a workaround for our CI problem that "archery docker build" doesn't
 # use pulled built images in dev/tasks/python-wheels/github.windows.yml.
-PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-03-19
+PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-04-09
 
-# Use conanio/${CONAN} for "docker-compose run --rm conan". See
-# https://github.com/conan-io/conan-docker-tools#readme for available
-# images.
-CONAN=gcc10
+# Use conanio/${CONAN_BASE}:${CONAN_VERSION} for "docker-compose run --rm conan".
+# See https://github.com/conan-io/conan-docker-tools#readme and
+# https://hub.docker.com/u/conanio for available images.
+CONAN_BASE=gcc10
+CONAN_VERSION=1.62.0
diff --git a/ci/conan/all/conan_cmake_project_include.cmake b/ci/conan/all/conan_cmake_project_include.cmake
new file mode 100644
index 0000000000000..a6dee0c43461c
--- /dev/null
+++ b/ci/conan/all/conan_cmake_project_include.cmake
@@ -0,0 +1,35 @@
+# MIT License
+#
+# Copyright (c) 2019 Conan.io
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+if(ARROW_S3)
+  find_package(AWSSDK REQUIRED)
+  # Fix issue where scripts expect a variable called "AWSSDK_LINK_LIBRARIES"
+  # which is not defined by the generated AWSSDKConfig.cmake
+  if(NOT DEFINED AWSSDK_LINK_LIBRARIES)
+    set(AWSSDK_LINK_LIBRARIES "${AWSSDK_LIBRARIES}")
+  endif()
+
+  # Causes the logic used for the generated .pc file to not run,
+  # avoiding introspection of the target `aws-cpp-sdk::aws-cpp-sdk`.
+  # This is fine because the generated .pc file is not of use.
+  set(AWSSDK_SOURCE "conan")
+endif()
diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml
index 7402272a4b366..fb75f3995c62e 100644
--- a/ci/conan/all/conandata.yml
+++ b/ci/conan/all/conandata.yml
@@ -21,6 +21,30 @@
 # SOFTWARE.
sources: + "15.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-15.0.0/apache-arrow-15.0.0.tar.gz?action=download" + sha256: "01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d" + "14.0.2": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" + sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" + "14.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.1/apache-arrow-14.0.1.tar.gz?action=download" + sha256: "5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e" + "14.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.0/apache-arrow-14.0.0.tar.gz?action=download" + sha256: "4eb0da50ec071baf15fc163cb48058931e006f1c862c8def0e180fd07d531021" + "13.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-13.0.0/apache-arrow-13.0.0.tar.gz?action=download" + sha256: "35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6" + "12.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.1/apache-arrow-12.0.1.tar.gz?action=download" + sha256: "3481c411393aa15c75e88d93cf8315faf7f43e180fe0790128d3840d417de858" + "12.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.0/apache-arrow-12.0.0.tar.gz?action=download" + sha256: "ddd8347882775e53af7d0965a1902b7d8fcd0a030fd14f783d4f85e821352d52" + "11.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-11.0.0/apache-arrow-11.0.0.tar.gz?action=download" + sha256: "2dd8f0ea0848a58785628ee3a57675548d509e17213a2f5d72b0d900b43f5430" "10.0.1": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.1/apache-arrow-10.0.1.tar.gz?action=download" sha256: "c814e0670112a22c1a6ec03ab420a52ae236a9a42e9e438c3cbd37f37e658fb3" @@ -36,12 +60,6 @@ sources: "7.0.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz?action=download" sha256: "e8f49b149a15ecef4e40fcfab1b87c113c6b1ee186005c169e5cdf95d31a99de" - "2.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-2.0.0/apache-arrow-2.0.0.tar.gz?action=download" - sha256: "be0342cc847bb340d86aeaef43596a0b6c1dbf1ede9c789a503d939e01c71fbe" - "1.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-1.0.0/apache-arrow-1.0.0.tar.gz?action=download" - sha256: "86ddb9feb48203a5aaf9cc4f2827525e20a2ca4d7239e492af17e74532ccf243" patches: "8.0.1": - patch_file: "patches/8.0.0-0005-install-utils.patch" @@ -64,23 +82,3 @@ patches: - patch_file: "patches/7.0.0-0007-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "2.0.0": - - patch_file: "patches/2.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/2.0.0-0005-gandiva-engine.patch" - patch_description: "fix grandiva compilation error" - patch_type: "official" - - patch_file: "patches/2.0.0-0008-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" - "1.0.0": - - patch_file: "patches/1.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/1.0.0-0005-fix-make12-namespace.patch" - patch_description: "fix ambiguous `make12` function between std and date" - patch_type: "official" - - patch_file: "patches/1.0.0-0006-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 7e87f82e7e018..178cd03da1555 100644 --- 
a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -21,12 +21,12 @@ # SOFTWARE. from conan import ConanFile -from conan.errors import ConanInvalidConfiguration -from conan.tools.microsoft import is_msvc_static_runtime, is_msvc, check_min_vs -from conan.tools.files import export_conandata_patches, apply_conandata_patches, get, copy, rmdir +from conan.errors import ConanInvalidConfiguration, ConanException from conan.tools.build import check_min_cppstd, cross_building -from conan.tools.scm import Version from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout +from conan.tools.files import apply_conandata_patches, copy, export_conandata_patches, get, rmdir +from conan.tools.microsoft import is_msvc, is_msvc_static_runtime +from conan.tools.scm import Version import os import glob @@ -39,7 +39,8 @@ class ArrowConan(ConanFile): license = ("Apache-2.0",) url = "https://github.com/conan-io/conan-center-index" homepage = "https://arrow.apache.org/" - topics = ("memory", "gandiva", "parquet", "skyhook", "plasma", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + topics = ("memory", "gandiva", "parquet", "skyhook", "acero", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + package_type = "library" settings = "os", "arch", "compiler", "build_type" options = { "shared": [True, False], @@ -48,15 +49,15 @@ class ArrowConan(ConanFile): "parquet": ["auto", True, False], "substrait": [True, False], "skyhook": [True, False], - "plasma": [True, False], + "acero": [True, False], "cli": [True, False], "compute": ["auto", True, False], - "acero": ["auto", True, False], "dataset_modules": ["auto", True, False], "deprecated": [True, False], "encryption": [True, False], "filesystem_layer": [True, False], "hdfs_bridgs": [True, False], + "plasma": [True, False, "deprecated"], "simd_level": [None, "default", "sse4_2", "avx2", "avx512", "neon", ], "runtime_simd_level": [None, "sse4_2", "avx2", "avx512", "max"], "with_backtrace": [True, False], @@ -70,8 +71,9 @@ class ArrowConan(ConanFile): "with_glog": ["auto", True, False], "with_grpc": ["auto", True, False], "with_jemalloc": ["auto", True, False], - "with_mimalloc": ["auto", True, False], + "with_mimalloc": [True, False], "with_json": [True, False], + "with_thrift": ["auto", True, False], "with_llvm": ["auto", True, False], "with_openssl": ["auto", True, False], "with_opentelemetry": [True, False], @@ -91,43 +93,44 @@ class ArrowConan(ConanFile): "shared": False, "fPIC": True, "gandiva": False, - "parquet": "auto", + "parquet": False, "skyhook": False, "substrait": False, - "plasma": False, + "acero": False, "cli": False, - "compute": "auto", - "acero": "auto", - "dataset_modules": "auto", + "compute": False, + "dataset_modules": False, "deprecated": True, "encryption": False, "filesystem_layer": False, "hdfs_bridgs": False, + "plasma": "deprecated", "simd_level": "default", "runtime_simd_level": "max", "with_backtrace": False, - "with_boost": "auto", + "with_boost": False, "with_brotli": False, "with_bz2": False, "with_csv": False, "with_cuda": False, - "with_flight_rpc": "auto", + "with_flight_rpc": False, "with_flight_sql": False, "with_gcs": False, - "with_gflags": "auto", - "with_jemalloc": "auto", + "with_gflags": False, + "with_jemalloc": False, "with_mimalloc": False, - "with_glog": "auto", - "with_grpc": "auto", + "with_glog": False, + "with_grpc": False, "with_json": False, - "with_llvm": "auto", - "with_openssl": "auto", + "with_thrift": False, + "with_llvm": False, + "with_openssl": False, 
"with_opentelemetry": False, "with_orc": False, - "with_protobuf": "auto", - "with_re2": "auto", + "with_protobuf": False, + "with_re2": False, "with_s3": False, - "with_utf8proc": "auto", + "with_utf8proc": False, "with_lz4": False, "with_snappy": False, "with_zlib": False, @@ -136,283 +139,147 @@ class ArrowConan(ConanFile): short_paths = True @property - def _minimum_cpp_standard(self): + def _min_cppstd(self): # arrow >= 10.0.0 requires C++17. # https://github.com/apache/arrow/pull/13991 - return 11 if Version(self.version) < "10.0.0" else 17 + return "11" if Version(self.version) < "10.0.0" else "17" @property def _compilers_minimum_version(self): return { - "gcc": "8", - "clang": "7", - "apple-clang": "10", - } + "11": { + "clang": "3.9", + }, + "17": { + "gcc": "8", + "clang": "7", + "apple-clang": "10", + "Visual Studio": "15", + "msvc": "191", + }, + }.get(self._min_cppstd, {}) def export_sources(self): export_conandata_patches(self) + copy(self, "conan_cmake_project_include.cmake", self.recipe_folder, os.path.join(self.export_sources_folder, "src")) def config_options(self): if self.settings.os == "Windows": del self.options.fPIC - if Version(self.version) < "2.0.0": - del self.options.simd_level - del self.options.runtime_simd_level - elif Version(self.version) < "6.0.0": - self.options.simd_level = "sse4_2" - if Version(self.version) < "6.0.0": - del self.options.with_gcs - if Version(self.version) < "7.0.0": - del self.options.skyhook - del self.options.with_flight_sql - del self.options.with_opentelemetry if Version(self.version) < "8.0.0": del self.options.substrait + if is_msvc(self): + self.options.with_boost = True def configure(self): if self.options.shared: self.options.rm_safe("fPIC") - def validate(self): - if self.info.settings.compiler.cppstd: - check_min_cppstd(self, self._minimum_cpp_standard) - - if self._minimum_cpp_standard == 11: - if self.info.settings.compiler == "clang" and self.info.settings.compiler.version <= Version("3.9"): - raise ConanInvalidConfiguration("This recipe does not support this compiler version") - else: - check_min_vs(self, 191) - if not is_msvc(self): - minimum_version = self._compilers_minimum_version.get(str(self.info.settings.compiler), False) - if minimum_version and Version(self.info.settings.compiler.version) < minimum_version: - raise ConanInvalidConfiguration( - f"{self.ref} requires C++{self._minimum_cpp_standard}, which your compiler does not support." 
- ) - - if self.options.shared: - del self.options.fPIC - if self.options.compute == False and not self._compute(True): - raise ConanInvalidConfiguration("compute options is required (or choose auto)") - if self.options.acero == False and not self._acero(True): - raise ConanInvalidConfiguration("acero options is required (or choose auto)") - if self.options.parquet == False and self._parquet(True): - raise ConanInvalidConfiguration("parquet options is required (or choose auto)") - if self.options.dataset_modules == False and self._dataset_modules(True): - raise ConanInvalidConfiguration("dataset_modules options is required (or choose auto)") - if self.options.get_safe("skyhook", False): - raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") - if self.options.with_jemalloc == False and self._with_jemalloc(True): - raise ConanInvalidConfiguration("with_jemalloc option is required (or choose auto)") - if self.options.with_re2 == False and self._with_re2(True): - raise ConanInvalidConfiguration("with_re2 option is required (or choose auto)") - if self.options.with_protobuf == False and self._with_protobuf(True): - raise ConanInvalidConfiguration("with_protobuf option is required (or choose auto)") - if self.options.with_gflags == False and self._with_gflags(True): - raise ConanInvalidConfiguration("with_gflags options is required (or choose auto)") - if self.options.with_flight_rpc == False and self._with_flight_rpc(True): - raise ConanInvalidConfiguration("with_flight_rpc options is required (or choose auto)") - if self.options.with_grpc == False and self._with_grpc(True): - raise ConanInvalidConfiguration("with_grpc options is required (or choose auto)") - if self.options.with_boost == False and self._with_boost(True): - raise ConanInvalidConfiguration("with_boost options is required (or choose auto)") - if self.options.with_openssl == False and self._with_openssl(True): - raise ConanInvalidConfiguration("with_openssl options is required (or choose auto)") - if self.options.with_llvm == False and self._with_llvm(True): - raise ConanInvalidConfiguration("with_llvm options is required (or choose auto)") - if self.options.with_cuda: - raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") - if self.options.with_orc: - raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") - if self.options.with_s3 and not self.options["aws-sdk-cpp"].config: - raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") - - if self.options.shared and self._with_jemalloc(): - if self.options["jemalloc"].enable_cxx: - raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") - - if Version(self.version) < "6.0.0" and self.options.get_safe("simd_level") == "default": - raise ConanInvalidConfiguration(f"In {self.ref}, simd_level options is not supported `default` value.") - def layout(self): cmake_layout(self, src_folder="src") - def _compute(self, required=False): - if required or self.options.compute == "auto": - return bool(self._parquet()) or bool(self._acero()) - else: - return bool(self.options.compute) - - def _acero(self, required=False): - if required or self.options.acero == "auto": - return bool(self._dataset_modules()) - else: - return bool(self.options.acero) - - def _parquet(self, required=False): - if required or self.options.parquet == "auto": - return bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.parquet) - - def _plasma(self, required=False): - if Version(self.version) 
>= "12.0.0": - return False - else: - return required or self.options.plasma - - def _dataset_modules(self, required=False): - if required or self.options.dataset_modules == "auto": - return bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.dataset_modules) - - def _with_jemalloc(self, required=False): - if required or self.options.with_jemalloc == "auto": - return bool("BSD" in str(self.settings.os)) - else: - return bool(self.options.with_jemalloc) - - def _with_re2(self, required=False): - if required or self.options.with_re2 == "auto": - if self.options.gandiva or self.options.parquet: - return True - if Version(self) >= "7.0.0" and (self._compute() or self._dataset_modules()): - return True - return False - else: - return bool(self.options.with_re2) - - def _with_protobuf(self, required=False): - if required or self.options.with_protobuf == "auto": - return bool(self.options.gandiva or self._with_flight_rpc() or self.options.with_orc or self.options.get_safe("substrait", False)) - else: - return bool(self.options.with_protobuf) - - def _with_flight_rpc(self, required=False): - if required or self.options.with_flight_rpc == "auto": - return bool(self.options.get_safe("with_flight_sql", False)) - else: - return bool(self.options.with_flight_rpc) - - def _with_gflags(self, required=False): - if required or self.options.with_gflags == "auto": - return bool(self._plasma() or self._with_glog() or self._with_grpc()) - else: - return bool(self.options.with_gflags) - - def _with_glog(self, required=False): - if required or self.options.with_glog == "auto": - return False - else: - return bool(self.options.with_glog) - - def _with_grpc(self, required=False): - if required or self.options.with_grpc == "auto": - return self._with_flight_rpc() - else: - return bool(self.options.with_grpc) - - def _with_boost(self, required=False): - if required or self.options.with_boost == "auto": - if self.options.gandiva: - return True - version = Version(self.version) - if version.major == "1": - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): - return True - elif version.major >= "2": - if is_msvc(self): - return True - return False - else: - return bool(self.options.with_boost) - - def _with_thrift(self, required=False): - # No self.options.with_thrift exists - return bool(required or self._parquet()) - - def _with_utf8proc(self, required=False): - if required or self.options.with_utf8proc == "auto": - return bool(self._compute() or self.options.gandiva) - else: - return bool(self.options.with_utf8proc) - - def _with_llvm(self, required=False): - if required or self.options.with_llvm == "auto": - return bool(self.options.gandiva) - else: - return bool(self.options.with_llvm) - - def _with_openssl(self, required=False): - if required or self.options.with_openssl == "auto": - return bool(self.options.encryption or self._with_flight_rpc() or self.options.with_s3) - else: - return bool(self.options.with_openssl) - - def _with_rapidjson(self): - if self.options.with_json: - return True - if Version(self.version) >= "7.0.0" and self.options.encryption: - return True - return False + def _requires_rapidjson(self): + return self.options.with_json or self.options.encryption def requirements(self): - if self._with_thrift(): - self.requires("zlib/1.2.13") + if self.options.with_thrift: self.requires("thrift/0.17.0") - if self._with_protobuf(): - self.requires("protobuf/3.21.4") - if self._with_jemalloc(): + if 
self.options.with_protobuf:
+            self.requires("protobuf/3.21.9")
+        if self.options.with_jemalloc:
             self.requires("jemalloc/5.3.0")
         if self.options.with_mimalloc:
             self.requires("mimalloc/1.7.6")
-        if self._with_boost():
-            self.requires("boost/1.80.0")
-        if self._with_gflags():
+        if self.options.with_boost:
+            self.requires("boost/1.84.0")
+        if self.options.with_gflags:
             self.requires("gflags/2.2.2")
-        if self._with_glog():
+        if self.options.with_glog:
             self.requires("glog/0.6.0")
         if self.options.get_safe("with_gcs"):
             self.requires("google-cloud-cpp/1.40.1")
-        if self._with_grpc():
+        if self.options.with_grpc:
             self.requires("grpc/1.50.0")
-        if self._with_rapidjson():
+        if self._requires_rapidjson():
             self.requires("rapidjson/1.1.0")
-        if self._with_llvm():
+        if self.options.with_llvm:
             self.requires("llvm-core/13.0.0")
-        if self._with_openssl():
+        if self.options.with_openssl:
             # aws-sdk-cpp requires openssl/1.1.1. it uses deprecated functions in openssl/3.0.0
             if self.options.with_s3:
-                self.requires("openssl/1.1.1s")
+                self.requires("openssl/1.1.1w")
             else:
-                self.requires("openssl/1.1.1s")
+                self.requires("openssl/[>=1.1 <4]")
         if self.options.get_safe("with_opentelemetry"):
             self.requires("opentelemetry-cpp/1.7.0")
         if self.options.with_s3:
             self.requires("aws-sdk-cpp/1.9.234")
         if self.options.with_brotli:
-            self.requires("brotli/1.0.9")
+            self.requires("brotli/1.1.0")
         if self.options.with_bz2:
             self.requires("bzip2/1.0.8")
         if self.options.with_lz4:
             self.requires("lz4/1.9.4")
         if self.options.with_snappy:
             self.requires("snappy/1.1.9")
-        if Version(self.version) >= "6.0.0" and \
-            self.options.get_safe("simd_level") != None or \
+        if self.options.get_safe("simd_level") != None or \
             self.options.get_safe("runtime_simd_level") != None:
             self.requires("xsimd/9.0.1")
         if self.options.with_zlib:
-            self.requires("zlib/1.2.13")
+            self.requires("zlib/[>=1.2.11 <2]")
         if self.options.with_zstd:
-            self.requires("zstd/1.5.2")
+            self.requires("zstd/1.5.5")
-        if self._with_re2():
-            self.requires("re2/20220601")
-        if self._with_utf8proc():
+        if self.options.with_re2:
+            self.requires("re2/20230301")
+        if self.options.with_utf8proc:
             self.requires("utf8proc/2.8.0")
         if self.options.with_backtrace:
             self.requires("libbacktrace/cci.20210118")
 
+    def validate(self):
+        # Do not allow options with 'auto' value
+        # TODO: Remove "auto" from the possible values for these options
+        auto_options = [option for option, value in self.options.items() if value == "auto"]
+        if auto_options:
+            raise ConanException("Options with value 'auto' are deprecated. Please set them to True/False or use their default values."
+                                 f" Please change the following options: {auto_options}")
+
+        # From https://github.com/conan-io/conan-center-index/pull/23163#issuecomment-2039808851
+        if self.options.gandiva:
+            if not self.options.with_re2:
+                raise ConanException("'with_re2' option should be True when 'gandiva=True'")
+            if not self.options.with_boost:
+                raise ConanException("'with_boost' option should be True when 'gandiva=True'")
+            if not self.options.with_utf8proc:
+                raise ConanException("'with_utf8proc' option should be True when 'gandiva=True'")
+
+        if self.settings.compiler.get_safe("cppstd"):
+            check_min_cppstd(self, self._min_cppstd)
+
+        minimum_version = self._compilers_minimum_version.get(str(self.settings.compiler), False)
+        if minimum_version and Version(self.settings.compiler.version) < minimum_version:
+            raise ConanInvalidConfiguration(
+                f"{self.ref} requires C++{self._min_cppstd}, which your compiler does not support."
+            )
+
+        if self.options.get_safe("skyhook", False):
+            raise ConanInvalidConfiguration("CCI has no librados recipe (yet)")
+        if self.options.with_cuda:
+            raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)")
+        if self.options.with_orc:
+            raise ConanInvalidConfiguration("CCI has no orc recipe (yet)")
+        if self.options.with_s3 and not self.dependencies["aws-sdk-cpp"].options.config:
+            raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config to be True.")
+
+        if self.options.shared and self.options.with_jemalloc:
+            if self.dependencies["jemalloc"].options.enable_cxx:
+                raise ConanInvalidConfiguration("jemalloc.enable_cxx of a static jemalloc must be disabled")
+
+
+    def build_requirements(self):
+        if Version(self.version) >= "13.0.0":
+            self.tool_requires("cmake/[>=3.16 <4]")
+
     def source(self):
         # START
         # This block should be removed when we update upstream:
@@ -435,17 +302,15 @@ def source(self):
             return
         # END
         get(self, **self.conan_data["sources"][self.version],
-            filename=f"apache-arrow-{self.version}.tar.gz", destination=self.source_folder, strip_root=True)
+            filename=f"apache-arrow-{self.version}.tar.gz", strip_root=True)
 
     def generate(self):
-        # BUILD_SHARED_LIBS and POSITION_INDEPENDENT_CODE are automatically parsed when self.options.shared or self.options.fPIC exist
         tc = CMakeToolchain(self)
         if cross_building(self):
             cmake_system_processor = {
                 "armv8": "aarch64",
                 "armv8.3": "aarch64",
             }.get(str(self.settings.arch), str(self.settings.arch))
-            tc.variables["CMAKE_SYSTEM_PROCESSOR"] = cmake_system_processor
             if cmake_system_processor == "aarch64":
                 tc.variables["ARROW_CPU_FLAG"] = "armv8"
         if is_msvc(self):
@@ -453,12 +318,10 @@ def generate(self):
         tc.variables["ARROW_DEPENDENCY_SOURCE"] = "SYSTEM"
         tc.variables["ARROW_PACKAGE_KIND"] = "conan"  # See https://github.com/conan-io/conan-center-index/pull/14903/files#r1057938314 for details
         tc.variables["ARROW_GANDIVA"] = bool(self.options.gandiva)
-        tc.variables["ARROW_PARQUET"] = self._parquet()
+        tc.variables["ARROW_PARQUET"] = self.options.parquet
         tc.variables["ARROW_SUBSTRAIT"] = bool(self.options.get_safe("substrait", False))
-        if Version(self.version) < "12.0.0":
-            tc.variables["ARROW_PLASMA"] = bool(self._plasma())
-        tc.variables["ARROW_ACERO"] = self._acero()
-        tc.variables["ARROW_DATASET"] = self._dataset_modules()
+        tc.variables["ARROW_ACERO"] = bool(self.options.acero)
+        tc.variables["ARROW_DATASET"] = self.options.dataset_modules
         tc.variables["ARROW_FILESYSTEM"] = bool(self.options.filesystem_layer)
         tc.variables["PARQUET_REQUIRE_ENCRYPTION"] = bool(self.options.encryption)
         tc.variables["ARROW_HDFS"] = bool(self.options.hdfs_bridgs)
@@ -466,12 +329,12 @@ def generate(self):
         tc.variables["ARROW_BUILD_SHARED"] = bool(self.options.shared)
         tc.variables["ARROW_BUILD_STATIC"] = not bool(self.options.shared)
         tc.variables["ARROW_NO_DEPRECATED_API"] = not bool(self.options.deprecated)
-        tc.variables["ARROW_FLIGHT"] = self._with_flight_rpc()
+        tc.variables["ARROW_FLIGHT"] = self.options.with_flight_rpc
         tc.variables["ARROW_FLIGHT_SQL"] = bool(self.options.get_safe("with_flight_sql", False))
-        tc.variables["ARROW_COMPUTE"] = self._compute()
+        tc.variables["ARROW_COMPUTE"] = bool(self.options.compute)
         tc.variables["ARROW_CSV"] = bool(self.options.with_csv)
         tc.variables["ARROW_CUDA"] = bool(self.options.with_cuda)
-        tc.variables["ARROW_JEMALLOC"] = self._with_jemalloc()
+        tc.variables["ARROW_JEMALLOC"] = self.options.with_jemalloc
         tc.variables["jemalloc_SOURCE"] = "SYSTEM"
         tc.variables["ARROW_MIMALLOC"] = 
bool(self.options.with_mimalloc) tc.variables["ARROW_JSON"] = bool(self.options.with_json) @@ -479,61 +342,58 @@ def generate(self): tc.variables["ARROW_GCS"] = bool(self.options.get_safe("with_gcs", False)) tc.variables["BOOST_SOURCE"] = "SYSTEM" tc.variables["Protobuf_SOURCE"] = "SYSTEM" - if self._with_protobuf(): - tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.options["protobuf"].shared) + if self.options.with_protobuf: + tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.dependencies["protobuf"].options.shared) tc.variables["gRPC_SOURCE"] = "SYSTEM" - if self._with_grpc(): - tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.options["grpc"].shared) + if self.options.with_grpc: + tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.dependencies["grpc"].options.shared) - tc.variables["ARROW_USE_GLOG"] = self._with_glog() + tc.variables["ARROW_USE_GLOG"] = self.options.with_glog tc.variables["GLOG_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: - tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.options["brotli"].shared) + tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.dependencies["brotli"].options.shared) tc.variables["gflags_SOURCE"] = "SYSTEM" - if self._with_gflags(): - tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.options["gflags"].shared) + if self.options.with_gflags: + tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.dependencies["gflags"].options.shared) tc.variables["ARROW_WITH_BZ2"] = bool(self.options.with_bz2) tc.variables["BZip2_SOURCE"] = "SYSTEM" if self.options.with_bz2: - tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.options["bzip2"].shared) + tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.dependencies["bzip2"].options.shared) tc.variables["ARROW_WITH_LZ4"] = bool(self.options.with_lz4) tc.variables["lz4_SOURCE"] = "SYSTEM" if self.options.with_lz4: - tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.options["lz4"].shared) + tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.dependencies["lz4"].options.shared) tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy) tc.variables["RapidJSON_SOURCE"] = "SYSTEM" tc.variables["Snappy_SOURCE"] = "SYSTEM" if self.options.with_snappy: - tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.options["snappy"].shared) + tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.dependencies["snappy"].options.shared) tc.variables["ARROW_WITH_ZLIB"] = bool(self.options.with_zlib) tc.variables["re2_SOURCE"] = "SYSTEM" tc.variables["ZLIB_SOURCE"] = "SYSTEM" tc.variables["xsimd_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_ZSTD"] = bool(self.options.with_zstd) - if Version(self.version) >= "2.0": - tc.variables["zstd_SOURCE"] = "SYSTEM" - tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() - tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() - else: - tc.variables["ZSTD_SOURCE"] = "SYSTEM" + tc.variables["zstd_SOURCE"] = "SYSTEM" + tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() + tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() if self.options.with_zstd: - tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.options["zstd"].shared) + tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_THRIFT"] = self._with_thrift() + 
tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) tc.variables["Thrift_SOURCE"] = "SYSTEM" - if self._with_thrift(): - tc.variables["THRIFT_VERSION"] = bool(self.deps_cpp_info["thrift"].version) # a recent thrift does not require boost - tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.options["thrift"].shared) - tc.variables["ARROW_USE_OPENSSL"] = self._with_openssl() - if self._with_openssl(): - tc.variables["OPENSSL_ROOT_DIR"] = self.deps_cpp_info["openssl"].rootpath.replace("\\", "/") - tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.options["openssl"].shared) - if self._with_boost(): + if self.options.with_thrift: + tc.variables["THRIFT_VERSION"] = bool(self.dependencies["thrift"].ref.version) # a recent thrift does not require boost + tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.dependencies["thrift"].options.shared) + tc.variables["ARROW_USE_OPENSSL"] = self.options.with_openssl + if self.options.with_openssl: + tc.variables["OPENSSL_ROOT_DIR"] = self.dependencies["openssl"].package_folder.replace("\\", "/") + tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.dependencies["openssl"].options.shared) + if self.options.with_boost: tc.variables["ARROW_USE_BOOST"] = True - tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.options["boost"].shared) + tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.dependencies["boost"].options.shared) tc.variables["ARROW_S3"] = bool(self.options.with_s3) tc.variables["AWSSDK_SOURCE"] = "SYSTEM" tc.variables["ARROW_BUILD_UTILITIES"] = bool(self.options.cli) @@ -544,16 +404,18 @@ def generate(self): tc.variables["ARROW_ENABLE_TIMING_TESTS"] = False tc.variables["ARROW_BUILD_BENCHMARKS"] = False tc.variables["LLVM_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_UTF8PROC"] = self._with_utf8proc() - tc.variables["ARROW_BOOST_REQUIRED"] = self._with_boost() + tc.variables["ARROW_WITH_UTF8PROC"] = self.options.with_utf8proc + tc.variables["ARROW_BOOST_REQUIRED"] = self.options.with_boost tc.variables["utf8proc_SOURCE"] = "SYSTEM" - if self._with_utf8proc(): - tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.options["utf8proc"].shared) + if self.options.with_utf8proc: + tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.dependencies["utf8proc"].options.shared) tc.variables["BUILD_WARNING_LEVEL"] = "PRODUCTION" if is_msvc(self): - tc.variables["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) - if self._with_llvm(): - tc.variables["LLVM_DIR"] = self.deps_cpp_info["llvm-core"].rootpath.replace("\\", "/") + tc.variables["ARROW_USE_STATIC_CRT"] = is_msvc_static_runtime(self) + if self.options.with_llvm: + tc.variables["LLVM_DIR"] = self.dependencies["llvm-core"].package_folder.replace("\\", "/") + + tc.cache_variables["CMAKE_PROJECT_arrow_INCLUDE"] = os.path.join(self.source_folder, "conan_cmake_project_include.cmake") tc.generate() deps = CMakeDeps(self) @@ -561,10 +423,11 @@ def generate(self): def _patch_sources(self): apply_conandata_patches(self) - if "7.0.0" <= Version(self.version) < "10.0.0": + if Version(self.version) < "10.0.0": for filename in glob.glob(os.path.join(self.source_folder, "cpp", "cmake_modules", "Find*.cmake")): if os.path.basename(filename) not in [ "FindArrow.cmake", + "FindArrowAcero.cmake", "FindArrowCUDA.cmake", "FindArrowDataset.cmake", "FindArrowFlight.cmake", @@ -576,7 +439,6 @@ def _patch_sources(self): "FindArrowTesting.cmake", "FindGandiva.cmake", "FindParquet.cmake", - "FindPlasma.cmake", ]: os.remove(filename) @@ -596,129 +458,106 @@ def package(self): rmdir(self, 
os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) - def _lib_name(self, name): - if is_msvc(self) and not self.options.shared: - return "{}_static".format(name) - else: - return "{}".format(name) - - def package_id(self): - self.info.options.with_gflags = self._with_gflags() - self.info.options.with_protobuf = self._with_protobuf() - self.info.options.with_re2 = self._with_re2() - self.info.options.with_jemalloc = self._with_jemalloc() - self.info.options.with_openssl = self._with_openssl() - self.info.options.with_boost = self._with_boost() - self.info.options.with_glog = self._with_glog() - self.info.options.with_grpc = self._with_grpc() - def package_info(self): - self.cpp_info.filenames["cmake_find_package"] = "Arrow" - self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" - self.cpp_info.components["libarrow"].libs = [self._lib_name("arrow")] - self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" - self.cpp_info.components["libarrow"].names["pkg_config"] = "arrow" + # FIXME: fix CMake targets of components + + self.cpp_info.set_property("cmake_file_name", "Arrow") + + suffix = "_static" if is_msvc(self) and not self.options.shared else "" + + self.cpp_info.components["libarrow"].set_property("pkg_config_name", "arrow") + self.cpp_info.components["libarrow"].libs = [f"arrow{suffix}"] if not self.options.shared: self.cpp_info.components["libarrow"].defines = ["ARROW_STATIC"] if self.settings.os in ["Linux", "FreeBSD"]: self.cpp_info.components["libarrow"].system_libs = ["pthread", "m", "dl", "rt"] - if self._parquet(): - self.cpp_info.components["libparquet"].libs = [self._lib_name("parquet")] - self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" - self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" - self.cpp_info.components["libparquet"].names["pkg_config"] = "parquet" + if self.options.parquet: + self.cpp_info.components["libparquet"].set_property("pkg_config_name", "parquet") + self.cpp_info.components["libparquet"].libs = [f"parquet{suffix}"] self.cpp_info.components["libparquet"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libparquet"].defines = ["PARQUET_STATIC"] - if self.options.get_safe("substrait", False): - self.cpp_info.components["libarrow_substrait"].libs = [self._lib_name("arrow_substrait")] - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["pkg_config"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset", "acero"] + if self.options.get_safe("substrait"): + self.cpp_info.components["libarrow_substrait"].set_property("pkg_config_name", "arrow_substrait") + self.cpp_info.components["libarrow_substrait"].libs = [f"arrow_substrait{suffix}"] + self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset"] + + # Plasma was deprecated in Arrow 12.0.0 + del self.options.plasma - if self._plasma(): - self.cpp_info.components["libplasma"].libs = [self._lib_name("plasma")] - self.cpp_info.components["libplasma"].names["cmake_find_package"] = "plasma" - self.cpp_info.components["libplasma"].names["cmake_find_package_multi"] = "plasma" - 
self.cpp_info.components["libplasma"].names["pkg_config"] = "plasma" - self.cpp_info.components["libplasma"].requires = ["libarrow"] + if self.options.acero: + self.cpp_info.components["libacero"].libs = [f"arrow_acero{suffix}"] + self.cpp_info.components["libacero"].names["cmake_find_package"] = "acero" + self.cpp_info.components["libacero"].names["cmake_find_package_multi"] = "acero" + self.cpp_info.components["libacero"].names["pkg_config"] = "acero" + self.cpp_info.components["libacero"].requires = ["libarrow"] if self.options.gandiva: - self.cpp_info.components["libgandiva"].libs = [self._lib_name("gandiva")] - self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" - self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" - self.cpp_info.components["libgandiva"].names["pkg_config"] = "gandiva" + self.cpp_info.components["libgandiva"].set_property("pkg_config_name", "gandiva") + self.cpp_info.components["libgandiva"].libs = [f"gandiva{suffix}"] self.cpp_info.components["libgandiva"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libgandiva"].defines = ["GANDIVA_STATIC"] - if self._with_flight_rpc(): - self.cpp_info.components["libarrow_flight"].libs = [self._lib_name("arrow_flight")] - self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["pkg_config"] = "flight_rpc" + if self.options.with_flight_rpc: + self.cpp_info.components["libarrow_flight"].set_property("pkg_config_name", "flight_rpc") + self.cpp_info.components["libarrow_flight"].libs = [f"arrow_flight{suffix}"] self.cpp_info.components["libarrow_flight"].requires = ["libarrow"] if self.options.get_safe("with_flight_sql"): - self.cpp_info.components["libarrow_flight_sql"].libs = [self._lib_name("arrow_flight_sql")] - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["pkg_config"] = "flight_sql" + self.cpp_info.components["libarrow_flight_sql"].set_property("pkg_config_name", "flight_sql") + self.cpp_info.components["libarrow_flight_sql"].libs = [f"arrow_flight_sql{suffix}"] self.cpp_info.components["libarrow_flight_sql"].requires = ["libarrow", "libarrow_flight"] - if self._acero(): - self.cpp_info.components["acero"].libs = ["arrow_acero"] - - if self._dataset_modules(): + if self.options.dataset_modules: self.cpp_info.components["dataset"].libs = ["arrow_dataset"] + if self.options.parquet: + self.cpp_info.components["dataset"].requires = ["libparquet"] - if (self.options.cli and (self.options.with_cuda or self._with_flight_rpc() or self._parquet())) or self._plasma(): + if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): binpath = os.path.join(self.package_folder, "bin") self.output.info(f"Appending PATH env var: {binpath}") self.env_info.PATH.append(binpath) - if self._with_boost(): + if self.options.with_boost: if self.options.gandiva: # FIXME: only filesystem component is used self.cpp_info.components["libgandiva"].requires.append("boost::boost") - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): + if self.options.parquet and self.settings.compiler == "gcc" and 
self.settings.compiler.version < Version("4.9"):
                 self.cpp_info.components["libparquet"].requires.append("boost::boost")
-            if Version(self.version) >= "2.0":
-                # FIXME: only headers components is used
-                self.cpp_info.components["libarrow"].requires.append("boost::boost")
-        if self._with_openssl():
+            # FIXME: only the headers component is used
+            self.cpp_info.components["libarrow"].requires.append("boost::boost")
+        if self.options.with_openssl:
             self.cpp_info.components["libarrow"].requires.append("openssl::openssl")
-        if self._with_gflags():
+        if self.options.with_gflags:
             self.cpp_info.components["libarrow"].requires.append("gflags::gflags")
-        if self._with_glog():
+        if self.options.with_glog:
             self.cpp_info.components["libarrow"].requires.append("glog::glog")
-        if self._with_jemalloc():
+        if self.options.with_jemalloc:
             self.cpp_info.components["libarrow"].requires.append("jemalloc::jemalloc")
         if self.options.with_mimalloc:
             self.cpp_info.components["libarrow"].requires.append("mimalloc::mimalloc")
-        if self._with_re2():
+        if self.options.with_re2:
             if self.options.gandiva:
                 self.cpp_info.components["libgandiva"].requires.append("re2::re2")
-            if self._parquet():
+            if self.options.parquet:
                 self.cpp_info.components["libparquet"].requires.append("re2::re2")
             self.cpp_info.components["libarrow"].requires.append("re2::re2")
-        if self._with_llvm():
+        if self.options.with_llvm:
             self.cpp_info.components["libgandiva"].requires.append("llvm-core::llvm-core")
-        if self._with_protobuf():
+        if self.options.with_protobuf:
             self.cpp_info.components["libarrow"].requires.append("protobuf::protobuf")
-        if self._with_utf8proc():
+        if self.options.with_utf8proc:
             self.cpp_info.components["libarrow"].requires.append("utf8proc::utf8proc")
-        if self._with_thrift():
+        if self.options.with_thrift:
             self.cpp_info.components["libarrow"].requires.append("thrift::thrift")
         if self.options.with_backtrace:
             self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace")
         if self.options.with_cuda:
             self.cpp_info.components["libarrow"].requires.append("cuda::cuda")
-        if self._with_rapidjson():
+        if self._requires_rapidjson():
             self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson")
         if self.options.with_s3:
             self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3")
@@ -742,9 +581,32 @@ def package_info(self):
             self.cpp_info.components["libarrow"].requires.append("zlib::zlib")
         if self.options.with_zstd:
             self.cpp_info.components["libarrow"].requires.append("zstd::zstd")
-        if self._with_boost():
+        if self.options.with_boost:
             self.cpp_info.components["libarrow"].requires.append("boost::boost")
-        if self._with_grpc():
+        if self.options.with_grpc:
             self.cpp_info.components["libarrow"].requires.append("grpc::grpc")
-        if self._with_flight_rpc():
+        if self.options.with_flight_rpc:
             self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf")
+
+        # TODO: to remove in conan v2
+        self.cpp_info.filenames["cmake_find_package"] = "Arrow"
+        self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow"
+        self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow"
+        self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow"
+        if self.options.parquet:
+            self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet"
+            self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet"
+        if self.options.get_safe("substrait"):
+            self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait"
+            self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait"
+        if self.options.gandiva:
+            self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva"
+            self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva"
+        if self.options.with_flight_rpc:
+            self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc"
+            self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc"
+        if self.options.get_safe("with_flight_sql"):
+            self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql"
+            self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql"
+        if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet):
+            self.env_info.PATH.append(os.path.join(self.package_folder, "bin"))
diff --git a/ci/conan/config.yml b/ci/conan/config.yml
index be333447f348c..3fa90be6f669a 100644
--- a/ci/conan/config.yml
+++ b/ci/conan/config.yml
@@ -21,6 +21,22 @@
 # SOFTWARE.
 
 versions:
+  "15.0.0":
+    folder: all
+  "14.0.2":
+    folder: all
+  "14.0.1":
+    folder: all
+  "14.0.0":
+    folder: all
+  "13.0.0":
+    folder: all
+  "12.0.1":
+    folder: all
+  "12.0.0":
+    folder: all
+  "11.0.0":
+    folder: all
   "10.0.1":
     folder: all
   "10.0.0":
@@ -31,7 +47,3 @@ versions:
     folder: all
   "7.0.0":
     folder: all
-  "2.0.0":
-    folder: all
-  "1.0.0":
-    folder: all
diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile
index c59766c4a665c..d93732abb0032 100644
--- a/ci/docker/linux-apt-r.dockerfile
+++ b/ci/docker/linux-apt-r.dockerfile
@@ -27,6 +27,11 @@ ENV R_PRUNE_DEPS=${r_prune_deps}
 ARG r_duckdb_dev=FALSE
 ENV R_DUCKDB_DEV=${r_duckdb_dev}
 
+# This is needed to avoid errors with utf8 characters in some
+# R packages' DESCRIPTION files
+# https://github.com/statnmap/HatchedPolygons/issues/4
+ENV LANG=C.UTF-8
+
 # Build R
 # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04
 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran
diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile
index b1d9ed5ab88d9..63fd7b1d46820 100644
--- a/ci/docker/python-wheel-manylinux.dockerfile
+++ b/ci/docker/python-wheel-manylinux.dockerfile
@@ -99,5 +99,4 @@ SHELL ["/bin/bash", "-i", "-c"]
 ENTRYPOINT ["/bin/bash", "-i", "-c"]
 
 COPY python/requirements-wheel-build.txt /arrow/python/
-# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
-RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
+RUN pip install -r /arrow/python/requirements-wheel-build.txt
diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile
index 0ab5071abb86c..ff42de939d91f 100644
--- a/ci/docker/python-wheel-windows-vs2019.dockerfile
+++ b/ci/docker/python-wheel-windows-vs2019.dockerfile
@@ -89,8 +89,7 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION%
 RUN python -m pip install -U pip setuptools
 
 COPY python/requirements-wheel-build.txt arrow/python/
-# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
-RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url 
"https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 629d532a3dc76..4a37818f94396 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -29,7 +29,7 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. ARG clang_tools ARG llvm -RUN latest_system_llvm=14 && \ +RUN latest_system_llvm=18 && \ if [ ${llvm} -gt ${latest_system_llvm} -o \ ${clang_tools} -gt ${latest_system_llvm} ]; then \ apt-get update -y -q && \ @@ -127,7 +127,7 @@ RUN if [ "${gcc_version}" = "" ]; then \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "12" ]; then \ + if [ "${gcc_version}" -gt "14" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends software-properties-common && \ add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index b1ee0a8fc2afd..0ea3fc29192dd 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -30,34 +30,39 @@ export CONAN_HOOK_ERROR_LEVEL=40 conan_args=() conan_args+=(--build=missing) if [ -n "${ARROW_CONAN_PARQUET:-}" ]; then - conan_args+=(--options arrow:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_thrift=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_boost=${ARROW_CONAN_PARQUET}) fi if [ -n "${ARROW_CONAN_WITH_BROTLI:-}" ]; then - conan_args+=(--options arrow:with_brotli=${ARROW_CONAN_WITH_BROTLI}) + conan_args+=(--options arrow/*:with_brotli=${ARROW_CONAN_WITH_BROTLI}) fi if [ -n "${ARROW_CONAN_WITH_BZ2:-}" ]; then - conan_args+=(--options arrow:with_bz2=${ARROW_CONAN_WITH_BZ2}) + conan_args+=(--options arrow/*:with_bz2=${ARROW_CONAN_WITH_BZ2}) fi if [ -n "${ARROW_CONAN_WITH_FLIGHT_RPC:-}" ]; then - conan_args+=(--options arrow:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_grpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_protobuf=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_re2=${ARROW_CONAN_WITH_FLIGHT_RPC}) fi if [ -n "${ARROW_CONAN_WITH_GLOG:-}" ]; then - conan_args+=(--options arrow:with_glog=${ARROW_CONAN_WITH_GLOG}) + conan_args+=(--options arrow/*:with_glog=${ARROW_CONAN_WITH_GLOG}) fi if [ -n "${ARROW_CONAN_WITH_JEMALLOC:-}" ]; then - conan_args+=(--options arrow:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) + conan_args+=(--options arrow/*:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) fi if [ -n "${ARROW_CONAN_WITH_JSON:-}" ]; then - conan_args+=(--options arrow:with_json=${ARROW_CONAN_WITH_JSON}) + conan_args+=(--options arrow/*:with_json=${ARROW_CONAN_WITH_JSON}) fi if [ -n "${ARROW_CONAN_WITH_LZ4:-}" ]; then - conan_args+=(--options arrow:with_lz4=${ARROW_CONAN_WITH_LZ4}) + conan_args+=(--options arrow/*:with_lz4=${ARROW_CONAN_WITH_LZ4}) fi if [ -n "${ARROW_CONAN_WITH_SNAPPY:-}" ]; then - conan_args+=(--options arrow:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) + conan_args+=(--options arrow/*:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) fi if [ -n "${ARROW_CONAN_WITH_ZSTD:-}" ]; then - conan_args+=(--options arrow:with_zstd=${ARROW_CONAN_WITH_ZSTD}) + 
conan_args+=(--options arrow/*:with_zstd=${ARROW_CONAN_WITH_ZSTD})
 fi
 
 version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \
diff --git a/ci/scripts/install_cmake.sh b/ci/scripts/install_cmake.sh
index 2f5e5d52051ed..7fdb06d90f02c 100755
--- a/ci/scripts/install_cmake.sh
+++ b/ci/scripts/install_cmake.sh
@@ -21,7 +21,10 @@ set -e
 
 declare -A archs
 archs=([amd64]=x86_64
-       [arm64v8]=aarch64)
+       [aarch64]=aarch64
+       [arm64]=aarch64
+       [arm64v8]=aarch64
+       [x86_64]=x86_64)
 
 declare -A platforms
 platforms=([linux]=linux
@@ -38,5 +41,25 @@ platform=${platforms[$2]}
 version=$3
 prefix=$4
 
-url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-${arch}.tar.gz"
-wget -q ${url} -O - | tar -xzf - --directory ${prefix} --strip-components=1
+mkdir -p ${prefix}
+url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-"
+case ${platform} in
+  macos)
+    url+="universal.tar.gz"
+    curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1
+    ln -s CMake.app/Contents/bin ${prefix}/bin
+    ;;
+  windows)
+    url+="${arch}.zip"
+    archive_name=$(basename ${url})
+    curl -L -o ${archive_name} ${url}
+    unzip ${archive_name}
+    base_name=$(basename ${archive_name} .zip)
+    mv ${base_name}/* ${prefix}
+    rm -rf ${base_name} ${archive_name}
+    ;;
+  *)
+    url+="${arch}.tar.gz"
+    curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1
+    ;;
+esac
diff --git a/ci/scripts/install_sccache.sh b/ci/scripts/install_sccache.sh
index 0346c0cc9ce7d..136f39b3ae2ab 100755
--- a/ci/scripts/install_sccache.sh
+++ b/ci/scripts/install_sccache.sh
@@ -59,7 +59,7 @@ fi
 # Extract only the sccache binary into $PREFIX and ignore README and LICENSE.
 # --wildcards doesn't work on busybox.
 tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory $PREFIX --exclude="sccache*/*E*E*"
-chmod u+x $PREFIX/sccache
+chmod a+x $PREFIX/sccache
 
 if [ -n "${GITHUB_PATH}" ]; then
   echo "$PREFIX" >> $GITHUB_PATH
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index a94dac40e931f..3ed9d5d8dd12f 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -50,15 +50,12 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ==="
 export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
 export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}"
 
-# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release
 pip install \
     --upgrade \
     --only-binary=:all: \
     --target $PIP_SITE_PACKAGES \
     --platform $PIP_TARGET_PLATFORM \
-    -r ${source_dir}/python/requirements-wheel-build.txt \
-    --pre \
-    --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
+    -r ${source_dir}/python/requirements-wheel-build.txt
 pip install "delocate>=0.10.3"
 
 echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ==="
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 7d54ccccf7c19..ddea1c399cbba 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1190,6 +1190,12 @@ if(MSVC AND ARROW_USE_STATIC_CRT)
   set(Boost_USE_STATIC_RUNTIME ON)
 endif()
 set(Boost_ADDITIONAL_VERSIONS
+    "1.84.0"
+    "1.84"
+    "1.83.0"
+    "1.83"
+    "1.82.0"
+    "1.82"
     "1.81.0"
     "1.81"
     "1.80.0"
diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc
index 
993c0b9a705b4..ad1bd67cc8ec7 100644
--- a/cpp/src/arrow/acero/hash_join_benchmark.cc
+++ b/cpp/src/arrow/acero/hash_join_benchmark.cc
@@ -83,8 +83,8 @@ class JoinBenchmark {
     build_metadata["null_probability"] = std::to_string(settings.null_percentage);
     build_metadata["min"] = std::to_string(min_build_value);
     build_metadata["max"] = std::to_string(max_build_value);
-    build_metadata["min_length"] = settings.var_length_min;
-    build_metadata["max_length"] = settings.var_length_max;
+    build_metadata["min_length"] = std::to_string(settings.var_length_min);
+    build_metadata["max_length"] = std::to_string(settings.var_length_max);
 
     std::unordered_map<std::string, std::string> probe_metadata;
     probe_metadata["null_probability"] = std::to_string(settings.null_percentage);
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc
index 63969d9a3ed4b..9c3dbc176ff4f 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -2036,6 +2036,29 @@ TEST(HashJoin, ResidualFilter) {
                 [3, 4, "alpha", 4, 16, "alpha"]])")});
 }
 
+TEST(HashJoin, FilterEmptyRows) {
+  // Regression test for GH-41121.
+  BatchesWithSchema input_left;
+  input_left.batches = {
+      ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")};
+  input_left.schema =
+      schema({field("id", int32()), field("name", utf8()), field("age", int32())});
+
+  BatchesWithSchema input_right;
+  input_right.batches = {ExecBatchFromJSON(
+      {int32(), int32(), utf8()},
+      R"([[2, 10, "Jack"], [3, 12, "Mark"], [4, 15, "Tom"], [1, 10, "Jack"]])")};
+  input_right.schema =
+      schema({field("id", int32()), field("stu_id", int32()), field("subject", utf8())});
+
+  const ResidualFilterCaseRunner runner{std::move(input_left), std::move(input_right)};
+
+  Expression filter = greater(field_ref("age"), literal(25));
+
+  runner.Run(JoinType::LEFT_ANTI, {"id"}, {"stu_id"}, std::move(filter),
+             {ExecBatchFromJSON({int32(), utf8(), int32()}, R"([[2, "Jarry", 28]])")});
+}
+
 TEST(HashJoin, TrivialResidualFilter) {
   Expression always_true =
       equal(call("add", {field_ref("l1"), field_ref("r1")}), literal(2));  // 1 + 1 == 2
diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc
index 61c8bfe95414e..542e943c4a82b 100644
--- a/cpp/src/arrow/acero/swiss_join.cc
+++ b/cpp/src/arrow/acero/swiss_join.cc
@@ -2167,6 +2167,11 @@ Status JoinResidualFilter::FilterOneBatch(const ExecBatch& keypayload_batch,
   ARROW_DCHECK(!output_payload_ids || payload_ids_maybe_null);
 
   *num_passing_rows = 0;
+
+  if (num_batch_rows == 0) {
+    return Status::OK();
+  }
+
   ARROW_ASSIGN_OR_RAISE(Datum mask,
                         EvalFilter(keypayload_batch, num_batch_rows, batch_row_ids,
                                    key_ids_maybe_null, payload_ids_maybe_null));
diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc
index b08fa99168616..18afcc90d71f8 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -735,7 +735,7 @@ class TestListArray : public ::testing::Test {
                          ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]"));
     auto sliced_list_array =
         std::dynamic_pointer_cast<ArrayType>(list_array->Slice(3, 4));
-    ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten());
+    ASSERT_OK_AND_ASSIGN(auto flattened, sliced_list_array->Flatten());
     ASSERT_OK(flattened->ValidateFull());
     // Note the difference between values() and Flatten().
EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[5, 6]"))); @@ -763,6 +763,52 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } + void TestFlattenRecursively() { + auto inner_type = std::make_shared(int32()); + auto type = std::make_shared(inner_type); + + // List types with two nested level: list> + auto nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])")); + ASSERT_OK_AND_ASSIGN(auto flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_TRUE( + flattened->Equals(ArrayFromJSON(int32(), "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"))); + + // Empty nested list should flatten until non-list type is reached + nested_list_array = + std::dynamic_pointer_cast(ArrayFromJSON(type, R"([null])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_TRUE(flattened->type()->Equals(int32())); + + // List types with three nested level: list>> + type = std::make_shared(std::make_shared(fixed_size_list(int32(), 2))); + nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_EQ(3, flattened->null_count()); + ASSERT_TRUE(flattened->Equals( + ArrayFromJSON(int32(), "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"))); + } + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, std::vector sizes, std::shared_ptr values, int64_t offset = 0) { @@ -925,10 +971,12 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } +TYPED_TEST(TestListArray, FlattenSliced) { this->TestFlattenSliced(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } +TYPED_TEST(TestListArray, FlattenRecursively) { this->TestFlattenRecursively(); } TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } @@ -1714,4 +1762,23 @@ TEST_F(TestFixedSizeListArray, Flatten) { } } +TEST_F(TestFixedSizeListArray, FlattenRecursively) { + // Nested fixed-size list-array: fixed_size_list(fixed_size_list(int32, 2), 2) + auto inner_type = fixed_size_list(value_type_, 2); + type_ = fixed_size_list(inner_type, 2); + + auto values = std::dynamic_pointer_cast(ArrayFromJSON(type_, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])")); + ASSERT_OK(values->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, values->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(8, flattened->length()); + ASSERT_EQ(2, flattened->null_count()); + AssertArraysEqual(*flattened, + *ArrayFromJSON(value_type_, "[0, 1, null, 3, 7, null, 2, 5]")); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 958c2e25380b0..24e0dfb7081ac 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -42,6 +42,7 @@ #include 
"arrow/util/checked_cast.h" #include "arrow/util/list_util.h" #include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -469,6 +470,49 @@ inline void SetListData(VarLengthListLikeArray* self, self->values_ = MakeArray(self->data_->child_data[0]); } +Result> FlattenLogicalListRecursively(const Array& in_array, + MemoryPool* memory_pool) { + std::shared_ptr array = in_array.Slice(0, in_array.length()); + for (auto kind = array->type_id(); is_list(kind) || is_list_view(kind); + kind = array->type_id()) { + switch (kind) { + case Type::LIST: { + ARROW_ASSIGN_OR_RAISE( + array, (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LIST_VIEW: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST_VIEW: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::FIXED_SIZE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + default: + Unreachable("unexpected non-list type"); + break; + } + } + return array; +} + } // namespace internal // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 768a630e0af54..5744f5fcadf05 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -58,6 +58,20 @@ void SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); +/// \brief A version of Flatten that keeps recursively flattening until an array of +/// non-list values is reached. +/// +/// Array types considered to be lists by this function: +/// - list +/// - large_list +/// - list_view +/// - large_list_view +/// - fixed_size_list +/// +/// \see ListArray::Flatten +ARROW_EXPORT Result> FlattenLogicalListRecursively( + const Array& in_array, MemoryPool* memory_pool); + } // namespace internal /// Base class for variable-sized list and list-view arrays, regardless of offset size. @@ -103,6 +117,15 @@ class VarLengthListLikeArray : public Array { return values_->Slice(value_offset(i), value_length(i)); } + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. + /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + protected: friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, @@ -595,6 +618,15 @@ class ARROW_EXPORT FixedSizeListArray : public Array { Result> Flatten( MemoryPool* memory_pool = default_memory_pool()) const; + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. 
+ /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + /// \brief Construct FixedSizeListArray from child value array and value_length /// /// \param[in] values Array containing list values diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index 7c4a14d93400f..da3810aa392c9 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -56,7 +56,7 @@ int64_t BooleanArray::false_count() const { } int64_t BooleanArray::true_count() const { - if (data_->null_count.load() != 0) { + if (data_->MayHaveNulls()) { DCHECK(data_->buffers[0]); return internal::CountAndSetBits(data_->buffers[0]->data(), data_->offset, data_->buffers[1]->data(), data_->offset, diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 21ac1a09f56e7..60efdb47683f4 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -1307,6 +1307,13 @@ TEST(TestBooleanArray, TrueCountFalseCount) { CheckArray(checked_cast(*arr)); CheckArray(checked_cast(*arr->Slice(5))); CheckArray(checked_cast(*arr->Slice(0, 0))); + + // GH-41016 true_count() with array without validity buffer with null_count of -1 + auto arr_unknown_null_count = ArrayFromJSON(boolean(), "[true, false, true]"); + arr_unknown_null_count->data()->null_count = kUnknownNullCount; + ASSERT_EQ(arr_unknown_null_count->data()->null_count.load(), -1); + ASSERT_EQ(arr_unknown_null_count->null_bitmap(), nullptr); + ASSERT_EQ(checked_pointer_cast(arr_unknown_null_count)->true_count(), 2); } TEST(TestPrimitiveAdHoc, TestType) { diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index 0b198759de1e6..5b9c51cda5576 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -128,10 +128,20 @@ class ColumnPopulator { // Populators are intented to be applied to reasonably small data. In most cases // threading overhead would not be justified. ctx.set_use_threads(false); - ASSIGN_OR_RAISE( - std::shared_ptr casted, - compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx)); - casted_array_ = checked_pointer_cast(casted); + if (data.type() && is_large_binary_like(data.type()->id())) { + ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); + } else { + auto casted = compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx); + if (casted.ok()) { + array_ = std::move(casted).ValueOrDie(); + } else if (casted.status().IsCapacityError()) { + ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); + } else { + return casted.status(); + } + } return UpdateRowLengths(row_lengths); } @@ -146,7 +156,8 @@ class ColumnPopulator { protected: virtual Status UpdateRowLengths(int64_t* row_lengths) = 0; - std::shared_ptr casted_array_; + // It must be a `StringArray` or `LargeStringArray`. 
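Returning to the FlattenRecursively() API introduced in array_nested.h above: unlike a single Flatten() call, it keeps collapsing levels until the result is no longer list-like, and nulls at any nesting depth simply contribute no values. A hedged usage sketch (the nested-builder setup is illustrative):

```cpp
#include <arrow/api.h>

#include <iostream>
#include <memory>

arrow::Status Demo() {
  // Build the nested array [[[0, 1], null], [[2]]] of type list<list<int32>>.
  auto item_builder = std::make_shared<arrow::Int32Builder>();
  auto inner_builder = std::make_shared<arrow::ListBuilder>(
      arrow::default_memory_pool(), item_builder);
  arrow::ListBuilder outer_builder(arrow::default_memory_pool(), inner_builder);

  ARROW_RETURN_NOT_OK(outer_builder.Append());
  ARROW_RETURN_NOT_OK(inner_builder->Append());
  ARROW_RETURN_NOT_OK(item_builder->AppendValues({0, 1}));
  ARROW_RETURN_NOT_OK(inner_builder->AppendNull());
  ARROW_RETURN_NOT_OK(outer_builder.Append());
  ARROW_RETURN_NOT_OK(inner_builder->Append());
  ARROW_RETURN_NOT_OK(item_builder->Append(2));

  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(outer_builder.Finish(&array));
  auto nested = std::static_pointer_cast<arrow::ListArray>(array);

  // A single Flatten() would still leave a list<int32>; FlattenRecursively()
  // goes all the way down to the int32 values.
  ARROW_ASSIGN_OR_RAISE(auto flat, nested->FlattenRecursively());
  std::cout << flat->ToString() << std::endl;  // [0, 1, 2]
  return arrow::Status::OK();
}

int main() { return Demo().ok() ? 0 : 1; }
```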
+ std::shared_ptr array_; const std::string end_chars_; std::shared_ptr null_string_; @@ -181,15 +192,28 @@ class UnquotedColumnPopulator : public ColumnPopulator { reject_values_with_quotes_(reject_values_with_quotes) {} Status UpdateRowLengths(int64_t* row_lengths) override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return UpdateRowLengths(row_lengths); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return UpdateRowLengths(row_lengths); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status UpdateRowLengths(int64_t* row_lengths) { + auto casted_array = checked_pointer_cast(array_); if (reject_values_with_quotes_) { // When working on values that, after casting, could produce quotes, // we need to return an error in accord with RFC4180. - RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars(*casted_array_, delimiter_)); + RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars(*casted_array, + delimiter_)); } int64_t row_number = 0; - VisitArraySpanInline( - *casted_array_->data(), + VisitArraySpanInline( + *casted_array->data(), [&](std::string_view s) { row_lengths[row_number] += static_cast(s.length()); row_number++; @@ -202,6 +226,17 @@ class UnquotedColumnPopulator : public ColumnPopulator { } Status PopulateRows(char* output, int64_t* offsets) const override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return PopulateRows(output, offsets); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return PopulateRows(output, offsets); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status PopulateRows(char* output, int64_t* offsets) const { // Function applied to valid values cast to string. auto valid_function = [&](std::string_view s) { memcpy(output + *offsets, s.data(), s.length()); @@ -222,13 +257,14 @@ class UnquotedColumnPopulator : public ColumnPopulator { return Status::OK(); }; - return VisitArraySpanInline(*casted_array_->data(), valid_function, - null_function); + return VisitArraySpanInline( + *array_->data(), valid_function, null_function); } private: // Returns an error status if string array has any structural characters. 
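The casting strategy the CSV writer adopts above condenses to a few lines: try utf8() first, and retry with large_utf8() only when the 32-bit offsets would overflow, which the Cast kernel reports as a CapacityError. A sketch of that fallback, assuming Arrow's compute API (the function name is illustrative):

```cpp
#include <arrow/api.h>
#include <arrow/compute/api.h>

// Try utf8() first; fall back to large_utf8() only on offset overflow.
arrow::Result<std::shared_ptr<arrow::Array>> CastForCsv(
    const arrow::Array& column) {
  auto casted = arrow::compute::Cast(column, arrow::utf8());
  if (casted.ok()) {
    return casted;
  }
  if (casted.status().IsCapacityError()) {
    // More than ~2 GiB of string data: 64-bit offsets are required.
    return arrow::compute::Cast(column, arrow::large_utf8());
  }
  return casted.status();
}
```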
- static Status CheckStringArrayHasNoStructuralChars(const StringArray& array, + template + static Status CheckStringArrayHasNoStructuralChars(const ArrayType& array, const char delimiter) { // scan the underlying string array buffer as a single big string const uint8_t* const data = array.raw_data() + array.value_offset(0); @@ -282,14 +318,26 @@ class QuotedColumnPopulator : public ColumnPopulator { : ColumnPopulator(pool, std::move(end_chars), std::move(null_string)) {} Status UpdateRowLengths(int64_t* row_lengths) override { - const StringArray& input = *casted_array_; + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return UpdateRowLengths(row_lengths); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return UpdateRowLengths(row_lengths); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status UpdateRowLengths(int64_t* row_lengths) { + auto casted_array = checked_pointer_cast(array_); + const StringArrayType& input = *casted_array; - row_needs_escaping_.resize(casted_array_->length(), false); + row_needs_escaping_.resize(casted_array->length(), false); if (NoQuoteInArray(input)) { // fast path if no quote int row_number = 0; - VisitArraySpanInline( + VisitArraySpanInline( *input.data(), [&](std::string_view s) { row_lengths[row_number] += static_cast(s.length()) + kQuoteCount; @@ -301,7 +349,7 @@ class QuotedColumnPopulator : public ColumnPopulator { }); } else { int row_number = 0; - VisitArraySpanInline( + VisitArraySpanInline( *input.data(), [&](std::string_view s) { // Each quote in the value string needs to be escaped. @@ -320,9 +368,20 @@ class QuotedColumnPopulator : public ColumnPopulator { } Status PopulateRows(char* output, int64_t* offsets) const override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return PopulateRows(output, offsets); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return PopulateRows(output, offsets); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status PopulateRows(char* output, int64_t* offsets) const { auto needs_escaping = row_needs_escaping_.begin(); - VisitArraySpanInline( - *(casted_array_->data()), + VisitArraySpanInline( + *array_->data(), [&](std::string_view s) { // still needs string content length to be added char* row = output + *offsets; @@ -355,7 +414,8 @@ class QuotedColumnPopulator : public ColumnPopulator { private: // Returns true if there's no quote in the string array - static bool NoQuoteInArray(const StringArray& array) { + template + static bool NoQuoteInArray(const StringArrayType& array) { const uint8_t* data = array.raw_data() + array.value_offset(0); const int64_t buffer_size = array.total_values_length(); return std::memchr(data, '"', buffer_size) == nullptr; diff --git a/cpp/src/arrow/csv/writer.h b/cpp/src/arrow/csv/writer.h index 4323337212472..d9d79e1660867 100644 --- a/cpp/src/arrow/csv/writer.h +++ b/cpp/src/arrow/csv/writer.h @@ -29,7 +29,8 @@ namespace arrow { namespace csv { // Functionality for converting Arrow data to Comma separated value text. -// This library supports all primitive types that can be cast to a StringArrays. +// This library supports all primitive types that can be cast to a StringArray or +// a LargeStringArray. // It applies to following formatting rules: // - For non-binary types no quotes surround values. Nulls are represented as the empty // string. 
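Taken together, the writer changes mean a large_utf8() column now round-trips through WriteCSV without a narrowing cast. A small end-to-end sketch under that assumption (the output comment reflects the default QuotingStyle::Needed, under which string values are quoted):

```cpp
#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>

#include <iostream>
#include <memory>

arrow::Status Demo() {
  arrow::LargeStringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({"a", "b,c"}));
  std::shared_ptr<arrow::Array> column;
  ARROW_RETURN_NOT_OK(builder.Finish(&column));
  auto table = arrow::Table::Make(
      arrow::schema({arrow::field("h", arrow::large_utf8())}), {column});

  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  ARROW_RETURN_NOT_OK(arrow::csv::WriteCSV(
      *table, arrow::csv::WriteOptions::Defaults(), sink.get()));
  ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());
  std::cout << buffer->ToString();  // "h"\n"a"\n"b,c"\n
  return arrow::Status::OK();
}

int main() { return Demo().ok() ? 0 : 1; }
```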
diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index c1f5622289edb..703179da94093 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -73,17 +73,17 @@ WriteOptions DefaultTestOptions(bool include_header = false, } std::string UtilGetExpectedWithEOL(const std::string& eol) { - return std::string("1,,-1,,,,") + eol + // line 1 - R"(1,"abc""efg",2324,,,,)" + eol + // line 2 - R"(,"abcd",5467,,,,)" + eol + // line 3 - R"(,,,,,,)" + eol + // line 4 - R"(546,"",517,,,,)" + eol + // line 5 - R"(124,"a""""b""",,,,,)" + eol + // line 6 - R"(,,,1970-01-01,,,)" + eol + // line 7 - R"(,,,,1970-01-02,,)" + eol + // line 8 - R"(,,,,,2004-02-29 01:02:03,)" + eol + // line 9 - R"(,,,,,,3600)" + eol + // line 10 - R"(,"NA",,,,,)" + eol; // line 11 + return std::string("1,,-1,,,,,") + eol + // line 1 + R"(1,"abc""efg",2324,,,,,)" + eol + // line 2 + R"(,"abcd",5467,,,,,"efghi")" + eol + // line 3 + R"(,,,,,,,)" + eol + // line 4 + R"(546,"",517,,,,,)" + eol + // line 5 + R"(124,"a""""b""",,,,,,)" + eol + // line 6 + R"(,,,1970-01-01,,,,"jklm")" + eol + // line 7 + R"(,,,,1970-01-02,,,)" + eol + // line 8 + R"(,,,,,2004-02-29 01:02:03,,)" + eol + // line 9 + R"(,,,,,,3600,)" + eol + // line 10 + R"(,"NA",,,,,,)" + eol; // line 11 } std::vector GenerateTestCases() { @@ -100,20 +100,22 @@ std::vector GenerateTestCases() { field("e", date64()), field("f", timestamp(TimeUnit::SECOND)), field("g", duration(TimeUnit::SECOND)), + field("h", large_utf8()), }); auto populated_batch = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abc\"efg", "c ": 2324}, - { "b\"": "abcd", "c ": 5467}, + { "b\"": "abcd", "c ": 5467, "h": "efghi"}, { }, { "a": 546, "b\"": "", "c ": 517 }, { "a": 124, "b\"": "a\"\"b\"" }, - { "d": 0 }, + { "d": 0, "h": "jklm" }, { "e": 86400000 }, { "f": 1078016523 }, { "g": 3600 }, { "b\"": "NA" }])"; - std::string expected_header = std::string(R"("a","b""","c ","d","e","f","g")") + "\n"; + std::string expected_header = + std::string(R"("a","b""","c ","d","e","f","g","h")") + "\n"; // Expected output without header when using default QuotingStyle::Needed. std::string expected_without_header = UtilGetExpectedWithEOL("\n"); @@ -122,42 +124,42 @@ std::vector GenerateTestCases() { // Expected output without header when using QuotingStyle::AllValid. std::string expected_quoting_style_all_valid = - std::string(R"("1",,"-1",,,,)") + "\n" + // line 1 - R"("1","abc""efg","2324",,,,)" + "\n" + // line 2 - R"(,"abcd","5467",,,,)" + "\n" + // line 3 - R"(,,,,,,)" + "\n" + // line 4 - R"("546","","517",,,,)" + "\n" + // line 5 - R"("124","a""""b""",,,,,)" + "\n" + // line 6 - R"(,,,"1970-01-01",,,)" + "\n" + // line 7 - R"(,,,,"1970-01-02",,)" + "\n" + // line 8 - R"(,,,,,"2004-02-29 01:02:03",)" + "\n" + // line 9 - R"(,,,,,,"3600")" + "\n" + // line 10 - R"(,"NA",,,,,)" + "\n"; // line 11 + std::string(R"("1",,"-1",,,,,)") + "\n" + // line 1 + R"("1","abc""efg","2324",,,,,)" + "\n" + // line 2 + R"(,"abcd","5467",,,,,"efghi")" + "\n" + // line 3 + R"(,,,,,,,)" + "\n" + // line 4 + R"("546","","517",,,,,)" + "\n" + // line 5 + R"("124","a""""b""",,,,,,)" + "\n" + // line 6 + R"(,,,"1970-01-01",,,,"jklm")" + "\n" + // line 7 + R"(,,,,"1970-01-02",,,)" + "\n" + // line 8 + R"(,,,,,"2004-02-29 01:02:03",,)" + "\n" + // line 9 + R"(,,,,,,"3600",)" + "\n" + // line 10 + R"(,"NA",,,,,,)" + "\n"; // line 11 // Batch when testing QuotingStyle::None. The values may not contain any quotes for this // style according to RFC4180. 
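For orientation while reading the expected strings below: the quoting styles these tests exercise map onto arrow::csv::WriteOptions, and under QuotingStyle::None the writer must reject values containing structural characters rather than quote them, per RFC 4180. A hedged sketch of selecting that style:

```cpp
#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>

// Writes `table` without any quotes; fails if a value would have needed them.
arrow::Status WriteUnquoted(const arrow::Table& table,
                            arrow::io::OutputStream* sink) {
  auto options = arrow::csv::WriteOptions::Defaults();
  options.quoting_style = arrow::csv::QuotingStyle::None;
  // A value such as `ab"c`, or one containing the delimiter, makes this
  // return an error instead of silently producing malformed CSV.
  return arrow::csv::WriteCSV(table, options, sink);
}
```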
auto populated_batch_quoting_style_none = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abcefg", "c ": 2324}, - { "b\"": "abcd", "c ": 5467}, + { "b\"": "abcd", "c ": 5467, "h": "efghi"}, { }, { "a": 546, "b\"": "", "c ": 517 }, { "a": 124, "b\"": "ab" }, - { "d": 0 }, + { "d": 0, "h": "jklm" }, { "e": 86400000 }, { "f": 1078016523 }, { "g": 3600 }])"; // Expected output for QuotingStyle::None. - std::string expected_quoting_style_none = std::string("1,,-1,,,,") + "\n" + // line 1 - R"(1,abcefg,2324,,,,)" + "\n" + // line 2 - R"(,abcd,5467,,,,)" + "\n" + // line 3 - R"(,,,,,,)" + "\n" + // line 4 - R"(546,,517,,,,)" + "\n" + // line 5 - R"(124,ab,,,,,)" + "\n" + // line 6 - R"(,,,1970-01-01,,,)" + "\n" + // line 7 - R"(,,,,1970-01-02,,)" + "\n" + // line 8 - R"(,,,,,2004-02-29 01:02:03,)" + - "\n" + // line 9 - R"(,,,,,,3600)" + "\n"; // line 10 + std::string expected_quoting_style_none = std::string("1,,-1,,,,,") + "\n" + // line 1 + R"(1,abcefg,2324,,,,,)" + "\n" + // line 2 + R"(,abcd,5467,,,,,efghi)" + "\n" + // line 3 + R"(,,,,,,,)" + "\n" + // line 4 + R"(546,,517,,,,,)" + "\n" + // line 5 + R"(124,ab,,,,,,)" + "\n" + // line 6 + R"(,,,1970-01-01,,,,jklm)" + "\n" + // line 7 + R"(,,,,1970-01-02,,,)" + "\n" + // line 8 + R"(,,,,,2004-02-29 01:02:03,,)" + + "\n" + // line 9 + R"(,,,,,,3600,)" + "\n"; // line 10 // Schema and data to test custom null value string. auto schema_custom_na = schema({field("g", uint64()), field("h", utf8())}); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 19226ce01ae2f..8eb00b8ae44f3 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -752,7 +752,7 @@ void GenericFileSystemTest::TestGetFileInfoSelector(FileSystem* fs) { } void GenericFileSystemTest::TestGetFileInfoGenerator(FileSystem* fs) { -#ifdef ADDRESS_SANITIZER +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) if (have_false_positive_memory_leak_with_generator()) { GTEST_SKIP() << "Filesystem have false positive memory leak with generator"; } diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index 5faa4d095eb1e..6a6fbf40f9628 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -269,7 +269,7 @@ class CompressedInputStream::Impl { // Read compressed data if necessary Status EnsureCompressedData() { - int64_t compressed_avail = compressed_ ? compressed_->size() - compressed_pos_ : 0; + int64_t compressed_avail = compressed_buffer_available(); if (compressed_avail == 0) { // Ensure compressed_ buffer is allocated with kChunkSize. if (!supports_zero_copy_from_raw_) { @@ -297,10 +297,14 @@ class CompressedInputStream::Impl { return Status::OK(); } - // Decompress some data from the compressed_ buffer. - // Call this function only if the decompressed_ buffer is empty. + // Decompress some data from the compressed_ buffer into decompressor_. + // Call this function only if the decompressed_ buffer is fully consumed. Status DecompressData() { + // compressed_buffer_available() could be 0 here because there might + // still be some decompressed data left to emit even though the compressed + // data was entirely consumed (especially if the expansion factor is large) DCHECK_NE(compressed_->data(), nullptr); + DCHECK_EQ(0, decompressed_buffer_available()); int64_t decompress_size = kDecompressSize; @@ -352,8 +356,10 @@ class CompressedInputStream::Impl { } // Try to feed more data into the decompressed_ buffer. 
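The decompression changes above, together with the RefillDecompressed() rewrite in the next hunk, hang off two small accounting helpers (compressed_buffer_available() and decompressed_buffer_available()) and a switch from a Status-plus-out-parameter to arrow::Result<bool>. A toy model of that control flow, with purely illustrative names and a fake "decompressor":

```cpp
#include <arrow/result.h>

#include <iostream>

// Toy stand-in for the stream's buffer accounting; not the real members.
struct ToyStream {
  int64_t produced = 0;         // bytes "decompressed" so far
  int64_t consumed = 0;         // bytes handed to the caller
  int64_t remaining_input = 8;  // compressed bytes left to read

  int64_t decompressed_available() const { return produced - consumed; }

  // Result<bool> carries "is there more data?" and the error path together.
  arrow::Result<bool> Refill() {
    if (decompressed_available() > 0) return true;
    if (remaining_input == 0) return false;  // clean end of stream
    produced += 4;                           // pretend to decompress a chunk
    remaining_input -= 4;
    return true;
  }
};

int main() {
  ToyStream stream;
  while (true) {
    auto has_data = stream.Refill();
    if (!has_data.ok()) return 1;
    if (!*has_data) break;
    stream.consumed = stream.produced;  // drain what we were given
  }
  std::cout << "read " << stream.consumed << " bytes" << std::endl;
  return 0;
}
```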
- Status RefillDecompressed(bool* has_data) { - // First try to read data from the decompressor + // Returns whether there is more data to read. + Result RefillDecompressed() { + // First try to read data from the decompressor, unless we haven't read any + // compressed data yet. if (compressed_ && compressed_->size() != 0) { if (decompressor_->IsFinished()) { // We just went over the end of a previous compressed stream. @@ -362,21 +368,21 @@ class CompressedInputStream::Impl { } RETURN_NOT_OK(DecompressData()); } - if (!decompressed_ || decompressed_->size() == 0) { - // Got nothing, need to read more compressed data + int64_t decompress_avail = decompressed_buffer_available(); + if (decompress_avail == 0) { + // Got nothing from existing `compressed_` and `decompressor_`, + // need to read more compressed data. RETURN_NOT_OK(EnsureCompressedData()); - if (compressed_pos_ == compressed_->size()) { + if (compressed_buffer_available() == 0) { // No more data to decompress if (!fresh_decompressor_ && !decompressor_->IsFinished()) { return Status::IOError("Truncated compressed stream"); } - *has_data = false; - return Status::OK(); + return false; } RETURN_NOT_OK(DecompressData()); } - *has_data = true; - return Status::OK(); + return true; } Result Read(int64_t nbytes, void* out) { @@ -394,7 +400,7 @@ class CompressedInputStream::Impl { // At this point, no more decompressed data remains, so we need to // decompress more - RETURN_NOT_OK(RefillDecompressed(&decompressor_has_data)); + ARROW_ASSIGN_OR_RAISE(decompressor_has_data, RefillDecompressed()); } total_pos_ += total_read; @@ -405,13 +411,22 @@ class CompressedInputStream::Impl { ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, pool_)); ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buf->mutable_data())); RETURN_NOT_OK(buf->Resize(bytes_read)); - // Using std::move because the some compiler might has issue below: + // Using std::move because some compiler might has issue below: // https://wg21.cmeerw.net/cwg/issue1579 return std::move(buf); } const std::shared_ptr& raw() const { return raw_; } + private: + int64_t compressed_buffer_available() const { + return compressed_ ? compressed_->size() - compressed_pos_ : 0; + } + + int64_t decompressed_buffer_available() const { + return decompressed_ ? 
decompressed_->size() - decompressed_pos_ : 0; + } + private: // Read 64 KB compressed data at a time static const int64_t kChunkSize = 64 * 1024; diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index e84aac995e35f..8cecc6365a3b9 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -53,8 +53,11 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* o float temp_out; const auto res = ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options); - *out = Float16::FromFloat(temp_out).bits(); - return res.ec == std::errc() && res.ptr == s + length; + const bool ok = res.ec == std::errc() && res.ptr == s + length; + if (ok) { + *out = Float16::FromFloat(temp_out).bits(); + } + return ok; } // ---------------------------------------------------------------------- diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 03a4af90d8991..ef07a9ef0bc9b 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -99,13 +99,14 @@ Result> LikeHolder::Make(const FunctionNode& node) { "'like' function requires a string literal as the second parameter")); RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. return Make(std::get(literal->holder()), regex_op); } if (node.children().size() == 2) { - return Make(std::get(literal->holder())); + return Make(std::get(literal->holder()), regex_op); } else { auto escape_char = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( @@ -118,7 +119,7 @@ Result> LikeHolder::Make(const FunctionNode& node) { Status::Invalid( "'like' function requires a string literal as the third parameter")); return Make(std::get(literal->holder()), - std::get(escape_char->holder())); + std::get(escape_char->holder()), regex_op); } } @@ -126,7 +127,9 @@ Result> LikeHolder::Make(const std::string& sql_patt std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. 
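The set_dot_nl(true) calls threaded through every Make() overload above change one RE2 default: without dotall mode, '.' does not match '\n', so a LIKE pattern such as %abc% (translated to roughly .*abc.*) silently fails on multi-line values. A standalone illustration using the RE2 API directly:

```cpp
#include <re2/re2.h>

#include <iostream>
#include <string>

int main() {
  RE2 no_dotall(".*abc.*");  // default: dot_nl = false

  RE2::Options opts;
  opts.set_dot_nl(true);     // what the gandiva fix enables
  RE2 dotall(".*abc.*", opts);

  const std::string input = "abc\nd";
  std::cout << RE2::FullMatch(input, no_dotall) << std::endl;  // 0
  std::cout << RE2::FullMatch(input, dotall) << std::endl;     // 1
  return 0;
}
```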
+ auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); @@ -135,7 +138,8 @@ Result> LikeHolder::Make(const std::string& sql_patt } Result> LikeHolder::Make(const std::string& sql_pattern, - const std::string& escape_char) { + const std::string& escape_char, + RE2::Options regex_op) { ARROW_RETURN_IF(escape_char.length() > 1, Status::Invalid("The length of escape char ", escape_char, " in 'like' function is greater than 1")); @@ -147,7 +151,7 @@ Result> LikeHolder::Make(const std::string& sql_patt ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); } - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index 36d942510bb5b..354c2b53d95e1 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -40,7 +40,8 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Result> Make(const std::string& sql_pattern); static Result> Make(const std::string& sql_pattern, - const std::string& escape_char); + const std::string& escape_char, + RE2::Options regex_op); static Result> Make(const std::string& sql_pattern, RE2::Options regex_op); diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 534be5987a233..64657e88c6473 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -28,6 +28,8 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: RE2::Options regex_op; + void SetUp() { regex_op.set_dot_nl(true); } + FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -77,6 +79,14 @@ TEST_F(TestLikeHolder, TestPcreSpecial) { EXPECT_FALSE(like("xxabc")); } +TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE( + like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); +} + TEST_F(TestLikeHolder, TestRegexEscape) { std::string res; ARROW_EXPECT_OK(RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res)); @@ -91,14 +101,22 @@ TEST_F(TestLikeHolder, TestDot) { EXPECT_FALSE(like("abcd")); } +TEST_F(TestLikeHolder, TestMatchWithNewLine) { + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%abc%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc\nd")); +} + TEST_F(TestLikeHolder, TestMatchSubString) { - EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\")); + EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\")); + EXPECT_OK_AND_ASSIGN(like_holder, + LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); auto& like_reserved_char = *like_holder; EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d")); @@ -173,7 +191,7 
@@ TEST_F(TestLikeHolder, TestOptimise) { } TEST_F(TestLikeHolder, TestMatchOneEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\", regex_op)); auto& like = *like_holder; @@ -187,7 +205,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { } TEST_F(TestLikeHolder, TestMatchManyEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\", regex_op)); auto& like = *like_holder; @@ -201,7 +219,8 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { } TEST_F(TestLikeHolder, TestMatchEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, + LikeHolder::Make("ab\\\\", "\\", regex_op)); auto& like = *like_holder; @@ -211,7 +230,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) { } TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "", regex_op)); auto& like = *like_holder; @@ -223,7 +242,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { } TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\").status()); + ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\", regex_op).status()); } class TestILikeHolder : public ::testing::Test { diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index a858c53e931d8..e74a9f55b124f 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -396,7 +396,7 @@ struct ByteStreamSplitDummyValue> { using Array = std::array; static constexpr Array value() { - Array array; + Array array{}; array.fill(ByteStreamSplitDummyValue::value()); return array; } diff --git a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj index 3fbd772db5ec6..55497203a12be 100644 --- a/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj +++ b/csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index 0838134b19c6d..4abe63e05ad83 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -31,7 +31,7 @@ protected Array(ArrayData data) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public ArrowBuffer NullBitmapBuffer => Data.Buffers[0]; diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs index 55d77f598c4e4..cdb6ed6b39418 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs @@ -15,7 +15,6 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; -using Google.FlatBuffers; using System; using System.Collections.Generic; using System.Linq; @@ -28,12 +27,30 @@ public sealed class ArrayData : IDisposable public readonly IArrowType DataType; public readonly int Length; - public readonly int NullCount; + + /// + /// The number of null values in the Array. May be -1 if the null count has not been computed. 
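The C# NullCount change that follows mirrors a convention the C++ library already uses, and which the BooleanArray::true_count() fix earlier in this diff depends on: ArrayData::null_count may hold kUnknownNullCount (-1) until GetNullCount() computes and caches the real value. A C++ sketch of that life cycle (the builder setup is illustrative):

```cpp
#include <arrow/api.h>

#include <iostream>
#include <memory>

arrow::Status Demo() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(3));
  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(builder.Finish(&array));

  // Reset to the "not yet computed" sentinel, as the C++ test above does.
  array->data()->null_count = arrow::kUnknownNullCount;
  std::cout << array->data()->null_count.load() << std::endl;  // -1
  std::cout << array->data()->GetNullCount() << std::endl;     // computes: 1
  std::cout << array->data()->null_count.load() << std::endl;  // cached: 1
  return arrow::Status::OK();
}

int main() { return Demo().ok() ? 0 : 1; }
```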
+ /// + public int NullCount; + public readonly int Offset; public readonly ArrowBuffer[] Buffers; public readonly ArrayData[] Children; public readonly ArrayData Dictionary; // Only used for dictionary type + /// + /// Get the number of null values in the Array, computing the count if required. + /// + public int GetNullCount() + { + if (NullCount == RecalculateNullCount) + { + NullCount = ComputeNullCount(); + } + + return NullCount; + } + // This is left for compatibility with lower version binaries // before the dictionary type was supported. public ArrayData( @@ -111,7 +128,25 @@ public ArrayData Slice(int offset, int length) length = Math.Min(Length - offset, length); offset += Offset; - return new ArrayData(DataType, length, RecalculateNullCount, offset, Buffers, Children, Dictionary); + int nullCount; + if (NullCount == 0) + { + nullCount = 0; + } + else if (NullCount == Length) + { + nullCount = length; + } + else if (offset == Offset && length == Length) + { + nullCount = NullCount; + } + else + { + nullCount = RecalculateNullCount; + } + + return new ArrayData(DataType, length, nullCount, offset, Buffers, Children, Dictionary); } public ArrayData Clone(MemoryAllocator allocator = default) @@ -125,5 +160,24 @@ public ArrayData Clone(MemoryAllocator allocator = default) Children?.Select(b => b.Clone(allocator))?.ToArray(), Dictionary?.Clone(allocator)); } + + private int ComputeNullCount() + { + if (DataType.TypeId == ArrowTypeId.Union) + { + return UnionArray.ComputeNullCount(this); + } + + if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty) + { + return 0; + } + + // Note: Dictionary arrays may be logically null if there is a null in the dictionary values, + // but this isn't accounted for by the IArrowArray.IsNull implementation, + // so we maintain consistency with that behaviour here. 
+ + return Length - BitUtility.CountBits(Buffers[0].Span, Offset, Length); + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 698d74e4bac84..84658a5fab812 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -71,7 +71,7 @@ public ArrayDataConcatenationVisitor(IReadOnlyList arrayDataList, Mem foreach (ArrayData arrayData in _arrayDataList) { _totalLength += arrayData.Length; - _totalNullCount += arrayData.NullCount; + _totalNullCount += arrayData.GetNullCount(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs index 4f62dffd1ddeb..b7c9b07336a5a 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs @@ -322,7 +322,7 @@ public ReadOnlySpan GetBytes(int index, out bool isNull) BinaryView binaryView = Views[index]; if (binaryView.IsInline) { - return ViewsBuffer.Span.Slice(16 * index + 4, binaryView.Length); + return ViewsBuffer.Span.Slice(16 * (Offset + index) + 4, binaryView.Length); } return DataBuffer(binaryView._bufferIndex).Span.Slice(binaryView._bufferOffset, binaryView.Length); diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs index 5a51175b7c4da..2fb6f4c961127 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs @@ -144,7 +144,7 @@ public Decimal128Array(ArrayData data) { return null; } - return DecimalUtility.GetDecimal(ValueBuffer, index, Scale, ByteWidth); + return DecimalUtility.GetDecimal(ValueBuffer, Offset + index, Scale, ByteWidth); } public IList ToList(bool includeNulls = false) @@ -177,7 +177,7 @@ public string GetString(int index) { return null; } - return DecimalUtility.GetString(ValueBuffer, index, Precision, Scale, ByteWidth); + return DecimalUtility.GetString(ValueBuffer, Offset + index, Precision, Scale, ByteWidth); } public SqlDecimal? GetSqlDecimal(int index) @@ -187,7 +187,7 @@ public string GetString(int index) return null; } - return DecimalUtility.GetSqlDecimal128(ValueBuffer, index, Precision, Scale); + return DecimalUtility.GetSqlDecimal128(ValueBuffer, Offset + index, Precision, Scale); } int IReadOnlyCollection.Count => Length; diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs index eca2611b6f3bb..fa6f765475240 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs @@ -151,7 +151,7 @@ public Decimal256Array(ArrayData data) return null; } - return DecimalUtility.GetDecimal(ValueBuffer, index, Scale, ByteWidth); + return DecimalUtility.GetDecimal(ValueBuffer, Offset + index, Scale, ByteWidth); } public IList ToList(bool includeNulls = false) @@ -184,7 +184,7 @@ public string GetString(int index) { return null; } - return DecimalUtility.GetString(ValueBuffer, index, Precision, Scale, ByteWidth); + return DecimalUtility.GetString(ValueBuffer, Offset + index, Precision, Scale, ByteWidth); } public bool TryGetSqlDecimal(int index, out SqlDecimal? value) @@ -196,11 +196,11 @@ public bool TryGetSqlDecimal(int index, out SqlDecimal? 
value) } const int longWidth = 4; - var span = ValueBuffer.Span.CastTo().Slice(index * longWidth); + var span = ValueBuffer.Span.CastTo().Slice((Offset + index) * longWidth); if ((span[2] == 0 && span[3] == 0) || (span[2] == -1 && span[3] == -1)) { - value = DecimalUtility.GetSqlDecimal128(ValueBuffer, 2 * index, Precision, Scale); + value = DecimalUtility.GetSqlDecimal128(ValueBuffer, 2 * (Offset + index), Precision, Scale); return true; } diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs index b6b61c560e482..79880c894b13d 100644 --- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -24,7 +24,7 @@ public class DenseUnionArray : UnionArray { public ArrowBuffer ValueOffsetBuffer => Data.Buffers[1]; - public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo(); + public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo().Slice(Offset, Length); public DenseUnionArray( IArrowType dataType, @@ -38,7 +38,6 @@ public DenseUnionArray( dataType, length, nullCount, offset, new[] { typeIds, valuesOffsetBuffer }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Dense, Type.Mode); } @@ -53,5 +52,28 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(ValueOffsets[index]); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var valueOffsets = data.Buffers[1].Span.CastTo().Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < length; ++i) + { + var typeId = typeIds[i]; + var valueOffset = valueOffsets[i]; + nullCount += childArrays[typeId].IsNull(valueOffset) ? 
1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs index 0fa7954724f38..9d597ef1624ea 100644 --- a/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/FixedSizeBinaryArray.cs @@ -68,7 +68,7 @@ public ReadOnlySpan GetBytes(int index) } int size = ((FixedSizeBinaryType)Data.DataType).ByteWidth; - return ValueBuffer.Span.Slice(index * size, size); + return ValueBuffer.Span.Slice((Offset + index) * size, size); } int IReadOnlyCollection.Count => Length; diff --git a/csharp/src/Apache.Arrow/Arrays/NullArray.cs b/csharp/src/Apache.Arrow/Arrays/NullArray.cs index 762540065c929..7f3e183829243 100644 --- a/csharp/src/Apache.Arrow/Arrays/NullArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/NullArray.cs @@ -95,7 +95,7 @@ public NullArray(int length) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public void Dispose() { } public bool IsNull(int index) => true; diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs index 07d36e25cfc23..5b29489ebb1f0 100644 --- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -32,7 +32,6 @@ public SparseUnionArray( dataType, length, nullCount, offset, new[] { typeIds }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Sparse, Type.Mode); } @@ -47,5 +46,26 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(index); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < data.Length; ++i) + { + var typeId = typeIds[i]; + nullCount += childArrays[typeId].IsNull(offset + i) ? 
1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 5fcb276655162..c1deb9b651a89 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -25,7 +25,7 @@ public abstract class UnionArray : IArrowArray protected IReadOnlyList _fields; public IReadOnlyList Fields => - LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + LazyInitializer.EnsureInitialized(ref _fields, InitializeFields); public ArrayData Data { get; } @@ -35,13 +35,13 @@ public abstract class UnionArray : IArrowArray public ArrowBuffer TypeBuffer => Data.Buffers[0]; - public ReadOnlySpan TypeIds => TypeBuffer.Span; + public ReadOnlySpan TypeIds => TypeBuffer.Span.Slice(Offset, Length); public int Length => Data.Length; public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public bool IsValid(int index) => NullCount == 0 || FieldIsValid(Fields[TypeIds[index]], index); @@ -91,12 +91,29 @@ protected static void ValidateMode(UnionMode expected, UnionMode actual) } } + internal static int ComputeNullCount(ArrayData data) + { + return ((UnionType)data.DataType).Mode switch + { + UnionMode.Sparse => SparseUnionArray.ComputeNullCount(data), + UnionMode.Dense => DenseUnionArray.ComputeNullCount(data), + _ => throw new InvalidOperationException("unknown union mode in null count computation") + }; + } + private IReadOnlyList InitializeFields() { IArrowArray[] result = new IArrowArray[Data.Children.Length]; for (int i = 0; i < Data.Children.Length; i++) { - result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + var childData = Data.Children[i]; + if (Mode == UnionMode.Sparse && (Data.Offset != 0 || childData.Length != Data.Length)) + { + // We only slice the child data for sparse mode, + // so that the sliced value offsets remain valid in dense mode + childData = childData.Slice(Data.Offset, Data.Length); + } + result[i] = ArrowArrayFactory.BuildArray(childData); } return result; } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 03059eaf5d4df..b241fdfea3bda 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -115,7 +115,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr { cArray->length = array.Length; cArray->offset = array.Offset; - cArray->null_count = array.NullCount; + cArray->null_count = array.NullCount; // The C Data interface allows the null count to be -1 cArray->release = ReleaseArrayPtr; cArray->private_data = MakePrivateData(sharedOwner); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index 4e273dbde5690..a37c501072f4b 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -261,7 +261,7 @@ private ArrayData LoadField( if (fieldNullCount < 0) { - throw new InvalidDataException("Null count length must be >= 0"); // TODO:Localize exception message + throw new InvalidDataException("Null count must be >= 0"); // TODO:Localize exception message } int buffers; diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b002f8c8b1578..6127c5a662dfe 100644 --- 
a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -19,6 +19,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; using Apache.Arrow.Arrays; @@ -69,23 +70,37 @@ private class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor { + public readonly struct FieldNode + { + public readonly int Length; + public readonly int NullCount; + + public FieldNode(int length, int nullCount) + { + Length = length; + NullCount = nullCount; + } + } + public readonly struct Buffer { - public readonly ArrowBuffer DataBuffer; + public readonly ReadOnlyMemory DataBuffer; public readonly int Offset; - public Buffer(ArrowBuffer buffer, int offset) + public Buffer(ReadOnlyMemory buffer, int offset) { DataBuffer = buffer; Offset = offset; } } + private readonly List _fieldNodes; private readonly List _buffers; private readonly ICompressionCodec _compressionCodec; private readonly MemoryAllocator _allocator; private readonly MemoryStream _compressionStream; + public IReadOnlyList FieldNodes => _fieldNodes; public IReadOnlyList Buffers => _buffers; public List VariadicCounts { get; private set; } @@ -97,56 +112,80 @@ public ArrowRecordBatchFlatBufferBuilder( _compressionCodec = compressionCodec; _compressionStream = compressionStream; _allocator = allocator; + _fieldNodes = new List(); _buffers = new List(); TotalLength = 0; } - public void Visit(Int8Array array) => CreateBuffers(array); - public void Visit(Int16Array array) => CreateBuffers(array); - public void Visit(Int32Array array) => CreateBuffers(array); - public void Visit(Int64Array array) => CreateBuffers(array); - public void Visit(UInt8Array array) => CreateBuffers(array); - public void Visit(UInt16Array array) => CreateBuffers(array); - public void Visit(UInt32Array array) => CreateBuffers(array); - public void Visit(UInt64Array array) => CreateBuffers(array); + public void VisitArray(IArrowArray array) + { + _fieldNodes.Add(new FieldNode(array.Length, array.NullCount)); + + array.Accept(this); + } + + public void Visit(Int8Array array) => VisitPrimitiveArray(array); + public void Visit(Int16Array array) => VisitPrimitiveArray(array); + public void Visit(Int32Array array) => VisitPrimitiveArray(array); + public void Visit(Int64Array array) => VisitPrimitiveArray(array); + public void Visit(UInt8Array array) => VisitPrimitiveArray(array); + public void Visit(UInt16Array array) => VisitPrimitiveArray(array); + public void Visit(UInt32Array array) => VisitPrimitiveArray(array); + public void Visit(UInt64Array array) => VisitPrimitiveArray(array); #if NET5_0_OR_GREATER - public void Visit(HalfFloatArray array) => CreateBuffers(array); + public void Visit(HalfFloatArray array) => VisitPrimitiveArray(array); #endif - public void Visit(FloatArray array) => CreateBuffers(array); - public void Visit(DoubleArray array) => CreateBuffers(array); - public void Visit(TimestampArray array) => CreateBuffers(array); - public void Visit(BooleanArray array) => CreateBuffers(array); - public void Visit(Date32Array array) => CreateBuffers(array); - public void Visit(Date64Array array) => CreateBuffers(array); - public void Visit(Time32Array array) => CreateBuffers(array); - public void Visit(Time64Array array) => CreateBuffers(array); - public void Visit(DurationArray array) => CreateBuffers(array); - public void Visit(YearMonthIntervalArray array) => CreateBuffers(array); - 
public void Visit(DayTimeIntervalArray array) => CreateBuffers(array); - public void Visit(MonthDayNanosecondIntervalArray array) => CreateBuffers(array); + public void Visit(FloatArray array) => VisitPrimitiveArray(array); + public void Visit(DoubleArray array) => VisitPrimitiveArray(array); + public void Visit(TimestampArray array) => VisitPrimitiveArray(array); + public void Visit(Date32Array array) => VisitPrimitiveArray(array); + public void Visit(Date64Array array) => VisitPrimitiveArray(array); + public void Visit(Time32Array array) => VisitPrimitiveArray(array); + public void Visit(Time64Array array) => VisitPrimitiveArray(array); + public void Visit(DurationArray array) => VisitPrimitiveArray(array); + public void Visit(YearMonthIntervalArray array) => VisitPrimitiveArray(array); + public void Visit(DayTimeIntervalArray array) => VisitPrimitiveArray(array); + public void Visit(MonthDayNanosecondIntervalArray array) => VisitPrimitiveArray(array); + + private void VisitPrimitiveArray(PrimitiveArray array) + where T : struct + { + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, array.Offset, array.Length)); + } + + public void Visit(BooleanArray array) + { + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateBitmapBuffer(array.ValueBuffer, array.Offset, array.Length)); + } public void Visit(ListArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); - array.Values.Accept(this); + VisitArray(array.Values); } public void Visit(ListViewArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); - _buffers.Add(CreateBuffer(array.SizesBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.SizesBuffer, array.Offset, array.Length)); - array.Values.Accept(this); + VisitArray(array.Values); } public void Visit(FixedSizeListArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - array.Values.Accept(this); + var listSize = ((FixedSizeListType)array.Data.DataType).ListSize; + var valuesSlice = + ArrowArrayFactory.Slice(array.Values, array.Offset * listSize, array.Length * listSize); + + VisitArray(valuesSlice); } public void Visit(StringArray array) => Visit(array as BinaryArray); @@ -155,15 +194,15 @@ public void Visit(FixedSizeListArray array) public void Visit(BinaryArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); _buffers.Add(CreateBuffer(array.ValueBuffer)); } public void Visit(BinaryViewArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ViewsBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + 
_buffers.Add(CreateSlicedBuffer(array.ViewsBuffer, array.Offset, array.Length)); for (int i = 0; i < array.DataBufferCount; i++) { _buffers.Add(CreateBuffer(array.DataBuffer(i))); @@ -174,45 +213,40 @@ public void Visit(BinaryViewArray array) public void Visit(FixedSizeBinaryArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); + var itemSize = ((FixedSizeBinaryType)array.Data.DataType).ByteWidth; + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); + _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, itemSize, array.Offset, array.Length)); } - public void Visit(Decimal128Array array) - { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); - } + public void Visit(Decimal128Array array) => Visit(array as FixedSizeBinaryArray); - public void Visit(Decimal256Array array) - { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); - } + public void Visit(Decimal256Array array) => Visit(array as FixedSizeBinaryArray); public void Visit(StructArray array) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); for (int i = 0; i < array.Fields.Count; i++) { - array.Fields[i].Accept(this); + // Fields property accessor handles slicing field arrays if required + VisitArray(array.Fields[i]); } } public void Visit(UnionArray array) { - _buffers.Add(CreateBuffer(array.TypeBuffer)); + _buffers.Add(CreateSlicedBuffer(array.TypeBuffer, array.Offset, array.Length)); ArrowBuffer? offsets = (array as DenseUnionArray)?.ValueOffsetBuffer; if (offsets != null) { - _buffers.Add(CreateBuffer(offsets.Value)); + _buffers.Add(CreateSlicedBuffer(offsets.Value, array.Offset, array.Length)); } for (int i = 0; i < array.Fields.Count; i++) { - array.Fields[i].Accept(this); + // Fields property accessor handles slicing field arrays for sparse union arrays if required + VisitArray(array.Fields[i]); } } @@ -221,8 +255,7 @@ public void Visit(DictionaryArray array) // Dictionary is serialized separately in Dictionary serialization. // We are only interested in indices at this context. 
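The Visit() rewrites above all follow one rule: serialize only the window of each buffer that the array's offset and length actually cover, rather than whole backing buffers. For fixed-width value buffers that is a byte-range computation; validity bitmaps are the awkward case (handled by CreateBitmapBuffer in the next hunk), because they can only be sliced zero-copy on 8-bit boundaries. A self-contained sketch of both rules, with illustrative helper names:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Byte range of a fixed-width value buffer for elements [offset, offset+length).
std::pair<int64_t, int64_t> ValueByteRange(int64_t item_size, int64_t offset,
                                           int64_t length) {
  return {offset * item_size, length * item_size};
}

// Slice a validity bitmap: zero-copy on byte boundaries, bit-by-bit otherwise.
std::vector<uint8_t> SliceBitmap(const std::vector<uint8_t>& bits,
                                 int64_t offset, int64_t length) {
  std::vector<uint8_t> out((length + 7) / 8, 0);
  if (offset % 8 == 0) {
    std::copy_n(bits.begin() + offset / 8, out.size(), out.begin());
  } else {
    for (int64_t i = 0; i < length; ++i) {  // realign so bit 0 == element 0
      if ((bits[(offset + i) / 8] >> ((offset + i) % 8)) & 1) {
        out[i / 8] |= static_cast<uint8_t>(1 << (i % 8));
      }
    }
  }
  return out;
}

int main() {
  std::vector<uint8_t> bitmap = {0b10110101, 0b00000011};
  auto sliced = SliceBitmap(bitmap, /*offset=*/3, /*length=*/7);
  std::cout << static_cast<int>(sliced[0]) << std::endl;  // bits 3..9, realigned
  auto range = ValueByteRange(/*item_size=*/4, /*offset=*/3, /*length=*/7);
  std::cout << range.first << " " << range.second << std::endl;  // 12 28
  return 0;
}
```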
- _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.IndicesBuffer)); + array.Indices.Accept(this); } public void Visit(NullArray array) @@ -230,25 +263,67 @@ public void Visit(NullArray array) // There are no buffers for a NullArray } - private void CreateBuffers(BooleanArray array) + private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length) { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); + if (buffer.IsEmpty) + { + return CreateBuffer(buffer.Memory); + } + + var paddedLength = CalculatePaddedBufferLength(BitUtility.ByteCount(length)); + if (offset % 8 == 0) + { + var byteOffset = offset / 8; + var sliceLength = Math.Min(paddedLength, buffer.Length - byteOffset); + + return CreateBuffer(buffer.Memory.Slice(byteOffset, sliceLength)); + } + else + { + // Need to copy bitmap so the first bit is aligned with the first byte + var memoryOwner = _allocator.Allocate(paddedLength); + var outputSpan = memoryOwner.Memory.Span; + var inputSpan = buffer.Span; + for (var i = 0; i < length; ++i) + { + BitUtility.SetBit(outputSpan, i, BitUtility.GetBit(inputSpan, offset + i)); + } + + return CreateBuffer(memoryOwner.Memory); + } } - private void CreateBuffers(PrimitiveArray array) + private Buffer CreateSlicedBuffer(ArrowBuffer buffer, int offset, int length) where T : struct { - _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); + return CreateSlicedBuffer(buffer, Unsafe.SizeOf(), offset, length); + } + + private Buffer CreateSlicedBuffer(ArrowBuffer buffer, int itemSize, int offset, int length) + { + var byteLength = length * itemSize; + var paddedLength = CalculatePaddedBufferLength(byteLength); + if (offset != 0 || paddedLength < buffer.Length) + { + var byteOffset = offset * itemSize; + var sliceLength = Math.Min(paddedLength, buffer.Length - byteOffset); + return CreateBuffer(buffer.Memory.Slice(byteOffset, sliceLength)); + } + + return CreateBuffer(buffer.Memory); } private Buffer CreateBuffer(ArrowBuffer buffer) + { + return CreateBuffer(buffer.Memory); + } + + private Buffer CreateBuffer(ReadOnlyMemory buffer) { int offset = TotalLength; const int UncompressedLengthSize = 8; - ArrowBuffer bufferToWrite; + ReadOnlyMemory bufferToWrite; if (_compressionCodec == null) { bufferToWrite = buffer; @@ -258,7 +333,7 @@ private Buffer CreateBuffer(ArrowBuffer buffer) // Write zero length and skip compression var uncompressedLengthBytes = _allocator.Allocate(UncompressedLengthSize); BinaryPrimitives.WriteInt64LittleEndian(uncompressedLengthBytes.Memory.Span, 0); - bufferToWrite = new ArrowBuffer(uncompressedLengthBytes); + bufferToWrite = uncompressedLengthBytes.Memory; } else { @@ -266,14 +341,14 @@ private Buffer CreateBuffer(ArrowBuffer buffer) // compressed buffers are stored. 
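The compression path above implements the framing the Arrow IPC format prescribes for body buffers: each written buffer is prefixed with its uncompressed length as a little-endian int64, and -1 in that slot signals that the bytes which follow were left uncompressed (used when compression would not shrink the buffer). A minimal sketch of the framing itself, independent of the writer machinery:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Prefix `body` with an 8-byte little-endian length header. Pass -1 as
// `uncompressed_length` to flag an uncompressed body.
std::vector<uint8_t> FrameBuffer(const std::vector<uint8_t>& body,
                                 int64_t uncompressed_length) {
  std::vector<uint8_t> framed(8 + body.size());
  auto bits = static_cast<uint64_t>(uncompressed_length);
  for (int i = 0; i < 8; ++i) {
    framed[i] = static_cast<uint8_t>((bits >> (8 * i)) & 0xFF);
  }
  std::memcpy(framed.data() + 8, body.data(), body.size());
  return framed;
}

int main() {
  std::vector<uint8_t> payload = {1, 2, 3};
  auto framed = FrameBuffer(payload, /*uncompressed_length=*/-1);
  std::cout << framed.size() << std::endl;                // 11
  std::cout << static_cast<int>(framed[0]) << std::endl;  // 255 (little-endian -1)
  return 0;
}
```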
_compressionStream.Seek(0, SeekOrigin.Begin); _compressionStream.SetLength(0); - _compressionCodec.Compress(buffer.Memory, _compressionStream); + _compressionCodec.Compress(buffer, _compressionStream); if (_compressionStream.Length < buffer.Length) { var newBuffer = _allocator.Allocate((int) _compressionStream.Length + UncompressedLengthSize); BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, buffer.Length); _compressionStream.Seek(0, SeekOrigin.Begin); _compressionStream.ReadFullBuffer(newBuffer.Memory.Slice(UncompressedLengthSize)); - bufferToWrite = new ArrowBuffer(newBuffer); + bufferToWrite = newBuffer.Memory; } else { @@ -281,8 +356,8 @@ private Buffer CreateBuffer(ArrowBuffer buffer) // buffer instead, and indicate this by setting the uncompressed length to -1 var newBuffer = _allocator.Allocate(buffer.Length + UncompressedLengthSize); BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, -1); - buffer.Memory.CopyTo(newBuffer.Memory.Slice(UncompressedLengthSize)); - bufferToWrite = new ArrowBuffer(newBuffer); + buffer.CopyTo(newBuffer.Memory.Slice(UncompressedLengthSize)); + bufferToWrite = newBuffer.Memory; } } @@ -366,29 +441,6 @@ public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOp } } - private void CreateSelfAndChildrenFieldNodes(ArrayData data) - { - if (data.DataType is NestedType) - { - // flatbuffer struct vectors have to be created in reverse order - for (int i = data.Children.Length - 1; i >= 0; i--) - { - CreateSelfAndChildrenFieldNodes(data.Children[i]); - } - } - Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.NullCount); - } - - private static int CountAllNodes(IReadOnlyList fields) - { - int count = 0; - foreach (Field arrowArray in fields) - { - CountSelfAndChildrenNodes(arrowArray.DataType, ref count); - } - return count; - } - private Offset GetBodyCompression() { if (_options.CompressionCodec == null) @@ -406,18 +458,6 @@ private static int CountAllNodes(IReadOnlyList fields) Builder, compressionType, Flatbuf.BodyCompressionMethod.BUFFER); } - private static void CountSelfAndChildrenNodes(IArrowType type, ref int count) - { - if (type is NestedType nestedType) - { - foreach (Field childField in nestedType.Fields) - { - CountSelfAndChildrenNodes(childField.DataType, ref count); - } - } - count++; - } - private protected void WriteRecordBatchInternal(RecordBatch recordBatch) { // TODO: Truncate buffers with extraneous padding / unused capacity @@ -461,8 +501,6 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBatch, CancellationToken cancellationToken = default) { - // TODO: Truncate buffers with extraneous padding / unused capacity - if (!HasWrittenSchema) { await WriteSchemaAsync(Schema, cancellationToken).ConfigureAwait(false); @@ -506,11 +544,11 @@ private long WriteBufferData(IReadOnlyList buffer = buffers[i].DataBuffer; if (buffer.IsEmpty) continue; - WriteBuffer(buffer); + BaseStream.Write(buffer); int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length)); int padding = paddedLength - buffer.Length; @@ -537,11 +575,11 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList buffer = buffers[i].DataBuffer; if (buffer.IsEmpty) continue; - await WriteBufferAsync(buffer, cancellationToken).ConfigureAwait(false); + await BaseStream.WriteAsync(buffer, cancellationToken).ConfigureAwait(false); int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length)); 
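Stepping back from the hunk above: each compressed IPC buffer is framed with its uncompressed length as a little-endian int64 prefix. The code writes 0 for empty buffers and -1 when compression would grow the buffer and the raw bytes are stored instead. An illustrative sketch of that framing convention:

import struct

# Prefix a buffer body with its uncompressed length (little-endian int64).
# 0 marks an empty buffer; -1 marks a body stored uncompressed.
def frame(body: bytes, uncompressed_len: int) -> bytes:
    return struct.pack("<q", uncompressed_len) + body

assert frame(b"", 0) == b"\x00" * 8
assert frame(b"raw bytes", -1)[:8] == struct.pack("<q", -1)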
int padding = paddedLength - buffer.Length; @@ -571,22 +609,6 @@ private Tuple Pre { Builder.Clear(); - // Serialize field nodes - - int fieldCount = fields.Count; - - Flatbuf.RecordBatch.StartNodesVector(Builder, CountAllNodes(fields)); - - // flatbuffer struct vectors have to be created in reverse order - for (int i = fieldCount - 1; i >= 0; i--) - { - CreateSelfAndChildrenFieldNodes(arrays[i].Data); - } - - VectorOffset fieldNodesVectorOffset = Builder.EndVector(); - - // Serialize buffers - // CompressionCodec can be disposed after all data is visited by the builder, // and doesn't need to be alive for the full lifetime of the ArrowRecordBatchFlatBufferBuilder using var compressionCodec = _options.CompressionCodec.HasValue @@ -594,20 +616,34 @@ private Tuple Pre : null; var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder(compressionCodec, _allocator, _compressionStream); - for (int i = 0; i < fieldCount; i++) + + // Visit all arrays recursively + for (int i = 0; i < fields.Count; i++) { IArrowArray fieldArray = arrays[i]; - fieldArray.Accept(recordBatchBuilder); + recordBatchBuilder.VisitArray(fieldArray); + } + + // Serialize field nodes + IReadOnlyList fieldNodes = recordBatchBuilder.FieldNodes; + Flatbuf.RecordBatch.StartNodesVector(Builder, fieldNodes.Count); + + // flatbuffer struct vectors have to be created in reverse order + for (int i = fieldNodes.Count - 1; i >= 0; i--) + { + Flatbuf.FieldNode.CreateFieldNode(Builder, fieldNodes[i].Length, fieldNodes[i].NullCount); } + VectorOffset fieldNodesVectorOffset = Builder.EndVector(); + VectorOffset variadicCountOffset = default; if (recordBatchBuilder.VariadicCounts != null) { variadicCountOffset = Flatbuf.RecordBatch.CreateVariadicCountsVectorBlock(Builder, recordBatchBuilder.VariadicCounts.ToArray()); } + // Serialize buffers IReadOnlyList buffers = recordBatchBuilder.Buffers; - Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count); // flatbuffer struct vectors have to be created in reverse order @@ -783,16 +819,6 @@ public async Task WriteEndAsync(CancellationToken cancellationToken = default) } } - private void WriteBuffer(ArrowBuffer arrowBuffer) - { - BaseStream.Write(arrowBuffer.Memory); - } - - private ValueTask WriteBufferAsync(ArrowBuffer arrowBuffer, CancellationToken cancellationToken = default) - { - return BaseStream.WriteAsync(arrowBuffer.Memory, cancellationToken); - } - private protected Offset SerializeSchema(Schema schema) { // Build metadata @@ -1056,6 +1082,15 @@ protected int CalculatePadding(long offset, int alignment = 8) } } + private static int CalculatePaddedBufferLength(int length) + { + long result = BitUtility.RoundUpToMultiplePowerOfTwo(length, MemoryAllocator.DefaultAlignment); + checked + { + return (int)result; + } + } + private protected void WritePadding(int length) { if (length > 0) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index d4d124668e081..5cc0d303e881e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,8 +8,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index ad6efbd7b45e7..5b7c10f35bed0 100644 --- 
a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 6f1b4e180e4fc..050d0f452cc4e 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 3febfc92b97c8..92f6e2d662f38 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,8 +15,8 @@ - - + + all runtime; build; native; contentfiles; analyzers diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index 25ef289f0dc25..700de58adb8c1 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -29,6 +29,12 @@ public void TestStandardCases() { foreach ((List testTargetArrayList, IArrowArray expectedArray) in GenerateTestData()) { + if (expectedArray is UnionArray) + { + // Union array concatenation is incorrect. See https://github.com/apache/arrow/issues/41198 + continue; + } + IArrowArray actualArray = ArrowArrayConcatenator.Concatenate(testTargetArrayList); ArrowReaderVerifier.CompareArrays(expectedArray, actualArray); } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index a0e90cbbc7c61..682ebec323dc0 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -185,6 +185,7 @@ public void SlicePrimitiveArrayWithNulls() TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => x.AppendNull().AppendNull().AppendNull()); // All nulls static void TestNumberSlice() where T : struct, INumber @@ -314,6 +315,8 @@ private void ValidateArrays(PrimitiveArray slicedArray) .SequenceEqual(slicedArray.Values)); Assert.Equal(baseArray.GetValue(slicedArray.Offset), slicedArray.GetValue(0)); + + ValidateNullCount(slicedArray); } private void ValidateArrays(BooleanArray slicedArray) @@ -333,6 +336,8 @@ private void ValidateArrays(BooleanArray slicedArray) #pragma warning disable CS0618 Assert.Equal(baseArray.GetBoolean(slicedArray.Offset), slicedArray.GetBoolean(0)); #pragma warning restore CS0618 + + ValidateNullCount(slicedArray); } private void ValidateArrays(BinaryArray slicedArray) @@ -347,6 +352,16 @@ private void ValidateArrays(BinaryArray slicedArray) .SequenceEqual(slicedArray.ValueOffsets)); Assert.True(baseArray.GetBytes(slicedArray.Offset).SequenceEqual(slicedArray.GetBytes(0))); + + ValidateNullCount(slicedArray); + } + + private static void ValidateNullCount(IArrowArray slicedArray) + { + var expectedNullCount = Enumerable.Range(0, slicedArray.Length) + .Select(i => slicedArray.IsNull(i) ? 
1 : 0) + .Sum(); + Assert.Equal(expectedNullCount, slicedArray.NullCount); } } } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs index 69b8410d030f2..faf650973d64c 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs @@ -15,8 +15,11 @@ using Apache.Arrow.Ipc; using System; +using System.Collections.Generic; using System.IO; +using System.Linq; using System.Threading.Tasks; +using Apache.Arrow.Types; using Xunit; namespace Apache.Arrow.Tests @@ -106,13 +109,38 @@ public async Task WritesFooterAlignedMultipleOf8Async() await ValidateRecordBatchFile(stream, originalBatch); } - private async Task ValidateRecordBatchFile(Stream stream, RecordBatch recordBatch) + [Theory] + [InlineData(0, 45)] + [InlineData(3, 45)] + [InlineData(16, 45)] + public async Task WriteSlicedArrays(int sliceOffset, int sliceLength) + { + var originalBatch = TestData.CreateSampleRecordBatch(length: 100); + var slicedArrays = originalBatch.Arrays + .Select(array => ArrowArrayFactory.Slice(array, sliceOffset, sliceLength)) + .ToList(); + var slicedBatch = new RecordBatch(originalBatch.Schema, slicedArrays, sliceLength); + + var stream = new MemoryStream(); + var writer = new ArrowFileWriter(stream, slicedBatch.Schema, leaveOpen: true); + + await writer.WriteRecordBatchAsync(slicedBatch); + await writer.WriteEndAsync(); + + stream.Position = 0; + + // Disable strict comparison because we don't expect buffers to match exactly + // due to writing slices of buffers, and instead need to compare array values + await ValidateRecordBatchFile(stream, slicedBatch, strictCompare: false); + } + + private async Task ValidateRecordBatchFile(Stream stream, RecordBatch recordBatch, bool strictCompare = true) { var reader = new ArrowFileReader(stream); int count = await reader.RecordBatchCountAsync(); Assert.Equal(1, count); RecordBatch readBatch = await reader.ReadRecordBatchAsync(0); - ArrowReaderVerifier.CompareBatches(recordBatch, readBatch); + ArrowReaderVerifier.CompareBatches(recordBatch, readBatch, strictCompare); } /// diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index ceeab92860e6f..07c8aa3f56b3b 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -160,7 +160,7 @@ public void Visit(StructArray array) Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(expectedArray.Offset, array.Offset); + Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); @@ -178,10 +178,42 @@ public void Visit(UnionArray array) Assert.Equal(expectedArray.Mode, array.Mode); Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(expectedArray.Offset, array.Offset); + Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); + if (_strictCompare) + { + Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span)); + } + else + { + for (int i = 0; i < expectedArray.Length; i++) + { + Assert.Equal(expectedArray.TypeIds[i], array.TypeIds[i]); + } + } + + if (_expectedArray is 
DenseUnionArray expectedDenseArray) + { + Assert.IsAssignableFrom(array); + var denseArray = array as DenseUnionArray; + Assert.NotNull(denseArray); + + if (_strictCompare) + { + Assert.True(expectedDenseArray.ValueOffsetBuffer.Span.SequenceEqual(denseArray.ValueOffsetBuffer.Span)); + } + else + { + for (int i = 0; i < expectedDenseArray.Length; i++) + { + Assert.Equal( + expectedDenseArray.ValueOffsets[i], denseArray.ValueOffsets[i]); + } + } + } + for (int i = 0; i < array.Fields.Count; i++) { array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare)); @@ -220,9 +252,9 @@ private void CompareBinaryArrays(BinaryArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -252,9 +284,9 @@ private void CompareVariadicArrays(BinaryViewArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); Assert.True(expectedArray.Views.SequenceEqual(actualArray.Views)); @@ -277,9 +309,9 @@ private void CompareArrays(FixedSizeBinaryArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -306,9 +338,9 @@ private void CompareArrays(PrimitiveArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -338,9 +370,9 @@ private void CompareArrays(BooleanArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -365,9 +397,9 @@ private void CompareArrays(ListArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -375,8 +407,9 @@ private void CompareArrays(ListArray actualArray) } else { + int offsetsStart = (expectedArray.Offset) * sizeof(int); int offsetsLength = (expectedArray.Length + 1) * sizeof(int); - Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength))); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength))); } actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); @@ -391,9 +424,9 @@ private void CompareArrays(ListViewArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { @@ -402,9 +435,10 @@ private void CompareArrays(ListViewArray actualArray) } else { + int start = expectedArray.Offset * sizeof(int); int length = expectedArray.Length * sizeof(int); - Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(0, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); - Assert.True(expectedArray.SizesBuffer.Span.Slice(0, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(start, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); + Assert.True(expectedArray.SizesBuffer.Span.Slice(start, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); } actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); @@ -419,23 +453,31 @@ private void CompareArrays(FixedSizeListArray actualArray) Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(expectedArray.Offset, actualArray.Offset); + Assert.Equal(0, actualArray.Offset); - CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); - actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); + var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize; + var expectedValuesSlice = ArrowArrayFactory.Slice( + 
expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize); + actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare)); } - private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, ArrowBuffer actualValidityBuffer) + private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer) { if (_strictCompare) { Assert.True(expectedValidityBuffer.Span.SequenceEqual(actualValidityBuffer.Span)); } - else if (nullCount != 0 && arrayLength > 0) + else if (actualValidityBuffer.IsEmpty) + { + Assert.True(nullCount == 0 || arrayLength == 0); + } + else if (expectedBufferOffset % 8 == 0) { int validityBitmapByteCount = BitUtility.ByteCount(arrayLength); - ReadOnlySpan expectedSpanPartial = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); + int byteOffset = BitUtility.ByteCount(expectedBufferOffset); + ReadOnlySpan expectedSpanPartial = expectedValidityBuffer.Span.Slice(byteOffset, validityBitmapByteCount - 1); ReadOnlySpan actualSpanPartial = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); // Compare the first validityBitmapByteCount - 1 bytes @@ -445,7 +487,7 @@ private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer e // Compare the last byte bitwise (because there is no guarantee about the value of // bits outside the range [0, arrayLength]) - ReadOnlySpan expectedSpanFull = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount); + ReadOnlySpan expectedSpanFull = expectedValidityBuffer.Span.Slice(byteOffset, validityBitmapByteCount); ReadOnlySpan actualSpanFull = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount); for (int i = 8 * (validityBitmapByteCount - 1); i < arrayLength; i++) { @@ -454,6 +496,18 @@ private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer e string.Format("Bit at index {0}/{1} is not equal", i, arrayLength)); } } + else + { + // Have to compare all values bitwise + var expectedSpan = expectedValidityBuffer.Span; + var actualSpan = actualValidityBuffer.Span; + for (int i = 0; i < arrayLength; i++) + { + Assert.True( + BitUtility.GetBit(expectedSpan, expectedBufferOffset + i) == BitUtility.GetBit(actualSpan, i), + string.Format("Bit at index {0}/{1} is not equal", i, arrayLength)); + } + } } } } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs index c4c0b6ec9ff21..db8369fa618e9 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs @@ -203,7 +203,37 @@ public async Task WriteBatchWithNullsAsync() await TestRoundTripRecordBatchAsync(originalBatch); } - private static void TestRoundTripRecordBatches(List originalBatches, IpcOptions options = null) + [Theory] + [InlineData(0, 45)] + [InlineData(3, 45)] + [InlineData(16, 45)] + public void WriteSlicedArrays(int sliceOffset, int sliceLength) + { + var originalBatch = TestData.CreateSampleRecordBatch(length: 100); + var slicedArrays = originalBatch.Arrays + .Select(array => ArrowArrayFactory.Slice(array, sliceOffset, sliceLength)) + .ToList(); + var slicedBatch = new RecordBatch(originalBatch.Schema, slicedArrays, sliceLength); + + TestRoundTripRecordBatch(slicedBatch, strictCompare: false); + } + + [Theory] + [InlineData(0, 45)] + [InlineData(3, 45)] + [InlineData(16, 45)] + public async Task 
WriteSlicedArraysAsync(int sliceOffset, int sliceLength) + { + var originalBatch = TestData.CreateSampleRecordBatch(length: 100); + var slicedArrays = originalBatch.Arrays + .Select(array => ArrowArrayFactory.Slice(array, sliceOffset, sliceLength)) + .ToList(); + var slicedBatch = new RecordBatch(originalBatch.Schema, slicedArrays, sliceLength); + + await TestRoundTripRecordBatchAsync(slicedBatch, strictCompare: false); + } + + private static void TestRoundTripRecordBatches(List originalBatches, IpcOptions options = null, bool strictCompare = true) { using (MemoryStream stream = new MemoryStream()) { @@ -223,13 +253,13 @@ private static void TestRoundTripRecordBatches(List originalBatches foreach (RecordBatch originalBatch in originalBatches) { RecordBatch newBatch = reader.ReadNextRecordBatch(); - ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch, strictCompare: strictCompare); } } } } - private static async Task TestRoundTripRecordBatchesAsync(List originalBatches, IpcOptions options = null) + private static async Task TestRoundTripRecordBatchesAsync(List originalBatches, IpcOptions options = null, bool strictCompare = true) { using (MemoryStream stream = new MemoryStream()) { @@ -249,20 +279,20 @@ private static async Task TestRoundTripRecordBatchesAsync(List orig foreach (RecordBatch originalBatch in originalBatches) { RecordBatch newBatch = reader.ReadNextRecordBatch(); - ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch, strictCompare: strictCompare); } } } } - private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null) + private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null, bool strictCompare = true) { - TestRoundTripRecordBatches(new List { originalBatch }, options); + TestRoundTripRecordBatches(new List { originalBatch }, options, strictCompare: strictCompare); } - private static async Task TestRoundTripRecordBatchAsync(RecordBatch originalBatch, IpcOptions options = null) + private static async Task TestRoundTripRecordBatchAsync(RecordBatch originalBatch, IpcOptions options = null, bool strictCompare = true) { - await TestRoundTripRecordBatchesAsync(new List { originalBatch }, options); + await TestRoundTripRecordBatchesAsync(new List { originalBatch }, options, strictCompare: strictCompare); } [Fact] diff --git a/csharp/test/Apache.Arrow.Tests/BinaryViewArrayTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryViewArrayTests.cs new file mode 100644 index 0000000000000..7c18a49e96944 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/BinaryViewArrayTests.cs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using Xunit; + +namespace Apache.Arrow.Tests; + +public class BinaryViewArrayTests +{ + [Fact] + public void SliceBinaryViewArray() + { + var array = new BinaryViewArray.Builder() + .Append(new byte[] { 0, 1, 2 }) + .Append(new byte[] { 3, 4 }) + .AppendNull() + .Append(new byte[] { 5, 6 }) + .Append(new byte[] { 7, 8 }) + .Build(); + + var slice = (BinaryViewArray)array.Slice(1, 3); + + Assert.Equal(3, slice.Length); + Assert.Equal(new byte[] {3, 4}, slice.GetBytes(0).ToArray()); + Assert.True(slice.GetBytes(1).IsEmpty); + Assert.Equal(new byte[] {5, 6}, slice.GetBytes(2).ToArray()); + } +} diff --git a/csharp/test/Apache.Arrow.Tests/Decimal128ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Decimal128ArrayTests.cs index fdc07effb715f..c5e0647f60a39 100644 --- a/csharp/test/Apache.Arrow.Tests/Decimal128ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Decimal128ArrayTests.cs @@ -458,5 +458,48 @@ public void AppendRangeSqlDecimal() } } } + + [Fact] + public void SliceDecimal128Array() + { + // Arrange + const int originalLength = 50; + const int offset = 3; + const int sliceLength = 32; + + var builder = new Decimal128Array.Builder(new Decimal128Type(14, 10)); + var random = new Random(); + + for (int i = 0; i < originalLength; i++) + { + if (random.NextDouble() < 0.2) + { + builder.AppendNull(); + } + else + { + builder.Append(i * (decimal)Math.Round(random.NextDouble(), 10)); + } + } + + var array = builder.Build(); + + // Act + var slice = (Decimal128Array)array.Slice(offset, sliceLength); + + // Assert + Assert.NotNull(slice); + Assert.Equal(sliceLength, slice.Length); + for (int i = 0; i < sliceLength; ++i) + { + Assert.Equal(array.GetValue(offset + i), slice.GetValue(i)); + Assert.Equal(array.GetSqlDecimal(offset + i), slice.GetSqlDecimal(i)); + Assert.Equal(array.GetString(offset + i), slice.GetString(i)); + } + + Assert.Equal( + array.ToList(includeNulls: true).Skip(offset).Take(sliceLength).ToList(), + slice.ToList(includeNulls: true)); + } } } diff --git a/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs index baeb7ee5419b9..ba0eb5f017766 100644 --- a/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Decimal256ArrayTests.cs @@ -476,5 +476,56 @@ public void AppendRangeSqlDecimal() } } } + + [Fact] + public void SliceDecimal256Array() + { + // Arrange + const int originalLength = 50; + const int offset = 3; + const int sliceLength = 32; + + var builder = new Decimal256Array.Builder(new Decimal256Type(14, 10)); + var random = new Random(); + + for (int i = 0; i < originalLength; i++) + { + if (random.NextDouble() < 0.2) + { + builder.AppendNull(); + } + else + { + builder.Append(i * (decimal)Math.Round(random.NextDouble(), 10)); + } + } + + var array = builder.Build(); + + // Act + var slice = (Decimal256Array)array.Slice(offset, sliceLength); + + // Assert + Assert.NotNull(slice); + Assert.Equal(sliceLength, slice.Length); + for (int i = 0; i < sliceLength; ++i) + { + Assert.Equal(array.GetValue(offset + i), slice.GetValue(i)); + if (array.TryGetSqlDecimal(offset + i, out var expectedSqlDecimal)) + { + Assert.True(slice.TryGetSqlDecimal(i, out var actualSqlDecimal)); + Assert.Equal(expectedSqlDecimal, actualSqlDecimal); + } + else + { + Assert.False(slice.TryGetSqlDecimal(i, out _)); + } + Assert.Equal(array.GetString(offset + i), slice.GetString(i)); + } + + Assert.Equal( + array.ToList(includeNulls: true).Skip(offset).Take(sliceLength).ToList(), + slice.ToList(includeNulls: 
true)); + } } } diff --git a/csharp/test/Apache.Arrow.Tests/FixedSizeBinaryArrayTests.cs b/csharp/test/Apache.Arrow.Tests/FixedSizeBinaryArrayTests.cs new file mode 100644 index 0000000000000..abc66d6ce9c9d --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/FixedSizeBinaryArrayTests.cs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Linq; +using Apache.Arrow.Arrays; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests; + +public class FixedSizeBinaryArrayTests +{ + [Fact] + public void SliceFixedSizeBinaryArray() + { + const int byteWidth = 2; + const int length = 5; + const int nullCount = 1; + + var validityBuffer = new ArrowBuffer.BitmapBuilder() + .AppendRange(true, 2) + .Append(false) + .AppendRange(true, 2) + .Build(); + var dataBuffer = new ArrowBuffer.Builder() + .AppendRange(Enumerable.Range(0, length * byteWidth).Select(i => (byte)i)) + .Build(); + var arrayData = new ArrayData( + new FixedSizeBinaryType(byteWidth), + length, nullCount, 0, new [] {validityBuffer, dataBuffer}); + var array = new FixedSizeBinaryArray(arrayData); + + var slice = (FixedSizeBinaryArray)array.Slice(1, 3); + + Assert.Equal(3, slice.Length); + Assert.Equal(new byte[] {2, 3}, slice.GetBytes(0).ToArray()); + Assert.True(slice.GetBytes(1).IsEmpty); + Assert.Equal(new byte[] {6, 7}, slice.GetBytes(2).ToArray()); + } +} diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 1fb5cf2415c68..712a87a252b6c 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
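The new FixedSizeBinaryArrayTests above pin down slice semantics for fixed-width binary data. For comparison, the same scenario expressed with pyarrow (illustrative only, not part of this diff):

import pyarrow as pa

# Five 2-byte values with one null, sliced to the middle three elements.
arr = pa.array([b"\x00\x01", b"\x02\x03", None, b"\x06\x07", b"\x08\x09"],
               type=pa.binary(2))
sliced = arr.slice(1, 3)
assert sliced.to_pylist() == [b"\x02\x03", None, b"\x06\x07"]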
+using System; using System.Linq; using Apache.Arrow.Types; using Xunit; @@ -24,17 +25,116 @@ public class UnionArrayTests [Theory] [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] - public void UnionArray_IsNull(UnionMode mode) + public void UnionArrayIsNull(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 100); + + for (var i = 0; i < array.Length; ++i) + { + Assert.Equal(expectedNull[i], array.IsNull(i)); + Assert.Equal(!expectedNull[i], array.IsValid(i)); + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArraySlice(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 10); + + for (var offset = 0; offset < array.Length; ++offset) + { + for (var length = 0; length < array.Length - offset; ++length) + { + var slicedArray = (UnionArray)ArrowArrayFactory.Slice(array, offset, length); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); + nullCount += expectedNull[offset + i] ? 1 : 0; + + CompareValue(array, offset + i, slicedArray, i); + } + + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArrayConstructedWithOffset(UnionMode mode) + { + const int length = 10; + var (array, expectedNull) = BuildUnionArray(mode, length); + + for (var offset = 0; offset < array.Length; ++offset) + { + var (slicedArray, _) = BuildUnionArray(mode, length, offset); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); + nullCount += expectedNull[offset + i] ? 
1 : 0; + + CompareValue(array, offset + i, slicedArray, i); + } + + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + + private static void CompareValue(UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + { + var typeId = originalArray.TypeIds[originalIndex]; + var sliceTypeId = slicedArray.TypeIds[sliceIndex]; + Assert.Equal(typeId, sliceTypeId); + + switch (typeId) + { + case 0: + CompareFieldValue(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + case 1: + CompareFieldValue(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + default: + throw new Exception($"Unexpected type id {typeId}"); + } + } + + private static void CompareFieldValue(byte typeId, UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + where T: struct + where TArray : PrimitiveArray + { + if (originalArray is DenseUnionArray denseOriginalArray) + { + Assert.IsType(slicedArray); + + originalIndex = denseOriginalArray.ValueOffsets[originalIndex]; + sliceIndex = ((DenseUnionArray)slicedArray).ValueOffsets[sliceIndex]; + } + var originalValue = ((TArray)originalArray.Fields[typeId]).GetValue(originalIndex); + var sliceValue = ((TArray)slicedArray.Fields[typeId]).GetValue(sliceIndex); + Assert.Equal(originalValue, sliceValue); + } + + private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length, int offset=0) { var fields = new Field[] { new Field("field0", new Int32Type(), true), new Field("field1", new FloatType(), true), }; - var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray(); + var typeIds = new[] { 0, 1 }; var type = new UnionType(fields, typeIds, mode); - const int length = 100; var nullCount = 0; var field0Builder = new Int32Array.Builder(); var field1Builder = new FloatArray.Builder(); @@ -44,9 +144,9 @@ public void UnionArray_IsNull(UnionMode mode) for (var i = 0; i < length; ++i) { - var isNull = i % 5 == 0; + var isNull = i % 3 == 0; expectedNull[i] = isNull; - nullCount += isNull ? 1 : 0; + nullCount += (isNull && i >= offset) ? 1 : 0; if (i % 2 == 0) { @@ -101,13 +201,9 @@ public void UnionArray_IsNull(UnionMode mode) }; UnionArray array = mode == UnionMode.Dense - ? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount) - : new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount); + ? new DenseUnionArray(type, length - offset, children, typeIdsBuffer, valuesOffsetBuffer, nullCount, offset) + : new SparseUnionArray(type, length - offset, children, typeIdsBuffer, nullCount, offset); - for (var i = 0; i < length; ++i) - { - Assert.Equal(expectedNull[i], array.IsNull(i)); - Assert.Equal(!expectedNull[i], array.IsValid(i)); - } + return (array, expectedNull); } } diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 8226eb5db360a..06d3016c72af9 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -122,7 +122,10 @@ cmake --build . 
--target INSTALL --config Release || exit /B 1 @rem Needed so python-test.exe works set PYTHONPATH_ORIGINAL=%PYTHONPATH% set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH% -ctest -j%NUMBER_OF_PROCESSORS% --output-on-failure || exit /B 1 +ctest ^ + --build-config Release ^ + --output-on-failure ^ + --parallel %NUMBER_OF_PROCESSORS% || exit /B 1 set PYTHONPATH=%PYTHONPATH_ORIGINAL% popd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index f18b18aaa997c..cf49751e6e2a9 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -831,7 +831,9 @@ test_glib() { show_header "Build and test C GLib libraries" # Build and test C GLib - maybe_setup_conda glib gobject-introspection meson ninja ruby + # We can unpin gobject-introspection after + # https://github.com/conda-forge/glib-feedstock/pull/174 is merged. + maybe_setup_conda glib gobject-introspection=1.78.1 meson ninja ruby maybe_setup_virtualenv meson # Install bundler if doesn't exist diff --git a/dev/tasks/docker-tests/azure.linux.yml b/dev/tasks/docker-tests/azure.linux.yml deleted file mode 100644 index b66bfbdfe940a..0000000000000 --- a/dev/tasks/docker-tests/azure.linux.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -jobs: -- job: linux - pool: - vmImage: ubuntu-latest - timeoutInMinutes: 360 - {% if env is defined %} - variables: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.8' - - - script: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - git -C arrow remote add upstream https://github.com/apache/arrow.git - displayName: Clone arrow - - - script: pip install -e arrow/dev/archery[docker] - displayName: Setup Archery - - - script: | - archery --debug docker --using-docker-cli run \ - -e ARROW_DOCS_VERSION="{{ arrow.no_rc_version }}" \ - -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ - {{ flags|default("") }} \ - {{ image }} \ - {{ command|default("") }} - displayName: Execute Docker Build - env: - {{ macros.azure_set_sccache_envvars()|indent(4) }} - - {% if post_script is defined %} - - script: | - {{ post_script|indent(6) }} - displayName: Post Script - {% endif %} - - {% if artifacts is defined %} - {{ macros.azure_upload_releases(artifacts) }} - {% endif %} diff --git a/dev/tasks/docker-tests/circle.linux.yml b/dev/tasks/docker-tests/circle.linux.yml deleted file mode 100644 index faad449c8ef88..0000000000000 --- a/dev/tasks/docker-tests/circle.linux.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -version: 2 -jobs: - build: - machine: - image: ubuntu-1604:202004-01 - {%- if env is defined %} - environment: - {%- for key, value in env.items() %} - {{ key }}: {{ value }} - {%- endfor %} - {%- endif %} - steps: - - run: | - docker -v - docker-compose -v - - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - run: - name: Execute Docker Build - command: | - pyenv versions - pyenv global 3.8.12 - pip install -e arrow/dev/archery[docker] - archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" {{ run }} - no_output_timeout: "1h" - -workflows: - version: 2 - build: - jobs: - - build diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index f55a7f9481e56..1e819d3cf4556 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -59,7 +59,8 @@ env: {%- macro github_install_archery() -%} - name: Set up Python by actions/setup-python - if: runner.arch == 'X64' + if: | + !(runner.os == 'Linux' && runner.arch != 'X64') uses: actions/setup-python@v4 with: cache: 'pip' @@ -86,7 +87,8 @@ env: {%- macro github_upload_releases(pattern) -%} - name: Set up Python by actions/setup-python - if: runner.arch == 'X64' + if: | + !(runner.os == 'Linux' && runner.arch != 'X64') uses: actions/setup-python@v4 with: python-version: 3.12 diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index cf99c84c60bfd..ce9613545eb54 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -50,6 +50,14 @@ jobs: run: | brew list + # CMake 3.29.1 that is pre-installed on the macOS image has a problem. + # See also: https://github.com/microsoft/vcpkg/issues/37968 + - name: Install CMake 3.29.0 + shell: bash + run: | + arrow/ci/scripts/install_cmake.sh $(arch) macos 3.29.0 ${PWD}/local + echo "${PWD}/local/bin" >> $GITHUB_PATH + - name: Retrieve VCPKG version from arrow/.env run: | vcpkg_version=$(cat "arrow/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f98c0a2b48caa..da9d2cefe5f51 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1049,14 +1049,13 @@ tasks: image: {{ image }} {% endfor %} - # Use azure to run valgrind tests to prevent OOM test-conda-cpp-valgrind: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: image: conda-cpp-valgrind -{% for ubuntu_version in ["20.04", "22.04", "24.04"] %} +{% for ubuntu_version in ["20.04", "22.04"] %} test-ubuntu-{{ ubuntu_version }}-cpp: ci: github template: docker-tests/github.linux.yml @@ -1074,13 +1073,25 @@ tasks: UBUNTU: 20.04 image: ubuntu-cpp-bundled + test-ubuntu-24.04-cpp: + ci: github + template: docker-tests/github.linux.yml + params: + env: + CLANG_TOOLS: 15 + LLVM: 15 + UBUNTU: 24.04 + image: ubuntu-cpp + test-ubuntu-24.04-cpp-gcc-14: ci: github template: docker-tests/github.linux.yml params: env: - UBUNTU: "24.04" + CLANG_TOOLS: 15 GCC_VERSION: 14 + LLVM: 15 + UBUNTU: 24.04 # rapidjson 1.1.0 has an error caught by gcc 14. 
# https://github.com/Tencent/rapidjson/issues/718 flags: -e CC=gcc-14 -e CXX=g++-14 -e RapidJSON_SOURCE=BUNDLED @@ -1215,8 +1226,8 @@ tasks: image: conda-python-cython2 test-debian-12-python-3-amd64: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: DEBIAN: 12 @@ -1233,8 +1244,8 @@ tasks: image: debian-python test-ubuntu-20.04-python-3: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: UBUNTU: 20.04 @@ -1249,16 +1260,16 @@ tasks: image: ubuntu-python test-fedora-39-python-3: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: FEDORA: 39 image: fedora-python test-r-linux-valgrind: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: ARROW_R_DEV: "TRUE" @@ -1419,16 +1430,16 @@ tasks: flags: "-e LIBARROW_MINIMAL=TRUE" test-ubuntu-r-sanitizer: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE image: ubuntu-r-sanitizer test-fedora-r-clang-sanitizer: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE @@ -1436,8 +1447,8 @@ tasks: {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: DEBIAN: 12 diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index 8f160ee7c4fd2..3b337bb175005 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -78,7 +78,8 @@ cmake --build . --target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -ctest --output-on-failure ^ +ctest --build-config Release ^ + --output-on-failure ^ --parallel %NUMBER_OF_PROCESSORS% ^ --timeout 300 || exit /B 1 diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 618c997c2527b..af12db595286f 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -15,14 +15,9 @@ # specific language governing permissions and limitations # under the License. -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow +{% import 'macros.jinja' as macros with context %} -on: - push: - branches: - - "*-github-*" +{{ macros.github_header() }} jobs: test-vcpkg-win: @@ -31,12 +26,14 @@ jobs: env: VCPKG_BINARY_SOURCES: 'clear;nuget,GitHub,readwrite' steps: - - name: Checkout Arrow + {{ macros.github_checkout_arrow()|indent }} + # CMake 3.29.1 that is pre-installed on the Windows image has a problem. 
+ # See also: https://github.com/microsoft/vcpkg/issues/37968 + - name: Install CMake 3.29.0 + shell: bash run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive + arrow/ci/scripts/install_cmake.sh amd64 windows 3.29.0 /c/cmake + echo "c:\\cmake\\bin" >> $GITHUB_PATH - name: Download Timezone Database shell: bash run: arrow/ci/scripts/download_tz_database.sh @@ -59,7 +56,7 @@ jobs: CALL setx PATH "%PATH%;C:\vcpkg" - name: Setup NuGet Credentials shell: bash - env: + env: GITHUB_TOKEN: {{ '${{ secrets.GITHUB_TOKEN }}' }} run: | `vcpkg fetch nuget | tail -n 1` \ diff --git a/docker-compose.yml b/docker-compose.yml index 46717557bc337..60edf1420bc0f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -711,10 +711,11 @@ services: # Usage: # docker-compose run --rm conan # Parameters: - # CONAN: gcc11, gcc11-armv7, ... + # CONAN_BASE: gcc11, gcc11-armv7, ... + # CONAN_VERSION: 1.62.0 # See https://github.com/conan-io/conan-docker-tools#readme for # available images. - image: conanio/${CONAN} + image: conanio/${CONAN_BASE}:${CONAN_VERSION} user: root:root shm_size: *shm-size ulimits: *ulimits @@ -724,7 +725,7 @@ services: - .:/arrow:delegated command: >- /bin/bash -c " - /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin && + sudo /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin && /arrow/ci/scripts/conan_setup.sh && /arrow/ci/scripts/conan_build.sh /arrow /build" diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index b7261005e66ee..8df0ef0b1fe99 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -37,14 +37,14 @@ under the hood, you can implement the following methods on those objects: - ``__arrow_c_schema__`` for schema or type-like objects. - ``__arrow_c_array__`` for arrays and record batches (contiguous tables). -- ``__arrow_c_stream__`` for chunked tables or streams of data. +- ``__arrow_c_stream__`` for chunked arrays, tables and streams of data. Those methods return `PyCapsule `__ objects, and more details on the exact semantics can be found in the :ref:`specification `. When your data structures have those methods defined, the PyArrow constructors -(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +(see below) will recognize those objects as supporting this protocol, and convert them to PyArrow data structures zero-copy. And the same can be true for any other library supporting this protocol on ingesting data. @@ -53,6 +53,31 @@ support for this protocol by checking for the presence of those methods, and therefore accept any Arrow data (instead of hardcoding support for a specific Arrow producer such as PyArrow). 
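The paragraph above describes consumers that duck-type on the protocol methods rather than hardcoding a producer. A minimal sketch of such a consumer; the function name and error text are illustrative, not from the diff:

import pyarrow as pa

def ingest_table(obj):
    # Accept any Arrow producer, not just pyarrow objects, by checking for
    # the PyCapsule protocol methods instead of a specific type.
    if hasattr(obj, "__arrow_c_stream__") or hasattr(obj, "__arrow_c_array__"):
        return pa.table(obj)
    raise TypeError("object does not expose Arrow PyCapsule data")

assert ingest_table(pa.table({"a": [1, 2]})).num_rows == 2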
+For consuming data through this protocol with PyArrow, the following constructors +can be used to create the various PyArrow objects: + ++----------------------------+-----------------------------------------------+--------------------+ +| Result class | PyArrow constructor | Supported protocol | ++============================+===============================================+====================+ +| :class:`Array` | :func:`pyarrow.array` | array | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`ChunkedArray` | :func:`pyarrow.chunked_array` | array, stream | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`RecordBatch` | :func:`pyarrow.record_batch` | array | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`Table` | :func:`pyarrow.table` | array, stream | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`RecordBatchReader` | :meth:`pyarrow.RecordBatchReader.from_stream` | stream | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`Field` | :func:`pyarrow.field` | schema | ++----------------------------+-----------------------------------------------+--------------------+ +| :class:`Schema` | :func:`pyarrow.schema` | schema | ++----------------------------+-----------------------------------------------+--------------------+ + +A :class:`DataType` can be created by consuming the schema-compatible object +using :func:`pyarrow.field` and then accessing the ``.type`` of the resulting +Field. + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol diff --git a/go/parquet/internal/utils/bit_reader_test.go b/go/parquet/internal/utils/bit_reader_test.go index 3e5d4ed724bc5..91202979520ef 100644 --- a/go/parquet/internal/utils/bit_reader_test.go +++ b/go/parquet/internal/utils/bit_reader_test.go @@ -59,6 +59,23 @@ func TestBitWriter(t *testing.T) { assert.Equal(t, byte(0xAA), buf[0]) assert.Equal(t, byte(0xCC), buf[1]) + + for i := 0; i < 3; i++ { + assert.True(t, bw.WriteVlqInt(uint64(i))) + } + assert.Equal(t, byte(0xAA), buf[0]) + assert.Equal(t, byte(0xCC), buf[1]) + assert.Equal(t, byte(0), buf[2]) + assert.Equal(t, byte(1), buf[3]) + assert.Equal(t, byte(2), buf[4]) +} + +func BenchmarkBitWriter(b *testing.B) { + buf := make([]byte, b.N) + bw := utils.NewBitWriter(utils.NewWriterAtBuffer(buf)) + for i := 0; i < b.N; i++ { + assert.True(b, bw.WriteVlqInt(uint64(1))) + } } func TestBitReader(t *testing.T) { diff --git a/go/parquet/internal/utils/bit_writer.go b/go/parquet/internal/utils/bit_writer.go index 106461d33e048..cf1d6cf13b113 100644 --- a/go/parquet/internal/utils/bit_writer.go +++ b/go/parquet/internal/utils/bit_writer.go @@ -75,6 +75,7 @@ type BitWriter struct { byteoffset int bitoffset uint raw [8]byte + buf [binary.MaxVarintLen64]byte } // NewBitWriter initializes a new bit writer to write to the passed in interface @@ -163,9 +164,8 @@ func (b *BitWriter) WriteAligned(val uint64, nbytes int) bool { // without buffering. 
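The WriteVlqInt hunk below hoists the varint scratch buffer into the BitWriter struct so each call stops allocating (the new BenchmarkBitWriter above measures exactly this). For reference, an illustrative Python sketch of the uvarint (LEB128) encoding it writes, into a caller-owned scratch buffer:

# Encode v into buf as a base-128 varint with continuation bits, matching
# Go's binary.PutUvarint; returns the number of bytes written.
def put_uvarint(buf: bytearray, v: int) -> int:
    n = 0
    while v >= 0x80:
        buf[n] = (v & 0x7F) | 0x80
        v >>= 7
        n += 1
    buf[n] = v
    return n + 1

scratch = bytearray(10)  # binary.MaxVarintLen64
assert scratch[:put_uvarint(scratch, 300)] == b"\xac\x02"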
func (b *BitWriter) WriteVlqInt(v uint64) bool { b.Flush(true) - var buf [binary.MaxVarintLen64]byte - nbytes := binary.PutUvarint(buf[:], v) - if _, err := b.wr.WriteAt(buf[:nbytes], int64(b.byteoffset)); err != nil { + nbytes := binary.PutUvarint(b.buf[:], v) + if _, err := b.wr.WriteAt(b.buf[:nbytes], int64(b.byteoffset)); err != nil { log.Println(err) return false } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 45fd29ad3b3f3..60fc09ea861b6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1516,11 +1516,28 @@ cdef class Array(_PandasConvertible): def _to_pandas(self, options, types_mapper=None, **kwargs): return _array_like_to_pandas(self, options, types_mapper=types_mapper) - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + try: + values = self.to_numpy(zero_copy_only=True) + except ArrowInvalid: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested.\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # values is already a numpy array at this point, but calling np.array(..) + # again to handle the `dtype` keyword with a no-copy guarantee + return np.array(values, dtype=dtype, copy=False) + values = self.to_numpy(zero_copy_only=False) + if copy is True and is_numeric(self.type.id) and self.null_count == 0: + # to_numpy did not yet make a copy (is_numeric = integer/floats, no decimal) + return np.array(values, dtype=dtype, copy=True) + if dtype is None: return values - return values.astype(dtype) + return np.asarray(values, dtype=dtype) def to_numpy(self, zero_copy_only=True, writable=False): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a35919579541a..6dae45ab80b1c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -173,6 +173,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() c_bool is_primitive(Type type) + c_bool is_numeric(Type type) cdef cppclass CArrayData" arrow::ArrayData": shared_ptr[CDataType] type diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 6b3c7d0b56266..379bb82ea6ede 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -525,11 +525,19 @@ cdef class ChunkedArray(_PandasConvertible): return values - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested " + "(converting a pyarrow.ChunkedArray always results in a copy).\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # 'copy' can further be ignored because to_numpy() already returns a copy values = self.to_numpy() if dtype is None: return values - return values.astype(dtype) + return values.astype(dtype, copy=False) def cast(self, object target_type=None, safe=None, options=None): """ @@ -1344,17 +1352,28 @@ cdef class ChunkedArray(_PandasConvertible): A capsule containing a C ArrowArrayStream struct. 
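The hunks that follow let ChunkedArray.__arrow_c_stream__ cast to a requested schema on export, and let pyarrow.chunked_array() ingest any object exposing the stream protocol. A minimal producer exercising the ingest path; the class name is hypothetical, for illustration only:

import pyarrow as pa

# Hypothetical third-party object whose only Arrow integration is the
# PyCapsule stream protocol; pa.chunked_array() can consume it directly.
class StreamOnly:
    def __init__(self, chunked):
        self._chunked = chunked

    def __arrow_c_stream__(self, requested_schema=None):
        return self._chunked.__arrow_c_stream__(requested_schema)

source = StreamOnly(pa.chunked_array([[1, 2], [3]], type=pa.int64()))
result = pa.chunked_array(source)
assert result == pa.chunked_array([[1, 2], [3]], type=pa.int64())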
""" cdef: + ChunkedArray chunked ArrowArrayStream* c_stream = NULL if requested_schema is not None: - out_type = DataType._import_from_c_capsule(requested_schema) - if self.type != out_type: - raise NotImplementedError("Casting to requested_schema") + target_type = DataType._import_from_c_capsule(requested_schema) + + if target_type != self.type: + try: + chunked = self.cast(target_type, safe=True) + except ArrowInvalid as e: + raise ValueError( + f"Could not cast {self.type} to requested type {target_type}: {e}" + ) + else: + chunked = self + else: + chunked = self stream_capsule = alloc_c_stream(&c_stream) with nogil: - check_status(ExportChunkedArray(self.sp_chunked_array, c_stream)) + check_status(ExportChunkedArray(chunked.sp_chunked_array, c_stream)) return stream_capsule @@ -1397,6 +1416,9 @@ def chunked_array(arrays, type=None): ---------- arrays : Array, list of Array, or array-like Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. type : DataType or string coercible to DataType Returns @@ -1437,6 +1459,21 @@ def chunked_array(arrays, type=None): if isinstance(arrays, Array): arrays = [arrays] + elif hasattr(arrays, "__arrow_c_stream__"): + if type is not None: + requested_type = type.__arrow_c_schema__() + else: + requested_type = None + capsule = arrays.__arrow_c_stream__(requested_type) + result = ChunkedArray._import_from_c_capsule(capsule) + if type is not None and result.type != type: + # __arrow_c_stream__ coerces schema with best effort, so we might + # need to cast it if the producer wasn't able to cast to exact schema. + result = result.cast(type) + return result + elif hasattr(arrays, "__arrow_c_array__"): + arr = array(arrays, type=type) + arrays = [arr] for x in arrays: arr = x if isinstance(x, Array) else array(x, type=type) @@ -1533,7 +1570,16 @@ cdef class _Tabular(_PandasConvertible): raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use " f"one of the `{self.__class__.__name__}.from_*` functions instead.") - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested " + f"(converting a pyarrow.{self.__class__.__name__} always results " + "in a copy).\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # 'copy' can further be ignored because stacking will result in a copy column_arrays = [ np.asarray(self.column(i), dtype=dtype) for i in range(self.num_columns) ] @@ -2787,8 +2833,17 @@ cdef class RecordBatch(_Tabular): Parameters ---------- - names : list of str - List of new column names. + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. 
Returns ------- @@ -2809,13 +2864,38 @@ cdef class RecordBatch(_Tabular): ---- n: [2,4,5,100] name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] """ cdef: shared_ptr[CRecordBatch] c_batch vector[c_string] c_names - for name in names: - c_names.push_back(tobytes(name)) + if isinstance(names, list): + for name in names: + c_names.push_back(tobytes(name)) + elif isinstance(names, dict): + idx_to_new_name = {} + for name, new_name in names.items(): + indices = self.schema.get_all_field_indices(name) + + if not indices: + raise KeyError("Column {!r} not found".format(name)) + + for index in indices: + idx_to_new_name[index] = new_name + + for i in range(self.num_columns): + new_name = idx_to_new_name.get(i, self.column_names[i]) + c_names.push_back(tobytes(new_name)) + else: + raise TypeError(f"names must be a list or dict not {type(names)!r}") with nogil: c_batch = GetResultValue(self.batch.RenameColumns(move(c_names))) @@ -5186,8 +5266,17 @@ cdef class Table(_Tabular): Parameters ---------- - names : list of str - List of new column names. + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + Columns that do not appear in the mapping keep their current names; if any of + the old column names in the mapping are not found among the existing columns, + a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. Returns ------- @@ -5208,13 +5297,37 @@ cdef class Table(_Tabular): ---- n: [[2,4,5,100]] name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] """ cdef: shared_ptr[CTable] c_table vector[c_string] c_names - for name in names: - c_names.push_back(tobytes(name)) + if isinstance(names, list): + for name in names: + c_names.push_back(tobytes(name)) + elif isinstance(names, dict): + idx_to_new_name = {} + for name, new_name in names.items(): + indices = self.schema.get_all_field_indices(name) + + if not indices: + raise KeyError("Column {!r} not found".format(name)) + + for index in indices: + idx_to_new_name[index] = new_name + + for i in range(self.num_columns): + c_names.push_back(tobytes(idx_to_new_name.get(i, self.schema[i].name))) + else: + raise TypeError(f"names must be a list or dict not {type(names)!r}") with nogil: c_table = GetResultValue(self.table.RenameColumns(move(c_names))) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 472a6c5dce750..156d58326b961 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -31,6 +31,7 @@ import pyarrow as pa import pyarrow.tests.strategies as past +from pyarrow.vendored.version import Version def test_total_bytes_allocated(): @@ -3302,6 +3303,52 @@ def test_array_from_large_pyints(): pa.array([int(2 ** 63)]) +def test_numpy_array_protocol(): + # test the __array__ method on pyarrow.Array + arr = pa.array([1, 2, 3]) + result = np.asarray(arr) + expected = np.array([1, 2, 3], dtype="int64") + np.testing.assert_array_equal(result, expected) + + # 
this should not raise a deprecation warning with numpy 2.0+ + result = np.array(arr, copy=False) + np.testing.assert_array_equal(result, expected) + + result = np.array(arr, dtype="int64", copy=False) + np.testing.assert_array_equal(result, expected) + + # no zero-copy is possible + arr = pa.array([1, 2, None]) + expected = np.array([1, 2, np.nan], dtype="float64") + result = np.asarray(arr) + np.testing.assert_array_equal(result, expected) + + if Version(np.__version__) < Version("2.0"): + # copy keyword is not strict and not passed down to __array__ + result = np.array(arr, copy=False) + np.testing.assert_array_equal(result, expected) + + result = np.array(arr, dtype="float64", copy=False) + np.testing.assert_array_equal(result, expected) + else: + # starting with numpy 2.0, the copy=False keyword is assumed to be strict + with pytest.raises(ValueError, match="Unable to avoid a copy"): + np.array(arr, copy=False) + + arr = pa.array([1, 2, 3]) + with pytest.raises(ValueError): + np.array(arr, dtype="float64", copy=False) + + # copy=True -> not yet passed by numpy, so we have to call this directly to test + arr = pa.array([1, 2, 3]) + result = arr.__array__(copy=True) + assert result.flags.writeable + + arr = pa.array([1, 2, 3]) + result = arr.__array__(dtype=np.dtype("float64"), copy=True) + assert result.dtype == "float64" + + def test_array_protocol(): class MyArray: @@ -3382,7 +3429,7 @@ def __arrow_c_array__(self, requested_schema=None): result = pa.array(arr) assert result == arr.data - # Will case to requested type + # Will cast to requested type result = pa.array(arr, type=pa.int32()) assert result == pa.array([1, 2, 3], type=pa.int32()) diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index f8b2ea15d31ad..5bf41c3c14b6e 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -692,8 +692,16 @@ def test_roundtrip_chunked_array_capsule_requested_schema(): imported_chunked = pa.ChunkedArray._import_from_c_capsule(capsule) assert imported_chunked == chunked - # Casting to something else should error + # Casting to something else should error if not possible requested_type = pa.binary() requested_capsule = requested_type.__arrow_c_schema__() - with pytest.raises(NotImplementedError): + capsule = chunked.__arrow_c_stream__(requested_capsule) + imported_chunked = pa.ChunkedArray._import_from_c_capsule(capsule) + assert imported_chunked == chunked.cast(pa.binary()) + + requested_type = pa.int64() + requested_capsule = requested_type.__arrow_c_schema__() + with pytest.raises( + ValueError, match="Could not cast string to requested type int64" + ): chunked.__arrow_c_stream__(requested_capsule) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 31d34058b61ef..a58010d083e92 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -24,6 +24,7 @@ import pytest import pyarrow as pa import pyarrow.compute as pc +from pyarrow.vendored.version import Version def test_chunked_array_basics(): @@ -493,6 +494,47 @@ def test_recordbatch_dunder_init(): pa.RecordBatch() +def test_chunked_array_c_array_interface(): + class ArrayWrapper: + def __init__(self, array): + self.array = array + + def __arrow_c_array__(self, requested_schema=None): + return self.array.__arrow_c_array__(requested_schema) + + data = pa.array([1, 2, 3], pa.int64()) + chunked = pa.chunked_array([data]) + wrapper = ArrayWrapper(data) + + # Can roundtrip through the wrapper. 
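+    # pa.chunked_array only relies on the __arrow_c_array__ method, so any
+    # producer implementing the Arrow PyCapsule protocol works here, not
+    # just this ArrayWrapper helper.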
+ result = pa.chunked_array(wrapper) + assert result == chunked + + # Can also import with a type that implementer can cast to. + result = pa.chunked_array(wrapper, type=pa.int16()) + assert result == chunked.cast(pa.int16()) + + +def test_chunked_array_c_stream_interface(): + class ChunkedArrayWrapper: + def __init__(self, chunked): + self.chunked = chunked + + def __arrow_c_stream__(self, requested_schema=None): + return self.chunked.__arrow_c_stream__(requested_schema) + + data = pa.chunked_array([[1, 2, 3], [4, None, 6]]) + wrapper = ChunkedArrayWrapper(data) + + # Can roundtrip through the wrapper. + result = pa.chunked_array(wrapper) + assert result == data + + # Can also import with a type that implementer can cast to. + result = pa.chunked_array(wrapper, type=pa.int16()) + assert result == data.cast(pa.int16()) + + def test_recordbatch_c_array_interface(): class BatchWrapper: def __init__(self, batch): @@ -1696,6 +1738,43 @@ def test_table_rename_columns(cls): expected = cls.from_arrays(data, names=['eh', 'bee', 'sea']) assert t2.equals(expected) + message = "names must be a list or dict not " + with pytest.raises(TypeError, match=message): + table.rename_columns('not a list') + + +@pytest.mark.parametrize( + ('cls'), + [ + (pa.Table), + (pa.RecordBatch) + ] +) +def test_table_rename_columns_mapping(cls): + data = [ + pa.array(range(5)), + pa.array([-10, -5, 0, 5, 10]), + pa.array(range(5, 10)) + ] + table = cls.from_arrays(data, names=['a', 'b', 'c']) + assert table.column_names == ['a', 'b', 'c'] + + expected = cls.from_arrays(data, names=['eh', 'b', 'sea']) + t1 = table.rename_columns({'a': 'eh', 'c': 'sea'}) + t1.validate() + assert t1 == expected + + # Test renaming duplicate column names + table = cls.from_arrays(data, names=['a', 'a', 'c']) + expected = cls.from_arrays(data, names=['eh', 'eh', 'sea']) + t2 = table.rename_columns({'a': 'eh', 'c': 'sea'}) + t2.validate() + assert t2 == expected + + # Test column not found + with pytest.raises(KeyError, match=r"Column 'd' not found"): + table.rename_columns({'a': 'eh', 'd': 'sea'}) + def test_table_flatten(): ty1 = pa.struct([pa.field('x', pa.int16()), @@ -3197,6 +3276,21 @@ def test_numpy_asarray(constructor): assert result.dtype == "int32" +@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch]) +def test_numpy_array_protocol(constructor): + table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"]) + expected = np.array([[1, 4], [2, 5], [3, 6]], dtype="float64") + + if Version(np.__version__) < Version("2.0"): + # copy keyword is not strict and not passed down to __array__ + result = np.array(table, copy=False) + np.testing.assert_array_equal(result, expected) + else: + # starting with numpy 2.0, the copy=False keyword is assumed to be strict + with pytest.raises(ValueError, match="Unable to avoid a copy"): + np.array(table, copy=False) + + @pytest.mark.acero def test_invalid_non_join_column(): NUM_ITEMS = 30 diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 21b3829803487..4f66a6f41672d 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -1335,3 +1335,25 @@ def __arrow_c_schema__(self): wrapped_schema = Wrapper(schema) assert pa.schema(wrapped_schema) == schema + + +def test_field_import_c_schema_interface(): + class Wrapper: + def __init__(self, field): + self.field = field + + def __arrow_c_schema__(self): + return self.field.__arrow_c_schema__() + + field = pa.field("field_name", pa.int32(), metadata={"key": "value"}) + 
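+    # Wrapper exposes only __arrow_c_schema__, so pa.field() must consume the
+    # field through the PyCapsule interface rather than receiving it directly.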
wrapped_field = Wrapper(field) + + assert pa.field(wrapped_field) == field + + with pytest.raises(ValueError, match="cannot specify 'type'"): + pa.field(wrapped_field, type=pa.int64()) + + # override nullable or metadata + assert pa.field(wrapped_field, nullable=False).nullable is False + result = pa.field(wrapped_field, metadata={"other": "meta"}) + assert result.metadata == {b"other": b"meta"} diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 6cbad8eeb653c..018099ae7e659 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -3462,7 +3462,7 @@ cdef DataType primitive_type(Type type): # Type factory functions -def field(name, type, bint nullable=True, metadata=None): +def field(name, type=None, nullable=None, metadata=None): """ Create a pyarrow.Field instance. @@ -3470,6 +3470,8 @@ def field(name, type, bint nullable=True, metadata=None): ---------- name : str or bytes Name of the field. + Alternatively, you can also pass an object that implements the Arrow + PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). type : pyarrow.DataType Arrow datatype of the field. nullable : bool, default True @@ -3504,11 +3506,25 @@ def field(name, type, bint nullable=True, metadata=None): >>> pa.struct([field]) StructType(struct) """ + if hasattr(name, "__arrow_c_schema__"): + if type is not None: + raise ValueError( + "cannot specify 'type' when creating a Field from an ArrowSchema" + ) + field = Field._import_from_c_capsule(name.__arrow_c_schema__()) + if metadata is not None: + field = field.with_metadata(metadata) + if nullable is not None: + field = field.with_nullable(nullable) + return field + cdef: Field result = Field.__new__(Field) DataType _type = ensure_type(type, allow_none=False) shared_ptr[const CKeyValueMetadata] c_meta + nullable = True if nullable is None else nullable + metadata = ensure_metadata(metadata, allow_none=True) c_meta = pyarrow_unwrap_metadata(metadata) diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index 9920a38a4e288..3ea57b611f9d8 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,6 +1,6 @@ cython>=0.29.31 oldest-supported-numpy>=0.14; python_version<'3.9' -numpy>=1.25; python_version>='3.9' +numpy>=2.0.0rc1; python_version>='3.9' setuptools_scm setuptools>=58 wheel
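The ``requirements-wheel-build.txt`` bump to ``numpy>=2.0.0rc1`` follows NumPy's guidance that wheels built against NumPy 2.x remain ABI-compatible with 1.x at runtime, so building against the 2.0 release candidate covers both major versions.

With the ``types.pxi`` change above, ``pyarrow.field`` can also build a Field directly from any object exposing ``__arrow_c_schema__``. A minimal sketch of the intended usage, mirroring the test added above (a Field itself implements the protocol, so it stands in for any third-party schema-capsule producer)::

    import pyarrow as pa

    source = pa.field("field_name", pa.int32(), metadata={"key": "value"})

    # Any object with an __arrow_c_schema__ method is accepted in place of a name.
    restored = pa.field(source)
    assert restored == source

    # nullable and metadata may be overridden on import; type may not.
    assert pa.field(source, nullable=False).nullable is False
    assert pa.field(source, metadata={"other": "meta"}).metadata == {b"other": b"meta"}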