Skip to content

Commit

Permalink
Arrow 9.0.0 and gcc-11 update (#601)
Browse files Browse the repository at this point in the history
* gcc11 fixes

* new api changes

* trying to fix cudf cython changes

* fixing cython compilation

* fixing test cases

* upgrading cuda in CI

* bumping jimver version

* bumping jimver version

* upgrading arrow to 8.0.1

* upgrading arrow to 8.0.1

* remove upgrade

* fixing error

* adding arrow_dataset libs

* upgrading to arrow 9.0.0

* enable mpi

* arrow version

* update windows

* fix ucx issue

* fixing gcylon api changes

* fixing version issue

Co-authored-by: niranda perera <[email protected]>
  • Loading branch information
nirandaperera and niranda perera authored Jan 22, 2023
1 parent 4c867b1 commit d9a6420
Show file tree
Hide file tree
Showing 42 changed files with 319 additions and 380 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ jobs:
- name: Install dependencies
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends --no-install-suggests g++ python3 python3-dev python3-pip maven libnuma-dev libc-dev python3-venv python3-numpy openmpi-bin libopenmpi-dev
- name: Installing UCX
run: wget https://github.com/openucx/ucx/releases/download/v1.10.1/ucx-1.10.1.tar.gz -P $HOME/ucx && (cd $HOME/ucx && tar xzf ucx-1.10.1.tar.gz) && (cd $HOME/ucx/ucx-1.10.1 && ./contrib/configure-release --prefix=$PWD/install && make -j8 install)
run: wget https://github.com/openucx/ucx/releases/download/v1.13.1/ucx-1.13.1.tar.gz -P $HOME/ucx && (cd $HOME/ucx && tar xzf ucx-1.13.1.tar.gz) && (cd $HOME/ucx/ucx-1.13.1 && ./contrib/configure-release --prefix=$PWD/install --with-go=no && make -j8 install)
- name: Build and test
run: python3 -m venv ENV && ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --cpp --test --python --pytest --cmake-flags "-DCYLON_UCX=ON -DUCX_LIBDIR=$HOME/ucx/ucx-1.10.1/install/lib -DUCX_INCLUDEDIR=$HOME/ucx/ucx-1.10.1/install/include"
run: python3 -m venv ENV && ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --cpp --test --python --pytest --cmake-flags "-DCYLON_UCX=ON -DUCX_LIBDIR=$HOME/ucx/ucx-1.13.1/install/lib -DUCX_INCLUDEDIR=$HOME/ucx/ucx-1.13.1/install/include"
- name: Build java
run: ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --java

7 changes: 2 additions & 5 deletions .github/workflows/conda-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,8 @@ jobs:
include:
# 20.04 supports CUDA 11.0+
- os: ubuntu-20.04
cuda: "11.2.2"
cuda: "11.5.2"
gcc: 9
# - os: ubuntu-20.04
# cuda: "11.2"
# gcc: 10

steps:
- uses: actions/checkout@v2
Expand All @@ -40,7 +37,7 @@ jobs:
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
- uses: Jimver/cuda-toolkit@v0.2.4
- uses: Jimver/cuda-toolkit@v0.2.8
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda }}
Expand Down
7 changes: 3 additions & 4 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
# limitations under the License.

import argparse
import os
import subprocess
import logging
import os
import platform
from pathlib import Path

import subprocess
import sys
from pathlib import Path

logging.basicConfig(format='[%(levelname)s] %(message)s')
logger = logging.getLogger("cylon_build")
Expand Down
17 changes: 9 additions & 8 deletions conda/environments/cylon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.8
- cmake>=3.17
- pyarrow=5.0.0
- glog=0.5.0
- python>=3.8,<3.10
- cmake>=3.23.1,!=3.25.0
- arrow-cpp=9
- pyarrow=9.0.0
- glog
- openmpi=4.1.3=ha1ae619_105
- ucx>=1.12.1
- cython>=0.29,<0.30
- numpy>=1.20
- pandas>=1.0
- fsspec
- setuptools>=40.0,<60.0
- numpy
- pandas>=1.0,<1.6.0dev0
- fsspec>=0.6.0
- setuptools
# they are not needed for using pygcylon or compiling it
- pytest
- pytest-mpi
Expand Down
17 changes: 9 additions & 8 deletions conda/environments/cylon_MacOS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.9
- cmake>=3.17
- pyarrow=5.0.0
- glog=0.5.0
- python>=3.9,<3.10
- cmake>=3.23.1,!=3.25.0
- arrow-cpp=9
- pyarrow=9.0.0
- glog
- openmpi>=4.1.2
- cython>=0.29,<0.30
- numpy>=1.16
- pandas>=1.0
- fsspec
- setuptools>=40.0
- numpy
- pandas>=1.0,<1.6.0dev0
- fsspec>=0.6.0
- setuptools
# they are not needed for using pygcylon or compiling it
- pytest
- pytest-mpi
Expand Down
24 changes: 13 additions & 11 deletions conda/environments/gcylon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,22 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.8
- cmake>=3.17
- python>=3.8,<3.10
- cmake>=3.23.1,!=3.25.0
- arrow-cpp=9
- pyarrow=9.0.0
- cython>=0.29,<0.30
- cudf=21.10.01
- cudatoolkit=11.2
- glog=0.5.0
- cudf=22.12.01
- cudatoolkit=11.5
- glog
- openmpi=4.1.3=ha1ae619_105
- ucx>=1.12.1
- numpy>=1.20
- pandas>=1.0
- fsspec
- setuptools>=40.0,<60.0
# these are for running tests only,
# they are not needed for using pygcylon or compiling it
- numpy
- pandas>=1.0,<1.6.0dev0
- fsspec>=0.6.0
- setuptools
# these are for running tests only,
# they are not needed for using pygcylon or compiling it
- pytest
- pytest-mpi
- mpi4py
17 changes: 9 additions & 8 deletions conda/environments/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.8
- cmake>=3.17
- pyarrow=5.0.0
- glog=0.5.0
- python>=3.8,<3.10
- cmake>=3.23.1,!=3.25.0
- arrow-cpp=9
- pyarrow=9.0.0
- glog
- msmpi
- cython>=0.29,<0.30
- numpy>=1.16
- pandas>=1.0
- fsspec
- setuptools>=40.0,<60.0
- numpy
- pandas>=1.0,<1.6.0dev0
- fsspec>=0.6.0
- setuptools
# they are not needed for using pygcylon or compiling it
- pytest
- pytest-mpi
Expand Down
42 changes: 32 additions & 10 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif ()

# cmake modules directories
set(CYLON_ARROW_VERSION 5.0.0)
set(CYLON_ARROW_VERSION 9.0.0)
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake/Modules/" ${CMAKE_MODULE_PATH})
list(APPEND CMAKE_MODULE_PATH ${CYLON_SOURCE_DIR}/CMake)

Expand Down Expand Up @@ -201,20 +201,46 @@ message(STATUS "Glog libs: ${GLOG_LIBRARIES}")
# this is required on windows to prevent clashes between ERROR abbreviation in logging.h and windows.h
if (WIN32)
add_definitions("-DGLOG_NO_ABBREVIATED_SEVERITIES")
add_definitions("-DNOMINMAX=1")
endif ()

# include Modules/Build.cmake
include(Build)

# Set to ON to build the Cylon C++ tests -- OFF by default
option(CYLON_WITH_TEST "Build Cylon C++ tests." OFF)
if (CYLON_WITH_TEST)
message("C++ tests enabled")
set(CYLON_CATCH2_GIT_TAG v2.13.9)
set(CYLON_CATCH2_HEADER_HASH 27da57c7a06d09be8dd81fab7246b79e7892b6ae7e4e49ba8631f1d5a955e3fc)


# Ref: https://cliutils.gitlab.io/modern-cmake/chapters/testing/catch.html
message("Downloading Catch2 header ${CYLON_CATCH2_GIT_TAG}")
set(catch2_url https://github.com/catchorg/Catch2/releases/download/${CYLON_CATCH2_GIT_TAG}/catch.hpp)
set(CYLON_CATCH2_HEADER_PATH "${CMAKE_CURRENT_BINARY_DIR}/test/catch.hpp")
file(DOWNLOAD ${catch2_url} ${CYLON_CATCH2_HEADER_PATH}
STATUS status
EXPECTED_HASH SHA256=${CYLON_CATCH2_HEADER_HASH})
list(GET status 0 error)
if (error)
message(FATAL_ERROR "Could not download ${catch2_url}")
else ()
message(STATUS "Catch2 header downloaded to ${CYLON_CATCH2_HEADER_PATH}")
endif ()

set(CMAKE_CTEST_ARGUMENTS "--output-on-failure")
enable_testing()
endif(CYLON_WITH_TEST)


# if building gcylon, no need to build cylon
option(GCYLON_BUILD "Build GCylon" OFF)
if (GCYLON_BUILD)
message("GCylon build enabled")
add_subdirectory(src/gcylon)
add_subdirectory(src/examples/gcylon)
if (CYLON_WITH_TEST)
message("Tests enabled!")
enable_testing()
add_subdirectory(test/gcylon)
endif ()
return()
Expand Down Expand Up @@ -322,6 +348,9 @@ if (${ARROW_BUILD_TYPE} STREQUAL "SYSTEM")
find_library(ARROW_LIB arrow ${CYLON_ARROW_VERSION} REQUIRED)
message(STATUS "Arrow lib: ${ARROW_LIB}")

find_library(ARROW_DATASET_LIB arrow_dataset ${CYLON_ARROW_VERSION} REQUIRED)
message(STATUS "Arrow dataset lib: ${ARROW_DATASET_LIB}")

if (PYCYLON_BUILD)
find_library(ARROW_PY_LIB arrow_python ${CYLON_ARROW_VERSION} REQUIRED)
message(STATUS "Arrow py lib: ${ARROW_PY_LIB}")
Expand Down Expand Up @@ -389,13 +418,6 @@ if (${ARROW_BUILD_TYPE} STREQUAL "SOURCE")
endif ()

# Set to ON to build the Cylon C++ tests -- OFF by default
option(CYLON_WITH_TEST "Build Cylon C++ tests." OFF)
if (CYLON_WITH_TEST)
message("C++ tests enabled")
set(CYLON_CATCH2_GIT_TAG v2.13.9)
set(CYLON_CATCH2_HEADER_HASH 27da57c7a06d09be8dd81fab7246b79e7892b6ae7e4e49ba8631f1d5a955e3fc)

enable_testing()
set(CMAKE_CTEST_ARGUMENTS "--output-on-failure")
add_subdirectory(test)
endif ()
1 change: 1 addition & 0 deletions cpp/src/cylon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ ENDIF()
target_link_libraries(cylon ${MPI_CXX_LIBRARIES})
target_link_libraries(cylon ${GLOG_LIBRARIES})
target_link_libraries(cylon ${ARROW_LIB})
target_link_libraries(cylon ${ARROW_DATASET_LIB})
target_link_libraries(cylon Threads::Threads)
target_link_libraries(cylon ${PARQUET_LIB})

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/cylon/arrow/arrow_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#include <glog/logging.h>
#include <type_traits>
#include <arrow/visitor_inline.h>
#include <arrow/visit_data_inline.h>

#include <cylon/arrow/arrow_kernels.hpp>
#include <cylon/util/macros.hpp>
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/cylon/arrow/arrow_partition_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#include <glog/logging.h>

#include <arrow/visitor_inline.h>
#include <arrow/visit_data_inline.h>

#include <cylon/util/murmur3.hpp>
#include <cylon/util/macros.hpp>
Expand Down Expand Up @@ -52,7 +52,7 @@ inline Status visit_chunked_array(const std::shared_ptr<arrow::ChunkedArray> &id
for (auto &&array: idx_col->chunks()) {
const auto &arr_data = array->data();

arrow::VisitArrayDataInline<ArrowT>(*arr_data,
arrow::VisitArraySpanInline<ArrowT>(*arr_data,
[&](ValueT val) {
valid_fn(global_idx, val);
global_idx++;
Expand Down Expand Up @@ -541,7 +541,7 @@ class RangePartitionKernel : public PartitionKernel {
// create sample histogram
std::vector<uint64_t> local_counts(num_bins + 2, 0);
for (const auto &arr: sampled_array->chunks()) {
arrow::VisitArrayDataInline<ARROW_T>(*arr->data(),
arrow::VisitArraySpanInline<ARROW_T>(*arr->data(),
[&](ValueT val) {
local_counts[get_bin_pos(val)]++;
},
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/cylon/compute/aggregate_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@
#include <cmath>
#include <vector>
#include <unordered_set>
#include <stdexcept>
#include <limits>


#include "cylon/util/macros.hpp"

namespace cylon {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/cylon/compute/aggregate_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ cylon::Status AllReduce(const std::shared_ptr<CylonContext> &ctx,
auto rcv_scalar = std::make_shared<ScalarT>(*send_scalar);
std::memset(&rcv_scalar->value, 0, sizeof(CType));

RETURN_CYLON_STATUS_IF_FAILED(cylon::mpi::AllReduce(ctx, send_scalar->data(),
RETURN_CYLON_STATUS_IF_FAILED(cylon::mpi::AllReduce(ctx, send_scalar->mutable_data(),
rcv_scalar->mutable_data(),
1, data_type, reduce_ops[i]));
rcv_scalar_vector.push_back(rcv_scalar);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/cylon/compute/aggregates.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ cylon::Status Count(const std::shared_ptr<cylon::Table> &table, int32_t col_idx,
const auto &data_type = cylon::Int64();

arrow::compute::ExecContext exec_ctx(cylon::ToArrowPool(ctx));
arrow::compute::ScalarAggregateOptions options(true, 0);
arrow::compute::CountOptions options;
CYLON_ASSIGN_OR_RAISE(auto count_res, arrow::compute::Count(a_col, options, &exec_ctx));

if (ctx->GetWorldSize() > 1) {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/cylon/compute/scalar_aggregate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include <cmath>

#include <arrow/compute/api.h>
#include <arrow/visitor_inline.h>
#include <arrow/util/bit_util.h>

#include "cylon/compute/aggregates.hpp"
#include "cylon/ctx/arrow_memory_pool_utils.hpp"
Expand Down Expand Up @@ -367,7 +367,7 @@ std::shared_ptr<arrow::DataType> PromoteDatatype(const std::vector<arrow::Type::
}

if (max_width_signed <= max_width_unsigned) {
max_width_signed = static_cast<int>(arrow::BitUtil::NextPower2(max_width_unsigned + 1));
max_width_signed = static_cast<int>(arrow::bit_util::NextPower2(max_width_unsigned + 1));
}

if (max_width_signed >= 64) return arrow::int64();
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/cylon/groupby/hash_groupby.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*/

#include <arrow/api.h>
#include <arrow/visitor_inline.h>
#include <arrow/visit_data_inline.h>
#include <arrow/compute/api.h>
#include <chrono>
#include <glog/logging.h>
Expand Down Expand Up @@ -207,7 +207,7 @@ inline Status aggregate(arrow::MemoryPool *pool,

const auto &arr = table->column(col_idx)->chunk(0);
int64_t i = 0;
arrow::VisitArrayDataInline<ARROW_T>(*arr->data(),
arrow::VisitArraySpanInline<ARROW_T>(*arr->data(),
[&](const C_TYPE &val) {
kernel->Update(&val, &agg_states[group_ids[i]]);
i++;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/cylon/groupby/pipeline_groupby.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ inline arrow::Status Sum(const arrow::Datum &array, arrow::compute::ExecContext
inline arrow::Status Count(const arrow::Datum &array,
arrow::compute::ExecContext *fn_ctx,
arrow::Datum *res) {
auto result = arrow::compute::Count(array, arrow::compute::ScalarAggregateOptions(true, 0),
fn_ctx);
auto result = arrow::compute::Count(array, arrow::compute::CountOptions(), fn_ctx);

if (result.ok()) {
*res = result.ValueOrDie();
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/cylon/mapreduce/mapreduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "cylon/util/macros.hpp"

#include <arrow/buffer_builder.h>
#include <arrow/visitor_inline.h>
#include <arrow/visit_data_inline.h>

namespace cylon {
namespace mapred {
Expand All @@ -27,7 +27,7 @@ void CombineVisit(const std::shared_ptr<arrow::Array> &value_col, const int64_t
Visitor &&visitor) {
using T = typename ArrowT::c_type;
int64_t i = 0;
arrow::VisitArrayDataInline<ArrowT>(*value_col->data(),
arrow::VisitArraySpanInline<ArrowT>(*value_col->data(),
[&](const T &val) {
int64_t gid = local_group_ids[i];
visitor(val, gid);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/cylon/serialize/table_serialize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Status CollectBitmapInfo(const arrow::ArrayData &data, int32_t *buffer_sizes,
}

// there are nulls
*buffer_sizes = (int32_t) arrow::BitUtil::BytesForBits(data.length);
*buffer_sizes = (int32_t) arrow::bit_util::BytesForBits(data.length);
if (data.offset == 0) { // no offset
*data_buffers = data.buffers[buf_idx]->data();
} else if (data.offset % CHAR_BIT == 0) { // offset is at a byte boundary
Expand Down
Loading

0 comments on commit d9a6420

Please sign in to comment.