Skip to content

Commit

Permalink
Fixing 552 (#553)
Browse files Browse the repository at this point in the history
* removing CYLON_PARQUET cmake variable

* fixing ci

* minor changes

* adding parquet test

* adding parquet test

* fixing import error

* attempting to fix error

* attempting to fix error

* attempting to fix error

* attempting to fix error

* attempting to fix error

* removing redundant CI

* attempting to fix mpi4py + setuptools

* adding MacOS tests

* Revert "adding MacOS tests"

This reverts commit 74a0d55.
  • Loading branch information
nirandaperera authored Dec 27, 2021
1 parent f5e31a1 commit 1ce4c6b
Show file tree
Hide file tree
Showing 11 changed files with 84 additions and 63 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ jobs:
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends --no-install-suggests g++ python3 python3-dev python3-pip maven libnuma-dev libc-dev python3-venv python3-numpy openmpi-bin libopenmpi-dev
- name: Installing UCX
run: wget https://github.com/openucx/ucx/releases/download/v1.10.1/ucx-1.10.1.tar.gz -P $HOME/ucx && (cd $HOME/ucx && tar xzf ucx-1.10.1.tar.gz) && (cd $HOME/ucx/ucx-1.10.1 && ./contrib/configure-release --prefix=$PWD/install && make -j8 install)
- name: build cpp and test
run: ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --cpp --release --test --cmake-flags "-DCYLON_PARQUET=ON -DCYLON_UCX=ON -DUCX_LIBDIR=$HOME/ucx/ucx-1.10.1/install/lib -DUCX_INCLUDEDIR=$HOME/ucx/ucx-1.10.1/install/include"
- name: build python and test
run: python3 -m venv ENV && ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --python --pytest --cmake-flags "-DCYLON_PARQUET=ON -DCYLON_UCX=ON -DUCX_LIBDIR=$HOME/ucx/ucx-1.10.1/install/lib -DUCX_INCLUDEDIR=$HOME/ucx/ucx-1.10.1/install/include"
- name: build java
- name: Build and test
run: python3 -m venv ENV && ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --cpp --test --python --pytest --cmake-flags "-DCYLON_UCX=ON -DUCX_LIBDIR=$HOME/ucx/ucx-1.10.1/install/lib -DUCX_INCLUDEDIR=$HOME/ucx/ucx-1.10.1/install/include"
- name: Build java
run: ./build.sh -pyenv $(pwd)/ENV -bpath $(pwd)/build --java

4 changes: 2 additions & 2 deletions .github/workflows/conda-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ jobs:
run: conda activate gcylon_dev

- name: build cpp and test
run: ./build.sh --conda_cpp --release --test
run: ./build.sh --conda_cpp --release --test --cmake-flags "-DCYLON_UCX=ON"
- name: build pycylon and test
run: ./build.sh --conda_python --release --pytest
run: ./build.sh --conda_python --release --pytest --cmake-flags "-DCYLON_UCX=ON"
- name: build gcylon cpp, no testing at github for gpu
run: ./build.sh --gcylon --release
- name: build pygcylon, no testing at github for gpu
Expand Down
9 changes: 6 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ echo "=================================================================";
}

read_python_requirements(){
  # Installs the Python prerequisites with pip3 in three steps and
  # terminates the script with status 1 as soon as any step fails.

  # Step 1: upgrade pip itself.
  if ! pip3 install -U pip; then
    exit 1
  fi

  # Step 2: pin setuptools (required for mpi4py).
  if ! pip3 install setuptools==60.0.0; then
    exit 1
  fi

  # Step 3: install everything listed in requirements.txt.
  if ! pip3 install -r requirements.txt; then
    exit 1
  fi
}

Expand Down Expand Up @@ -316,7 +319,7 @@ build_cpp_conda(){
cmake -DPYCYLON_BUILD=${PYTHON_BUILD} -DCMAKE_BUILD_TYPE=${BUILD_MODE} \
-DCYLON_WITH_TEST=${RUN_CPP_TESTS} -DCMAKE_INSTALL_PREFIX=${INSTALL_PATH} \
-DARROW_BUILD_TYPE="SYSTEM" -DARROW_LIB_DIR=${ARROW_LIB} -DARROW_INCLUDE_DIR=${ARROW_INC} \
-DCYLON_PARQUET=ON -DGCYLON_BUILD=${GCYLON_BUILD}\
-DGCYLON_BUILD=${GCYLON_BUILD} \
${CMAKE_FLAGS} \
${SOURCE_DIR} \
|| exit 1
Expand Down Expand Up @@ -361,7 +364,7 @@ build_pyarrow(){
read_python_requirements
check_python_pre_requisites
pushd ${BUILD_PATH}/arrow/arrow/python || exit 1
PYARROW_CMAKE_OPTIONS="-DCMAKE_MODULE_PATH=${ARROW_HOME}/lib/cmake/arrow" python3 setup.py install || exit 1
PYARROW_CMAKE_OPTIONS="-DCMAKE_MODULE_PATH=${ARROW_HOME}/lib/cmake/arrow" PYARROW_WITH_PARQUET=1 python3 setup.py install || exit 1
popd || exit 1
print_line
}
Expand All @@ -371,7 +374,7 @@ build_python_pyarrow() {
echo "Building Python"
source "${PYTHON_ENV_PATH}"/bin/activate || exit 1
read_python_requirements
pip install pyarrow==4.0.1 || exit 1
pip install pyarrow==5.0.0 || exit 1

ARROW_LIB=$(python3 -c 'import pyarrow as pa; import os; print(os.path.dirname(pa.__file__))') || exit 1
LD_LIBRARY_PATH="${ARROW_LIB}:${BUILD_PATH}/lib:${LD_LIBRARY_PATH}" || exit 1
Expand Down
25 changes: 7 additions & 18 deletions cpp/CMake/Modules/ConfigureArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,6 @@ include(GNUInstallDirs)
set(ARROW_HOME ${CMAKE_BINARY_DIR}/arrow/install)
set(ARROW_ROOT ${CMAKE_BINARY_DIR}/arrow)

if (CYLON_PARQUET)
set(PARQUET_ARGS " -DARROW_WITH_BROTLI=ON"
" -DARROW_WITH_SNAPPY=ON"
" -DARROW_WITH_ZLIB=ON"
" -DARROW_PARQUET=ON")
else (CYLON_PARQUET)
set(PARQUET_ARGS " -DARROW_WITH_BROTLI=OFF"
" -DARROW_WITH_SNAPPY=OFF"
" -DARROW_WITH_ZLIB=OFF"
" -DARROW_PARQUET=OFF")
endif (CYLON_PARQUET)

set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF"
" -DARROW_WITH_ZSTD=OFF"
" -DARROW_BUILD_STATIC=ON"
Expand All @@ -51,7 +39,10 @@ set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF"
" -DARROW_CSV=ON"
" -DARROW_JSON=ON"
" -DARROW_BOOST_USE_SHARED=OFF"
${PARQUET_ARGS}
" -DARROW_WITH_BROTLI=ON"
" -DARROW_WITH_SNAPPY=ON"
" -DARROW_WITH_ZLIB=ON"
" -DARROW_PARQUET=ON"
)

if (PYCYLON_BUILD)
Expand Down Expand Up @@ -102,11 +93,9 @@ find_package(Arrow REQUIRED HINTS "${ARROW_LIBRARY_DIR}/cmake/arrow" CONFIGS Fin
message(STATUS "Arrow lib: ${ARROW_SHARED_LIB}")
set(ARROW_LIB ${ARROW_SHARED_LIB})

if (CYLON_PARQUET)
find_package(Parquet REQUIRED HINTS "${ARROW_LIBRARY_DIR}/cmake/arrow" CONFIGS FindParquet.cmake)
message(STATUS "Parquet lib: ${PARQUET_SHARED_LIB}")
set(PARQUET_LIB ${PARQUET_SHARED_LIB})
endif (CYLON_PARQUET)
find_package(Parquet REQUIRED HINTS "${ARROW_LIBRARY_DIR}/cmake/arrow" CONFIGS FindParquet.cmake)
message(STATUS "Parquet lib: ${PARQUET_SHARED_LIB}")
set(PARQUET_LIB ${PARQUET_SHARED_LIB})

if (PYCYLON_BUILD)
find_package(arrow_python REQUIRED HINTS "${ARROW_LIBRARY_DIR}/cmake/arrow" CONFIGS FindArrowPython.cmake)
Expand Down
11 changes: 4 additions & 7 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,10 @@ else ()
endif ()

# parquet
option(CYLON_PARQUET "Build Cylon with Parquet functionalities" OFF)
if (CYLON_PARQUET)
message("Cylon Parquet enabled")
add_definitions(-DBUILD_CYLON_PARQUET)
find_library(PARQUET_LIB parquet REQUIRED HINTS "${ARROW_LIB_DIR}")
message(STATUS "Parquet lib: ${PARQUET_LIB}")
endif (CYLON_PARQUET)
message("Cylon Parquet enabled")
add_definitions(-DBUILD_CYLON_PARQUET)
find_library(PARQUET_LIB parquet REQUIRED HINTS "${ARROW_LIB_DIR}")
message(STATUS "Parquet lib: ${PARQUET_LIB}")

add_subdirectory(src/cylon)
add_subdirectory(src/examples)
Expand Down
16 changes: 3 additions & 13 deletions cpp/src/cylon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@ IF(WIN32)
ELSE()
set_source_files_properties(util/murmur3.cpp util/murmur3.hpp PROPERTIES COMPILE_FLAGS -Wno-implicit-fallthrough)
ENDIF()
if (CYLON_PARQUET)
set(PARQUET_CYLON_FILES
io/parquet_config.hpp
io/parquet_config.cpp
)
else (CYLON_PARQUET)
set(PARQUET_CYLON_FILES)
endif (CYLON_PARQUET)

if (CYLON_UCX)
set(UCX_CYLON_FILES
Expand All @@ -42,7 +34,6 @@ else (CYLON_UCX)
endif (CYLON_UCX)

add_library(cylon SHARED
${PARQUET_CYLON_FILES}
${UCX_CYLON_FILES}
arrow/arrow_all_to_all.cpp
arrow/arrow_all_to_all.hpp
Expand Down Expand Up @@ -94,6 +85,8 @@ add_library(cylon SHARED
io/csv_read_config_holder.hpp
io/csv_write_config.cpp
io/csv_write_config.hpp
io/parquet_config.hpp
io/parquet_config.cpp
join/hash_join.cpp
join/hash_join.hpp
join/join.cpp
Expand Down Expand Up @@ -202,10 +195,7 @@ target_link_libraries(cylon ${MPI_CXX_LIBRARIES})
target_link_libraries(cylon ${GLOG_LIBRARIES})
target_link_libraries(cylon ${ARROW_LIB})
target_link_libraries(cylon Threads::Threads)

if (CYLON_PARQUET)
target_link_libraries(cylon ${PARQUET_LIB})
endif ()
target_link_libraries(cylon ${PARQUET_LIB})

if (CYLON_UCX)
target_link_libraries(cylon ${UCX_LIBRARIES})
Expand Down
7 changes: 2 additions & 5 deletions cpp/src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,8 @@ cylon_add_exe(indexing_example)
cylon_add_exe(sorting_example)
cylon_add_exe(multicolumn_sorting_example)
cylon_add_exe(multi_idx_join_example)

if (CYLON_PARQUET)
cylon_add_exe(parquet_union_example)
cylon_add_exe(parquet_join_example)
endif (CYLON_PARQUET)
cylon_add_exe(parquet_union_example)
cylon_add_exe(parquet_join_example)

if (CYLON_UCX)
cylon_add_exe(ucx_join_example)
Expand Down
12 changes: 5 additions & 7 deletions cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,11 @@ cylon_run_test(join_test 1)
cylon_run_test(join_test 2)
cylon_run_test(join_test 4)

if (CYLON_PARQUET)
# join parquet tests
cylon_add_test(parquet_join_test)
cylon_run_test(parquet_join_test 1)
cylon_run_test(parquet_join_test 2)
cylon_run_test(parquet_join_test 4)
endif (CYLON_PARQUET)
# join parquet tests
cylon_add_test(parquet_join_test)
cylon_run_test(parquet_join_test 1)
cylon_run_test(parquet_join_test 2)
cylon_run_test(parquet_join_test 4)

# set operation tests
cylon_add_test(set_op_test)
Expand Down
6 changes: 6 additions & 0 deletions python/pycylon/test/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,12 @@ def test_repartition():
assert responses[-1] == 0


def test_parquet():
    """Run the parquet pytest module through ``os.system`` and record its status.

    The status is appended to the module-level ``responses`` list, and the
    test fails unless that status is 0.
    """
    # The banner names the suite that is actually launched below.
    print("32. Parquet Test")
    responses.append(os.system("pytest -q python/pycylon/test/test_parquet.py"))
    # A zero status from os.system means the pytest command succeeded.
    assert responses[-1] == 0


def test_all():
ar = np.array(responses)
total = len(responses)
Expand Down
40 changes: 40 additions & 0 deletions python/pycylon/test/test_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
##
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

"""
Run test:
>> pytest -q python/pycylon/test/test_parquet.py
"""
import pandas as pd
from pyarrow.parquet import read_table
from pycylon.frame import DataFrame, CylonEnv


def test_read_parquet():
    """Compare a Cylon DataFrame built from a parquet file with pandas' read of it.

    The file is read once with pyarrow's ``read_table`` and wrapped in a
    Cylon ``DataFrame``, and once with ``pd.read_parquet``. The two results
    must contain the same rows when converted to nested lists.
    """
    arrow_table = read_table('data/input/parquet1_0.parquet')
    cylon_frame = DataFrame(arrow_table)
    pandas_frame = pd.read_parquet('file://data/input/parquet1_0.parquet')

    # Convert both sides to plain Python lists before comparing.
    rows_from_pandas = pandas_frame.values.tolist()
    rows_from_cylon = cylon_frame.to_pandas().values.tolist()
    assert rows_from_pandas == rows_from_cylon


def test_parquet_join():
    """Inner-join two parquet inputs and compare with the stored expected output.

    Both inputs and the expected result are read with ``read_table`` and
    wrapped in Cylon ``DataFrame`` objects. The join is on column 0 and uses
    the 'sort' algorithm.
    """
    left = DataFrame(read_table('data/input/parquet1_0.parquet'))
    right = DataFrame(read_table('data/input/parquet2_0.parquet'))
    reference = DataFrame(read_table('data/output/join_inner_1_0.parquet'))

    joined = left.merge(right, how='inner', on=[0], algorithm='sort', suffixes=('lt-', 'rt-'))

    # NOTE: an unordered `reference.equals(joined, ordered=False)` check should
    # pass as well, but it is not enabled here.
    # The test passes when subtracting the join result from the reference
    # table leaves nothing behind.
    leftover = reference.to_table().subtract(joined.to_table())
    assert len(leftover) == 0
9 changes: 6 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
wheel
numpy
cython
wheel
fsspec # required for pandas-parquet
pandas
pytest
pytest-mpi
# setuptools==60.0.0 # required for mpi4py
# mpi4py==3.1.3
mpi4py
pytest
pytest-mpi

0 comments on commit 1ce4c6b

Please sign in to comment.