Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dist sort cpu #574

Merged
35 commits merged into cylondata:main from kaiyingshan:dist-sort-cpu
Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
2e9e98f
dist sort
kaiyingshan Feb 12, 2022
af7d35b
dist sort cont.
kaiyingshan Feb 17, 2022
a8c195b
find split point indices
kaiyingshan Feb 23, 2022
4f9c063
debug comparator
kaiyingshan Feb 24, 2022
e970358
csv file
kaiyingshan Feb 24, 2022
5e1e560
add tests back
kaiyingshan Feb 24, 2022
cfb81d1
binary search for splitting table
kaiyingshan Feb 27, 2022
572e46c
use pq for merge
kaiyingshan Feb 27, 2022
42cd8f0
use merge in final step
kaiyingshan Feb 27, 2022
55ce894
delete useless apis
kaiyingshan Feb 27, 2022
e340bef
modify SortOption to select sort methods; default sort to regular sam…
kaiyingshan Feb 28, 2022
1d7db06
resolve conflict
kaiyingshan Feb 28, 2022
80cd2f2
remove useless comment & fix
kaiyingshan Feb 28, 2022
fa61ae9
Merge branch 'cylondata:main' into dist-sort-cpu
kaiyingshan Mar 19, 2022
108556f
formatting, combine all_to_all functions
kaiyingshan Mar 20, 2022
440310a
Merge branch 'dist-sort-cpu' of github.com:kaiyingshan/cylon into dist-sort-cpu
kaiyingshan Mar 20, 2022
18621db
Merge branch 'dist-sort-cpu' of github.com:kaiyingshan/cylon into dist-sort-cpu
kaiyingshan Mar 20, 2022
72e61cd
Merge branch 'dist-sort-cpu' of github.com:kaiyingshan/cylon into dist-sort-cpu
kaiyingshan Mar 20, 2022
5b131e8
format
kaiyingshan Mar 20, 2022
6d1b822
Update cpp/src/cylon/table.cpp
kaiyingshan Mar 27, 2022
f010ecf
fix
kaiyingshan Mar 27, 2022
4bfadc7
Merge branch 'dist-sort-cpu' of github.com:kaiyingshan/cylon into dist-sort-cpu
kaiyingshan Mar 27, 2022
cf750c6
sample table only return sort columns
kaiyingshan Mar 27, 2022
f312ac6
add dist sort final step merge vs sort example
kaiyingshan Mar 29, 2022
f4d1f05
use tableRowIndexEqualTo to compare rows
kaiyingshan Mar 30, 2022
af6f9f9
test merge/sort speed & fix ctx_ptr null pointer issue
kaiyingshan Mar 31, 2022
7f1ffba
refine example
kaiyingshan Apr 8, 2022
1b53b0c
add binary search & merge test
kaiyingshan Apr 8, 2022
b9e22b9
Merge branch 'main' into dist-sort-cpu
nirandaperera Apr 12, 2022
157b2fb
use template type in test & remove setbit
kaiyingshan Apr 19, 2022
1edfd24
add const
kaiyingshan Apr 19, 2022
8f068b8
use sort for ws > 4
kaiyingshan Apr 20, 2022
fe5ff99
limiting openmpi version
nirandaperera Apr 21, 2022
4dcc3a5
attempting to fix openmpi
nirandaperera Apr 21, 2022
9cae3fc
attempting to fix openmpi
nirandaperera Apr 21, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/conda-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
# Specify the correct host compilers
- name: Install/Select gcc and g++
run: |
- sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }}
+ sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} libopenmpi-dev
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> $GITHUB_ENV
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> $GITHUB_ENV
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cylon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dependencies:
- cmake>=3.17
- pyarrow=5.0.0
- glog=0.5.0
- - openmpi>=4.1
+ - openmpi>=4.1.3
- ucx>=1.12.1
- cython>=0.29,<0.30
- numpy>=1.16
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cylon_MacOS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dependencies:
- cmake>=3.17
- pyarrow=5.0.0
- glog=0.5.0
- - openmpi>=4.1
+ - openmpi>=4.1.2
- cython>=0.29,<0.30
- numpy>=1.16
- pandas>=1.0
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/gcylon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ dependencies:
- cudf=21.10.01
- cudatoolkit=11.2
- glog=0.5.0
- - openmpi>=4.1
- - ucx>=1.12.0
+ - openmpi>=4.1.2
+ - ucx>=1.12.1
- numpy>=1.16
- pandas>=1.0
- setuptools>=40.0,<60.0
Expand Down
18 changes: 13 additions & 5 deletions cpp/src/cylon/arrow/arrow_comparator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,10 +759,13 @@ Status CreateDualArrayIndexComparator(const std::shared_ptr<arrow::Array> &a1,

Status TableRowIndexEqualTo::Make(const std::shared_ptr<arrow::Table> &table,
const std::vector<int> &col_ids,
std::unique_ptr<TableRowIndexEqualTo> *out_equal_to) {
std::unique_ptr<TableRowIndexEqualTo> *out_equal_to,
const std::vector<bool> &sort_order) {
auto comps = std::make_shared<std::vector<std::shared_ptr<ArrayIndexComparator>>>();
comps->reserve(col_ids.size());
for (int col_id: col_ids) {
bool order_not_given = sort_order.size() == 0;
for (std::size_t i = 0; i < col_ids.size(); i++) {
int col_id = col_ids[i];
if (table->column(col_id)->num_chunks() > 1) {
return {Code::Invalid, "TableRowIndexEqualTo does not support multiple chunks"};
}
Expand All @@ -772,7 +775,7 @@ Status TableRowIndexEqualTo::Make(const std::shared_ptr<arrow::Table> &table,
} else {
const auto &array = table->column(col_id)->chunk(0);
std::unique_ptr<ArrayIndexComparator> comp;
RETURN_CYLON_STATUS_IF_FAILED(CreateArrayIndexComparator(array, &comp));
RETURN_CYLON_STATUS_IF_FAILED(CreateArrayIndexComparator(array, &comp, order_not_given || sort_order[i]));
comps->emplace_back(std::move(comp));
}
}
Expand Down Expand Up @@ -920,7 +923,8 @@ Status DualTableRowIndexEqualTo::Make(const std::shared_ptr<arrow::Table> &t1,
const std::shared_ptr<arrow::Table> &t2,
const std::vector<int> &t1_indices,
const std::vector<int> &t2_indices,
std::unique_ptr<DualTableRowIndexEqualTo> *out_equal_to) {
std::unique_ptr<DualTableRowIndexEqualTo> *out_equal_to,
const std::vector<bool> &sort_order) {
int num_cols = (int) t1_indices.size();
if (num_cols != (int) t2_indices.size()) {
return {Code::Invalid, "sizes of indices of t1 and t2 are not equal!"};
Expand All @@ -929,6 +933,8 @@ Status DualTableRowIndexEqualTo::Make(const std::shared_ptr<arrow::Table> &t1,
auto comps = std::make_shared<std::vector<std::shared_ptr<DualArrayIndexComparator>>>();
comps->reserve(num_cols);

bool order_not_given = sort_order.size() == 0;

for (int i = 0; i < num_cols; i++) {
if (t1->column(t1_indices[i])->num_chunks() > 1
|| t2->column(t2_indices[i])->num_chunks() > 1) {
Expand All @@ -939,7 +945,9 @@ Status DualTableRowIndexEqualTo::Make(const std::shared_ptr<arrow::Table> &t1,
const auto &a2 = util::GetChunkOrEmptyArray(t2->column(t2_indices[i]), 0);

std::unique_ptr<DualArrayIndexComparator> comp;
RETURN_CYLON_STATUS_IF_FAILED(CreateDualArrayIndexComparator(a1, a2, &comp));

// asc=true if sort_order is not provided
RETURN_CYLON_STATUS_IF_FAILED(CreateDualArrayIndexComparator(a1, a2, &comp, order_not_given || sort_order[i]));
comps->emplace_back(std::move(comp));
}

Expand Down
10 changes: 7 additions & 3 deletions cpp/src/cylon/arrow/arrow_comparator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,11 @@ class TableRowIndexEqualTo {
// equality, less than, greater than
int compare(const int64_t &record1, const int64_t &record2) const;

static Status Make(const std::shared_ptr<arrow::Table> &table, const std::vector<int> &col_ids,
std::unique_ptr<TableRowIndexEqualTo> *out_equal_to);
static Status Make(const std::shared_ptr<arrow::Table> &table,
const std::vector<int> &col_ids,
std::unique_ptr<TableRowIndexEqualTo> *out_equal_to,
const std::vector<bool> &sort_order = {});

static Status Make(const std::vector<std::shared_ptr<arrow::Array>> &arrays,
std::unique_ptr<TableRowIndexEqualTo> *out_equal_to);

Expand Down Expand Up @@ -278,7 +281,8 @@ class DualTableRowIndexEqualTo {
const std::shared_ptr<arrow::Table> &t2,
const std::vector<int> &t1_indices,
const std::vector<int> &t2_indices,
std::unique_ptr<DualTableRowIndexEqualTo> *out_equal_to);
std::unique_ptr<DualTableRowIndexEqualTo> *out_equal_to,
const std::vector<bool> &sort_order = {});

bool operator()(const int64_t &record1, const int64_t &record2) const;

Expand Down
Loading