Skip to content

Commit

Permalink
Extending datatypes (#568)
Browse files Browse the repository at this point in the history
* extending datatypes

* extending datatypes

* macos fixes
  • Loading branch information
nirandaperera authored Feb 17, 2022
1 parent e3d553c commit 9c2fdc4
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 132 deletions.
4 changes: 2 additions & 2 deletions cpp/src/cylon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ add_library(cylon SHARED
ctx/cylon_context.cpp
ctx/cylon_context.hpp
ctx/memory_pool.hpp
data_types.cpp
data_types.hpp
groupby/groupby.cpp
groupby/groupby.hpp
Expand Down Expand Up @@ -179,8 +180,7 @@ add_library(cylon SHARED
util/sort.hpp
util/to_string.hpp
util/uuid.cpp
util/uuid.hpp
)
util/uuid.hpp)

IF(NOT MSVC)
if(APPLE)
Expand Down
90 changes: 65 additions & 25 deletions cpp/src/cylon/arrow/arrow_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
namespace cylon {
namespace tarrow {

std::shared_ptr<arrow::DataType> convertToArrowType(const std::shared_ptr<DataType> &tType, int32_t width) {
switch (tType->getType()) {
std::shared_ptr<arrow::DataType> ToArrowType(const std::shared_ptr<DataType> &type) {
switch (type->getType()) {
case Type::BOOL:return std::make_shared<arrow::BooleanType>();
case Type::UINT8:return std::make_shared<arrow::UInt8Type>();
case Type::INT8:return std::make_shared<arrow::Int8Type>();
Expand All @@ -36,19 +36,30 @@ std::shared_ptr<arrow::DataType> convertToArrowType(const std::shared_ptr<DataTy
case Type::DOUBLE:return std::make_shared<arrow::DoubleType>();
case Type::STRING:return std::make_shared<arrow::StringType>();
case Type::BINARY:return std::make_shared<arrow::BinaryType>();
case Type::FIXED_SIZE_BINARY: {
if (width < 0) break;
return std::make_shared<arrow::FixedSizeBinaryType>(width);
}
case Type::FIXED_SIZE_BINARY:
return arrow::fixed_size_binary(std::static_pointer_cast<FixedSizeBinaryType>(type)
->byte_width_);
case Type::DATE32:return std::make_shared<arrow::Date32Type>();
case Type::DATE64:return std::make_shared<arrow::Date64Type>();
case Type::TIMESTAMP:return std::make_shared<arrow::TimestampType>();
case Type::TIMESTAMP: {
const auto &casted = std::static_pointer_cast<TimestampType>(type);
return arrow::timestamp(ToArrowTimeUnit(casted->unit_), casted->timezone_);
}
case Type::TIME32:return std::make_shared<arrow::Time32Type>();
case Type::TIME64:return std::make_shared<arrow::Time64Type>();
case Type::DURATION:return std::make_shared<arrow::DurationType>();
case Type::DURATION: {
const auto &casted = std::static_pointer_cast<DurationType>(type);
return std::make_shared<arrow::DurationType>(ToArrowTimeUnit(casted->unit_));
}
case Type::LARGE_STRING:return std::make_shared<arrow::LargeStringType>();
case Type::LARGE_BINARY:return std::make_shared<arrow::LargeBinaryType>();
case Type::DECIMAL: break;
case Type::DECIMAL: {
const auto &casted = std::static_pointer_cast<DecimalType>(type);
if (casted->byte_width_ == 16) return arrow::decimal128(casted->precision_, casted->scale_);
else if (casted->byte_width_ == 32)
return arrow::decimal256(casted->precision_, casted->scale_);
else break;
}
case Type::INTERVAL:break;
case Type::LIST:break;
case Type::FIXED_SIZE_LIST:break;
Expand All @@ -58,10 +69,6 @@ std::shared_ptr<arrow::DataType> convertToArrowType(const std::shared_ptr<DataTy
return nullptr;
}

bool validateArrowTableTypes(const std::shared_ptr<arrow::Table> &table) {
return CheckSupportedTypes(table).is_ok();
}

Status CheckSupportedTypes(const std::shared_ptr<arrow::Table> &table) {
const auto &schema = table->schema();
for (const auto &t: schema->fields()) {
Expand All @@ -79,9 +86,11 @@ Status CheckSupportedTypes(const std::shared_ptr<arrow::Table> &table) {
case arrow::Type::HALF_FLOAT:
case arrow::Type::FLOAT:
case arrow::Type::DOUBLE:
case arrow::Type::BINARY:
case arrow::Type::FIXED_SIZE_BINARY:
case arrow::Type::BINARY:
case arrow::Type::STRING:
case arrow::Type::LARGE_BINARY:
case arrow::Type::LARGE_STRING:
case arrow::Type::DATE32:
case arrow::Type::DATE64:
case arrow::Type::TIMESTAMP:
Expand All @@ -103,7 +112,8 @@ Status CheckSupportedTypes(const std::shared_ptr<arrow::Table> &table) {
case arrow::Type::FLOAT:
case arrow::Type::DOUBLE:continue;
default:
return {Code::NotImplemented, "unsupported value type for lists " + t_value->value_type()->ToString()};;
return {Code::NotImplemented,
"unsupported value type for lists " + t_value->value_type()->ToString()};;
}
}
default: return {Code::NotImplemented, "unsupported type " + t->type()->ToString()};
Expand All @@ -112,8 +122,28 @@ Status CheckSupportedTypes(const std::shared_ptr<arrow::Table> &table) {
return Status::OK();
}

std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &arr_type) {
switch (arr_type->id()) {
/**
 * Translate an arrow time unit into the matching Cylon time unit.
 * @param a_time_unit the arrow time unit
 * @return the Cylon time unit with the same name; TimeUnit::MICRO when the
 *         value matches none of the handled enumerators
 */
TimeUnit::type ToCylonTimeUnit(arrow::TimeUnit::type a_time_unit) {
  // MICRO doubles as the fallback for values outside the handled enumerators.
  TimeUnit::type converted = TimeUnit::MICRO;
  switch (a_time_unit) {
    case arrow::TimeUnit::SECOND:
      converted = TimeUnit::SECOND;
      break;
    case arrow::TimeUnit::MILLI:
      converted = TimeUnit::MILLI;
      break;
    case arrow::TimeUnit::MICRO:
      converted = TimeUnit::MICRO;
      break;
    case arrow::TimeUnit::NANO:
      converted = TimeUnit::NANO;
      break;
  }
  return converted;
}

/**
 * Translate a Cylon time unit into the matching arrow time unit.
 * @param time_unit the Cylon time unit
 * @return the arrow time unit with the same name; arrow::TimeUnit::MICRO when
 *         the value matches none of the handled enumerators
 */
arrow::TimeUnit::type ToArrowTimeUnit(TimeUnit::type time_unit) {
  // MICRO doubles as the fallback for values outside the handled enumerators.
  arrow::TimeUnit::type converted = arrow::TimeUnit::MICRO;
  switch (time_unit) {
    case TimeUnit::SECOND:
      converted = arrow::TimeUnit::SECOND;
      break;
    case TimeUnit::MILLI:
      converted = arrow::TimeUnit::MILLI;
      break;
    case TimeUnit::MICRO:
      converted = arrow::TimeUnit::MICRO;
      break;
    case TimeUnit::NANO:
      converted = arrow::TimeUnit::NANO;
      break;
  }
  return converted;
}

std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &a_type) {
switch (a_type->id()) {
case arrow::Type::BOOL:return cylon::Bool();
case arrow::Type::UINT8:return cylon::UInt8();
case arrow::Type::INT8:return cylon::Int8();
Expand All @@ -126,15 +156,29 @@ std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &ar
case arrow::Type::HALF_FLOAT:return cylon::HalfFloat();
case arrow::Type::FLOAT:return cylon::Float();
case arrow::Type::DOUBLE:return cylon::Double();
case arrow::Type::FIXED_SIZE_BINARY:
return cylon::FixedSizeBinary(std::static_pointer_cast<arrow::FixedSizeBinaryType>(a_type)
->byte_width());
case arrow::Type::BINARY:return cylon::Binary();
case arrow::Type::FIXED_SIZE_BINARY:return cylon::FixedBinary();
case arrow::Type::STRING:return cylon::String();
case arrow::Type::LARGE_STRING: return cylon::LargeString();
case arrow::Type::LARGE_BINARY: return cylon::LargeBinary();
case arrow::Type::DATE32:return cylon::Date32();
case arrow::Type::DATE64:return cylon::Date64();
case arrow::Type::TIMESTAMP:return cylon::Timestamp();
case arrow::Type::TIMESTAMP: {
const auto &casted = std::static_pointer_cast<arrow::TimestampType>(a_type);
return cylon::Timestamp(ToCylonTimeUnit(casted->unit()), casted->timezone());
}
case arrow::Type::TIME32:return cylon::Time32();
case arrow::Type::TIME64:return cylon::Time64();
case arrow::Type::DECIMAL:return cylon::Decimal();
case arrow::Type::DECIMAL128: {
const auto &casted = std::static_pointer_cast<arrow::Decimal128Type>(a_type);
return cylon::Decimal(16, casted->precision(), casted->scale());
}
case arrow::Type::DECIMAL256: {
  // A DECIMAL256 id means the object is an arrow::Decimal256Type. Casting it to
  // the sibling arrow::Decimal128Type is an invalid downcast, so cast to the
  // actual 256-bit type. 32 is the byte width of a 256-bit decimal.
  const auto &casted = std::static_pointer_cast<arrow::Decimal256Type>(a_type);
  return cylon::Decimal(32, casted->precision(), casted->scale());
}
case arrow::Type::NA:break;
case arrow::Type::INTERVAL_MONTHS:break;
case arrow::Type::INTERVAL_DAY_TIME:break;
Expand All @@ -147,11 +191,8 @@ std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &ar
case arrow::Type::EXTENSION:break;
case arrow::Type::FIXED_SIZE_LIST:break;
case arrow::Type::DURATION:break;
case arrow::Type::LARGE_STRING:break;
case arrow::Type::LARGE_BINARY:break;
case arrow::Type::LARGE_LIST:break;
case arrow::Type::MAX_ID:break;
case arrow::Type::DECIMAL256:break;
}
return nullptr;
}
Expand Down Expand Up @@ -180,8 +221,7 @@ Type::type ToCylonTypeId(const std::shared_ptr<arrow::DataType> &type) {
case arrow::Type::TIME64:return Type::TIME64;
case arrow::Type::LARGE_STRING:return Type::LARGE_STRING;
case arrow::Type::LARGE_BINARY:return Type::LARGE_BINARY;
default:
return Type::MAX_ID;
default:return Type::MAX_ID;
}
}

Expand Down
16 changes: 8 additions & 8 deletions cpp/src/cylon/arrow/arrow_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,28 +26,28 @@ namespace tarrow {

/**
* Convert a cylon type to an arrow type
* @param tType the cylon type
* @param type the cylon type
* @return corresponding arrow type
*/
// todo: need to handle this better
std::shared_ptr<arrow::DataType> convertToArrowType(const std::shared_ptr<DataType> &tType, int32_t width = -1);
std::shared_ptr<arrow::DataType> ToArrowType(const std::shared_ptr<DataType> &type);

/**
* Convert arrow data type pointer to Cylon Data type pointer
* @param arr_type
* @param a_type the arrow data type to convert
* @return the corresponding Cylon data type, or nullptr if there is no Cylon equivalent
*/
std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &arr_type);
std::shared_ptr<DataType> ToCylonType(const std::shared_ptr<arrow::DataType> &a_type);

TimeUnit::type ToCylonTimeUnit(arrow::TimeUnit::type a_time_unit);
arrow::TimeUnit::type ToArrowTimeUnit(TimeUnit::type time_unit);

Type::type ToCylonTypeId(const std::shared_ptr<arrow::DataType> &type);

/**
* Validate the types of an arrow table
* Checks if the types of an arrow table are supported in Cylon
* @param table the arrow table whose column types are checked
* @return Status::OK() if every column type is supported, otherwise a NotImplemented status naming the unsupported type
*/
bool validateArrowTableTypes(const std::shared_ptr<arrow::Table> &table);

cylon::Status CheckSupportedTypes(const std::shared_ptr<arrow::Table> &table);

} // namespace tarrow
Expand Down
1 change: 1 addition & 0 deletions cpp/src/cylon/compute/aggregate_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ struct QuantileKernelOptions : public KernelOptions {
//};

/**
 * Polymorphic interface for an aggregation operation: exposes the operation's
 * id and, optionally, the kernel options attached to it.
 */
struct AggregationOp {
  // Virtual so implementations can be destroyed through an AggregationOp pointer.
  virtual ~AggregationOp() = default;

  /** @return the identifier of this aggregation operation */
  virtual AggregationOpId id() const = 0;

  /** @return the kernel options for this operation; nullptr in this base implementation */
  virtual KernelOptions *options() const {
    return nullptr;
  }
};
Expand Down
51 changes: 51 additions & 0 deletions cpp/src/cylon/data_types.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#include "data_types.hpp"

namespace cylon {

/**
 * Create a fixed-size binary data type.
 * @param byte_width width stored in the created type's byte_width_
 * @return the new FixedSizeBinaryType as a shared DataType pointer
 */
std::shared_ptr<DataType> FixedSizeBinary(int32_t byte_width) {
  std::shared_ptr<DataType> created = std::make_shared<FixedSizeBinaryType>(byte_width);
  return created;
}

/**
 * Create a timestamp data type.
 * @param unit time unit of the timestamp
 * @param time_zone time zone string; taken by value and moved into the type
 * @return the new TimestampType as a shared DataType pointer
 */
std::shared_ptr<DataType> Timestamp(TimeUnit::type unit, std::string time_zone) {
  std::shared_ptr<DataType> created =
      std::make_shared<TimestampType>(unit, std::move(time_zone));
  return created;
}

/**
 * Create a duration data type.
 * @param unit time unit of the duration
 * @return the new DurationType as a shared DataType pointer
 */
std::shared_ptr<DataType> Duration(TimeUnit::type unit) {
  std::shared_ptr<DataType> created = std::make_shared<DurationType>(unit);
  return created;
}

/**
 * Create a decimal data type.
 * @param byte_width storage width of the decimal in bytes
 * @param precision decimal precision
 * @param scale decimal scale
 * @return the new DecimalType as a shared DataType pointer
 */
std::shared_ptr<DataType> Decimal(int32_t byte_width, int32_t precision, int32_t scale) {
  std::shared_ptr<DataType> created =
      std::make_shared<DecimalType>(byte_width, precision, scale);
  return created;
}

/**
 * Construct a fixed-size binary type tagged Type::FIXED_SIZE_BINARY with a
 * fixed-width layout.
 * @param byte_width value stored in byte_width_
 */
FixedSizeBinaryType::FixedSizeBinaryType(int32_t byte_width)
    : DataType(Type::FIXED_SIZE_BINARY, Layout::FIXED_WIDTH),
      byte_width_(byte_width) {
}

/**
 * Construct a fixed-width type that reuses the fixed-size binary storage but
 * reports a different type id (DecimalType passes Type::DECIMAL here).
 * @param byte_width value stored in byte_width_
 * @param override_type type id forwarded to the DataType base
 */
FixedSizeBinaryType::FixedSizeBinaryType(int32_t byte_width, Type::type override_type)
    : DataType(override_type, Layout::FIXED_WIDTH),
      byte_width_(byte_width) {
}

/**
 * Construct a timestamp type tagged Type::TIMESTAMP with a fixed-width layout.
 * @param unit time unit stored in unit_
 * @param timezone time zone string; taken by value and moved into timezone_
 */
TimestampType::TimestampType(TimeUnit::type unit, std::string timezone)
    : DataType(Type::TIMESTAMP, Layout::FIXED_WIDTH),
      unit_(unit),
      timezone_(std::move(timezone)) {
}

/**
 * Construct a duration type tagged Type::DURATION with a fixed-width layout.
 * @param unit time unit stored in unit_
 */
DurationType::DurationType(TimeUnit::type unit)
    : DataType(Type::DURATION, Layout::FIXED_WIDTH),
      unit_(unit) {
}

/**
 * Construct a decimal type. Storage is delegated to the FixedSizeBinaryType
 * base, with the type id overridden to Type::DECIMAL.
 * @param byte_width storage width in bytes, forwarded to the base
 * @param precision value stored in precision_
 * @param scale value stored in scale_
 */
DecimalType::DecimalType(int32_t byte_width, int32_t precision, int32_t scale)
    : FixedSizeBinaryType(byte_width, Type::DECIMAL),
      precision_(precision),
      scale_(scale) {
}
}
Loading

0 comments on commit 9c2fdc4

Please sign in to comment.