From dd45596dfbe98d2885b7abf578860a5cc778207c Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 10 Jan 2020 14:52:42 -0800 Subject: [PATCH 01/10] Add defs and imlementation for OneHotEncoders, adjuist date_time_transformer kernel and test. Test now fails. --- cgmanifest.json | 2 +- cmake/external/featurizers.cmake | 2 +- .../graph/featurizers_ops/featurizers_defs.cc | 367 ++++++++++++++---- .../cpu/cat_imputer_transformer.cc | 2 +- .../cpu/date_time_transformer.cc | 7 +- .../hash_one_hot_vectorizer_transformer.cc | 90 +++++ .../cpu/imputation_marker_transformer.cc | 4 +- .../cpu/label_encoder_transformer.cc | 2 +- .../cpu/max_abs_scalar_transformer.cc | 3 +- .../cpu/missing_dummies_transformer.cc | 3 +- .../cpu/one_hot_encoder_transformer.cc | 90 +++++ .../cpu_featurizers_kernels.cc | 4 + .../datetimetransformer_test.cc | 14 +- 13 files changed, 501 insertions(+), 89 deletions(-) create mode 100644 onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc create mode 100644 onnxruntime/featurizers_ops/cpu/one_hot_encoder_transformer.cc diff --git a/cgmanifest.json b/cgmanifest.json index 62ad0a3031625..cd688939b1d6d 100644 --- a/cgmanifest.json +++ b/cgmanifest.json @@ -450,7 +450,7 @@ { "component": { "git": { - "commitHash": "573070aeeb77e267da2579ac1d75d92c688bbe97", + "commitHash": "4948801a488d7c97ba21ca5c07ccc2977c3838af", "repositoryUrl": "https://github.com/microsoft/FeaturizersLibrary.git" }, "type": "git" diff --git a/cmake/external/featurizers.cmake b/cmake/external/featurizers.cmake index 4700e85f032db..90e5fe96201f7 100644 --- a/cmake/external/featurizers.cmake +++ b/cmake/external/featurizers.cmake @@ -3,7 +3,7 @@ # This source code should not depend on the onnxruntime and may be built independently set(featurizers_URL "https://github.com/microsoft/FeaturizersLibrary.git") -set(featurizers_TAG "573070aeeb77e267da2579ac1d75d92c688bbe97") +set(featurizers_TAG "4948801a488d7c97ba21ca5c07ccc2977c3838af") set(featurizers_pref FeaturizersLibrary) set(featurizers_ROOT ${PROJECT_SOURCE_DIR}/external/${featurizers_pref}) diff --git a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc index c46822774918d..6497e6cd882bd 100644 --- a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc +++ b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc @@ -34,11 +34,13 @@ using ONNX_NAMESPACE::OPTIONAL; // Forward declarations static void RegisterCatImputerFeaturizerVer1(); static void RegisterDateTimeFeaturizerVer1(); +static void RegisterHashOneHotVectorizerFeaturizerVer1(); static void RegisterImputationMarkerFeaturizerVer1(); static void RegisterLabelEncoderFeaturizerVer1(); static void RegisterMaxAbsScalarFeaturizerVer1(); static void RegisterMinMaxScalarFeaturizerVer1(); static void RegisterMissingDummiesFeaturizerVer1(); +static void RegisterOneHotEncoderFeaturizerVer1(); static void RegisterRobustScalarFeaturizerVer1(); static void RegisterStringFeaturizerVer1(); static void RegisterTimeSeriesImputerFeaturizerVer1(); @@ -49,11 +51,13 @@ static void RegisterTimeSeriesImputerFeaturizerVer1(); void RegisterMSFeaturizersSchemas() { RegisterCatImputerFeaturizerVer1(); RegisterDateTimeFeaturizerVer1(); + RegisterHashOneHotVectorizerFeaturizerVer1(); RegisterImputationMarkerFeaturizerVer1(); RegisterLabelEncoderFeaturizerVer1(); RegisterMaxAbsScalarFeaturizerVer1(); RegisterMinMaxScalarFeaturizerVer1(); RegisterMissingDummiesFeaturizerVer1(); + RegisterOneHotEncoderFeaturizerVer1(); 
RegisterRobustScalarFeaturizerVer1(); RegisterStringFeaturizerVer1(); RegisterTimeSeriesImputerFeaturizerVer1(); @@ -70,15 +74,15 @@ void RegisterCatImputerFeaturizerVer1() { within the host frameworks and programming languages. C++-style pseudo signature: - std::float_t execute(std::float_t const &value); - std::double_t execute(std::double_t const &value); + float execute(float const &value); + double execute(double const &value); template T execute(std::optional const &value); Examples (where 55.5 is the mode value): execute(1.0) -> 1.0 execute(NaN) -> 55.5 execute(2.0) -> 2.0 - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(CatImputerTransformer) .SinceVersion(1) @@ -121,7 +125,7 @@ void RegisterDateTimeFeaturizerVer1() { Extracts various datetime-related values from a UTC time_point. C++-style pseudo signature: - TimePoint execute(std::chron::system_clock::time_point const &value); + TimePoint execute(std::chrono::system_clock::time_point const &value); Examples: Given a time_point 'value' representing "November 17, 1976 12:27:04PM": @@ -149,7 +153,7 @@ void RegisterDateTimeFeaturizerVer1() { "holidayName": "", "isPaidTimeOff": 0 } - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(DateTimeTransformer) .SinceVersion(1) @@ -209,52 +213,191 @@ void RegisterDateTimeFeaturizerVer1() { .TypeAndShapeInferenceFunction( [](ONNX_NAMESPACE::InferenceContext& ctx) { const bool has_shape = hasInputShape(ctx, 1); - for (int output = 0; output < 21; ++output) { - switch (output) { - case 0: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, output); - break; - case 1: // fall through - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - case 8: - case 9: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output); - break; - case 10: // fall through - case 11: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT16, output); - break; - case 12: // fall through - case 13: - case 14: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output); - break; - case 15: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, output); - break; - case 16: - case 17: - case 18: - case 19: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, output); - break; - case 20: - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output); - break; - default: - assert(false); - break; - } - if (has_shape) { - propagateShapeFromInputToOutput(ctx, 1, output); - } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, 0); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 0); } - }); + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 1); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 1); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 2); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 2); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 3); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 3); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 4); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 4); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 5); + if(has_shape) { + 
propagateShapeFromInputToOutput(ctx, 1, 5); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 6); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 6); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 7); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 7); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 8); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 8); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 9); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 9); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT16, 10); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 10); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT16, 11); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 11); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 12); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 12); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 13); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 13); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 14); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 14); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, 15); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 15); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 16); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 16); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 17); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 17); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 18); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 18); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 19); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 19); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 20); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 20); + } + + } + ); +} + +void RegisterHashOneHotVectorizerFeaturizerVer1() { + static const char* doc = R"DOC( + Hashes the input to a categorical value, then produces a one hot encoded vector + based on that value. + + C++-style pseudo signature: + template HashOneHotVectorizerStruct execute(T const &value); + + Examples: + Assuming the hashing algorithm... 
+ "A" -> 1 + "B" -> 2 + "C" -> 5 + + and 'numCols' set to 8: + + execute("A") -> [1, 0, 0, 0, 0, 0, 0, 0] + execute("B") -> [0, 1, 0, 0, 0, 0, 0, 0] + execute("C") -> [0, 0, 0, 0, 1, 0, 0, 0] + )DOC"; + + MS_FEATURIZERS_OPERATOR_SCHEMA(HashOneHotVectorizerTransformer) + .SinceVersion(1) + .SetDomain(kMSFeaturizersDomain) + .SetDoc(doc) + .Input( + 0, + "State", + "State generated during training that is used for prediction", + "T0") + .Input( + 1, + "Input", + "No information is available", + "InputT") + .Output(0, "NumElements", "No information available", "OutputT0") + .Output(1, "Value", "No information available", "OutputT1") + .Output(2, "Index", "No information available", "OutputT0") + .TypeConstraint( + "T0", + {"tensor(uint8)"}, + "No information is available") + .TypeConstraint( + "InputT", + {"tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(float)", "tensor(double)", "tensor(bool)", "tensor(string)"}, + "No information is available") + .TypeConstraint( + "OutputT0", + {"tensor(uint64)"}, + "No information is available") + .TypeConstraint( + "OutputT1", + {"tensor(uint8)"}, + "No information is available") + .TypeAndShapeInferenceFunction( + [](ONNX_NAMESPACE::InferenceContext& ctx) { + const bool has_shape = hasInputShape(ctx, 1); + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 0); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 0); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 1); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 1); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 2); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 2); + } + + } + ); } void RegisterImputationMarkerFeaturizerVer1() { @@ -262,17 +405,17 @@ void RegisterImputationMarkerFeaturizerVer1() { Returns true if the input is null, false if it is not. C++-style pseudo signature: - bool execute(std::float_t const &value); - bool execute(std::double_t const &value); + bool execute(float const &value); + bool execute(double const &value); template bool execute(std::optional const &value); Examples: 3.0 -> false NaN -> true "foo" -> false - std::optional() -> true - std::optional("bar") -> false - )DOC"; + std::optional() -> true + std::optional("bar") -> false + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(ImputationMarkerTransformer) .SinceVersion(1) @@ -315,7 +458,7 @@ void RegisterLabelEncoderFeaturizerVer1() { Returns a unique id for the input based on all values encountered during training. C++-style pseudo signature: - template std::uint32_t execute(T const &value); + template uint32 execute(T const &value); Examples: Assuming the training data of ["A", "B", "C"]... @@ -324,7 +467,7 @@ void RegisterLabelEncoderFeaturizerVer1() { execute("B") -> 2 execute("C") -> 3 execute("This value was not seen during training") -> 0 - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(LabelEncoderTransformer) .SinceVersion(1) @@ -367,8 +510,8 @@ void RegisterMaxAbsScalarFeaturizerVer1() { Scales input based on the maximum absolute value of all data encountered during training. 
C++-style pseudo signature: - std::float_t execute(std::uint16_t value); - std::double_t execute(std::uint32_t value); + float execute(uint16 value); + double execute(uint32 value); Examples: Given a training set of [1.0, -2.0, 3.0, -4.0], where 4.0 is the absolute value of the @@ -377,7 +520,7 @@ void RegisterMaxAbsScalarFeaturizerVer1() { execute(1.0) -> 1.0 / 4.0 execute(-4.0) -> -4.0 / 4.0 execute(100.0) -> 100 / 4.0 - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(MaxAbsScalarTransformer) .SinceVersion(1) @@ -426,11 +569,13 @@ void RegisterMaxAbsScalarFeaturizerVer1() { input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) { propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, 0); } else { - fail_type_inference("input 1 is expected to have a accepted type"); + fail_type_inference("input 1 is expected to have an accepted type"); } + if (hasInputShape(ctx, 1)) { propagateShapeFromInputToOutput(ctx, 1, 0); } + }); } @@ -440,7 +585,7 @@ void RegisterMinMaxScalarFeaturizerVer1() { during training. C++-style pseudo signature: - template std::double_t(T const &value); + template double(T const &value); Examples: Given the training data [1, 2, 3, 4, 5]; @@ -450,7 +595,7 @@ void RegisterMinMaxScalarFeaturizerVer1() { execute(2) = 2 / 4 execute(20) = 20 / 4 - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(MinMaxScalarTransformer) .SinceVersion(1) @@ -493,17 +638,17 @@ void RegisterMissingDummiesFeaturizerVer1() { Returns 1 if the input is null, 0 if it is not. C++-style pseudo signature: - std::int8_t execute(std::float_t const &value); - std::int8_t execute(std::double_t const &value); - template std::int8_t execute(T const &value); + int8 execute(float const &value); + int8 execute(double const &value); + template int8 execute(T const &value); Examples: 1.0 -> 0 NaN -> 1 "foo" -> 0 - std::optional() -> 1 - std::optional("bar") -> 0 - )DOC"; + std::optional() -> 1 + std::optional("bar") -> 0 + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(MissingDummiesTransformer) .SinceVersion(1) @@ -541,6 +686,80 @@ void RegisterMissingDummiesFeaturizerVer1() { }); } +void RegisterOneHotEncoderFeaturizerVer1() { + static const char* doc = R"DOC( + Produces a one hot vector based on categories calculated during training. + + C++-style pseudo signature: + template OneHotVector execute(T const &value); + + Examples: + Assuming the training data [10, 20, 30, 40]... 
+ + execute(10) -> [0, 1, 0, 0, 0] + execute(20) -> [0, 0, 1, 0, 0] + execute(30) -> [0, 0, 0, 1, 0] + execute(40) -> [0, 0, 0, 0, 1] + execute(200) -> [1, 0, 0, 0, 0] + execute(-1) -> [1, 0, 0, 0, 0] + )DOC"; + + MS_FEATURIZERS_OPERATOR_SCHEMA(OneHotEncoderTransformer) + .SinceVersion(1) + .SetDomain(kMSFeaturizersDomain) + .SetDoc(doc) + .Input( + 0, + "State", + "State generated during training that is used for prediction", + "T0") + .Input( + 1, + "Input", + "No information is available", + "InputT") + .Output(0, "NumElements", "No information available", "OutputT0") + .Output(1, "Value", "No information available", "OutputT1") + .Output(2, "Index", "No information available", "OutputT0") + .TypeConstraint( + "T0", + {"tensor(uint8)"}, + "No information is available") + .TypeConstraint( + "InputT", + {"tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(float)", "tensor(double)", "tensor(bool)", "tensor(string)"}, + "No information is available") + .TypeConstraint( + "OutputT0", + {"tensor(uint64)"}, + "No information is available") + .TypeConstraint( + "OutputT1", + {"tensor(uint8)"}, + "No information is available") + .TypeAndShapeInferenceFunction( + [](ONNX_NAMESPACE::InferenceContext& ctx) { + const bool has_shape = hasInputShape(ctx, 1); + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 0); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 0); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 1); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 1); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 2); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 2); + } + + } + ); +} + void RegisterRobustScalarFeaturizerVer1() { static const char* doc = R"DOC( MinMaxScalarEstimator + centering? @@ -550,7 +769,7 @@ void RegisterRobustScalarFeaturizerVer1() { Examples: TODO - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(RobustScalarTransformer) .SinceVersion(1) @@ -599,11 +818,13 @@ void RegisterRobustScalarFeaturizerVer1() { input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) { propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, 0); } else { - fail_type_inference("input 1 is expected to have a accepted type"); + fail_type_inference("input 1 is expected to have an accepted type"); } + if (hasInputShape(ctx, 1)) { propagateShapeFromInputToOutput(ctx, 1, 0); } + }); } @@ -612,12 +833,12 @@ void RegisterStringFeaturizerVer1() { Converts the input into a string representation based on the input's type. C++-style pseudo signature: - template std::string execute(T const &value); + template string execute(T const &value); Examples: execute(1) -> "1" execute(3.14) -> "3.14" - )DOC"; + )DOC"; MS_FEATURIZERS_OPERATOR_SCHEMA(StringTransformer) .SinceVersion(1) diff --git a/onnxruntime/featurizers_ops/cpu/cat_imputer_transformer.cc b/onnxruntime/featurizers_ops/cpu/cat_imputer_transformer.cc index b2adb9108f8b9..7a5ee782e3d2f 100644 --- a/onnxruntime/featurizers_ops/cpu/cat_imputer_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/cat_imputer_transformer.cc @@ -27,7 +27,7 @@ inline nonstd::optional PreprocessOptional(std::string value) { return value.empty() ? 
nonstd::optional() : nonstd::optional(std::move(value)); } -template +template struct CatImputerTransformerImpl { void operator()(OpKernelContext* ctx) const { // Create the transformer diff --git a/onnxruntime/featurizers_ops/cpu/date_time_transformer.cc b/onnxruntime/featurizers_ops/cpu/date_time_transformer.cc index 0e25e58fac68e..13f780414d988 100644 --- a/onnxruntime/featurizers_ops/cpu/date_time_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/date_time_transformer.cc @@ -3,6 +3,7 @@ #include "core/common/common.h" #include "core/framework/data_types.h" +#include "core/framework/data_types_internal.h" #include "core/framework/op_kernel.h" #include "Featurizers/DateTimeFeaturizer.h" @@ -29,7 +30,7 @@ class DateTimeTransformer final : public OpKernel { // Get the input const auto* input_tensor(ctx->Input(1)); - const std::int64_t* input_data(input_tensor->Data()); + const int64_t* input_data(input_tensor->Data()); // Prepare the output Tensor* year_tensor(ctx->Output(0, input_tensor->Shape())); @@ -80,7 +81,7 @@ class DateTimeTransformer final : public OpKernel { const int64_t length(input_tensor->Shape().Size()); for (int64_t i = 0; i < length; ++i) { - auto result(transformer.execute(input_data[i])); + auto result(transformer.execute(std::chrono::system_clock::from_time_t(input_data[i]))); year_data[i] = std::move(result.year); month_data[i] = std::move(result.month); @@ -116,7 +117,7 @@ ONNX_OPERATOR_KERNEL_EX( kCpuExecutionProvider, KernelDefBuilder() .TypeConstraint("T0", DataTypeImpl::GetTensorType()) - .TypeConstraint("T1", DataTypeImpl::GetTensorType()), + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), DateTimeTransformer); } // namespace featurizers diff --git a/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc b/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc new file mode 100644 index 0000000000000..747dcf885cb23 --- /dev/null +++ b/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
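The one hot kernels added in this change return each encoded element as a (NumElements, Value, Index) triple instead of a materialized vector: NumElements is the length of the encoded vector, Value is the value stored at the hot position (1 for these featurizers), and Index is that position. A minimal sketch of expanding one triple back into the dense rows shown in the schema docs above; the helper name is illustrative and not part of the kernels:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Expand a single (NumElements, Value, Index) triple into its dense form:
    // NumElements entries, all zero except position Index, which holds Value.
    std::vector<uint8_t> ToDense(uint64_t num_elements, uint8_t value, uint64_t index) {
      assert(index < num_elements);
      std::vector<uint8_t> dense(static_cast<size_t>(num_elements), 0);
      dense[static_cast<size_t>(index)] = value;
      return dense;
    }

    // For example, ToDense(8, 1, 4) yields [0, 0, 0, 0, 1, 0, 0, 0], matching the
    // dense rows in the HashOneHotVectorizer schema doc.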
+ +#include "core/common/common.h" +#include "core/framework/data_types.h" +#include "core/framework/data_types_internal.h" +#include "core/framework/op_kernel.h" + +#include "Featurizers/HashOneHotVectorizerFeaturizer.h" +#include "Archive.h" + +namespace onnxruntime { +namespace featurizers { + +template +struct HashOneHotVectorizerTransformerImpl { + void operator()(OpKernelContext* ctx) const { + // Create the transformer + Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer transformer( + [ctx](void) { + const auto* state_tensor(ctx->Input(0)); + const uint8_t* const state_data(state_tensor->Data()); + + Microsoft::Featurizer::Archive archive(state_data, state_tensor->Shape().GetDims()[0]); + return Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer(archive); + }()); + + // Get the input + const auto* input_tensor(ctx->Input(1)); + const InputT* input_data(input_tensor->Data()); + + // Prepare the output + Tensor* NumElements_tensor(ctx->Output(0, input_tensor->Shape())); + Tensor* Value_tensor(ctx->Output(1, input_tensor->Shape())); + Tensor* Index_tensor(ctx->Output(2, input_tensor->Shape())); + + uint64_t* NumElements_data(NumElements_tensor->MutableData()); + uint8_t* Value_data(Value_tensor->MutableData()); + uint64_t* Index_data(Index_tensor->MutableData()); + + // Execute + const int64_t length(input_tensor->Shape().Size()); + + for (int64_t i = 0; i < length; ++i) { + auto result(transformer.execute(input_data[i])); + + NumElements_data[i] = std::move(result.NumElements); + Value_data[i] = std::move(result.Value); + Index_data[i] = std::move(result.Index); + } + } +}; + +class HashOneHotVectorizerTransformer final : public OpKernel { + public: + explicit HashOneHotVectorizerTransformer(const OpKernelInfo& info) : OpKernel(info) { + } + + Status Compute(OpKernelContext* ctx) const override { + utils::MLTypeCallDispatcher + t_disp(ctx->Input(1)->GetElementType()); + t_disp.Invoke(ctx); + return Status::OK(); + } +}; + +ONNX_OPERATOR_KERNEL_EX( + HashOneHotVectorizerTransformer, + kMSFeaturizersDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T0", DataTypeImpl::GetTensorType()) + .TypeConstraint("InputT", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + HashOneHotVectorizerTransformer); + +} // namespace featurizers +} // namespace onnxruntime diff --git a/onnxruntime/featurizers_ops/cpu/imputation_marker_transformer.cc b/onnxruntime/featurizers_ops/cpu/imputation_marker_transformer.cc index a0f36a943ff01..2612f448e7335 100644 --- a/onnxruntime/featurizers_ops/cpu/imputation_marker_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/imputation_marker_transformer.cc @@ -54,8 +54,7 @@ class ImputationMarkerTransformer final : public OpKernel { } Status Compute(OpKernelContext* ctx) const override { - utils::MLTypeCallDispatcher - t_disp(ctx->Input(1)->GetElementType()); + utils::MLTypeCallDispatcher t_disp(ctx->Input(1)->GetElementType()); t_disp.Invoke(ctx); return Status::OK(); } @@ -72,5 +71,6 @@ ONNX_OPERATOR_KERNEL_EX( DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), ImputationMarkerTransformer); + } // namespace featurizers } // namespace onnxruntime diff 
--git a/onnxruntime/featurizers_ops/cpu/label_encoder_transformer.cc b/onnxruntime/featurizers_ops/cpu/label_encoder_transformer.cc index 96ec25285133c..6350da43fc906 100644 --- a/onnxruntime/featurizers_ops/cpu/label_encoder_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/label_encoder_transformer.cc @@ -31,7 +31,7 @@ struct LabelEncoderTransformerImpl { // Prepare the output Tensor* output_tensor(ctx->Output(0, input_tensor->Shape())); - std::uint32_t* output_data(output_tensor->MutableData()); + uint32_t* output_data(output_tensor->MutableData()); // Execute const int64_t length(input_tensor->Shape().Size()); diff --git a/onnxruntime/featurizers_ops/cpu/max_abs_scalar_transformer.cc b/onnxruntime/featurizers_ops/cpu/max_abs_scalar_transformer.cc index 8fea4f9fb806e..0208db93af0bf 100644 --- a/onnxruntime/featurizers_ops/cpu/max_abs_scalar_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/max_abs_scalar_transformer.cc @@ -72,7 +72,8 @@ class MaxAbsScalarTransformer final : public OpKernel { Status Compute(OpKernelContext* ctx) const override { utils::MLTypeCallDispatcher t_disp(ctx->Input(1)->GetElementType()); + int64_t, uint64_t, float, double> + t_disp(ctx->Input(1)->GetElementType()); t_disp.Invoke(ctx); return Status::OK(); } diff --git a/onnxruntime/featurizers_ops/cpu/missing_dummies_transformer.cc b/onnxruntime/featurizers_ops/cpu/missing_dummies_transformer.cc index 255bfd78d3cc3..5d40abcf761ad 100644 --- a/onnxruntime/featurizers_ops/cpu/missing_dummies_transformer.cc +++ b/onnxruntime/featurizers_ops/cpu/missing_dummies_transformer.cc @@ -54,8 +54,7 @@ class MissingDummiesTransformer final : public OpKernel { } Status Compute(OpKernelContext* ctx) const override { - utils::MLTypeCallDispatcher - t_disp(ctx->Input(1)->GetElementType()); + utils::MLTypeCallDispatcher t_disp(ctx->Input(1)->GetElementType()); t_disp.Invoke(ctx); return Status::OK(); } diff --git a/onnxruntime/featurizers_ops/cpu/one_hot_encoder_transformer.cc b/onnxruntime/featurizers_ops/cpu/one_hot_encoder_transformer.cc new file mode 100644 index 0000000000000..1a299456ecbed --- /dev/null +++ b/onnxruntime/featurizers_ops/cpu/one_hot_encoder_transformer.cc @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
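As with the other featurizer kernels in this series, the transformer below is reconstructed from the uint8 State tensor, which is expected to contain the byte stream produced by the FeaturizersLibrary Archive. A minimal sketch of producing such a blob, using the non-templated DateTimeTransformer constructor that appears elsewhere in this series (the function name is illustrative; featurizers that require training, such as OneHotEncoder, build the transformer from a trained estimator first, as the unit tests added in a later patch do):

    #include <cstdint>
    #include <vector>

    #include "Featurizers/DateTimeFeaturizer.h"
    #include "Archive.h"

    // Serialize a configured transformer; the returned bytes are what these
    // kernels expect in their "State" input tensor.
    std::vector<uint8_t> MakeStateBlob() {
      Microsoft::Featurizer::Featurizers::DateTimeTransformer transformer("", "");
      Microsoft::Featurizer::Archive archive;
      transformer.save(archive);
      return archive.commit();
    }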
+ +#include "core/common/common.h" +#include "core/framework/data_types.h" +#include "core/framework/data_types_internal.h" +#include "core/framework/op_kernel.h" + +#include "Featurizers/OneHotEncoderFeaturizer.h" +#include "Archive.h" + +namespace onnxruntime { +namespace featurizers { + +template +struct OneHotEncoderTransformerImpl { + void operator()(OpKernelContext* ctx) const { + // Create the transformer + Microsoft::Featurizer::Featurizers::OneHotEncoderTransformer transformer( + [ctx](void) { + const auto* state_tensor(ctx->Input(0)); + const uint8_t* const state_data(state_tensor->Data()); + + Microsoft::Featurizer::Archive archive(state_data, state_tensor->Shape().GetDims()[0]); + return Microsoft::Featurizer::Featurizers::OneHotEncoderTransformer(archive); + }()); + + // Get the input + const auto* input_tensor(ctx->Input(1)); + const InputT* input_data(input_tensor->Data()); + + // Prepare the output + Tensor* NumElements_tensor(ctx->Output(0, input_tensor->Shape())); + Tensor* Value_tensor(ctx->Output(1, input_tensor->Shape())); + Tensor* Index_tensor(ctx->Output(2, input_tensor->Shape())); + + uint64_t* NumElements_data(NumElements_tensor->MutableData()); + uint8_t* Value_data(Value_tensor->MutableData()); + uint64_t* Index_data(Index_tensor->MutableData()); + + // Execute + const int64_t length(input_tensor->Shape().Size()); + + for (int64_t i = 0; i < length; ++i) { + auto result(transformer.execute(input_data[i])); + + NumElements_data[i] = std::move(result.NumElements); + Value_data[i] = std::move(result.Value); + Index_data[i] = std::move(result.Index); + } + } +}; + +class OneHotEncoderTransformer final : public OpKernel { + public: + explicit OneHotEncoderTransformer(const OpKernelInfo& info) : OpKernel(info) { + } + + Status Compute(OpKernelContext* ctx) const override { + utils::MLTypeCallDispatcher + t_disp(ctx->Input(1)->GetElementType()); + t_disp.Invoke(ctx); + return Status::OK(); + } +}; + +ONNX_OPERATOR_KERNEL_EX( + OneHotEncoderTransformer, + kMSFeaturizersDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T0", DataTypeImpl::GetTensorType()) + .TypeConstraint("InputT", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + OneHotEncoderTransformer); + +} // namespace featurizers +} // namespace onnxruntime diff --git a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc index 2fa7168880743..96326b10de936 100644 --- a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc +++ b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc @@ -12,11 +12,13 @@ namespace featurizers { // Forward declarations class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, CatImputerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, DateTimeTransformer); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, HashOneHotVectorizerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, ImputationMarkerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, 
LabelEncoderTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MaxAbsScalarTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MinMaxScalarTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MissingDummiesTransformer); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, OneHotEncoderTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, RobustScalarTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, StringTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, TimeSeriesImputerTransformer); @@ -25,11 +27,13 @@ Status RegisterCpuMSFeaturizersKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc index 1044667fd54be..b0f88683d6c8b 100644 --- a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc +++ b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc @@ -15,6 +15,8 @@ namespace test { TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_04) { const time_t date = 217081624; + char buffer[1024]; + std::cout << "Time: " << ctime_s(buffer, sizeof(buffer), &date) << std::endl; OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input @@ -76,6 +78,7 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_04) { TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_05) { const time_t date = 217081625; + const auto date_tp = SysClock::from_time_t(date); OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); @@ -86,7 +89,7 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_05) { test.AddInput("Date", {1}, {date}); dft::DateTimeTransformer dt("", ""); - dft::TimePoint tp(dt.execute(date)); + dft::TimePoint tp(dt.execute(date_tp)); ASSERT_EQ(tp.year, 1976); ASSERT_EQ(tp.month, dft::TimePoint::NOVEMBER); ASSERT_EQ(tp.day, 17); @@ -137,7 +140,9 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_05) { TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17__12_27_05_and_past_1976_nov_17_12_27_04) { const time_t date1 = 217081625; + const auto date1_tp = SysClock::from_time_t(date1); const time_t date2 = 217081624; + const auto date2_tp = SysClock::from_time_t(date2); OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); @@ -148,8 +153,8 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17__12_27_05_and_past_1 test.AddInput("Date", {2}, {date1, date2}); dft::DateTimeTransformer dt("", ""); - dft::TimePoint tp1(dt.execute(date1)); - dft::TimePoint tp2(dt.execute(date2)); + dft::TimePoint tp1(dt.execute(date1_tp)); + dft::TimePoint tp2(dt.execute(date2_tp)); // Date1 ASSERT_EQ(tp1.year, 1976); @@ -225,6 +230,7 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17__12_27_05_and_past_1 TEST(FeaturizersTests, DateTimeTransformer_future_2025_june_30) { 
const time_t date = 1751241600; + const auto date_tp = std::chrono::system_clock::from_time_t(date); OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); @@ -235,7 +241,7 @@ TEST(FeaturizersTests, DateTimeTransformer_future_2025_june_30) { test.AddInput("Date", {1}, {date}); dft::DateTimeTransformer dt("", ""); - dft::TimePoint tp = dt.execute(date); + dft::TimePoint tp = dt.execute(date_tp); ASSERT_EQ(tp.year, 2025); ASSERT_EQ(tp.month, dft::TimePoint::JUNE); ASSERT_EQ(tp.day, 30); From 60431c5a639c5f5f951a3f7efd4deedc45e1a1c7 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 10 Jan 2020 15:57:57 -0800 Subject: [PATCH 02/10] Add OneHotEncoder kernel test. --- .../featurizers_ops/one_hot_encoder_test.cc | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc diff --git a/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc b/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc new file mode 100644 index 0000000000000..d5dbf0ba37c48 --- /dev/null +++ b/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +#include "Featurizers/OneHotEncoderFeaturizer.h" +#include "Featurizers/TestHelpers.h" +#include "Archive.h" + +namespace NS = Microsoft::Featurizer; + +namespace onnxruntime { +namespace test { + +template +std::vector GetStream(const std::vector>& trainingBatches, bool allowMissingValues) { + using Estimator = NS::Featurizers::OneHotEncoderEstimator; + + Estimator estimator(NS::CreateTestAnnotationMapsPtr(1), 0, allowMissingValues); + NS::TestHelpers::Train(estimator, trainingBatches); + Estimator::TransformerUniquePtr pTransformer(estimator.create_transformer()); + + NS::Archive ar; + pTransformer->save(ar); + return ar.commit(); +} + +TEST(FeaturizersTests, OneHotEncoder_uint32_t) { + using InputType = uint32_t; + + auto trainingBatches = NS::TestHelpers::make_vector>( + NS::TestHelpers::make_vector(10, 20, 10), + NS::TestHelpers::make_vector(30), + NS::TestHelpers::make_vector(10, 10, 11, 15), + NS::TestHelpers::make_vector(18, 8)); + + auto stream = GetStream(trainingBatches, false); + auto dim = static_cast(stream.size()); + + OpTester test("OneHotEncoderTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {5}, {11u, 8u, 10u, 15u, 20u}); + + test.AddOutput("NumElements", {5}, {7u, 7u, 7u, 7u, 7u}); + test.AddOutput("Value", {5}, {1u, 1u, 1u, 1u, 1u}); + test.AddOutput("Index", {5}, {2u, 0u, 1u, 3u, 5u}); + + test.Run(); +} + +TEST(FeaturizersTests, OneHotEncoder_string) { + using InputType = std::string; + + auto trainingBatches = NS::TestHelpers::make_vector>( + NS::TestHelpers::make_vector("orange", "apple", "orange", + "grape", "carrot", "carrot", + "peach", "banana", "orange")); + + auto stream = GetStream(trainingBatches, false); + auto dim = static_cast(stream.size()); + + OpTester test("OneHotEncoderTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {4}, {"banana", "grape", "apple", "orange"}); + + test.AddOutput("NumElements", {4}, {6u, 6u, 6u, 6u}); + test.AddOutput("Value", {4}, {1u, 1u, 1u, 1u}); + test.AddOutput("Index", {4}, {1u, 3u, 0u, 4u}); + + test.Run(); +} + +TEST(FeaturizersTests, OneHotEncoder_unseen_values) { + using 
InputType = std::string; + + auto trainingBatches = NS::TestHelpers::make_vector>( + NS::TestHelpers::make_vector("orange", "apple", "orange", + "grape", "carrot", "carrot", + "peach", "banana", "orange")); + + auto stream = GetStream(trainingBatches, true); + auto dim = static_cast(stream.size()); + + OpTester test("OneHotEncoderTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {5}, {"banana", "grape", "apple", "orange", "hello"}); + + test.AddOutput("NumElements", {5}, {7u, 7u, 7u, 7u, 7u}); + test.AddOutput("Value", {5}, {1u, 1u, 1u, 1u, 1u}); + test.AddOutput("Index", {5}, {2u, 4u, 1u, 5u, 0u}); + + test.Run(); +} + +TEST(FeaturizersTests, OneHotEncoder_unseen_values_throws) { + using InputType = std::string; + + auto trainingBatches = NS::TestHelpers::make_vector>( + NS::TestHelpers::make_vector("orange", "apple", "orange", + "grape", "carrot", "carrot", + "peach", "banana", "orange")); + + auto stream = GetStream(trainingBatches, false); + auto dim = static_cast(stream.size()); + + OpTester test("OneHotEncoderTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {5}, {"banana", "grape", "apple", "orange", "hello"}); + + test.AddOutput("NumElements", {5}, {7u, 7u, 7u, 7u, 7u}); + test.AddOutput("Value", {5}, {1u, 1u, 1u, 1u, 1u}); + test.AddOutput("Index", {5}, {2u, 4u, 1u, 5u, 0u}); + + test.Run(OpTester::ExpectResult::kExpectFailure); +} + +} // namespace test +} // namespace onnxruntime From cb2963155a150357e01a3e5e5024396ffa08bc99 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 10 Jan 2020 16:48:38 -0800 Subject: [PATCH 03/10] Add HashOneHotVectorizerTransformer unit test. This does not link due to multiple definitions of functions that are included into header from a CPP file. --- .../hash_one_hot_encoder_transformer_test.cc | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc diff --git a/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc b/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc new file mode 100644 index 0000000000000..915173110b69c --- /dev/null +++ b/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +#include "Featurizers/HashOneHotVectorizerFeaturizer.h" +#include "Featurizers/TestHelpers.h" +#include "Archive.h" + +namespace NS = Microsoft::Featurizer; + +namespace onnxruntime { +namespace test { + +template +std::vector GetStream() { + NS::Featurizers::HashOneHotVectorizerTransformer hvtransformer(2, 100); + NS::Archive ar; + hvtransformer.save(ar); + return ar.commit(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int8) { + using Type = int8_t; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {29u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int32) { + using Type = int32_t; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {22u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_double) { + using Type = double; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15.0}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {99u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_string) { + using Type = std::string; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {"hello"}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {25u}); + + test.Run(); +} + +} // namespace test +} // namespace onnxruntime \ No newline at end of file From aa7e47dd6922338d36339cde8f8438acae439a46 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 11:09:30 -0800 Subject: [PATCH 04/10] Temporarily delete hash_one_hot_encoder_transformer_test --- .../hash_one_hot_encoder_transformer_test.cc | 89 ------------------- 1 file changed, 89 deletions(-) delete mode 100644 onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc diff --git a/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc b/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc deleted file mode 100644 index 915173110b69c..0000000000000 --- a/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "gtest/gtest.h" -#include "test/providers/provider_test_utils.h" - -#include "Featurizers/HashOneHotVectorizerFeaturizer.h" -#include "Featurizers/TestHelpers.h" -#include "Archive.h" - -namespace NS = Microsoft::Featurizer; - -namespace onnxruntime { -namespace test { - -template -std::vector GetStream() { - NS::Featurizers::HashOneHotVectorizerTransformer hvtransformer(2, 100); - NS::Archive ar; - hvtransformer.save(ar); - return ar.commit(); -} - -TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int8) { - using Type = int8_t; - auto stream = GetStream(); - auto dim = static_cast(stream.size()); - - OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); - test.AddInput("State", {dim}, stream); - test.AddInput("Input", {1}, {15}); - - test.AddOutput("NumElements", {1}, {100u}); - test.AddOutput("Value", {1}, {1u}); - test.AddOutput("Index", {1}, {29u}); - - test.Run(); -} - -TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int32) { - using Type = int32_t; - auto stream = GetStream(); - auto dim = static_cast(stream.size()); - - OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); - test.AddInput("State", {dim}, stream); - test.AddInput("Input", {1}, {15}); - - test.AddOutput("NumElements", {1}, {100u}); - test.AddOutput("Value", {1}, {1u}); - test.AddOutput("Index", {1}, {22u}); - - test.Run(); -} - -TEST(FeaturizersTests, HashOneHotVectorizerTransformer_double) { - using Type = double; - auto stream = GetStream(); - auto dim = static_cast(stream.size()); - - OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); - test.AddInput("State", {dim}, stream); - test.AddInput("Input", {1}, {15.0}); - - test.AddOutput("NumElements", {1}, {100u}); - test.AddOutput("Value", {1}, {1u}); - test.AddOutput("Index", {1}, {99u}); - - test.Run(); -} - -TEST(FeaturizersTests, HashOneHotVectorizerTransformer_string) { - using Type = std::string; - auto stream = GetStream(); - auto dim = static_cast(stream.size()); - - OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); - test.AddInput("State", {dim}, stream); - test.AddInput("Input", {1}, {"hello"}); - - test.AddOutput("NumElements", {1}, {100u}); - test.AddOutput("Value", {1}, {1u}); - test.AddOutput("Index", {1}, {25u}); - - test.Run(); -} - -} // namespace test -} // namespace onnxruntime \ No newline at end of file From 498f266980d506917711f30dfe0cc1bb970c7e1b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 11:11:15 -0800 Subject: [PATCH 05/10] Remove debug output --- onnxruntime/test/featurizers_ops/datetimetransformer_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc index b0f88683d6c8b..6996a3b5fb822 100644 --- a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc +++ b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc @@ -15,8 +15,6 @@ namespace test { TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_04) { const time_t date = 217081624; - char buffer[1024]; - std::cout << "Time: " << ctime_s(buffer, sizeof(buffer), &date) << std::endl; OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input From fc6b0aab0ee8cff1e2ebc1de56d013da579e61ec Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 11:21:05 -0800 Subject: [PATCH 06/10] Get State from real 
serialization. --- .../datetimetransformer_test.cc | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc index 6996a3b5fb822..354f201661dee 100644 --- a/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc +++ b/onnxruntime/test/featurizers_ops/datetimetransformer_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "Archive.h" #include "Featurizers/DateTimeFeaturizer.h" namespace dft = Microsoft::Featurizer::Featurizers; @@ -13,12 +14,21 @@ using SysClock = std::chrono::system_clock; namespace onnxruntime { namespace test { +std::vector GetStream () { + dft::DateTimeTransformer dt("", ""); + Microsoft::Featurizer::Archive ar; + dt.save(ar); + return ar.commit(); +} + TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_04) { const time_t date = 217081624; OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input - test.AddInput("State", {8}, {1, 0, 0, 0, 0, 0, 0, 0}); + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + test.AddInput("State", {dim}, stream); // We are adding a scalar Tensor in this instance test.AddInput("Date", {1}, {date}); @@ -81,7 +91,9 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17_12_27_05) { OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input - test.AddInput("State", {8}, {1, 0, 0, 0, 0, 0, 0, 0}); + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + test.AddInput("State", {dim}, stream); // We are adding a scalar Tensor in this instance test.AddInput("Date", {1}, {date}); @@ -145,7 +157,10 @@ TEST(FeaturizersTests, DateTimeTransformer_past_1976_nov_17__12_27_05_and_past_1 OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input - test.AddInput("State", {8}, {1, 0, 0, 0, 0, 0, 0, 0}); + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + test.AddInput("State", {dim}, stream); + // We are adding a scalar Tensor in this instance test.AddInput("Date", {2}, {date1, date2}); @@ -233,7 +248,9 @@ TEST(FeaturizersTests, DateTimeTransformer_future_2025_june_30) { OpTester test("DateTimeTransformer", 1, onnxruntime::kMSFeaturizersDomain); // Add state input - test.AddInput("State", {8}, {1, 0, 0, 0, 0, 0, 0, 0}); + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + test.AddInput("State", {dim}, stream); // We are adding a scalar Tensor in this instance test.AddInput("Date", {1}, {date}); From aed9caf12d42dbc832997c07b4fbdc92b1d9994a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 14:05:42 -0800 Subject: [PATCH 07/10] Temporarily remove hash_one_hot_vectorizer_transformer. 
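The link failure called out in PATCH 03 ("multiple definitions of functions that are included into header from a CPP file") is a one-definition-rule violation: a non-inline function that is defined in a header receives a separate definition in every translation unit that includes it, and the linker then sees duplicates. The offending functions live in the external FeaturizersLibrary headers, so the sketch below is generic, with placeholder names, showing the two usual fixes:

    // some_header.h -- placeholder for a header included from several .cc files.
    #pragma once

    // Fix 1: mark the definition inline so duplicate definitions are merged
    // across translation units at link time.
    inline int HashBucket(int value, int num_cols) {
      return value % num_cols;
    }

    // Fix 2: keep only the declaration in the header ...
    // int HashBucket(int value, int num_cols);
    // ... and move the definition into exactly one .cc file.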
--- .../graph/featurizers_ops/featurizers_defs.cc | 79 ---------------- .../hash_one_hot_vectorizer_transformer.cc | 90 ------------------- .../cpu_featurizers_kernels.cc | 2 - 3 files changed, 171 deletions(-) delete mode 100644 onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc diff --git a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc index 6497e6cd882bd..6b3d185b8fa11 100644 --- a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc +++ b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc @@ -34,7 +34,6 @@ using ONNX_NAMESPACE::OPTIONAL; // Forward declarations static void RegisterCatImputerFeaturizerVer1(); static void RegisterDateTimeFeaturizerVer1(); -static void RegisterHashOneHotVectorizerFeaturizerVer1(); static void RegisterImputationMarkerFeaturizerVer1(); static void RegisterLabelEncoderFeaturizerVer1(); static void RegisterMaxAbsScalarFeaturizerVer1(); @@ -51,7 +50,6 @@ static void RegisterTimeSeriesImputerFeaturizerVer1(); void RegisterMSFeaturizersSchemas() { RegisterCatImputerFeaturizerVer1(); RegisterDateTimeFeaturizerVer1(); - RegisterHashOneHotVectorizerFeaturizerVer1(); RegisterImputationMarkerFeaturizerVer1(); RegisterLabelEncoderFeaturizerVer1(); RegisterMaxAbsScalarFeaturizerVer1(); @@ -323,83 +321,6 @@ void RegisterDateTimeFeaturizerVer1() { ); } -void RegisterHashOneHotVectorizerFeaturizerVer1() { - static const char* doc = R"DOC( - Hashes the input to a categorical value, then produces a one hot encoded vector - based on that value. - - C++-style pseudo signature: - template HashOneHotVectorizerStruct execute(T const &value); - - Examples: - Assuming the hashing algorithm... - "A" -> 1 - "B" -> 2 - "C" -> 5 - - and 'numCols' set to 8: - - execute("A") -> [1, 0, 0, 0, 0, 0, 0, 0] - execute("B") -> [0, 1, 0, 0, 0, 0, 0, 0] - execute("C") -> [0, 0, 0, 0, 1, 0, 0, 0] - )DOC"; - - MS_FEATURIZERS_OPERATOR_SCHEMA(HashOneHotVectorizerTransformer) - .SinceVersion(1) - .SetDomain(kMSFeaturizersDomain) - .SetDoc(doc) - .Input( - 0, - "State", - "State generated during training that is used for prediction", - "T0") - .Input( - 1, - "Input", - "No information is available", - "InputT") - .Output(0, "NumElements", "No information available", "OutputT0") - .Output(1, "Value", "No information available", "OutputT1") - .Output(2, "Index", "No information available", "OutputT0") - .TypeConstraint( - "T0", - {"tensor(uint8)"}, - "No information is available") - .TypeConstraint( - "InputT", - {"tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(float)", "tensor(double)", "tensor(bool)", "tensor(string)"}, - "No information is available") - .TypeConstraint( - "OutputT0", - {"tensor(uint64)"}, - "No information is available") - .TypeConstraint( - "OutputT1", - {"tensor(uint8)"}, - "No information is available") - .TypeAndShapeInferenceFunction( - [](ONNX_NAMESPACE::InferenceContext& ctx) { - const bool has_shape = hasInputShape(ctx, 1); - - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 0); - if(has_shape) { - propagateShapeFromInputToOutput(ctx, 1, 0); - } - - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 1); - if(has_shape) { - propagateShapeFromInputToOutput(ctx, 1, 1); - } - - propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 2); - if(has_shape) { - 
propagateShapeFromInputToOutput(ctx, 1, 2); - } - - } - ); -} - void RegisterImputationMarkerFeaturizerVer1() { static const char* doc = R"DOC( Returns true if the input is null, false if it is not. diff --git a/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc b/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc deleted file mode 100644 index 747dcf885cb23..0000000000000 --- a/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/common/common.h" -#include "core/framework/data_types.h" -#include "core/framework/data_types_internal.h" -#include "core/framework/op_kernel.h" - -#include "Featurizers/HashOneHotVectorizerFeaturizer.h" -#include "Archive.h" - -namespace onnxruntime { -namespace featurizers { - -template -struct HashOneHotVectorizerTransformerImpl { - void operator()(OpKernelContext* ctx) const { - // Create the transformer - Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer transformer( - [ctx](void) { - const auto* state_tensor(ctx->Input(0)); - const uint8_t* const state_data(state_tensor->Data()); - - Microsoft::Featurizer::Archive archive(state_data, state_tensor->Shape().GetDims()[0]); - return Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer(archive); - }()); - - // Get the input - const auto* input_tensor(ctx->Input(1)); - const InputT* input_data(input_tensor->Data()); - - // Prepare the output - Tensor* NumElements_tensor(ctx->Output(0, input_tensor->Shape())); - Tensor* Value_tensor(ctx->Output(1, input_tensor->Shape())); - Tensor* Index_tensor(ctx->Output(2, input_tensor->Shape())); - - uint64_t* NumElements_data(NumElements_tensor->MutableData()); - uint8_t* Value_data(Value_tensor->MutableData()); - uint64_t* Index_data(Index_tensor->MutableData()); - - // Execute - const int64_t length(input_tensor->Shape().Size()); - - for (int64_t i = 0; i < length; ++i) { - auto result(transformer.execute(input_data[i])); - - NumElements_data[i] = std::move(result.NumElements); - Value_data[i] = std::move(result.Value); - Index_data[i] = std::move(result.Index); - } - } -}; - -class HashOneHotVectorizerTransformer final : public OpKernel { - public: - explicit HashOneHotVectorizerTransformer(const OpKernelInfo& info) : OpKernel(info) { - } - - Status Compute(OpKernelContext* ctx) const override { - utils::MLTypeCallDispatcher - t_disp(ctx->Input(1)->GetElementType()); - t_disp.Invoke(ctx); - return Status::OK(); - } -}; - -ONNX_OPERATOR_KERNEL_EX( - HashOneHotVectorizerTransformer, - kMSFeaturizersDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T0", DataTypeImpl::GetTensorType()) - .TypeConstraint("InputT", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - HashOneHotVectorizerTransformer); - -} // namespace featurizers -} // namespace onnxruntime diff --git a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc index 96326b10de936..bd536156cfa23 100644 --- 
a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc +++ b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc @@ -12,7 +12,6 @@ namespace featurizers { // Forward declarations class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, CatImputerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, DateTimeTransformer); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, HashOneHotVectorizerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, ImputationMarkerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, LabelEncoderTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MaxAbsScalarTransformer); @@ -27,7 +26,6 @@ Status RegisterCpuMSFeaturizersKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From e9b21d413dd0dc26f5dc29512777f1fb6a333108 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 14:18:59 -0800 Subject: [PATCH 08/10] Revert "Temporarily remove hash_one_hot_vectorizer_transformer." This reverts commit aed9caf12d42dbc832997c07b4fbdc92b1d9994a. --- .../graph/featurizers_ops/featurizers_defs.cc | 79 ++++++++++++++++ .../hash_one_hot_vectorizer_transformer.cc | 90 +++++++++++++++++++ .../cpu_featurizers_kernels.cc | 2 + 3 files changed, 171 insertions(+) create mode 100644 onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc diff --git a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc index 6b3d185b8fa11..6497e6cd882bd 100644 --- a/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc +++ b/onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc @@ -34,6 +34,7 @@ using ONNX_NAMESPACE::OPTIONAL; // Forward declarations static void RegisterCatImputerFeaturizerVer1(); static void RegisterDateTimeFeaturizerVer1(); +static void RegisterHashOneHotVectorizerFeaturizerVer1(); static void RegisterImputationMarkerFeaturizerVer1(); static void RegisterLabelEncoderFeaturizerVer1(); static void RegisterMaxAbsScalarFeaturizerVer1(); @@ -50,6 +51,7 @@ static void RegisterTimeSeriesImputerFeaturizerVer1(); void RegisterMSFeaturizersSchemas() { RegisterCatImputerFeaturizerVer1(); RegisterDateTimeFeaturizerVer1(); + RegisterHashOneHotVectorizerFeaturizerVer1(); RegisterImputationMarkerFeaturizerVer1(); RegisterLabelEncoderFeaturizerVer1(); RegisterMaxAbsScalarFeaturizerVer1(); @@ -321,6 +323,83 @@ void RegisterDateTimeFeaturizerVer1() { ); } +void RegisterHashOneHotVectorizerFeaturizerVer1() { + static const char* doc = R"DOC( + Hashes the input to a categorical value, then produces a one hot encoded vector + based on that value. + + C++-style pseudo signature: + template HashOneHotVectorizerStruct execute(T const &value); + + Examples: + Assuming the hashing algorithm... 
+ "A" -> 1 + "B" -> 2 + "C" -> 5 + + and 'numCols' set to 8: + + execute("A") -> [1, 0, 0, 0, 0, 0, 0, 0] + execute("B") -> [0, 1, 0, 0, 0, 0, 0, 0] + execute("C") -> [0, 0, 0, 0, 1, 0, 0, 0] + )DOC"; + + MS_FEATURIZERS_OPERATOR_SCHEMA(HashOneHotVectorizerTransformer) + .SinceVersion(1) + .SetDomain(kMSFeaturizersDomain) + .SetDoc(doc) + .Input( + 0, + "State", + "State generated during training that is used for prediction", + "T0") + .Input( + 1, + "Input", + "No information is available", + "InputT") + .Output(0, "NumElements", "No information available", "OutputT0") + .Output(1, "Value", "No information available", "OutputT1") + .Output(2, "Index", "No information available", "OutputT0") + .TypeConstraint( + "T0", + {"tensor(uint8)"}, + "No information is available") + .TypeConstraint( + "InputT", + {"tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(float)", "tensor(double)", "tensor(bool)", "tensor(string)"}, + "No information is available") + .TypeConstraint( + "OutputT0", + {"tensor(uint64)"}, + "No information is available") + .TypeConstraint( + "OutputT1", + {"tensor(uint8)"}, + "No information is available") + .TypeAndShapeInferenceFunction( + [](ONNX_NAMESPACE::InferenceContext& ctx) { + const bool has_shape = hasInputShape(ctx, 1); + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 0); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 0); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, 1); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 1); + } + + propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT64, 2); + if(has_shape) { + propagateShapeFromInputToOutput(ctx, 1, 2); + } + + } + ); +} + void RegisterImputationMarkerFeaturizerVer1() { static const char* doc = R"DOC( Returns true if the input is null, false if it is not. diff --git a/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc b/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc new file mode 100644 index 0000000000000..747dcf885cb23 --- /dev/null +++ b/onnxruntime/featurizers_ops/cpu/hash_one_hot_vectorizer_transformer.cc @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/common/common.h" +#include "core/framework/data_types.h" +#include "core/framework/data_types_internal.h" +#include "core/framework/op_kernel.h" + +#include "Featurizers/HashOneHotVectorizerFeaturizer.h" +#include "Archive.h" + +namespace onnxruntime { +namespace featurizers { + +template +struct HashOneHotVectorizerTransformerImpl { + void operator()(OpKernelContext* ctx) const { + // Create the transformer + Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer transformer( + [ctx](void) { + const auto* state_tensor(ctx->Input(0)); + const uint8_t* const state_data(state_tensor->Data()); + + Microsoft::Featurizer::Archive archive(state_data, state_tensor->Shape().GetDims()[0]); + return Microsoft::Featurizer::Featurizers::HashOneHotVectorizerTransformer(archive); + }()); + + // Get the input + const auto* input_tensor(ctx->Input(1)); + const InputT* input_data(input_tensor->Data()); + + // Prepare the output + Tensor* NumElements_tensor(ctx->Output(0, input_tensor->Shape())); + Tensor* Value_tensor(ctx->Output(1, input_tensor->Shape())); + Tensor* Index_tensor(ctx->Output(2, input_tensor->Shape())); + + uint64_t* NumElements_data(NumElements_tensor->MutableData()); + uint8_t* Value_data(Value_tensor->MutableData()); + uint64_t* Index_data(Index_tensor->MutableData()); + + // Execute + const int64_t length(input_tensor->Shape().Size()); + + for (int64_t i = 0; i < length; ++i) { + auto result(transformer.execute(input_data[i])); + + NumElements_data[i] = std::move(result.NumElements); + Value_data[i] = std::move(result.Value); + Index_data[i] = std::move(result.Index); + } + } +}; + +class HashOneHotVectorizerTransformer final : public OpKernel { + public: + explicit HashOneHotVectorizerTransformer(const OpKernelInfo& info) : OpKernel(info) { + } + + Status Compute(OpKernelContext* ctx) const override { + utils::MLTypeCallDispatcher + t_disp(ctx->Input(1)->GetElementType()); + t_disp.Invoke(ctx); + return Status::OK(); + } +}; + +ONNX_OPERATOR_KERNEL_EX( + HashOneHotVectorizerTransformer, + kMSFeaturizersDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T0", DataTypeImpl::GetTensorType()) + .TypeConstraint("InputT", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + HashOneHotVectorizerTransformer); + +} // namespace featurizers +} // namespace onnxruntime diff --git a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc index bd536156cfa23..96326b10de936 100644 --- a/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc +++ b/onnxruntime/featurizers_ops/cpu_featurizers_kernels.cc @@ -12,6 +12,7 @@ namespace featurizers { // Forward declarations class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, CatImputerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, DateTimeTransformer); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, HashOneHotVectorizerTransformer); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, ImputationMarkerTransformer); class 
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, LabelEncoderTransformer);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MaxAbsScalarTransformer);
@@ -26,6 +27,7 @@ Status RegisterCpuMSFeaturizersKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, CatImputerTransformer)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, DateTimeTransformer)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, HashOneHotVectorizerTransformer)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, ImputationMarkerTransformer)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, LabelEncoderTransformer)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MaxAbsScalarTransformer)>,

From 761fef65cb1c5ecd053eac06d85a5030c497f817 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Mon, 13 Jan 2020 14:39:04 -0800
Subject: [PATCH 09/10] Advance FeaturizerLibrary commit.

---
 cgmanifest.json                               |  2 +-
 cmake/external/featurizers.cmake              |  2 +-
 .../hash_one_hot_encoder_transformer_test.cc  | 89 +++++++++++++++++++
 3 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc

diff --git a/cgmanifest.json b/cgmanifest.json
index e6efb3d18f5d0..25fbd376028c7 100644
--- a/cgmanifest.json
+++ b/cgmanifest.json
@@ -450,7 +450,7 @@
     {
       "component": {
        "git": {
-          "commitHash": "4948801a488d7c97ba21ca5c07ccc2977c3838af",
+          "commitHash": "ebec32ef06859b6399bf8854f18b91158c87760b",
          "repositoryUrl": "https://github.com/microsoft/FeaturizersLibrary.git"
        },
        "type": "git"
diff --git a/cmake/external/featurizers.cmake b/cmake/external/featurizers.cmake
index 90e5fe96201f7..5f1c49b73b2f7 100644
--- a/cmake/external/featurizers.cmake
+++ b/cmake/external/featurizers.cmake
@@ -3,7 +3,7 @@
 # This source code should not depend on the onnxruntime and may be built independently

 set(featurizers_URL "https://github.com/microsoft/FeaturizersLibrary.git")
-set(featurizers_TAG "4948801a488d7c97ba21ca5c07ccc2977c3838af")
+set(featurizers_TAG "ebec32ef06859b6399bf8854f18b91158c87760b")
 set(featurizers_pref FeaturizersLibrary)
 set(featurizers_ROOT ${PROJECT_SOURCE_DIR}/external/${featurizers_pref})

diff --git a/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc b/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc
new file mode 100644
index 0000000000000..915173110b69c
--- /dev/null
+++ b/onnxruntime/test/featurizers_ops/hash_one_hot_encoder_transformer_test.cc
@@ -0,0 +1,89 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+ +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +#include "Featurizers/HashOneHotVectorizerFeaturizer.h" +#include "Featurizers/TestHelpers.h" +#include "Archive.h" + +namespace NS = Microsoft::Featurizer; + +namespace onnxruntime { +namespace test { + +template +std::vector GetStream() { + NS::Featurizers::HashOneHotVectorizerTransformer hvtransformer(2, 100); + NS::Archive ar; + hvtransformer.save(ar); + return ar.commit(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int8) { + using Type = int8_t; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {29u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_int32) { + using Type = int32_t; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {22u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_double) { + using Type = double; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {15.0}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {99u}); + + test.Run(); +} + +TEST(FeaturizersTests, HashOneHotVectorizerTransformer_string) { + using Type = std::string; + auto stream = GetStream(); + auto dim = static_cast(stream.size()); + + OpTester test("HashOneHotVectorizerTransformer", 1, onnxruntime::kMSFeaturizersDomain); + test.AddInput("State", {dim}, stream); + test.AddInput("Input", {1}, {"hello"}); + + test.AddOutput("NumElements", {1}, {100u}); + test.AddOutput("Value", {1}, {1u}); + test.AddOutput("Index", {1}, {25u}); + + test.Run(); +} + +} // namespace test +} // namespace onnxruntime \ No newline at end of file From 08e08633bb901044a789b623671861cdbe85f540 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 13 Jan 2020 14:55:57 -0800 Subject: [PATCH 10/10] Add typename. --- onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc b/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc index d5dbf0ba37c48..f60d0b2d60ce2 100644 --- a/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc +++ b/onnxruntime/test/featurizers_ops/one_hot_encoder_test.cc @@ -19,7 +19,7 @@ std::vector GetStream(const std::vector>& traini Estimator estimator(NS::CreateTestAnnotationMapsPtr(1), 0, allowMissingValues); NS::TestHelpers::Train(estimator, trainingBatches); - Estimator::TransformerUniquePtr pTransformer(estimator.create_transformer()); + typename Estimator::TransformerUniquePtr pTransformer(estimator.create_transformer()); NS::Archive ar; pTransformer->save(ar);