Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add timeseries imputer transformer featurizer kernel #2813

Merged
merged 23 commits into from
Jan 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
98936ab
Make kernels non-template. Add input constraint for learnt data.
yuslepukhin Jan 2, 2020
8d93344
Add two more featurizers along with tests. Tests fail.
yuslepukhin Jan 3, 2020
8be3fcb
Fix tests serialized stream by prepending version bytes.
yuslepukhin Jan 3, 2020
97eb658
Add inputation_marker_transfomer and the test.
yuslepukhin Jan 3, 2020
3fb8952
Added label_encoder_transformer along with a test.
yuslepukhin Jan 4, 2020
5c477bd
Fix labelencodertransfomer_test.cc string_throw case
yuslepukhin Jan 4, 2020
085918f
Add MissingDummiesTransformer along with the test.
yuslepukhin Jan 4, 2020
13255cd
Remove commented code.
yuslepukhin Jan 6, 2020
e692677
Merge branch 'master' into yuslepukhin/import_featurizers
yuslepukhin Jan 6, 2020
f4df726
Update manifest.
yuslepukhin Jan 6, 2020
4e5dd0a
Fix oversights.
yuslepukhin Jan 6, 2020
b6db1b6
Advance Featurizers commit. Add TimeSeriesImputerTransformer definition.
yuslepukhin Jan 7, 2020
204fe89
Start implementation.
yuslepukhin Jan 8, 2020
4806f91
Implement for 3 supported types but all the columns of the same type.
yuslepukhin Jan 9, 2020
b2ac92a
Merge branch 'master' into yuslepukhin/timeseries_imputer
yuslepukhin Jan 9, 2020
95a2ee8
Fix up shape inference, remove batching remains
yuslepukhin Jan 9, 2020
0dab5d7
Add the first test. Fix shape inference error:
yuslepukhin Jan 9, 2020
b5d2e62
Fix shape inference, add kernel definitions, fix implementation bugs,
yuslepukhin Jan 9, 2020
67159dd
Update def, fix time conversions.
yuslepukhin Jan 9, 2020
d25907f
Convert some more tests.
yuslepukhin Jan 10, 2020
fa4d361
Remove type dispatcher as we currently support only one type.
yuslepukhin Jan 10, 2020
781fcb9
Merge branch 'master' into yuslepukhin/timeseries_imputer
yuslepukhin Jan 10, 2020
894187f
Fix unused typedefs errors.
yuslepukhin Jan 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cgmanifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@
{
"component": {
"git": {
"commitHash": "a11f5002af58a03d5902b13ef65c84cedb499024",
"commitHash": "573070aeeb77e267da2579ac1d75d92c688bbe97",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, please take a look at #2760

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will submit separately

"repositoryUrl": "https://github.com/microsoft/FeaturizersLibrary.git"
},
"type": "git"
Expand Down
3 changes: 2 additions & 1 deletion cmake/external/featurizers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This source code should not depend on the onnxruntime and may be built independently

set(featurizers_URL "https://github.com/microsoft/FeaturizersLibrary.git")
set(featurizers_TAG "a11f5002af58a03d5902b13ef65c84cedb499024")
set(featurizers_TAG "573070aeeb77e267da2579ac1d75d92c688bbe97")

set(featurizers_pref FeaturizersLibrary)
set(featurizers_ROOT ${PROJECT_SOURCE_DIR}/external/${featurizers_pref})
Expand All @@ -24,6 +24,7 @@ if (WIN32)
BINARY_DIR ${featurizers_BINARY_DIR}
CMAKE_ARGS -Dfeaturizers_MSVC_STATIC_RUNTIME=${onnxruntime_MSVC_STATIC_RUNTIME}
INSTALL_COMMAND ""

)
else()
ExternalProject_Add(featurizers_lib
Expand Down
182 changes: 177 additions & 5 deletions onnxruntime/core/graph/featurizers_ops/featurizers_defs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ static void RegisterMinMaxScalarFeaturizerVer1();
static void RegisterMissingDummiesFeaturizerVer1();
static void RegisterRobustScalarFeaturizerVer1();
static void RegisterStringFeaturizerVer1();
static void RegisterTimeSeriesImputerFeaturizerVer1();

// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
Expand All @@ -55,6 +56,7 @@ void RegisterMSFeaturizersSchemas() {
RegisterMissingDummiesFeaturizerVer1();
RegisterRobustScalarFeaturizerVer1();
RegisterStringFeaturizerVer1();
RegisterTimeSeriesImputerFeaturizerVer1();
}

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -212,7 +214,7 @@ void RegisterDateTimeFeaturizerVer1() {
case 0:
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, output);
break;
case 1: // fall through
case 1: // fall through
case 2:
case 3:
case 4:
Expand All @@ -223,11 +225,11 @@ void RegisterDateTimeFeaturizerVer1() {
case 9:
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output);
break;
case 10: // fall through
case 10: // fall through
case 11:
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT16, output);
break;
case 12: // fall through
case 12: // fall through
case 13:
case 14:
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output);
Expand Down Expand Up @@ -595,7 +597,6 @@ void RegisterRobustScalarFeaturizerVer1() {
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT32 ||
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT64 ||
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) {
ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE);
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, 0);
} else {
fail_type_inference("input 1 is expected to have a accepted type");
Expand Down Expand Up @@ -648,7 +649,178 @@ void RegisterStringFeaturizerVer1() {
.TypeAndShapeInferenceFunction(
[](ONNX_NAMESPACE::InferenceContext& ctx) {
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 0);
propagateShapeFromInputToOutput(ctx, 1, 0);
if (hasInputShape(ctx, 1)) {
propagateShapeFromInputToOutput(ctx, 1, 0);
}
});
}

// Registers version 1 of the TimeSeriesImputerTransformer operator schema in
// the MS featurizers domain.
//
// Inputs:
//   0 "State" (T0, uint8)  - serialized state produced during training.
//   1 "Times" (T1, int64)  - timestamps in seconds since epoch, shape [R].
//   2 "Keys"  (T2, string) - composite keys, shape [R][K].
//   3 "Data"  (T2, string) - data columns, shape [R][C].
// Outputs:
//   0 "Added"        (T3, bool)   - shape [IR], true for rows that were added.
//   1 "ImputedTimes" (T1, int64)  - shape [IR].
//   2 "ImputedKeys"  (T2, string) - shape [IR][K].
//   3 "ImputedData"  (T2, string) - shape [IR][C].
//
// The number of output rows (IR) is not known at inference time, so the
// leading dimension of every output is left unset; the column dimension of
// outputs 2 and 3 is copied from the corresponding input when available.
void RegisterTimeSeriesImputerFeaturizerVer1() {
  static const char* doc = R"DOC(
Imputes rows and column values such that the generated output does not contain any
time gaps per grain (based on the time gaps encountered during training) and that
all missing column values are populated according to a strategy (forward fill,
backward fill, mode, etc.).

This Featurizer is unique in that it will produce 0:N rows per invocation, depending upon the
input data.

C++-style pseudo signature:
template <typename... GrainColValueTs, typename... DataColValueTs>
std::vector<
std::tuple<
bool, // true if the row was added
std::chrono::system_clock::time_point,
std::tuple<GrainColValueTs...>,
std::tuple<DataColValueTs...>
>
> execute(
std::chrono::system_clock::time_point const &value,
std::tuple<GrainColValueTs...> const &grain,
std::tuple<DataColValueTs...> const &colData
);

Examples:
During training, the time period was found to be 1 day...

Input:
+------+-------+------------------+-------------------+
| time | grain | forward fill col | backward fill col |
+======+=======+==================+===================+
| 1 | A | 10 | None |
+------+-------+------------------+-------------------+
| 2 | A | None | 200 |
+------+-------+------------------+-------------------+
| 1 | B | -10 | -100 |
+------+-------+------------------+-------------------+
| 4 | A | 40 | 400 |
+------+-------+------------------+-------------------+
| 6 | A | 60 | 600 |
+------+-------+------------------+-------------------+
| 3 | B | -30 | -300 |
+------+-------+------------------+-------------------+

Output:
+-------+------+-------+------------------+-------------------+
| Added | time | grain | forward fill col | backward fill col |
+=======+======+=======+==================+===================+
| false | 1 | A | 10 | 200 (from 2) |
+-------+------+-------+------------------+-------------------+
| false | 2 | A | 10 (from 1) | 200 |
+-------+------+-------+------------------+-------------------+
| true | 3 | A | 10 (from 2) | 400 (from 4) |
+-------+------+-------+------------------+-------------------+
| false | 4 | A | 40 | 400 |
+-------+------+-------+------------------+-------------------+
| true | 5 | A | 40 (from 4) | 600 (from 6) |
+-------+------+-------+------------------+-------------------+
| false | 6 | A | 60 | 600 |
+-------+------+-------+------------------+-------------------+
| false | 1 | B | -10 | -100 |
+-------+------+-------+------------------+-------------------+
| true | 2 | B | -10 (from 1) | -300 (from 3) |
+-------+------+-------+------------------+-------------------+
| false | 3 | B | -30 | -300 |
+-------+------+-------+------------------+-------------------+
)DOC";

  MS_FEATURIZERS_OPERATOR_SCHEMA(TimeSeriesImputerTransformer)
      .SinceVersion(1)
      .SetDomain(kMSFeaturizersDomain)
      .SetDoc(doc)
      .Input(
          0,
          "State",
          "State generated during training that is used for prediction",
          "T0")
      .Input(
          1,
          "Times",
          "Tensor of timestamps in seconds since epoch [R] where R is a number of rows.",
          "T1")
      .Input(
          2,
          "Keys",
          "Composite keys tensor of shape [R][K]. R is the same as Input(1)",
          "T2")
      .Input(
          3,
          "Data",
          "It is a data tensor of shape [R][C] where R - rows and C - columns. R must be the same with Input(1)",
          "T2")
      .Output(
          0,
          "Added",
          "Tensor of boolean with a shape of [IR]. Contains a boolean for each row in the result where true represents added row.",
          "T3")
      .Output(
          1,
          "ImputedTimes",
          "This is a tensor of timestamps in seconds since epoch of shape [IR], where IR is the number of output rows.",
          "T1")
      .Output(
          2,
          "ImputedKeys",
          "Contains keys along with the imputed keys. Tensor of shape [IR][K].",
          "T2")
      .Output(
          3,
          "ImputedData",
          // Adjacent literals are concatenated verbatim, so the first one
          // must end with a space to keep the two sentences separated.
          "Tensor of shape [IR][C] where IR is the number of rows in the output. "
          "C is the number of columns.",
          "T2")
      .TypeConstraint(
          "T0",
          {"tensor(uint8)"},
          "No information is available")
      .TypeConstraint(
          "T1",
          {"tensor(int64)"},
          "Represents number of seconds since epoch")
      .TypeConstraint(
          "T2",
          {"tensor(string)"},
          "Output data")
      .TypeConstraint(
          "T3",
          {"tensor(bool)"},
          "Boolean Tensor")
      .TypeAndShapeInferenceFunction(
          [](ONNX_NAMESPACE::InferenceContext& ctx) {
            // Outputs 0 (Added) and 1 (ImputedTimes) have fixed element types.
            propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_BOOL, 0);
            propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT64, 1);
            // Number of output rows is not known, so both outputs get a
            // rank-1 shape whose single dimension carries no value.
            ONNX_NAMESPACE::TensorShapeProto shape_0_1;
            shape_0_1.add_dim();
            ONNX_NAMESPACE::updateOutputShape(ctx, 0, shape_0_1);
            ONNX_NAMESPACE::updateOutputShape(ctx, 1, shape_0_1);

            // Keys: element type follows input 2.
            propagateElemTypeFromInputToOutput(ctx, 2, 2);
            // Keys shape: [unknown rows][K], K copied from input 2.
            if (hasInputShape(ctx, 2)) {
              const auto& input2_shape = getInputShape(ctx, 2);
              if (input2_shape.dim_size() != 2) {
                fail_shape_inference("Expecting keys to have 2 dimensions");
              }
              ONNX_NAMESPACE::TensorShapeProto shape;
              shape.add_dim();
              *shape.add_dim() = input2_shape.dim(1);
              ONNX_NAMESPACE::updateOutputShape(ctx, 2, shape);
            }

            // Data: element type follows input 3.
            propagateElemTypeFromInputToOutput(ctx, 3, 3);
            // Data shape: [unknown rows][C], C copied from input 3.
            if (hasInputShape(ctx, 3)) {
              const auto& input3_shape = getInputShape(ctx, 3);
              if (input3_shape.dim_size() != 2) {
                fail_shape_inference("Expecting data to have 2 dimensions");
              }
              ONNX_NAMESPACE::TensorShapeProto shape;
              shape.add_dim();
              *shape.add_dim() = input3_shape.dim(1);
              ONNX_NAMESPACE::updateOutputShape(ctx, 3, shape);
            }
          });
}

Expand Down
Loading