From df1df3c05b85300e0c958332741c78b7e7921ca2 Mon Sep 17 00:00:00 2001 From: Jeroen van Straten Date: Tue, 21 Jun 2022 19:52:41 +0200 Subject: [PATCH] feat: introduce compound (parameterizable) extension types and variations --- proto/substrait/algebra.proto | 8 ++++ proto/substrait/type.proto | 44 +++++++++++++++++ site/docs/types/type_classes.md | 77 ++++++++++++++++++++++++++++++ site/docs/types/type_variations.md | 1 + text/simple_extensions_schema.yaml | 73 +++++++++++++++++++++++++++- 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index fb07ba02f..881a9b894 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -422,6 +422,10 @@ message Expression { // directly declare the type variation). uint32 type_variation_reference = 51; + // The parameters to be bound to the type variation, if a type variation is + // specified and it is defined to be parameterizable. + repeated Type.Parameter type_variation_parameters = 52; + message VarChar { string value = 1; uint32 length = 2; @@ -472,6 +476,10 @@ message Expression { // points to a type_anchor defined in this plan uint32 type_reference = 1; + // The parameters to be bound to the type class, if the type class is + // parameterizable. 
+ repeated Type.Parameter type_parameters = 3; + // the value of the literal, serialized using some type-specific // protobuf message google.protobuf.Any value = 2; diff --git a/proto/substrait/type.proto b/proto/substrait/type.proto index c417b3a10..32e363e81 100644 --- a/proto/substrait/type.proto +++ b/proto/substrait/type.proto @@ -3,6 +3,8 @@ syntax = "proto3"; package substrait; +import "google/protobuf/empty.proto"; + option csharp_namespace = "Substrait.Protobuf"; option go_package = "github.com/substrait-io/substrait-go/proto"; option java_multiple_files = true; @@ -53,81 +55,97 @@ message Type { message Boolean { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message I8 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message I16 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message I32 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message I64 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message FP32 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message FP64 { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message String { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message Binary { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message Timestamp { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message Date { uint32 
type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message Time { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message TimestampTZ { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message IntervalYear { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message IntervalDay { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } message UUID { uint32 type_variation_reference = 1; + repeated Parameter type_variation_parameters = 3; Nullability nullability = 2; } @@ -135,18 +153,21 @@ message Type { message FixedChar { int32 length = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; } message VarChar { int32 length = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; } message FixedBinary { int32 length = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; } @@ -154,18 +175,21 @@ message Type { int32 scale = 1; int32 precision = 2; uint32 type_variation_reference = 3; + repeated Parameter type_variation_parameters = 5; Nullability nullability = 4; } message Struct { repeated Type types = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; } message List { Type type = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; } @@ -173,13 +197,33 @@ message Type { Type key = 1; Type value = 2; uint32 type_variation_reference = 3; + repeated Parameter type_variation_parameters = 5; Nullability 
nullability = 4; } message UserDefined { uint32 type_reference = 1; uint32 type_variation_reference = 2; + repeated Parameter type_variation_parameters = 4; Nullability nullability = 3; + repeated Parameter type_parameters = 5; + } + + message Parameter { + oneof parameter { + // Explicitly null/unspecified parameter, to select the default value (if + // any). + google.protobuf.Empty null = 1; + + // Data type parameters, like the i32 in LIST<i32>. + Type data_type = 2; + + // Value parameters, like the 10 in VARCHAR<10>. + bool boolean = 3; + int64 integer = 4; + string enum = 5; + string string = 6; + } + } } } diff --git a/site/docs/types/type_classes.md b/site/docs/types/type_classes.md index 1abc0c8f6..b022d7bc3 100644 --- a/site/docs/types/type_classes.md +++ b/site/docs/types/type_classes.md @@ -58,3 +58,80 @@ A YAML example of an extension type is below: This declares a new type (namespaced to the associated YAML file) called "point". This type is composed of two `i32` values named longitude and latitude. Once a type has been declared, it can be used in function declarations. [TBD: should field references be allowed to dereference the components of a user defined type?] Literals for user-defined types are represented using protobuf [Any](https://developers.google.com/protocol-buffers/docs/proto3#any) messages. + +### Parameterization + +User-defined types may be parameterized, in the same way in which the built-in compound types are parameterizable. The supported "meta-types" for parameters are data types, booleans, integers, enumerations, and strings. Using parameters, we could redefine "point" with different types of coordinates. For example: + +```yaml +name: point +parameters: + - name: T + description: | + The type used for the longitude and latitude + components of the point. 
+ type: type +``` + +or: + +```yaml +name: point +parameters: + - name: coordinate_type + type: enumeration + options: + - integer + - double +``` + +or: + +```yaml +name: point +parameters: + - name: LONG + type: type + - name: LAT + type: type +``` + +We can't specify the internal structure in this case, because there is currently no support for derived types in the structure. + +The allowed range can be limited for integer parameters. For example: + +```yaml +name: vector +parameters: + - name: T + type: type + - name: dimensions + type: integer + min: 2 + max: 3 +``` + +This specifies a vector that can be either 2- or 3-dimensional. + +Similar to function arguments, the last parameter may be specified to be variadic, allowing it to be specified one or more times instead of only once. For example: + +```yaml +name: union +parameters: + - name: T + type: type +variadic: true +``` + +This defines a type that can be parameterized with one or more other data types, for example `union<i32, i64>` but also `union<bool>`. Zero or more is also possible, by making the last parameter optional: + +```yaml +name: tuple +parameters: + - name: T + type: type + optional: true +variadic: true +``` + +This would also allow for `tuple<>`, to define a zero-tuple. diff --git a/site/docs/types/type_variations.md b/site/docs/types/type_variations.md index 8a2ae4aad..44a758b41 100644 --- a/site/docs/types/type_variations.md +++ b/site/docs/types/type_variations.md @@ -10,3 +10,4 @@ All variations except the "system-preferred" variation (a.k.a. `[0]`, see [Type | Name | The name used to reference this type. Should be unique within type variations for this parent type within a simple extension. | | Description | A human description of the purpose of this type variation. | | Function Behavior | **INHERITS** or **SEPARATE**: whether functions that support the system-preferred variation implicitly also support this variation, or whether functions should be resolved independently. 
For example, if one has the function `add(i8,i8)` defined and then defines an `i8` variation, this determines whether the `i8` variation can be bound to the base `add` operation (inherits) or whether a specialized version of `add` needs to be defined specifically for this variation (separate). Defaults to inherits. | +| Parameterization | Type variations can be parameterized. For example, an implementation may support storing `timestamp_tz` using any timezone, in which case it might not be convenient to create a variation for every possible timezone. Parameterizations for type variations work the same as parameterizations for [compound user-defined types](type_classes.md#parameterization). | diff --git a/text/simple_extensions_schema.yaml b/text/simple_extensions_schema.yaml index 52ffbc782..aa945f296 100644 --- a/text/simple_extensions_schema.yaml +++ b/text/simple_extensions_schema.yaml @@ -16,6 +16,10 @@ properties: type: object additionalProperties: $ref: "#/$defs/type" + parameters: # parameter list for compound types + $ref: "#/$defs/type_param_defs" + variadic: # when set, last parameter may be specified one or more times + type: boolean type_variations: type: array minItems: 1 @@ -25,7 +29,7 @@ properties: required: [parent, name] properties: parent: - type: string + $ref: "#/$defs/type" name: type: string description: @@ -33,6 +37,10 @@ properties: functions: type: string enum: [INHERITS, SEPARATE] + parameters: # parameter list for compound type variations + $ref: "#/$defs/type_param_defs" + variadic: # when set, last parameter may be specified one or more times + type: boolean scalar_functions: type: array items: @@ -45,8 +53,69 @@ properties: $defs: type: oneOf: - - type: string + - type: string # shorthand form for when only name is needed + - type: object + properties: + name: # name: a Substrait type name, or name of a type previously defined in this extension + type: string + nullable: # set to true to make the type nullable + type: boolean + 
variation: # type variation, if any + $ref: "#/$defs/variation" + parameters: # parameters for compound types + $ref: "#/$defs/type_param_values" + variation: + oneOf: + - type: string # shorthand form for when only name is needed - type: object + properties: + name: # name of a type variation previously defined in this extension + type: string + parameters: # parameters for compound type variations + $ref: "#/$defs/type_param_values" + type_param_defs: # an array of compound type (variation) parameter definitions + type: array + items: + type: object + required: [type] + properties: + name: # name of the parameter (for documentation only) + type: string + description: # description (for documentation only) + type: string + type: # expected metatype for the parameter + type: string + enum: + - type + - boolean + - integer + - enumeration + - string + min: # for integers, the minimum supported value (inclusive) + type: number + max: # for integers, the maximum supported value (inclusive) + type: number + options: # for enums, the list of supported values + type: array + minItems: 1 + uniqueItems: true + items: + type: string + optional: # when set to true, the parameter may be omitted at the end or skipped using null + type: boolean + type_param_values: # an array of compound type (variation) parameter values + type: array + items: + oneOf: + - type: "null" # use to skip optional parameters + - type: boolean # for boolean parameters + - type: number # for integer parameters + - type: string # for string and enum parameters + - type: object # for data type parameters + required: [ type ] + properties: + type: + $ref: "#/$defs/type" arguments: # an array of arguments type: array items: