-
Notifications
You must be signed in to change notification settings - Fork 3.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[feature-wip](array) remove array config and check array nested depth #13428
Changes from 8 commits
aae4f6f
d85fed8
b8ecfb1
2a3bce3
7c1edce
35b1fa7
fa28df7
1463a7b
4c6cd60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
#include "vec/data_types/data_type_array.h" | ||
|
||
#include "gen_cpp/data.pb.h" | ||
#include "util/stack_util.h" | ||
#include "vec/columns/column_array.h" | ||
#include "vec/columns/column_nullable.h" | ||
#include "vec/data_types/data_type_nullable.h" | ||
|
@@ -180,10 +181,15 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const { | |
auto& offsets = array_column->get_offsets(); | ||
|
||
IColumn& nested_column = array_column->get_data(); | ||
DCHECK(nested_column.is_nullable()); | ||
if (*rb.position() != '[') { | ||
return Status::InvalidArgument("Array does not start with '[' character, found '{}'", | ||
*rb.position()); | ||
} | ||
if (*(rb.end() - 1) != ']') { | ||
return Status::InvalidArgument("Array does not end with ']' character, found '{}'", | ||
*(rb.end() - 1)); | ||
} | ||
++rb.position(); | ||
bool first = true; | ||
size_t size = 0; | ||
|
@@ -210,13 +216,9 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const { | |
|
||
// dispose the case of [123,,,] | ||
if (nested_str_len == 0) { | ||
if (nested_column.is_nullable()) { | ||
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column); | ||
nested_null_col.get_nested_column().insert_default(); | ||
nested_null_col.get_null_map_data().push_back(0); | ||
} else { | ||
nested_column.insert_default(); | ||
} | ||
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column); | ||
nested_null_col.get_nested_column().insert_default(); | ||
nested_null_col.get_null_map_data().push_back(0); | ||
Comment on lines
+220
to
+222
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should check whether the nested column is nullable here. In future, we may enable the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. data_type_array.cpp:185 has There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently Doris only support "nullable" element inside array. |
||
++size; | ||
continue; | ||
} | ||
|
@@ -236,19 +238,31 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const { | |
} | ||
|
||
// dispose the case of ["123"] or ['123'] | ||
bool has_quota = false; | ||
size_t tmp_len = nested_str_len; | ||
ReadBuffer read_buffer(rb.position(), nested_str_len); | ||
auto begin_char = *(rb.position() + begin_pos); | ||
auto end_char = *(rb.position() + end_pos); | ||
if (begin_char == end_char && (begin_char == '"' || begin_char == '\'')) { | ||
int64_t length = end_pos - begin_pos - 1; | ||
read_buffer = ReadBuffer(rb.position() + begin_pos + 1, (length > 0 ? length : 0)); | ||
tmp_len = (length > 0 ? length : 0); | ||
has_quota = true; | ||
} | ||
|
||
auto st = nested->from_string(read_buffer, &nested_column); | ||
if (!st.ok()) { | ||
// we should do revert if error | ||
array_column->pop_back(size); | ||
return st; | ||
// handle null, need to distinguish null and "null" | ||
if (!has_quota && tmp_len == 4 && strncmp(read_buffer.position(), "null", 4) == 0) { | ||
// insert null | ||
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column); | ||
nested_null_col.get_nested_column().insert_default(); | ||
nested_null_col.get_null_map_data().push_back(1); | ||
} else { | ||
auto st = nested->from_string(read_buffer, &nested_column); | ||
if (!st.ok()) { | ||
// we should do revert if error | ||
array_column->pop_back(size); | ||
return st; | ||
} | ||
} | ||
rb.position() += nested_str_len; | ||
DCHECK_LE(rb.position(), rb.end()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,9 +91,10 @@ Status VBrokerScanner::_fill_dest_columns(const Slice& line, | |
return Status::OK(); | ||
} | ||
|
||
if (!check_array_format(_split_values)) { | ||
return Status::OK(); | ||
} | ||
// This check is meaningless, should be removed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we remove this, may be we should also covert the invalid array value in this row to null. Keep the same logic as cast function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is done in |
||
// if (!check_array_format(_split_values)) { | ||
// return Status::OK(); | ||
// } | ||
|
||
int idx = 0; | ||
for (int i = 0; i < _split_values.size(); ++i) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -320,10 +320,12 @@ struct ConvertImplGenericFromString { | |
check_and_get_column<StringColumnType>(&col_from)) { | ||
auto col_to = data_type_to->create_column(); | ||
|
||
//IColumn & col_to = *res; | ||
size_t size = col_from.size(); | ||
col_to->reserve(size); | ||
|
||
ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size); | ||
ColumnUInt8::Container* vec_null_map_to = &col_null_map_to->get_data(); | ||
|
||
for (size_t i = 0; i < size; ++i) { | ||
const auto& val = col_from_string->get_data_at(i); | ||
// Note: here we should handle the null element | ||
|
@@ -332,9 +334,16 @@ struct ConvertImplGenericFromString { | |
continue; | ||
} | ||
ReadBuffer read_buffer((char*)(val.data), val.size); | ||
RETURN_IF_ERROR(data_type_to->from_string(read_buffer, col_to)); | ||
Status st = data_type_to->from_string(read_buffer, col_to); | ||
// if parsing failed, will return null | ||
(*vec_null_map_to)[i] = !st.ok(); | ||
if (!st.ok()) { | ||
col_to->insert_default(); | ||
} | ||
} | ||
block.replace_by_position(result, std::move(col_to)); | ||
// block.replace_by_position(result, std::move(col_to)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
block.get_by_position(result).column = | ||
ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); | ||
} else { | ||
return Status::RuntimeError( | ||
"Illegal column {} of first argument of conversion function from string", | ||
|
@@ -870,8 +879,9 @@ struct ConvertThroughParsing { | |
if constexpr (IsDataTypeDecimal<ToDataType>) { | ||
UInt32 scale = additions; | ||
col_to = ColVecTo::create(size, scale); | ||
} else | ||
} else { | ||
col_to = ColVecTo::create(size); | ||
} | ||
|
||
typename ColVecTo::Container& vec_to = col_to->get_data(); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,12 +46,8 @@ | |
* as abstract methods that subclasses must implement. | ||
*/ | ||
public abstract class Type { | ||
// Maximum nesting depth of a type. This limit was determined experimentally by | ||
// org.apache.doris.rewrite.FoldConstantsRule.apply generating and scanning | ||
// deeply nested Parquet and Avro files. In those experiments, we exceeded | ||
// the stack space in the scanner (which uses recursion for dealing with | ||
// nested types) at a nesting depth between 200 and 300 (200 worked, 300 crashed). | ||
public static int MAX_NESTING_DEPTH = 2; | ||
// Currently only support Array type with max 9 depths. | ||
public static int MAX_NESTING_DEPTH = 9; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is 9? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is a hard limit on BE side. See
|
||
|
||
// Static constant types for scalar types that don't require additional information. | ||
public static final ScalarType INVALID = new ScalarType(PrimitiveType.INVALID_TYPE); | ||
|
@@ -488,7 +484,7 @@ public static boolean canCastTo(Type t1, Type t2) { | |
} else if (t1.isArrayType() && t2.isArrayType()) { | ||
return ArrayType.canCastTo((ArrayType) t1, (ArrayType) t2); | ||
} | ||
return t1.isNull() || t1.getPrimitiveType() == PrimitiveType.VARCHAR; | ||
return t1.isNull() || t1.getPrimitiveType().isCharFamily(); | ||
} | ||
|
||
/** | ||
|
@@ -612,7 +608,7 @@ public boolean exceedsMaxNestingDepth() { | |
* MAP<STRING,STRUCT<f1:INT>> --> 3 | ||
*/ | ||
private boolean exceedsMaxNestingDepth(int d) { | ||
if (d >= MAX_NESTING_DEPTH) { | ||
if (d > MAX_NESTING_DEPTH) { | ||
return true; | ||
} | ||
if (isStructType()) { | ||
|
@@ -623,7 +619,9 @@ private boolean exceedsMaxNestingDepth(int d) { | |
} | ||
} | ||
} else if (isArrayType()) { | ||
return false; | ||
ArrayType arrayType = (ArrayType) this; | ||
Type itemType = arrayType.getItemType(); | ||
return itemType.exceedsMaxNestingDepth(d + 1); | ||
} else if (isMultiRowType()) { | ||
MultiRowType multiRowType = (MultiRowType) this; | ||
return multiRowType.getItemType().exceedsMaxNestingDepth(d + 1); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will overflow if rb is empty
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is checked in
ConvertImplGenericFromString
infunction_cast.h
:I will add a DCHECK here