Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature-wip](array) remove array config and check array nested depth #13428

Merged
merged 9 commits into from
Oct 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion be/src/runtime/decimalv2_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,6 @@ int DecimalV2Value::parse_from_str(const char* decimal_str, int32_t length) {

_value = StringParser::string_to_decimal<__int128>(decimal_str, length, PRECISION, SCALE,
&result);

if (result == StringParser::PARSE_FAILURE) {
error = E_DEC_BAD_NUM;
}
Expand Down
39 changes: 27 additions & 12 deletions be/src/vec/data_types/data_type_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "vec/data_types/data_type_array.h"

#include "gen_cpp/data.pb.h"
#include "util/stack_util.h"
#include "vec/columns/column_array.h"
#include "vec/columns/column_nullable.h"
#include "vec/data_types/data_type_nullable.h"
Expand Down Expand Up @@ -175,15 +176,21 @@ std::string DataTypeArray::to_string(const IColumn& column, size_t row_num) cons
}

Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const {
DCHECK(!rb.eof());
// only support one level now
auto* array_column = assert_cast<ColumnArray*>(column);
auto& offsets = array_column->get_offsets();

IColumn& nested_column = array_column->get_data();
DCHECK(nested_column.is_nullable());
if (*rb.position() != '[') {
return Status::InvalidArgument("Array does not start with '[' character, found '{}'",
*rb.position());
}
if (*(rb.end() - 1) != ']') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will overflow if rb is empty

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is checked in ConvertImplGenericFromString in function_cast.h:

 332                 if (val.size == 0) {
 333                     col_to->insert_default();
 334                     continue;
 335                 }
 336                 ReadBuffer read_buffer((char*)(val.data), val.size);
 337                 Status st = data_type_to->from_string(read_buffer, col_to);
 338                 // if parsing failed, will return null
 339                 (*vec_null_map_to)[i] = !st.ok();
 340                 if (!st.ok()) {
 341                     col_to->insert_default();
 342                 }

I will add a DCHECK here

return Status::InvalidArgument("Array does not end with ']' character, found '{}'",
*(rb.end() - 1));
}
++rb.position();
bool first = true;
size_t size = 0;
Expand All @@ -210,13 +217,9 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const {

// dispose the case of [123,,,]
if (nested_str_len == 0) {
if (nested_column.is_nullable()) {
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
nested_null_col.get_nested_column().insert_default();
nested_null_col.get_null_map_data().push_back(0);
} else {
nested_column.insert_default();
}
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
nested_null_col.get_nested_column().insert_default();
nested_null_col.get_null_map_data().push_back(0);
Comment on lines +220 to +222
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should check whether the nested column is nullable here. In future, we may enable the not_null syntax to create an array with the elements which are all not nullable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. data_type_array.cpp:185 has DCHECK.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently Doris only support "nullable" element inside array.
I will add a DCHECK here.
After we support "not_null" element, I will rethink this logic

++size;
continue;
}
Expand All @@ -236,19 +239,31 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const {
}

// dispose the case of ["123"] or ['123']
bool has_quota = false;
size_t tmp_len = nested_str_len;
ReadBuffer read_buffer(rb.position(), nested_str_len);
auto begin_char = *(rb.position() + begin_pos);
auto end_char = *(rb.position() + end_pos);
if (begin_char == end_char && (begin_char == '"' || begin_char == '\'')) {
int64_t length = end_pos - begin_pos - 1;
read_buffer = ReadBuffer(rb.position() + begin_pos + 1, (length > 0 ? length : 0));
tmp_len = (length > 0 ? length : 0);
has_quota = true;
}

auto st = nested->from_string(read_buffer, &nested_column);
if (!st.ok()) {
// we should do revert if error
array_column->pop_back(size);
return st;
// handle null, need to distinguish null and "null"
if (!has_quota && tmp_len == 4 && strncmp(read_buffer.position(), "null", 4) == 0) {
// insert null
auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
nested_null_col.get_nested_column().insert_default();
nested_null_col.get_null_map_data().push_back(1);
} else {
auto st = nested->from_string(read_buffer, &nested_column);
if (!st.ok()) {
// we should do revert if error
array_column->pop_back(size);
return st;
}
}
rb.position() += nested_str_len;
DCHECK_LE(rb.position(), rb.end());
Expand Down
6 changes: 0 additions & 6 deletions be/src/vec/exec/format/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,12 +237,6 @@ Status CsvReader::_fill_dest_columns(const Slice& line, std::vector<MutableColum
return Status::OK();
}

RETURN_IF_ERROR(_check_array_format(_split_values, &is_success));
if (UNLIKELY(!is_success)) {
// If not success, which means we met an invalid row, filter this row and return.
return Status::OK();
}

// if _split_values.size > _file_slot_descs.size()
// we only take the first few columns
for (int i = 0; i < _file_slot_descs.size(); ++i) {
Expand Down
47 changes: 26 additions & 21 deletions be/src/vec/exec/format/parquet/schema_desc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,31 +157,35 @@ void FieldDescriptor::parse_physical_field(const tparquet::SchemaElement& physic

TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& physical_schema) {
TypeDescriptor type;
switch (physical_schema.type) {
case tparquet::Type::BOOLEAN:
type.type = TYPE_BOOLEAN;
return type;
case tparquet::Type::INT32:
type.type = TYPE_INT;
return type;
case tparquet::Type::INT64:
case tparquet::Type::INT96:
type.type = TYPE_BIGINT;
return type;
case tparquet::Type::FLOAT:
type.type = TYPE_FLOAT;
return type;
case tparquet::Type::DOUBLE:
type.type = TYPE_DOUBLE;
return type;
default:
break;
}
type.type = INVALID_TYPE;
if (physical_schema.__isset.logicalType) {
type = convert_to_doris_type(physical_schema.logicalType);
} else if (physical_schema.__isset.converted_type) {
type = convert_to_doris_type(physical_schema.converted_type);
}
// use physical type instead
if (type.type == INVALID_TYPE) {
switch (physical_schema.type) {
case tparquet::Type::BOOLEAN:
type.type = TYPE_BOOLEAN;
return type;
case tparquet::Type::INT32:
type.type = TYPE_INT;
return type;
case tparquet::Type::INT64:
case tparquet::Type::INT96:
type.type = TYPE_BIGINT;
return type;
case tparquet::Type::FLOAT:
type.type = TYPE_FLOAT;
return type;
case tparquet::Type::DOUBLE:
type.type = TYPE_DOUBLE;
return type;
default:
break;
}
}
return type;
}

Expand Down Expand Up @@ -214,7 +218,7 @@ TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::LogicalType logi
} else if (logicalType.__isset.TIMESTAMP) {
type.type = TYPE_DATETIMEV2;
} else {
LOG(WARNING) << "Not supported parquet LogicalType";
type.type = INVALID_TYPE;
}
return type;
}
Expand Down Expand Up @@ -253,6 +257,7 @@ TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::ConvertedType::t
break;
default:
LOG(WARNING) << "Not supported parquet ConvertedType: " << convertedType;
type = INVALID_TYPE;
break;
}
return type;
Expand Down
2 changes: 1 addition & 1 deletion be/src/vec/exec/scan/vfile_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ Status VFileScanner::_get_next_reader() {
_name_to_col_type.clear();
_missing_cols.clear();
_cur_reader->get_columns(&_name_to_col_type, &_missing_cols);
if (!_missing_cols.empty() && _is_load && VLOG_NOTICE_IS_ON) {
if (VLOG_NOTICE_IS_ON && !_missing_cols.empty() && _is_load) {
fmt::memory_buffer col_buf;
for (auto& col : _missing_cols) {
fmt::format_to(col_buf, " {}", col);
Expand Down
3 changes: 0 additions & 3 deletions be/src/vec/exec/scan/vscanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,6 @@ Status VScanner::close(RuntimeState* state) {
}

void VScanner::_update_counters_before_close() {
LOG(INFO) << "cmy _update_counters_before_close: _counter.num_rows_filtered: "
<< _counter.num_rows_filtered
<< ", _counter.num_rows_unselected: " << _counter.num_rows_unselected;
if (!_state->enable_profile() && !_is_load) return;
COUNTER_UPDATE(_parent->_rows_read_counter, _num_rows_read);
// Update stats for load
Expand Down
7 changes: 4 additions & 3 deletions be/src/vec/exec/vbroker_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,10 @@ Status VBrokerScanner::_fill_dest_columns(const Slice& line,
return Status::OK();
}

if (!check_array_format(_split_values)) {
return Status::OK();
}
// This check is meaningless, should be removed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we remove this, may be we should also covert the invalid array value in this row to null. Keep the same logic as cast function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is done in ConvertImplGenericFromString in function_cast.h.
Just as you recommended, all malformat array string will return null.

// if (!check_array_format(_split_values)) {
// return Status::OK();
// }

int idx = 0;
for (int i = 0; i < _split_values.size(); ++i) {
Expand Down
17 changes: 13 additions & 4 deletions be/src/vec/functions/function_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,12 @@ struct ConvertImplGenericFromString {
check_and_get_column<StringColumnType>(&col_from)) {
auto col_to = data_type_to->create_column();

//IColumn & col_to = *res;
size_t size = col_from.size();
col_to->reserve(size);

ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size);
ColumnUInt8::Container* vec_null_map_to = &col_null_map_to->get_data();

for (size_t i = 0; i < size; ++i) {
const auto& val = col_from_string->get_data_at(i);
// Note: here we should handle the null element
Expand All @@ -332,9 +334,15 @@ struct ConvertImplGenericFromString {
continue;
}
ReadBuffer read_buffer((char*)(val.data), val.size);
RETURN_IF_ERROR(data_type_to->from_string(read_buffer, col_to));
Status st = data_type_to->from_string(read_buffer, col_to);
// if parsing failed, will return null
(*vec_null_map_to)[i] = !st.ok();
if (!st.ok()) {
col_to->insert_default();
}
}
block.replace_by_position(result, std::move(col_to));
block.get_by_position(result).column =
ColumnNullable::create(std::move(col_to), std::move(col_null_map_to));
} else {
return Status::RuntimeError(
"Illegal column {} of first argument of conversion function from string",
Expand Down Expand Up @@ -870,8 +878,9 @@ struct ConvertThroughParsing {
if constexpr (IsDataTypeDecimal<ToDataType>) {
UInt32 scale = additions;
col_to = ColVecTo::create(size, scale);
} else
} else {
col_to = ColVecTo::create(size);
}

typename ColVecTo::Container& vec_to = col_to->get_data();

Expand Down
2 changes: 2 additions & 0 deletions be/src/vec/io/reader_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class ReadBuffer {

size_t count() { return _end - _start; }

std::string to_string() { return std::string(_start, (_end - _start)); }

private:
char* _start;
char* _end;
Expand Down
4 changes: 2 additions & 2 deletions docs/zh-CN/docs/sql-manual/sql-reference/Data-Types/ARRAY.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ under the License.

### description

ARRAY\<T\>
`ARRAY<T>`

由T类型元素组成的数组,不能作为key列使用。目前支持在Duplicate模型的表中使用。

Expand Down Expand Up @@ -74,7 +74,7 @@ PROPERTIES (
mysql> INSERT INTO `array_test` VALUES (1, [1,2,3,4,5]);
mysql> INSERT INTO `array_test` VALUES (2, array(6,7,8)), (3, array()), (4, null);
```
注意:以上sql仅在非向量化场景下,支持array()函数,向量化场景不支持。
注意:以上sql仅在非向量化场景下,支持 array() 函数,向量化场景不支持。

查询数据示例:

Expand Down
4 changes: 0 additions & 4 deletions fe/check/checkstyle/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,6 @@ under the License.
<property name="lineSeparator" value="lf"/>
</module>

<module name="RegexpSingleline">
<property name="format" value="&gt;&gt;&gt;&gt;&gt;&gt;&gt;"/>
<property name="message" value="Merge conflicts unresolved."/>
</module>
<module name="RegexpSingleline">
<property name="format" value="&lt;&lt;&lt;&lt;&lt;&lt;&lt;"/>
<property name="message" value="Merge conflicts unresolved."/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ public void analyze(boolean isOlap) throws AnalysisException {
if (type.getPrimitiveType() == PrimitiveType.ARRAY) {
if (isKey()) {
throw new AnalysisException("Array can only be used in the non-key column of"
+ " the duplicate table at present.");
+ " the duplicate table at present.");
}
if (defaultValue.isSet && defaultValue != DefaultValue.NULL_DEFAULT_VALUE) {
throw new AnalysisException("Array type column default value only support null");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,9 @@ public void analyze(Analyzer analyzer) throws UserException {
if (columnDef.getType().getPrimitiveType() == PrimitiveType.JSONB) {
break;
}
if (columnDef.getType().isCollectionType()) {
break;
}
if (columnDef.getType().getPrimitiveType() == PrimitiveType.VARCHAR) {
keysColumnNames.add(columnDef.getName());
break;
Expand Down Expand Up @@ -393,9 +396,6 @@ public void analyze(Analyzer analyzer) throws UserException {
columnDef.analyze(engineName.equals("olap"));

if (columnDef.getType().isArrayType()) {
if (!Config.enable_array_type) {
throw new AnalysisException("Please open enable_array_type config before use Array.");
}
if (columnDef.getAggregateType() != null && columnDef.getAggregateType() != AggregateType.NONE) {
throw new AnalysisException("Array column can't support aggregation "
+ columnDef.getAggregateType());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
*/
public class ArrayType extends Type {

public static final int MAX_NESTED_DEPTH = 9;

@SerializedName(value = "itemType")
private Type itemType;

Expand Down
16 changes: 7 additions & 9 deletions fe/fe-core/src/main/java/org/apache/doris/catalog/Type.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,8 @@
* as abstract methods that subclasses must implement.
*/
public abstract class Type {
// Maximum nesting depth of a type. This limit was determined experimentally by
// org.apache.doris.rewrite.FoldConstantsRule.apply generating and scanning
// deeply nested Parquet and Avro files. In those experiments, we exceeded
// the stack space in the scanner (which uses recursion for dealing with
// nested types) at a nesting depth between 200 and 300 (200 worked, 300 crashed).
public static int MAX_NESTING_DEPTH = 2;
// Currently only support Array type with max 9 depths.
public static int MAX_NESTING_DEPTH = 9;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 9?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a hard limit on BE side. See TypeInfo* get_array_type_info in be/src/olap/types.cpp:

DCHECK(iterations <= 8) << "the depth of nested array type should not be larger than 8";


// Static constant types for scalar types that don't require additional information.
public static final ScalarType INVALID = new ScalarType(PrimitiveType.INVALID_TYPE);
Expand Down Expand Up @@ -488,7 +484,7 @@ public static boolean canCastTo(Type t1, Type t2) {
} else if (t1.isArrayType() && t2.isArrayType()) {
return ArrayType.canCastTo((ArrayType) t1, (ArrayType) t2);
}
return t1.isNull() || t1.getPrimitiveType() == PrimitiveType.VARCHAR;
return t1.isNull() || t1.getPrimitiveType().isCharFamily();
}

/**
Expand Down Expand Up @@ -612,7 +608,7 @@ public boolean exceedsMaxNestingDepth() {
* MAP<STRING,STRUCT<f1:INT>> --> 3
*/
private boolean exceedsMaxNestingDepth(int d) {
if (d >= MAX_NESTING_DEPTH) {
if (d > MAX_NESTING_DEPTH) {
return true;
}
if (isStructType()) {
Expand All @@ -623,7 +619,9 @@ private boolean exceedsMaxNestingDepth(int d) {
}
}
} else if (isArrayType()) {
return false;
ArrayType arrayType = (ArrayType) this;
Type itemType = arrayType.getItemType();
return itemType.exceedsMaxNestingDepth(d + 1);
} else if (isMultiRowType()) {
MultiRowType multiRowType = (MultiRowType) this;
return multiRowType.getItemType().exceedsMaxNestingDepth(d + 1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.qe.ConnectContext;

import mockit.Mock;
Expand All @@ -47,7 +46,6 @@ public void setUp() {
stringCol = new TypeDef(ScalarType.createChar(10));
floatCol = new TypeDef(ScalarType.createType(PrimitiveType.FLOAT));
booleanCol = new TypeDef(ScalarType.createType(PrimitiveType.BOOLEAN));
Config.enable_array_type = true;

ctx = new ConnectContext(null);
new MockUp<ConnectContext>() {
Expand Down
Loading