Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support regexp_instr function #6272

Merged
merged 119 commits into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
119 commits
Select commit Hold shift + click to select a range
8c5768b
save
xzhangxian1008 Sep 23, 2022
8b9c149
save
xzhangxian1008 Sep 26, 2022
f4aeb00
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 26, 2022
c9922bc
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 27, 2022
c98b151
ready to compile
xzhangxian1008 Sep 27, 2022
c126dc5
successfully compile
xzhangxian1008 Sep 27, 2022
2eabd61
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 27, 2022
51ff173
clean up
xzhangxian1008 Sep 28, 2022
74d7096
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 28, 2022
32879c2
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 29, 2022
1adc613
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Sep 30, 2022
573f877
pass tests, for the moment
xzhangxian1008 Sep 30, 2022
08f3305
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 1, 2022
101c4ff
pass gtests
xzhangxian1008 Oct 1, 2022
2be0ea7
ut passed
xzhangxian1008 Oct 7, 2022
7150b5f
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 7, 2022
61baf63
format
xzhangxian1008 Oct 7, 2022
74f090e
fix ut
xzhangxian1008 Oct 7, 2022
46a0499
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 9, 2022
5f1d1f8
save works
xzhangxian1008 Oct 10, 2022
db38cf4
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 10, 2022
1de2747
pass compilation
xzhangxian1008 Oct 10, 2022
fdade39
format
xzhangxian1008 Oct 11, 2022
9a03b7a
add the convertion of int col
xzhangxian1008 Oct 11, 2022
a6ede08
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 12, 2022
c5e6672
undef
xzhangxian1008 Oct 12, 2022
8f44a5d
save works
xzhangxian1008 Oct 12, 2022
291fcc8
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 13, 2022
efb2d73
tweaking
xzhangxian1008 Oct 13, 2022
5a4b64a
resolve
xzhangxian1008 Oct 13, 2022
4561050
tweaking
xzhangxian1008 Oct 13, 2022
35ac32a
save
xzhangxian1008 Oct 13, 2022
b7f9a5d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 13, 2022
910598f
save works
xzhangxian1008 Oct 13, 2022
85074af
save works
xzhangxian1008 Oct 14, 2022
e471028
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 17, 2022
3561f6d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 18, 2022
ceeb1a2
need gtest
xzhangxian1008 Oct 18, 2022
f3e6979
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 18, 2022
0b18c2a
fix integration test
xzhangxian1008 Oct 18, 2022
5474fd0
fix
xzhangxian1008 Oct 19, 2022
6368c9d
workaround
xzhangxian1008 Oct 19, 2022
7b45047
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 21, 2022
6892ea0
add todo
xzhangxian1008 Oct 21, 2022
e08fbcb
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 21, 2022
72877ce
fix
xzhangxian1008 Oct 21, 2022
34c7849
pass const test
xzhangxian1008 Oct 21, 2022
4003f73
pass some gtests
xzhangxian1008 Oct 21, 2022
45debaf
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 24, 2022
f89f4eb
tweaking
xzhangxian1008 Oct 24, 2022
02f56a6
modify the processing of const null etc...
xzhangxian1008 Oct 24, 2022
4fc6944
merge empty_pattern
xzhangxian1008 Oct 24, 2022
8f89448
refine macro name
xzhangxian1008 Oct 24, 2022
c859f64
pass pure vector tests
xzhangxian1008 Oct 25, 2022
28c14a6
pass collation
xzhangxian1008 Oct 26, 2022
71663c8
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 26, 2022
3400eb6
unport replace and make memorization in multi-threads
xzhangxian1008 Oct 26, 2022
640461d
remove ParamDefault
xzhangxian1008 Oct 26, 2022
df0ef3e
merge
xzhangxian1008 Oct 26, 2022
f7e3302
tweaking
xzhangxian1008 Oct 26, 2022
40c1fd2
tweaking
xzhangxian1008 Oct 26, 2022
89667b6
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Oct 27, 2022
bfcf3fb
resolve comments
xzhangxian1008 Oct 27, 2022
c3417d0
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Oct 27, 2022
54aa324
finish
xzhangxian1008 Oct 27, 2022
6cac98c
tweaking
xzhangxian1008 Oct 28, 2022
6ebc3f6
modify punctuation and format
xzhangxian1008 Oct 31, 2022
08ff36d
resolve comments
xzhangxian1008 Oct 31, 2022
202f44d
tweaking
xzhangxian1008 Oct 31, 2022
8e231fc
solve not all const col
xzhangxian1008 Oct 31, 2022
515d19f
Update dbms/src/Functions/FunctionsRegexp.cpp
xzhangxian1008 Nov 1, 2022
a450985
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 1, 2022
add642f
resolve comments
xzhangxian1008 Nov 1, 2022
e80dc23
fix bug
xzhangxian1008 Nov 1, 2022
a423149
add match_type ft
xzhangxian1008 Nov 1, 2022
b3e6627
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 2, 2022
4e9eb84
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 3, 2022
602b380
refactor the handling of parms
xzhangxian1008 Nov 3, 2022
ad0e351
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 3, 2022
8622c01
compress macros
xzhangxian1008 Nov 3, 2022
a94293f
format
xzhangxian1008 Nov 3, 2022
cb93dea
tweaking
xzhangxian1008 Nov 3, 2022
a91a6cf
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 4, 2022
27ff20e
add tests
xzhangxian1008 Nov 4, 2022
444cb5e
tweaking
xzhangxian1008 Nov 4, 2022
a2d4bae
tweaking
xzhangxian1008 Nov 4, 2022
31ed79e
tweaking
xzhangxian1008 Nov 4, 2022
8cdf3ae
refine
xzhangxian1008 Nov 4, 2022
88edd55
Merge branch 'master' of https://github.com/pingcap/tiflash into empt…
xzhangxian1008 Nov 7, 2022
8ad4307
resolve comment
xzhangxian1008 Nov 7, 2022
8613e89
merge and update instr
xzhangxian1008 Nov 7, 2022
3048031
pass compilation
xzhangxian1008 Nov 7, 2022
38571b8
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 7, 2022
6db90ee
fix ut
xzhangxian1008 Nov 7, 2022
e1d1501
refine header
xzhangxian1008 Nov 7, 2022
3b5806a
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 8, 2022
c23bb97
refine header
xzhangxian1008 Nov 8, 2022
e22079d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 8, 2022
f74afa9
replace getMatchedIndex with find and refine comments
xzhangxian1008 Nov 8, 2022
288ac97
refinw
xzhangxian1008 Nov 8, 2022
94d5b78
clean code
xzhangxian1008 Nov 8, 2022
e94ccd1
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 14, 2022
8efe7af
refine check type
xzhangxian1008 Nov 14, 2022
af469bd
refine substr impl
xzhangxian1008 Nov 14, 2022
5a59a93
replace with template
xzhangxian1008 Nov 15, 2022
377b83d
resolve comments
xzhangxian1008 Nov 17, 2022
be4334f
resolve comments
xzhangxian1008 Nov 21, 2022
9eced0e
add test case
xzhangxian1008 Nov 21, 2022
96087c6
resolve comment
xzhangxian1008 Nov 22, 2022
6b667a8
add some tests
xzhangxian1008 Nov 22, 2022
368046d
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 22, 2022
e2dbec5
add tests
xzhangxian1008 Nov 23, 2022
298611b
resolve comment
xzhangxian1008 Nov 23, 2022
5f60c62
Merge branch 'master' of https://github.com/pingcap/tiflash into instr
xzhangxian1008 Nov 24, 2022
de7fd31
fix critical
xzhangxian1008 Nov 24, 2022
085b454
resolve comments
xzhangxian1008 Nov 28, 2022
584e767
tweaking
xzhangxian1008 Nov 28, 2022
29e6401
fix ft
xzhangxian1008 Nov 28, 2022
7e2e1ca
Merge branch 'master' into instr
ti-chi-bot Nov 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ut passed
  • Loading branch information
xzhangxian1008 committed Oct 7, 2022
commit 2be0ea7a4fee561ffd89a4c6339511e6df3f8492
7 changes: 4 additions & 3 deletions dbms/src/Functions/FunctionsRegexp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ std::set<char> valid_flags{flag_i, flag_c, flag_m, flag_s};

// If characters specifying contradictory options are specified
// within match_type, the rightmost one takes precedence.
String getMatchType(const String & match_type)
String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator)
{
// TODO handle collation
std::set<char> applied_flags;
if (collator != nullptr && collator->isCI())
applied_flags.insert(flag_i);

for (auto flag : match_type)
{
Expand All @@ -47,7 +48,7 @@ String getMatchType(const String & match_type)
// to enable the case-sensitive for the regexp
if (flag == flag_c)
{
auto iter_i = applied_flags.find('i');
auto iter_i = applied_flags.find(flag_i);
if (iter_i != applied_flags.end())
applied_flags.erase(iter_i);

Expand Down
63 changes: 30 additions & 33 deletions dbms/src/Functions/FunctionsRegexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct NameReplaceRegexpAll
static constexpr auto name = "replaceRegexpAll";
};

String getMatchType(const String & match_type);
String getMatchType(const String & match_type, TiDB::TiDBCollatorPtr collator = nullptr);

inline int getDefaultFlags()
{
Expand All @@ -93,18 +93,24 @@ struct NullPresence

NullPresence getNullPresense(const Block & block, const ColumnNumbers & args);

inline String addMatchTypeForPattern(const String & pattern, const String & match_type)
inline String addMatchTypeForPattern(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator)
{
String flags = getMatchType(match_type);
String flags = getMatchType(match_type, collator);
return fmt::format("(?{}){}", flags, pattern);
}

inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type)
inline Regexps::Pool::Pointer createRegexpWithMatchType(const String & pattern, const String & match_type, TiDB::TiDBCollatorPtr collator)
{
String final_pattern = addMatchTypeForPattern(pattern, match_type);
String final_pattern = addMatchTypeForPattern(pattern, match_type, collator);
return Regexps::get<false, true>(final_pattern, getDefaultFlags());
}

// Applies the collation's case sensitivity to a pattern that has no explicit
// match_type argument.
//
// pattern  - regular expression text; modified in place.
// collator - may be nullptr, in which case nothing is done.
//
// When `collator` is non-null and collator->isCI() is true, `pattern` is
// replaced by "(?i)" followed by its previous contents; otherwise it is left
// unchanged.
inline void handleCollatorWithoutMatchType(String & pattern, TiDB::TiDBCollatorPtr collator)
{
    const bool case_insensitive = (collator != nullptr) && collator->isCI();
    if (!case_insensitive)
        return;

    pattern = fmt::format("(?i){}", pattern);
}

// Columns may be const, nullable or plain vector, we can conveniently handle
// these different type columns with Param.
class Param
Expand Down Expand Up @@ -132,18 +138,14 @@ class Param
// const null can't be here as we should have handle it in the previous
Field field;
auto p = static_cast<const ColumnNullable &>(*col_const_data).getNestedColumnPtr();
std::cout << fmt::format("family name: {}", p->getFamilyName()) << std::endl;
col_const->get(0, field);
std::cout << "type name: " << field.getTypeName() << std::endl;
data_string = field.safeGet<String>();
null_map = &(static_cast<const ColumnNullable &>(*col_const_data).getNullMapData());
std::cout << fmt::format("cons data string1: {}", data_string) << std::endl;
}
else
{
StringRef tmp_data = col_const->getDataAt(0);
data_string = String(tmp_data.data, tmp_data.size);
std::cout << fmt::format("cons data string2: {}", data_string) << std::endl;
}

is_const = true;
Expand Down Expand Up @@ -183,12 +185,10 @@ class Param
col_const->get(0, field);
data_int64 = field.get<Int64>();
null_map = &(static_cast<const ColumnNullable &>(*col_ptr).getNullMapData());
std::cout << fmt::format("cons data int 1: {}", data_int64) << std::endl;
}
else
{
data_int64 = col_const->getValue<Int64>();
std::cout << fmt::format("cons data int 2: {}", data_int64) << std::endl;
}

is_const = true;
Expand Down Expand Up @@ -237,12 +237,10 @@ class Param
{
StringRef sr = col_str->getDataAt(idx);
String ret_str(sr.data, sr.size);
std::cout << fmt::format("getString here1: {}", ret_str) << std::endl;
return ret_str;
}
else {
String ret_str(data_string);
std::cout << fmt::format("getString here2: {}", ret_str) << std::endl;
return ret_str;
}
}
Expand Down Expand Up @@ -279,7 +277,7 @@ class FunctionStringRegexpBase
static constexpr size_t REGEXP_REPLACE_MAX_PARAM_NUM = 6;
static constexpr size_t REGEXP_SUBSTR_MAX_PARAM_NUM = 5;

void memorize(const Param & pat_param, const std::unique_ptr<const Param> & match_type_param) const
void memorize(const Param & pat_param, const std::unique_ptr<const Param> & match_type_param, TiDB::TiDBCollatorPtr collator) const
{
String && final_pattern = pat_param.getString(0);
if (final_pattern.empty())
Expand All @@ -288,7 +286,10 @@ class FunctionStringRegexpBase
if (match_type_param != nullptr)
{
String && match_type = match_type_param->getString(0);
final_pattern = addMatchTypeForPattern(final_pattern, match_type);
final_pattern = addMatchTypeForPattern(final_pattern, match_type, collator);
} else
{
handleCollatorWithoutMatchType(final_pattern, collator);
}

int flags = getDefaultFlags();
Expand Down Expand Up @@ -361,7 +362,7 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
args_max_num = REGEXP_MAX_PARAM_NUM;

size_t arg_num = arguments.size();
if (arg_num < REGEXP_XXX_MIN_PARAM_NUM)
if (arg_num < REGEXP_XXX_MIN_PARAM_NUM || arg_num > args_max_num)
throw Exception("Illegal argument number");

bool has_nullable_col = false;
Expand Down Expand Up @@ -408,8 +409,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
auto arg_num = arguments.size();
size_t col_size = expr_param.getDataNum();

std::cout << fmt::format("pat_param get string: {}", pat_param.getString(0)) << std::endl;

// match_type_param will be initialized, only when this is a regexp_like function
std::unique_ptr<const Param> match_type_param;

Expand All @@ -433,19 +432,17 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
// Check if args are all const columns
if (expr_param.isConstCol() && pat_param.isConstCol())
{
#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type) \
#define GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator) \
do { \
int flags = getDefaultFlags(); \
String final_pattern = pat; \
std::cout << fmt::format("pat: {}", pat) << std::endl; \
String final_pattern = (pat); \
if constexpr (has_match_type) \
{ \
/* put match_type into pattern */ \
String match_type = (match_type_param)->getString(0); \
std::cout << fmt::format("match_type: {}", match_type) << std::endl; \
final_pattern = addMatchTypeForPattern(final_pattern, match_type); \
std::cout << fmt::format("final_pattern: {}", final_pattern) << std::endl; \
} \
final_pattern = addMatchTypeForPattern(final_pattern, match_type, (collator)); \
} else \
handleCollatorWithoutMatchType(final_pattern, (collator)); \
Regexps::Regexp regexp(final_pattern, flags); \
ResultType res{regexp.match(expr)}; \
(block).getByPosition(result).column = (block).getByPosition(result).type->createColumnConst((pat_param).getDataNum(), toField(res)); \
Expand All @@ -462,13 +459,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
if (arg_num > 2 && match_type_param->isConstCol())
{
constexpr bool has_match_type = true;
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type);
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator);
return;
}
else if (arg_num == 2)
{
constexpr bool has_match_type = false;
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type);
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator);
return;
}
// reach here when arg_num == 3 and match_type is not const
Expand All @@ -477,15 +474,15 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
{
// regexp function
constexpr bool has_match_type = false;
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type);
GET_CONST_RESULT(block, expr, pat, pat_param, match_type_param, has_match_type, collator);
return;
}
#undef GET_CONST_RESULT
}

// Check memorization
if (canMemorize<Name>(arg_num, pat_param, match_type_param))
memorize(pat_param, match_type_param);
memorize(pat_param, match_type_param, collator);

// Initialize result column
auto col_res = ColumnVector<ResultType>::create();
Expand Down Expand Up @@ -528,7 +525,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
{
expr_param.getStringRef(i, expr_ref);
auto res = regexp->match(expr_ref.data, expr_ref.size);
std::cout << fmt::format("memorized not null: {}, res: {}", String(expr_ref.data, expr_ref.size), res) << std::endl;
vec_res[i] = res; // match
}

Expand Down Expand Up @@ -575,12 +571,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
if constexpr (class_name == regexp_like_name)
{
// regexp_like function
auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i));
auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator);
vec_res[i] = regexp->match(expr); // match
}
else
{
// regexp function
handleCollatorWithoutMatchType(pat, collator);
int flags = getDefaultFlags();
const auto & regexp = Regexps::get<false, true>(pat, flags);
vec_res[i] = regexp->match(expr); // match
Expand All @@ -602,12 +599,13 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
if constexpr (class_name == regexp_like_name)
{
// regexp_like function
auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i));
auto regexp = createRegexpWithMatchType(pat, match_type_param->getString(i), collator);
vec_res[i] = regexp->match(expr); // match
}
else
{
// regexp function
handleCollatorWithoutMatchType(pat, collator);
int flags = getDefaultFlags();
const auto & regexp = Regexps::get<false, true>(pat, flags);
vec_res[i] = regexp->match(expr); // match
Expand All @@ -621,7 +619,6 @@ class FunctionStringRegexp : public FunctionStringRegexpBase, public IFunction
private:
void checkInputArg(const DataTypePtr & arg, bool * has_nullable_col) const
{
std::cout << "type name: " << arg->getName() << std::endl;
if (arg->isNullable())
{
*has_nullable_col = true;
Expand Down
Loading