Skip to content

Commit

Permalink
Support regexp_instr function (#6272)
Browse files Browse the repository at this point in the history
ref #6115
  • Loading branch information
xzhangxian1008 authored Nov 28, 2022
1 parent b59bcbd commit 23ec2e1
Show file tree
Hide file tree
Showing 8 changed files with 1,270 additions and 98 deletions.
7 changes: 7 additions & 0 deletions dbms/src/Common/OptimizedRegularExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#pragma once

#include <Common/config.h>
#include <common/types.h>
#include <re2/re2.h>

#include <memory>
Expand Down Expand Up @@ -112,7 +113,13 @@ class OptimizedRegularExpressionImpl
out_required_substring_is_prefix = required_substring_is_prefix;
}

/// Searches `subject` (`subject_size` bytes) for the `occur`-th match of this expression,
/// starting at the 1-based UTF-8 (code point) position `pos`.
/// Returns a 1-based UTF-8 position, or 0 when there is no such match:
/// with `ret_op` == 0 it is the position where the match starts,
/// with `ret_op` == 1 it is the position right after the end of the match.
/// `pos` and `ret_op` are validated by a runtime check before searching.
Int64 instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op);

private:
/// Handles `instr` for an empty subject: returns `byte_pos` if `occur` is 1 and the
/// expression matches the empty string, otherwise 0.
Int64 processEmptyStringExpr(const char * expr, size_t expr_size, size_t byte_pos, Int64 occur);
/// Handles `instr` for a non-empty subject; `byte_pos` is a 1-based byte position,
/// the return value is converted back to a 1-based UTF-8 position (0 when not found).
Int64 getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op);


bool is_trivial;
bool required_substring_is_prefix;
bool is_case_insensitive;
Expand Down
77 changes: 70 additions & 7 deletions dbms/src/Common/OptimizedRegularExpression.inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <Poco/Exception.h>
#include <common/defines.h>
#include <common/types.h>

#include <iostream>


#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5

Expand Down Expand Up @@ -344,7 +348,7 @@ bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, si
pos = strstr(subject, required_substring.data());

if (nullptr == pos)
return 0;
return false;
}

return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
Expand All @@ -364,12 +368,12 @@ bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, si
pos = strstr(subject, required_substring.data());

if (pos == nullptr)
return 0;
return false;
else
{
match.offset = pos - subject;
match.length = required_substring.size();
return 1;
return true;
}
}
else
Expand All @@ -383,18 +387,18 @@ bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, si
pos = strstr(subject, required_substring.data());

if (nullptr == pos)
return 0;
return false;
}

StringPieceType piece;

if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
return 0;
return false;
else
{
match.offset = piece.data() - subject;
match.length = piece.length();
return 1;
return true;
}
}
}
Expand Down Expand Up @@ -469,5 +473,64 @@ unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject
}
}

/// Evaluates the position search against an empty subject.
///
/// Only the first occurrence can exist, so any `occur` other than 1 yields 0.
/// Otherwise the result is `pos` when the expression matches `expr`, and 0 when it does not.
template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::processEmptyStringExpr(const char * expr, size_t expr_size, size_t pos, Int64 occur)
{
    if (occur == 1)
    {
        StringPieceType input(expr, expr_size);
        if (RegexType::FindAndConsume(&input, *re2))
            return pos;
    }

    // Either a later occurrence was requested or the pattern does not match.
    return 0;
}

// Validates the `pos` and `ret_op` arguments of the regexp position search.
//   utf8_total_len - number of UTF-8 code points in the subject
//   subject_size   - size of the subject in bytes
//   pos            - 1-based code point position the search starts from
//   ret_op         - return option, must be 0 or 1
// Each violated condition is reported through RUNTIME_CHECK_MSG with the message given below.
static inline void checkArgs(Int64 utf8_total_len, size_t subject_size, Int64 pos, Int64 ret_op)
{
// Only the two return options 0 and 1 are accepted.
RUNTIME_CHECK_MSG(!(ret_op != 0 && ret_op != 1), "Incorrect argument to regexp function: return_option must be 1 or 0");
// pos must be positive; it may exceed the code point count only when the subject is empty.
// NOTE(review): "regular function" presumably means "regular expression function" — confirm before rewording.
RUNTIME_CHECK_MSG(!(pos <= 0 || (pos > utf8_total_len && subject_size != 0)), "Index out of bounds in regular function.");
}

// Normalises the requested occurrence so that it is always at least 1.
// Any value below 1 (zero as well as negatives) is replaced by 1, i.e. "the first match".
// Zero must be covered too: an occurrence of 0 would make the match loop run zero times
// and leave the caller without any matched substring to compute a position from.
static inline void makeOccurValid(Int64 & occur)
{
    if (occur < 1)
        occur = 1;
}

/// Finds the `occur`-th match of the expression in `subject`, searching from the 1-based
/// byte position `byte_pos`, and reports it as a 1-based UTF-8 (code point) position.
///
/// @param subject       start of the whole subject string
/// @param subject_size  size of the subject in bytes
/// @param byte_pos      1-based byte position the search starts from
/// @param occur         which match to report; values below 1 are treated as 1
/// @param ret_op        0: position where the match starts, otherwise: position right after the match
/// @return the requested position, or 0 when there is no such match or `byte_pos` lies outside the subject
template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::getSubstrMatchedIndex(const char * subject, size_t subject_size, Int64 byte_pos, Int64 occur, Int64 ret_op)
{
    // A start position before the first byte cannot be converted to an offset.
    if (byte_pos < 1)
        return 0;

    size_t byte_offset = byte_pos - 1; // This is an offset in bytes, not in utf8 code points

    // The byte position is derived from lead bytes only, so malformed UTF-8 may push it past the
    // end of the buffer; without this guard `subject_size - byte_offset` would wrap around.
    if (byte_offset > subject_size)
        return 0;

    // `remaining` is the part of the subject actually handed to the regexp engine;
    // FindAndConsume advances it past every match it reports.
    StringPieceType remaining(subject + byte_offset, subject_size - byte_offset);
    StringPieceType matched_str;

    // Always attempt at least one match so that `matched_str` is set before it is used,
    // even if the caller passed an occurrence below 1.
    do
    {
        if (!RegexType::FindAndConsume(&remaining, *re2, &matched_str))
            return 0;
    } while (--occur > 0);

    // Offset of the reported match relative to the beginning of the whole subject.
    byte_offset = matched_str.data() - subject;

    const size_t result_byte_pos = (ret_op == 0)
        ? byte_offset + 1
        : byte_offset + matched_str.size() + 1;
    return DB::UTF8::bytePos2Utf8Pos(reinterpret_cast<const UInt8 *>(subject), result_byte_pos);
}

/// Entry point of the regexp position search.
///
/// Counts the code points of `subject`, validates `pos` / `ret_op` against that count,
/// normalises `occur`, and then dispatches:
///  - an empty subject is handled by processEmptyStringExpr;
///  - otherwise `pos` is translated from a UTF-8 position to a byte position and
///    the search is delegated to getSubstrMatchedIndex.
template <bool thread_safe>
Int64 OptimizedRegularExpressionImpl<thread_safe>::instr(const char * subject, size_t subject_size, Int64 pos, Int64 occur, Int64 ret_op)
{
    const auto * subject_bytes = reinterpret_cast<const UInt8 *>(subject);

    const Int64 code_point_count = DB::UTF8::countCodePoints(subject_bytes, subject_size);
    checkArgs(code_point_count, subject_size, pos, ret_op);
    makeOccurValid(occur);

    if (unlikely(subject_size == 0))
    {
        // Nothing to convert for an empty subject; the position is passed through as is.
        return processEmptyStringExpr(subject, subject_size, pos, occur);
    }

    const size_t start_byte_pos = DB::UTF8::utf8Pos2bytePos(subject_bytes, pos);
    return getSubstrMatchedIndex(subject, subject_size, start_byte_pos, occur, ret_op);
}

#undef MIN_LENGTH_FOR_STRSTR
#undef MAX_SUBPATTERNS
20 changes: 20 additions & 0 deletions dbms/src/Common/UTF8Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,26 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
return res;
}

// Translates a 1-based UTF-8 (code point) position into the matching 1-based byte position.
// Example: in the string "ni好a" the character 'a' is the 4th code point and sits at byte 6.
// The function skips the utf8_pos - 1 preceding characters, advancing by seqLength of each
// character's first byte. It receives no buffer size, so the caller must make sure that
// utf8_pos does not point beyond the string.
static inline Int64 utf8Pos2bytePos(const UInt8 * str, Int64 utf8_pos)
{
    Int64 byte_offset = 0;
    for (Int64 chars_to_skip = utf8_pos - 1; chars_to_skip > 0; --chars_to_skip)
        byte_offset += seqLength(str[byte_offset]);

    // Convert the 0-based offset into a 1-based position.
    return byte_offset + 1;
}

// Translates a 1-based byte position into the matching 1-based UTF-8 (code point) position:
// the code points contained in the byte_pos - 1 bytes in front of the position are counted,
// and one is added to make the result 1-based.
static inline Int64 bytePos2Utf8Pos(const UInt8 * str, Int64 byte_pos)
{
    const Int64 preceding_code_points = countCodePoints(str, byte_pos - 1);
    return 1 + preceding_code_points;
}

} // namespace UTF8


Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
{tipb::ScalarFuncSig::RegexpSig, "regexp"},
{tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"},
{tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"},
// {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"},
{tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"},
// {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"},
// {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"},

Expand Down
4 changes: 3 additions & 1 deletion dbms/src/Functions/FunctionsRegexp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ struct ReplaceRegexpImpl
if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
{
match_occ++;
/// if occ > 0, it will replace all the match expr, otherwise it only replace the occ-th match
/// if occ == 0, it will replace all the match expr, otherwise it only replace the occ-th match
if (occ == 0 || match_occ == occ)
{
const auto & match = matches[0];
Expand Down Expand Up @@ -286,6 +286,7 @@ struct ReplaceRegexpImpl

using FunctionTiDBRegexp = FunctionStringRegexp<NameTiDBRegexp>;
using FunctionRegexpLike = FunctionStringRegexp<NameRegexpLike>;
using FunctionRegexpInstr = FunctionStringRegexpInstr<NameRegexpInstr>;
using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;
using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;

Expand All @@ -295,6 +296,7 @@ void registerFunctionsRegexp(FunctionFactory & factory)
factory.registerFunction<FunctionReplaceRegexpAll>();
factory.registerFunction<FunctionTiDBRegexp>();
factory.registerFunction<FunctionRegexpLike>();
factory.registerFunction<FunctionRegexpInstr>();
}

} // namespace DB
Loading

0 comments on commit 23ec2e1

Please sign in to comment.