diff --git a/base/base/find_symbols.h b/base/base/find_symbols.h index b28749afda6..488481dbc81 100644 --- a/base/base/find_symbols.h +++ b/base/base/find_symbols.h @@ -15,7 +15,7 @@ * * Allow to search for next character from the set of 'symbols...' in a string. * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), - * but with the following differencies: + * but with the following differences: * - works with any memory ranges, including containing zero bytes; * - doesn't require terminating zero byte: end of memory range is passed explicitly; * - if not found, returns pointer to end instead of nullptr; @@ -34,10 +34,52 @@ * If no such characters, returns nullptr. */ +struct SearchSymbols +{ + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) + : str(std::move(in)) + { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) + { + throw std::runtime_error("SearchSymbols can contain at most " + std::to_string(BUFFER_SIZE) + " symbols and " + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + namespace detail { template constexpr bool is_in(char x) { return ((x == chars) || ...); } +static bool is_in(char c, const char * symbols, size_t num_chars) +{ + for (size_t i = 0u; i < num_chars; ++i) + { + if (c == symbols[i]) + { + return true; + } + } + + return false; +} + #if defined(__SSE2__) template inline __m128i mm_is_in(__m128i bytes) @@ -53,6 +95,43 @@ inline __m128i mm_is_in(__m128i bytes) __m128i eq = mm_is_in(bytes); return _mm_or_si128(eq0, eq); } + +inline __m128i mm_is_in(__m128i bytes, const char * symbols, size_t num_chars) +{ + __m128i accumulator = _mm_setzero_si128(); + for 
(size_t i = 0; i < num_chars; ++i) + { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char * symbols, size_t num_chars) +{ + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) + { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u> & needles) +{ + __m128i accumulator = _mm_setzero_si128(); + + for (const auto & needle : needles) + { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} #endif template @@ -99,6 +178,32 @@ inline const char * find_first_symbols_sse2(const char * const begin, const char return return_mode == ReturnMode::End ? end : nullptr; } +template +inline const char * find_first_symbols_sse2(const char * const begin, const char * const end, const char * symbols, size_t num_chars) +{ + const char * pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate(is_in(*pos, symbols, num_chars))) + return pos; + + return return_mode == ReturnMode::End ? 
end : nullptr; +} + template inline const char * find_last_symbols_sse2(const char * const begin, const char * const end) @@ -159,26 +264,61 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha #endif for (; pos < end; ++pos) - if ( (num_chars >= 1 && maybe_negate(*pos == c01)) - || (num_chars >= 2 && maybe_negate(*pos == c02)) - || (num_chars >= 3 && maybe_negate(*pos == c03)) - || (num_chars >= 4 && maybe_negate(*pos == c04)) - || (num_chars >= 5 && maybe_negate(*pos == c05)) - || (num_chars >= 6 && maybe_negate(*pos == c06)) - || (num_chars >= 7 && maybe_negate(*pos == c07)) - || (num_chars >= 8 && maybe_negate(*pos == c08)) - || (num_chars >= 9 && maybe_negate(*pos == c09)) - || (num_chars >= 10 && maybe_negate(*pos == c10)) - || (num_chars >= 11 && maybe_negate(*pos == c11)) - || (num_chars >= 12 && maybe_negate(*pos == c12)) - || (num_chars >= 13 && maybe_negate(*pos == c13)) - || (num_chars >= 14 && maybe_negate(*pos == c14)) - || (num_chars >= 15 && maybe_negate(*pos == c15)) - || (num_chars >= 16 && maybe_negate(*pos == c16))) + if ( (num_chars == 1 && maybe_negate(is_in(*pos))) + || (num_chars == 2 && maybe_negate(is_in(*pos))) + || (num_chars == 3 && maybe_negate(is_in(*pos))) + || (num_chars == 4 && maybe_negate(is_in(*pos))) + || (num_chars == 5 && maybe_negate(is_in(*pos))) + || (num_chars == 6 && maybe_negate(is_in(*pos))) + || (num_chars == 7 && maybe_negate(is_in(*pos))) + || (num_chars == 8 && maybe_negate(is_in(*pos))) + || (num_chars == 9 && maybe_negate(is_in(*pos))) + || (num_chars == 10 && maybe_negate(is_in(*pos))) + || (num_chars == 11 && maybe_negate(is_in(*pos))) + || (num_chars == 12 && maybe_negate(is_in(*pos))) + || (num_chars == 13 && maybe_negate(is_in(*pos))) + || (num_chars == 14 && maybe_negate(is_in(*pos))) + || (num_chars == 15 && maybe_negate(is_in(*pos))) + || (num_chars == 16 && maybe_negate(is_in(*pos)))) return pos; return return_mode == ReturnMode::End ? 
end : nullptr; } +template +inline const char * find_first_symbols_sse42(const char * const begin, const char * const end, const SearchSymbols & symbols) +{ + const char * pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); + + if constexpr (positive) + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } + else + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate(is_in(*pos, symbols.str.data(), num_chars))) + return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} /// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. 
@@ -194,6 +334,17 @@ inline const char * find_first_symbols_dispatch(const char * begin, const char * return find_first_symbols_sse2(begin, end); } +template +inline const char * find_first_symbols_dispatch(const std::string_view haystack, const SearchSymbols & symbols) +{ +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42(haystack.begin(), haystack.end(), symbols); + else +#endif + return find_first_symbols_sse2(haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); +} + } @@ -211,6 +362,11 @@ inline char * find_first_symbols(char * begin, char * end) return const_cast(detail::find_first_symbols_dispatch(begin, end)); } +inline const char * find_first_symbols(std::string_view haystack, const SearchSymbols & symbols) +{ + return detail::find_first_symbols_dispatch(haystack, symbols); +} + template inline const char * find_first_not_symbols(const char * begin, const char * end) { @@ -223,6 +379,11 @@ inline char * find_first_not_symbols(char * begin, char * end) return const_cast(detail::find_first_symbols_dispatch(begin, end)); } +inline const char * find_first_not_symbols(std::string_view haystack, const SearchSymbols & symbols) +{ + return detail::find_first_symbols_dispatch(haystack, symbols); +} + template inline const char * find_first_symbols_or_null(const char * begin, const char * end) { @@ -235,6 +396,11 @@ inline char * find_first_symbols_or_null(char * begin, char * end) return const_cast(detail::find_first_symbols_dispatch(begin, end)); } +inline const char * find_first_symbols_or_null(std::string_view haystack, const SearchSymbols & symbols) +{ + return detail::find_first_symbols_dispatch(haystack, symbols); +} + template inline const char * find_first_not_symbols_or_null(const char * begin, const char * end) { @@ -247,6 +413,10 @@ inline char * find_first_not_symbols_or_null(char * begin, char * end) return const_cast(detail::find_first_symbols_dispatch(begin, end)); } +inline const char * 
find_first_not_symbols_or_null(std::string_view haystack, const SearchSymbols & symbols) +{ + return detail::find_first_symbols_dispatch(haystack, symbols); +} template inline const char * find_last_symbols_or_null(const char * begin, const char * end) diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 8ead8c58c7a..4a8ac7fbe6f 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -66,6 +66,108 @@ Result: - [Map(key, value)](../../sql-reference/data-types/map.md) data type +## extractKeyValuePairs + +Extracts key-value pairs, i.e. a [Map(String, String)](../../sql-reference/data-types/map.md), from a string. Parsing is robust towards noise (e.g. log files). + +A key-value pair consists of a key, followed by a `key_value_delimiter` and a value. Key-value pairs must be separated by one of the `pair_delimiters`. Quoted keys and values are also supported. + +**Syntax** + +``` sql +extractKeyValuePairs(data[, key_value_delimiter[, pair_delimiters[, quoting_character]]]) +``` + +Aliases: +- `str_to_map` +- `mapFromString` + +**Arguments** + +- `data` - String to extract key-value pairs from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `pair_delimiters` - Set of characters to be used as delimiters between pairs. Defaults to ` `, `,` and `;`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `quoting_character` - Character to be used as quoting character. Defaults to `"`. 
[String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). + +**Returned values** + +- A [Map(String, String)](../../sql-reference/data-types/map.md) of key-value pairs. + +**Examples** + +Simple case: + +``` sql +SELECT extractKeyValuePairs('name:neymar, age:31 team:psg,nationality:brazil') as kv +``` + +Result: + +``` text +┌─kv──────────────────────────────────────────────────────────────────────┐ +│ {'name':'neymar','age':'31','team':'psg','nationality':'brazil'} │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +Single quote as quoting character: + +``` sql +SELECT extractKeyValuePairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') as kv +``` + +Result: + +``` text +┌─kv───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ {'name':'neymar','age':'31','team':'psg','nationality':'brazil','last_key':'last_value'} │ +└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Escape sequences without escape sequences support: + +``` sql +SELECT extractKeyValuePairs('age:a\\x0A\\n\\0') AS kv +``` + +Result: + +``` text +┌─kv─────────────────────┐ +│ {'age':'a\\x0A\\n\\0'} │ +└────────────────────────┘ +``` + +## extractKeyValuePairsWithEscaping + +Same as `extractKeyValuePairs` but with escaping support. + +Supported escape sequences: `\x`, `\N`, `\a`, `\b`, `\e`, `\f`, `\n`, `\r`, `\t`, `\v` and `\0`. +Non-standard escape sequences are returned as is (including the backslash) unless they are one of the following: +`\\`, `'`, `"`, `backtick`, `/`, `=` or ASCII control characters (c <= 31). + +This function will satisfy the use case where pre-escaping and post-escaping are not suitable. For instance, consider the following +input string: `a: "aaaa\"bbb"`. 
The expected output is: `a: aaaa\"bbb`. +- Pre-escaping: Pre-escaping it will output: `a: "aaaa"bbb"` and `extractKeyValuePairs` will then output: `a: aaaa` +- Post-escaping: `extractKeyValuePairs` will output `a: aaaa\` and post-escaping will keep it as is. + +Leading escape sequences will be skipped in keys and will be considered invalid for values. + +**Examples** + +Escape sequences with escape sequence support turned on: + +``` sql +SELECT extractKeyValuePairsWithEscaping('age:a\\x0A\\n\\0') AS kv +``` + +Result: + +``` text +┌─kv────────────────┐ +│ {'age':'a\n\n\0'} │ +└───────────────────┘ +``` + ## mapAdd {#function-mapadd} Collect all the keys and sum corresponding values. diff --git a/src/Common/tests/gtest_find_symbols.cpp b/src/Common/tests/gtest_find_symbols.cpp index 1daab982d01..d9d3ba7660a 100644 --- a/src/Common/tests/gtest_find_symbols.cpp +++ b/src/Common/tests/gtest_find_symbols.cpp @@ -4,9 +4,26 @@ #include +template +void test_find_first_not(const std::string & haystack, std::size_t expected_pos) +{ + const char * begin = haystack.data(); + const char * end = haystack.data() + haystack.size(); + + ASSERT_EQ(begin + expected_pos, find_first_not_symbols(begin, end)); +} + +void test_find_first_not(const std::string & haystack, const std::string & symbols, const std::size_t expected_pos) +{ + const char * begin = haystack.data(); + + ASSERT_EQ(begin + expected_pos, find_first_not_symbols(haystack, SearchSymbols(symbols))); +} + + TEST(FindSymbols, SimpleTest) { - std::string s = "Hello, world! Goodbye..."; + const std::string s = "Hello, world! 
Goodbye..."; const char * begin = s.data(); const char * end = s.data() + s.size(); @@ -17,6 +34,9 @@ TEST(FindSymbols, SimpleTest) ASSERT_EQ(find_first_symbols<'H'>(begin, end), begin); ASSERT_EQ((find_first_symbols<'a', 'e'>(begin, end)), begin + 1); + ASSERT_EQ((find_first_symbols<'a', 'e', 'w', 'x', 'z'>(begin, end)), begin + 1); + ASSERT_EQ((find_first_symbols<'p', 'q', 's', 'x', 'z'>(begin, end)), end); + ASSERT_EQ(find_last_symbols_or_null<'a'>(begin, end), nullptr); ASSERT_EQ(find_last_symbols_or_null<'e'>(begin, end), end - 4); ASSERT_EQ(find_last_symbols_or_null<'.'>(begin, end), end - 1); @@ -36,3 +56,153 @@ TEST(FindSymbols, SimpleTest) ASSERT_EQ(vals, (std::vector{"s", "String"})); } } + +TEST(FindSymbols, RunTimeNeedle) +{ + auto test_haystack = [](const auto & haystack, const auto & unfindable_needle) { +#define TEST_HAYSTACK_AND_NEEDLE(haystack_, needle_) \ + do { \ + const auto & h = haystack_; \ + const auto & n = needle_; \ + EXPECT_EQ( \ + std::find_first_of(h.data(), h.data() + h.size(), n.data(), n.data() + n.size()), \ + find_first_symbols(h, SearchSymbols(n)) \ + ) << "haystack: \"" << h << "\" (" << static_cast(h.data()) << ")" \ + << ", needle: \"" << n << "\""; \ + } \ + while (false) + + // can't find needle + TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle); + +#define TEST_WITH_MODIFIED_NEEDLE(haystack, in_needle, needle_update_statement) \ + do \ + { \ + std::string needle = (in_needle); \ + (needle_update_statement); \ + TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \ + } \ + while (false) + + // findable symbol is at beginning of the needle + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.front()); + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.back()); + // Can find in the middle of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = 
haystack[haystack.size() / 2]); + + // findable symbol is at end of the needle + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.front()); + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.back()); + // Can find in the middle of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]); + + // findable symbol is in the middle of the needle + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front()); + // Can find at first pos of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back()); + // Can find in the middle of haystack + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]); + +#undef TEST_WITH_MODIFIED_NEEDLE +#undef TEST_HAYSTACK_AND_NEEDLE + }; + + // there are 4 major groups of cases: + // haystack < 16 bytes, haystack > 16 bytes + // needle < 5 bytes, needle >= 5 bytes + + // First and last symbols of haystack should be unique + const std::string long_haystack = "Hello, world! Goodbye...?"; + const std::string short_haystack = "Hello, world!"; + + // In sync with find_first_symbols_dispatch code: long needles receive special treatment. 
+ // as of now "long" means >= 5 + const std::string unfindable_long_needle = "0123456789ABCDEF"; + const std::string unfindable_short_needle = "0123"; + + { + SCOPED_TRACE("Long haystack"); + test_haystack(long_haystack, unfindable_long_needle); + test_haystack(long_haystack, unfindable_short_needle); + } + + { + SCOPED_TRACE("Short haystack"); + test_haystack(short_haystack, unfindable_long_needle); + test_haystack(short_haystack, unfindable_short_needle); + } + + // Assert big haystack is not accepted and exception is thrown + ASSERT_ANY_THROW(find_first_symbols(long_haystack, SearchSymbols("ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"))); +} + +TEST(FindNotSymbols, AllSymbolsPresent) +{ + std::string str_with_17_bytes = "hello world hello"; + std::string str_with_16_bytes = {str_with_17_bytes.begin(), str_with_17_bytes.end() - 1u}; + std::string str_with_15_bytes = {str_with_16_bytes.begin(), str_with_16_bytes.end() - 1u}; + + /* + * The below variations will choose different implementation strategies: + * 1. Loop method only because it does not contain enough bytes for SSE 4.2 + * 2. SSE4.2 only since string contains exactly 16 bytes + * 3. SSE4.2 + Loop method will take place because only first 16 bytes are treated by SSE 4.2 and remaining bytes is treated by loop + * + * Below code asserts that all calls return the ::end of the input string. 
This was not true prior to this fix as mentioned in PR #47304 + * */ + + test_find_first_not<'h', 'e', 'l', 'o', 'w', 'r', 'd', ' '>(str_with_15_bytes, str_with_15_bytes.size()); + test_find_first_not<'h', 'e', 'l', 'o', 'w', 'r', 'd', ' '>(str_with_16_bytes, str_with_16_bytes.size()); + test_find_first_not<'h', 'e', 'l', 'o', 'w', 'r', 'd', ' '>(str_with_17_bytes, str_with_17_bytes.size()); + + const auto * symbols = "helowrd "; + + test_find_first_not(str_with_15_bytes, symbols, str_with_15_bytes.size()); + test_find_first_not(str_with_16_bytes, symbols, str_with_16_bytes.size()); + test_find_first_not(str_with_17_bytes, symbols, str_with_17_bytes.size()); +} + +TEST(FindNotSymbols, NoSymbolsMatch) +{ + std::string s = "abcdefg"; + + // begin should be returned since the first character of the string does not match any of the below symbols + test_find_first_not<'h', 'i', 'j'>(s, 0u); + test_find_first_not(s, "hij", 0u); +} + +TEST(FindNotSymbols, ExtraSymbols) +{ + std::string s = "hello_world_hello"; + test_find_first_not<'h', 'e', 'l', 'o', ' '>(s, 5u); + test_find_first_not(s, "helo ", 5u); +} + +TEST(FindNotSymbols, EmptyString) +{ + std::string s; + test_find_first_not<'h', 'e', 'l', 'o', 'w', 'r', 'd', ' '>(s, s.size()); + test_find_first_not(s, "helowrd ", s.size()); +} + +TEST(FindNotSymbols, SingleChar) +{ + std::string s = "a"; + test_find_first_not<'a'>(s, s.size()); + test_find_first_not(s, "a", s.size()); +} + +TEST(FindNotSymbols, NullCharacter) +{ + // special test to ensure only the passed template arguments are used as needles + // since current find_first_symbols implementation takes in 16 characters and defaults + // to \0. 
+ std::string s("abcdefg\0x", 9u); + test_find_first_not<'a', 'b', 'c', 'd', 'e', 'f', 'g'>(s, 7u); + test_find_first_not(s, "abcdefg", 7u); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 54d58658cb8..467d9b58ea8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -624,6 +624,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ M(Bool, optimize_sorting_by_input_stream_properties, true, "Optimize sorting by sorting properties of input stream", 0) \ + M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \ \ /** proton: starts */ \ M(String, shards, "", "If not empty, only the specified shard IDs (or partition IDs if the target stream is a Kafka external stream) will be selected to be read data from. IDs are separated by comma. Example: shards='0,2'", 0) \ diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index ec545b57a33..80b1caa8b9f 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -365,6 +365,9 @@ endif() add_subdirectory(JSONPath) list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath) +add_subdirectory(keyvaluepair) +list (APPEND OBJECT_LIBS $) + # Signed integer overflow on user-provided data inside boost::geometry - ignore. 
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow) diff --git a/src/Functions/keyvaluepair/ArgumentExtractor.cpp b/src/Functions/keyvaluepair/ArgumentExtractor.cpp new file mode 100644 index 00000000000..39af80cb561 --- /dev/null +++ b/src/Functions/keyvaluepair/ArgumentExtractor.cpp @@ -0,0 +1,130 @@ +#include + +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + auto popFrontAndGet(auto & container) + { + auto element = container.front(); + container.pop_front(); + return element; + } +} + +ArgumentExtractor::ParsedArguments ArgumentExtractor::extract(const ColumnsWithTypeAndName & arguments) +{ + return extract(ColumnsWithTypeAndNameList{arguments.begin(), arguments.end()}); +} + +ArgumentExtractor::ParsedArguments ArgumentExtractor::extract(ColumnsWithTypeAndNameList arguments) +{ + static constexpr auto MAX_NUMBER_OF_ARGUMENTS = 4u; + + if (arguments.empty() || arguments.size() > MAX_NUMBER_OF_ARGUMENTS) + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Function extractKeyValuePairs requires at least 1 argument and at most {}. 
{} was provided", + MAX_NUMBER_OF_ARGUMENTS, arguments.size()); + } + + auto data_column = extractStringColumn(popFrontAndGet(arguments), "data_column"); + + if (arguments.empty()) + { + return ParsedArguments{data_column}; + } + + auto key_value_delimiter = extractSingleCharacter(popFrontAndGet(arguments), "key_value_delimiter"); + + if (arguments.empty()) + { + return ParsedArguments {data_column, key_value_delimiter}; + } + + auto pair_delimiters = extractVector(popFrontAndGet(arguments), "pair_delimiters"); + + if (arguments.empty()) + { + return ParsedArguments { + data_column, key_value_delimiter, pair_delimiters + }; + } + + auto quoting_character = extractSingleCharacter(popFrontAndGet(arguments), "quoting_character"); + + return ParsedArguments { + data_column, + key_value_delimiter, + pair_delimiters, + quoting_character, + }; +} + +ArgumentExtractor::CharArgument ArgumentExtractor::extractSingleCharacter(const ColumnWithTypeAndName & argument, const std::string & parameter_name) +{ + const auto type = argument.type; + const auto column = argument.column; + + validateColumnType(type, parameter_name); + + auto view = column->getDataAt(0).toView(); + + if (view.empty()) + { + return {}; + } + else if (view.size() == 1u) + { + return view.front(); + } + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Control character argument must either be empty or contain exactly 1 character"); +} + +ColumnPtr ArgumentExtractor::extractStringColumn(const ColumnWithTypeAndName & argument, const std::string & parameter_name) +{ + auto type = argument.type; + auto column = argument.column; + + validateColumnType(type, parameter_name); + + return column; +} + +ArgumentExtractor::VectorArgument ArgumentExtractor::extractVector(const ColumnWithTypeAndName & argument, const std::string & parameter_name) +{ + const auto type = argument.type; + const auto column = argument.column; + + validateColumnType(type, parameter_name); + + auto view = column->getDataAt(0).toView(); + + return 
{view.begin(), view.end()}; +} + +void ArgumentExtractor::validateColumnType(DataTypePtr type, const std::string & parameter_name) +{ + if (!isStringOrFixedString(type)) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {}. Must be String.", + type, parameter_name); + } +} + +} diff --git a/src/Functions/keyvaluepair/ArgumentExtractor.h b/src/Functions/keyvaluepair/ArgumentExtractor.h new file mode 100644 index 00000000000..e6538584d01 --- /dev/null +++ b/src/Functions/keyvaluepair/ArgumentExtractor.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include + +#include + +namespace DB +{ + +/* + * Validate (API level validation, no business logic validation) and extracts input arguments from + * `ColumnsWithTypeAndName` into ArgumentExtractor::ParsedArguments. + * */ +class ArgumentExtractor +{ +public: + using CharArgument = std::optional; + using VectorArgument = std::vector; + using ColumnsWithTypeAndNameList = std::list; + + struct ParsedArguments + { + ColumnPtr data_column; + + CharArgument key_value_delimiter = {}; + VectorArgument pair_delimiters = {}; + CharArgument quoting_character = {}; + }; + + + static ParsedArguments extract(const ColumnsWithTypeAndName & arguments); + static ParsedArguments extract(ColumnsWithTypeAndNameList arguments); + +private: + static CharArgument extractSingleCharacter(const ColumnWithTypeAndName & arguments, const std::string & parameter_name); + static ColumnPtr extractStringColumn(const ColumnWithTypeAndName & arguments, const std::string & parameter_name); + static VectorArgument extractVector(const ColumnWithTypeAndName & arguments, const std::string & parameter_name); + + static void validateColumnType(DataTypePtr type, const std::string & parameter_name); +}; + +} diff --git a/src/Functions/keyvaluepair/CMakeLists.txt b/src/Functions/keyvaluepair/CMakeLists.txt new file mode 100644 index 00000000000..6f197e05079 --- /dev/null +++ 
b/src/Functions/keyvaluepair/CMakeLists.txt @@ -0,0 +1,7 @@ +include("${proton_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs .) +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs impl) + +add_library(clickhouse_functions_extractkeyvaluepairs ${clickhouse_functions_extractkeyvaluepairs_sources} ${clickhouse_functions_extractkeyvaluepairs_headers}) + +target_link_libraries(clickhouse_functions_extractkeyvaluepairs PRIVATE dbms) diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp new file mode 100644 index 00000000000..abceae7f7a8 --- /dev/null +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -0,0 +1,247 @@ +#include +#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace DB +{ + +template +class ExtractKeyValuePairs : public IFunction +{ + auto getExtractor(const ArgumentExtractor::ParsedArguments & parsed_arguments) const + { + auto builder = KeyValuePairExtractorBuilder(); + + if constexpr (WITH_ESCAPING) + { + builder.withEscaping(); + } + + if (parsed_arguments.key_value_delimiter) + { + builder.withKeyValueDelimiter(parsed_arguments.key_value_delimiter.value()); + } + + if (!parsed_arguments.pair_delimiters.empty()) + { + builder.withItemDelimiters(parsed_arguments.pair_delimiters); + } + + if (parsed_arguments.quoting_character) + { + builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); + } + + bool is_number_of_pairs_unlimited = context->getSettingsRef().extract_kvp_max_pairs_per_row == 0; + + if (!is_number_of_pairs_unlimited) + { + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + } + + return builder.build(); + } + + ColumnPtr extract(ColumnPtr data_column, std::shared_ptr extractor) const + { + auto offsets = ColumnUInt64::create(); + + auto keys = ColumnString::create(); + auto values 
= ColumnString::create(); + + uint64_t offset = 0u; + + for (auto i = 0u; i < data_column->size(); i++) + { + auto row = data_column->getDataAt(i).toView(); + + auto pairs_count = extractor->extract(row, keys, values); + + offset += pairs_count; + + offsets->insert(offset); + } + + keys->validate(); + values->validate(); + + ColumnPtr keys_ptr = std::move(keys); + + return ColumnMap::create(keys_ptr, std::move(values), std::move(offsets)); + } + +public: + explicit ExtractKeyValuePairs(ContextPtr context_) : context(context_) {} + + static constexpr auto name = Name::name; + + String getName() const override + { + return name; + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + { + auto parsed_arguments = ArgumentExtractor::extract(arguments); + + auto extractor = getExtractor(parsed_arguments); + + return extract(parsed_arguments.data_column, extractor); + } + + DataTypePtr getReturnTypeImpl(const DataTypes &) const override + { + return std::make_shared(std::make_shared(), std::make_shared()); + } + + bool isVariadic() const override + { + return true; + } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override + { + return false; + } + + std::size_t getNumberOfArguments() const override + { + return 0u; + } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + return {1, 2, 3, 4}; + } + +private: + ContextPtr context; +}; + +struct NameExtractKeyValuePairs +{ + static constexpr auto name = "extract_key_value_pairs"; +}; + +struct NameExtractKeyValuePairsWithEscaping +{ + static constexpr auto name = "extract_key_value_pairs_with_escaping"; +}; + +REGISTER_FUNCTION(ExtractKeyValuePairs) +{ + factory.registerFunction>( + Documentation( + R"(Extracts key-value pairs from any string. 
The string does not need to be 100% structured in a key value pair format; + + It can contain noise (e.g. log files). The key-value pair format to be interpreted should be specified via function arguments. + + A key-value pair consists of a key followed by a `key_value_delimiter` and a value. Quoted keys and values are also supported. Key value pairs must be separated by pair delimiters. + + **Syntax** + ``` sql + extract_key_value_pairs(data, [key_value_delimiter], [pair_delimiter], [quoting_character]) + ``` + + **Arguments** + - `data` - String to extract key-value pairs from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). + - `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). + - `pair_delimiters` - Set of character to be used as delimiters between pairs. Defaults to `\space`, `,` and `;`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). + - `quoting_character` - Character to be used as quoting character. Defaults to `"`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). + + **Returned values** + - The extracted key-value pairs in a Map(String, String). 
+ + **Examples** + + Query: + + **Simple case** + ``` sql + arthur :) select extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil') as kv + + SELECT extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil') as kv + + Query id: f9e0ca6f-3178-4ee2-aa2c-a5517abb9cee + + ┌─kv──────────────────────────────────────────────────────────────────────┐ + │ {'name':'neymar','age':'31','team':'psg','nationality':'brazil'} │ + └─────────────────────────────────────────────────────────────────────────┘ + ``` + + **Single quote as quoting character** + ``` sql + arthur :) select extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') as kv + + SELECT extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') as kv + + Query id: 0e22bf6b-9844-414a-99dc-32bf647abd5e + + ┌─kv───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ {'name':'neymar','age':'31','team':'psg','nationality':'brazil','last_key':'last_value'} │ + └──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + ``` + + **Escape sequences without escape sequences support** + ``` sql + arthur :) select extract_key_value_pairs('age:a\\x0A\\n\\0') as kv + + SELECT extract_key_value_pairs('age:a\\x0A\\n\\0') AS kv + + Query id: e9fd26ee-b41f-4a11-b17f-25af6fd5d356 + + ┌─kv────────────────────┐ + │ {'age':'a\\x0A\\n\\0'} │ + └───────────────────────┘ + ```)") + ); + + factory.registerFunction>( + Documentation( + R"(Same as `extract_key_value_pairs` but with escaping support. + + Escape sequences supported: `\x`, `\N`, `\a`, `\b`, `\e`, `\f`, `\n`, `\r`, `\t`, `\v` and `\0`. 
+ Non-standard escape sequences are returned as is (including the backslash) unless they are one of the following: + `\\`, `'`, `"`, `backtick`, `/`, `=` or ASCII control characters (c <= 31). + + This function will satisfy the use case where pre-escaping and post-escaping are not suitable. For instance, consider the following + input string: `a: "aaaa\"bbb"`. The expected output is: `a: aaaa\"bbb`. + - Pre-escaping: Pre-escaping it will output: `a: "aaaa"bbb"` and `extract_key_value_pairs` will then output: `a: aaaa` + - Post-escaping: `extract_key_value_pairs` will output `a: aaaa\` and post-escaping will keep it as is. + + Leading escape sequences will be skipped in keys and will be considered invalid for values. + + **Escape sequences with escape sequence support turned on** + ``` sql + arthur :) select extract_key_value_pairs_with_escaping('age:a\\x0A\\n\\0') as kv + + SELECT extract_key_value_pairs_with_escaping('age:a\\x0A\\n\\0') AS kv + + Query id: 44c114f0-5658-4c75-ab87-4574de3a1645 + + ┌─kv───────────────┐ + │ {'age':'a\n\n\0'} │ + └──────────────────┘ + ```)") + ); + factory.registerAlias("str_to_map", NameExtractKeyValuePairs::name, FunctionFactory::CaseInsensitive); + factory.registerAlias("map_from_string", NameExtractKeyValuePairs::name, FunctionFactory::CaseInsensitive); +} + +} diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h new file mode 100644 index 00000000000..3895cf3e77d --- /dev/null +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int LIMIT_EXCEEDED; +} + +/* + * Handle state transitions and a few states like `FLUSH_PAIR` and `END`. 
+ * */ +template +class CHKeyValuePairExtractor : public KeyValuePairExtractor +{ + using State = typename DB::extractKV::StateHandler::State; + using NextState = DB::extractKV::StateHandler::NextState; + +public: + explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_) + : state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_) + {} + + uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override + { + return extract(std::string_view {data}, keys, values); + } + + uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override + { + auto state = State::WAITING_KEY; + + auto key = typename StateHandler::StringWriter(*keys); + auto value = typename StateHandler::StringWriter(*values); + + uint64_t row_offset = 0; + + while (state != State::END) + { + auto next_state = processState(data, state, key, value, row_offset); + + if (next_state.position_in_string > data.size() && next_state.state != State::END) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Attempt to move read pointer past end of available data, from state {} to new state: {}, new position: {}, available data: {}", + magic_enum::enum_name(state), magic_enum::enum_name(next_state.state), + next_state.position_in_string, data.size()); + } + + data.remove_prefix(next_state.position_in_string); + state = next_state.state; + } + + // below reset discards invalid keys and values + reset(key, value); + + return row_offset; + } + +private: + + NextState processState(std::string_view file, State state, auto & key, auto & value, uint64_t & row_offset) + { + switch (state) + { + case State::WAITING_KEY: + { + return state_handler.waitKey(file); + } + case State::READING_KEY: + { + return state_handler.readKey(file, key); + } + case State::READING_QUOTED_KEY: + { + return state_handler.readQuotedKey(file, key); + } + case 
State::READING_KV_DELIMITER: + { + return state_handler.readKeyValueDelimiter(file); + } + case State::WAITING_VALUE: + { + return state_handler.waitValue(file); + } + case State::READING_VALUE: + { + return state_handler.readValue(file, value); + } + case State::READING_QUOTED_VALUE: + { + return state_handler.readQuotedValue(file, value); + } + case State::FLUSH_PAIR: + { + return flushPair(file, key, value, row_offset); + } + case State::END: + { + return {0, state}; + } + } + } + + NextState flushPair(const std::string_view & file, auto & key, + auto & value, uint64_t & row_offset) + { + row_offset++; + + if (row_offset > max_number_of_pairs) + { + throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); + } + + key.commit(); + value.commit(); + + return {0, file.empty() ? State::END : State::WAITING_KEY}; + } + + void reset(auto & key, auto & value) + { + key.reset(); + value.reset(); + } + + StateHandler state_handler; + uint64_t max_number_of_pairs; +}; + +} diff --git a/src/Functions/keyvaluepair/impl/CMakeLists.txt b/src/Functions/keyvaluepair/impl/CMakeLists.txt new file mode 100644 index 00000000000..6843ac97bdf --- /dev/null +++ b/src/Functions/keyvaluepair/impl/CMakeLists.txt @@ -0,0 +1,7 @@ +include("${proton_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core .) 
+add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl) + +add_library(clickhouse_functions_extractkeyvaluepairs_core ${clickhouse_functions_extractkeyvaluepairs_core_sources} ${clickhouse_functions_extractkeyvaluepairs_core_headers}) + +target_link_libraries(clickhouse_functions_extractkeyvaluepairs_core PRIVATE dbms) diff --git a/src/Functions/keyvaluepair/impl/Configuration.cpp b/src/Functions/keyvaluepair/impl/Configuration.cpp new file mode 100644 index 00000000000..1b7f4774158 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/Configuration.cpp @@ -0,0 +1,83 @@ +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace extractKV +{ + +Configuration::Configuration(char key_value_delimiter_, char quoting_character_, std::vector pair_delimiters_) + : key_value_delimiter(key_value_delimiter_), quoting_character(quoting_character_), pair_delimiters(std::move(pair_delimiters_)) +{ +} + +Configuration ConfigurationFactory::createWithoutEscaping(char key_value_delimiter, char quoting_character, std::vector pair_delimiters) +{ + validate(key_value_delimiter, quoting_character, pair_delimiters); + + return Configuration(key_value_delimiter, quoting_character, pair_delimiters); +} + +Configuration ConfigurationFactory::createWithEscaping(char key_value_delimiter, char quoting_character, std::vector pair_delimiters) +{ + static constexpr char ESCAPE_CHARACTER = '\\'; + + if (key_value_delimiter == ESCAPE_CHARACTER + || quoting_character == ESCAPE_CHARACTER + || std::find(pair_delimiters.begin(), pair_delimiters.end(), ESCAPE_CHARACTER) != pair_delimiters.end()) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Invalid arguments, {} is reserved for the escaping character", + ESCAPE_CHARACTER); + } + + return createWithoutEscaping(key_value_delimiter, quoting_character, pair_delimiters); +} + +void ConfigurationFactory::validate(char key_value_delimiter, char quoting_character, 
std::vector pair_delimiters) +{ + if (key_value_delimiter == quoting_character) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, key_value_delimiter and quoting_character can not be the same"); + } + + if (pair_delimiters.size() > MAX_NUMBER_OF_PAIR_DELIMITERS) + { + // SSE optimizations require needles to contain up to 16 characters. Needles can be a concatenation of multiple parameters, including + // quoting_character, key_value_delimiter and pair delimiters. Limiting to 8 to be on the safe side. + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, pair delimiters can contain at most {} characters", MAX_NUMBER_OF_PAIR_DELIMITERS); + } + + if (pair_delimiters.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, pair delimiters list is empty"); + } + + bool is_key_value_delimiter_in_pair_delimiters + = std::find(pair_delimiters.begin(), pair_delimiters.end(), key_value_delimiter) != pair_delimiters.end(); + + if (is_key_value_delimiter_in_pair_delimiters) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, key_value_delimiter conflicts with pair delimiters"); + } + + bool is_quoting_character_in_pair_delimiters + = std::find(pair_delimiters.begin(), pair_delimiters.end(), quoting_character) != pair_delimiters.end(); + + if (is_quoting_character_in_pair_delimiters) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, quoting_character conflicts with pair delimiters"); + } +} + +} + +} diff --git a/src/Functions/keyvaluepair/impl/Configuration.h b/src/Functions/keyvaluepair/impl/Configuration.h new file mode 100644 index 00000000000..322aa575052 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/Configuration.h @@ -0,0 +1,44 @@ +#pragma once + +#include + +namespace DB +{ +namespace extractKV +{ +struct ConfigurationFactory; + +class Configuration +{ + friend struct ConfigurationFactory; + + Configuration( + char key_value_delimiter_, + char quoting_character_, + std::vector 
pair_delimiters_ + ); + +public: + const char key_value_delimiter; + const char quoting_character; + const std::vector pair_delimiters; +}; + +/* + * Validates (business logic) and creates Configurations for key-value-pair extraction. + * */ +struct ConfigurationFactory +{ +public: + static Configuration createWithoutEscaping(char key_value_delimiter, char quoting_character, std::vector pair_delimiters); + + static Configuration createWithEscaping(char key_value_delimiter, char quoting_character, std::vector pair_delimiters); + +private: + static void validate(char key_value_delimiter, char quoting_character, std::vector pair_delimiters); + + static constexpr auto MAX_NUMBER_OF_PAIR_DELIMITERS = 8u; +}; +} + +} diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h new file mode 100644 index 00000000000..5fd77ce9a99 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include +#include + +namespace DB +{ + +struct KeyValuePairExtractor +{ + virtual ~KeyValuePairExtractor() = default; + + virtual uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) = 0; + + virtual uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) = 0; +}; + +} diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp new file mode 100644 index 00000000000..7f2a6449ab0 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp @@ -0,0 +1,76 @@ +#include + +#include +#include +#include + +namespace DB +{ + +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withKeyValueDelimiter(char key_value_delimiter_) +{ + key_value_delimiter = key_value_delimiter_; + return *this; +} + +KeyValuePairExtractorBuilder & 
KeyValuePairExtractorBuilder::withItemDelimiters(std::vector item_delimiters_) +{ + item_delimiters = std::move(item_delimiters_); + return *this; +} + +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withQuotingCharacter(char quoting_character_) +{ + quoting_character = quoting_character_; + return *this; +} + +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping() +{ + with_escaping = true; + return *this; +} + +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_) +{ + max_number_of_pairs = max_number_of_pairs_; + return *this; +} + +std::shared_ptr KeyValuePairExtractorBuilder::build() const +{ + if (with_escaping) + { + return buildWithEscaping(); + } + + return buildWithoutEscaping(); +} + +namespace +{ +using namespace extractKV; + +template +auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs) +{ + return std::make_shared>(handler, max_number_of_pairs); +} + +} + +std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() const +{ + auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); + + return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs); +} + +std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const +{ + auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters); + + return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs); +} + +} diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h new file mode 100644 index 00000000000..0c673f12ccf --- /dev/null +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct KeyValuePairExtractor; + +class 
KeyValuePairExtractorBuilder +{ +public: + + KeyValuePairExtractorBuilder & withKeyValueDelimiter(char key_value_delimiter_); + + KeyValuePairExtractorBuilder & withItemDelimiters(std::vector item_delimiters_); + + KeyValuePairExtractorBuilder & withQuotingCharacter(char quoting_character_); + + KeyValuePairExtractorBuilder & withEscaping(); + + KeyValuePairExtractorBuilder & withMaxNumberOfPairs(uint64_t max_number_of_pairs_); + + std::shared_ptr build() const; + +private: + bool with_escaping = false; + char key_value_delimiter = ':'; + char quoting_character = '"'; + std::vector item_delimiters = {' ', ',', ';'}; + uint64_t max_number_of_pairs = std::numeric_limits::max(); + + std::shared_ptr buildWithEscaping() const; + + std::shared_ptr buildWithoutEscaping() const; +}; + +} diff --git a/src/Functions/keyvaluepair/impl/NeedleFactory.h b/src/Functions/keyvaluepair/impl/NeedleFactory.h new file mode 100644 index 00000000000..83862a2281a --- /dev/null +++ b/src/Functions/keyvaluepair/impl/NeedleFactory.h @@ -0,0 +1,100 @@ +#pragma once + +#include +#include + +#include +#include + +namespace DB +{ + +namespace extractKV +{ + +/* + * `StateHandlerImpl` makes use of string search algorithms to find delimiters. This class creates the needles for each state + * based on the contents of `Configuration`. 
+ * */ +template +class NeedleFactory +{ +public: + SearchSymbols getWaitNeedles(const Configuration & extractor_configuration) + { + const auto & [key_value_delimiter, quoting_character, pair_delimiters] + = extractor_configuration; + + std::vector needles; + + needles.push_back(key_value_delimiter); + + std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles)); + + if constexpr (WITH_ESCAPING) + { + needles.push_back('\\'); + } + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } + + SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration) + { + const auto & [key_value_delimiter, quoting_character, pair_delimiters] + = extractor_configuration; + + std::vector needles; + + needles.push_back(key_value_delimiter); + needles.push_back(quoting_character); + + std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles)); + + if constexpr (WITH_ESCAPING) + { + needles.push_back('\\'); + } + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } + + SearchSymbols getReadValueNeedles(const Configuration & extractor_configuration) + { + const auto & [key_value_delimiter, quoting_character, pair_delimiters] + = extractor_configuration; + + std::vector needles; + + needles.push_back(quoting_character); + + std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles)); + + if constexpr (WITH_ESCAPING) + { + needles.push_back('\\'); + } + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } + + SearchSymbols getReadQuotedNeedles(const Configuration & extractor_configuration) + { + const auto quoting_character = extractor_configuration.quoting_character; + + std::vector needles; + + needles.push_back(quoting_character); + + if constexpr (WITH_ESCAPING) + { + needles.push_back('\\'); + } + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } +}; + +} + +} diff --git 
a/src/Functions/keyvaluepair/impl/StateHandler.h b/src/Functions/keyvaluepair/impl/StateHandler.h new file mode 100644 index 00000000000..178974e9d36 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/StateHandler.h @@ -0,0 +1,50 @@ +#pragma once + +#include + + +namespace DB +{ + +namespace extractKV +{ + +class StateHandler +{ +public: + enum State + { + // Skip characters until it finds a valid first key character. Might jump to READING_KEY, READING_QUOTED_KEY or END. + WAITING_KEY, + // Tries to read a key. Might jump to WAITING_KEY, WAITING_VALUE, READING_QUOTED_KEY or END. + READING_KEY, + // Tries to read a quoted key. Might jump to WAITING_KEY, READING_KV_DELIMITER or END. + READING_QUOTED_KEY, + // Checks that the next character is the key value delimiter. Jumps to WAITING_VALUE if it is, otherwise back to WAITING_KEY. + READING_KV_DELIMITER, + // Inspects the first value character. Might jump to READING_QUOTED_VALUE, READING_VALUE or, with escaping enabled and a leading escape character, WAITING_KEY. + WAITING_VALUE, + // Tries to read a value. Jumps to FLUSH_PAIR. + READING_VALUE, + // Tries to read a quoted value. Might jump to FLUSH_PAIR, WAITING_KEY (escape sequence could not be parsed) or END. + READING_QUOTED_VALUE, + // In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END. + FLUSH_PAIR, + END + }; + + struct NextState + { + std::size_t position_in_string; + State state; + }; + + StateHandler() = default; + StateHandler(const StateHandler &) = default; + + virtual ~StateHandler() = default; +}; + +} + +} diff --git a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h new file mode 100644 index 00000000000..687d8d95d42 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h @@ -0,0 +1,474 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace extractKV +{ + + +/* + * Handles (almost) all states present in `StateHandler::State`. 
The description of each state responsibility can be found in + * `StateHandler::State`. Advanced & optimized string search algorithms are used to search for control characters and form key value pairs. + * Each method returns a `StateHandler::NextState` object which contains the next state itself and the number of characters consumed by the previous state. + * + * The class is templated with a boolean that controls escaping support. As of now, there are two specializations: + * `NoEscapingStateHandler` and `InlineEscapingStateHandler`. + * */ +template +class StateHandlerImpl : public StateHandler +{ +public: + explicit StateHandlerImpl(Configuration configuration_) + : configuration(std::move(configuration_)) + { + /* SearchNeedles do not change throughout the algorithm. Therefore, they are created only once in the constructor + * to avoid unnecessary copies. + * */ + NeedleFactory needle_factory; + + wait_needles = needle_factory.getWaitNeedles(configuration); + read_key_needles = needle_factory.getReadKeyNeedles(configuration); + read_value_needles = needle_factory.getReadValueNeedles(configuration); + read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration); + } + + /* + * Find first character that is considered a valid key character and proceeds to READING_KEY like states. + * */ + [[nodiscard]] NextState waitKey(std::string_view file) const + { + if (const auto * p = find_first_not_symbols_or_null(file, wait_needles)) + { + const size_t character_position = p - file.begin(); + if (isQuotingCharacter(*p)) + { + // +1 to skip quoting character + return {character_position + 1u, State::READING_QUOTED_KEY}; + } + else + { + return {character_position, State::READING_KEY}; + } + } + + return {file.size(), State::END}; + } + + /* + * Find first delimiter of interest (`read_needles`). Valid symbols are either `key_value_delimiter` and `escape_character` if escaping + * support is on. If it finds a pair delimiter, it discards the key. 
+ * */ + [[nodiscard]] NextState readKey(std::string_view file, auto & key) const + { + key.reset(); + + size_t pos = 0; + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_key_needles)) + { + auto character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (WITH_ESCAPING && isEscapeCharacter(*p)) + { + if constexpr (WITH_ESCAPING) + { + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, key); + next_pos = character_position + escape_sequence_length; + + if (!parsed_successfully) + { + return {next_pos, State::WAITING_KEY}; + } + } + } + else if (isKeyValueDelimiter(*p)) + { + key.append(file.begin() + pos, file.begin() + character_position); + + return {next_pos, State::WAITING_VALUE}; + } + else if (isPairDelimiter(*p)) + { + return {next_pos, State::WAITING_KEY}; + } + else if (isQuotingCharacter(*p)) + { + return {next_pos, State::READING_QUOTED_KEY}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; + } + + /* + * Search for closing quoting character and process escape sequences along the way (if escaping support is turned on). 
+ * */ + [[nodiscard]] NextState readQuotedKey(std::string_view file, auto & key) const + { + key.reset(); + + size_t pos = 0; + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) + { + size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (WITH_ESCAPING && isEscapeCharacter(*p)) + { + if constexpr (WITH_ESCAPING) + { + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, key); + next_pos = character_position + escape_sequence_length; + + if (!parsed_successfully) + { + return {next_pos, State::WAITING_KEY}; + } + } + } + else if (isQuotingCharacter(*p)) + { + key.append(file.begin() + pos, file.begin() + character_position); + + if (key.isEmpty()) + { + return {next_pos, State::WAITING_KEY}; + } + + return {next_pos, State::READING_KV_DELIMITER}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; + } + + /* + * Validate expected key-value-delimiter is in place. + * */ + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const + { + if (!file.empty()) + { + const auto current_character = file[0]; + + if (isKeyValueDelimiter(current_character)) + { + return {1, WAITING_VALUE}; + } + } + + return {0, State::WAITING_KEY}; + } + + /* + * Check if next character is a valid value character and jumps to read-like states. Caveat here is that a pair delimiter must also lead to + * read-like states because it indicates empty values. 
+ * */ + [[nodiscard]] NextState waitValue(std::string_view file) const + { + size_t pos = 0; + + if (!file.empty()) + { + const auto current_character = file[pos]; + + if (isQuotingCharacter(current_character)) + { + return {pos + 1u, State::READING_QUOTED_VALUE}; + } + + if constexpr (WITH_ESCAPING) + { + if (isEscapeCharacter(current_character)) + { + return {pos, State::WAITING_KEY}; + } + } + } + + return {pos, State::READING_VALUE}; + } + + /* + * Finds next delimiter of interest (`read_needles`). Valid symbols are either `pair_delimiter` and `escape_character` if escaping + * support is on. If it finds a `key_value_delimiter`, it discards the value. + * */ + [[nodiscard]] NextState readValue(std::string_view file, auto & value) const + { + value.reset(); + + size_t pos = 0; + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_value_needles)) + { + const size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (WITH_ESCAPING && isEscapeCharacter(*p)) + { + if constexpr (WITH_ESCAPING) + { + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, value); + next_pos = character_position + escape_sequence_length; + + if (!parsed_successfully) + { + // Perform best-effort parsing and ignore invalid escape sequences at the end + return {next_pos, State::FLUSH_PAIR}; + } + } + } + else if (isPairDelimiter(*p)) + { + value.append(file.begin() + pos, file.begin() + character_position); + + return {next_pos, State::FLUSH_PAIR}; + } + + pos = next_pos; + } + + // Reached end of input, consume rest of the file as value and make sure KV pair is produced. + value.append(file.begin() + pos, file.end()); + return {file.size(), State::FLUSH_PAIR}; + } + + /* + * Search for closing quoting character and process escape sequences along the way (if escaping support is turned on). 
+ * */ + [[nodiscard]] NextState readQuotedValue(std::string_view file, auto & value) const + { + size_t pos = 0; + + value.reset(); + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) + { + const size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (WITH_ESCAPING && isEscapeCharacter(*p)) + { + if constexpr (WITH_ESCAPING) + { + auto [parsed_successfully, escape_sequence_length] = consumeWithEscapeSequence(file, pos, character_position, value); + next_pos = character_position + escape_sequence_length; + + if (!parsed_successfully) + { + return {next_pos, State::WAITING_KEY}; + } + } + } + else if (isQuotingCharacter(*p)) + { + value.append(file.begin() + pos, file.begin() + character_position); + + return {next_pos, State::FLUSH_PAIR}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; + } + + const Configuration configuration; + +private: + SearchSymbols wait_needles; + SearchSymbols read_key_needles; + SearchSymbols read_value_needles; + SearchSymbols read_quoted_needles; + + /* + * Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean + * that indicates success and a std::size_t that contains the number of bytes read/ consumed. 
+ * */ + std::pair consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t character_pos, auto & output) const + { + std::string escaped_sequence; + DB::ReadBufferFromMemory buf(file.begin() + character_pos, file.size() - character_pos); + + output.append(file.begin() + start_pos, file.begin() + character_pos); + + if (DB::parseComplexEscapeSequence(escaped_sequence, buf)) + { + output.append(escaped_sequence); + + return {true, buf.getPosition()}; + } + + return {false, buf.getPosition()}; + } + + bool isKeyValueDelimiter(char character) const + { + return configuration.key_value_delimiter == character; + } + + bool isPairDelimiter(char character) const + { + const auto & pair_delimiters = configuration.pair_delimiters; + return std::find(pair_delimiters.begin(), pair_delimiters.end(), character) != pair_delimiters.end(); + } + + bool isQuotingCharacter(char character) const + { + return configuration.quoting_character == character; + } + + bool isEscapeCharacter(char character) const + { + return character == '\\'; + } +}; + +struct NoEscapingStateHandler : public StateHandlerImpl +{ + /* + * View based StringWriter, no temporary copies are used. + * */ + class StringWriter + { + ColumnString & col; + + std::string_view element; + + public: + explicit StringWriter(ColumnString & col_) + : col(col_) + {} + + ~StringWriter() + { + // Make sure that ColumnString invariants are not broken. + if (!isEmpty()) + { + reset(); + } + } + + void append(std::string_view new_data) + { + element = new_data; + } + + template + void append(const T * begin, const T * end) + { + append({begin, end}); + } + + void reset() + { + element = {}; + } + + bool isEmpty() const + { + return element.empty(); + } + + void commit() + { + col.insertData(element.begin(), element.size()); + reset(); + } + + std::string_view uncommittedChunk() const + { + return element; + } + }; + + template + NoEscapingStateHandler(Args && ... args) + : StateHandlerImpl(std::forward(args)...) 
{} +}; + +struct InlineEscapingStateHandler : public StateHandlerImpl +{ + class StringWriter + { + ColumnString & col; + ColumnString::Chars & chars; + UInt64 prev_commit_pos; + + public: + explicit StringWriter(ColumnString & col_) + : col(col_), + chars(col.getChars()), + prev_commit_pos(chars.size()) + {} + + ~StringWriter() + { + // Make sure that ColumnString invariants are not broken. + if (!isEmpty()) + { + reset(); + } + } + + void append(std::string_view new_data) + { + chars.insert(new_data.begin(), new_data.end()); + } + + template + void append(const T * begin, const T * end) + { + chars.insert(begin, end); + } + + void reset() + { + chars.resize_assume_reserved(prev_commit_pos); + } + + bool isEmpty() const + { + return chars.size() == prev_commit_pos; + } + + void commit() + { + col.insertData(nullptr, 0); + prev_commit_pos = chars.size(); + } + + std::string_view uncommittedChunk() const + { + return std::string_view(chars.raw_data() + prev_commit_pos, chars.raw_data() + chars.size()); + } + }; + + template + InlineEscapingStateHandler(Args && ... args) + : StateHandlerImpl(std::forward(args)...) 
{} +}; + +} + +} diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp new file mode 100644 index 00000000000..3dd914eb5a0 --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +#include +#include + +namespace DB +{ + +void assert_byte_equality(StringRef lhs, const std::vector & rhs) +{ + std::vector lhs_vector {lhs.data, lhs.data + lhs.size}; + ASSERT_EQ(lhs_vector, rhs); +} + +TEST(extractKVPairEscapingKeyValuePairExtractor, EscapeSequences) +{ + using namespace std::literals; + + auto extractor = KeyValuePairExtractorBuilder().withEscaping().build(); + + auto keys = ColumnString::create(); + auto values = ColumnString::create(); + + auto pairs_count = extractor->extract(R"(key1:a\xFF key2:a\n\t\r)"sv, keys, values); + + ASSERT_EQ(pairs_count, 2u); + ASSERT_EQ(keys->size(), pairs_count); + ASSERT_EQ(keys->size(), values->size()); + + ASSERT_EQ(keys->getDataAt(0).toView(), "key1"); + ASSERT_EQ(keys->getDataAt(1).toView(), "key2"); + + assert_byte_equality(values->getDataAt(0), {'a', 0xFF}); + assert_byte_equality(values->getDataAt(1), {'a', 0xA, 0x9, 0xD}); +} + +} diff --git a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp new file mode 100644 index 00000000000..76694869b9c --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp @@ -0,0 +1,176 @@ +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + + +namespace +{ +using namespace DB; +using namespace std::literals; + +// Print as a map with a single row +auto ToColumnMap(const auto & keys, const auto & values, const ColumnPtr offsets = nullptr) +{ + return ColumnMap::create( + std::move(keys->clone()), + std::move(values->clone()), + 
offsets ? offsets : ColumnUInt64::create(1, keys->size()) + ); +} + +// Serialize the keys/values as the JSON text of a single-row map +std::string PrintMap(const auto & keys, const auto & values) +{ + auto map_column = ToColumnMap(keys, values); + auto serialization = DataTypeFactory::instance().get("map(string, string)")->getSerialization(ISerialization::Kind::DEFAULT); + + WriteBufferFromOwnString buff; + serialization->serializeTextJSON(*map_column, 0, buff, FormatSettings{}); + + return std::move(buff.str()); +} + +template +struct Dump +{ + const T & value; + + friend std::ostream & operator<<(std::ostream & ostr, const Dump & d) + { + return dumpValue(ostr, d.value); + } +}; + +template +auto print_with_dump(const T & value) +{ + return Dump{value}; +} + +} + +struct KeyValuePairExtractorTestParam +{ + KeyValuePairExtractorBuilder builder; + std::string input; + std::vector> expected; +}; + +struct extractKVPairKeyValuePairExtractorTest : public ::testing::TestWithParam +{}; + +TEST_P(extractKVPairKeyValuePairExtractorTest, Match) +{ + const auto & [builder, input, expected] = GetParam(); + SCOPED_TRACE(input); + + auto kv_parser = builder.build(); + SCOPED_TRACE(typeid(kv_parser).name()); + + auto keys = ColumnString::create(); + auto values = ColumnString::create(); + + auto pairs_found = kv_parser->extract(input, keys, values); + ASSERT_EQ(expected.size(), pairs_found) + << "\texpected: " << print_with_dump(expected) << "\n" + << "\tactual : " << print_with_dump(*ToColumnMap(keys, values)); + + size_t i = 0; + for (const auto & expected_kv : expected) + { + EXPECT_EQ(expected_kv.first, keys->getDataAt(i)); + + EXPECT_EQ(expected_kv.second, values->getDataAt(i)); + + ++i; + } +} + +using ExpectedValues = std::vector>; +const ExpectedValues neymar_expected{ + {"name","neymar"}, + {"age","31"}, + {"team","psg"}, + {"nationality","brazil"}, + {"last_key","last_value"} +}; + +INSTANTIATE_TEST_SUITE_P(Simple, extractKVPairKeyValuePairExtractorTest, + 
::testing::ValuesIn(std::initializer_list + { + { + KeyValuePairExtractorBuilder().withQuotingCharacter('\''), + R"in(name:'neymar';'age':31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + }, + { + // Same input, but with a double quote as the quoting character + KeyValuePairExtractorBuilder().withQuotingCharacter('"'), + R"in(name:"neymar";"age":31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + }, + { + // same as case 1, but using the escaping-aware handler + KeyValuePairExtractorBuilder().withQuotingCharacter('\'').withEscaping(), + R"in(name:'neymar';'age':31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + } + } + ) +); + +// Perform best-effort parsing for invalid escape sequences +INSTANTIATE_TEST_SUITE_P(InvalidEscapeSeqInValue, extractKVPairKeyValuePairExtractorTest, + ::testing::ValuesIn(std::initializer_list + { + { + // Special case when invalid seq is the last symbol + KeyValuePairExtractorBuilder().withEscaping(), + R"in(valid_key:valid_value key:invalid_val\)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid_val"} + } + }, + // Escape handling disabled: a broken sequence is ordinary text, so `invalid_val\` must be kept as is + { + KeyValuePairExtractorBuilder(), + R"in(valid_key:valid_value key:invalid_val\ third_key:third_value)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid_val\\"}, + {"third_key", "third_value"} + } + }, + { + // Special case when invalid seq is the last symbol + KeyValuePairExtractorBuilder(), + R"in(valid_key:valid_value key:invalid_val\)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid_val\\"} + } + }, + { + KeyValuePairExtractorBuilder().withQuotingCharacter('"'), + R"in(valid_key:valid_value key:"invalid val\ " "third key":"third value")in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid val\\ "}, + {"third key", "third value"}, + } + }, + } + ) +); diff --git 
a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp new file mode 100644 index 00000000000..afffb9f6108 --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -0,0 +1,121 @@ +#include +#include + + +#include +#include + +namespace +{ + +using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; + +void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) +{ + auto next_state = handler.waitKey(input); + + ASSERT_EQ(next_state.position_in_string, expected_pos); + ASSERT_EQ(next_state.state, expected_state); +} + +template +void test_read(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + auto str = ColumnString::create(); + NextState next_state; + InlineEscapingStateHandler::StringWriter element(*str); + + if constexpr (quoted) + { + next_state = handler.readQuotedKey(input, element); + } + else + { + next_state = handler.readKey(input, element); + } + + ASSERT_EQ(next_state.position_in_string, expected_pos); + ASSERT_EQ(next_state.state, expected_state); + ASSERT_EQ(element.uncommittedChunk(), expected_element); +} + +void test_read(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + test_read(handler, input, expected_element, expected_pos, expected_state); +} + +void test_read_quoted(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + test_read(handler, input, expected_element, expected_pos, expected_state); +} + +} + +TEST(extractKVPairInlineEscapingKeyStateHandler, Wait) +{ + auto pair_delimiters = std::vector{',', ' '}; 
+ + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + + StateHandlerImpl handler(configuration); + + test_wait(handler, "name", 0u, State::READING_KEY); + test_wait(handler, "\\:name", 2u, State::READING_KEY); + test_wait(handler, R"(\\"name)", 3u, State::READING_QUOTED_KEY); + + test_wait(handler, "", 0u, State::END); + test_wait(handler, "\\\\", 2u, State::END); +} + +TEST(extractKVPairInlineEscapingKeyStateHandler, Read) +{ + auto pair_delimiters = std::vector{',', ' '}; + + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + + StateHandlerImpl handler(configuration); + + std::string key_str = "name"; + std::string key_with_delimiter_str = key_str + ':'; + std::string key_with_delimiter_and_left_spacing = " " + key_with_delimiter_str; + std::string key_with_delimiter_and_random_characters_str = key_str + ':' + "a$a\\:''\""; + + // no key-value delimiter found: the key is discarded and parsing ends + test_read(handler, key_str, "", key_str.size(), State::END); + + // key terminated by the delimiter: the key is kept and the value is awaited + test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); + + // text after the delimiter is not consumed: reading stops right after ':' + test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); + + test_read(handler, "", "", 0u, State::END); +} + +TEST(extractKVPairInlineEscapingKeyStateHandler, ReadEnclosed) +{ + auto pair_delimiters = std::vector{',', ' '}; + + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + + StateHandlerImpl handler(configuration); + + std::string regular_key = "name"; + std::string regular_key_with_end_quote = regular_key + "\""; + std::string key_with_special_characters = "name $!@#¨%&*%&%.569-519"; + std::string key_with_special_characters_with_end_quote = "name $!@#¨%&*%&%.569-519\""; + + std::string key_with_escape_character = regular_key + R"(\n\x4E")"; + + test_read_quoted(handler, regular_key, "", regular_key.size(), 
State::END); + test_read_quoted(handler, regular_key_with_end_quote, regular_key, regular_key_with_end_quote.size(), State::READING_KV_DELIMITER); + test_read_quoted(handler, key_with_special_characters_with_end_quote, key_with_special_characters, key_with_special_characters_with_end_quote.size(), State::READING_KV_DELIMITER); + test_read_quoted(handler, key_with_escape_character, regular_key + "\nN", key_with_escape_character.size(), State::READING_KV_DELIMITER); +} diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp new file mode 100644 index 00000000000..c350f0f1291 --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp @@ -0,0 +1,34 @@ +#include + +#include +#include + +namespace +{ + +using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; + + +void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) +{ + auto next_state = handler.waitValue(input); + + ASSERT_EQ(next_state.position_in_string, expected_pos); + ASSERT_EQ(next_state.state, expected_state); +} + +} + +TEST(extractKVPairInlineEscapingValueStateHandler, Wait) +{ + auto pair_delimiters = std::vector {','}; + + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + StateHandlerImpl handler(configuration); + + test_wait(handler, " los$ yours3lf", 0u, State::READING_VALUE); +} diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp new file mode 100644 index 00000000000..d50c351d5c2 --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -0,0 +1,103 @@ +#include +#include + +#include + +#include + +namespace +{ 
+using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; + +void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) +{ + auto next_state = handler.waitKey(input); + + ASSERT_EQ(next_state.position_in_string, expected_pos); + ASSERT_EQ(next_state.state, expected_state); +} + +template +void test_read(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + NextState next_state; + + auto col = ColumnString::create(); + NoEscapingStateHandler::StringWriter element(*col); + + if constexpr (quoted) + { + next_state = handler.readQuotedKey(input, element); + } + else + { + next_state = handler.readKey(input, element); + } + + ASSERT_EQ(next_state.position_in_string, expected_pos); + ASSERT_EQ(next_state.state, expected_state); + ASSERT_EQ(element.uncommittedChunk(), expected_element); +} + +void test_read(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + test_read(handler, input, expected_element, expected_pos, expected_state); +} + +void test_read_quoted(const auto & handler, std::string_view input, std::string_view expected_element, + std::size_t expected_pos, State expected_state) +{ + test_read(handler, input, expected_element, expected_pos, expected_state); +} + +} + +TEST(extractKVPairNoEscapingKeyStateHandler, Wait) +{ + auto pair_delimiters = std::vector{',', ' ', '$'}; + + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + + NoEscapingStateHandler handler(configuration); + + test_wait(handler, "name", 0u, State::READING_KEY); + test_wait(handler, "\\:name", 0u, State::READING_KEY); + // for a quoted key the expected position is 1: waitKey currently skips the opening quote (this may change) + test_wait(handler, "\"name", 1u, 
State::READING_QUOTED_KEY); + + test_wait(handler, ", $name", 3u, State::READING_KEY); + test_wait(handler, ", $\"name", 4u, State::READING_QUOTED_KEY); + + test_wait(handler, "", 0u, State::END); +} + +TEST(extractKVPairNoEscapingKeyStateHandler, Read) +{ + auto pair_delimiters = std::vector{',', ' '}; + + auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); + + NoEscapingStateHandler handler(configuration); + + std::string key_str = "name"; + std::string key_with_delimiter_str = key_str + ':'; + std::string key_with_delimiter_and_left_spacing = " " + key_with_delimiter_str; + std::string key_with_delimiter_and_random_characters_str = key_str + ':' + "a$a\\:''\""; + + // no key-value delimiter found: the key is discarded and parsing ends + test_read(handler, key_str, "", key_str.size(), State::END); + + // key terminated by the delimiter: the key is kept and the value is awaited + test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); + + // text after the delimiter is not consumed: reading stops right after ':' + test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); + + test_read(handler, "", "", 0u, State::END); +} diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 127912a0b2a..1e98949857a 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -317,12 +317,24 @@ template void readStringUntilEOFInto>(PaddedPODArray -static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) +template +static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + + auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]]) + { + if constexpr (throw_exception) + throw Exception(message, code); + return ReturnType(false); + }; + ++buf.position(); + if (buf.eof()) - throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); + { + return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); + } 
char char_after_backslash = *buf.position(); @@ -331,7 +343,14 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) ++buf.position(); /// escape sequence of the form \xAA char hex_code[2]; - readPODBinary(hex_code, buf); + + auto bytes_read = buf.read(hex_code, sizeof(hex_code)); + + if (bytes_read != sizeof(hex_code)) + { + return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); + } + s.push_back(unhex2(hex_code)); } else if (char_after_backslash == 'N') @@ -361,8 +380,13 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) s.push_back(decoded_char); ++buf.position(); } + return ReturnType(true); } +bool parseComplexEscapeSequence(String & s, ReadBuffer & buf) +{ + return parseComplexEscapeSequence(s, buf); +} template static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index ab25ede9504..d1d02c1f67a 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -1588,4 +1588,9 @@ void readQuotedField(String & s, ReadBuffer & buf); void readJSONField(String & s, ReadBuffer & buf); +/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters). 
+ * It is assumed that the cursor is located on the `\` symbol + */ +bool parseComplexEscapeSequence(String & s, ReadBuffer & buf); + } diff --git a/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.reference new file mode 100644 index 00000000000..965cfaa1b6d --- /dev/null +++ b/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -0,0 +1,383 @@ +-- { echoOn } + +-- basic tests + +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- keys and values starting with number, underscore and other special characters +-- expected output: {'$nationality':'@brazil','1name':'neymar','4ge':'31','_team':'_psg'} +WITH + extract_key_value_pairs('1name:neymar, 4ge:31 _team:_psg,$nationality:@brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'$nationality':'@brazil','1name':'neymar','4ge':'31','_team':'_psg'} +-- only special characters +-- expected output: {'#':'#','$':'$','@':'@','_':'_'} +WITH + extract_key_value_pairs('_:_, @:@ #:#,$:$') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'#':'#','$':'$','@':'@','_':'_'} +-- special (not control) characters in the middle of elements +-- expected output: {'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +WITH + extract_key_value_pairs('name:ney!mar, age:3! 
t&am:@psg,nationality:br4z!l') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +-- non-standard escape characters (i.e not \n, \r, \t and etc), back-slash should be preserved +-- expected output: {'amount\\z':'$5\\h','currency':'\\$USD'} +WITH + extract_key_value_pairs('currency:\$USD, amount\z:$5\h') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'amount\\z':'$5\\h','currency':'\\$USD'} +-- invalid escape sequence at the end of file should be ignored +-- expected output: {'key':'invalid_escape_sequence','valid_key':'valid_value'} +WITH + extract_key_value_pairs_with_escaping('valid_key:valid_value key:invalid_escape_sequence\\', ':', ' ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'key':'invalid_escape_sequence','valid_key':'valid_value'} +-- standard escape sequences are covered by unit tests + +-- simple quoting +-- expected output: {'age':'31','name':'neymar','team':'psg'} +WITH + extract_key_value_pairs('name:"neymar", "age":31 "team":"psg"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','team':'psg'} +-- empty values +-- expected output: {'age':'','name':'','nationality':''} +WITH + extract_key_value_pairs('name:"", age: , nationality:') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'','name':'','nationality':''} +-- empty keys +-- empty keys are not allowed, thus empty output is expected +WITH + extract_key_value_pairs('"":abc, :def') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + 
), + 'map(string,string)' + ) AS x +SELECT + x; +{} +-- semi-colon as pair delimiter +-- expected output: {'age':'31','anotherkey':'anothervalue','name':'neymar','random_key':'value_with_comma,still_part_of_value:still_part_of_value','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;random_key:value_with_comma,still_part_of_value:still_part_of_value;anotherkey:anothervalue', ':', ';') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','anotherkey':'anothervalue','name':'neymar','random_key':'value_with_comma,still_part_of_value:still_part_of_value','team':'psg'} +-- both comma and semi-colon as pair delimiters +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil,last_key:last_value', ':', ';,') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +-- single quote as quoting character +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +-- NO ESCAPING TESTS +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' 
+ ) AS x +SELECT + x; +{'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- special (not control) characters in the middle of elements +-- expected output: {'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +WITH + extract_key_value_pairs('name:ney!mar, age:3! t&am:@psg,nationality:br4z!l', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +-- non-standard escape characters (i.e not \n, \r, \t and etc), it should accept everything +-- expected output: {'amount\\z':'$5\\h','currency':'\\$USD'} +WITH + extract_key_value_pairs('currency:\$USD, amount\z:$5\h', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'amount\\z':'$5\\h','currency':'\\$USD'} +-- standard escape sequences, it should return it as it is +-- expected output: {'key1':'header\nbody','key2':'start_of_text\tend_of_text'} +WITH + extract_key_value_pairs('key1:header\nbody key2:start_of_text\tend_of_text', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'key1':'header\nbody','key2':'start_of_text\tend_of_text'} +-- standard escape sequences are covered by unit tests + +-- simple quoting +-- expected output: {'age':'31','name':'neymar','team':'psg'} +WITH + extract_key_value_pairs('name:"neymar", "age":31 "team":"psg"', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','team':'psg'} +-- empty values +-- expected output: {'age':'','name':'','nationality':''} +WITH + extract_key_value_pairs('name:"", age: , nationality:', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, 
s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'','name':'','nationality':''} +-- empty keys +-- empty keys are not allowed, thus empty output is expected +WITH + extract_key_value_pairs('"":abc, :def', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{} +-- semi-colon as pair delimiter +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil', ':', ';', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- both comma and semi-colon as pair delimiters +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +-- single quote as quoting character +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +-- { echoOn } + +SET extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: 
{'key1':'value1','key2':'value2'} +WITH + extract_key_value_pairs('key1:value1,key2:value2') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extract_key_value_pairs('key1:value1,key2:value2') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} +-- should not fail because pair delimiters contains 8 characters, which is within the limit +WITH + extract_key_value_pairs('not_important', ':', '12345678', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{} +-- key value delimiter should be considered valid part of value +WITH + extract_key_value_pairs('formula=1+2=3 argument1=1 argument2=2 result=3, char="=" char2== string="foo=bar"', '=') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'argument1':'1','argument2':'2','char':'=','char2':'=','formula':'1+2=3','result':'3','string':'foo=bar'} +-- https://github.com/ClickHouse/ClickHouse/issues/56357 +WITH + extract_key_value_pairs('{"a":"1", "b":"2"}') as s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'a':'1','b':'2'} +-- check str_to_map alias (it is case-insensitive) +WITH + sTr_tO_mAp('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- check map_from_string 
alias +WITH + map_from_string('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; +{'age':'31','name':'neymar','nationality':'brazil','team':'psg'} diff --git a/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.sql new file mode 100644 index 00000000000..2e0096aa6a3 --- /dev/null +++ b/tests/queries_ported/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -0,0 +1,518 @@ +-- { echoOn } + +-- basic tests + +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- keys and values starting with number, underscore and other special characters +-- expected output: {'$nationality':'@brazil','1name':'neymar','4ge':'31','_team':'_psg'} +WITH + extract_key_value_pairs('1name:neymar, 4ge:31 _team:_psg,$nationality:@brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- only special characters +-- expected output: {'#':'#','$':'$','@':'@','_':'_'} +WITH + extract_key_value_pairs('_:_, @:@ #:#,$:$') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- special (not control) characters in the middle of elements +-- expected output: {'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +WITH + extract_key_value_pairs('name:ney!mar, age:3! 
t&am:@psg,nationality:br4z!l') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- non-standard escape characters (i.e not \n, \r, \t and etc), back-slash should be preserved +-- expected output: {'amount\\z':'$5\\h','currency':'\\$USD'} +WITH + extract_key_value_pairs('currency:\$USD, amount\z:$5\h') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- invalid escape sequence at the end of file should be ignored +-- expected output: {'key':'invalid_escape_sequence','valid_key':'valid_value'} +WITH + extract_key_value_pairs_with_escaping('valid_key:valid_value key:invalid_escape_sequence\\', ':', ' ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- standard escape sequences are covered by unit tests + +-- simple quoting +-- expected output: {'age':'31','name':'neymar','team':'psg'} +WITH + extract_key_value_pairs('name:"neymar", "age":31 "team":"psg"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- empty values +-- expected output: {'age':'','name':'','nationality':''} +WITH + extract_key_value_pairs('name:"", age: , nationality:') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- empty keys +-- empty keys are not allowed, thus empty output is expected +WITH + extract_key_value_pairs('"":abc, :def') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- semi-colon as pair delimiter +-- expected output: {'age':'31','anotherkey':'anothervalue','name':'neymar','random_key':'value_with_comma,still_part_of_value:still_part_of_value','team':'psg'} 
+WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;random_key:value_with_comma,still_part_of_value:still_part_of_value;anotherkey:anothervalue', ':', ';') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- both comma and semi-colon as pair delimiters +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil,last_key:last_value', ':', ';,') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- single quote as quoting character +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- NO ESCAPING TESTS +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar, age:31 team:psg,nationality:brazil', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- special (not control) characters in the middle of elements +-- expected output: {'age':'3!','name':'ney!mar','nationality':'br4z!l','t&am':'@psg'} +WITH + extract_key_value_pairs('name:ney!mar, age:3! 
t&am:@psg,nationality:br4z!l', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- non-standard escape characters (i.e not \n, \r, \t and etc), it should accept everything +-- expected output: {'amount\\z':'$5\\h','currency':'\\$USD'} +WITH + extract_key_value_pairs('currency:\$USD, amount\z:$5\h', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- standard escape sequences, it should return it as it is +-- expected output: {'key1':'header\nbody','key2':'start_of_text\tend_of_text'} +WITH + extract_key_value_pairs('key1:header\nbody key2:start_of_text\tend_of_text', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- standard escape sequences are covered by unit tests + +-- simple quoting +-- expected output: {'age':'31','name':'neymar','team':'psg'} +WITH + extract_key_value_pairs('name:"neymar", "age":31 "team":"psg"', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- empty values +-- expected output: {'age':'','name':'','nationality':''} +WITH + extract_key_value_pairs('name:"", age: , nationality:', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- empty keys +-- empty keys are not allowed, thus empty output is expected +WITH + extract_key_value_pairs('"":abc, :def', ':', ', ', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- semi-colon as pair delimiter +-- expected output: {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +WITH + 
extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil', ':', ';', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- both comma and semi-colon as pair delimiters +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:neymar;age:31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '"') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- single quote as quoting character +-- expected output: {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} +WITH + extract_key_value_pairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- { echoOff } + +-- cross parameter validation tests +-- should fail because key value delimiter conflicts with pair delimiters +WITH + extract_key_value_pairs('not_important', ':', ',:', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError BAD_ARGUMENTS} + +-- should fail because key value delimiter conflicts with quoting characters +WITH + extract_key_value_pairs('not_important', ':', ',', '\':') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError BAD_ARGUMENTS} + +-- should fail because pair delimiters conflict with quoting characters +WITH + extract_key_value_pairs('not_important', ':', ',', ',') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; --
{serverError BAD_ARGUMENTS} + +-- should fail because data_column argument must be of type string +WITH + extract_key_value_pairs([1, 2]) AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} + +-- should fail because key_value_delimiter argument must be of type string +WITH + extract_key_value_pairs('', [1, 2]) AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} + +-- should fail because pair_delimiters argument must be of type string +WITH + extract_key_value_pairs('', ':', [1, 2]) AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} + +-- should fail because quoting_character argument must be of type string +WITH + extract_key_value_pairs('', ':', ' ', [1, 2]) AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} + +-- should fail because pair delimiters can contain at most 8 characters +WITH + extract_key_value_pairs('not_important', ':', '123456789', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError BAD_ARGUMENTS} + +-- should fail because no argument has been provided +WITH + extract_key_value_pairs() AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} + +-- should fail because an extra, non-existent argument has been provided +WITH + extract_key_value_pairs('a', ':', ',', '"', '') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]),
array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} + +-- Should fail because it exceeds the max number of pairs +SET extract_kvp_max_pairs_per_row = 1; +WITH + extract_key_value_pairs('key1:value1,key2:value2') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; -- {serverError LIMIT_EXCEEDED} + +-- { echoOn } + +SET extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extract_key_value_pairs('key1:value1,key2:value2') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extract_key_value_pairs('key1:value1,key2:value2') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- should not fail because pair delimiters contains 8 characters, which is within the limit +WITH + extract_key_value_pairs('not_important', ':', '12345678', '\'') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- key value delimiter should be considered valid part of value +WITH + extract_key_value_pairs('formula=1+2=3 argument1=1 argument2=2 result=3, char="=" char2== string="foo=bar"', '=') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- https://github.com/ClickHouse/ClickHouse/issues/56357 +WITH + extract_key_value_pairs('{"a":"1", "b":"2"}') as s_map, + CAST( + array_map( + (x) -> (x, s_map[x]),
array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- check str_to_map alias (it is case-insensitive) +WITH + sTr_tO_mAp('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x; + +-- check map_from_string alias +WITH + map_from_string('name:neymar, age:31 team:psg,nationality:brazil') AS s_map, + CAST( + array_map( + (x) -> (x, s_map[x]), array_sort(map_keys(s_map)) + ), + 'map(string,string)' + ) AS x +SELECT + x;