Skip to content

Commit

Permalink
Feature/issue 522 support extract kv pairs (#525)
Browse files Browse the repository at this point in the history
  • Loading branch information
yokofly authored Jan 29, 2024
1 parent 8f0afd0 commit 51ec1d2
Show file tree
Hide file tree
Showing 28 changed files with 3,333 additions and 22 deletions.
204 changes: 187 additions & 17 deletions base/base/find_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*
* Allow to search for next character from the set of 'symbols...' in a string.
* It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'),
* but with the following differencies:
* but with the following differences:
* - works with any memory ranges, including containing zero bytes;
* - doesn't require terminating zero byte: end of memory range is passed explicitly;
* - if not found, returns pointer to end instead of nullptr;
Expand All @@ -34,10 +34,52 @@
* If no such characters, returns nullptr.
*/

/** A set of symbols to search for, chosen at runtime.
  *
  * `str` always holds the symbols as passed in.
  * When compiled with SSE 4.2, `simd_vector` additionally holds the same symbols
  * in a 16-byte vector, zero-padded after the last symbol.
  *
  * With SSE 4.2 the constructor throws std::runtime_error if more than BUFFER_SIZE symbols are given.
  * Without SSE 4.2 no size check is performed here.
  */
struct SearchSymbols
{
    static constexpr auto BUFFER_SIZE = 16;

    SearchSymbols() = default;

    explicit SearchSymbols(std::string in)
        : str(std::move(in))
    {
#if defined(__SSE4_2__)
        if (str.size() > BUFFER_SIZE)
        {
            throw std::runtime_error("SearchSymbols can contain at most " + std::to_string(BUFFER_SIZE) + " symbols and " + std::to_string(str.size()) + " was provided\n");
        }

        /// The string may be shorter than 16 bytes, so stage it in a zero-filled buffer:
        /// the 16-byte load below must not read past the end of the string's storage.
        char tmp_safety_buffer[BUFFER_SIZE] = {0};

        memcpy(tmp_safety_buffer, str.data(), str.size());

        simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i *>(tmp_safety_buffer));
#endif
    }

#if defined(__SSE4_2__)
    /// Zero by default, so that a default-constructed (empty) set never carries an indeterminate vector around.
    __m128i simd_vector = _mm_setzero_si128();
#endif
    std::string str;
};

namespace detail
{
/// Whether `x` equals at least one of the characters given as template arguments.
/// With no template arguments the answer is always false.
template <char... chars>
constexpr bool is_in(char x)
{
    return (... || (chars == x));
}

/// Whether `c` equals one of the first `num_chars` characters of `symbols`.
/// The set is given by pointer and length, so it may contain zero bytes.
static bool is_in(char c, const char * symbols, size_t num_chars)
{
    const char * const symbols_end = symbols + num_chars;

    for (const char * candidate = symbols; candidate != symbols_end; ++candidate)
        if (*candidate == c)
            return true;

    return false;
}

#if defined(__SSE2__)
template <char s0>
inline __m128i mm_is_in(__m128i bytes)
Expand All @@ -53,6 +95,43 @@ inline __m128i mm_is_in(__m128i bytes)
__m128i eq = mm_is_in<s1, tail...>(bytes);
return _mm_or_si128(eq0, eq);
}

/// Membership test of 16 bytes at once against a symbol set known only at runtime.
/// Each byte of the result is 0xFF if the corresponding byte of `bytes` equals
/// any of the first `num_chars` characters of `symbols`, and 0x00 otherwise.
inline __m128i mm_is_in(__m128i bytes, const char * symbols, size_t num_chars)
{
    __m128i matches = _mm_setzero_si128();

    const char * const symbols_end = symbols + num_chars;
    for (const char * symbol = symbols; symbol != symbols_end; ++symbol)
    {
        const __m128i broadcast = _mm_set1_epi8(*symbol);
        matches = _mm_or_si128(matches, _mm_cmpeq_epi8(bytes, broadcast));
    }

    return matches;
}

/** Builds the needle table for mm_is_in_execute: slot i holds symbols[i] broadcast to all 16 bytes.
  *
  * The table always has 16 slots, and mm_is_in_execute compares against every one of them, so:
  * - unused slots repeat the first symbol instead of staying zero; a zero slot would make
  *   '\0' bytes of the haystack look like matches;
  * - at most 16 symbols are stored; writing more would overflow the table.
  *   Symbols past the 16th are ignored here, so callers with larger sets must not rely on this table.
  *
  * With num_chars == 0 there is no symbol to repeat and all slots stay zero:
  * callers must not use the table for an empty set.
  */
inline std::array<__m128i, 16u> mm_is_in_prepare(const char * symbols, size_t num_chars)
{
    std::array<__m128i, 16u> result {};

    const size_t used = num_chars < result.size() ? num_chars : result.size();

    for (size_t i = 0; i < used; ++i)
    {
        result[i] = _mm_set1_epi8(symbols[i]);
    }

    if (used != 0)
    {
        for (size_t i = used; i < result.size(); ++i)
        {
            result[i] = result[0];
        }
    }

    return result;
}

/// Compares 16 bytes against a table of 16 broadcast needles.
/// Each byte of the result is 0xFF if the corresponding byte of `bytes` equals the byte held
/// in at least one needle at that position, and 0x00 otherwise.
/// All 16 slots always take part in the comparison, whatever they contain.
inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u> & needles)
{
    __m128i matches = _mm_setzero_si128();

    for (size_t slot = 0; slot < needles.size(); ++slot)
        matches = _mm_or_si128(matches, _mm_cmpeq_epi8(bytes, needles[slot]));

    return matches;
}
#endif

template <bool positive>
Expand Down Expand Up @@ -99,6 +178,32 @@ inline const char * find_first_symbols_sse2(const char * const begin, const char
return return_mode == ReturnMode::End ? end : nullptr;
}

/** Searches [begin, end) for the first byte that is (positive == true) or is not (positive == false)
  * one of the first `num_chars` characters of `symbols`, where the set is known only at runtime.
  *
  * Returns a pointer to that byte. If there is none, returns `end` for ReturnMode::End and nullptr otherwise.
  *
  * With SSE2, full 16-byte chunks are checked at once; the remaining tail, and everything
  * when SSE2 is unavailable or the set doesn't suit the SIMD path, is checked byte by byte.
  */
template <bool positive, ReturnMode return_mode>
inline const char * find_first_symbols_sse2(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
{
    const char * pos = begin;

#if defined(__SSE2__)
    /// Number of slots in the needle table built by mm_is_in_prepare.
    constexpr size_t max_simd_symbols = 16;

    /// An empty set has no symbol to broadcast and a larger set doesn't fit into the table:
    /// both are handled by the scalar loop below.
    if (num_chars >= 1 && num_chars <= max_simd_symbols)
    {
        auto needles = mm_is_in_prepare(symbols, num_chars);

        /// Make sure unused slots hold a real symbol: mm_is_in_execute compares against all 16 slots,
        /// and a zero slot would match '\0' bytes in the haystack.
        for (size_t i = num_chars; i < needles.size(); ++i)
            needles[i] = needles[0];

        for (; pos + 15 < end; pos += 16)
        {
            __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));

            __m128i eq = mm_is_in_execute(bytes, needles);

            /// One bit per byte of the chunk; the lowest set bit is the first hit.
            uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
            if (bit_mask)
                return pos + __builtin_ctz(bit_mask);
        }
    }
#endif

    for (; pos < end; ++pos)
        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars)))
            return pos;

    return return_mode == ReturnMode::End ? end : nullptr;
}


template <bool positive, ReturnMode return_mode, char... symbols>
inline const char * find_last_symbols_sse2(const char * const begin, const char * const end)
Expand Down Expand Up @@ -159,26 +264,61 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
#endif

for (; pos < end; ++pos)
if ( (num_chars >= 1 && maybe_negate<positive>(*pos == c01))
|| (num_chars >= 2 && maybe_negate<positive>(*pos == c02))
|| (num_chars >= 3 && maybe_negate<positive>(*pos == c03))
|| (num_chars >= 4 && maybe_negate<positive>(*pos == c04))
|| (num_chars >= 5 && maybe_negate<positive>(*pos == c05))
|| (num_chars >= 6 && maybe_negate<positive>(*pos == c06))
|| (num_chars >= 7 && maybe_negate<positive>(*pos == c07))
|| (num_chars >= 8 && maybe_negate<positive>(*pos == c08))
|| (num_chars >= 9 && maybe_negate<positive>(*pos == c09))
|| (num_chars >= 10 && maybe_negate<positive>(*pos == c10))
|| (num_chars >= 11 && maybe_negate<positive>(*pos == c11))
|| (num_chars >= 12 && maybe_negate<positive>(*pos == c12))
|| (num_chars >= 13 && maybe_negate<positive>(*pos == c13))
|| (num_chars >= 14 && maybe_negate<positive>(*pos == c14))
|| (num_chars >= 15 && maybe_negate<positive>(*pos == c15))
|| (num_chars >= 16 && maybe_negate<positive>(*pos == c16)))
if ( (num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos)))
|| (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos)))
|| (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos)))
|| (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos)))
|| (num_chars == 5 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos)))
|| (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos)))
|| (num_chars == 7 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos)))
|| (num_chars == 8 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos)))
|| (num_chars == 9 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos)))
|| (num_chars == 10 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos)))
|| (num_chars == 11 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos)))
|| (num_chars == 12 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos)))
|| (num_chars == 13 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>(*pos)))
|| (num_chars == 14 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>(*pos)))
|| (num_chars == 15 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15>(*pos)))
|| (num_chars == 16 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c16>(*pos))))
return pos;
return return_mode == ReturnMode::End ? end : nullptr;
}

/** Searches [begin, end) for the first byte that is (positive == true) or is not (positive == false)
  * contained in `symbols`.
  *
  * Returns a pointer to that byte. If there is none, returns `end` for ReturnMode::End and nullptr otherwise.
  *
  * With SSE 4.2, full 16-byte chunks are matched against `symbols.simd_vector` with the string
  * comparison instructions; the tail (or everything, without SSE 4.2) is checked byte by byte
  * against `symbols.str`.
  */
template <bool positive, ReturnMode return_mode>
inline const char * find_first_symbols_sse42(const char * const begin, const char * const end, const SearchSymbols & symbols)
{
    const char * const needle_chars = symbols.str.data();
    const size_t needle_count = symbols.str.size();

    const char * cursor = begin;

#if defined(__SSE4_2__)
    /// "Equal any" reports bytes that belong to the set; negative polarity flips that to bytes outside of it.
    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT
        | (positive ? 0 : _SIDD_NEGATIVE_POLARITY);

    const __m128i needle_set = symbols.simd_vector;
    const int needle_length = static_cast<int>(needle_count);

    while (cursor + 15 < end)
    {
        const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(cursor));

        if (_mm_cmpestrc(needle_set, needle_length, chunk, 16, mode))
            return cursor + _mm_cmpestri(needle_set, needle_length, chunk, 16, mode);

        cursor += 16;
    }
#endif

    for (; cursor < end; ++cursor)
    {
        const bool in_set = is_in(*cursor, needle_chars, needle_count);
        if (maybe_negate<positive>(in_set))
            return cursor;
    }

    if constexpr (return_mode == ReturnMode::End)
        return end;
    else
        return nullptr;
}

/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do.

Expand All @@ -194,6 +334,17 @@ inline const char * find_first_symbols_dispatch(const char * begin, const char *
return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
}

/** Picks the implementation for a search with a runtime symbol set.
  *
  * With SSE 4.2 and at least 5 symbols the SSE 4.2 implementation is used;
  * in every other case the SSE2 one (which has its own scalar fallback) is used.
  */
template <bool positive, ReturnMode return_mode>
inline const char * find_first_symbols_dispatch(const std::string_view haystack, const SearchSymbols & symbols)
{
    /// The implementations take raw pointers. string_view iterators are not guaranteed to be raw pointers,
    /// so derive the range from data() and size().
    const char * const begin = haystack.data();
    const char * const end = begin + haystack.size();

#if defined(__SSE4_2__)
    if (symbols.str.size() >= 5)
        return find_first_symbols_sse42<positive, return_mode>(begin, end, symbols);
#endif

    return find_first_symbols_sse2<positive, return_mode>(begin, end, symbols.str.data(), symbols.str.size());
}

}


Expand All @@ -211,6 +362,11 @@ inline char * find_first_symbols(char * begin, char * end)
return const_cast<char *>(detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, end));
}

inline const char * find_first_symbols(std::string_view haystack, const SearchSymbols & symbols)
{
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
inline const char * find_first_not_symbols(const char * begin, const char * end)
{
Expand All @@ -223,6 +379,11 @@ inline char * find_first_not_symbols(char * begin, char * end)
return const_cast<char *>(detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, end));
}

inline const char * find_first_not_symbols(std::string_view haystack, const SearchSymbols & symbols)
{
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
inline const char * find_first_symbols_or_null(const char * begin, const char * end)
{
Expand All @@ -235,6 +396,11 @@ inline char * find_first_symbols_or_null(char * begin, char * end)
return const_cast<char *>(detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, end));
}

inline const char * find_first_symbols_or_null(std::string_view haystack, const SearchSymbols & symbols)
{
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, symbols);
}

template <char... symbols>
inline const char * find_first_not_symbols_or_null(const char * begin, const char * end)
{
Expand All @@ -247,6 +413,10 @@ inline char * find_first_not_symbols_or_null(char * begin, char * end)
return const_cast<char *>(detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(begin, end));
}

inline const char * find_first_not_symbols_or_null(std::string_view haystack, const SearchSymbols & symbols)
{
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, symbols);
}

template <char... symbols>
inline const char * find_last_symbols_or_null(const char * begin, const char * end)
Expand Down
102 changes: 102 additions & 0 deletions docs/en/sql-reference/functions/tuple-map-functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,108 @@ Result:

- [Map(key, value)](../../sql-reference/data-types/map.md) data type

## extractKeyValuePairs

Extracts key-value pairs, i.e. a [Map(String, String)](../../sql-reference/data-types/map.md), from a string. Parsing is robust towards noise (e.g. log files).

A key-value pair consists of a key, followed by a `key_value_delimiter` and a value. Key value pairs must be separated by `pair_delimiter`. Quoted keys and values are also supported.

**Syntax**

``` sql
extractKeyValuePairs(data[, key_value_delimiter[, pair_delimiter[, quoting_character]]])
```

Aliases:
- `str_to_map`
- `mapFromString`

**Arguments**

- `data` - String to extract key-value pairs from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
- `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
- `pair_delimiter` - Set of characters to be used as delimiters between pairs. Defaults to ` `, `,` and `;`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
- `quoting_character` - Character to be used as quoting character. Defaults to `"`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).

**Returned values**

- A [Map(String, String)](../../sql-reference/data-types/map.md) of key-value pairs.

**Examples**

Simple case:

``` sql
SELECT extractKeyValuePairs('name:neymar, age:31 team:psg,nationality:brazil') as kv
```

Result:

``` text
┌─kv──────────────────────────────────────────────────────────────────────┐
│ {'name':'neymar','age':'31','team':'psg','nationality':'brazil'} │
└─────────────────────────────────────────────────────────────────────────┘
```

Single quote as quoting character:

``` sql
SELECT extractKeyValuePairs('name:\'neymar\';\'age\':31;team:psg;nationality:brazil,last_key:last_value', ':', ';,', '\'') as kv
```

Result:

``` text
┌─kv───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ {'name':'neymar','age':'31','team':'psg','nationality':'brazil','last_key':'last_value'} │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```

Escape sequences without escape sequence support:

``` sql
SELECT extractKeyValuePairs('age:a\\x0A\\n\\0') AS kv
```

Result:

``` text
┌─kv─────────────────────┐
│ {'age':'a\\x0A\\n\\0'} │
└────────────────────────┘
```

## extractKeyValuePairsWithEscaping

Same as `extractKeyValuePairs` but with escaping support.

Supported escape sequences: `\x`, `\N`, `\a`, `\b`, `\e`, `\f`, `\n`, `\r`, `\t`, `\v` and `\0`.
Non-standard escape sequences are returned as they are (including the backslash) unless they are one of the following:
`\\`, `'`, `"`, `backtick`, `/`, `=` or ASCII control characters (c <= 31).

This function will satisfy the use case where pre-escaping and post-escaping are not suitable. For instance, consider the following
input string: `a: "aaaa\"bbb"`. The expected output is: `a: aaaa\"bbb`.
- Pre-escaping: Pre-escaping it will output: `a: "aaaa"bbb"` and `extractKeyValuePairs` will then output: `a: aaaa`
- Post-escaping: `extractKeyValuePairs` will output `a: aaaa\` and post-escaping will keep it as it is.

Leading escape sequences will be skipped in keys and will be considered invalid for values.

**Examples**

Escape sequences with escape sequence support turned on:

``` sql
SELECT extractKeyValuePairsWithEscaping('age:a\\x0A\\n\\0') AS kv
```

Result:

``` text
┌─kv────────────────┐
│ {'age':'a\n\n\0'} │
└───────────────────┘
```

## mapAdd {#function-mapadd}

Collect all the keys and sum corresponding values.
Expand Down
Loading

0 comments on commit 51ec1d2

Please sign in to comment.