Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added json lines partitioning helper #288

Merged
merged 4 commits into from
May 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 40 additions & 105 deletions include/daw/json/daw_json_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <daw/daw_cxmath.h>
#include <daw/daw_move.h>
#include <daw/daw_scope_guard.h>
#include <daw/daw_string_view.h>
#include <daw/daw_traits.h>
#include <daw/daw_utility.h>

Expand All @@ -25,7 +26,6 @@
#include <iterator>
#include <limits>
#include <string>
#include <string_view>
#include <type_traits>

namespace daw::json {
Expand Down Expand Up @@ -65,18 +65,13 @@ namespace daw::json {
*/
template<typename JsonElement, auto... PolicyFlags>
class json_array_iterator {
using ParsePolicy = BasicParsePolicy<
using ParseState = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
using CharT = typename ParsePolicy::CharT;
using CharT = typename ParseState::CharT;

template<typename String>
static inline constexpr ParsePolicy
get_range( String &&data, std::string_view member_path ) {
static_assert(
std::is_convertible_v<decltype( std::data( data ) ), CharT *>,
"Attempt to assign a const char * to a char *" );

auto [is_found, result] = json_details::find_range<ParsePolicy>(
static inline constexpr ParseState
get_range( daw::string_view data, daw::string_view member_path ) {
auto [is_found, result] = json_details::find_range<ParseState>(
DAW_FWD( data ),
{ std::data( member_path ), std::size( member_path ) } );
daw_json_assert( is_found, ErrorReason::JSONPathNotFound );
Expand All @@ -97,7 +92,6 @@ namespace daw::json {
using iterator_category = std::input_iterator_tag;

private:
using ParseState = ParsePolicy;
ParseState m_state = ParseState( );
/***
* This lets us fastpath and just skip n characters as we have already
Expand All @@ -108,21 +102,9 @@ namespace daw::json {
public:
json_array_iterator( ) = default;

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_iterator,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
inline constexpr explicit json_array_iterator( String &&jd )
: m_state( ParsePolicy( std::data( jd ), daw::data_end( jd ) ) ) {

static_assert(
traits::is_string_view_like_v<daw::remove_cvref_t<String>>,
"String requires being able to call std::data/std::size. char const "
"* are not able to do this, pass a string_view for char const * to "
"ensure you are aware of the strlen cost" );

static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *> );
inline constexpr explicit json_array_iterator( daw::string_view jd )
: m_state( ParseState( std::data( jd ), daw::data_end( jd ) ) ) {

m_state.trim_left( );
daw_json_assert_weak( m_state.is_opening_bracket_checked( ),
ErrorReason::InvalidArrayStart, m_state );
Expand All @@ -131,23 +113,9 @@ namespace daw::json {
m_state.trim_left( );
}

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_iterator,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
inline constexpr explicit json_array_iterator(
String &&jd, std::string_view start_path )
: m_state( get_range( DAW_FWD2( String, jd ), start_path ) ) {

static_assert(
traits::is_string_view_like_v<daw::remove_cvref_t<String>>,
"String requires being able to call std::data/std::size. char const "
"* are not able to do this, pass a string_view for char const * to "
"ensure you are aware of the strlen cost" );

static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *>,
"Attempt to assign a const char * to a char *" );
daw::string_view jd, daw::string_view start_path )
: m_state( get_range( jd, start_path ) ) {

m_state.trim_left( );
daw_json_assert_weak( m_state.is_opening_bracket_checked( ),
Expand All @@ -157,18 +125,16 @@ namespace daw::json {
m_state.trim_left( );
}

/***
* Parse the current element
* @pre good( ) returns true
* @return The parsed result of ParseElement
*/
/// @brief Parse the current element
/// @pre good( ) returns true
/// @return The parsed result of ParseElement
[[nodiscard]] inline constexpr value_type operator*( ) const {
daw_json_assert_weak( m_state.has_more( ) and m_state.front( ) != ']',
ErrorReason::UnexpectedEndOfData, m_state );

auto tmp = m_state;

if constexpr( json_details::use_direct_construction_v<ParsePolicy,
if constexpr( json_details::use_direct_construction_v<ParseState,
JsonElement> ) {
auto const run_after_parse =
json_array_iterator_details::op_star_cleanup<CharT, ParseState>{
Expand All @@ -185,13 +151,11 @@ namespace daw::json {
}
}

/***
* A dereferencable value proxy holding the result of operator*
* This is for compatibility with the Iterator concepts and should be
* avoided
* @pre good( ) returns true
* @return an arrow_proxy of the operator* result
*/
/// @brief A dereferencable value proxy holding the result of operator*
/// This is for compatibility with the Iterator concepts and should be
/// avoided
/// @pre good( ) returns true
/// @return an arrow_proxy of the operator* result
[[nodiscard]] inline pointer operator->( ) const {
return pointer{ operator*( ) };
}
Expand Down Expand Up @@ -229,19 +193,15 @@ namespace daw::json {
m_state.front( ) != ']';
}

/***
* Are we good( )
* @return result of good( )
*/
/// @brief Are we good( )
/// @return result of good( )
[[nodiscard]] explicit inline constexpr operator bool( ) const {
return good( );
}

/***
* Compare rhs for equivalence
* @param rhs Another json_array_iterator
* @return true when equivalent to rhs
*/
/// @brief Compare rhs for equivalence
/// @param rhs Another json_array_iterator
/// @return true when equivalent to rhs
[[nodiscard]] inline constexpr bool
operator==( json_array_iterator const &rhs ) const {
if( not( *this ) ) {
Expand All @@ -253,11 +213,9 @@ namespace daw::json {
return ( m_state.first == rhs.m_state.first );
}

/***
* Check if the other iterator is not equivalent
* @param rhs another json_array_iterator
* @return true when rhs is not equivalent
*/
/// @brief Check if the other iterator is not equivalent
/// @param rhs another json_array_iterator
/// @return true when rhs is not equivalent
[[nodiscard]] inline constexpr bool
operator!=( json_array_iterator const &rhs ) const {
if( not( *this ) ) {
Expand All @@ -270,11 +228,9 @@ namespace daw::json {
}
};

/***
* A range of json_array_iterators
* @tparam JsonElement Type of each element in array
* @tparam ParsePolicy parsing policy type
*/
/// @brief A range of json_array_iterators
/// @tparam JsonElement Type of each element in array
/// @tparam PolicyFlags parse option flags used to build the parsing policy
template<typename JsonElement, auto... PolicyFlags>
struct json_array_range {
using ParsePolicy = BasicParsePolicy<
Expand All @@ -289,46 +245,25 @@ namespace daw::json {
public:
json_array_range( ) = default;

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_range,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
constexpr explicit json_array_range( String &&jd )
: m_first( DAW_FWD2( String, jd ) ) {
static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *> );
}
constexpr explicit json_array_range( daw::string_view jd )
: m_first( jd ) {}

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_range,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
constexpr explicit json_array_range( String &&jd,
std::string_view start_path )
: m_first( DAW_FWD2( String, jd ), start_path ) {
static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *>,
"Attempt to assign a const char * to a char *" );
}
constexpr explicit json_array_range( daw::string_view jd,
daw::string_view start_path )
: m_first( jd, start_path ) {}

/***
* @return first item in range
*/
/// @return first item in range
[[nodiscard]] inline constexpr iterator begin( ) {
return m_first;
}

/***
* @return one past last item in range
*/
/// @return one past last item in range
[[nodiscard]] inline constexpr iterator end( ) {
return m_last;
}

/***
* Are there any elements in range
* @return true when begin( ) == end( )
*/
/// @brief Are there any elements in range
/// @return true when begin( ) == end( )
[[nodiscard]] inline constexpr bool empty( ) const {
return m_first == m_last;
}
Expand Down
40 changes: 38 additions & 2 deletions include/daw/json/daw_json_lines_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ namespace daw::json {
/// @brief Iterator for iterating over the elements of a JSON Lines document
/// @tparam JsonElement type of the underlying element. If
/// heterogeneous, a basic_json_value_iterator may be more appropriate
template<typename JsonElement, auto... PolicyFlags>
template<typename JsonElement = json_value, auto... PolicyFlags>
class json_lines_iterator {
using ParsePolicy = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
Expand Down Expand Up @@ -213,7 +213,7 @@ namespace daw::json {
* @tparam JsonElement Type of each element in array
* @tparam ParsePolicy parsing policy type
*/
template<typename JsonElement, auto... PolicyFlags>
template<typename JsonElement = json_value, auto... PolicyFlags>
struct json_lines_range {
using ParsePolicy = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
Expand Down Expand Up @@ -246,5 +246,41 @@ namespace daw::json {
return m_first == m_last;
}
};

/// @brief parition the document into num_partitions, non overlapping
/// pieces.
template<typename JsonElement = json_value, auto... ParsePolicies>
auto partition_jsonl_document( std::size_t num_partitions,
daw::string_view jsonl_doc ) {
using result_t =
std::vector<json_lines_range<JsonElement, ParsePolicies...>>;
if( num_partitions <= 1 ) {
return result_t{
json_lines_range<JsonElement, ParsePolicies...>( jsonl_doc ) };
}
auto approx_segsize = jsonl_doc.size( ) / num_partitions;
auto result = result_t{ };
char const *const last = daw::data_end( jsonl_doc );
while( not jsonl_doc.empty( ) ) {
char const *tmp = std::data( jsonl_doc ) + approx_segsize;
if( tmp >= last ) {
result.emplace_back( jsonl_doc );
break;
}
while( tmp < last and * tmp != '\n' ) {
++tmp;
}
if( tmp < last ) {
++tmp;
}
auto sz = static_cast<std::size_t>( tmp - std::data( jsonl_doc ) );
auto doc = jsonl_doc.pop_front( sz );
doc.trim_suffix( );
if( not doc.empty( ) ) {
result.emplace_back( doc );
}
}
return result;
}
} // namespace DAW_JSON_VER
} // namespace daw::json
Loading