Skip to content

Commit

Permalink
Merge pull request #288 from beached/v3
Browse files Browse the repository at this point in the history
Added json lines partitioning helper
  • Loading branch information
beached authored May 31, 2022
2 parents 1fce163 + b2b35af commit 4012721
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 225 deletions.
145 changes: 40 additions & 105 deletions include/daw/json/daw_json_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <daw/daw_cxmath.h>
#include <daw/daw_move.h>
#include <daw/daw_scope_guard.h>
#include <daw/daw_string_view.h>
#include <daw/daw_traits.h>
#include <daw/daw_utility.h>

Expand All @@ -25,7 +26,6 @@
#include <iterator>
#include <limits>
#include <string>
#include <string_view>
#include <type_traits>

namespace daw::json {
Expand Down Expand Up @@ -65,18 +65,13 @@ namespace daw::json {
*/
template<typename JsonElement, auto... PolicyFlags>
class json_array_iterator {
using ParsePolicy = BasicParsePolicy<
using ParseState = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
using CharT = typename ParsePolicy::CharT;
using CharT = typename ParseState::CharT;

template<typename String>
static inline constexpr ParsePolicy
get_range( String &&data, std::string_view member_path ) {
static_assert(
std::is_convertible_v<decltype( std::data( data ) ), CharT *>,
"Attempt to assign a const char * to a char *" );

auto [is_found, result] = json_details::find_range<ParsePolicy>(
static inline constexpr ParseState
get_range( daw::string_view data, daw::string_view member_path ) {
auto [is_found, result] = json_details::find_range<ParseState>(
DAW_FWD( data ),
{ std::data( member_path ), std::size( member_path ) } );
daw_json_assert( is_found, ErrorReason::JSONPathNotFound );
Expand All @@ -97,7 +92,6 @@ namespace daw::json {
using iterator_category = std::input_iterator_tag;

private:
using ParseState = ParsePolicy;
ParseState m_state = ParseState( );
/***
* This lets us fastpath and just skip n characters as we have already
Expand All @@ -108,21 +102,9 @@ namespace daw::json {
public:
json_array_iterator( ) = default;

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_iterator,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
inline constexpr explicit json_array_iterator( String &&jd )
: m_state( ParsePolicy( std::data( jd ), daw::data_end( jd ) ) ) {

static_assert(
traits::is_string_view_like_v<daw::remove_cvref_t<String>>,
"String requires being able to call std::data/std::size. char const "
"* are not able to do this, pass a string_view for char const * to "
"ensure you are aware of the strlen cost" );

static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *> );
inline constexpr explicit json_array_iterator( daw::string_view jd )
: m_state( ParseState( std::data( jd ), daw::data_end( jd ) ) ) {

m_state.trim_left( );
daw_json_assert_weak( m_state.is_opening_bracket_checked( ),
ErrorReason::InvalidArrayStart, m_state );
Expand All @@ -131,23 +113,9 @@ namespace daw::json {
m_state.trim_left( );
}

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_iterator,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
inline constexpr explicit json_array_iterator(
String &&jd, std::string_view start_path )
: m_state( get_range( DAW_FWD2( String, jd ), start_path ) ) {

static_assert(
traits::is_string_view_like_v<daw::remove_cvref_t<String>>,
"String requires being able to call std::data/std::size. char const "
"* are not able to do this, pass a string_view for char const * to "
"ensure you are aware of the strlen cost" );

static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *>,
"Attempt to assign a const char * to a char *" );
daw::string_view jd, daw::string_view start_path )
: m_state( get_range( jd, start_path ) ) {

m_state.trim_left( );
daw_json_assert_weak( m_state.is_opening_bracket_checked( ),
Expand All @@ -157,18 +125,16 @@ namespace daw::json {
m_state.trim_left( );
}

/***
* Parse the current element
* @pre good( ) returns true
* @return The parsed result of ParseElement
*/
/// @brief Parse the current element
/// @pre good( ) returns true
/// @return The parsed result of ParseElement
[[nodiscard]] inline constexpr value_type operator*( ) const {
daw_json_assert_weak( m_state.has_more( ) and m_state.front( ) != ']',
ErrorReason::UnexpectedEndOfData, m_state );

auto tmp = m_state;

if constexpr( json_details::use_direct_construction_v<ParsePolicy,
if constexpr( json_details::use_direct_construction_v<ParseState,
JsonElement> ) {
auto const run_after_parse =
json_array_iterator_details::op_star_cleanup<CharT, ParseState>{
Expand All @@ -185,13 +151,11 @@ namespace daw::json {
}
}

/***
* A dereferencable value proxy holding the result of operator*
* This is for compatibility with the Iterator concepts and should be
* avoided
* @pre good( ) returns true
* @return an arrow_proxy of the operator* result
*/
/// @brief A dereferencable value proxy holding the result of operator*
/// This is for compatibility with the Iterator concepts and should be
/// avoided
/// @pre good( ) returns true
/// @return an arrow_proxy of the operator* result
[[nodiscard]] inline pointer operator->( ) const {
return pointer{ operator*( ) };
}
Expand Down Expand Up @@ -229,19 +193,15 @@ namespace daw::json {
m_state.front( ) != ']';
}

/***
* Are we good( )
* @return result of good( )
*/
/// @brief Are we good( )
/// @return result of good( )
[[nodiscard]] explicit inline constexpr operator bool( ) const {
return good( );
}

/***
* Compare rhs for equivalence
* @param rhs Another json_array_iterator
* @return true when equivalent to rhs
*/
/// @brief Compare rhs for equivalence
/// @param rhs Another json_array_iterator
/// @return true when equivalent to rhs
[[nodiscard]] inline constexpr bool
operator==( json_array_iterator const &rhs ) const {
if( not( *this ) ) {
Expand All @@ -253,11 +213,9 @@ namespace daw::json {
return ( m_state.first == rhs.m_state.first );
}

/***
* Check if the other iterator is not equivalent
* @param rhs another json_array_iterator
* @return true when rhs is not equivalent
*/
/// @brief Check if the other iterator is not equivalent
/// @param rhs another json_array_iterator
/// @return true when rhs is not equivalent
[[nodiscard]] inline constexpr bool
operator!=( json_array_iterator const &rhs ) const {
if( not( *this ) ) {
Expand All @@ -270,11 +228,9 @@ namespace daw::json {
}
};

/***
* A range of json_array_iterators
* @tparam JsonElement Type of each element in array
* @tparam ParsePolicy parsing policy type
*/
/// @brief A range of json_array_iterators
/// @tparam JsonElement Type of each element in array
/// @tparam ParsePolicy parsing policy type
template<typename JsonElement, auto... PolicyFlags>
struct json_array_range {
using ParsePolicy = BasicParsePolicy<
Expand All @@ -289,46 +245,25 @@ namespace daw::json {
public:
json_array_range( ) = default;

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_range,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
constexpr explicit json_array_range( String &&jd )
: m_first( DAW_FWD2( String, jd ) ) {
static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *> );
}
constexpr explicit json_array_range( daw::string_view jd )
: m_first( jd ) {}

template<typename String,
std::enable_if_t<not std::is_same_v<json_array_range,
daw::remove_cvref_t<String>>,
std::nullptr_t> = nullptr>
constexpr explicit json_array_range( String &&jd,
std::string_view start_path )
: m_first( DAW_FWD2( String, jd ), start_path ) {
static_assert(
std::is_convertible_v<decltype( std::data( jd ) ), CharT *>,
"Attempt to assign a const char * to a char *" );
}
constexpr explicit json_array_range( daw::string_view jd,
daw::string_view start_path )
: m_first( jd, start_path ) {}

/***
* @return first item in range
*/
/// @return first item in range
[[nodiscard]] inline constexpr iterator begin( ) {
return m_first;
}

/***
* @return one past last item in range
*/
/// @return one past last item in range
[[nodiscard]] inline constexpr iterator end( ) {
return m_last;
}

/***
* Are there any elements in range
* @return true when begin( ) == end( )
*/
/// @brief Are there any elements in range
/// @return true when begin( ) == end( )
[[nodiscard]] inline constexpr bool empty( ) const {
return m_first == m_last;
}
Expand Down
40 changes: 38 additions & 2 deletions include/daw/json/daw_json_lines_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ namespace daw::json {
/// @brief Iterator for iterating over JSON arrays
/// @tparam JsonElement type of the underlying element in array. If
/// heterogeneous, a basic_json_value_iterator may be more appropriate
template<typename JsonElement, auto... PolicyFlags>
template<typename JsonElement = json_value, auto... PolicyFlags>
class json_lines_iterator {
using ParsePolicy = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
Expand Down Expand Up @@ -213,7 +213,7 @@ namespace daw::json {
* @tparam JsonElement Type of each element in array
* @tparam ParsePolicy parsing policy type
*/
template<typename JsonElement, auto... PolicyFlags>
template<typename JsonElement = json_value, auto... PolicyFlags>
struct json_lines_range {
using ParsePolicy = BasicParsePolicy<
options::details::make_parse_flags<PolicyFlags...>( ).value>;
Expand Down Expand Up @@ -246,5 +246,41 @@ namespace daw::json {
return m_first == m_last;
}
};

/// @brief parition the document into num_partitions, non overlapping
/// pieces.
template<typename JsonElement = json_value, auto... ParsePolicies>
auto partition_jsonl_document( std::size_t num_partitions,
daw::string_view jsonl_doc ) {
using result_t =
std::vector<json_lines_range<JsonElement, ParsePolicies...>>;
if( num_partitions <= 1 ) {
return result_t{
json_lines_range<JsonElement, ParsePolicies...>( jsonl_doc ) };
}
auto approx_segsize = jsonl_doc.size( ) / num_partitions;
auto result = result_t{ };
char const *const last = daw::data_end( jsonl_doc );
while( not jsonl_doc.empty( ) ) {
char const *tmp = std::data( jsonl_doc ) + approx_segsize;
if( tmp >= last ) {
result.emplace_back( jsonl_doc );
break;
}
while( tmp < last and * tmp != '\n' ) {
++tmp;
}
if( tmp < last ) {
++tmp;
}
auto sz = static_cast<std::size_t>( tmp - std::data( jsonl_doc ) );
auto doc = jsonl_doc.pop_front( sz );
doc.trim_suffix( );
if( not doc.empty( ) ) {
result.emplace_back( doc );
}
}
return result;
}
} // namespace DAW_JSON_VER
} // namespace daw::json
Loading

0 comments on commit 4012721

Please sign in to comment.