refactor(bb): simplify parallel_for_if_effective #8079

Merged: 6 commits, Aug 20, 2024
Changes from all commits
119 changes: 40 additions & 79 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,6 +2,7 @@
#include "barretenberg/commitment_schemes/claim.hpp"
#include "barretenberg/commitment_schemes/verification_key.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/transcript/transcript.hpp"
#include <cstddef>
@@ -90,8 +91,6 @@ template <typename Curve_> class IPA {
#ifdef IPA_FUZZ_TEST
friend class ProxyCaller;
#endif
// clang-format off

/**
* @brief Compute an inner product argument proof for opening a single polynomial at a single evaluation point.
*
@@ -128,16 +127,14 @@ template <typename Curve_> class IPA {
*
*7. Send the final \f$\vec{a}_{0} = (a_0)\f$ to the verifier
*/
template <typename Transcript>
static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
const ProverOpeningClaim<Curve>& opening_claim,
const std::shared_ptr<Transcript>& transcript)
{

Polynomial polynomial = opening_claim.polynomial;
template <typename Transcript>
static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
const ProverOpeningClaim<Curve>& opening_claim,
const std::shared_ptr<Transcript>& transcript)
{
const Polynomial& polynomial = opening_claim.polynomial;

// clang-format on
auto poly_length = static_cast<size_t>(polynomial.size());
size_t poly_length = polynomial.size();

// Step 1.
// Send polynomial degree + 1 = d to the verifier
@@ -169,36 +166,27 @@ template <typename Curve_> class IPA {
// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&G_vec_local, srs_elements](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/0,
/*finite_field_inversions_per_iteration=*/0,
/*group_element_additions_per_iteration=*/0,
/*group_element_doublings_per_iteration=*/0,
/*scalar_multiplications_per_iteration=*/0,
/*sequential_copy_ops_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST);

// Step 5.
// Compute vector b (vector of the powers of the challenge)
OpeningPair<Curve> opening_pair = opening_claim.opening_pair;
std::vector<Fr> b_vec(poly_length);
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&b_vec, &opening_pair](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr b_power = opening_pair.challenge.pow(start);
for (size_t i = start; i < end; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST + thread_heuristics::FF_MULTIPLICATION_COST);

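A minimal sketch of the new calling convention used above, which collapses the seven per-operation counters into a single per-iteration cost built from named constants. It assumes parallel_for_heuristic and the thread_heuristics constants are declared in barretenberg/common/thread.hpp, as the usage in this diff suggests; scale_in_place is a hypothetical helper, not part of this PR:

#include "barretenberg/common/compiler_hints.hpp" // BB_UNUSED
#include "barretenberg/common/thread.hpp"         // parallel_for_heuristic, thread_heuristics
#include "barretenberg/ecc/curves/bn254/fr.hpp"   // bb::fr
#include <cstddef>
#include <vector>

namespace bb {
// Hypothetical helper: one field multiplication per iteration, so the
// per-iteration cost estimate is simply FF_MULTIPLICATION_COST.
void scale_in_place(std::vector<fr>& values, const fr& factor)
{
    parallel_for_heuristic(
        values.size(),
        [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
            for (size_t i = start; i < end; i++) {
                values[i] *= factor;
            }
        },
        thread_heuristics::FF_MULTIPLICATION_COST);
}
} // namespace bb
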
// Iterate for log(poly_degree) rounds to compute the round commitments.
auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_length));
@@ -221,18 +209,9 @@ template <typename Curve_> class IPA {
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
// Run scalar products in parallel
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
round_size,
[&a_vec,
&b_vec,
round_size,
&inner_prod_L,
&inner_prod_R
#ifndef NO_MULTITHREADING
,
&inner_product_accumulation_mutex
#endif
](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
@@ -247,9 +226,7 @@ template <typename Curve_> class IPA {
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
}
},
/*finite_field_additions_per_iteration=*/2,
/*finite_field_multiplications_per_iteration=*/2);
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

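The inner-product rounds above use a chunk-local accumulation pattern: each chunk sums into local variables and only touches the shared accumulators under a mutex once per chunk. A rough sketch of the same idea under assumed names (sum_field_elements and total_mutex are illustrative, not from this PR; the real code additionally guards the mutex behind NO_MULTITHREADING):

#include "barretenberg/common/compiler_hints.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/fr.hpp"
#include <mutex>
#include <vector>

namespace bb {
// Illustrative helper: sum a vector of field elements in parallel.
fr sum_field_elements(const std::vector<fr>& values)
{
    fr total = 0;
    std::mutex total_mutex; // guards `total`; taken once per chunk, not per element
    parallel_for_heuristic(
        values.size(),
        [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
            fr local = 0; // chunk-local accumulator, no contention in the hot loop
            for (size_t i = start; i < end; i++) {
                local += values[i];
            }
            std::unique_lock<std::mutex> lock(total_mutex);
            total += local;
        },
        thread_heuristics::FF_ADDITION_COST);
    return total;
}
} // namespace bb
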
// Step 6.a (using letters, because doxygen automaticall converts the sublist counters to letters :( )
// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
@@ -281,29 +258,24 @@ template <typename Curve_> class IPA {
// Step 6.e
// G_vec_new = G_vec_lo + G_vec_hi * round_challenge_inv
auto G_hi_by_inverse_challenge = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin() + static_cast<long>(round_size),
G_vec_local.begin() + static_cast<long>(round_size * 2) },
std::span{ G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size),
G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size * 2) },
round_challenge_inv);
GroupElement::batch_affine_add(
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size) },
G_hi_by_inverse_challenge,
G_vec_local);

// Steps 6.e and 6.f
// Update the vectors a_vec, b_vec.
// a_vec_new = a_vec_lo + a_vec_hi * round_challenge
// b_vec_new = b_vec_lo + b_vec_hi * round_challenge_inv
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
round_size,
[&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
a_vec[j] += round_challenge * a_vec[round_size + j];
b_vec[j] += round_challenge_inv * b_vec[round_size + j];
}
},
/*finite_field_additions_per_iteration=*/4,
/*finite_field_multiplications_per_iteration=*/8,
/*finite_field_inversions_per_iteration=*/1);
[&](size_t j) {
a_vec[j] += round_challenge * a_vec[round_size + j];
b_vec[j] += round_challenge_inv * b_vec[round_size + j];
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);
}

// Step 7
@@ -409,23 +381,19 @@ template <typename Curve_> class IPA {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its
// O(nlogn). This can be optimized to be linear by computing a tree of products. Its very readable, so we're
// leaving it unoptimized for now.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
[&](size_t i) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
s_vec[i] = s_vec_scalar;
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/log_poly_degree);
s_vec[i] = s_vec_scalar;
}, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);

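Note that parallel_for_heuristic is also invoked here with a single-index lambda ([&](size_t i)), which removes the inner start/end loop when each iteration is independent. A minimal sketch of that overload under the same assumptions as before (square_in_place is a hypothetical helper, not part of this PR):

#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/fr.hpp"
#include <vector>

namespace bb {
// Hypothetical helper: square every element in place, one multiplication per element.
void square_in_place(std::vector<fr>& values)
{
    parallel_for_heuristic(
        values.size(),
        [&](size_t i) { values[i] *= values[i]; },
        thread_heuristics::FF_MULTIPLICATION_COST);
}
} // namespace bb
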
auto* srs_elements = vk->get_monomial_points();

@@ -435,20 +403,13 @@ template <typename Curve_> class IPA {
// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&G_vec_local, srs_elements](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/0,
/*finite_field_inversions_per_iteration=*/0,
/*group_element_additions_per_iteration=*/0,
/*group_element_doublings_per_iteration=*/0,
/*scalar_multiplications_per_iteration=*/0,
/*sequential_copy_ops_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST * 2);

// Step 8.
// Compute G₀
@@ -497,7 +458,7 @@ template <typename Curve_> class IPA {
// Ensure polynomial length cannot be changed from its default specified valued
poly_length_var.fix_witness();

const uint32_t poly_length = static_cast<uint32_t>(poly_length_var.get_value());
const auto poly_length = static_cast<uint32_t>(poly_length_var.get_value());

// Step 2.
// Receive generator challenge u and compute auxiliary generator
@@ -559,7 +520,7 @@ template <typename Curve_> class IPA {
// O(nlogn). This can be optimized to be linear by computing a tree of products.
for (size_t i = 0; i < poly_length; i++) {
Fr s_vec_scalar = Fr(1);
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
4 changes: 3 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -23,4 +23,6 @@
#else
#define BB_LIKELY(x) x
#define BB_UNLIKELY(x) x
#endif
#endif

Contributor:

I think this is a good idea, but it's worth adding a comment for when people say "what the hell is this".

Collaborator (Author):

I'll smuggle this into the next PR.

#define BB_UNUSED [[maybe_unused]]
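
For readers wondering about the new macro: BB_UNUSED expands to the standard [[maybe_unused]] attribute, so a callback can ignore a parameter (for example the chunk index passed by parallel_for_heuristic) without triggering -Wunused-parameter. A small hypothetical usage sketch, not part of this diff:

#include "barretenberg/common/compiler_hints.hpp"
#include <cstddef>
#include <vector>

// The chunk index is deliberately ignored; BB_UNUSED keeps the build warning-free.
void zero_range(std::vector<int>& data, size_t start, size_t end, BB_UNUSED size_t chunk_index)
{
    for (size_t i = start; i < end; i++) {
        data[i] = 0;
    }
}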
6 changes: 6 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
@@ -5,6 +5,7 @@
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#ifndef NO_MULTITHREADING
#include <mutex>
#endif
@@ -75,4 +76,9 @@ template <typename T> class ContainerSlabAllocator {
}
};

/**
* @brief A vector that uses the slab allocator.
*/
template <typename T> using SlabVector = std::vector<T, bb::ContainerSlabAllocator<T>>;

} // namespace bb
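
A short usage sketch of the new alias (assumption: SlabVector behaves exactly like std::vector apart from where its memory comes from, which is what the alias implies; the example function is illustrative, not from this PR):

#include "barretenberg/common/slab_allocator.hpp"
#include <cstdint>

// Drop-in replacement for std::vector, but allocations go through the slab
// allocator, avoiding repeated heap allocations in allocation-heavy proving code.
void slab_vector_example()
{
    bb::SlabVector<uint64_t> scratch;
    scratch.reserve(1024); // backing memory is requested from the slab allocator
    scratch.push_back(42);
}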
74 changes: 9 additions & 65 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
* @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
*
*/
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
void parallel_for_range(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
{
if (num_points <= no_multhreading_if_less_or_equal) {
func(0, num_points);
@@ -129,45 +129,10 @@ void run_loop_in_parallel(size_t num_points,
});
};

/**
* @brief Split a loop into several loops running in parallel based on operations in 1 iteration
*
* @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
* that should contain the work loop, but only if it's worth it
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* Or for the version with index:
* [](size_t start, size_t end,size_t workload_index){for (size_t i=start; i<end; i++){(void)i;}}
* @param finite_field_additions_per_iteration The number of additions/subtractions/negations
* @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
* @param finite_field_inversions_per_iteration
* @param group_element_additions_per_iteration Projective addition number
* @param group_element_doublings_per_iteration Projective doubling number
* @param scalar_multiplications_per_iteration
* @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
*/
template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(size_t num_points,
const FunctionType& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
void parallel_for_heuristic(size_t num_points,
const std::function<void(size_t, size_t, size_t)>& func,
size_t heuristic_cost)
{
// Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds):
constexpr size_t FF_ADDITION_COST = 4;
constexpr size_t FF_MULTIPLICATION_COST = 21;
constexpr size_t FF_INVERSION_COST = 7000;
constexpr size_t GE_ADDITION_COST = 350;
constexpr size_t GE_DOUBLING_COST = 194;
constexpr size_t SM_COST = 50000;
constexpr size_t SEQ_COPY_COST = 3;
// We take the maximum observed parallel_for cost (388 us) and round it up.
// The goals of these checks is to evade significantly (10x) increasing processing time for small workloads. So we
// can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium
@@ -180,23 +145,11 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);

// Compute the cost of all operations done by other threads
const size_t offset_cost =
(num_points - chunk_size) *
(finite_field_additions_per_iteration * FF_ADDITION_COST +
finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
finite_field_inversions_per_iteration * FF_INVERSION_COST +
group_element_additions_per_iteration * GE_ADDITION_COST +
group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST +
sequential_copy_ops_per_iteration * SEQ_COPY_COST);
const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;

// If starting parallel for is longer than computing, just compute
if (offset_cost < PARALLEL_FOR_COST) {
if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(0, num_points);
} else {
func(0, num_points, 0);
}
func(0, num_points, 0);
return;
}
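
To make the break-even point concrete, here is a back-of-the-envelope check using the numbers quoted in the surrounding comments (roughly 21 ns per field multiplication and roughly 400 us of parallel_for start-up cost; worth_parallelising, the 16-thread count and the exact constants are illustrative assumptions, not code from this PR):

#include <cstddef>

// Assumed heuristic numbers, taken from the comments above.
constexpr size_t FF_MUL_COST_NS = 21;           // ~one field multiplication
constexpr size_t PARALLEL_FOR_COST_NS = 400000; // ~max observed parallel_for start-up
constexpr size_t NUM_CPUS = 16;

// With one field multiplication per iteration, parallelise only if the work
// offloaded to the other threads outweighs the cost of spinning up parallel_for.
constexpr bool worth_parallelising(size_t num_points)
{
    size_t chunk_size = num_points / NUM_CPUS + (num_points % NUM_CPUS == 0 ? 0 : 1);
    return (num_points - chunk_size) * FF_MUL_COST_NS >= PARALLEL_FOR_COST_NS;
}

static_assert(!worth_parallelising(1 << 10)); // ~20 us of total work: stay sequential
static_assert(worth_parallelising(1 << 16));  // ~1.4 ms of total work: parallelise
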
// Parallelize over chunks
@@ -213,18 +166,9 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;

if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(start, end);
} else {
func(start, end, chunk_index);
}
func(start, end, chunk_index);
});
};
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);

/**
* @brief calculates number of threads to create based on minimum iterations per thread