Skip to content

Commit

Permalink
embedded param for argmin with AVX-512
Browse files Browse the repository at this point in the history
  • Loading branch information
herumi committed Dec 3, 2024
1 parent 13f5ee3 commit 89e4321
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 19 deletions.
40 changes: 22 additions & 18 deletions sample/mt_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ extern cybozu::CpuClock clk2;
extern cybozu::CpuClock clk3;
extern cybozu::CpuClock clk4;
#endif
// Choose the argmin parameter for the AVX-512 mulVec of n elements from an
// embedded lookup table keyed by mcl::ec::ilog2(n).
// (Presumably this is the bucket/window bit width of the MSM - confirm
// against the caller.)
//   n <= 2            -> 2
//   ilog2(n) < tblMin -> 4
//   otherwise         -> tbl[ilog2(n) - tblMin], or 16 past the end of tbl
inline size_t argminForMulVecAVX512(size_t n)
{
	if (n <= 2) return 2;
	const size_t log2n = mcl::ec::ilog2(n);
	const size_t tblMin = 7; // tbl[0] is the entry for log2n == tblMin
	if (log2n < tblMin) return 4;
	// n >= 2^tblMin
	static const size_t tbl[] = {
		4, 5, 5, 6, 7, 8, 8, 10, 10, 10, 10, 10, 13, 15, 15, 16, 16, 16, 16, 16
	};
	// Range-check the table index rather than log2n itself: comparing log2n
	// against the table length cut the table off tblMin entries early, so the
	// tail of tbl could never be selected.
	const size_t idx = log2n - tblMin;
	if (idx >= CYBOZU_NUM_OF_ARRAY(tbl)) return 16;
	return tbl[idx];
}

int main(int argc, char *argv[])
try
Expand Down Expand Up @@ -69,28 +83,18 @@ int main(int argc, char *argv[])
G1 P1, P2;
#ifdef MCL_MSM
if (msmOnly) {
const cybozu::CpuClock& clk = cybozu::bench::g_clk;
#if 1
const size_t adj = 8; // AVX-512
#else
const size_t adj = 1; // scalar
#endif
for (size_t nn = 1u<<9; nn <= n; nn *= 2) {
const size_t c = mcl::ec::argminForMulVec(nn/adj*2);
for (size_t bucketN = c-4; bucketN <= c; bucketN++) {
mcl::fp::getRefArgminForce() = bucketN;
CYBOZU_BENCH_C("", C, G1::mulVec, P1, Pvec.data(), xVec.data(), nn);
mcl::fp::getRefArgminForce() = 0;
printf("% 8zd % 8zd %.2f Mclk\n", nn, bucketN, double(clk.getClock())/double(clk.getCount()*C)*1e-6); fflush(stdout);
printf("% 8zd", nn);
CYBOZU_BENCH_C(" ", C, G1::mulVec, P1, Pvec.data(), xVec.data(), nn);
fflush(stdout);
#ifdef USE_CLK
printf("getCount g=%d %d %d %d %d %d %d\n", clk.getCount(), clk0.getCount(), clk1.getCount(), clk2.getCount(), clk3.getCount(), clk4.getCount(), clk5.getCount());
clk0.put("clk0"); clk0.clear();
clk1.put("clk1"); clk1.clear();
clk2.put("clk2"); clk2.clear();
clk3.put("clk3"); clk3.clear();
clk4.put("clk4"); clk4.clear();
clk0.put("clk0"); clk0.clear();
clk1.put("clk1"); clk1.clear();
clk2.put("clk2"); clk2.clear();
clk3.put("clk3"); clk3.clear();
clk4.put("clk4"); clk4.clear();
#endif
}
}
return 0;
}
Expand Down
17 changes: 16 additions & 1 deletion src/msm_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1265,11 +1265,26 @@ clk4.end();
#endif
}

// Choose the parameter c used by mulVecAVX512_inner for n elements from an
// embedded lookup table keyed by mcl::ec::ilog2(n). The caller allocates a
// table of (1 << c) entries, so c is a bit width.
//   n <= 2            -> 2
//   ilog2(n) < tblMin -> 4
//   otherwise         -> tbl[ilog2(n) - tblMin], or 16 past the end of tbl
inline size_t argminForMulVecAVX512(size_t n)
{
	if (n <= 2) return 2;
	const size_t log2n = mcl::ec::ilog2(n);
	const size_t tblMin = 7; // tbl[0] is the entry for log2n == tblMin
	if (log2n < tblMin) return 4;
	// n >= 2^tblMin
	static const size_t tbl[] = {
		4, 5, 5, 6, 7, 8, 8, 10, 10, 10, 10, 10, 13, 15, 15, 16, 16, 16, 16, 16
	};
	// Range-check the table index rather than log2n itself: comparing log2n
	// against the table length cut the table off tblMin entries early, so the
	// tail of tbl could never be selected.
	const size_t idx = log2n - tblMin;
	if (idx >= CYBOZU_NUM_OF_ARRAY(tbl)) return 16;
	return tbl[idx];
}
// xVec[n], yVec[n * maxBitSize/64]
template<class G=EcM, class V=Vec, bool mixed = false>
inline void mulVecAVX512_inner(mcl::msm::G1A& P, const G *xVec, const V *yVec, size_t n, size_t maxBitSize)
{
size_t c = mcl::ec::argminForMulVec(n);
// size_t c = mcl::ec::argminForMulVec(n);
size_t c = argminForMulVecAVX512(n);
size_t tblN = size_t(1) << c;
G *tbl = (G*)Xbyak::AlignedMalloc(sizeof(G) * tblN, 64);
const size_t yn = maxBitSize / 64;
Expand Down

0 comments on commit 89e4321

Please sign in to comment.