diff --git a/bench/EmbeddingQuantizeFloatToFloatOrHalfBenchmark.cc b/bench/EmbeddingQuantizeFloatToFloatOrHalfBenchmark.cc new file mode 100644 index 0000000000..72146bc299 --- /dev/null +++ b/bench/EmbeddingQuantizeFloatToFloatOrHalfBenchmark.cc @@ -0,0 +1,93 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include "./BenchUtils.h" +#include "fbgemm/QuantUtils.h" +#include "fbgemm/Types.h" + +using namespace std; +using namespace fbgemm; + +// T is the type of scale and bias +template +void performance_test() { + constexpr int NWARMUP = 4; + constexpr int NITER = 256; + + if (is_same::value) { + cout << "With result as float16" << endl; + } else { + cout << "With result as float" << endl; + } + cout << setw(6) << "rows" << "," << setw(6) << "cols" << "," << setw(16) + << "elems_per_usec" << "," << setw(10) << "GB/Sec" << endl; + + for (int rowSize : {100, 120, 1000}) { + for (int colSize : {16, 64, 128, 256, 512, 1024, 2048}) { + aligned_vector inpVec(rowSize * colSize); + randFill(inpVec, 0, 20); + + int out_emb_cols = colSize; + + if (is_same::value) { + out_emb_cols /= 2; + } + int outVecSize = rowSize * (out_emb_cols + 2 * sizeof(T)); + aligned_vector outVec(outVecSize); + + double duration = 0.0f; + + duration = measureWithWarmup( + [&]() { + Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf( + inpVec.data(), rowSize, colSize, outVec.data()); + }, + NWARMUP, + NITER, + [&]() { + cache_evict(inpVec); + cache_evict(outVec); + }); + + float elements_per_usec = rowSize * colSize / (duration * 1e6); + + duration *= 1e9; // convert to ns + long bytes_read = rowSize * colSize * sizeof(float); + float gigabyes_per_sec = bytes_read / duration; + + cout << setw(6) << rowSize << ", " << setw(6) << colSize << ","; + cout << setw(16) << std::fixed << std::setprecision(2) + << elements_per_usec << ", "; + cout << setw(10) << std::fixed << std::setprecision(2) << gigabyes_per_sec + << endl; + } // for each cols + } // for each rows +} // performance_test + +int main() { +#ifdef _OPENMP + // Use 1 thread unless OMP_NUM_THREADS is explicit set. + const char* val = getenv("OMP_NUM_THREADS"); + if (val == nullptr || !*val) { + omp_set_num_threads(1); + } +#endif + performance_test(); + performance_test(); + return 0; +}