Add class lut::ThreadPool and make OpenMP optional #72

Merged · 4 commits · Jun 18, 2024
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -4,6 +4,7 @@ include(CheckTypeSize)

option(WITH_CUDA "CUDA compute support." OFF)
option(WITH_MKL "MKL support (only for reference)." OFF)
option(WITH_OPENMP "Build with OpenMP." ON)
option(WITH_CUTLASS "build MatMul operators with CUTLASS." OFF)
option(MKL_PREFIX "Prefix for MKL headers and libraries." "/opt/intel/mkl")

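With this option, OpenMP becomes optional rather than required: configuring with `cmake -DWITH_OPENMP=OFF` should presumably fall back to the new `lut::ThreadPool` backend (the exact configure invocation depends on your build setup).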
24 changes: 16 additions & 8 deletions src/libllm/CMakeLists.txt
@@ -1,5 +1,4 @@
find_package(OpenMP REQUIRED)

find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@@ -18,6 +17,7 @@ set(lut_SOURCES
"lut/reader.cc"
"lut/strings.cc"
"lut/time.cc"
"lut/thread_pool.cc"
"lut/zip_file.cc")

set(libllm_SOURCES
@@ -56,6 +56,7 @@ set(libllm_SOURCES
"llama.cc"
"model_for_generation.cc"
"module.cc"
"mp.cc"
"operators.cc"
"qwen.cc"
"sampler.cc"
@@ -86,7 +87,19 @@ set(llm_SOURCES

set(benchmark_SOURCES "benchmark_main.cc")

set(libllm_INCDIR ".." "../../third_party" ${OpenMP_CXX_INCLUDE_DIRS})
set(libllm_INCDIR ".." "../../third_party")
set(libllm_LIBADD lut ${CMAKE_DL_LIBS})

if(WITH_OPENMP)
if(NOT OPENMP_FOUND)
message(FATAL_ERROR "WITH_OPENMP=ON but OpenMP package not found.")
endif()
set(libllm_SOURCES ${libllm_SOURCES} "mp_openmp.cc")
set(libllm_INCDIR ${libllm_INCDIR} ${OpenMP_CXX_INCLUDE_DIRS})
set(libllm_LIBADD ${libllm_LIBADD} OpenMP::OpenMP_CXX)
else()
set(libllm_SOURCES ${libllm_SOURCES} "mp_thread_pool.cc")
endif()

if (WITH_CUDA)
set(libllm_INCDIR ${libllm_INCDIR} ${CUDAToolkit_INCLUDE_DIRS})
@@ -188,11 +201,6 @@ add_library(lut STATIC ${lut_SOURCES})
set_target_properties(lut PROPERTIES CXX_VISIBILITY_PRESET hidden)
target_include_directories(lut PRIVATE ${libllm_INCDIR})

set(libllm_LIBADD
lut
${CMAKE_DL_LIBS}
OpenMP::OpenMP_CXX)

add_library(libllm_static OBJECT ${libllm_SOURCES})
target_compile_options(libllm_static PRIVATE "-DLIBLLM_EXPORTS")
set_target_properties(libllm_static PROPERTIES CXX_VISIBILITY_PRESET hidden)
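The `mp.h` facade itself is not visible in this diff view, only its two backends (`mp_openmp.cc`, `mp_thread_pool.cc`) and its call sites. From those call sites (`MP::parallelFor({n}, closure)`, `partition.getRange()`, `MP::Partition(lut::Range(n))`), the interface presumably looks roughly like the sketch below; every name, signature, and the serial fallback here is inferred, not quoted from the PR.

// Hypothetical sketch of the MP facade declared in mp.h (not shown in this view).
#include <functional>

namespace lut {
// Minimal iterable stand-in for lut::Range as used above: indices [0, end).
class Range {
 public:
  struct Iterator {
    int value;
    int operator*() const { return value; }
    Iterator &operator++() { ++value; return *this; }
    bool operator!=(const Iterator &other) const { return value != other.value; }
  };

  Range(int end) : _end(end) {}
  Iterator begin() const { return Iterator{0}; }
  Iterator end() const { return Iterator{_end}; }

 private:
  int _end;
};
}  // namespace lut

namespace libllm {
class MP {
 public:
  // One contiguous slice of loop indices handed to a single worker.
  class Partition {
   public:
    explicit Partition(lut::Range range) : _range(range) {}
    lut::Range getRange() const { return _range; }

   private:
    lut::Range _range;
  };

  // Serial stand-in. The real backends split `range` across OpenMP threads
  // (mp_openmp.cc) or lut::ThreadPool workers (mp_thread_pool.cc).
  static void parallelFor(lut::Range range, std::function<void(Partition)> closure) {
    closure(Partition(range));
  }
};
}  // namespace libllm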
24 changes: 20 additions & 4 deletions src/libllm/benchmark_main.cc
@@ -164,11 +164,10 @@ void benchmarkLlama(std::shared_ptr<llama::LlamaModel> model, int ctxLength, DTy
tokenPerSec);
}

int benchmarkMain() {
int benchmarkMain(Device device) {
CHECK(llmInit(LLM_API_VERSION) == LLM_OK);

LlamaType llamaType = LlamaType::Llama2_7B;
Device device = libllm::Device::getCuda();
DType weightType = libllm::DType::kQInt4x32;

LOG(INFO) << "initializing model ...";
@@ -189,6 +188,23 @@ int benchmarkMain() {
} // namespace libllm

int main(int argc, char **argv) {
libllm::benchmarkMain();
return 0;
const char *usage =
"Command line interface for benchmarking libllm.\n"
"Usage: benchmark [-d (cpu|cuda)]";

std::string deviceType = "cuda";
lut::Flags flags(usage);
flags.define("-d", &deviceType, "device of the model. (cpu|cuda)");
flags.parse(argc, argv);

if (deviceType == "cpu") {
libllm::benchmarkMain(libllm::Device::getCpu());
return 0;
} else if (deviceType == "cuda") {
libllm::benchmarkMain(libllm::Device::getCuda());
return 0;
} else {
fprintf(stderr, "unexpected device %s\n", deviceType.c_str());
return 1;
}
}
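With the new flag, the benchmark can presumably be pointed at either backend, e.g. `benchmark -d cpu` to exercise the CPU path (which, in a `WITH_OPENMP=OFF` build, goes through the new thread pool) or `benchmark -d cuda` for the previous default; the exact binary name depends on the build.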
3 changes: 0 additions & 3 deletions src/libllm/c_api.cc
@@ -1,4 +1,3 @@
#include <omp.h>
#include <string.h>

#include <atomic>
@@ -127,8 +126,6 @@ llmStatus_t llmInit(int32_t apiVersion) {
lut::setLogLevel(lut::LogSeverity::kINFO);
libllm::initOperators();

LOG(INFO) << "OMP max_threads = " << omp_get_max_threads();

return LLM_OK;
} catch (const lut::Error &e) {
gInitialized = false;
38 changes: 20 additions & 18 deletions src/libllm/cpu/binary_op.cc
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,8 +22,9 @@
#include "libllm/cpu/accessor.h"
#include "libllm/cpu/common.h"
#include "libllm/cpu/tensor.h"
#include "libllm/tensor.h"
#include "libllm/lut/attributes.h"
#include "libllm/mp.h"
#include "libllm/tensor.h"

namespace libllm {
namespace op {
@@ -44,22 +45,23 @@ Tensor binaryOpKernel(const Tensor &A, const Tensor &B, BinaryOp op) {
TensorList<T, 1> vC = TensorList<T, 1>::fromTensor(C);
CHECK(vA.getLength() == vB.getLength() && vC.getLength() == vB.getLength());

#pragma omp parallel for
for (int j = 0; j < vA.getLength(); ++j) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<const T, 1> b = vB.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);
MP::parallelFor({vA.getLength()}, [&vA, &vB, &vC, op](MP::Partition partition) {
for (int j : partition.getRange()) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<const T, 1> b = vB.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);

for (int i = 0; i < a.getShape(0); ++i) {
if (op == BinaryOp::ADD) {
c[i] = a[i] + b[i];
} else if (op == BinaryOp::MUL) {
c[i] = a[i] * b[i];
} else {
NOT_IMPL();
for (int i = 0; i < a.getShape(0); ++i) {
if (op == BinaryOp::ADD) {
c[i] = a[i] + b[i];
} else if (op == BinaryOp::MUL) {
c[i] = a[i] * b[i];
} else {
NOT_IMPL();
}
}
}
}
});

return C;
}
@@ -74,6 +76,6 @@ Tensor binaryOp(const Tensor &A, const Tensor &B, BinaryOp op) {
NOT_IMPL();
}

} // cpu
} // op
} // libllm
} // namespace cpu
} // namespace op
} // namespace libllm
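The same pragma-to-`MP::parallelFor` transformation recurs in `copy.cc` below. As a quick illustration of the call shape (using the hypothetical facade sketched earlier; none of this is quoted from the PR), a minimal self-contained use looks like:

#include <vector>

// Minimal usage sketch: parallel elementwise add in the same shape as
// binaryOpKernel above. Each index lands in exactly one partition, so the
// loop body needs no synchronization.
void parallelAdd(const std::vector<float> &a, const std::vector<float> &b,
                 std::vector<float> &c) {
  MP::parallelFor(lut::Range(static_cast<int>(a.size())), [&](MP::Partition partition) {
    for (int i : partition.getRange()) {
      c[i] = a[i] + b[i];
    }
  });
}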
22 changes: 12 additions & 10 deletions src/libllm/cpu/copy.cc
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,6 +22,7 @@
#include "libllm/cpu/accessor.h"
#include "libllm/cpu/common.h"
#include "libllm/cpu/tensor.h"
#include "libllm/mp.h"

namespace libllm {
namespace op {
Expand All @@ -33,13 +34,14 @@ void copyKernel(const Tensor &src, Tensor &dest) {
TensorList<T, 1> vC = TensorList<T, 1>::fromTensor(dest);
CHECK(vA.getLength() == vC.getLength());

#pragma omp parallel for
for (int j = 0; j < vA.getLength(); ++j) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);
MP::parallelFor({vA.getLength()}, [&vA, &vC](MP::Partition partition) {
for (int j : partition.getRange()) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);

copyVector(c, a);
}
copyVector(c, a);
}
});
}

void copy(const Tensor &src, Tensor &dest) {
@@ -56,6 +58,6 @@
}
}

} // cpu
} // op
} // libllm
} // namespace cpu
} // namespace op
} // namespace libllm
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/asimdhp_test.cc
@@ -20,7 +20,6 @@
#include "libllm/cpu/kernel/asimdhp.h"

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/avx2_test.cc
@@ -18,7 +18,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/avx512_test.cc
@@ -18,7 +18,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
71 changes: 32 additions & 39 deletions src/libllm/cpu/kernel/block.h
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,6 +22,7 @@
#include "libllm/cpu/kernel/abstract.h"
#include "libllm/lut/log.h"
#include "libllm/lut/time.h"
#include "libllm/mp.h"

namespace libllm {
namespace op {
@@ -37,9 +38,9 @@ struct Block {
int32_t numCols;
bool transposed;

constexpr Block<T> sliceRow(int row, int nr);
constexpr Block<T> sliceCol(int col, int nc);
constexpr Block<T> slice(int row, int col, int nr, int nc);
constexpr Block<T> sliceRow(int row, int nr) const;
constexpr Block<T> sliceCol(int col, int nc) const;
constexpr Block<T> slice(int row, int col, int nr, int nc) const;
inline void copyTo(Block<T> tgt);
constexpr Block<T> t();
constexpr void fillZero();
@@ -52,21 +53,28 @@ struct PackedBlock {
int32_t numRows;
int32_t numBlocks;

constexpr Block<T> block(int i);
constexpr Block<T> block(int i) const;
};

template<typename T, Mode MODE>
PackedBlock<T> Pack(Block<T> src, Block<T> buf, int pack_size) {
int numBlock = src.numCols / pack_size;
int kc = src.numRows;
PackedBlock<T> tgt { buf.data, pack_size, kc, numBlock };
PackedBlock<T> tgt{buf.data, pack_size, kc, numBlock};
CHECK(pack_size * numBlock * kc <= buf.numCols * buf.numRows);

#pragma omp parallel for if(MODE == Mode::OMP)
for (int b = 0; b < numBlock; ++b) {
Block<T> srcBlock = src.sliceCol(b * pack_size, pack_size);
Block<T> tgtBlock = tgt.block(b);
srcBlock.copyTo(tgtBlock);
auto closure = [src, tgt, pack_size](MP::Partition partition) {
for (int b : partition.getRange()) {
Block<T> srcBlock = src.sliceCol(b * pack_size, pack_size);
Block<T> tgtBlock = tgt.block(b);
srcBlock.copyTo(tgtBlock);
}
};

if (MODE == Mode::OMP) {
MP::parallelFor({numBlock}, closure);
} else {
closure(MP::Partition(lut::Range(numBlock)));
}

int nc = src.numCols % pack_size;
@@ -84,23 +92,22 @@ PackedBlock<T> Pack(Block<T> src, Block<T> buf, int pack_size) {
}

template<typename T>
constexpr Block<T> Block<T>::sliceRow(int row, int nr) {
constexpr Block<T> Block<T>::sliceRow(int row, int nr) const {
return slice(row, 0, nr, numCols);
}
template<typename T>
constexpr Block<T> Block<T>::sliceCol(int col, int nc) {
constexpr Block<T> Block<T>::sliceCol(int col, int nc) const {
return slice(0, col, numRows, nc);
}

template<typename T>
constexpr Block<T> Block<T>::slice(int row, int col, int nr, int nc) {
return Block {
data + (transposed ? row + col * stride : row * stride + col),
stride,
nr,
nc,
transposed
};
constexpr Block<T> Block<T>::slice(int row, int col, int nr, int nc) const {
return Block{
data + (transposed ? row + col * stride : row * stride + col),
stride,
nr,
nc,
transposed};
}

template<typename T>
@@ -135,21 +142,15 @@ inline void Block<T>::copyTo(Block<T> tgt) {
int srcOffset = c * stride;
int tgtOffset = c * tgt.stride;
for (int r = 0; r < numRows; ++r) {
tgt.data[r + tgtOffset] = data[r + srcOffset];
tgt.data[r + tgtOffset] = data[r + srcOffset];
}
}
}
}

template<typename T>
constexpr Block<T> Block<T>::t() {
return Block<T> {
data,
stride,
numCols,
numRows,
!transposed
};
return Block<T>{data, stride, numCols, numRows, !transposed};
}

template<typename T>
@@ -166,18 +167,10 @@ constexpr void Block<T>::fillZero() {
}

template<typename T>
constexpr Block<T> PackedBlock<T>::block(int i) {
return Block<T> {
data + packSize * numRows * i,
packSize,
numRows,
packSize,
false
};
constexpr Block<T> PackedBlock<T>::block(int i) const {
return Block<T>{data + packSize * numRows * i, packSize, numRows, packSize, false};
}



} // namespace kernel
} // namespace cpu
} // namespace op
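The headline class `lut::ThreadPool` lives in the new `lut/thread_pool.cc`, which this view does not display. For orientation, here is a minimal sketch of a pool that could back the non-OpenMP `MP::parallelFor` path; every name and detail below is an assumption, not the PR's code.

#include <algorithm>
#include <functional>
#include <thread>
#include <vector>

namespace lut {

// Minimal sketch only. For brevity this spawns threads per call; a production
// pool (like the one this PR presumably adds) keeps long-lived workers fed
// from a task queue instead.
class ThreadPool {
 public:
  explicit ThreadPool(int numThreads)
      : _numThreads(
            numThreads > 0
                ? numThreads
                : static_cast<int>(std::max(1u, std::thread::hardware_concurrency()))) {}

  // Split [0, n) into contiguous chunks and run fn(begin, end) on each chunk
  // from its own thread, then join. This matches the MP::Partition model:
  // each index is processed by exactly one worker.
  void parallelFor(int n, const std::function<void(int, int)> &fn) const {
    std::vector<std::thread> workers;
    int chunk = (n + _numThreads - 1) / _numThreads;
    for (int t = 0; t < _numThreads && t * chunk < n; ++t) {
      int begin = t * chunk;
      int end = std::min(n, begin + chunk);
      workers.emplace_back([&fn, begin, end] { fn(begin, end); });
    }
    for (std::thread &w : workers) w.join();
  }

 private:
  int _numThreads;
};

}  // namespace lut

Note that `Pack` above uses the same one-partition fallback idea: when `MODE != Mode::OMP`, the closure runs serially as `closure(MP::Partition(lut::Range(numBlock)))` rather than being fanned out across workers.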