Add class lut::ThreadPool and make OpenMP optional #72

Merged · 4 commits · Jun 18, 2024
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -4,6 +4,7 @@ include(CheckTypeSize)

option(WITH_CUDA "CUDA compute support." OFF)
option(WITH_MKL "MKL support (only for reference)." OFF)
option(WITH_OPENMP "Build with OpenMP." ON)
option(WITH_CUTLASS "build MatMul operators with CUTLASS." OFF)
option(MKL_PREFIX "Prefix for MKL headers and libraries." "/opt/intel/mkl")

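With this option, OpenMP becomes optional rather than required: configuring with `cmake -DWITH_OPENMP=OFF` should presumably fall back to the new `lut::ThreadPool` backend (the exact configure invocation depends on your build setup).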
24 changes: 16 additions & 8 deletions src/libllm/CMakeLists.txt
@@ -1,5 +1,4 @@
find_package(OpenMP REQUIRED)

find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@@ -18,6 +17,7 @@ set(lut_SOURCES
"lut/reader.cc"
"lut/strings.cc"
"lut/time.cc"
"lut/thread_pool.cc"
"lut/zip_file.cc")

set(libllm_SOURCES
@@ -56,6 +56,7 @@ set(libllm_SOURCES
"llama.cc"
"model_for_generation.cc"
"module.cc"
"mp.cc"
"operators.cc"
"qwen.cc"
"sampler.cc"
@@ -86,7 +87,19 @@ set(llm_SOURCES

set(benchmark_SOURCES "benchmark_main.cc")

set(libllm_INCDIR ".." "../../third_party" ${OpenMP_CXX_INCLUDE_DIRS})
set(libllm_INCDIR ".." "../../third_party")
set(libllm_LIBADD lut ${CMAKE_DL_LIBS})

if(WITH_OPENMP)
if(NOT OPENMP_FOUND)
message(FATAL_ERROR "WITH_OPENMP=ON but OpenMP package not found.")
endif()
set(libllm_SOURCES ${libllm_SOURCES} "mp_openmp.cc")
set(libllm_INCDIR ${libllm_INCDIR} ${OpenMP_CXX_INCLUDE_DIRS})
set(libllm_LIBADD ${libllm_LIBADD} OpenMP::OpenMP_CXX)
else()
set(libllm_SOURCES ${libllm_SOURCES} "mp_thread_pool.cc")
endif()

if (WITH_CUDA)
set(libllm_INCDIR ${libllm_INCDIR} ${CUDAToolkit_INCLUDE_DIRS})
@@ -188,11 +201,6 @@ add_library(lut STATIC ${lut_SOURCES})
set_target_properties(lut PROPERTIES CXX_VISIBILITY_PRESET hidden)
target_include_directories(lut PRIVATE ${libllm_INCDIR})

set(libllm_LIBADD
lut
${CMAKE_DL_LIBS}
OpenMP::OpenMP_CXX)

add_library(libllm_static OBJECT ${libllm_SOURCES})
target_compile_options(libllm_static PRIVATE "-DLIBLLM_EXPORTS")
set_target_properties(libllm_static PROPERTIES CXX_VISIBILITY_PRESET hidden)
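The `mp.h` facade itself is not visible in this diff view, only its two backends (`mp_openmp.cc`, `mp_thread_pool.cc`) and its call sites. From those call sites (`MP::parallelFor({n}, closure)`, `partition.getRange()`, `MP::Partition(lut::Range(n))`), the interface presumably looks roughly like the sketch below; every name, signature, and the serial fallback here is inferred, not quoted from the PR.

// Hypothetical sketch of the MP facade declared in mp.h (not shown in this view).
#include <functional>

namespace lut {
// Minimal iterable stand-in for lut::Range as used above: indices [0, end).
class Range {
 public:
  struct Iterator {
    int value;
    int operator*() const { return value; }
    Iterator &operator++() { ++value; return *this; }
    bool operator!=(const Iterator &other) const { return value != other.value; }
  };

  Range(int end) : _end(end) {}
  Iterator begin() const { return Iterator{0}; }
  Iterator end() const { return Iterator{_end}; }

 private:
  int _end;
};
}  // namespace lut

namespace libllm {
class MP {
 public:
  // One contiguous slice of loop indices handed to a single worker.
  class Partition {
   public:
    explicit Partition(lut::Range range) : _range(range) {}
    lut::Range getRange() const { return _range; }

   private:
    lut::Range _range;
  };

  // Serial stand-in. The real backends split `range` across OpenMP threads
  // (mp_openmp.cc) or lut::ThreadPool workers (mp_thread_pool.cc).
  static void parallelFor(lut::Range range, std::function<void(Partition)> closure) {
    closure(Partition(range));
  }
};
}  // namespace libllm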
24 changes: 20 additions & 4 deletions src/libllm/benchmark_main.cc
@@ -164,11 +164,10 @@ void benchmarkLlama(std::shared_ptr<llama::LlamaModel> model, int ctxLength, DTy
tokenPerSec);
}

int benchmarkMain() {
int benchmarkMain(Device device) {
CHECK(llmInit(LLM_API_VERSION) == LLM_OK);

LlamaType llamaType = LlamaType::Llama2_7B;
Device device = libllm::Device::getCuda();
DType weightType = libllm::DType::kQInt4x32;

LOG(INFO) << "initializing model ...";
@@ -189,6 +188,23 @@ int benchmarkMain() {
} // namespace libllm

int main(int argc, char **argv) {
libllm::benchmarkMain();
return 0;
const char *usage =
"Command line interface for benchmarking libllm.\n"
"Usage: benchmark [-d (cpu|cuda)]";

std::string deviceType = "cuda";
lut::Flags flags(usage);
flags.define("-d", &deviceType, "device of the model. (cpu|cuda)");
flags.parse(argc, argv);

if (deviceType == "cpu") {
libllm::benchmarkMain(libllm::Device::getCpu());
return 0;
} else if (deviceType == "cuda") {
libllm::benchmarkMain(libllm::Device::getCuda());
return 0;
} else {
fprintf(stderr, "unexpected device %s\n", deviceType.c_str());
return 1;
}
}
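With the new flag, the benchmark can presumably be pointed at either backend, e.g. `benchmark -d cpu` to exercise the CPU path (which, in a `WITH_OPENMP=OFF` build, goes through the new thread pool) or `benchmark -d cuda` for the previous default; the exact binary name depends on the build.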
3 changes: 0 additions & 3 deletions src/libllm/c_api.cc
@@ -1,4 +1,3 @@
#include <omp.h>
#include <string.h>

#include <atomic>
@@ -127,8 +126,6 @@ llmStatus_t llmInit(int32_t apiVersion) {
lut::setLogLevel(lut::LogSeverity::kINFO);
libllm::initOperators();

LOG(INFO) << "OMP max_threads = " << omp_get_max_threads();

return LLM_OK;
} catch (const lut::Error &e) {
gInitialized = false;
38 changes: 20 additions & 18 deletions src/libllm/cpu/binary_op.cc
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,8 +22,9 @@
#include "libllm/cpu/accessor.h"
#include "libllm/cpu/common.h"
#include "libllm/cpu/tensor.h"
#include "libllm/tensor.h"
#include "libllm/lut/attributes.h"
#include "libllm/mp.h"
#include "libllm/tensor.h"

namespace libllm {
namespace op {
@@ -44,22 +45,23 @@ Tensor binaryOpKernel(const Tensor &A, const Tensor &B, BinaryOp op) {
TensorList<T, 1> vC = TensorList<T, 1>::fromTensor(C);
CHECK(vA.getLength() == vB.getLength() && vC.getLength() == vB.getLength());

#pragma omp parallel for
for (int j = 0; j < vA.getLength(); ++j) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<const T, 1> b = vB.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);
MP::parallelFor({vA.getLength()}, [&vA, &vB, &vC, op](MP::Partition partition) {
for (int j : partition.getRange()) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<const T, 1> b = vB.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);

for (int i = 0; i < a.getShape(0); ++i) {
if (op == BinaryOp::ADD) {
c[i] = a[i] + b[i];
} else if (op == BinaryOp::MUL) {
c[i] = a[i] * b[i];
} else {
NOT_IMPL();
for (int i = 0; i < a.getShape(0); ++i) {
if (op == BinaryOp::ADD) {
c[i] = a[i] + b[i];
} else if (op == BinaryOp::MUL) {
c[i] = a[i] * b[i];
} else {
NOT_IMPL();
}
}
}
}
});

return C;
}
@@ -74,6 +76,6 @@ Tensor binaryOp(const Tensor &A, const Tensor &B, BinaryOp op) {
NOT_IMPL();
}

} // cpu
} // op
} // libllm
} // namespace cpu
} // namespace op
} // namespace libllm
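The same pragma-to-`MP::parallelFor` transformation recurs in `copy.cc` below. As a quick illustration of the call shape (using the hypothetical facade sketched earlier; none of this is quoted from the PR), a minimal self-contained use looks like:

#include <vector>

// Minimal usage sketch: parallel elementwise add in the same shape as
// binaryOpKernel above. Each index lands in exactly one partition, so the
// loop body needs no synchronization.
void parallelAdd(const std::vector<float> &a, const std::vector<float> &b,
                 std::vector<float> &c) {
  MP::parallelFor(lut::Range(static_cast<int>(a.size())), [&](MP::Partition partition) {
    for (int i : partition.getRange()) {
      c[i] = a[i] + b[i];
    }
  });
}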
22 changes: 12 additions & 10 deletions src/libllm/cpu/copy.cc
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,6 +22,7 @@
#include "libllm/cpu/accessor.h"
#include "libllm/cpu/common.h"
#include "libllm/cpu/tensor.h"
#include "libllm/mp.h"

namespace libllm {
namespace op {
Expand All @@ -33,13 +34,14 @@ void copyKernel(const Tensor &src, Tensor &dest) {
TensorList<T, 1> vC = TensorList<T, 1>::fromTensor(dest);
CHECK(vA.getLength() == vC.getLength());

#pragma omp parallel for
for (int j = 0; j < vA.getLength(); ++j) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);
MP::parallelFor({vA.getLength()}, [&vA, &vC](MP::Partition partition) {
for (int j : partition.getRange()) {
TensorAccessor<const T, 1> a = vA.getTensor(j);
TensorAccessor<T, 1> c = vC.getTensor(j);

copyVector(c, a);
}
copyVector(c, a);
}
});
}

void copy(const Tensor &src, Tensor &dest) {
@@ -56,6 +58,6 @@
}
}

} // cpu
} // op
} // libllm
} // namespace cpu
} // namespace op
} // namespace libllm
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/asimdhp_test.cc
@@ -20,7 +20,6 @@
#include "libllm/cpu/kernel/asimdhp.h"

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/avx2_test.cc
@@ -18,7 +18,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
1 change: 0 additions & 1 deletion src/libllm/cpu/kernel/avx512_test.cc
@@ -18,7 +18,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <math.h>
#include <omp.h>

#include "catch2/catch_amalgamated.hpp"
#include "libllm/cpu/kernel/abstract.h"
71 changes: 32 additions & 39 deletions src/libllm/cpu/kernel/block.h
@@ -7,7 +7,7 @@
// restriction, including without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
@@ -22,6 +22,7 @@
#include "libllm/cpu/kernel/abstract.h"
#include "libllm/lut/log.h"
#include "libllm/lut/time.h"
#include "libllm/mp.h"

namespace libllm {
namespace op {
@@ -37,9 +38,9 @@ struct Block {
int32_t numCols;
bool transposed;

constexpr Block<T> sliceRow(int row, int nr);
constexpr Block<T> sliceCol(int col, int nc);
constexpr Block<T> slice(int row, int col, int nr, int nc);
constexpr Block<T> sliceRow(int row, int nr) const;
constexpr Block<T> sliceCol(int col, int nc) const;
constexpr Block<T> slice(int row, int col, int nr, int nc) const;
inline void copyTo(Block<T> tgt);
constexpr Block<T> t();
constexpr void fillZero();
@@ -52,21 +53,28 @@ struct PackedBlock {
int32_t numRows;
int32_t numBlocks;

constexpr Block<T> block(int i);
constexpr Block<T> block(int i) const;
};

template<typename T, Mode MODE>
PackedBlock<T> Pack(Block<T> src, Block<T> buf, int pack_size) {
int numBlock = src.numCols / pack_size;
int kc = src.numRows;
PackedBlock<T> tgt { buf.data, pack_size, kc, numBlock };
PackedBlock<T> tgt{buf.data, pack_size, kc, numBlock};
CHECK(pack_size * numBlock * kc <= buf.numCols * buf.numRows);

#pragma omp parallel for if(MODE == Mode::OMP)
for (int b = 0; b < numBlock; ++b) {
Block<T> srcBlock = src.sliceCol(b * pack_size, pack_size);
Block<T> tgtBlock = tgt.block(b);
srcBlock.copyTo(tgtBlock);
auto closure = [src, tgt, pack_size](MP::Partition partition) {
for (int b : partition.getRange()) {
Block<T> srcBlock = src.sliceCol(b * pack_size, pack_size);
Block<T> tgtBlock = tgt.block(b);
srcBlock.copyTo(tgtBlock);
}
};

if (MODE == Mode::OMP) {
MP::parallelFor({numBlock}, closure);
} else {
closure(MP::Partition(lut::Range(numBlock)));
}

int nc = src.numCols % pack_size;
@@ -84,23 +92,22 @@ PackedBlock<T> Pack(Block<T> src, Block<T> buf, int pack_size) {
}

template<typename T>
constexpr Block<T> Block<T>::sliceRow(int row, int nr) {
constexpr Block<T> Block<T>::sliceRow(int row, int nr) const {
return slice(row, 0, nr, numCols);
}
template<typename T>
constexpr Block<T> Block<T>::sliceCol(int col, int nc) {
constexpr Block<T> Block<T>::sliceCol(int col, int nc) const {
return slice(0, col, numRows, nc);
}

template<typename T>
constexpr Block<T> Block<T>::slice(int row, int col, int nr, int nc) {
return Block {
data + (transposed ? row + col * stride : row * stride + col),
stride,
nr,
nc,
transposed
};
constexpr Block<T> Block<T>::slice(int row, int col, int nr, int nc) const {
return Block{
data + (transposed ? row + col * stride : row * stride + col),
stride,
nr,
nc,
transposed};
}

template<typename T>
@@ -135,21 +142,15 @@ inline void Block<T>::copyTo(Block<T> tgt) {
int srcOffset = c * stride;
int tgtOffset = c * tgt.stride;
for (int r = 0; r < numRows; ++r) {
tgt.data[r + tgtOffset] = data[r + srcOffset];
tgt.data[r + tgtOffset] = data[r + srcOffset];
}
}
}
}

template<typename T>
constexpr Block<T> Block<T>::t() {
return Block<T> {
data,
stride,
numCols,
numRows,
!transposed
};
return Block<T>{data, stride, numCols, numRows, !transposed};
}

template<typename T>
@@ -166,18 +167,10 @@ constexpr void Block<T>::fillZero() {
}

template<typename T>
constexpr Block<T> PackedBlock<T>::block(int i) {
return Block<T> {
data + packSize * numRows * i,
packSize,
numRows,
packSize,
false
};
constexpr Block<T> PackedBlock<T>::block(int i) const {
return Block<T>{data + packSize * numRows * i, packSize, numRows, packSize, false};
}



} // namespace kernel
} // namespace cpu
} // namespace op
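The headline class `lut::ThreadPool` lives in the new `lut/thread_pool.cc`, which this view does not display. For orientation, here is a minimal sketch of a pool that could back the non-OpenMP `MP::parallelFor` path; every name and detail below is an assumption, not the PR's code.

#include <algorithm>
#include <functional>
#include <thread>
#include <vector>

namespace lut {

// Minimal sketch only. For brevity this spawns threads per call; a production
// pool (like the one this PR presumably adds) keeps long-lived workers fed
// from a task queue instead.
class ThreadPool {
 public:
  explicit ThreadPool(int numThreads)
      : _numThreads(
            numThreads > 0
                ? numThreads
                : static_cast<int>(std::max(1u, std::thread::hardware_concurrency()))) {}

  // Split [0, n) into contiguous chunks and run fn(begin, end) on each chunk
  // from its own thread, then join. This matches the MP::Partition model:
  // each index is processed by exactly one worker.
  void parallelFor(int n, const std::function<void(int, int)> &fn) const {
    std::vector<std::thread> workers;
    int chunk = (n + _numThreads - 1) / _numThreads;
    for (int t = 0; t < _numThreads && t * chunk < n; ++t) {
      int begin = t * chunk;
      int end = std::min(n, begin + chunk);
      workers.emplace_back([&fn, begin, end] { fn(begin, end); });
    }
    for (std::thread &w : workers) w.join();
  }

 private:
  int _numThreads;
};

}  // namespace lut

Note that `Pack` above uses the same one-partition fallback idea: when `MODE != Mode::OMP`, the closure runs serially as `closure(MP::Partition(lut::Range(numBlock)))` rather than being fanned out across workers.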