CUDA: fixed mmq build issues
JohannesGaessler committed Jul 30, 2023
commit 305b304 (1 parent: 11f3ca0)
Showing 3 changed files with 19 additions and 7 deletions.
CMakeLists.txt: 8 changes (6 additions, 2 deletions)
@@ -277,10 +277,14 @@ if (LLAMA_CUBLAS)
     endif()
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
         if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
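The default list above can always be overridden at configure time (e.g. `cmake .. -DCMAKE_CUDA_ARCHITECTURES=86`). To find the right value for your hardware, a minimal standalone sketch (not part of this commit; the file name is hypothetical) that prints each device's compute capability in the same 100*major + 10*minor encoding that ggml_init_cublas uses below; a CMake architecture of 70 corresponds to cc 7.0:

```cpp
// list_cc.cu -- build with: nvcc list_cc.cu -o list_cc
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // Same encoding as ggml_init_cublas: 100*major + 10*minor,
        // e.g. a cc 7.0 (Volta) device reports 700.
        std::printf("device %d: %s, cc %d\n", id, prop.name,
                    100 * prop.major + 10 * prop.minor);
    }
    return 0;
}
```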
examples/perplexity/CMakeLists.txt: 3 changes (3 additions, 0 deletions)
@@ -6,3 +6,6 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
   add_dependencies(${TARGET} BUILD_INFO)
 endif()
+if(LLAMA_CUBLAS AND CMAKE_BUILD_TYPE STREQUAL "Release")
+  add_compile_definitions(GGML_CUDA_CUBLAS) # DOES NOT WORK
+endif()
ggml-cuda.cu: 15 changes (10 additions, 5 deletions)
@@ -3536,9 +3536,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -3561,9 +3559,7 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
-#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -3916,6 +3912,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
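The added `(void) g_compute_capabilities[0];` lines exist because the array is now defined unconditionally: on the GGML_CUDA_FORCE_DMMV and GGML_CUDA_CUBLAS paths nothing reads it, and the cast to void keeps compilers from flagging it as unused. A minimal sketch of the idiom, with hypothetical names (FORCE_FALLBACK and g_table are not from the repo):

```cpp
// A non-constant file-scope static that one build path never reads triggers
// -Wunused-variable; casting an element to void marks the array as used.
static int g_table[4] = {0};

static bool pick_fast_path() {
#ifdef FORCE_FALLBACK
    (void) g_table[0];  // referenced only so the array counts as used
    return false;
#else
    return g_table[0] > 0;
#endif
}

int main() { return pick_fast_path() ? 0 : 1; }
```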
@@ -4659,8 +4656,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
 #ifdef GGML_CUDA_CUBLAS
         const bool use_mul_mat_q = false;
+        (void) g_compute_capabilities[0];
 #else
-        const bool use_mul_mat_q = ggml_is_quantized(src0->type);
+        int min_compute_capability = 1000000;
+        for (int id = 0; id < g_device_count; ++id) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+        }
+
+        const bool use_mul_mat_q = ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A;
 #endif // GGML_CUDA_CUBLAS
         if (use_mul_mat_q) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
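Taken together, the ggml-cuda.cu changes make the quantized mul_mat_q path depend on the weakest GPU present: mul_mat_q relies on the __dp4a integer intrinsic, which requires compute capability 6.1 (MIN_CC_DP4A is defined as 610 in ggml-cuda.cu). A condensed, self-contained sketch of that selection logic; the globals and the use_mul_mat_q helper here are illustrative stand-ins, not the repo's API:

```cpp
#include <algorithm>
#include <cstdio>

// Stand-ins for the globals that ggml_init_cublas fills in at startup.
#define GGML_CUDA_MAX_DEVICES 16
#define MIN_CC_DP4A 610  // __dp4a (byte-wise dot product) needs cc >= 6.1

static int g_device_count = 2;
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES] = {700, 520};

static bool use_mul_mat_q(bool src0_is_quantized) {
    // The weakest device decides: every GPU that may run the kernel must
    // support the integer intrinsics that mul_mat_q relies on.
    int min_compute_capability = 1000000;
    for (int id = 0; id < g_device_count; ++id) {
        min_compute_capability = std::min(min_compute_capability,
                                          g_compute_capabilities[id]);
    }
    return src0_is_quantized && min_compute_capability >= MIN_CC_DP4A;
}

int main() {
    // With a cc 5.2 card in the mix, the quantized kernels are skipped.
    std::printf("use_mul_mat_q: %d\n", use_mul_mat_q(true));  // prints 0
    return 0;
}
```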
