Skip to content

Commit c9cfe0e

Browse files
authored
Merge branch 'ggerganov:master' into master
2 parents e05bd2f + 3a9cb4c commit c9cfe0e

File tree

3 files changed

+9
-9
lines changed

3 files changed

+9
-9
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ variety of hardware - locally and in the cloud.
6161
- Plain C/C++ implementation without any dependencies
6262
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
6363
- AVX, AVX2 and AVX512 support for x86 architectures
64-
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
64+
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
6565
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
6666
- Vulkan, SYCL, and (partial) OpenCL backend support
6767
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

ggml-cuda.cu

+4-4
Original file line numberDiff line numberDiff line change
@@ -6205,7 +6205,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
62056205
const int ix = rowx*ncols + col;
62066206
const int iy = rowy*ncols + col;
62076207

6208-
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
6208+
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
62096209

62106210
vals[col] = val;
62116211
max_val = max(max_val, val);
@@ -9170,17 +9170,17 @@ static void ggml_cuda_op_soft_max(
91709170
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
91719171

91729172
// positions tensor
9173-
float * src2_dd = dst_dd; // default to avoid null checks in the kernel
9173+
float * src2_dd = nullptr;
91749174
cuda_pool_alloc<float> src2_f;
91759175

91769176
ggml_tensor * src2 = dst->src[2];
91779177
const bool use_src2 = src2 != nullptr;
91789178

91799179
if (use_src2) {
9180-
const bool src2_on_device = use_src2 && src2->backend == GGML_BACKEND_GPU;
9181-
ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
9180+
const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
91829181

91839182
if (src2_on_device) {
9183+
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
91849184
src2_dd = (float *) src2_extra->data_device[g_main_device];
91859185
} else {
91869186
src2_dd = src2_f.alloc(ggml_nelements(src2));

ggml-metal.metal

+4-4
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
392392
float lmax = -INFINITY;
393393

394394
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
395+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
396396
}
397397

398398
// find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
417417
// parallel sum
418418
float lsum = 0.0f;
419419
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
420+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
421421
lsum += exp_psrc0;
422422
pdst[i00] = exp_psrc0;
423423
}
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
495495
float4 lmax4 = -INFINITY;
496496

497497
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
498+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
499499
}
500500

501501
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
521521
// parallel sum
522522
float4 lsum4 = 0.0f;
523523
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
524+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
525525
lsum4 += exp_psrc4;
526526
pdst4[i00] = exp_psrc4;
527527
}

0 commit comments

Comments
 (0)