ggml: replace conv 1D - 2D stage_0 and stage_1 with im2col and mul_mat #564
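This PR drops the internal two-stage convolution ops (GGML_OP_CONV_1D_STAGE_0/1 and GGML_OP_CONV_2D_STAGE_0/1) in favor of a single GGML_OP_IM2COL followed by an ordinary ggml_mul_mat: im2col unrolls each receptive field of the input into a row of an F16 matrix, so the convolution itself becomes one large matrix multiplication that reuses the existing mul_mat backends (CPU, CUDA and, by the end of the PR, Metal).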

Merged · 31 commits · Nov 12, 2023

Changes from 9 commits

Commits
3c50e46 added conv2d stage 0 - 1 cuda kernels (FSSRepo, Oct 9, 2023)
27b3ab3 add im2col + refactor conv1d and conv2d (FSSRepo, Oct 9, 2023)
d5c329b fix params invalid index (FSSRepo, Oct 9, 2023)
574735c add conv1d and conv2d unit tests (FSSRepo, Oct 9, 2023)
2358d15 resolving wrong values and fix mul_mat validation (FSSRepo, Oct 9, 2023)
ca56f51 improve tests + reduce code duplication (FSSRepo, Oct 10, 2023)
15ceadb add cuda kernels (FSSRepo, Oct 10, 2023)
872cc04 more data test (FSSRepo, Oct 10, 2023)
bb340dc fix ggml_op_count to 70 (FSSRepo, Oct 10, 2023)
419b4b8 add temp test - gemm != mul_mat (FSSRepo, Oct 10, 2023)
af312e4 tests : fix test-mul-mat matrix multiplication (ggerganov, Oct 11, 2023)
c692f61 test-mul-mat match gemm == ggml_mul_mat with conv2d op (FSSRepo, Oct 14, 2023)
3dad5e6 replaced gemm by ggml_mul_mat (FSSRepo, Oct 14, 2023)
fde8828 ggml_mul_mat cpu backend support fp16 src1 (FSSRepo, Oct 14, 2023)
5377678 ggml_mul_mat cuda backend fp16 fixed (FSSRepo, Oct 14, 2023)
79af905 Merge branch 'ggerganov:master' into master (FSSRepo, Oct 15, 2023)
6b42245 remove unnecessary ggml_cont and removed conv1d-2d functions deprecated (FSSRepo, Oct 15, 2023)
d734040 some fixes (FSSRepo, Oct 15, 2023)
d47ae58 Merge branch 'ggerganov:master' into master (FSSRepo, Oct 15, 2023)
d8539f3 explain conv1d reshapes (FSSRepo, Oct 16, 2023)
53f805e ggml : fix tests on Arm + do not use BLAS for F16 data (ggerganov, Oct 16, 2023)
3b9022a tests : fix FP16 handling on Arm (ggerganov, Oct 16, 2023)
7193df2 ggml : avoid ggml_cont and ggml_transpose in ggml_conv_xd (ggerganov, Oct 16, 2023)
c4c0265 Merge branch 'ggerganov:master' into master (FSSRepo, Oct 22, 2023)
7a4544b Merge branch 'ggerganov:master' into master (FSSRepo, Oct 25, 2023)
e0bbb9f Merge branch 'master' into HEAD (ggerganov, Nov 10, 2023)
f1879c0 ci : switch back to release (ggerganov, Nov 10, 2023)
439a79f cuda : fix wrong pointer usage (ggerganov, Nov 10, 2023)
a729f6b ggml : add metal support for im2col and f16xf16 mul mat (ggerganov, Nov 11, 2023)
406cbc1 ggml : im2col opts (ggerganov, Nov 11, 2023)
da25cf0 Update src/ggml-cuda.cu (ggerganov, Nov 11, 2023)
17 changes: 13 additions & 4 deletions include/ggml/ggml.h
@@ -400,12 +400,9 @@ extern "C" {
     GGML_OP_ALIBI,
     GGML_OP_CLAMP,
     GGML_OP_CONV_1D,
-    GGML_OP_CONV_1D_STAGE_0, // internal
-    GGML_OP_CONV_1D_STAGE_1, // internal
     GGML_OP_CONV_TRANSPOSE_1D,
     GGML_OP_CONV_2D,
-    GGML_OP_CONV_2D_STAGE_0, // internal
-    GGML_OP_CONV_2D_STAGE_1, // internal
+    GGML_OP_IM2COL,
     GGML_OP_CONV_TRANSPOSE_2D,
     GGML_OP_POOL_1D,
     GGML_OP_POOL_2D,
@@ -1376,6 +1373,18 @@ extern "C" {
            float                 min,
            float                 max);

    GGML_API struct ggml_tensor * ggml_im2col(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            int                   s0,
            int                   s1,
            int                   p0,
            int                   p1,
            int                   d0,
            int                   d1,
            bool                  is_2D);

    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
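With this declaration in place, ggml_conv_1d and ggml_conv_2d reduce to im2col followed by a single ggml_mul_mat. A sketch of the 2D case, closely following what ggml_conv_2d does internally after this PR (the wrapper name conv2d_via_im2col is mine; shapes are listed fastest dimension first):

```cpp
// conv2d as im2col + one matrix multiplication (sketch).
// a: kernel [KW, KH, IC, OC] (F16), b: input [IW, IH, IC, N] (F32).
struct ggml_tensor * conv2d_via_im2col(
        struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    // [IC*KH*KW, OW, OH, N]: each slot along ne[0] is one unrolled input patch
    struct ggml_tensor * col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true);

    // flatten both operands to 2D and contract over the shared IC*KH*KW axis
    struct ggml_tensor * out = ggml_mul_mat(ctx,
            ggml_reshape_2d(ctx, col, col->ne[0], col->ne[1]*col->ne[2]*col->ne[3]),
            ggml_reshape_2d(ctx, a,   a->ne[0]*a->ne[1]*a->ne[2], a->ne[3]));

    // back to ne = [OW, OH, OC, N]
    return ggml_reshape_4d(ctx, out, col->ne[1], col->ne[2], a->ne[3], col->ne[3]);
}
```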
111 changes: 110 additions & 1 deletion src/ggml-cuda.cu
@@ -4585,6 +4585,28 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
    dst[i] = scale * x[i];
}

static __global__ void gemm_f16_f32(const half * x, const half * y, float * dst, int N, int M, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < K) {
        float sum = 0.0f;
        for (int i = 0; i < M; ++i) {
            sum += __half2float(x[row * M + i]) * __half2float(y[col * M + i]);
        }
        dst[row * K + col] = sum;
    }
}
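For orientation: with the launch configuration in gemm_f16_f32_cuda below, each thread computes one output element, x is read as an N x M row-major matrix and y as a K x M row-major matrix, so the kernel effectively computes dst = x * y^T. A minimal CPU reference of the same contraction, handy for unit-testing the kernel (the _ref name is mine, not from the PR):

```cpp
#include <cuda_fp16.h>

// CPU reference for gemm_f16_f32 above: dst (N x K) = x (N x M) * y (K x M)^T.
static void gemm_f16_f32_ref(const half * x, const half * y, float * dst, int N, int M, int K) {
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < K; ++col) {
            float sum = 0.0f;
            for (int i = 0; i < M; ++i) {
                sum += __half2float(x[row * M + i]) * __half2float(y[col * M + i]);
            }
            dst[row * K + col] = sum;
        }
    }
}
```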

static __global__ void im2col_f32_f16(const float * x, half * dst, int ofs0, int ofs1, int IW, int IH, int CHW, int s0, int s1, int p0, int p1, int d0, int d1) {
    int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
    int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
    __syncthreads();
    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
        int offset_dst = (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW;
        int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
        dst[offset_dst + (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z)] = __float2half(x[offset_src + iih * IW + iiw]);
    }
}
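The indexing is easier to follow with the launch geometry of im2col_f32_f16_cuda below substituted in: grid = (IC, OH, OW) and block = (N, KH, KW), which incidentally caps N * KH * KW at CUDA's 1024-threads-per-block limit. Two other observations: the __syncthreads() appears to serve no purpose (the kernel uses no shared memory), and padded positions are skipped rather than written as zero, which only yields correct padding if the destination buffer is zero-filled beforehand (later revisions of this kernel store an explicit 0 instead). A sequential reconstruction of the same indexing (my sketch, not code from the PR):

```cpp
#include <cuda_fp16.h>

// Sequential form of im2col_f32_f16 with the launch geometry inlined:
// grid = (IC, OH, OW), block = (N, KH, KW); dst layout [N, OH*OW, IC*KH*KW].
static void im2col_f32_f16_ref(const float * x, half * dst,
        int N, int IC, int IH, int IW, int OH, int OW, int KH, int KW,
        int ofs0, int ofs1, int s0, int s1, int p0, int p1, int d0, int d1) {
    const int CHW = IC * KH * KW;
    for (int ic = 0; ic < IC; ++ic)         // blockIdx.x
    for (int oh = 0; oh < OH; ++oh)         // blockIdx.y
    for (int ow = 0; ow < OW; ++ow)         // blockIdx.z
    for (int n  = 0; n  < N;  ++n)          // threadIdx.x
    for (int kh = 0; kh < KH; ++kh)         // threadIdx.y
    for (int kw = 0; kw < KW; ++kw) {       // threadIdx.z
        const int iiw = ow * s0 + kw * d0 - p0;
        const int iih = oh * s1 + kh * d1 - p1;
        if (iih >= 0 && iih < IH && iiw >= 0 && iiw < IW) {
            const int offset_dst = (n * OH * OW + oh * OW + ow) * CHW;
            dst[offset_dst + ic * KH * KW + kh * KW + kw] =
                __float2half(x[n * ofs0 + ic * ofs1 + iih * IW + iiw]);
        }
    }
}
```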

template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
@@ -5534,6 +5556,25 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
}

static void im2col_f32_f16_cuda(const float * x, half * dst,
        int OH, int IW, int IH,
        int OW, int IC,
        int KH, int KW, int N, int ofs0, int ofs1,
        int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
    dim3 block_nums(IC, OH, OW);
    dim3 block_dims(N, KH, KW);
    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}

// GEMM
static void gemm_f16_f32_cuda(const half * x, const half * y, float * dst, int m, int n, int k, int N, cudaStream_t stream) {
    for (int i = 0; i < N; i++) {
        dim3 block_dims(16, 16);
        dim3 block_nums((n + block_dims.x - 1) / block_dims.x, (m + block_dims.y - 1) / block_dims.y);
        gemm_f16_f32<<<block_nums, block_dims, 0, stream>>>(x, y + i * m * k, dst + i * m * n, m, k, n);
    }
}
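Batching is handled on the host: one kernel launch per batch element, with x (the weight matrix) shared across all of them. The per-slice offset y + i * m * k looks suspect, since each batch slice of y holds n * k elements (i * n * k would be expected); the later commit 439a79f, "cuda : fix wrong pointer usage", plausibly addresses this. A small worked example of the grid arithmetic (illustrative sizes of my choosing):

```cpp
#include <cstdio>

// Worked example of the launch math above: m = OC = 64 output rows,
// n = OH*OW = 900 output columns, 16x16 thread blocks.
int main() {
    const int m = 64, n = 900;
    const int bx = 16, by = 16;           // block_dims
    const int gx = (n + bx - 1) / bx;     // 57 blocks along columns
    const int gy = (m + by - 1) / by;     // 4 blocks along rows
    printf("grid = (%d, %d) -> %d x %d threads for a %d x %d output\n",
           gx, gy, gx * bx, gy * by, m, n);
    return 0;
}
```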

// buffer pool for cuda
#define MAX_CUDA_BUFFERS 256

@@ -6438,6 +6479,65 @@ inline void ggml_cuda_op_alibi(
    (void) src1_dd;
}

inline void ggml_cuda_op_im2col(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16);

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const int64_t N  = src1->ne[is_2D ? 3 : 2];
    const int64_t IC = src1->ne[is_2D ? 2 : 1];
    const int64_t IH = is_2D ? src1->ne[1] : 1;
    const int64_t IW = src1->ne[0];

    const int64_t KH = is_2D ? src0->ne[1] : 1;
    const int64_t KW = src0->ne[0];

    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW = dst->ne[1];

    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
        OH, IW, IH, OW, IC, KH, KW, N,
        src1->nb[is_2D ? 3 : 2] / 4, // nb is byte offset, src is type float32
        src1->nb[is_2D ? 2 : 1] / 4, // nb is byte offset, src is type float32
        s0, s1, p0, p1, d0, d1, main_stream);

    (void) src0;
    (void) src0_dd;
}
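The OH/OW values read back from dst->ne above were fixed when ggml_im2col built the destination tensor, using the standard convolution output-size formula. For reference (helper name mine):

```cpp
#include <cstdint>

// Standard convolution output size along one axis:
// input size ins, kernel size ks, stride s, padding p, dilation d.
int64_t conv_out_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
// e.g. IW = 32, KW = 3, s0 = 1, p0 = 1, d0 = 1  ->  OW = (32 + 2 - 2 - 1) / 1 + 1 = 32
```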

inline void ggml_cuda_op_mul_mat_gemm_f16(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    bool case_conv_2d = (src0->ne[0] * src0->ne[1] * src0->ne[2]) == src1->ne[0];

    int m = src0->ne[case_conv_2d ? 3 : 2];
    int n = (case_conv_2d ? src1->ne[2] : 1) * src1->ne[1];
    int k = (case_conv_2d ? src0->ne[2] : 1) * src0->ne[1] * src0->ne[0];
    int N = src1->ne[case_conv_2d ? 3 : 2];

    gemm_f16_f32_cuda(
        (const half*)src0_dd, (const half*)src1_dd,
        dst_dd, m, n, k, N, main_stream);
}
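case_conv_2d distinguishes the 2D im2col layout from the 1D one by testing whether the kernel's flattened volume KW * KH * IC matches src1's fastest dimension. A shape walkthrough under the layouts this PR produces (illustrative sizes; ggml's ne[] is fastest dimension first):

```cpp
#include <cassert>

int main() {
    // conv2d operands as this PR lays them out:
    //   src0 (kernel):        [KW, KH, IC, OC]
    //   src1 (im2col output): [IC*KH*KW, OW, OH, N]
    const int KW = 3, KH = 3, IC = 8, OC = 16, OW = 30, OH = 30, batch = 2;
    const int src0_ne[4] = { KW, KH, IC, OC };
    const int src1_ne[4] = { IC * KH * KW, OW, OH, batch };

    assert(src0_ne[0] * src0_ne[1] * src0_ne[2] == src1_ne[0]); // case_conv_2d

    const int m = src0_ne[3];                           // OC       = 16
    const int n = src1_ne[2] * src1_ne[1];              // OH*OW    = 900
    const int k = src0_ne[2] * src0_ne[1] * src0_ne[0]; // IC*KH*KW = 72
    const int N = src1_ne[3];                           // batch    = 2
    (void) m; (void) n; (void) k; (void) N;
    return 0;
}
```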

inline void ggml_cuda_op_diag_mask_inf(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7025,7 +7125,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    }

-    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul_mat_gemm_f16);
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
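The net effect of this branch: every F16 x F16 -> F32 ggml_mul_mat on the CUDA backend is now routed through the naive gemm_f16_f32 path above before the cuBLAS and quantized branches are considered, which is what lets the F16 output of im2col feed straight into ggml_mul_mat.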
@@ -7133,6 +7235,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
}

void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
}

static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src0;
    (void) src1;
@@ -7494,6 +7600,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        case GGML_OP_ALIBI:
            func = ggml_cuda_alibi;
            break;
        case GGML_OP_IM2COL:
            func = ggml_cuda_im2col;
            break;
        default:
            return false;
    }