Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Fix SYCL im2col and convert Overflow with Large Dims #9052

Merged
merged 9 commits into from
Aug 20, 2024
104 changes: 0 additions & 104 deletions ggml/src/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -893,43 +893,6 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}

template <typename T>
static void im2col_kernel(const float *x, T *dst, int offset_delta,
int IW, int IH, int OW, int KW, int KH,
int pelements, int CHW, int s0, int s1, int p0,
int p1, int d0, int d1,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (i >= pelements) {
return;
}

const int ksize = OW * (KH > 1 ? KW : 1);
const int kx = i / ksize;
const int kd = kx * ksize;
const int ky = (i - kd) / OW;
const int ix = i % OW;

const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;

const int64_t offset_dst =
(item_ct1.get_group(1) * OW + ix) * CHW +
(item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);

if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] =
sycl::vec<float, 1>(0.0f)
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
} else {
const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
dst[offset_dst] =
sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
}
}

template <typename Ti, typename To>
static void pool2d_nchw_kernel(
const int ih, const int iw, const int oh, const int ow,
Expand Down Expand Up @@ -1742,32 +1705,6 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
});
}

template <typename T>
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
int OW, int OH, int KW, int KH, int IC,
int offset_delta, int s0, int s1, int p0,
int p1, int d0, int d1,
queue_ptr stream) {
const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
sycl::range<3> block_nums(IC, OH, num_blocks);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});

stream->parallel_for(
sycl::nd_range<3>(block_nums *
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1);
});
}
}


static bool g_sycl_loaded = false;

bool ggml_sycl_loaded(void) {
Expand Down Expand Up @@ -2636,47 +2573,6 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
(void) src1_dd;
}

inline void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
float *dst_dd,
const queue_ptr &main_stream) {

GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0];

const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0];

const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];

const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32

if (dst->type == GGML_TYPE_F16) {
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} else {
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
}

(void) src0;
(void) src0_dd;
}

inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-sycl/backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@
#include "norm.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "im2col.hpp"

#endif // GGML_SYCL_BACKEND_HPP
11 changes: 11 additions & 0 deletions ggml/src/ggml-sycl/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,14 @@ void ggml_sycl_host_free(void* ptr) try {
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
const int64_t max_range = std::numeric_limits<int>::max();
int64_t sycl_down_blk_size = block_size;
int64_t global_range = accumulate_block_num * sycl_down_blk_size;
while(global_range > max_range) {
sycl_down_blk_size /= 2;
global_range = accumulate_block_num * sycl_down_blk_size;
}
return sycl_down_blk_size;
}
2 changes: 2 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,4 +352,6 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

#endif // GGML_SYCL_COMMON_HPP
Loading
Loading