diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp index 3b08cd987a3aa8..fa6abc149d38e3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/embedding_bag.cpp @@ -56,6 +56,7 @@ attach_embedding_bag_impl::attach_embedding_bag_impl() { implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, { std::make_tuple(data_types::f32, format::bfyx), std::make_tuple(data_types::f16, format::bfyx), + std::make_tuple(data_types::i32, format::bfyx), }); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/embedding_bag_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/embedding_bag_ref.cl index aa9a7f9bd15632..b744fd69ac276e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/embedding_bag_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/embedding_bag_ref.cl @@ -94,7 +94,10 @@ KERNEL(embedding_bag_ref)( const __global INPUT1_TYPE* indices, const __global INPUT2_TYPE* segment_ids, #ifdef INPUT3_TYPE - const __global INPUT3_TYPE* weights, + const __global INPUT3_TYPE* segments_sum, +#endif +#ifdef INPUT4_TYPE + const __global INPUT4_TYPE* weights, #endif __global OUTPUT_TYPE* output) { @@ -114,7 +117,7 @@ KERNEL(embedding_bag_ref)( uint index = indices[INPUT1_OFFSET + i]; uint emb_index = INPUT0_GET_INDEX(index, emb_dim1, emb_dim2, emb_dim3); OUTPUT_TYPE val = emb_table[emb_index]; -#ifdef INPUT3_TYPE +#ifdef INPUT4_TYPE { - uint weight_index = INPUT3_OFFSET + i; + uint weight_index = INPUT4_OFFSET + i; val *= weights[weight_index]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/embedding_bag/embedding_bag_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/embedding_bag/embedding_bag_kernel_ref.cpp index daa39cce25cb65..f8e2cc7b1b3b03 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/embedding_bag/embedding_bag_kernel_ref.cpp +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/embedding_bag/embedding_bag_kernel_ref.cpp @@ -86,8 +86,10 @@ ParamsKey EmbeddingBagKernelRef::GetSupportedKey() const { k.EnableInputDataType(Datatype::INT32); k.EnableInputDataType(Datatype::INT64); k.EnableInputDataType(Datatype::UINT32); + k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT32); k.EnableAllInputLayout(); k.EnableAllOutputLayout(); diff --git a/src/plugins/intel_gpu/src/plugin/ops/embedding_bag.cpp b/src/plugins/intel_gpu/src/plugin/ops/embedding_bag.cpp index 29bfcf9b7c5c9f..2d7dc03dd77118 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/embedding_bag.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/embedding_bag.cpp @@ -104,11 +104,9 @@ static void CreateEmbeddingSegmentsSumOp(ProgramBuilder& p, const std::shared_pt auto inputs = p.GetInputInfo(op); std::string layerName = layer_type_name_ID(op); - inputs.erase(inputs.begin() + 3); // Remove "num_segments" - int32_t defaultIndex = -1; - // port of default_index is 4 by default, but we removed "num_segments" above, so now it's equal to 3 + // port of default_index is 4; "num_segments" (port 3) is kept in inputs, so the index is unchanged - if (inputs.size() > 3) { + if (inputs.size() > 4) { auto index_node = ov::as_type_ptr(op->get_input_node_shared_ptr(4)); OPENVINO_ASSERT(index_node != nullptr, "[GPU] Unsupported parameter nodes type in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); @@ -117,7 +115,7 @@ static void CreateEmbeddingSegmentsSumOp(ProgramBuilder& p, const std::shared_pt OPENVINO_THROW("Unsupported parameter size in ", op->get_friendly_name(), " (", op->get_type_name(), ")"); defaultIndex = static_cast(val); - inputs.erase(inputs.begin() + 3); // Remove "default_index" + inputs.erase(inputs.begin() + 4); // Remove "default_index" } std::vector reordered_inputs; @@ -141,10 +139,13 @@ static void CreateEmbeddingSegmentsSumOp(ProgramBuilder& p, const std::shared_pt } } + auto p_shape = op->get_output_partial_shape(0); + auto output_shape = p_shape.is_static() 
? tensor_from_dims(p_shape.to_shape()) : cldnn::tensor(); + auto embeddingBagPrim = cldnn::embedding_bag(layerName, reordered_inputs, cldnn::embedding_bag::segments_sum, - tensor_from_dims(op->get_output_shape(0)), + output_shape, defaultIndex); p.add_primitive(*op, embeddingBagPrim); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/embedding_bag_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/embedding_bag_gpu_test.cpp index 4319c2de76ddfb..ce4a0cbbb5797a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/embedding_bag_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/embedding_bag_gpu_test.cpp @@ -773,6 +773,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic) { auto emb_table = engine.allocate_memory({ data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); auto indices = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); auto segment_ids = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segments_num = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); auto per_sample_weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); tensor output_shape = {3, 2, 1, 1}; @@ -789,6 +790,8 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic) { set_values(segment_ids, { 0, 0, 2, 2 }); + set_values(segments_num, { 4 }); + set_values(per_sample_weights, { ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f) }); @@ -798,9 +801,10 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic) { topology.add(input_layout("Input0", emb_table->get_layout())); topology.add(input_layout("Input1", indices->get_layout())); topology.add(input_layout("Input2", segment_ids->get_layout())); - topology.add(data("Input3", per_sample_weights)); + topology.add(input_layout("Input3", segments_num->get_layout())); + topology.add(data("Input4", per_sample_weights)); topology.add( - embedding_bag("embedding_bag", { input_info("Input0"), 
input_info("Input1"), input_info("Input2"), input_info("Input3") }, type, output_shape, 0) + embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3"), input_info("Input4") }, type, output_shape, 0) ); network network(engine, topology, get_test_default_config(engine)); @@ -808,6 +812,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic) { network.set_input_data("Input0", emb_table); network.set_input_data("Input1", indices); network.set_input_data("Input2", segment_ids); + network.set_input_data("Input3", segments_num); auto outputs = network.execute(); @@ -838,6 +843,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { auto emb_table = engine.allocate_memory({ data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); auto indices = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); auto segment_ids = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segments_num = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); auto per_sample_weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); tensor output_shape = {3, 2, 1, 1}; @@ -854,6 +860,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { set_values(segment_ids, { 1, 1, 2, 2 }); + set_values(segments_num, { 4 }); set_values(per_sample_weights, { ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f) }); @@ -863,9 +870,10 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { topology.add(input_layout("Input0", emb_table->get_layout())); topology.add(input_layout("Input1", indices->get_layout())); topology.add(input_layout("Input2", segment_ids->get_layout())); - topology.add(data("Input3", per_sample_weights)); + topology.add(input_layout("Input3", segments_num->get_layout())); + topology.add(data("Input4", per_sample_weights)); topology.add( - embedding_bag("embedding_bag", { 
input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3") }, type, output_shape, 2) + embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3"), input_info("Input4") }, type, output_shape, 2) ); network network(engine, topology, get_test_default_config(engine)); @@ -873,6 +881,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { network.set_input_data("Input0", emb_table); network.set_input_data("Input1", indices); network.set_input_data("Input2", segment_ids); + network.set_input_data("Input3", segments_num); auto outputs = network.execute(); @@ -903,6 +912,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { auto emb_table = engine.allocate_memory({ data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); auto indices = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); auto segment_ids = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segments_num = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); auto per_sample_weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); tensor output_shape = {3, 2, 1, 1}; @@ -919,6 +929,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { set_values(segment_ids, { 0, 0, 1, 1 }); + set_values(segments_num, { 4 }); set_values(per_sample_weights, { ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f) }); @@ -928,9 +939,10 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { topology.add(input_layout("Input0", emb_table->get_layout())); topology.add(input_layout("Input1", indices->get_layout())); topology.add(input_layout("Input2", segment_ids->get_layout())); - topology.add(data("Input3", per_sample_weights)); + topology.add(input_layout("Input3", segments_num->get_layout())); + topology.add(data("Input4", per_sample_weights)); topology.add( - 
embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3") }, type, output_shape, 2) + embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3"), input_info("Input4") }, type, output_shape, 2) ); network network(engine, topology, get_test_default_config(engine)); @@ -938,6 +950,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { network.set_input_data("Input0", emb_table); network.set_input_data("Input1", indices); network.set_input_data("Input2", segment_ids); + network.set_input_data("Input3", segments_num); auto outputs = network.execute(); @@ -966,6 +979,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_without_weights_and_def_index) { auto emb_table = engine.allocate_memory({ data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); auto indices = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); auto segment_ids = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segments_num = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); tensor output_shape = {3, 2, 1, 1}; set_values(emb_table, { @@ -981,14 +995,16 @@ TEST(embedding_bag_fp16_gpu, segments_sum_without_weights_and_def_index) { set_values(segment_ids, { 0, 0, 2, 2 }); + set_values(segments_num, { 4 }); auto type = embedding_bag::segments_sum; topology topology; topology.add(input_layout("Input0", emb_table->get_layout())); topology.add(input_layout("Input1", indices->get_layout())); topology.add(input_layout("Input2", segment_ids->get_layout())); + topology.add(input_layout("Input3", segments_num->get_layout())); topology.add( - embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2") }, type, output_shape) + embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3") }, type, output_shape) ); network 
network(engine, topology, get_test_default_config(engine)); @@ -996,6 +1012,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_without_weights_and_def_index) { network.set_input_data("Input0", emb_table); network.set_input_data("Input1", indices); network.set_input_data("Input2", segment_ids); + network.set_input_data("Input3", segments_num); auto outputs = network.execute(); @@ -1026,6 +1043,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { auto emb_table = engine.allocate_memory({ data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); auto indices = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); auto segment_ids = engine.allocate_memory({ data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segments_num = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); auto per_sample_weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); tensor output_shape = {3, 2, 3, 2}; @@ -1091,6 +1109,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { set_values(segment_ids, { 0, 0, 2, 2 }); + set_values(segments_num, { 4 }); set_values(per_sample_weights, { ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f), ov::float16(0.5f) @@ -1101,9 +1120,10 @@ TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { topology.add(input_layout("Input0", emb_table->get_layout())); topology.add(input_layout("Input1", indices->get_layout())); topology.add(input_layout("Input2", segment_ids->get_layout())); - topology.add(data("Input3", per_sample_weights)); + topology.add(input_layout("Input3", segments_num->get_layout())); + topology.add(data("Input4", per_sample_weights)); topology.add( - embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3") }, type, output_shape, 0) + embedding_bag("embedding_bag", { input_info("Input0"), input_info("Input1"), input_info("Input2"), input_info("Input3"), input_info("Input4") }, type, output_shape, 0) ); network 
network(engine, topology, get_test_default_config(engine)); @@ -1111,6 +1131,7 @@ TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { network.set_input_data("Input0", emb_table); network.set_input_data("Input1", indices); network.set_input_data("Input2", segment_ids); + network.set_input_data("Input3", segments_num); auto outputs = network.execute(); diff --git a/tests/layer_tests/tensorflow_tests/test_tf_SegmentSum.py b/tests/layer_tests/tensorflow_tests/test_tf_SegmentSum.py index 725bd7970e95fc..721804aae1c06e 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_SegmentSum.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_SegmentSum.py @@ -49,8 +49,6 @@ def test_segment_sum_basic(self, params, ie_device, precision, ir_version, temp_ use_legacy_frontend): if use_legacy_frontend: pytest.skip("SegmentSum operation is not supported via legacy frontend.") - if ie_device == 'GPU': - pytest.skip("GPU error: to_shape was called on a dynamic shape") self._test(*self.create_segment_sum_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -68,8 +66,6 @@ def test_segment_sum_different_types(self, params, ie_device, precision, ir_vers use_legacy_frontend): if use_legacy_frontend: pytest.skip("SegmentSum operation is not supported via legacy frontend.") - if ie_device == 'GPU': - pytest.skip("GPU error: to_shape was called on a dynamic shape") self._test(*self.create_segment_sum_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) @@ -122,8 +118,6 @@ def test_complex_segment_sum(self, params, ie_device, precision, ir_version, tem use_legacy_frontend): if use_legacy_frontend: pytest.skip("SegmentSum operation is not supported via legacy frontend.") - if ie_device == 'GPU': - pytest.skip("GPU error: to_shape was called on a dynamic shape") self._test(*self.create_segment_sum_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, - 
use_legacy_frontend=use_legacy_frontend) \ No newline at end of file + use_legacy_frontend=use_legacy_frontend) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_SparseSegmentMean.py b/tests/layer_tests/tensorflow_tests/test_tf_SparseSegmentMean.py index 69142cc3bc0e1e..b69d5add3fb5d8 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_SparseSegmentMean.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_SparseSegmentMean.py @@ -71,9 +71,12 @@ def test_sparse_segment_mean(self, data_type, indices_type, segment_indices_type shape, indices_shape, segments_num, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): + kwargs = {} if ie_device == 'GPU': - pytest.skip("GPU error: to_shape was called on a dynamic shape, ticket: 152352") + kwargs = { + 'custom_eps': 1e-2, + } self._test(*self.create_sparse_segment_mean(data_type, indices_type, segment_indices_type, shape, indices_shape, segments_num), ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend) + use_legacy_frontend=use_legacy_frontend, **kwargs) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py index 1fcb2475cc143a..509baf522c51c9 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py @@ -57,8 +57,6 @@ def create_unsorted_segment_sum_net(self, data_shape, segment_ids_shape, num_seg def test_unsorted_segment_sum_basic(self, params, data_type, segment_ids_type, num_segments_type, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): - if ie_device == 'GPU': - pytest.skip("156362: No layout format available for embeddingsegmentssum:UnsortedSegmentSum on GPU") self._test(*self.create_unsorted_segment_sum_net(**params, data_type=data_type, segment_ids_type=segment_ids_type, num_segments_type=num_segments_type),