@@ -18,6 +18,11 @@ void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T fill_val, size_t offset = 0u)
18
18
std::fill (tensor_data + offset, tensor_data + tensor->get_size (), fill_val);
19
19
}
20
20
21
+ void fill_tensor_bytes (ov::SoPtr<ov::ITensor> tensor, uint8_t fill_val) {
22
+ auto * tensor_data = reinterpret_cast <uint8_t *>(tensor->data ());
23
+ std::fill_n (tensor_data, tensor->get_byte_size (), fill_val);
24
+ }
25
+
21
26
ov::SoPtr<ov::ITensor> make_tensor_slice (ov::SoPtr<ov::ITensor> tensor,
22
27
uint32_t dim,
23
28
uint32_t start_pos,
@@ -100,6 +105,20 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
100
105
std::copy_n (src_p + src_offset, chunk_byte_size, dst_p + dst_offset);
101
106
}
102
107
}
108
+
109
+ std::optional<ov::Output<const ov::Node>> find_port_by_name (const std::vector<ov::Output<const ov::Node>>& ports,
110
+ const std::string& name) {
111
+ auto it = std::find_if (ports.begin (), ports.end (), [&](const auto & port) {
112
+ return port.get_names ().count (name) != 0 ;
113
+ });
114
+ if (it == ports.end ()) {
115
+ return std::nullopt;
116
+ }
117
+ return std::make_optional (*it);
118
+ }
119
+
120
// Index of the sequence-length axis in the "input_ids"/"inputs_embeds" shape —
// assumes layout [batch, seq_len, ...] (TODO confirm for all supported models).
// Used to count stored KV-cache tokens and to distinguish the prefill stage
// (seq_len > 1) from the generate stage (seq_len == 1).
+ constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1 ;
121
+
103
122
} // anonymous namespace
104
123
105
124
ov::npuw::LLMInferRequest::LLMInferRequest (const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
@@ -112,6 +131,14 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
112
131
init_tensor (output_port);
113
132
}
114
133
134
+ auto input_ids_port = find_port_by_name (compiled_model->m_prefill_compiled ->inputs (), " input_ids" );
135
+ if (input_ids_port.has_value ()) {
136
+ m_input_ids_name = " input_ids" ;
137
+ } else {
138
+ OPENVINO_ASSERT (find_port_by_name (compiled_model->m_prefill_compiled ->inputs (), " inputs_embeds" ).has_value ());
139
+ m_input_ids_name = " inputs_embeds" ;
140
+ }
141
+
115
142
m_kvcache_request = compiled_model->m_kvcache_compiled ->create_infer_request ();
116
143
m_prefill_request = compiled_model->m_prefill_compiled ->create_infer_request ();
117
144
@@ -152,7 +179,7 @@ void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& po
152
179
}
153
180
154
181
void ov::npuw::LLMInferRequest::prepare_for_new_conversation () {
155
- fill_tensor< int64_t > (m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids " )), 0 );
182
+ fill_tensor_bytes (m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name )), 0u );
156
183
fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" )), 0 );
157
184
fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" )), 0 );
158
185
fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" )), 0 );
@@ -167,20 +194,29 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
167
194
168
195
prepare_for_new_conversation ();
169
196
170
- auto padded_input_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids" ));
171
- const size_t offset = padded_input_ids->get_size () - input_ids->get_size ();
172
- std::copy_n (input_ids->data <int64_t >(), input_ids->get_size (), padded_input_ids->data <int64_t >() + offset);
197
+ auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
198
+ // NB: padded_input can be either fp32(VLM) or i64(LLM)
199
+ std::copy_n (
200
+ reinterpret_cast <uint8_t *>(input_ids->data ()),
201
+ input_ids->get_byte_size (),
202
+ reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () - input_ids->get_byte_size ());
173
203
174
204
auto padded_attention_mask = m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" ));
175
- std::copy_n (attention_mask->data <int64_t >(),
176
- attention_mask->get_size (),
177
- padded_attention_mask->data <int64_t >() + offset);
205
+ std::copy_n (
206
+ attention_mask->data <int64_t >(),
207
+ attention_mask->get_size (),
208
+ padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () - attention_mask->get_size ());
178
209
179
210
auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" ));
180
- std::copy_n (position_ids->data <int64_t >(), position_ids->get_size (), padded_position_ids->data <int64_t >() + offset);
211
+
212
+ std::copy_n (position_ids->data <int64_t >(),
213
+ position_ids->get_size (),
214
+ padded_position_ids->data <int64_t >() + padded_position_ids->get_size () - position_ids->get_size ());
181
215
182
216
m_prefill_request->infer ();
183
- m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens += static_cast <uint32_t >(input_ids->get_size ());
217
+
218
+ m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens +=
219
+ static_cast <uint32_t >(input_ids->get_shape ()[INPUT_IDS_SEQ_LEN_DIM]);
184
220
m_need_copy_kvcache = true ;
185
221
186
222
m_logits = m_prefill_request->get_tensor (m_prefill_out_ports.at (" logits" ));
@@ -244,8 +280,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
244
280
}
245
281
246
282
// FIXME: these tensors should be shared between the parent & child models
247
- auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" input_ids" ));
248
- std::copy_n (input_ids->data <int64_t >(), input_ids->get_size (), kv_input_ids->data <int64_t >());
283
+ auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name));
284
+ // NB: input_ids can be either fp32(VLM) or i64(LLM)
285
+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()),
286
+ input_ids->get_byte_size (),
287
+ reinterpret_cast <uint8_t *>(kv_input_ids->data ()));
249
288
250
289
auto kv_attn_mask = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" ));
251
290
std::copy_n (attention_mask->data <int64_t >(), attention_mask->get_size () - 1 , kv_attn_mask->data <int64_t >());
@@ -290,15 +329,20 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
290
329
void ov::npuw::LLMInferRequest::infer () {
291
330
const auto & inputs = get_inputs ();
292
331
293
- auto input_ids = get_tensor (inputs[0 ]);
294
- auto attention_mask = get_tensor (inputs[1 ]);
295
- auto position_ids = get_tensor (inputs[2 ]);
332
+ auto input_ids = get_tensor (find_port_by_name (inputs, m_input_ids_name).value ());
333
+ auto attention_mask = get_tensor (find_port_by_name (inputs, " attention_mask" ).value ());
334
+ // FIXME: position_ids might be optional for some models!
335
+ auto position_ids = get_tensor (find_port_by_name (inputs, " position_ids" ).value ());
296
336
297
- OPENVINO_ASSERT (ov::element::i64 == input_ids->get_element_type ());
337
+ // NB: For VLM, the "inputs_embeds" contains float values (embeddings)
338
+ OPENVINO_ASSERT (ov::element::f32 == input_ids->get_element_type () ||
339
+ ov::element::i64 == input_ids->get_element_type ());
298
340
OPENVINO_ASSERT (ov::element::i64 == attention_mask->get_element_type ());
299
341
OPENVINO_ASSERT (ov::element::i64 == position_ids->get_element_type ());
300
342
301
- if (input_ids->get_size () != 1 ) {
343
+ // NB: Check the sequence length provided for input_ids
344
+ // in order to distinguish prefill / generate stages
345
+ if (input_ids->get_shape ()[INPUT_IDS_SEQ_LEN_DIM] != 1 ) {
302
346
infer_prefill (input_ids, attention_mask, position_ids);
303
347
} else {
304
348
infer_generate (input_ids, attention_mask, position_ids);
0 commit comments