@@ -18,6 +18,11 @@ void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T fill_val, size_t offset = 0u)
18
18
std::fill (tensor_data + offset, tensor_data + tensor->get_size (), fill_val);
19
19
}
20
20
21
+ void fill_tensor_bytes (ov::SoPtr<ov::ITensor> tensor, uint8_t fill_val) {
22
+ auto * tensor_data = reinterpret_cast <uint8_t *>(tensor->data ());
23
+ std::fill_n (tensor_data, tensor->get_byte_size (), fill_val);
24
+ }
25
+
21
26
ov::SoPtr<ov::ITensor> make_tensor_slice (ov::SoPtr<ov::ITensor> tensor,
22
27
uint32_t dim,
23
28
uint32_t start_pos,
@@ -100,6 +105,20 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
100
105
std::copy_n (src_p + src_offset, chunk_byte_size, dst_p + dst_offset);
101
106
}
102
107
}
108
+
109
+ std::optional<ov::Output<const ov::Node>> find_port_by_name (const std::vector<ov::Output<const ov::Node>>& ports,
110
+ const std::string& name) {
111
+ auto it = std::find_if (ports.begin (), ports.end (), [&](const auto & port) {
112
+ return port.get_names ().count (name) != 0 ;
113
+ });
114
+ if (it == ports.end ()) {
115
+ return std::nullopt;
116
+ }
117
+ return std::make_optional (*it);
118
+ }
119
+
120
// Index of the sequence-length axis in the "input_ids"/"inputs_embeds" shape —
// assumes layout [batch, seq_len, ...] (TODO confirm for all supported models).
// Used to count stored KV-cache tokens and to distinguish the prefill stage
// (seq_len > 1) from the generate stage (seq_len == 1).
+ constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1 ;
121
+
103
122
} // anonymous namespace
104
123
105
124
ov::npuw::LLMInferRequest::LLMInferRequest (const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
@@ -112,6 +131,14 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
112
131
init_tensor (output_port);
113
132
}
114
133
134
+ auto input_ids_port = find_port_by_name (compiled_model->m_prefill_compiled ->inputs (), " input_ids" );
135
+ if (input_ids_port.has_value ()) {
136
+ m_input_ids_name = " input_ids" ;
137
+ } else {
138
+ OPENVINO_ASSERT (find_port_by_name (compiled_model->m_prefill_compiled ->inputs (), " inputs_embeds" ).has_value ());
139
+ m_input_ids_name = " inputs_embeds" ;
140
+ }
141
+
115
142
m_kvcache_request = compiled_model->m_kvcache_compiled ->create_infer_request ();
116
143
m_prefill_request = compiled_model->m_prefill_compiled ->create_infer_request ();
117
144
@@ -152,7 +179,7 @@ void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& po
152
179
}
153
180
154
181
void ov::npuw::LLMInferRequest::prepare_for_new_conversation () {
155
- fill_tensor< int64_t > (m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids " )), 0 );
182
+ fill_tensor_bytes (m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name )), 0u );
156
183
fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" )), 0 );
157
184
fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" )), 0 );
158
185
fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" )), 0 );
@@ -167,20 +194,29 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
167
194
168
195
prepare_for_new_conversation ();
169
196
170
- auto padded_input_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids" ));
171
- const size_t offset = padded_input_ids->get_size () - input_ids->get_size ();
172
- std::copy_n (input_ids->data <int64_t >(), input_ids->get_size (), padded_input_ids->data <int64_t >() + offset);
197
+ auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
198
+ // NB: padded_input can be either fp32(VLM) or i64(LLM)
199
+ std::copy_n (
200
+ reinterpret_cast <uint8_t *>(input_ids->data ()),
201
+ input_ids->get_byte_size (),
202
+ reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () - input_ids->get_byte_size ());
173
203
174
204
auto padded_attention_mask = m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" ));
175
- std::copy_n (attention_mask->data <int64_t >(),
176
- attention_mask->get_size (),
177
- padded_attention_mask->data <int64_t >() + offset);
205
+ std::copy_n (
206
+ attention_mask->data <int64_t >(),
207
+ attention_mask->get_size (),
208
+ padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () - attention_mask->get_size ());
178
209
179
210
auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" ));
180
- std::copy_n (position_ids->data <int64_t >(), position_ids->get_size (), padded_position_ids->data <int64_t >() + offset);
211
+
212
+ std::copy_n (position_ids->data <int64_t >(),
213
+ position_ids->get_size (),
214
+ padded_position_ids->data <int64_t >() + padded_position_ids->get_size () - position_ids->get_size ());
181
215
182
216
m_prefill_request->infer ();
183
- m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens += static_cast <uint32_t >(input_ids->get_size ());
217
+
218
+ m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens +=
219
+ static_cast <uint32_t >(input_ids->get_shape ()[INPUT_IDS_SEQ_LEN_DIM]);
184
220
m_need_copy_kvcache = true ;
185
221
186
222
m_logits = m_prefill_request->get_tensor (m_prefill_out_ports.at (" logits" ));
@@ -244,8 +280,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
244
280
}
245
281
246
282
// FIXME: these tensors should be shared between the parent & child models
247
- auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" input_ids" ));
248
- std::copy_n (input_ids->data <int64_t >(), input_ids->get_size (), kv_input_ids->data <int64_t >());
283
+ auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name));
284
+ // NB: input_ids can be either fp32(VLM) or i64(LLM)
285
+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()),
286
+ input_ids->get_byte_size (),
287
+ reinterpret_cast <uint8_t *>(kv_input_ids->data ()));
249
288
250
289
auto kv_attn_mask = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" ));
251
290
std::copy_n (attention_mask->data <int64_t >(), attention_mask->get_size () - 1 , kv_attn_mask->data <int64_t >());
@@ -290,15 +329,20 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
290
329
void ov::npuw::LLMInferRequest::infer () {
291
330
const auto & inputs = get_inputs ();
292
331
293
- auto input_ids = get_tensor (inputs[0 ]);
294
- auto attention_mask = get_tensor (inputs[1 ]);
295
- auto position_ids = get_tensor (inputs[2 ]);
332
+ auto input_ids = get_tensor (find_port_by_name (inputs, m_input_ids_name).value ());
333
+ auto attention_mask = get_tensor (find_port_by_name (inputs, " attention_mask" ).value ());
334
+ // FIXME: position_ids might be optional for some models!
335
+ auto position_ids = get_tensor (find_port_by_name (inputs, " position_ids" ).value ());
296
336
297
- OPENVINO_ASSERT (ov::element::i64 == input_ids->get_element_type ());
337
+ // NB: For VLM, the "inputs_embeds" contains float values (embeddings)
338
+ OPENVINO_ASSERT (ov::element::f32 == input_ids->get_element_type () ||
339
+ ov::element::i64 == input_ids->get_element_type ());
298
340
OPENVINO_ASSERT (ov::element::i64 == attention_mask->get_element_type ());
299
341
OPENVINO_ASSERT (ov::element::i64 == position_ids->get_element_type ());
300
342
301
- if (input_ids->get_size () != 1 ) {
343
+ // NB: Check the sequence length provided for input_ids
344
+ // in order to distinguish prefill / generate stages
345
+ if (input_ids->get_shape ()[INPUT_IDS_SEQ_LEN_DIM] != 1 ) {
302
346
infer_prefill (input_ids, attention_mask, position_ids);
303
347
} else {
304
348
infer_generate (input_ids, attention_mask, position_ids);
0 commit comments