@@ -165,31 +165,33 @@ std::vector<ov::Tensor> InputsEmbedder::IInputsEmbedder::to_single_image_tensors
165
165
return single_image_tensors;
166
166
}
167
167
168
- std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_images (const std::vector<ov::Tensor>& images, const bool & is_video) {
169
- std::vector<ov::Tensor> single_images = to_single_image_tensors (images);
168
+ std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_images (const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
170
169
std::vector<EncodedImage> embeds;
171
170
172
- if (is_video) {
173
- return m_vision_encoder->encode_video (single_images);
171
+ for (const ov::Tensor& single_video : video) {
172
+ std::vector<ov::Tensor> single_frames = to_single_image_tensors ({single_video});
173
+ auto embeds_video = m_vision_encoder->encode_video (single_frames);
174
+ embeds.insert (embeds.end (), embeds_video.begin (), embeds_video.end ());
174
175
}
175
176
177
+ std::vector<ov::Tensor> single_images = to_single_image_tensors (images);
176
178
for (const ov::Tensor& image : single_images) {
177
179
embeds.emplace_back (m_vision_encoder->encode (image));
178
180
}
179
181
return embeds;
180
182
}
181
183
182
- ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds (const std::string& prompt, const std::vector<ov::Tensor>& images, const bool & is_video , ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t >& image_sequence) {
183
- return get_inputs_embeds (prompt, encode_images (images, is_video ), metrics, true , image_sequence);
184
+ ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds (const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video , ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t >& image_sequence) {
185
+ return get_inputs_embeds (prompt, encode_images (images, video ), metrics, true , image_sequence);
184
186
}
185
187
186
188
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids (
187
189
const std::string& prompt,
188
190
const std::vector<ov::Tensor>& images,
189
- const bool & is_video ,
191
+ const std::vector<ov::Tensor>& video ,
190
192
ov::genai::VLMPerfMetrics& metrics,
191
193
const std::vector<size_t >& image_sequence) {
192
- return get_inputs_embeds_with_token_type_ids (prompt, encode_images (images, is_video ), metrics, true , image_sequence);
194
+ return get_inputs_embeds_with_token_type_ids (prompt, encode_images (images, video ), metrics, true , image_sequence);
193
195
}
194
196
195
197
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids (
@@ -267,8 +269,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
267
269
}
268
270
}
269
271
270
- ov::Tensor InputsEmbedder::get_inputs_embeds (const std::string& prompt, const std::vector<ov::Tensor>& images, const bool & is_video , ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t >& image_sequence) {
271
- return m_impl->get_inputs_embeds (prompt, images, is_video , metrics, image_sequence);
272
+ ov::Tensor InputsEmbedder::get_inputs_embeds (const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video , ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t >& image_sequence) {
273
+ return m_impl->get_inputs_embeds (prompt, images, video , metrics, image_sequence);
272
274
}
273
275
274
276
ov::Tensor InputsEmbedder::get_inputs_embeds (const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t >& image_sequence) {
@@ -278,11 +280,10 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
278
280
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_type_ids (
279
281
const std::string& prompt,
280
282
const std::vector<ov::Tensor>& images,
281
- const bool & is_video ,
283
+ const std::vector<ov::Tensor>& video ,
282
284
VLMPerfMetrics& metrics,
283
285
const std::vector<size_t >& image_sequence) {
284
- return m_impl->get_inputs_embeds_with_token_type_ids (
285
- prompt, images, is_video, metrics, image_sequence);
286
+ return m_impl->get_inputs_embeds_with_token_type_ids (prompt, images, video, metrics, image_sequence);
286
287
}
287
288
288
289
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_type_ids (
@@ -299,8 +300,8 @@ bool InputsEmbedder::has_token_type_ids() const {
299
300
return m_impl->has_token_type_ids ();
300
301
}
301
302
302
- std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images (const std::vector<ov::Tensor>& images, const bool & is_video ) {
303
- return m_impl->encode_images (images, is_video );
303
+ std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images (const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video ) {
304
+ return m_impl->encode_images (images, video );
304
305
}
305
306
306
307
std::pair<ov::Tensor, std::optional<int64_t >> InputsEmbedder::get_position_ids (const size_t inputs_embeds_size, const size_t history_size) {
0 commit comments